"""Training script: runs policy-gradient (PGAgent) training on a Gymnasium environment."""
# Standard library
import os
import time

# Third-party
import gymnasium as gym
import numpy as np
import torch

# Project-local
from src import pytorch_util as ptu
from src import utils
from src.action_noise_wrapper import ActionNoiseWrapper
from src.logger import Logger
from src.pg_agent import PGAgent

# Maximum number of rollout videos saved per video-logging iteration.
MAX_NVIDEO = 2
def run_training_loop(args):
    """Run the full policy-gradient training loop described by ``args``.

    Seeds all RNGs, builds the environment and the PGAgent, then alternates
    between collecting on-policy trajectories and updating the agent,
    periodically logging scalar metrics (and, optionally, rollout videos).
    """
    tb_logger = Logger(args.logdir)

    # Seed everything up front for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    ptu.init_gpu(use_gpu=not args.no_gpu, gpu_id=args.which_gpu)

    # Build the environment and inspect its action space.
    env = gym.make(args.env_name, render_mode=None)
    is_discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Optionally perturb actions with noise (continuous action spaces only).
    if args.action_noise_std > 0:
        assert not is_discrete, f"Cannot use --action_noise_std for discrete environment {args.env_name}"
        env = ActionNoiseWrapper(env, args.seed, args.action_noise_std)

    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if is_discrete else env.action_space.shape[0]

    # Frames-per-second for saved videos: MuJoCo-style envs expose a physics
    # timestep via `model`; otherwise fall back to the declared render fps.
    if hasattr(env, "model"):
        fps = 1 / env.model.opt.timestep
    else:
        fps = env.env.metadata["render_fps"]

    # Construct the policy-gradient agent from the CLI configuration.
    agent = PGAgent(
        ob_dim,
        ac_dim,
        is_discrete,
        n_layers=args.n_layers,
        layer_size=args.layer_size,
        gamma=args.discount,
        learning_rate=args.learning_rate,
        use_baseline=args.use_baseline,
        use_reward_to_go=args.use_reward_to_go,
        normalize_advantages=args.normalize_advantages,
        baseline_learning_rate=args.baseline_learning_rate,
        baseline_gradient_steps=args.baseline_gradient_steps,
        gae_lambda=args.gae_lambda,
    )

    total_envsteps = 0
    start_time = time.time()

    for itr in range(args.n_iter):
        print(f"\n********** Iteration {itr} ************")

        # Collect roughly `batch_size` env steps of on-policy experience.
        trajs, steps_this_iter = utils.sample_trajectories(
            env, agent.actor, args.batch_size, False
        )
        total_envsteps += steps_this_iter

        # `trajs` is a list of per-trajectory dicts of NumPy arrays;
        # flip it into one dict mapping each field to a list of arrays.
        batch = {field: [traj[field] for traj in trajs] for field in trajs[0]}

        # One policy-gradient update on the freshly collected batch.
        train_info: dict = agent.update(
            batch["observation"], batch["action"], batch["reward"], batch["terminal"]
        )

        if itr % args.scalar_log_freq == 0:
            # Evaluate the current policy and emit scalar metrics.
            print("\nCollecting data for eval...")
            eval_trajs, _ = utils.sample_trajectories(
                env, agent.actor, args.eval_batch_size,
            )

            logs = utils.compute_metrics(trajs, eval_trajs)
            logs.update(train_info)
            logs["Train_EnvstepsSoFar"] = total_envsteps
            logs["TimeSinceStart"] = time.time() - start_time
            if itr == 0:
                # Record the untrained policy's return as a baseline.
                logs["Initial_DataCollection_AverageReturn"] = logs["Train_AverageReturn"]

            for key, value in logs.items():
                print(f"{key} : {value}")
                tb_logger.log_scalar(value, key, itr)
            print("Done logging...\n\n")

            tb_logger.flush()

        if args.video_log_freq != -1 and itr % args.video_log_freq == 0:
            # Record a handful of rendered rollouts for the logger.
            print("\nCollecting video rollouts...")
            video_trajs = utils.sample_n_trajectories(
                env, agent.actor, MAX_NVIDEO, render=True
            )

            tb_logger.log_trajs_as_videos(
                video_trajs,
                itr,
                fps=fps,
                max_videos_to_save=MAX_NVIDEO,
                video_title="eval_rollouts",
            )
def main():
    """Parse CLI arguments, create the logging directory, and launch training."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--env_name", type=str, required=True)
    parser.add_argument("--exp_name", type=str, required=True)
    parser.add_argument("--n_iter", "-n", type=int, default=200)

    # Policy-gradient variants.
    parser.add_argument("--use_reward_to_go", "-rtg", action="store_true")
    parser.add_argument("--use_baseline", action="store_true")
    parser.add_argument("--baseline_learning_rate", "-blr", type=float, default=5e-3)
    parser.add_argument("--baseline_gradient_steps", "-bgs", type=int, default=5)
    parser.add_argument("--gae_lambda", type=float, default=None)
    parser.add_argument("--normalize_advantages", "-na", action="store_true")
    parser.add_argument(
        "--batch_size", "-b", type=int, default=1000
    )  # steps collected per train iteration
    parser.add_argument(
        "--eval_batch_size", "-eb", type=int, default=400
    )  # steps collected per eval iteration

    # Optimization / network architecture.
    parser.add_argument("--discount", type=float, default=1.0)
    parser.add_argument("--learning_rate", "-lr", type=float, default=5e-3)
    parser.add_argument("--n_layers", "-l", type=int, default=2)
    parser.add_argument("--layer_size", "-s", type=int, default=64)

    # Reproducibility / hardware / logging cadence.
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--no_gpu", "-ngpu", action="store_true")
    # FIX: type=int was missing, so a CLI-supplied GPU id arrived as a str
    # (the default was int 0) and was passed to ptu.init_gpu inconsistently.
    parser.add_argument("--which_gpu", "-gpu_id", type=int, default=0)
    parser.add_argument("--video_log_freq", type=int, default=-1)
    parser.add_argument("--scalar_log_freq", type=int, default=1)

    parser.add_argument("--action_noise_std", type=float, default=0)

    parser.add_argument("--data_path", type=str, default="./data")
    args = parser.parse_args()

    # create directory for logging
    logdir_prefix = "pg_"  # keep for autograder

    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), args.data_path)
    # exist_ok=True is race-free, unlike the previous exists()+makedirs() pair.
    os.makedirs(data_path, exist_ok=True)

    # Run directory name encodes experiment, environment, and launch time.
    logdir = (
        logdir_prefix
        + args.exp_name
        + "_"
        + args.env_name
        + "_"
        + time.strftime("%d-%m-%Y_%H-%M-%S")
    )
    logdir = os.path.join(data_path, logdir)
    args.logdir = logdir
    os.makedirs(logdir, exist_ok=True)

    run_training_loop(args)
if __name__ == "__main__":
    # Standard script entry point: parse CLI args and start training.
    main()