import os
import time

import gymnasium as gym
import numpy as np
import torch

from src import pytorch_util as ptu
from src import utils
from src.action_noise_wrapper import ActionNoiseWrapper
from src.logger import Logger
from src.pg_agent import PGAgent

MAX_NVIDEO = 2


def run_training_loop(args):
    logger = Logger(args.logdir)

    # set random seeds
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    ptu.init_gpu(use_gpu=not args.no_gpu, gpu_id=args.which_gpu)

    # make the gym environment
    env = gym.make(args.env_name, render_mode=None)
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # add action noise, if needed
    if args.action_noise_std > 0:
        assert (
            not discrete
        ), f"Cannot use --action_noise_std for discrete environment {args.env_name}"
        env = ActionNoiseWrapper(env, args.seed, args.action_noise_std)

    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # simulation timestep, will be used for video saving
    if hasattr(env, "model"):
        fps = 1 / env.model.opt.timestep
    else:
        fps = env.env.metadata["render_fps"]

    # initialize agent
    agent = PGAgent(
        ob_dim,
        ac_dim,
        discrete,
        n_layers=args.n_layers,
        layer_size=args.layer_size,
        gamma=args.discount,
        learning_rate=args.learning_rate,
        use_baseline=args.use_baseline,
        use_reward_to_go=args.use_reward_to_go,
        normalize_advantages=args.normalize_advantages,
        baseline_learning_rate=args.baseline_learning_rate,
        baseline_gradient_steps=args.baseline_gradient_steps,
        gae_lambda=args.gae_lambda,
    )

    total_envsteps = 0
    start_time = time.time()

    for itr in range(args.n_iter):
        print(f"\n********** Iteration {itr} ************")

        # sample `args.batch_size` transitions using utils.sample_trajectories
        trajs, envsteps_this_batch = utils.sample_trajectories(
            env, agent.actor, args.batch_size, False
        )
        total_envsteps += envsteps_this_batch

        # trajs is a list of dictionaries of NumPy arrays, where each dictionary
        # corresponds to one trajectory. The next line converts it into a single
        # dictionary of lists of NumPy arrays.
        trajs_dict = {k: [traj[k] for traj in trajs] for k in trajs[0]}

        # train the agent using the sampled trajectories and the agent's update function
        train_info: dict = agent.update(
            trajs_dict["observation"],
            trajs_dict["action"],
            trajs_dict["reward"],
            trajs_dict["terminal"],
        )

        if itr % args.scalar_log_freq == 0:
            # save eval metrics
            print("\nCollecting data for eval...")
            eval_trajs, eval_envsteps_this_batch = utils.sample_trajectories(
                env, agent.actor, args.eval_batch_size
            )

            logs = utils.compute_metrics(trajs, eval_trajs)

            # compute additional metrics
            logs.update(train_info)
            logs["Train_EnvstepsSoFar"] = total_envsteps
            logs["TimeSinceStart"] = time.time() - start_time
            if itr == 0:
                logs["Initial_DataCollection_AverageReturn"] = logs[
                    "Train_AverageReturn"
                ]

            # perform the logging
            for key, value in logs.items():
                print("{} : {}".format(key, value))
                logger.log_scalar(value, key, itr)
            print("Done logging...\n\n")

            logger.flush()

        if args.video_log_freq != -1 and itr % args.video_log_freq == 0:
            print("\nCollecting video rollouts...")
            eval_video_trajs = utils.sample_n_trajectories(
                env, agent.actor, MAX_NVIDEO, render=True
            )

            logger.log_trajs_as_videos(
                eval_video_trajs,
                itr,
                fps=fps,
                max_videos_to_save=MAX_NVIDEO,
                video_title="eval_rollouts",
            )


def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--env_name", type=str, required=True)
    parser.add_argument("--exp_name", type=str, required=True)
    parser.add_argument("--n_iter", "-n", type=int, default=200)

    parser.add_argument("--use_reward_to_go", "-rtg", action="store_true")
    parser.add_argument("--use_baseline", action="store_true")
    parser.add_argument("--baseline_learning_rate", "-blr", type=float, default=5e-3)
    parser.add_argument("--baseline_gradient_steps", "-bgs", type=int, default=5)
    parser.add_argument("--gae_lambda", type=float, default=None)
    parser.add_argument("--normalize_advantages", "-na", action="store_true")
    parser.add_argument(
        "--batch_size", "-b", type=int, default=1000
    )  # steps collected per train iteration
    parser.add_argument(
        "--eval_batch_size", "-eb", type=int, default=400
    )  # steps collected per eval iteration

    parser.add_argument("--discount", type=float, default=1.0)
    parser.add_argument("--learning_rate", "-lr", type=float, default=5e-3)
    parser.add_argument("--n_layers", "-l", type=int, default=2)
    parser.add_argument("--layer_size", "-s", type=int, default=64)

    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--no_gpu", "-ngpu", action="store_true")
    parser.add_argument("--which_gpu", "-gpu_id", type=int, default=0)
    parser.add_argument("--video_log_freq", type=int, default=-1)
    parser.add_argument("--scalar_log_freq", type=int, default=1)

    parser.add_argument("--action_noise_std", type=float, default=0)
    parser.add_argument("--data_path", type=str, default="./data")

    args = parser.parse_args()

    # create directory for logging
    logdir_prefix = "pg_"  # keep for autograder

    data_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), args.data_path
    )

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    logdir = (
        logdir_prefix
        + args.exp_name
        + "_"
        + args.env_name
        + "_"
        + time.strftime("%d-%m-%Y_%H-%M-%S")
    )
    logdir = os.path.join(data_path, logdir)
    args.logdir = logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    run_training_loop(args)


if __name__ == "__main__":
    main()
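
# Example invocation (illustrative only; the script filename "run_pg.py" and the
# CartPole-v1 environment id are assumptions, not fixed by this file; any Gymnasium
# env id can be passed via --env_name). This runs reward-to-go policy gradient with
# a learned baseline and advantage normalization:
#
#   python run_pg.py --env_name CartPole-v1 --exp_name cartpole_rtg_na \
#       -n 100 -b 1000 -rtg --use_baseline -na
#
# Scalars are logged every --scalar_log_freq iterations under --data_path; video
# rollouts are only saved if --video_log_freq is set to a positive value.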