CSE5100H3/hw3/run.py

import os
import time

import gymnasium as gym
import numpy as np
import torch

from src.pg_agent import PGAgent
from src import pytorch_util as ptu
from src import utils
from src.logger import Logger
from src.action_noise_wrapper import ActionNoiseWrapper

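# maximum number of eval rollouts rendered and saved as video per video-logging step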
MAX_NVIDEO = 2

def run_training_loop(args):
    logger = Logger(args.logdir)

    # set random seeds
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    ptu.init_gpu(use_gpu=not args.no_gpu, gpu_id=args.which_gpu)

    # make the gym environment
    env = gym.make(args.env_name, render_mode=None)
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # add action noise, if needed
    if args.action_noise_std > 0:
        assert not discrete, f"Cannot use --action_noise_std for discrete environment {args.env_name}"
        env = ActionNoiseWrapper(env, args.seed, args.action_noise_std)

    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # simulation timestep, will be used for video saving
    if hasattr(env, "model"):
        fps = 1 / env.model.opt.timestep
    else:
        fps = env.env.metadata["render_fps"]
    # initialize agent
    agent = PGAgent(
        ob_dim,
        ac_dim,
        discrete,
        n_layers=args.n_layers,
        layer_size=args.layer_size,
        gamma=args.discount,
        learning_rate=args.learning_rate,
        use_baseline=args.use_baseline,
        use_reward_to_go=args.use_reward_to_go,
        normalize_advantages=args.normalize_advantages,
        baseline_learning_rate=args.baseline_learning_rate,
        baseline_gradient_steps=args.baseline_gradient_steps,
        gae_lambda=args.gae_lambda,
    )
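    # running counters used by the periodic logging below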
    total_envsteps = 0
    start_time = time.time()

    for itr in range(args.n_iter):
        print(f"\n********** Iteration {itr} ************")

        # sample `args.batch_size` transitions using utils.sample_trajectories
        trajs, envsteps_this_batch = utils.sample_trajectories(
            env, agent.actor, args.batch_size, False
        )
        total_envsteps += envsteps_this_batch

        # trajs is a list of dictionaries of NumPy arrays, where each dictionary corresponds
        # to a trajectory; this converts it into a single dictionary of lists of NumPy arrays
        trajs_dict = {k: [traj[k] for traj in trajs] for k in trajs[0]}

        # train the agent on the sampled trajectories via agent.update
        train_info: dict = agent.update(
            trajs_dict["observation"],
            trajs_dict["action"],
            trajs_dict["reward"],
            trajs_dict["terminal"],
        )
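        # train_info is assumed to be a dict of scalar training statistics returned by
        # PGAgent.update (e.g. losses); it is merged into the eval logs below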

        if itr % args.scalar_log_freq == 0:
            # save eval metrics
            print("\nCollecting data for eval...")
            eval_trajs, eval_envsteps_this_batch = utils.sample_trajectories(
                env, agent.actor, args.eval_batch_size
            )

            logs = utils.compute_metrics(trajs, eval_trajs)
            # compute additional metrics
            logs.update(train_info)
            logs["Train_EnvstepsSoFar"] = total_envsteps
            logs["TimeSinceStart"] = time.time() - start_time
            if itr == 0:
                logs["Initial_DataCollection_AverageReturn"] = logs["Train_AverageReturn"]

            # perform the logging
            for key, value in logs.items():
                print("{} : {}".format(key, value))
                logger.log_scalar(value, key, itr)
            print("Done logging...\n\n")

            logger.flush()
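        # optionally render a few eval rollouts and log them as videos
        # (disabled when --video_log_freq is left at its default of -1)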
        if args.video_log_freq != -1 and itr % args.video_log_freq == 0:
            print("\nCollecting video rollouts...")
            eval_video_trajs = utils.sample_n_trajectories(
                env, agent.actor, MAX_NVIDEO, render=True
            )

            logger.log_trajs_as_videos(
                eval_video_trajs,
                itr,
                fps=fps,
                max_videos_to_save=MAX_NVIDEO,
                video_title="eval_rollouts",
            )

def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--env_name", type=str, required=True)
    parser.add_argument("--exp_name", type=str, required=True)
    parser.add_argument("--n_iter", "-n", type=int, default=200)

    parser.add_argument("--use_reward_to_go", "-rtg", action="store_true")
    parser.add_argument("--use_baseline", action="store_true")
    parser.add_argument("--baseline_learning_rate", "-blr", type=float, default=5e-3)
    parser.add_argument("--baseline_gradient_steps", "-bgs", type=int, default=5)
    parser.add_argument("--gae_lambda", type=float, default=None)
    parser.add_argument("--normalize_advantages", "-na", action="store_true")
    parser.add_argument(
        "--batch_size", "-b", type=int, default=1000
    )  # steps collected per train iteration
    parser.add_argument(
        "--eval_batch_size", "-eb", type=int, default=400
    )  # steps collected per eval iteration

    parser.add_argument("--discount", type=float, default=1.0)
    parser.add_argument("--learning_rate", "-lr", type=float, default=5e-3)
    parser.add_argument("--n_layers", "-l", type=int, default=2)
    parser.add_argument("--layer_size", "-s", type=int, default=64)

    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--no_gpu", "-ngpu", action="store_true")
    parser.add_argument("--which_gpu", "-gpu_id", type=int, default=0)
    parser.add_argument("--video_log_freq", type=int, default=-1)  # -1 disables video logging
    parser.add_argument("--scalar_log_freq", type=int, default=1)
    parser.add_argument("--action_noise_std", type=float, default=0)
    parser.add_argument("--data_path", type=str, default="./data")

    args = parser.parse_args()
    # create directory for logging
    logdir_prefix = "pg_"  # keep for autograder
    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), args.data_path)
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    logdir = (
        logdir_prefix
        + args.exp_name
        + "_"
        + args.env_name
        + "_"
        + time.strftime("%d-%m-%Y_%H-%M-%S")
    )
    logdir = os.path.join(data_path, logdir)
    args.logdir = logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    run_training_loop(args)

if __name__ == "__main__":
    main()