init
23
hw3/README.md
Normal file
@@ -0,0 +1,23 @@
## Set up the environment

```
# Remove 'pytorch-cuda=11.7 -c pytorch -c nvidia' if you are a Mac user or are not going to use a GPU
conda install pytorch==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia
pip install 'gymnasium[classic_control]==0.27.1'
pip install matplotlib==3.7.1
pip install tensorboardX==2.6.4
```
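
After installing, a quick sanity check such as the one below (an optional suggestion, not part of the assignment) confirms that the pinned packages import cleanly:

```python
# Optional sanity check: verify the core dependencies are importable.
import torch
import gymnasium
import matplotlib
import tensorboardX

print("torch", torch.__version__)
print("gymnasium", gymnasium.__version__)
```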

## Complete the code

The files that you are going to implement are:

- `src/pg_agent.py`
- `src/policies.py`
- `src/critics.py`
- `src/utils.py`

See the [Assignment PDF](hw3.pdf) for more instructions.

## Submission

Submit your code and training logs, as well as your report, on Canvas.
BIN
hw3/hw3.pdf
Normal file
Binary file not shown.
188
hw3/run.py
Normal file
@@ -0,0 +1,188 @@
import os
import time

import gymnasium as gym
import numpy as np
import torch

from src.pg_agent import PGAgent
from src import pytorch_util as ptu
from src import utils
from src.logger import Logger
from src.action_noise_wrapper import ActionNoiseWrapper

MAX_NVIDEO = 2


def run_training_loop(args):
    logger = Logger(args.logdir)

    # set random seeds
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    ptu.init_gpu(use_gpu=not args.no_gpu, gpu_id=args.which_gpu)

    # make the gym environment
    env = gym.make(args.env_name, render_mode=None)
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # add action noise, if needed
    if args.action_noise_std > 0:
        assert not discrete, f"Cannot use --action_noise_std for discrete environment {args.env_name}"
        env = ActionNoiseWrapper(env, args.seed, args.action_noise_std)

    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # simulation timestep, will be used for video saving
    if hasattr(env, "model"):
        fps = 1 / env.model.opt.timestep
    else:
        fps = env.env.metadata["render_fps"]

    # initialize agent
    agent = PGAgent(
        ob_dim,
        ac_dim,
        discrete,
        n_layers=args.n_layers,
        layer_size=args.layer_size,
        gamma=args.discount,
        learning_rate=args.learning_rate,
        use_baseline=args.use_baseline,
        use_reward_to_go=args.use_reward_to_go,
        normalize_advantages=args.normalize_advantages,
        baseline_learning_rate=args.baseline_learning_rate,
        baseline_gradient_steps=args.baseline_gradient_steps,
        gae_lambda=args.gae_lambda,
    )

    total_envsteps = 0
    start_time = time.time()

    for itr in range(args.n_iter):
        print(f"\n********** Iteration {itr} ************")
        # sample `args.batch_size` transitions using utils.sample_trajectories
        trajs, envsteps_this_batch = utils.sample_trajectories(
            env, agent.actor, args.batch_size, False
        )

        total_envsteps += envsteps_this_batch

        # trajs is a list of dictionaries of NumPy arrays, where each dictionary corresponds to one trajectory.
        # this line converts it into a single dictionary of lists of NumPy arrays.
        trajs_dict = {k: [traj[k] for traj in trajs] for k in trajs[0]}
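        # For example (illustrative shapes only), two trajectories
        #   [{"reward": r1, "observation": o1, ...}, {"reward": r2, "observation": o2, ...}]
        # become {"reward": [r1, r2], "observation": [o1, o2], ...},
        # which is the per-key layout that agent.update expects below.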

        # train the agent using the sampled trajectories and the agent's update function
        train_info: dict = agent.update(
            trajs_dict["observation"], trajs_dict["action"], trajs_dict["reward"], trajs_dict["terminal"]
        )

        if itr % args.scalar_log_freq == 0:
            # save eval metrics
            print("\nCollecting data for eval...")
            eval_trajs, eval_envsteps_this_batch = utils.sample_trajectories(
                env, agent.actor, args.eval_batch_size,
            )

            logs = utils.compute_metrics(trajs, eval_trajs)
            # compute additional metrics
            logs.update(train_info)
            logs["Train_EnvstepsSoFar"] = total_envsteps
            logs["TimeSinceStart"] = time.time() - start_time
            if itr == 0:
                logs["Initial_DataCollection_AverageReturn"] = logs["Train_AverageReturn"]

            # perform the logging
            for key, value in logs.items():
                print("{} : {}".format(key, value))
                logger.log_scalar(value, key, itr)
            print("Done logging...\n\n")

            logger.flush()

        if args.video_log_freq != -1 and itr % args.video_log_freq == 0:
            print("\nCollecting video rollouts...")
            eval_video_trajs = utils.sample_n_trajectories(
                env, agent.actor, MAX_NVIDEO, render=True
            )

            logger.log_trajs_as_videos(
                eval_video_trajs,
                itr,
                fps=fps,
                max_videos_to_save=MAX_NVIDEO,
                video_title="eval_rollouts",
            )


def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--env_name", type=str, required=True)
    parser.add_argument("--exp_name", type=str, required=True)
    parser.add_argument("--n_iter", "-n", type=int, default=200)

    parser.add_argument("--use_reward_to_go", "-rtg", action="store_true")
    parser.add_argument("--use_baseline", action="store_true")
    parser.add_argument("--baseline_learning_rate", "-blr", type=float, default=5e-3)
    parser.add_argument("--baseline_gradient_steps", "-bgs", type=int, default=5)
    parser.add_argument("--gae_lambda", type=float, default=None)
    parser.add_argument("--normalize_advantages", "-na", action="store_true")
    parser.add_argument(
        "--batch_size", "-b", type=int, default=1000
    )  # steps collected per train iteration
    parser.add_argument(
        "--eval_batch_size", "-eb", type=int, default=400
    )  # steps collected per eval iteration

    parser.add_argument("--discount", type=float, default=1.0)
    parser.add_argument("--learning_rate", "-lr", type=float, default=5e-3)
    parser.add_argument("--n_layers", "-l", type=int, default=2)
    parser.add_argument("--layer_size", "-s", type=int, default=64)

    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--no_gpu", "-ngpu", action="store_true")
    parser.add_argument("--which_gpu", "-gpu_id", type=int, default=0)
    parser.add_argument("--video_log_freq", type=int, default=-1)
    parser.add_argument("--scalar_log_freq", type=int, default=1)

    parser.add_argument("--action_noise_std", type=float, default=0)

    parser.add_argument("--data_path", type=str, default="./data")
    args = parser.parse_args()

    # create directory for logging
    logdir_prefix = "pg_"  # keep for autograder

    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), args.data_path)

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    logdir = (
        logdir_prefix
        + args.exp_name
        + "_"
        + args.env_name
        + "_"
        + time.strftime("%d-%m-%Y_%H-%M-%S")
    )
    logdir = os.path.join(data_path, logdir)
    args.logdir = logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    run_training_loop(args)


if __name__ == "__main__":
    main()
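

# Example invocation (illustrative only; environment and flag choices are up to you):
#   python run.py --env_name CartPole-v1 --exp_name cartpole_rtg -n 100 -rtg -na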
BIN
hw3/src/__pycache__/action_noise_wrapper.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/critics.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/logger.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/pg_agent.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/policies.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/pytorch_util.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/utils.cpython-311.pyc
Normal file
Binary file not shown.
12
hw3/src/action_noise_wrapper.py
Normal file
@@ -0,0 +1,12 @@
import gymnasium as gym
import numpy as np


class ActionNoiseWrapper(gym.ActionWrapper):
    def __init__(self, env, seed, std):
        super().__init__(env)
        self.rng = np.random.default_rng(seed)
        self.std = std

    def action(self, act):
        act = act + self.rng.normal(0, self.std, act.shape)
        return act
63
hw3/src/critics.py
Normal file
@@ -0,0 +1,63 @@
import itertools

import numpy as np
import torch
from torch import distributions
from torch import nn
from torch import optim
from torch.nn import functional as F

import src.pytorch_util as ptu


class ValueCritic(nn.Module):
    """Value network, which takes an observation and outputs a value for that observation."""

    def __init__(
        self,
        ob_dim: int,
        n_layers: int,
        layer_size: int,
        learning_rate: float,
    ):
        super().__init__()

        self.network = ptu.build_mlp(
            input_size=ob_dim,
            output_size=1,
            n_layers=n_layers,
            size=layer_size,
        ).to(ptu.device)

        self.optimizer = optim.Adam(
            self.network.parameters(),
            learning_rate,
        )

    def forward(self, obs: torch.Tensor) -> torch.Tensor:
        # implement the forward pass of the critic network
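        # (One possible form, for illustration only: the network outputs shape
        #  [batch, 1], so it is common to squeeze the last dimension, e.g.
        #  values = self.network(obs).squeeze(-1).)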

        values = None

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################

        return values

    def update(self, obs: np.ndarray, q_values: np.ndarray) -> dict:
        obs = ptu.from_numpy(obs)
        q_values = ptu.from_numpy(q_values)

        # compute loss, update the critic using the observations and q_values
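        # Illustrative sketch only (assuming a mean-squared-error fit of the
        # predicted values to the Monte Carlo Q-value targets):
        #   loss = F.mse_loss(self(obs), q_values)
        #   self.optimizer.zero_grad()
        #   loss.backward()
        #   self.optimizer.step()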
        loss = None
        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################

        return {
            "Baseline Loss": ptu.to_numpy(loss),
        }
74
hw3/src/logger.py
Normal file
@@ -0,0 +1,74 @@
import os

import numpy as np
from tensorboardX import SummaryWriter


class Logger:
    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
        self._log_dir = log_dir
        print('########################')
        print('logging outputs to ', log_dir)
        print('########################')
        self._n_logged_samples = n_logged_samples
        self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)

    def log_scalar(self, scalar, name, step_):
        self._summ_writer.add_scalar('{}'.format(name), scalar, step_)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)

    def log_image(self, image, name, step):
        assert len(image.shape) == 3  # [C, H, W]
        self._summ_writer.add_image('{}'.format(name), image, step)

    def log_video(self, video_frames, name, step, fps=10):
        assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)

    def log_trajs_as_videos(self, trajs, step, max_videos_to_save=2, fps=10, video_title='video'):
        # reshape the rollouts
        videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in trajs]

        # max rollout length
        max_videos_to_save = np.min([max_videos_to_save, len(videos)])
        max_length = videos[0].shape[0]
        for i in range(max_videos_to_save):
            if videos[i].shape[0] > max_length:
                max_length = videos[i].shape[0]

        # pad rollouts to all be same length
        for i in range(max_videos_to_save):
            if videos[i].shape[0] < max_length:
                padding = np.tile([videos[i][-1]], (max_length - videos[i].shape[0], 1, 1, 1))
                videos[i] = np.concatenate([videos[i], padding], 0)

        # log videos to tensorboard event file
        videos = np.stack(videos[:max_videos_to_save], 0)
        self.log_video(videos, video_title, step, fps=fps)

    def log_figures(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_figure(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_graph(self, array, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        im = plot_graph(array)
        self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)

    def dump_scalars(self, log_path=None):
        log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)

    def flush(self):
        self._summ_writer.flush()
186
hw3/src/pg_agent.py
Normal file
@@ -0,0 +1,186 @@
from typing import Optional, Sequence

import numpy as np
import torch
from torch import nn

from src.policies import MLPPolicyPG
from src.critics import ValueCritic
import src.pytorch_util as ptu


class PGAgent(nn.Module):
    def __init__(
        self,
        ob_dim: int,
        ac_dim: int,
        discrete: bool,
        n_layers: int,
        layer_size: int,
        gamma: float,
        learning_rate: float,
        use_baseline: bool,
        use_reward_to_go: bool,
        baseline_learning_rate: Optional[float],
        baseline_gradient_steps: Optional[int],
        gae_lambda: Optional[float],
        normalize_advantages: bool,
    ):
        super().__init__()

        # create the actor (policy) network
        self.actor = MLPPolicyPG(
            ac_dim, ob_dim, discrete, n_layers, layer_size, learning_rate
        )

        # create the critic (baseline) network, if needed
        if use_baseline:
            self.critic = ValueCritic(
                ob_dim, n_layers, layer_size, baseline_learning_rate
            )
            self.baseline_gradient_steps = baseline_gradient_steps
        else:
            self.critic = None

        # other agent parameters
        self.gamma = gamma
        self.use_reward_to_go = use_reward_to_go
        self.gae_lambda = gae_lambda
        self.normalize_advantages = normalize_advantages

    def update(
        self,
        obs: Sequence[np.ndarray],
        actions: Sequence[np.ndarray],
        rewards: Sequence[np.ndarray],
        terminals: Sequence[np.ndarray],
    ) -> dict:
        """The train step for PG involves updating its actor using the given observations/actions and the calculated
        qvals/advantages that come from the seen rewards.

        Each input is a list of NumPy arrays, where each array corresponds to a single trajectory. The batch size is
        the total number of samples across all trajectories (i.e. the sum of the lengths of all the arrays).
        """

        # step 1: calculate Q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T)
        q_values: Sequence[np.ndarray] = self._calculate_q_vals(rewards)

        obs = np.concatenate(obs)
        actions = np.concatenate(actions)
        rewards = np.concatenate(rewards)
        terminals = np.concatenate(terminals)
        q_values = np.concatenate(q_values)

        # step 2: calculate advantages from Q values
        advantages: np.ndarray = self._estimate_advantage(
            obs, rewards, q_values, terminals
        )

        # step 3: use all datapoints (s_t, a_t, adv_t) to update the PG actor/policy
        # update the PG actor/policy network once using the advantages
        info: dict = self.actor.update(obs, actions, advantages)

        # step 4: if needed, use all datapoints (s_t, a_t, q_t) to update the PG critic/baseline
        if self.critic is not None:
            # perform `self.baseline_gradient_steps` updates to the critic/baseline network
            critic_info: dict = None
            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################

            info.update(critic_info)

        return info

    def _calculate_q_vals(self, rewards: Sequence[np.ndarray]) -> Sequence[np.ndarray]:
        """Monte Carlo estimation of the Q function."""

        if not self.use_reward_to_go:
            # Case 1: in trajectory-based PG, we ignore the timestep and instead use the discounted return for the
            # entire trajectory at each point.
            # In other words: Q(s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'}
            # TODO: use the helper function self._discounted_return to calculate the Q-values
            q_values = None
            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################

        else:
            # Case 2: in reward-to-go PG, we only use the rewards after timestep t to estimate the Q-value
            # for (s_t, a_t).
            # In other words: Q(s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
            # TODO: use the helper function self._discounted_reward_to_go to calculate the Q-values
            q_values = None

            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################

        return q_values

    def _estimate_advantage(
        self,
        obs: np.ndarray,
        rewards: np.ndarray,
        q_values: np.ndarray,
        terminals: np.ndarray,
    ) -> np.ndarray:
        """Computes advantages by (possibly) subtracting a value baseline from the estimated Q-values.

        Operates on flat 1D NumPy arrays.
        """
        if self.critic is None:
            advantages = q_values.copy()
        else:
            # run the critic and use it as a baseline to compute values and advantages
            values = None
            advantages = None
            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################
            assert values.shape == q_values.shape

        # normalize the advantages to have a mean of zero and a standard deviation of one within the batch
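        # (Illustrative sketch only, assuming a small epsilon for numerical stability:
        #  advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8).)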
        if self.normalize_advantages:
            advantages = None
            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################

        return advantages

    def _discounted_return(self, rewards: Sequence[float]) -> Sequence[float]:
        """
        Helper function which takes a list of rewards {r_0, r_1, ..., r_t', ... r_T} and returns
        a list where each index t contains sum_{t'=0}^T gamma^t' r_{t'}.

        Note that all entries of the output list should be exactly the same, because each sum runs from 0 to T
        (and does not involve t)!

        Use self.gamma as the discount factor.
        """

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################
        pass

    def _discounted_reward_to_go(self, rewards: Sequence[float]) -> Sequence[float]:
        """
        Helper function which takes a list of rewards {r_0, r_1, ..., r_t', ... r_T} and returns a list where the
        entry at each index t is sum_{t'=t}^T gamma^(t'-t) * r_{t'}.

        Use self.gamma as the discount factor.
        """

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################
        pass
124
hw3/src/policies.py
Normal file
@@ -0,0 +1,124 @@
import itertools

import numpy as np
import torch
from torch import distributions
from torch import nn
from torch import optim
from torch.nn import functional as F

import src.pytorch_util as ptu


class MLPPolicy(nn.Module):
    """Base MLP policy, which can take an observation and output a distribution over actions.

    This class should implement the `forward` and `get_action` methods. The `update` method should be written in the
    subclasses, since the policy update rule differs for different algorithms.
    """

    def __init__(
        self,
        ac_dim: int,
        ob_dim: int,
        discrete: bool,
        n_layers: int,
        layer_size: int,
        learning_rate: float,
    ):
        super().__init__()

        if discrete:
            self.logits_net = ptu.build_mlp(
                input_size=ob_dim,
                output_size=ac_dim,
                n_layers=n_layers,
                size=layer_size,
            ).to(ptu.device)
            parameters = self.logits_net.parameters()
        else:
            self.mean_net = ptu.build_mlp(
                input_size=ob_dim,
                output_size=ac_dim,
                n_layers=n_layers,
                size=layer_size,
            ).to(ptu.device)
            self.logstd = nn.Parameter(
                torch.zeros(ac_dim, dtype=torch.float32, device=ptu.device)
            )
            parameters = itertools.chain([self.logstd], self.mean_net.parameters())

        self.optimizer = optim.Adam(
            parameters,
            learning_rate,
        )

        self.discrete = discrete

    @torch.no_grad()
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        """Takes a single observation (as a numpy array) and returns a single action (as a numpy array)."""
        # get action from the policy for a single observation
        action = None

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################

        return action

    def forward(self, obs: torch.FloatTensor):
        """
        This function defines the forward pass of the network. You can return anything you want, but you should be
        able to differentiate through it. For example, you can return a torch.FloatTensor. You can also return more
        flexible objects, such as a `torch.distributions.Distribution` object. It's up to you!
        """
        if self.discrete:
            # define the forward pass for a policy with a discrete action space.
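            # (One possible sketch, for illustration only:
            #  action = distributions.Categorical(logits=self.logits_net(obs)).)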
            action = None

            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################

        else:
            # define the forward pass for a policy with a continuous action space.
            mean_prob = self.mean_net(obs)
            std_prob = torch.exp(self.logstd)
            action = distributions.MultivariateNormal(mean_prob, scale_tril=torch.diag(std_prob))

        return action

    def update(self, obs: np.ndarray, actions: np.ndarray, *args, **kwargs) -> dict:
        """Performs one iteration of gradient descent on the provided batch of data."""
        raise NotImplementedError


class MLPPolicyPG(MLPPolicy):
    """Policy subclass for the policy gradient algorithm."""

    def update(
        self,
        obs: np.ndarray,
        actions: np.ndarray,
        advantages: np.ndarray,
    ) -> dict:
        """Implements the policy gradient actor update."""
        obs = ptu.from_numpy(obs)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # compute loss, implement the policy gradient actor update.
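        # Illustrative sketch of one common form of the objective (not necessarily
        # the exact variant the assignment expects):
        #   loss = -(self(obs).log_prob(actions) * advantages).mean()
        #   self.optimizer.zero_grad()
        #   loss.backward()
        #   self.optimizer.step()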
        loss = None

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################

        return {
            "Actor Loss": ptu.to_numpy(loss),
        }
84
hw3/src/pytorch_util.py
Normal file
@@ -0,0 +1,84 @@
from typing import Union

import torch
from torch import nn

Activation = Union[str, nn.Module]


_str_to_activation = {
    'relu': nn.ReLU(),
    'tanh': nn.Tanh(),
    'leaky_relu': nn.LeakyReLU(),
    'sigmoid': nn.Sigmoid(),
    'selu': nn.SELU(),
    'softplus': nn.Softplus(),
    'identity': nn.Identity(),
}

device = None


def build_mlp(
    input_size: int,
    output_size: int,
    n_layers: int,
    size: int,
    activation: Activation = 'tanh',
    output_activation: Activation = 'identity',
):
    """
    Builds a feedforward neural network.

    arguments:
        input_size: size of the input layer
        output_size: size of the output layer
        n_layers: number of hidden layers
        size: dimension of each hidden layer
        activation: activation of each hidden layer
        output_activation: activation of the output layer

    returns:
        mlp: an nn.Sequential module mapping inputs of shape (batch_size, input_size)
             to outputs of shape (batch_size, output_size)
    """
    if isinstance(activation, str):
        activation = _str_to_activation[activation]
    if isinstance(output_activation, str):
        output_activation = _str_to_activation[output_activation]
    layers = []
    in_size = input_size
    for _ in range(n_layers):
        layers.append(nn.Linear(in_size, size))
        layers.append(activation)
        in_size = size
    layers.append(nn.Linear(in_size, output_size))
    layers.append(output_activation)

    mlp = nn.Sequential(*layers)
    mlp.to(device)
    return mlp
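
# Example usage (illustrative): a 2-hidden-layer tanh MLP mapping a 4-dimensional
# observation to 2 action logits:
#   net = build_mlp(input_size=4, output_size=2, n_layers=2, size=64)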


def init_gpu(use_gpu=True, gpu_id=0):
    global device
    if torch.cuda.is_available() and use_gpu:
        device = torch.device("cuda:" + str(gpu_id))
        print("Using GPU id {}".format(gpu_id))
    else:
        device = torch.device("cpu")
        print("Using CPU.")


def set_device(gpu_id):
    torch.cuda.set_device(gpu_id)


def from_numpy(*args, **kwargs):
    return torch.from_numpy(*args, **kwargs).float().to(device)


def to_numpy(tensor):
    return tensor.to('cpu').detach().numpy()
144
hw3/src/utils.py
Normal file
@@ -0,0 +1,144 @@
from collections import OrderedDict
from typing import Dict, Tuple, List
import copy

import cv2
import gymnasium as gym
import numpy as np

from src.policies import MLPPolicy
import src.pytorch_util as ptu

############################################
############################################


def sample_trajectory(
    env: gym.Env, policy: MLPPolicy, render: bool = False
) -> Dict[str, np.ndarray]:
    """Sample a rollout in the environment from a policy."""
    ob, _ = env.reset()
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:
        # render an image
        if render:
            if hasattr(env, "sim"):
                img = env.sim.render(camera_name="track", height=500, width=500)[::-1]
            else:
                img = env.render(mode="single_rgb_array")
            image_obs.append(
                cv2.resize(img, dsize=(250, 250), interpolation=cv2.INTER_CUBIC)
            )

        ac, rew, next_ob, rollout_done = None, None, None, False
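
        # Illustrative sketch of the step logic (assuming the gymnasium 5-tuple step API;
        # not necessarily the exact form the assignment expects):
        #   ac = policy.get_action(ob)
        #   next_ob, rew, terminated, truncated, _ = env.step(ac)
        #   rollout_done = terminated or truncated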

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################

        # record result of taking that action
        obs.append(ob)
        acs.append(ac)
        rewards.append(rew)
        next_obs.append(next_ob)
        terminals.append(rollout_done)

        ob = next_ob  # jump to next timestep

        # end the rollout if the rollout ended
        if rollout_done:
            break

    return {
        "observation": np.array(obs, dtype=np.float32),
        "image_obs": np.array(image_obs, dtype=np.uint8),
        "reward": np.array(rewards, dtype=np.float32),
        "action": np.array(acs, dtype=np.float32),
        "next_observation": np.array(next_obs, dtype=np.float32),
        "terminal": np.array(terminals, dtype=np.float32),
    }


def sample_trajectories(
    env: gym.Env,
    policy: MLPPolicy,
    min_timesteps_per_batch: int,
    render: bool = False,
) -> Tuple[List[Dict[str, np.ndarray]], int]:
    """Collect rollouts using policy until we have collected min_timesteps_per_batch steps."""
    timesteps_this_batch = 0
    trajs = []
    while timesteps_this_batch < min_timesteps_per_batch:
        # collect rollout
        traj = sample_trajectory(env, policy, render)
        trajs.append(traj)

        # count steps
        timesteps_this_batch += get_traj_length(traj)
    return trajs, timesteps_this_batch


def sample_n_trajectories(
    env: gym.Env, policy: MLPPolicy, ntraj: int, render: bool = False
):
    """Collect ntraj rollouts."""
    trajs = []
    for _ in range(ntraj):
        # collect rollout
        traj = sample_trajectory(env, policy, render)
        trajs.append(traj)
    return trajs


def compute_metrics(trajs, eval_trajs):
    """Compute metrics for logging."""

    # returns, for logging
    train_returns = [traj["reward"].sum() for traj in trajs]
    eval_returns = [eval_traj["reward"].sum() for eval_traj in eval_trajs]

    # episode lengths, for logging
    train_ep_lens = [len(traj["reward"]) for traj in trajs]
    eval_ep_lens = [len(eval_traj["reward"]) for eval_traj in eval_trajs]

    # decide what to log
    logs = OrderedDict()
    logs["Eval_AverageReturn"] = np.mean(eval_returns)
    logs["Eval_StdReturn"] = np.std(eval_returns)
    logs["Eval_MaxReturn"] = np.max(eval_returns)
    logs["Eval_MinReturn"] = np.min(eval_returns)
    logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

    logs["Train_AverageReturn"] = np.mean(train_returns)
    logs["Train_StdReturn"] = np.std(train_returns)
    logs["Train_MaxReturn"] = np.max(train_returns)
    logs["Train_MinReturn"] = np.min(train_returns)
    logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

    return logs


def convert_listofrollouts(trajs):
    """
    Take a list of rollout dictionaries and return separate arrays, where each array is a concatenation of that array
    from across the rollouts.
    """
    observations = np.concatenate([traj["observation"] for traj in trajs])
    actions = np.concatenate([traj["action"] for traj in trajs])
    next_observations = np.concatenate([traj["next_observation"] for traj in trajs])
    terminals = np.concatenate([traj["terminal"] for traj in trajs])
    concatenated_rewards = np.concatenate([traj["reward"] for traj in trajs])
    unconcatenated_rewards = [traj["reward"] for traj in trajs]
    return (
        observations,
        actions,
        next_observations,
        terminals,
        concatenated_rewards,
        unconcatenated_rewards,
    )


def get_traj_length(traj):
    return len(traj["reward"])