init
23
hw3/README.md
Normal file
@@ -0,0 +1,23 @@
## Set up the environment

```
# Remove 'pytorch-cuda=11.7 -c pytorch -c nvidia' if you are a Mac user or are not going to use a GPU
conda install pytorch==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia
pip install 'gymnasium[classic_control]==0.27.1'
pip install matplotlib==3.7.1
pip install tensorboardX==2.6.4
```
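
After installing, a quick sanity check such as the one below (an optional suggestion, not part of the assignment) confirms that the pinned packages import cleanly:

```python
# Optional sanity check: verify the core dependencies are importable.
import torch
import gymnasium
import matplotlib
import tensorboardX

print("torch", torch.__version__)
print("gymnasium", gymnasium.__version__)
```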

## Complete the code

The files that you are going to implement are:

- `src/pg_agent.py`
- `src/policies.py`
- `src/critics.py`
- `src/utils.py`

See the [Assignment PDF](hw3.pdf) for more instructions.

## Submission

Submit your code and training logs, as well as your report, on Canvas.
BIN
hw3/hw3.pdf
Normal file
Binary file not shown.
188
hw3/run.py
Normal file
@@ -0,0 +1,188 @@
import os
import time

import gymnasium as gym
import numpy as np
import torch

from src.pg_agent import PGAgent
from src import pytorch_util as ptu
from src import utils
from src.logger import Logger
from src.action_noise_wrapper import ActionNoiseWrapper

MAX_NVIDEO = 2


def run_training_loop(args):
    logger = Logger(args.logdir)

    # set random seeds
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    ptu.init_gpu(use_gpu=not args.no_gpu, gpu_id=args.which_gpu)

    # make the gym environment
    env = gym.make(args.env_name, render_mode=None)
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # add action noise, if needed
    if args.action_noise_std > 0:
        assert not discrete, f"Cannot use --action_noise_std for discrete environment {args.env_name}"
        env = ActionNoiseWrapper(env, args.seed, args.action_noise_std)

    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # simulation timestep, will be used for video saving
    if hasattr(env, "model"):
        fps = 1 / env.model.opt.timestep
    else:
        fps = env.env.metadata["render_fps"]

    # initialize agent
    agent = PGAgent(
        ob_dim,
        ac_dim,
        discrete,
        n_layers=args.n_layers,
        layer_size=args.layer_size,
        gamma=args.discount,
        learning_rate=args.learning_rate,
        use_baseline=args.use_baseline,
        use_reward_to_go=args.use_reward_to_go,
        normalize_advantages=args.normalize_advantages,
        baseline_learning_rate=args.baseline_learning_rate,
        baseline_gradient_steps=args.baseline_gradient_steps,
        gae_lambda=args.gae_lambda,
    )

    total_envsteps = 0
    start_time = time.time()

    for itr in range(args.n_iter):
        print(f"\n********** Iteration {itr} ************")
        # sample `args.batch_size` transitions using utils.sample_trajectories
        trajs, envsteps_this_batch = utils.sample_trajectories(
            env, agent.actor, args.batch_size, False
        )

        total_envsteps += envsteps_this_batch

        # trajs is a list of dictionaries of NumPy arrays, where each dictionary corresponds to one trajectory.
        # this line converts it into a single dictionary of lists of NumPy arrays.
        trajs_dict = {k: [traj[k] for traj in trajs] for k in trajs[0]}
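        # For example (illustrative shapes only), two trajectories
        #   [{"reward": r1, "observation": o1, ...}, {"reward": r2, "observation": o2, ...}]
        # become {"reward": [r1, r2], "observation": [o1, o2], ...},
        # which is the per-key layout that agent.update expects below.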

        # train the agent using the sampled trajectories and the agent's update function
        train_info: dict = agent.update(
            trajs_dict["observation"], trajs_dict["action"], trajs_dict["reward"], trajs_dict["terminal"]
        )

        if itr % args.scalar_log_freq == 0:
            # save eval metrics
            print("\nCollecting data for eval...")
            eval_trajs, eval_envsteps_this_batch = utils.sample_trajectories(
                env, agent.actor, args.eval_batch_size,
            )

            logs = utils.compute_metrics(trajs, eval_trajs)
            # compute additional metrics
            logs.update(train_info)
            logs["Train_EnvstepsSoFar"] = total_envsteps
            logs["TimeSinceStart"] = time.time() - start_time
            if itr == 0:
                logs["Initial_DataCollection_AverageReturn"] = logs["Train_AverageReturn"]

            # perform the logging
            for key, value in logs.items():
                print("{} : {}".format(key, value))
                logger.log_scalar(value, key, itr)
            print("Done logging...\n\n")

            logger.flush()

        if args.video_log_freq != -1 and itr % args.video_log_freq == 0:
            print("\nCollecting video rollouts...")
            eval_video_trajs = utils.sample_n_trajectories(
                env, agent.actor, MAX_NVIDEO, render=True
            )

            logger.log_trajs_as_videos(
                eval_video_trajs,
                itr,
                fps=fps,
                max_videos_to_save=MAX_NVIDEO,
                video_title="eval_rollouts",
            )


def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--env_name", type=str, required=True)
    parser.add_argument("--exp_name", type=str, required=True)
    parser.add_argument("--n_iter", "-n", type=int, default=200)

    parser.add_argument("--use_reward_to_go", "-rtg", action="store_true")
    parser.add_argument("--use_baseline", action="store_true")
    parser.add_argument("--baseline_learning_rate", "-blr", type=float, default=5e-3)
    parser.add_argument("--baseline_gradient_steps", "-bgs", type=int, default=5)
    parser.add_argument("--gae_lambda", type=float, default=None)
    parser.add_argument("--normalize_advantages", "-na", action="store_true")
    parser.add_argument(
        "--batch_size", "-b", type=int, default=1000
    )  # steps collected per train iteration
    parser.add_argument(
        "--eval_batch_size", "-eb", type=int, default=400
    )  # steps collected per eval iteration

    parser.add_argument("--discount", type=float, default=1.0)
    parser.add_argument("--learning_rate", "-lr", type=float, default=5e-3)
    parser.add_argument("--n_layers", "-l", type=int, default=2)
    parser.add_argument("--layer_size", "-s", type=int, default=64)

    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--no_gpu", "-ngpu", action="store_true")
    parser.add_argument("--which_gpu", "-gpu_id", type=int, default=0)
    parser.add_argument("--video_log_freq", type=int, default=-1)
    parser.add_argument("--scalar_log_freq", type=int, default=1)

    parser.add_argument("--action_noise_std", type=float, default=0)

    parser.add_argument("--data_path", type=str, default="./data")
    args = parser.parse_args()

    # create directory for logging
    logdir_prefix = "pg_"  # keep for autograder

    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), args.data_path)

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    logdir = (
        logdir_prefix
        + args.exp_name
        + "_"
        + args.env_name
        + "_"
        + time.strftime("%d-%m-%Y_%H-%M-%S")
    )
    logdir = os.path.join(data_path, logdir)
    args.logdir = logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    run_training_loop(args)


if __name__ == "__main__":
    main()
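

# Example invocation (illustrative only; environment and flag choices are up to you):
#   python run.py --env_name CartPole-v1 --exp_name cartpole_rtg -n 100 -rtg -na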
BIN
hw3/src/__pycache__/action_noise_wrapper.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/critics.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/logger.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/pg_agent.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/policies.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/pytorch_util.cpython-311.pyc
Normal file
Binary file not shown.
BIN
hw3/src/__pycache__/utils.cpython-311.pyc
Normal file
Binary file not shown.
12
hw3/src/action_noise_wrapper.py
Normal file
@@ -0,0 +1,12 @@
import gymnasium as gym
import numpy as np


class ActionNoiseWrapper(gym.ActionWrapper):
    def __init__(self, env, seed, std):
        super().__init__(env)
        self.rng = np.random.default_rng(seed)
        self.std = std

    def action(self, act):
        act = act + self.rng.normal(0, self.std, act.shape)
        return act
63
hw3/src/critics.py
Normal file
@@ -0,0 +1,63 @@
import itertools

import numpy as np
import torch
from torch import distributions
from torch import nn
from torch import optim
from torch.nn import functional as F

import src.pytorch_util as ptu


class ValueCritic(nn.Module):
    """Value network, which takes an observation and outputs a value for that observation."""

    def __init__(
        self,
        ob_dim: int,
        n_layers: int,
        layer_size: int,
        learning_rate: float,
    ):
        super().__init__()

        self.network = ptu.build_mlp(
            input_size=ob_dim,
            output_size=1,
            n_layers=n_layers,
            size=layer_size,
        ).to(ptu.device)

        self.optimizer = optim.Adam(
            self.network.parameters(),
            learning_rate,
        )

    def forward(self, obs: torch.Tensor) -> torch.Tensor:
        # implement the forward pass of the critic network
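        # (One possible form, for illustration only: the network outputs shape
        #  [batch, 1], so it is common to squeeze the last dimension, e.g.
        #  values = self.network(obs).squeeze(-1).)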

        values = None

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################

        return values

    def update(self, obs: np.ndarray, q_values: np.ndarray) -> dict:
        obs = ptu.from_numpy(obs)
        q_values = ptu.from_numpy(q_values)

        # compute loss, update the critic using the observations and q_values
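        # Illustrative sketch only (assuming a mean-squared-error fit of the
        # predicted values to the Monte Carlo Q-value targets):
        #   loss = F.mse_loss(self(obs), q_values)
        #   self.optimizer.zero_grad()
        #   loss.backward()
        #   self.optimizer.step()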
        loss = None
        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################

        return {
            "Baseline Loss": ptu.to_numpy(loss),
        }
74
hw3/src/logger.py
Normal file
@@ -0,0 +1,74 @@
import os

import numpy as np
from tensorboardX import SummaryWriter


class Logger:
    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
        self._log_dir = log_dir
        print('########################')
        print('logging outputs to ', log_dir)
        print('########################')
        self._n_logged_samples = n_logged_samples
        self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)

    def log_scalar(self, scalar, name, step_):
        self._summ_writer.add_scalar('{}'.format(name), scalar, step_)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)

    def log_image(self, image, name, step):
        assert len(image.shape) == 3  # [C, H, W]
        self._summ_writer.add_image('{}'.format(name), image, step)

    def log_video(self, video_frames, name, step, fps=10):
        assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)

    def log_trajs_as_videos(self, trajs, step, max_videos_to_save=2, fps=10, video_title='video'):
        # reshape the rollouts
        videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in trajs]

        # max rollout length
        max_videos_to_save = np.min([max_videos_to_save, len(videos)])
        max_length = videos[0].shape[0]
        for i in range(max_videos_to_save):
            if videos[i].shape[0] > max_length:
                max_length = videos[i].shape[0]

        # pad rollouts to all be same length
        for i in range(max_videos_to_save):
            if videos[i].shape[0] < max_length:
                padding = np.tile([videos[i][-1]], (max_length - videos[i].shape[0], 1, 1, 1))
                videos[i] = np.concatenate([videos[i], padding], 0)

        # log videos to tensorboard event file
        videos = np.stack(videos[:max_videos_to_save], 0)
        self.log_video(videos, video_title, step, fps=fps)

    def log_figures(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_figure(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_graph(self, array, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        im = plot_graph(array)
        self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)

    def dump_scalars(self, log_path=None):
        log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)

    def flush(self):
        self._summ_writer.flush()
186
hw3/src/pg_agent.py
Normal file
@@ -0,0 +1,186 @@
from typing import Optional, Sequence

import numpy as np
import torch
from torch import nn

from src.policies import MLPPolicyPG
from src.critics import ValueCritic
import src.pytorch_util as ptu


class PGAgent(nn.Module):
    def __init__(
        self,
        ob_dim: int,
        ac_dim: int,
        discrete: bool,
        n_layers: int,
        layer_size: int,
        gamma: float,
        learning_rate: float,
        use_baseline: bool,
        use_reward_to_go: bool,
        baseline_learning_rate: Optional[float],
        baseline_gradient_steps: Optional[int],
        gae_lambda: Optional[float],
        normalize_advantages: bool,
    ):
        super().__init__()

        # create the actor (policy) network
        self.actor = MLPPolicyPG(
            ac_dim, ob_dim, discrete, n_layers, layer_size, learning_rate
        )

        # create the critic (baseline) network, if needed
        if use_baseline:
            self.critic = ValueCritic(
                ob_dim, n_layers, layer_size, baseline_learning_rate
            )
            self.baseline_gradient_steps = baseline_gradient_steps
        else:
            self.critic = None

        # other agent parameters
        self.gamma = gamma
        self.use_reward_to_go = use_reward_to_go
        self.gae_lambda = gae_lambda
        self.normalize_advantages = normalize_advantages

    def update(
        self,
        obs: Sequence[np.ndarray],
        actions: Sequence[np.ndarray],
        rewards: Sequence[np.ndarray],
        terminals: Sequence[np.ndarray],
    ) -> dict:
        """The train step for PG involves updating its actor using the given observations/actions and the calculated
        qvals/advantages that come from the seen rewards.

        Each input is a list of NumPy arrays, where each array corresponds to a single trajectory. The batch size is
        the total number of samples across all trajectories (i.e. the sum of the lengths of all the arrays).
        """

        # step 1: calculate Q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T)
        q_values: Sequence[np.ndarray] = self._calculate_q_vals(rewards)

        obs = np.concatenate(obs)
        actions = np.concatenate(actions)
        rewards = np.concatenate(rewards)
        terminals = np.concatenate(terminals)
        q_values = np.concatenate(q_values)

        # step 2: calculate advantages from Q values
        advantages: np.ndarray = self._estimate_advantage(
            obs, rewards, q_values, terminals
        )

        # step 3: use all datapoints (s_t, a_t, adv_t) to update the PG actor/policy
        # update the PG actor/policy network once using the advantages
        info: dict = self.actor.update(obs, actions, advantages)

        # step 4: if needed, use all datapoints (s_t, a_t, q_t) to update the PG critic/baseline
        if self.critic is not None:
            # perform `self.baseline_gradient_steps` updates to the critic/baseline network
            critic_info: dict = None
            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################

            info.update(critic_info)

        return info

    def _calculate_q_vals(self, rewards: Sequence[np.ndarray]) -> Sequence[np.ndarray]:
        """Monte Carlo estimation of the Q function."""

        if not self.use_reward_to_go:
            # Case 1: in trajectory-based PG, we ignore the timestep and instead use the discounted return for the
            # entire trajectory at each point.
            # In other words: Q(s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'}
            # TODO: use the helper function self._discounted_return to calculate the Q-values
            q_values = None
            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################

        else:
            # Case 2: in reward-to-go PG, we only use the rewards after timestep t to estimate the Q-value
            # for (s_t, a_t).
            # In other words: Q(s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
            # TODO: use the helper function self._discounted_reward_to_go to calculate the Q-values
            q_values = None

            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################

        return q_values

    def _estimate_advantage(
        self,
        obs: np.ndarray,
        rewards: np.ndarray,
        q_values: np.ndarray,
        terminals: np.ndarray,
    ) -> np.ndarray:
        """Computes advantages by (possibly) subtracting a value baseline from the estimated Q-values.

        Operates on flat 1D NumPy arrays.
        """
        if self.critic is None:
            advantages = q_values.copy()
        else:
            # run the critic and use it as a baseline to compute values and advantages
            values = None
            advantages = None
            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################
            assert values.shape == q_values.shape

        # normalize the advantages to have a mean of zero and a standard deviation of one within the batch
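        # (Illustrative sketch only, assuming a small epsilon for numerical stability:
        #  advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8).)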
        if self.normalize_advantages:
            advantages = None
            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################

        return advantages

    def _discounted_return(self, rewards: Sequence[float]) -> Sequence[float]:
        """
        Helper function which takes a list of rewards {r_0, r_1, ..., r_t', ... r_T} and returns
        a list where each index t contains sum_{t'=0}^T gamma^t' r_{t'}.

        Note that all entries of the output list should be exactly the same, because each sum runs from 0 to T
        (and does not involve t)!

        Use self.gamma as the discount factor.
        """

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################
        pass

    def _discounted_reward_to_go(self, rewards: Sequence[float]) -> Sequence[float]:
        """
        Helper function which takes a list of rewards {r_0, r_1, ..., r_t', ... r_T} and returns a list where the
        entry at each index t is sum_{t'=t}^T gamma^(t'-t) * r_{t'}.

        Use self.gamma as the discount factor.
        """

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################
        pass
124
hw3/src/policies.py
Normal file
@@ -0,0 +1,124 @@
import itertools

import numpy as np
import torch
from torch import distributions
from torch import nn
from torch import optim
from torch.nn import functional as F

import src.pytorch_util as ptu


class MLPPolicy(nn.Module):
    """Base MLP policy, which can take an observation and output a distribution over actions.

    This class should implement the `forward` and `get_action` methods. The `update` method should be written in the
    subclasses, since the policy update rule differs for different algorithms.
    """

    def __init__(
        self,
        ac_dim: int,
        ob_dim: int,
        discrete: bool,
        n_layers: int,
        layer_size: int,
        learning_rate: float,
    ):
        super().__init__()

        if discrete:
            self.logits_net = ptu.build_mlp(
                input_size=ob_dim,
                output_size=ac_dim,
                n_layers=n_layers,
                size=layer_size,
            ).to(ptu.device)
            parameters = self.logits_net.parameters()
        else:
            self.mean_net = ptu.build_mlp(
                input_size=ob_dim,
                output_size=ac_dim,
                n_layers=n_layers,
                size=layer_size,
            ).to(ptu.device)
            self.logstd = nn.Parameter(
                torch.zeros(ac_dim, dtype=torch.float32, device=ptu.device)
            )
            parameters = itertools.chain([self.logstd], self.mean_net.parameters())

        self.optimizer = optim.Adam(
            parameters,
            learning_rate,
        )

        self.discrete = discrete

    @torch.no_grad()
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        """Takes a single observation (as a numpy array) and returns a single action (as a numpy array)."""
        # get action from the policy for a single observation
        action = None

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################

        return action

    def forward(self, obs: torch.FloatTensor):
        """
        This function defines the forward pass of the network. You can return anything you want, but you should be
        able to differentiate through it. For example, you can return a torch.FloatTensor. You can also return more
        flexible objects, such as a `torch.distributions.Distribution` object. It's up to you!
        """
        if self.discrete:
            # define the forward pass for a policy with a discrete action space.
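            # (One possible sketch, for illustration only:
            #  action = distributions.Categorical(logits=self.logits_net(obs)).)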
            action = None

            ############################
            # YOUR IMPLEMENTATION HERE #

            ############################

        else:
            # define the forward pass for a policy with a continuous action space.
            mean_prob = self.mean_net(obs)
            std_prob = torch.exp(self.logstd)
            action = distributions.MultivariateNormal(mean_prob, scale_tril=torch.diag(std_prob))

        return action

    def update(self, obs: np.ndarray, actions: np.ndarray, *args, **kwargs) -> dict:
        """Performs one iteration of gradient descent on the provided batch of data."""
        raise NotImplementedError


class MLPPolicyPG(MLPPolicy):
    """Policy subclass for the policy gradient algorithm."""

    def update(
        self,
        obs: np.ndarray,
        actions: np.ndarray,
        advantages: np.ndarray,
    ) -> dict:
        """Implements the policy gradient actor update."""
        obs = ptu.from_numpy(obs)
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # compute loss, implement the policy gradient actor update.
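        # Illustrative sketch of one common form of the objective (not necessarily
        # the exact variant the assignment expects):
        #   loss = -(self(obs).log_prob(actions) * advantages).mean()
        #   self.optimizer.zero_grad()
        #   loss.backward()
        #   self.optimizer.step()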
        loss = None

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################

        return {
            "Actor Loss": ptu.to_numpy(loss),
        }
84
hw3/src/pytorch_util.py
Normal file
@@ -0,0 +1,84 @@
from typing import Union

import torch
from torch import nn

Activation = Union[str, nn.Module]


_str_to_activation = {
    'relu': nn.ReLU(),
    'tanh': nn.Tanh(),
    'leaky_relu': nn.LeakyReLU(),
    'sigmoid': nn.Sigmoid(),
    'selu': nn.SELU(),
    'softplus': nn.Softplus(),
    'identity': nn.Identity(),
}

device = None


def build_mlp(
    input_size: int,
    output_size: int,
    n_layers: int,
    size: int,
    activation: Activation = 'tanh',
    output_activation: Activation = 'identity',
):
    """
    Builds a feedforward neural network.

    arguments:
        input_size: size of the input layer
        output_size: size of the output layer
        n_layers: number of hidden layers
        size: dimension of each hidden layer
        activation: activation of each hidden layer
        output_activation: activation of the output layer

    returns:
        mlp: an nn.Sequential module mapping inputs of shape (batch_size, input_size)
             to outputs of shape (batch_size, output_size)
    """
    if isinstance(activation, str):
        activation = _str_to_activation[activation]
    if isinstance(output_activation, str):
        output_activation = _str_to_activation[output_activation]
    layers = []
    in_size = input_size
    for _ in range(n_layers):
        layers.append(nn.Linear(in_size, size))
        layers.append(activation)
        in_size = size
    layers.append(nn.Linear(in_size, output_size))
    layers.append(output_activation)

    mlp = nn.Sequential(*layers)
    mlp.to(device)
    return mlp
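
# Example usage (illustrative): a 2-hidden-layer tanh MLP mapping a 4-dimensional
# observation to 2 action logits:
#   net = build_mlp(input_size=4, output_size=2, n_layers=2, size=64)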


def init_gpu(use_gpu=True, gpu_id=0):
    global device
    if torch.cuda.is_available() and use_gpu:
        device = torch.device("cuda:" + str(gpu_id))
        print("Using GPU id {}".format(gpu_id))
    else:
        device = torch.device("cpu")
        print("Using CPU.")


def set_device(gpu_id):
    torch.cuda.set_device(gpu_id)


def from_numpy(*args, **kwargs):
    return torch.from_numpy(*args, **kwargs).float().to(device)


def to_numpy(tensor):
    return tensor.to('cpu').detach().numpy()
144
hw3/src/utils.py
Normal file
@@ -0,0 +1,144 @@
from collections import OrderedDict
from typing import Dict, Tuple, List
import copy

import cv2
import gymnasium as gym
import numpy as np

from src.policies import MLPPolicy
import src.pytorch_util as ptu

############################################
############################################


def sample_trajectory(
    env: gym.Env, policy: MLPPolicy, render: bool = False
) -> Dict[str, np.ndarray]:
    """Sample a rollout in the environment from a policy."""
    ob, _ = env.reset()
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:
        # render an image
        if render:
            if hasattr(env, "sim"):
                img = env.sim.render(camera_name="track", height=500, width=500)[::-1]
            else:
                img = env.render(mode="single_rgb_array")
            image_obs.append(
                cv2.resize(img, dsize=(250, 250), interpolation=cv2.INTER_CUBIC)
            )

        ac, rew, next_ob, rollout_done = None, None, None, False
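
        # Illustrative sketch of the step logic (assuming the gymnasium 5-tuple step API;
        # not necessarily the exact form the assignment expects):
        #   ac = policy.get_action(ob)
        #   next_ob, rew, terminated, truncated, _ = env.step(ac)
        #   rollout_done = terminated or truncated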

        ############################
        # YOUR IMPLEMENTATION HERE #

        ############################

        # record result of taking that action
        obs.append(ob)
        acs.append(ac)
        rewards.append(rew)
        next_obs.append(next_ob)
        terminals.append(rollout_done)

        ob = next_ob  # jump to next timestep

        # end the rollout if the rollout ended
        if rollout_done:
            break

    return {
        "observation": np.array(obs, dtype=np.float32),
        "image_obs": np.array(image_obs, dtype=np.uint8),
        "reward": np.array(rewards, dtype=np.float32),
        "action": np.array(acs, dtype=np.float32),
        "next_observation": np.array(next_obs, dtype=np.float32),
        "terminal": np.array(terminals, dtype=np.float32),
    }


def sample_trajectories(
    env: gym.Env,
    policy: MLPPolicy,
    min_timesteps_per_batch: int,
    render: bool = False,
) -> Tuple[List[Dict[str, np.ndarray]], int]:
    """Collect rollouts using policy until we have collected min_timesteps_per_batch steps."""
    timesteps_this_batch = 0
    trajs = []
    while timesteps_this_batch < min_timesteps_per_batch:
        # collect rollout
        traj = sample_trajectory(env, policy, render)
        trajs.append(traj)

        # count steps
        timesteps_this_batch += get_traj_length(traj)
    return trajs, timesteps_this_batch


def sample_n_trajectories(
    env: gym.Env, policy: MLPPolicy, ntraj: int, render: bool = False
):
    """Collect ntraj rollouts."""
    trajs = []
    for _ in range(ntraj):
        # collect rollout
        traj = sample_trajectory(env, policy, render)
        trajs.append(traj)
    return trajs


def compute_metrics(trajs, eval_trajs):
    """Compute metrics for logging."""

    # returns, for logging
    train_returns = [traj["reward"].sum() for traj in trajs]
    eval_returns = [eval_traj["reward"].sum() for eval_traj in eval_trajs]

    # episode lengths, for logging
    train_ep_lens = [len(traj["reward"]) for traj in trajs]
    eval_ep_lens = [len(eval_traj["reward"]) for eval_traj in eval_trajs]

    # decide what to log
    logs = OrderedDict()
    logs["Eval_AverageReturn"] = np.mean(eval_returns)
    logs["Eval_StdReturn"] = np.std(eval_returns)
    logs["Eval_MaxReturn"] = np.max(eval_returns)
    logs["Eval_MinReturn"] = np.min(eval_returns)
    logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)

    logs["Train_AverageReturn"] = np.mean(train_returns)
    logs["Train_StdReturn"] = np.std(train_returns)
    logs["Train_MaxReturn"] = np.max(train_returns)
    logs["Train_MinReturn"] = np.min(train_returns)
    logs["Train_AverageEpLen"] = np.mean(train_ep_lens)

    return logs


def convert_listofrollouts(trajs):
    """
    Take a list of rollout dictionaries and return separate arrays, where each array is a concatenation of that array
    from across the rollouts.
    """
    observations = np.concatenate([traj["observation"] for traj in trajs])
    actions = np.concatenate([traj["action"] for traj in trajs])
    next_observations = np.concatenate([traj["next_observation"] for traj in trajs])
    terminals = np.concatenate([traj["terminal"] for traj in trajs])
    concatenated_rewards = np.concatenate([traj["reward"] for traj in trajs])
    unconcatenated_rewards = [traj["reward"] for traj in trajs]
    return (
        observations,
        actions,
        next_observations,
        terminals,
        concatenated_rewards,
        unconcatenated_rewards,
    )


def get_traj_length(traj):
    return len(traj["reward"])