diff --git a/.gitignore b/.gitignore index 5560dac..f8ef896 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # Python -# __pycache__/ -# *.pyc -# *.pyo -# *.pyd -# *.pyw -# *.pyz \ No newline at end of file +__pycache__/ +*.pyc +*.pyo +*.pyd +*.pyw +*.pyz \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..a7a34f3 --- /dev/null +++ b/environment.yml @@ -0,0 +1,17 @@ +name: DRL +channels: + - pytorch + - nvidia + - defaults +dependencies: + - python=3.10 + - pip=23.0.1 + - pip: + - gymnasium[classic-control]==0.27.1 + - hydra-core==1.3.2 + - matplotlib==3.7.1 + - moviepy==1.0.3 + - torch torchvision --index-url https://download.pytorch.org/whl/cu126 # change this to your own cuda version + - opencv-python + - tensorboardX==2.6.4 + - tensorboard==2.20.0 \ No newline at end of file diff --git a/hw3/bash/1-2-experiments.sh b/hw3/bash/1-2-experiments.sh new file mode 100644 index 0000000..391f19c --- /dev/null +++ b/hw3/bash/1-2-experiments.sh @@ -0,0 +1,8 @@ +python run.py --env_name CartPole-v1 -n 200 -b 1000 --exp_name cartpole +python run.py --env_name CartPole-v1 -n 200 -b 1000 -rtg --exp_name cartpole_rtg +python run.py --env_name CartPole-v1 -n 200 -b 1000 -na --exp_name cartpole_na +python run.py --env_name CartPole-v1 -n 200 -b 1000 -rtg -na --exp_name cartpole_rtg_na +python run.py --env_name CartPole-v1 -n 200 -b 4000 --exp_name cartpole_lb +python run.py --env_name CartPole-v1 -n 200 -b 4000 -rtg --exp_name cartpole_lb_rtg +python run.py --env_name CartPole-v1 -n 200 -b 4000 -na --exp_name cartpole_lb_na +python run.py --env_name CartPole-v1 -n 200 -b 4000 -rtg -na --exp_name cartpole_lb_rtg_na \ No newline at end of file diff --git a/hw3/bash/read-results.sh b/hw3/bash/read-results.sh new file mode 100644 index 0000000..4761d09 --- /dev/null +++ b/hw3/bash/read-results.sh @@ -0,0 +1 @@ +tensorboard --logdir data/pg_cartpole_CartPole-v1_25-10-2025_15-00-04 \ No newline at end of file diff --git a/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-00-04/events.out.tfevents.1761422404.soragoto-MSI b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-00-04/events.out.tfevents.1761422404.soragoto-MSI new file mode 100644 index 0000000..34491cb Binary files /dev/null and b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-00-04/events.out.tfevents.1761422404.soragoto-MSI differ diff --git a/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-03-41/events.out.tfevents.1761422621.soragoto-MSI b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-03-41/events.out.tfevents.1761422621.soragoto-MSI new file mode 100644 index 0000000..70355a7 Binary files /dev/null and b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-03-41/events.out.tfevents.1761422621.soragoto-MSI differ diff --git a/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-16-37/events.out.tfevents.1761423397.soragoto-MSI b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-16-37/events.out.tfevents.1761423397.soragoto-MSI new file mode 100644 index 0000000..983d9ce Binary files /dev/null and b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-16-37/events.out.tfevents.1761423397.soragoto-MSI differ diff --git a/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-59-08/events.out.tfevents.1761425948.soragoto-MSI b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-59-08/events.out.tfevents.1761425948.soragoto-MSI new file mode 100644 index 0000000..01c2a5b Binary files /dev/null and b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-59-08/events.out.tfevents.1761425948.soragoto-MSI differ diff --git a/hw3/data/pg_cartpole_lb_CartPole-v1_25-10-2025_15-23-16/events.out.tfevents.1761423796.soragoto-MSI b/hw3/data/pg_cartpole_lb_CartPole-v1_25-10-2025_15-23-16/events.out.tfevents.1761423796.soragoto-MSI new file mode 100644 index 0000000..2853311 Binary files /dev/null and b/hw3/data/pg_cartpole_lb_CartPole-v1_25-10-2025_15-23-16/events.out.tfevents.1761423796.soragoto-MSI differ diff --git a/hw3/data/pg_cartpole_lb_na_CartPole-v1_25-10-2025_15-33-29/events.out.tfevents.1761424409.soragoto-MSI b/hw3/data/pg_cartpole_lb_na_CartPole-v1_25-10-2025_15-33-29/events.out.tfevents.1761424409.soragoto-MSI new file mode 100644 index 0000000..c3bd2ac Binary files /dev/null and b/hw3/data/pg_cartpole_lb_na_CartPole-v1_25-10-2025_15-33-29/events.out.tfevents.1761424409.soragoto-MSI differ diff --git a/hw3/data/pg_cartpole_lb_rtg_CartPole-v1_25-10-2025_15-28-23/events.out.tfevents.1761424103.soragoto-MSI b/hw3/data/pg_cartpole_lb_rtg_CartPole-v1_25-10-2025_15-28-23/events.out.tfevents.1761424103.soragoto-MSI new file mode 100644 index 0000000..48515fb Binary files /dev/null and b/hw3/data/pg_cartpole_lb_rtg_CartPole-v1_25-10-2025_15-28-23/events.out.tfevents.1761424103.soragoto-MSI differ diff --git a/hw3/data/pg_cartpole_lb_rtg_na_CartPole-v1_25-10-2025_15-38-36/events.out.tfevents.1761424716.soragoto-MSI b/hw3/data/pg_cartpole_lb_rtg_na_CartPole-v1_25-10-2025_15-38-36/events.out.tfevents.1761424716.soragoto-MSI new file mode 100644 index 0000000..90686c5 Binary files /dev/null and b/hw3/data/pg_cartpole_lb_rtg_na_CartPole-v1_25-10-2025_15-38-36/events.out.tfevents.1761424716.soragoto-MSI differ diff --git a/hw3/data/pg_cartpole_na_CartPole-v1_25-10-2025_15-19-54/events.out.tfevents.1761423594.soragoto-MSI b/hw3/data/pg_cartpole_na_CartPole-v1_25-10-2025_15-19-54/events.out.tfevents.1761423594.soragoto-MSI new file mode 100644 index 0000000..d9a59f8 Binary files /dev/null and b/hw3/data/pg_cartpole_na_CartPole-v1_25-10-2025_15-19-54/events.out.tfevents.1761423594.soragoto-MSI differ diff --git a/hw3/data/pg_cartpole_rtg_CartPole-v1_25-10-2025_15-18-16/events.out.tfevents.1761423496.soragoto-MSI b/hw3/data/pg_cartpole_rtg_CartPole-v1_25-10-2025_15-18-16/events.out.tfevents.1761423496.soragoto-MSI new file mode 100644 index 0000000..0c44f7c Binary files /dev/null and b/hw3/data/pg_cartpole_rtg_CartPole-v1_25-10-2025_15-18-16/events.out.tfevents.1761423496.soragoto-MSI differ diff --git a/hw3/data/pg_cartpole_rtg_na_CartPole-v1_25-10-2025_15-21-34/events.out.tfevents.1761423694.soragoto-MSI b/hw3/data/pg_cartpole_rtg_na_CartPole-v1_25-10-2025_15-21-34/events.out.tfevents.1761423694.soragoto-MSI new file mode 100644 index 0000000..ca4d218 Binary files /dev/null and b/hw3/data/pg_cartpole_rtg_na_CartPole-v1_25-10-2025_15-21-34/events.out.tfevents.1761423694.soragoto-MSI differ diff --git a/hw3/src/pg_agent.py b/hw3/src/pg_agent.py index 48ca1bf..44b6a99 100644 --- a/hw3/src/pg_agent.py +++ b/hw3/src/pg_agent.py @@ -103,7 +103,7 @@ class PGAgent(nn.Module): q_values = None ############################ # YOUR IMPLEMENTATION HERE # - + q_values = [self._discounted_return(reward) for reward in rewards] ############################ else: @@ -114,7 +114,7 @@ class PGAgent(nn.Module): ############################ # YOUR IMPLEMENTATION HERE # - + q_values = [self._discounted_reward_to_go(reward) for reward in rewards] ############################ return q_values @@ -148,7 +148,10 @@ class PGAgent(nn.Module): advantages = None ############################ # YOUR IMPLEMENTATION HERE # - + source = rewards.copy() + mean = np.mean(source) + std = np.std(source) + advantages = (source - mean)/std ############################ return advantages @@ -166,9 +169,9 @@ class PGAgent(nn.Module): ############################ # YOUR IMPLEMENTATION HERE # - + q_value=sum(self.gamma ** i * reward for i, reward in enumerate(rewards)) + return [q_value] * len(rewards) ############################ - pass def _discounted_reward_to_go(self, rewards: Sequence[float]) -> Sequence[float]: @@ -181,6 +184,12 @@ class PGAgent(nn.Module): ############################ # YOUR IMPLEMENTATION HERE # - + q_values = [] + current_sum = 0 + for t in range (len(rewards)-1,-1,-1): + current_sum *= self.gamma + current_sum += rewards[t] + q_values.append(current_sum) + q_values.reverse() + return q_values ############################ - pass diff --git a/hw3/src/policies.py b/hw3/src/policies.py index 20331b0..e6b262a 100644 --- a/hw3/src/policies.py +++ b/hw3/src/policies.py @@ -63,7 +63,9 @@ class MLPPolicy(nn.Module): ############################ # YOUR IMPLEMENTATION HERE # - + obs = ptu.from_numpy(obs) + action_tensor = self.forward(obs) + action = ptu.to_numpy(action_tensor.sample()) ############################ return action @@ -80,7 +82,8 @@ class MLPPolicy(nn.Module): ############################ # YOUR IMPLEMENTATION HERE # - + logits_prob = self.logits_net(obs) + action = distributions.Categorical(logits=logits_prob) ############################ else: @@ -116,7 +119,21 @@ class MLPPolicyPG(MLPPolicy): ############################ # YOUR IMPLEMENTATION HERE # + + dist = self.forward(obs) + if self.discrete: + log_p = dist.log_prob(actions.long()) + else: + log_p = dist.log_prob(actions) + + loss = -torch.mean(advantages * log_p) + + # update gradients + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + ############################ return { diff --git a/hw3/src/utils.py b/hw3/src/utils.py index 281d6eb..44429b6 100644 --- a/hw3/src/utils.py +++ b/hw3/src/utils.py @@ -33,7 +33,10 @@ def sample_trajectory( ############################ # YOUR IMPLEMENTATION HERE # + ac = policy.get_action(ob) + next_ob, rew, terminated, truncated, _ = env.step(ac) + rollout_done = terminated or truncated ############################