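Partial update for section 1: implement ValueCritic forward/update, normalize advantages from q_values with an epsilon guard, and add CartPole run logs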

2025-11-01 16:28:48 -05:00
parent 0f109ac389
commit ac986ec69a
40 changed files with 1439 additions and 3 deletions
--- a/hw3/data/a1/pg_cartpole_CartPole-v1_25-10-2025_15-16-37/events.out.tfevents.1761423397.soragoto-MSI
+++ b/hw3/data/a1/pg_cartpole_CartPole-v1_25-10-2025_15-16-37/events.out.tfevents.1761423397.soragoto-MSI
--- a/hw3/data/a1/pg_cartpole_lb_CartPole-v1_25-10-2025_15-23-16/events.out.tfevents.1761423796.soragoto-MSI
+++ b/hw3/data/a1/pg_cartpole_lb_CartPole-v1_25-10-2025_15-23-16/events.out.tfevents.1761423796.soragoto-MSI
--- a/hw3/data/a1/pg_cartpole_lb_na_CartPole-v1_25-10-2025_15-33-29/events.out.tfevents.1761424409.soragoto-MSI
+++ b/hw3/data/a1/pg_cartpole_lb_na_CartPole-v1_25-10-2025_15-33-29/events.out.tfevents.1761424409.soragoto-MSI
--- a/hw3/data/a1/pg_cartpole_lb_rtg_CartPole-v1_25-10-2025_15-28-23/events.out.tfevents.1761424103.soragoto-MSI
+++ b/hw3/data/a1/pg_cartpole_lb_rtg_CartPole-v1_25-10-2025_15-28-23/events.out.tfevents.1761424103.soragoto-MSI
--- a/hw3/data/a1/pg_cartpole_lb_rtg_na_CartPole-v1_25-10-2025_15-38-36/events.out.tfevents.1761424716.soragoto-MSI
+++ b/hw3/data/a1/pg_cartpole_lb_rtg_na_CartPole-v1_25-10-2025_15-38-36/events.out.tfevents.1761424716.soragoto-MSI
--- a/hw3/data/a1/pg_cartpole_na_CartPole-v1_25-10-2025_15-19-54/events.out.tfevents.1761423594.soragoto-MSI
+++ b/hw3/data/a1/pg_cartpole_na_CartPole-v1_25-10-2025_15-19-54/events.out.tfevents.1761423594.soragoto-MSI
--- a/hw3/data/a1/pg_cartpole_rtg_CartPole-v1_25-10-2025_15-18-16/events.out.tfevents.1761423496.soragoto-MSI
+++ b/hw3/data/a1/pg_cartpole_rtg_CartPole-v1_25-10-2025_15-18-16/events.out.tfevents.1761423496.soragoto-MSI
--- a/hw3/data/a1/pg_cartpole_rtg_na_CartPole-v1_25-10-2025_15-21-34/events.out.tfevents.1761423694.soragoto-MSI
+++ b/hw3/data/a1/pg_cartpole_rtg_na_CartPole-v1_25-10-2025_15-21-34/events.out.tfevents.1761423694.soragoto-MSI
--- a/hw3/data/a1/q1/events.out.tfevents.1761423397.soragoto-MSI
+++ b/hw3/data/a1/q1/events.out.tfevents.1761423397.soragoto-MSI
--- a/hw3/data/a1/q1/events.out.tfevents.1761423496.soragoto-MSI
+++ b/hw3/data/a1/q1/events.out.tfevents.1761423496.soragoto-MSI
--- a/hw3/data/a1/q1/events.out.tfevents.1761423594.soragoto-MSI
+++ b/hw3/data/a1/q1/events.out.tfevents.1761423594.soragoto-MSI
--- a/hw3/data/a1/q1/events.out.tfevents.1761423694.soragoto-MSI
+++ b/hw3/data/a1/q1/events.out.tfevents.1761423694.soragoto-MSI
--- a/hw3/data/a1/q1/events.out.tfevents.1761423796.soragoto-MSI
+++ b/hw3/data/a1/q1/events.out.tfevents.1761423796.soragoto-MSI
--- a/hw3/data/a1/q1/events.out.tfevents.1761424103.soragoto-MSI
+++ b/hw3/data/a1/q1/events.out.tfevents.1761424103.soragoto-MSI
--- a/hw3/data/a1/q1/events.out.tfevents.1761424409.soragoto-MSI
+++ b/hw3/data/a1/q1/events.out.tfevents.1761424409.soragoto-MSI
--- a/hw3/data/a1/q1/events.out.tfevents.1761424716.soragoto-MSI
+++ b/hw3/data/a1/q1/events.out.tfevents.1761424716.soragoto-MSI
--- a/hw3/data/p1311/pg_cartpole_CartPole-v1_01-11-2025_13-06-50/events.out.tfevents.1762020410.SlackR
+++ b/hw3/data/p1311/pg_cartpole_CartPole-v1_01-11-2025_13-06-50/events.out.tfevents.1762020410.SlackR
--- a/hw3/data/p1311/pg_cartpole_na_CartPole-v1_01-11-2025_13-19-09/events.out.tfevents.1762021149.SlackR
+++ b/hw3/data/p1311/pg_cartpole_na_CartPole-v1_01-11-2025_13-19-09/events.out.tfevents.1762021149.SlackR
--- a/hw3/data/p1311/pg_cartpole_rtg_CartPole-v1_01-11-2025_13-12-25/events.out.tfevents.1762020745.SlackR
+++ b/hw3/data/p1311/pg_cartpole_rtg_CartPole-v1_01-11-2025_13-12-25/events.out.tfevents.1762020745.SlackR
--- a/hw3/data/p1311/pg_cartpole_rtg_na_CartPole-v1_01-11-2025_13-25-22/events.out.tfevents.1762021522.SlackR
+++ b/hw3/data/p1311/pg_cartpole_rtg_na_CartPole-v1_01-11-2025_13-25-22/events.out.tfevents.1762021522.SlackR
--- a/hw3/data/p1312/pg_cartpole_lb_CartPole-v1_01-11-2025_13-32-24/events.out.tfevents.1762021944.SlackR
+++ b/hw3/data/p1312/pg_cartpole_lb_CartPole-v1_01-11-2025_13-32-24/events.out.tfevents.1762021944.SlackR
--- a/hw3/data/p1312/pg_cartpole_lb_na_CartPole-v1_01-11-2025_14-12-43/events.out.tfevents.1762024363.SlackR
+++ b/hw3/data/p1312/pg_cartpole_lb_na_CartPole-v1_01-11-2025_14-12-43/events.out.tfevents.1762024363.SlackR
--- a/hw3/data/p1312/pg_cartpole_lb_rtg_CartPole-v1_01-11-2025_13-50-37/events.out.tfevents.1762023037.SlackR
+++ b/hw3/data/p1312/pg_cartpole_lb_rtg_CartPole-v1_01-11-2025_13-50-37/events.out.tfevents.1762023037.SlackR
--- a/hw3/data/p1312/pg_cartpole_lb_rtg_na_CartPole-v1_01-11-2025_14-33-35/events.out.tfevents.1762025615.SlackR
+++ b/hw3/data/p1312/pg_cartpole_lb_rtg_na_CartPole-v1_01-11-2025_14-33-35/events.out.tfevents.1762025615.SlackR
--- a/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-00-04/events.out.tfevents.1761422404.soragoto-MSI
+++ b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-00-04/events.out.tfevents.1761422404.soragoto-MSI
--- a/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-03-41/events.out.tfevents.1761422621.soragoto-MSI
+++ b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-03-41/events.out.tfevents.1761422621.soragoto-MSI
--- a/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-59-08/events.out.tfevents.1761425948.soragoto-MSI
+++ b/hw3/data/pg_cartpole_CartPole-v1_25-10-2025_15-59-08/events.out.tfevents.1761425948.soragoto-MSI
--- a/hw3/src/critics.py
+++ b/hw3/src/critics.py
@@ -41,7 +41,7 @@ class ValueCritic(nn.Module):

        ############################
        # YOUR IMPLEMENTATION HERE #
-        
+        values=self.network(obs)
        ############################

        return values
@@ -55,7 +55,12 @@ class ValueCritic(nn.Module):
        loss = None
        ############################
        # YOUR IMPLEMENTATION HERE #
+        values = self.forward(obs)
+        loss = F.mse_loss(values, q_values)

+        self.optimizer.zero_grad()
+        loss.backward()
+        self.optimizer.step()
        ############################

        return {
--- a/hw3/src/pg_agent.py
+++ b/hw3/src/pg_agent.py
@@ -148,10 +148,11 @@ class PGAgent(nn.Module):
            advantages = None
            ############################
            # YOUR IMPLEMENTATION HERE #
-            source = rewards.copy()
+            source = q_values.copy()
            mean = np.mean(source)
            std = np.std(source)
-            advantages = (source - mean)/std
+            # add a small epsilon so a zero std (all values equal) does not cause division by zero
+            advantages = (source - mean)/(std+1e-8)
            ############################

        return advantages