Zheyuan Wu
2025-10-14 20:34:47 -05:00
parent 250f763f1f
commit e74aac95e3
517 changed files with 1418 additions and 16701 deletions


@@ -5,6 +5,8 @@
\usepackage{fullpage}
\usepackage{mathrsfs}
\usepackage{mathtools}
\usepackage{float}
\usepackage{hyperref}
%%
%% Stuff above here is packages that will be used to compile your document.
@@ -85,6 +87,29 @@
\begin{enumerate}
\item[] \textbf{Use Of GenAI}
This homework was completed with the help of the Windsurf VS Code extension: \url{https://windsurf.com/}.
What was used:
\begin{itemize}
\item The autofill feature to generate syntactically correct \LaTeX{} code under human supervision (each tab-key press filled no more than 100 characters, and at most $20\%$ of the predicted text was adopted).
\item AI to debug the \LaTeX{} code and find unclosed parentheses and other syntax errors.
\item AI to autofill parts that follow the same structure as earlier parts (for example, case-by-case proofs).
\item AI to auto-correct misspelled words and \LaTeX{} commands.
\end{itemize}
What was not used:
\begin{itemize}
\item Asking AI to directly generate the solutions in the \LaTeX{} document.
\item Asking AI for hints or solutions to the problems.
\item Selecting part of the document and asking AI to fill in the missing parts.
\end{itemize}
\newpage
\item[1.] \textbf{Answer questions in Section 3} Due to the state space complexity of some visual input environments, we may represent Q-functions using a class of parameterized function approximators $\mathcal{Q}=\{Q_w\mid w\in \R^p\}$, where $p$ is the number of parameters. Remember that in the \textit{tabular setting}, given a 4-tuple of sampled experience $(s,a,r,s')$, the vanilla Q-learning update is
\[
@@ -109,12 +134,14 @@ where the dependency of $\max_{a'\in A} Q_w(s',a')$ on $w$ is ignored, i.e., it
\item [1.] [\textbf{10pt}] Show that update \ref{1} and update \ref{2} are the same when the functions in $\mathcal{Q}$ are of the form $Q_w(s,a)=w^T\phi(s,a)$, with $w\in \R^{|S||A|}$ and $\phi:S\times A\to \R^{|S||A|}$, where the feature function $\phi$ is of the form $\phi(s,a)_{s',a'}=\mathbb{I}[s'=s,a'=a]$, where $\mathbb{I}$ denotes the indicator function, which evaluates to $1$ if the condition is true and to $0$ otherwise. Note that the coordinates in the vector space $\R^{|S||A|}$ can be seen as being indexed by pairs $(s',a')$, where $s'\in S$, $a'\in A$.
\begin{proof}
When the functions in $\mathcal{Q}$ are of the form $Q_w(s,a)=w^T\phi(s,a)$, with $w\in \R^{|S||A|}$ and $\phi:S\times A\to \R^{|S||A|}$, then $Q_w$ is linear in $w$ and $\nabla_w Q_w(s,a)=\phi(s,a)$.
Moreover, since $\phi(s,a)$ is the indicator (one-hot) vector of the pair $(s,a)$, we have $\phi(s,a)^T\phi(s,a)=\sum_{(\tilde s,\tilde a)\in S\times A}\mathbb{I}[\tilde s=s,\tilde a=a]^2=1$, and $Q_w(s,a)=w^T\phi(s,a)=w_{(s,a)}$, i.e., the weight vector $w$ plays the role of the Q-table.
Starting from update \ref{1} for the entry $Q(s,a)$ and substituting $Q(s,a)=w^T\phi(s,a)$,
\[
\begin{aligned}
Q(s,a)&\leftarrow Q(s,a)+\alpha\left(r+\gamma\max_{a'\in A} Q(s',a')-Q(s,a)\right)\\
w^T\phi(s,a)&\leftarrow w^T\phi(s,a)+\alpha\left(r+\gamma\max_{a'\in A} Q_w(s',a')-Q_w(s,a)\right)\\
w^T\phi(s,a)&\leftarrow w^T\phi(s,a)+\alpha\left(r+\gamma\max_{a'\in A} Q_w(s',a')-Q_w(s,a)\right)\phi(s,a)^T\phi(s,a)\\
w^T\phi(s,a)&\leftarrow w^T\phi(s,a)+\alpha\left(r+\gamma\max_{a'\in A} Q_w(s',a')-Q_w(s,a)\right)\left(\nabla_w Q_w(s,a)\right)^T\phi(s,a)\\
w^T\phi(s,a)&\leftarrow \left(w+\alpha\left(r+\gamma\max_{a'\in A} Q_w(s',a')-Q_w(s,a)\right)\nabla_w Q_w(s,a)\right)^T\phi(s,a)
\end{aligned}
\]
The right-hand side of the last line is exactly $Q_{w'}(s,a)$ for the weight vector $w'=w+\alpha\left(r+\gamma\max_{a'\in A} Q_w(s',a')-Q_w(s,a)\right)\nabla_w Q_w(s,a)$ produced by update \ref{2}. Since $\nabla_w Q_w(s,a)=\phi(s,a)$ is zero in every coordinate other than $(s,a)$, update \ref{2} changes only the entry $w_{(s,a)}=Q_w(s,a)$ and leaves every other entry unchanged, exactly as update \ref{1} changes only the table entry $Q(s,a)$. Hence the two updates are the same.
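As a concrete illustration (a minimal hypothetical instance, not required by the problem), take $S=\{s_1,s_2\}$ and $A=\{a_1\}$, so $w\in\R^2$ and
\[
\phi(s_1,a_1)=\begin{pmatrix}1\\0\end{pmatrix},\qquad \phi(s_2,a_1)=\begin{pmatrix}0\\1\end{pmatrix}.
\]
For a sampled transition $(s_1,a_1,r,s_2)$, update \ref{2} gives
\[
w\leftarrow w+\alpha\left(r+\gamma Q_w(s_2,a_1)-Q_w(s_1,a_1)\right)\begin{pmatrix}1\\0\end{pmatrix},
\]
so only the first coordinate $w_{(s_1,a_1)}=Q_w(s_1,a_1)$ changes, and it changes by exactly the tabular increment of update \ref{1}.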
@@ -143,10 +170,71 @@ where the dependency of $\max_{a'\in A} Q_w(s',a')$ on $w$ is ignored, i.e., it
\item [2.] \textbf{The auto-generated results figures}, along with a brief description of what each figure shows. The update targets that distinguish these variants are summarized after the list.
\begin{enumerate}
\item [1.] \textbf{DQN}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\textwidth]{./runs/DQN/results.png}
\caption{DQN. Nothing noteworthy; the curves look as expected from training.}
\end{figure}
\item [2.] \textbf{Double DQN}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\textwidth]{./runs/Double DQN/results.png}
\caption{Double DQN. There is an interesting camel-like bump in the Q-value curve when training with Double DQN. It is less stable than the vanilla DQN.}
\end{figure}
\item [3.] \textbf{Dueling DQN}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\textwidth]{./runs/Dueling DQN/results.png}
\caption{Dueling DQN. Using the advantage network produces results comparable to DQN.}
\end{figure}
\item [4.] \textbf{Prioritized Experience Replay}
\begin{figure}[H]
\centering
\includegraphics[width=1.0\textwidth]{./runs/PER/results.png}
\caption{Prioritized Experience Replay. Using PER alone makes the training process less stable, and the loss is significantly higher than with the previous methods.}
\end{figure}
\item [5.] \textbf{N-Step Experience Replay}
\begin{figure}[H]
\centering
\includegraphics[width=1.0\textwidth]{./runs/NStep/results.png}
\caption{N-Step Experience Replay. So far the most stable training method, especially when the replay buffer is large. However, when the replay buffer size is too small (typically $\le 70$), the training process may not converge to optimal performance.}
\end{figure}
\item [6.] \textbf{N-Step + PER}
\begin{figure}[H]
\centering
\includegraphics[width=1.0\textwidth]{./runs/NStep + PER/results.png}
\caption{N-Step + PER. Combining the two methods counters the unstable training loss observed with PER alone.}
\end{figure}
\item [7.] \textbf{Noisy DQN}
\begin{figure}[H]
\centering
\includegraphics[width=1.0\textwidth]{./runs/Noisy DQN/results.png}
\caption{Noisy DQN. The experiment with $\sigma = 0.017$ achieves results comparable to the standard DQN. Stability issues persist when $\sigma$ is too large.}
\end{figure}
\end{enumerate}
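For reference, the variants compared above amount to changing the regression target or the sampling distribution of the replay buffer. A brief recap in their standard textbook form (this is only an illustrative summary; the runs above use the homework's implementation, which may differ in details, and the symbols $w^-$, $V_w$, $A_w$, $p_i$, $\delta_i$, $n$, $\epsilon$ are notation introduced here):
\[
\begin{aligned}
\text{DQN target:}\quad & y = r+\gamma\max_{a'\in A} Q_{w^-}(s',a')\\
\text{Double DQN target:}\quad & y = r+\gamma\, Q_{w^-}\!\left(s',\operatorname*{arg\,max}_{a'\in A} Q_{w}(s',a')\right)\\
\text{Dueling aggregation:}\quad & Q_w(s,a)=V_w(s)+A_w(s,a)-\frac{1}{|A|}\sum_{a'\in A}A_w(s,a')\\
\text{$n$-step target:}\quad & y = \sum_{k=0}^{n-1}\gamma^{k} r_{t+k}+\gamma^{n}\max_{a'\in A} Q_{w^-}(s_{t+n},a')\\
\text{PER sampling:}\quad & P(i)=\frac{p_i^{\alpha}}{\sum_j p_j^{\alpha}},\qquad p_i=|\delta_i|+\epsilon
\end{aligned}
\]
where $w^-$ denotes the target-network weights, $\delta_i$ the TD error of transition $i$, and the PER exponent $\alpha$ (used together with importance-sampling corrections controlled by a separate exponent $\beta$) is distinct from the learning rate used earlier. The noisy-network parameterization used for Noisy DQN is sketched under item 3 below.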
\newpage
\item [3.] \textbf{Any other findings}
I implemented the extra-credit Noisy DQN; helpful commands for running it are in ./commands/4.8.sh. I found that when $\sigma$ is too large, for example $\sigma=0.5$, the model may not converge to optimal performance. Intuitively, the noisy linear layer should improve the robustness of the model, but the effect is not as obvious as expected; see the parameterization sketched below.
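For context, a minimal sketch of a noisy linear layer in the NoisyNet sense (standard formulation; the symbols $\mu$, $\sigma$, $\varepsilon$, and $\sigma_0$ are notation introduced here, and $\sigma_0$ is presumably the quantity referred to as $\sigma$ above): the deterministic layer $y=Wx+b$ is replaced by
\[
y=\left(\mu^{W}+\sigma^{W}\odot\varepsilon^{W}\right)x+\left(\mu^{b}+\sigma^{b}\odot\varepsilon^{b}\right),
\]
where $\mu^{W},\sigma^{W},\mu^{b},\sigma^{b}$ are learnable parameters, $\varepsilon^{W},\varepsilon^{b}$ are freshly sampled noise, and the entries of $\sigma^{W},\sigma^{b}$ are initialized from $\sigma_0$. A larger $\sigma_0$ therefore injects more parameter noise (more aggressive exploration), which is consistent with the instability observed at $\sigma=0.5$.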
\end{enumerate}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%