\documentclass[11pt]{article}
|
|
\usepackage{amsmath, amsfonts, amsthm}
|
|
\usepackage{amssymb}
|
|
\usepackage{fancyhdr,parskip}
|
|
\usepackage{fullpage}
|
|
\usepackage{mathrsfs}
|
|
\usepackage{mathtools}
|
|
\usepackage{float}
|
|
\usepackage{graphicx}
\usepackage{hyperref}
|
|
|
|
%%
|
|
%% Stuff above here is packages that will be used to compile your document.
|
|
%% If you've used unusual LaTeX features, you may have to install extra packages by adding them to this list.
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
|
|
\setlength{\headheight}{15.2pt}
|
|
\setlength{\headsep}{20pt}
|
|
\pagestyle{fancyplain}
|
|
|
|
%%
|
|
%% Stuff above here is layout and formatting. If you've never used LaTeX before, you probably don't need to change any of it.
|
|
%% Later, you can learn how it all works and adjust it to your liking, or write your own formatting code.
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%
|
|
% These commands create theorem-like environments.
|
|
\newtheorem{theorem}{Theorem}
|
|
\newtheorem{lemma}[theorem]{Lemma}
|
|
\newtheorem{corollary}[theorem]{Corollary}
|
|
\newtheorem{prop}[theorem]{Proposition}
|
|
\newtheorem{defn}[theorem]{Definition}
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
%% This section contains some useful macros that will save you time typing.
|
|
%%
|
|
|
|
% Using \displaystyle (or \ds) in a block of math has a number of effects, but most notably, it makes your fractions come out bigger.
|
|
\newcommand{\ds}{\displaystyle}
|
|
|
|
% These lines are for displaying integrals; typing \dx will make the dx at the end of the integral look better.
|
|
\newcommand{\is}{\hspace{2pt}}
|
|
\newcommand{\dx}{\is dx}
|
|
|
|
% These commands produce the fancy Z (for the integers) and other letters conveniently.
|
|
\newcommand{\Z}{\mathbb{Z}}
|
|
\newcommand{\Q}{\mathbb{Q}}
|
|
\newcommand{\R}{\mathbb{R}}
|
|
\newcommand{\C}{\mathbb{C}}
|
|
\newcommand{\F}{\mathbb{F}}
|
|
\newcommand{\T}{\mathcal{T}}
|
|
\newcommand{\B}{\mathcal{B}}
|
|
|
|
% for fancy empty set char
|
|
\renewcommand{\emptyset}{\varnothing}
|
|
|
|
% customized commands for future assignments
|
|
\newcommand{\imply}{\Rightarrow}
|
|
\def\P{\mathscr{P}}
|
|
\def\L{\mathscr{L}}
|
|
\def\M{\mathscr{M}}
|
|
\DeclarePairedDelimiterX{\inp}[2]{\langle}{\rangle}{#1, #2}
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
%% This is the header. It will appear on every page, and it's a good place to put your name, the assignment title, and stuff like that.
|
|
%% I usually leave the center header blank to avoid clutter.
|
|
%%
|
|
|
|
\fancyhead[L]{\textbf{CSE5100 Homework 2}}
|
|
\fancyhead[C]{\empty}
|
|
\fancyhead[R]{Zheyuan Wu}
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
|
|
\begin{document}
|
|
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
%% Actual math starts here!
|
|
|
|
|
|
% Use an enumerated list to write up problems. First we begin a list.
|
|
|
|
\begin{enumerate}
|
|
|
|
\item[] \textbf{Use Of GenAI}
|
|
|
|
This homework was completed with the help of the Windsurf VS Code extension (\url{https://windsurf.com/}).
|
|
|
|
What is used:
|
|
|
|
\begin{itemize}
|
|
\item Use the autofill feature, under human supervision, to generate syntactically correct \LaTeX{} code for the homework (each press of the tab key filled no more than 100 characters, and at most $20\%$ of the predicted text was adapted).
\item Use AI to debug the \LaTeX{} code and find unclosed parentheses and other syntax errors.
\item Use AI to autofill parts that follow the same structure as previous parts (for example, case-by-case proofs).
\item Use AI to auto-correct misspelled words and \LaTeX{} commands.
|
|
\end{itemize}
|
|
|
|
What is not used:
|
|
|
|
\begin{itemize}
|
|
\item Use AI to directly generate the solutions in the \LaTeX{} document.
\item Use AI to ask for hints or solutions to the problems.
\item Select part of the document and ask AI to fill in the missing parts.
|
|
\end{itemize}
|
|
|
|
\newpage
|
|
|
|
\item[1.] \textbf{Answer questions in Section 3.} Due to the state space complexity of some visual input environments, we may represent Q-functions using a class of parameterized function approximators $\mathcal{Q}=\{Q_w\mid w\in \R^p\}$, where $p$ is the number of parameters. Remember that in the \textit{tabular setting}, given a 4-tuple of sampled experience $(s,a,r,s')$, the vanilla Q-learning update is
|
|
|
|
\[
|
|
Q(s,a)\coloneqq Q(s,a)+\alpha\left(r+\gamma\max_{a'\in A} Q(s',a')-Q(s,a)\right),\tag{1}\label{1}
|
|
\]
|
|
|
|
where $\alpha\in \R$ is the learning rate. In the \textit{function approximation setting}, the update is similar:
|
|
|
|
\[
|
|
w\coloneqq w+\alpha\left(r+\gamma\max_{a'\in A} Q_w(s',a')-Q_w(s,a)\right)\nabla_w Q_w(s,a).\tag{2}\label{2}
|
|
\]
|
|
|
|
Q-learning can be seen as a pseudo stochastic gradient descent step on
|
|
|
|
\[
|
|
\ell(w)=\mathbb{E}_{s,a,r,s'}\left(r+\gamma \max_{a'\in A} Q_w(s',a')-Q_w(s,a)\right)^2.\tag{3}\label{3}
|
|
\]
|
|
|
|
Here the dependency of $\max_{a'\in A} Q_w(s',a')$ on $w$ is ignored; i.e., it is treated as a fixed target.
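As a quick sanity check of this pseudo-gradient view, write $y\coloneqq r+\gamma\max_{a'\in A}Q_w(s',a')$ (notation introduced only for this check) and treat $y$ as a constant when differentiating a single sample of \ref{3}:

\[
-\frac{1}{2}\nabla_w\left(y-Q_w(s,a)\right)^2=\left(y-Q_w(s,a)\right)\nabla_w Q_w(s,a),
\]

so a stochastic (semi-)gradient descent step, with the factor $\frac{1}{2}$ absorbed into the step size $\alpha$, is exactly update \ref{2}.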
|
|
|
|
\begin{enumerate}
|
|
\item [1.] [\textbf{10pt}] Show that update \ref{1} and update \ref{2} are the same when the functions in $\mathcal{Q}$ are of the form $Q_w(s,a)=w^T\phi(s,a)$, with $w\in \R^{|S||A|}$ and $\phi:S\times A\to \R^{|S||A|}$, where the feature function $\phi$ is of the form $\phi(s,a)_{s',a'}=\mathbb{I}[s'=s,a'=a]$, and $\mathbb{I}$ denotes the indicator function, which evaluates to $1$ if the condition is true and to $0$ otherwise. Note that the coordinates of the vector space $\R^{|S||A|}$ can be seen as being indexed by pairs $(s',a')$, where $s'\in S$, $a'\in A$.
|
|
|
|
\begin{proof}
|
|
By the definition of $\phi$, the feature vector $\phi(s,a)\in\R^{|S||A|}$ has a $1$ in the coordinate indexed by the pair $(s,a)$ and a $0$ in every other coordinate; it is a standard basis vector. Hence, for functions of the form $Q_w(s,a)=w^T\phi(s,a)$,

\[
Q_w(s,a)=w_{(s,a)}
\qquad\text{and}\qquad
\nabla_w Q_w(s,a)=\phi(s,a).
\]

Substituting $\nabla_w Q_w(s,a)=\phi(s,a)$ into update \ref{2} gives

\[
w\coloneqq w+\alpha\left(r+\gamma\max_{a'\in A} Q_w(s',a')-Q_w(s,a)\right)\phi(s,a).
\]

Since $\phi(s,a)$ is zero outside coordinate $(s,a)$, every other coordinate of $w$ is left unchanged, and coordinate $(s,a)$ becomes

\[
w_{(s,a)}\coloneqq w_{(s,a)}+\alpha\left(r+\gamma\max_{a'\in A} w_{(s',a')}-w_{(s,a)}\right).
\]

Identifying each table entry $Q(s,a)$ with the coordinate $w_{(s,a)}$, so that $Q(s',a')=w_{(s',a')}$ for every pair, this is exactly the tabular update \ref{1}; in both settings all other entries are untouched. Therefore updates \ref{1} and \ref{2} coincide.
|
|
\end{proof}
|
|
|
|
|
|
\item [2.] [\textbf{10pt}] What is the deadly triad in reinforcement learning? What are the main challenges of using deep learning for function approximation with Q-learning? How does the Deep Q-Learning method overcome these challenges?
|
|
|
|
The deadly triad in reinforcement learning consists of
|
|
|
|
\begin{enumerate}
|
|
\item Bootstrapping
\item Function approximation
\item Off-policy learning
|
|
\end{enumerate}
|
|
|
|
The main challenges are that consecutive samples are highly correlated and that the bootstrap target is computed from the very network being trained, so the target moves with every update; combined with the deadly triad, this interacts with the statistical estimation issues induced by bootstrapping and can make training unstable or divergent. The Deep Q-Learning method overcomes these challenges by computing the bootstrap target with a separate target network that is synchronized only periodically, and by sampling minibatches from an experience replay buffer to decorrelate updates. The remaining overestimation bias of the max operator can be reduced with double Q-learning.
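Concretely (using the notation of \ref{3}, and writing $w^-$ for the target-network parameters and $\mathcal{D}$ for the replay buffer, both introduced here only for illustration), between target-network syncs DQN takes stochastic gradient steps on

\[
\ell_{\text{DQN}}(w)=\mathbb{E}_{(s,a,r,s')\sim\mathcal{D}}\left(r+\gamma\max_{a'\in A}Q_{w^-}(s',a')-Q_w(s,a)\right)^2.
\]

Sampling minibatches from $\mathcal{D}$ breaks the correlation between consecutive transitions, and holding $w^-$ fixed keeps the bootstrap target from moving with every update.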
|
|
|
|
\item [3.] [\textbf{10pt}] Explain how double Q-learning helps with the maximization bias in Q-learning.
|
|
|
|
Double Q-learning decouples action selection from action evaluation by using two separate estimators: one network chooses the greedy action at the next state, and the other evaluates that action's value. Because the same noisy estimate no longer both selects and scores the maximum, positive estimation errors are much less likely to be propagated into the target, which reduces the maximization (overestimation) bias.
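In symbols, and assuming the standard targets of van Hasselt et al. (2016) with online parameters $w$ and target parameters $w^-$, the two methods differ only in how the bootstrap target is formed:

\[
y^{\text{DQN}}=r+\gamma\,Q_{w^-}\!\left(s',\,\operatorname*{arg\,max}_{a'\in A}Q_{w^-}(s',a')\right),
\qquad
y^{\text{Double}}=r+\gamma\,Q_{w^-}\!\left(s',\,\operatorname*{arg\,max}_{a'\in A}Q_{w}(s',a')\right).
\]

The online network selects the action while the target network evaluates it, so a single network's estimation noise can no longer both pick and score the same overestimated action.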
|
|
|
|
\end{enumerate}
|
|
|
|
\newpage
|
|
|
|
\item [2.] \textbf{The auto-generated results figures}, along with a brief description of what each figure shows.
|
|
|
|
\begin{enumerate}
|
|
\item [1.] \textbf{DQN}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{./runs/DQN/results.png}
|
|
\caption{DQN. The training curves look as expected; nothing unusual to report.}
|
|
\end{figure}
|
|
|
|
\item [2.] \textbf{Double DQN}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{./runs/Double DQN/results.png}
|
|
\caption{Double DQN. There is an interesting camel-like bump in the Q-value curve during training; overall it is less stable than vanilla DQN.}
|
|
\end{figure}
|
|
|
|
\item [3.] \textbf{Dueling DQN}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{./runs/Dueling DQN/results.png}
|
|
\caption{Dueling DQN. Using the value/advantage decomposition produces results comparable to DQN.}
|
|
\end{figure}
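For reference, the dueling architecture splits the head into a state-value stream $V_w$ and an advantage stream $\mathrm{Adv}_w$; assuming the implementation follows the usual mean-subtracted aggregation of Wang et al. (2016), the Q-values are recombined as

\[
Q_w(s,a)=V_w(s)+\mathrm{Adv}_w(s,a)-\frac{1}{|A|}\sum_{a'\in A}\mathrm{Adv}_w(s,a').
\]

Since the same Q-functions remain representable and only the parameterization changes, results comparable to plain DQN are not surprising.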
|
|
|
|
\item [4.] \textbf{Prioritized Experience Replay}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=1.0\textwidth]{./runs/PER/results.png}
|
|
\caption{Prioritized Experience Replay. Used on its own, it makes training less stable, and the loss is significantly higher than with the previous methods.}
|
|
\end{figure}
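Assuming the prioritized replay here uses the standard proportional variant of Schaul et al. (2016), each stored transition $i$ is sampled with a probability determined by its TD error $\delta_i$, and its loss contribution is scaled by an importance-sampling weight $\omega_i$:

\[
p_i=|\delta_i|+\epsilon,\qquad
P(i)=\frac{p_i^{\alpha}}{\sum_k p_k^{\alpha}},\qquad
\omega_i=\frac{\left(N\,P(i)\right)^{-\beta}}{\max_j \left(N\,P(j)\right)^{-\beta}},
\]

where $N$ is the buffer size and $\alpha,\beta,\epsilon$ are PER hyperparameters (this $\alpha$ is unrelated to the learning rate in \ref{1}). The non-uniform sampling concentrates updates on high-error transitions, which is a plausible source of the noisier, higher loss observed above.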
|
|
|
|
\item [5.] \textbf{N-Step Experience Replay}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=1.0\textwidth]{./runs/NStep/results.png}
|
|
\caption{N-Step Experience Replay. So far the most stable training method, especially when the replay buffer is large. However, when the buffer is too small (typically $\le 70$), training may not converge to optimal performance.}
|
|
\end{figure}
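For reference, the $n$-step variant replaces the one-step bootstrap target with a truncated $n$-step return (written here with target-network parameters $w^-$, following the usual formulation):

\[
y_t^{(n)}=\sum_{k=0}^{n-1}\gamma^{k}r_{t+k}+\gamma^{n}\max_{a'\in A}Q_{w^-}(s_{t+n},a').
\]

Each update propagates $n$ rewards at once and leans less on the bootstrapped estimate, which is consistent with the more stable training observed here.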
|
|
|
|
\item [6.] \textbf{N-Step + PER}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=1.0\textwidth]{./runs/NStep + PER/results.png}
|
|
\caption{N-Step + PER. Combining the two methods counters the unstable loss observed when training with PER alone.}
|
|
\end{figure}
|
|
|
|
\item [7.] \textbf{Noisy DQN}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=1.0\textwidth]{./runs/Noisy DQN/results.png}
|
|
\caption{Noisy DQN. The experiment with $\sigma = 0.017$ gives results comparable to vanilla DQN; stability issues persist when $\sigma$ is too large.}
|
|
\end{figure}
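Assuming the noisy layers follow Fortunato et al. (2018), each linear layer replaces its deterministic weights with learned Gaussian perturbations, and the $\sigma$ quoted in the caption refers to the initialization scale $\sigma_0$ of the noise parameters:

\[
y=\left(\mu^{W}+\sigma^{W}\odot\varepsilon^{W}\right)x+\left(\mu^{b}+\sigma^{b}\odot\varepsilon^{b}\right),
\]

where $\mu^{W},\sigma^{W},\mu^{b},\sigma^{b}$ are learned and $\varepsilon^{W},\varepsilon^{b}$ are freshly sampled noise. A larger $\sigma_0$ injects more exploration noise into every forward pass, which matches the instability seen for large $\sigma$.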
|
|
|
|
\end{enumerate}
|
|
|
|
\newpage
|
|
|
|
\item [3.] \textbf{Any other findings}
|
|
|
|
I implemented the extra-credit Noisy DQN; helpful commands for running it are in \texttt{./commands/4.8.sh}. I found that when $\sigma$ is too large (for example, $\sigma=0.5$), the model may not converge to optimal performance. Intuitively, the noisy linear layers should make exploration more robust, but the effect was not as pronounced as expected.
|
|
|
|
\end{enumerate}
|
|
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
%% Actual math ends here. Don't put any content below the \end{document} line.
|
|
%%
|
|
|
|
\end{document}