Merge branch 'main' of https://github.com/Trance-0/NoteNextra
This commit is contained in:
@@ -198,20 +198,20 @@ $$
|
|||||||
|
|
||||||
Take the softmax policy as an example:
|
Take the softmax policy as an example:
|
||||||
|
|
||||||
Weight actions using the linear combination of features $\phi(s,a)^T\theta$:
|
Weight actions using the linear combination of features $\phi(s,a)^\top\theta$:
|
||||||
|
|
||||||
Probability of action is proportional to the exponentiated weights:
|
Probability of action is proportional to the exponentiated weights:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\pi_\theta(s,a) \propto \exp(\phi(s,a)^T\theta)
|
\pi_\theta(s,a) \propto \exp(\phi(s,a)^\top\theta)
|
||||||
$$
|
$$
|
||||||
|
|
||||||
The score function is
|
The score function is
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\begin{aligned}
|
\begin{aligned}
|
||||||
\nabla_\theta \ln\left[\frac{\exp(\phi(s,a)^T\theta)}{\sum_{a'\in A}\exp(\phi(s,a')^T\theta)}\right] &= \nabla_\theta(\ln \exp(\phi(s,a)^T\theta) - (\ln \sum_{a'\in A}\exp(\phi(s,a')^T\theta))) \\
|
\nabla_\theta \ln\left[\frac{\exp(\phi(s,a)^\top\theta)}{\sum_{a'\in A}\exp(\phi(s,a')^\top\theta)}\right] &= \nabla_\theta(\ln \exp(\phi(s,a)^\top\theta) - (\ln \sum_{a'\in A}\exp(\phi(s,a')^\top\theta))) \\
|
||||||
&= \phi(s,a) -\frac{\sum_{a'\in A}\phi(s,a')\exp(\phi(s,a')^T\theta)}{\sum_{a'\in A}\exp(\phi(s,a')^T\theta)} \\
|
&= \phi(s,a) -\frac{\sum_{a'\in A}\phi(s,a')\exp(\phi(s,a')^\top\theta)}{\sum_{a'\in A}\exp(\phi(s,a')^\top\theta)} \\
|
||||||
&=\phi(s,a) - \sum_{a'\in A} \pi_\theta(s,a') \phi(s,a') \\
|
&=\phi(s,a) - \sum_{a'\in A} \pi_\theta(s,a') \phi(s,a') \\
|
||||||
&= \phi(s,a) - \mathbb{E}_{a'\sim \pi_\theta(s,\cdot)}[\phi(s,a')]
|
&= \phi(s,a) - \mathbb{E}_{a'\sim \pi_\theta(s,\cdot)}[\phi(s,a')]
|
||||||
\end{aligned}
|
\end{aligned}
|
||||||
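A minimal NumPy sketch of this score function (the feature map `phi`, its dimension, and the discrete action set are illustrative placeholders, not from the lecture):

```python
import numpy as np

def softmax_score(phi, s, a, actions, theta):
    """Score of a softmax policy: phi(s,a) - E_{a'~pi_theta(s,.)}[phi(s,a')]."""
    feats = np.stack([phi(s, b) for b in actions])  # |A| x d feature matrix
    logits = feats @ theta
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()                            # pi_theta(s, .)
    return phi(s, a) - probs @ feats                # subtract the expected feature
```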
@@ -221,7 +221,7 @@ $$
|
|||||||
|
|
||||||
In continuous action spaces, a Gaussian policy is natural
|
In continuous action spaces, a Gaussian policy is natural
|
||||||
|
|
||||||
Mean is a linear combination of state features $\mu(s) = \phi(s)^T\theta$
|
Mean is a linear combination of state features $\mu(s) = \phi(s)^\top\theta$
|
||||||
|
|
||||||
Variance may be fixed at $\sigma^2$, or can also be parametrized
|
Variance may be fixed at $\sigma^2$, or can also be parametrized
|
||||||
|
|
||||||
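For the fixed-variance case the score works out to $\nabla_\theta \log\pi_\theta(s,a) = (a-\mu(s))\phi(s)/\sigma^2$; a matching sketch (names are illustrative):

```python
import numpy as np

def gaussian_score(phi_s, a, theta, sigma=1.0):
    """Score of a Gaussian policy with mean mu(s) = phi(s)^T theta, fixed sigma."""
    mu = phi_s @ theta
    return (a - mu) * phi_s / sigma**2
```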
|
|||||||
@@ -53,7 +53,7 @@ $$
|
|||||||
Action-Value Actor-Critic
|
Action-Value Actor-Critic
|
||||||
|
|
||||||
- Simple actor-critic algorithm based on action-value critic
|
- Simple actor-critic algorithm based on action-value critic
|
||||||
- Using linear value function approximation $Q_w(s,a)=\phi(s,a)^T w$
|
- Using linear value function approximation $Q_w(s,a)=\phi(s,a)^\top w$
|
||||||
|
|
||||||
Critic: updates $w$ by linear $TD(0)$
|
Critic: updates $w$ by linear $TD(0)$
|
||||||
Actor: updates $\theta$ by policy gradient
|
Actor: updates $\theta$ by policy gradient
|
||||||
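A sketch of one episode of this loop, assuming a gym-like `env` and policy helpers (`sample_action`, `score`) such as the softmax sketch above — all placeholder names, not the lecture's code:

```python
import numpy as np

def qac_episode(env, phi, sample_action, score, theta, w,
                alpha_w=0.01, alpha_t=0.001, gamma=0.99):
    """One episode of action-value actor-critic with Q_w(s,a) = phi(s,a)^T w."""
    s = env.reset()
    a = sample_action(s, theta)
    done = False
    while not done:
        s2, r, done = env.step(a)        # assumed to return (next_state, reward, done)
        a2 = sample_action(s2, theta)
        q = phi(s, a) @ w
        q2 = 0.0 if done else phi(s2, a2) @ w
        w += alpha_w * (r + gamma * q2 - q) * phi(s, a)   # critic: linear TD(0)
        theta += alpha_t * q * score(s, a, theta)         # actor: policy gradient
        s, a = s2, a2
    return theta, w
```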
|
|||||||
@@ -193,7 +193,7 @@ $$
|
|||||||
|
|
||||||
Make linear approximation to $L_{\pi_{\theta_{old}}}$ and quadratic approximation to KL term.
|
Make linear approximation to $L_{\pi_{\theta_{old}}}$ and quadratic approximation to KL term.
|
||||||
|
|
||||||
Maximize $g\cdot(\theta-\theta_{old})-\frac{\beta}{2}(\theta-\theta_{old})^T F(\theta-\theta_{old})$
|
Maximize $g\cdot(\theta-\theta_{old})-\frac{\beta}{2}(\theta-\theta_{old})^\top F(\theta-\theta_{old})$
|
||||||
|
|
||||||
where $g=\frac{\partial}{\partial \theta}L_{\pi_{\theta_{old}}}(\pi_{\theta})\vert_{\theta=\theta_{old}}$ and $F=\frac{\partial^2}{\partial \theta^2}\overline{KL}_{\pi_{\theta_{old}}}(\pi_{\theta})\vert_{\theta=\theta_{old}}$
|
where $g=\frac{\partial}{\partial \theta}L_{\pi_{\theta_{old}}}(\pi_{\theta})\vert_{\theta=\theta_{old}}$ and $F=\frac{\partial^2}{\partial \theta^2}\overline{KL}_{\pi_{\theta_{old}}}(\pi_{\theta})\vert_{\theta=\theta_{old}}$
|
||||||
|
|
||||||
@@ -201,7 +201,7 @@ where $g=\frac{\partial}{\partial \theta}L_{\pi_{\theta_{old}}}(\pi_{\theta})\ve
|
|||||||
<summary>Taylor Expansion of KL Term</summary>
|
<summary>Taylor Expansion of KL Term</summary>
|
||||||
|
|
||||||
$$
|
$$
|
||||||
D_{KL}(\pi_{\theta_{old}}|\pi_{\theta})\approx D_{KL}(\pi_{\theta_{old}}|\pi_{\theta_{old}})+d^T \nabla_\theta D_{KL}(\pi_{\theta_{old}}|\pi_{\theta})\vert_{\theta=\theta_{old}}+\frac{1}{2}d^T \nabla_\theta^2 D_{KL}(\pi_{\theta_{old}}|\pi_{\theta})\vert_{\theta=\theta_{old}}d
|
D_{KL}(\pi_{\theta_{old}}|\pi_{\theta})\approx D_{KL}(\pi_{\theta_{old}}|\pi_{\theta_{old}})+d^\top \nabla_\theta D_{KL}(\pi_{\theta_{old}}|\pi_{\theta})\vert_{\theta=\theta_{old}}+\frac{1}{2}d^\top \nabla_\theta^2 D_{KL}(\pi_{\theta_{old}}|\pi_{\theta})\vert_{\theta=\theta_{old}}d
|
||||||
$$
|
$$
|
||||||
|
|
||||||
$$
|
$$
|
||||||
@@ -220,9 +220,9 @@ $$
|
|||||||
\begin{aligned}
|
\begin{aligned}
|
||||||
\nabla_\theta^2 D_{KL}(\pi_{\theta_{old}}|\pi_{\theta})\vert_{\theta=\theta_{old}}&=-\mathbb{E}_{x\sim \pi_{\theta_{old}}}\nabla_\theta^2 \log P_\theta(x)\vert_{\theta=\theta_{old}}\\
|
\nabla_\theta^2 D_{KL}(\pi_{\theta_{old}}|\pi_{\theta})\vert_{\theta=\theta_{old}}&=-\mathbb{E}_{x\sim \pi_{\theta_{old}}}\nabla_\theta^2 \log P_\theta(x)\vert_{\theta=\theta_{old}}\\
|
||||||
&=-\mathbb{E}_{x\sim \pi_{\theta_{old}}}\nabla_\theta \left(\frac{\nabla_\theta P_\theta(x)}{P_\theta(x)}\right)\vert_{\theta=\theta_{old}}\\
|
&=-\mathbb{E}_{x\sim \pi_{\theta_{old}}}\nabla_\theta \left(\frac{\nabla_\theta P_\theta(x)}{P_\theta(x)}\right)\vert_{\theta=\theta_{old}}\\
|
||||||
&=-\mathbb{E}_{x\sim \pi_{\theta_{old}}}\left(\frac{\nabla_\theta^2 P_\theta(x)-\nabla_\theta P_\theta(x)\nabla_\theta P_\theta(x)^T}{P_\theta(x)^2}\right)\vert_{\theta=\theta_{old}}\\
|
&=-\mathbb{E}_{x\sim \pi_{\theta_{old}}}\left(\frac{\nabla_\theta^2 P_\theta(x)-\nabla_\theta P_\theta(x)\nabla_\theta P_\theta(x)^\top}{P_\theta(x)^2}\right)\vert_{\theta=\theta_{old}}\\
|
||||||
&=-\mathbb{E}_{x\sim \pi_{\theta_{old}}}\left(\frac{\nabla_\theta^2 P_\theta(x)\vert_{\theta=\theta_{old}}}{P_{\theta_{old}}(x)}\right)+\mathbb{E}_{x\sim \pi_{\theta_{old}}}\left(\nabla_\theta \log P_\theta(x)\nabla_\theta \log P_\theta(x)^T\right)\vert_{\theta=\theta_{old}}\\
|
&=-\mathbb{E}_{x\sim \pi_{\theta_{old}}}\left(\frac{\nabla_\theta^2 P_\theta(x)\vert_{\theta=\theta_{old}}}{P_{\theta_{old}}(x)}\right)+\mathbb{E}_{x\sim \pi_{\theta_{old}}}\left(\nabla_\theta \log P_\theta(x)\nabla_\theta \log P_\theta(x)^\top\right)\vert_{\theta=\theta_{old}}\\
|
||||||
&=\mathbb{E}_{x\sim \pi_{\theta_{old}}}\nabla_\theta\log P_\theta(x)\nabla_\theta\log P_\theta(x)^T\vert_{\theta=\theta_{old}}\\
|
&=\mathbb{E}_{x\sim \pi_{\theta_{old}}}\nabla_\theta\log P_\theta(x)\nabla_\theta\log P_\theta(x)^\top\vert_{\theta=\theta_{old}}\\
|
||||||
\end{aligned}
|
\end{aligned}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ $\theta_{new}=\theta_{old}+d$
|
|||||||
First order Taylor expansion for the loss and second order for the KL:
|
First order Taylor expansion for the loss and second order for the KL:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\approx \arg\max_{d} J(\theta_{old})+\nabla_\theta J(\theta)\mid_{\theta=\theta_{old}}d-\frac{1}{2}\lambda(d^T\nabla_\theta^2 D_{KL}\left[\pi_{\theta_{old}}||\pi_{\theta}\right]\mid_{\theta=\theta_{old}}d)+\lambda \delta
|
\approx \arg\max_{d} J(\theta_{old})+\nabla_\theta J(\theta)\mid_{\theta=\theta_{old}}d-\frac{1}{2}\lambda(d^\top\nabla_\theta^2 D_{KL}\left[\pi_{\theta_{old}}||\pi_{\theta}\right]\mid_{\theta=\theta_{old}}d)+\lambda \delta
|
||||||
$$
|
$$
|
||||||
|
|
||||||
If you are really interested, try to fill in the Solving the KL-Constrained Problem section.
|
If you are really interested, try to fill in the Solving the KL-Constrained Problem section.
|
||||||
@@ -38,7 +38,7 @@ Setting the gradient to zero:
|
|||||||
|
|
||||||
$$
|
$$
|
||||||
\begin{aligned}
|
\begin{aligned}
|
||||||
0&=\frac{\partial}{\partial d}\left(-\nabla_\theta J(\theta)\mid_{\theta=\theta_{old}}d+\frac{1}{2}\lambda d^T F(\theta_{old})d\right)\\
|
0&=\frac{\partial}{\partial d}\left(-\nabla_\theta J(\theta)\mid_{\theta=\theta_{old}}d+\frac{1}{2}\lambda d^\top F(\theta_{old})d\right)\\
|
||||||
&=-\nabla_\theta J(\theta)\mid_{\theta=\theta_{old}}+\lambda F(\theta_{old})d
|
&=-\nabla_\theta J(\theta)\mid_{\theta=\theta_{old}}+\lambda F(\theta_{old})d
|
||||||
\end{aligned}
|
\end{aligned}
|
||||||
$$
|
$$
|
||||||
@@ -58,15 +58,15 @@ $$
|
|||||||
$$
|
$$
|
||||||
|
|
||||||
$$
|
$$
|
||||||
D_{KL}(\pi_{\theta_{old}}||\pi_{\theta})\approx \frac{1}{2}(\theta-\theta_{old})^T F(\theta_{old})(\theta-\theta_{old})
|
D_{KL}(\pi_{\theta_{old}}||\pi_{\theta})\approx \frac{1}{2}(\theta-\theta_{old})^\top F(\theta_{old})(\theta-\theta_{old})
|
||||||
$$
|
$$
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\frac{1}{2}(\alpha g_N)^T F(\alpha g_N)=\delta
|
\frac{1}{2}(\alpha g_N)^\top F(\alpha g_N)=\delta
|
||||||
$$
|
$$
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\alpha=\sqrt{\frac{2\delta}{g_N^T F g_N}}
|
\alpha=\sqrt{\frac{2\delta}{g_N^\top F g_N}}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
However, due to the quadratic approximation, the KL constraint may be violated.
|
However, due to the quadratic approximation, the KL constraint may be violated.
|
||||||
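A sketch of the resulting update (here with an exact solve; TRPO itself uses conjugate gradient to avoid forming $F^{-1}$ — the helper name is illustrative):

```python
import numpy as np

def natural_step(g, F, delta=0.01):
    """Step under the quadratic KL approximation:
    g_N = F^{-1} g, then d = alpha * g_N with alpha = sqrt(2*delta / (g_N^T F g_N))."""
    g_N = np.linalg.solve(F, g)       # TRPO approximates this with conjugate gradient
    alpha = np.sqrt(2 * delta / (g_N @ F @ g_N))
    return alpha * g_N
```

Because the quadratic approximation can overshoot the true KL constraint, TRPO follows this step with a backtracking line search, shrinking the step until the exact constraint and a surrogate improvement hold.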
|
|||||||
@@ -16,7 +16,7 @@ So we can learn $f(s_t,a_t)$ from data, and _then_ plan through it.
|
|||||||
|
|
||||||
Model-based reinforcement learning version **0.5**:
|
Model-based reinforcement learning version **0.5**:
|
||||||
|
|
||||||
1. Run base policy $\pi_0$ (e.g. random policy) to collect $\mathcal{D} = \{(s_t, a_t, s_{t+1})\}_{t=0}^T$
|
1. Run base policy $\pi_0$ (e.g. random policy) to collect $\mathcal{D} = \{(s_t, a_t, s_{t+1})\}_{t=0}^T$
|
||||||
2. Learn dynamics model $f(s_t,a_t)$ to minimize $\sum_{i}\|f(s_i,a_i)-s_{i+1}\|^2$
|
2. Learn dynamics model $f(s_t,a_t)$ to minimize $\sum_{i}\|f(s_i,a_i)-s_{i+1}\|^2$
|
||||||
3. Plan through $f(s_t,a_t)$ to choose action $a_t$
|
3. Plan through $f(s_t,a_t)$ to choose action $a_t$
|
||||||
|
|
||||||
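A sketch of this loop, assuming a gym-like `env` and placeholder `fit_model`/`plan` helpers (none of these names come from the lecture):

```python
def mbrl_v05(env, fit_model, plan, n_steps=1000):
    """Model-based RL v0.5: collect with a random policy, fit f, then plan through f."""
    D, s = [], env.reset()
    for _ in range(n_steps):                 # 1. collect with a random base policy
        a = env.action_space.sample()
        s2, r, done = env.step(a)            # assumed to return (next_state, reward, done)
        D.append((s, a, s2))
        s = env.reset() if done else s2
    f = fit_model(D)                         # 2. minimize sum_i ||f(s_i,a_i) - s_{i+1}||^2
    return lambda state: plan(f, state)      # 3. plan through f to choose a_t
```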
@@ -52,10 +52,10 @@ Version 2.0: backpropagate directly into policy
|
|||||||
|
|
||||||
Final version:
|
Final version:
|
||||||
|
|
||||||
1. Run base policy $\pi_0$ (e.g. random policy) to collect $\mathcal{D} = \{(s_t, a_t, s_{t+1})\}_{t=0}^T$
|
1. Run base policy $\pi_0$ (e.g. random policy) to collect $\mathcal{D} = \{(s_t, a_t, s_{t+1})\}_{t=0}^T$
|
||||||
2. Learn dynamics model $f(s_t,a_t)$ to minimize $\sum_{i}\|f(s_i,a_i)-s_{i+1}\|^2$
|
2. Learn dynamics model $f(s_t,a_t)$ to minimize $\sum_{i}\|f(s_i,a_i)-s_{i+1}\|^2$
|
||||||
3. Backpropagate through $f(s_t,a_t)$ into the policy to optimize $\pi_\theta(s_t,a_t)$
|
3. Backpropagate through $f(s_t,a_t)$ into the policy to optimize $\pi_\theta(s_t,a_t)$
|
||||||
4. Run the policy $\pi_\theta(s_t,a_t)$ to collect $\mathcal{D} = \{(s_t, a_t, s_{t+1})\}_{t=0}^T$
|
4. Run the policy $\pi_\theta(s_t,a_t)$ to collect $\mathcal{D} = \{(s_t, a_t, s_{t+1})\}_{t=0}^T$
|
||||||
5. Goto 2
|
5. Goto 2
|
||||||
|
|
||||||
## Model Learning with High-Dimensional Observations
|
## Model Learning with High-Dimensional Observations
|
||||||
|
|||||||
143
content/CSE510/CSE510_L20.md
Normal file
143
content/CSE510/CSE510_L20.md
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
# CSE510 Deep Reinforcement Learning (Lecture 20)
|
||||||
|
|
||||||
|
## Exploration in RL
|
||||||
|
|
||||||
|
### Motivations
|
||||||
|
|
||||||
|
#### Exploration vs. Exploitation Dilemma
|
||||||
|
|
||||||
|
Online decision-making involves a fundamental choice:
|
||||||
|
|
||||||
|
- Exploration: trying out new things (new behaviors), with the hope of discovering higher rewards
|
||||||
|
- Exploitation: doing what you know will yield the highest reward
|
||||||
|
|
||||||
|
The best long-term strategy may involve short-term sacrifices
|
||||||
|
|
||||||
|
Gather enough knowledge early to make the best long-term decisions
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Example</summary>
|
||||||
|
Restaurant Selection
|
||||||
|
|
||||||
|
- Exploitation: Go to your favorite restaurant
|
||||||
|
- Exploration: Try a new restaurant
|
||||||
|
|
||||||
|
Oil Drilling
|
||||||
|
|
||||||
|
- Exploitation: Drill at the best known location
|
||||||
|
- Exploration: Drill at a new location
|
||||||
|
|
||||||
|
Game Playing
|
||||||
|
|
||||||
|
- Exploitation: Play the move you believe is best
|
||||||
|
- Exploration: Play an experimental move
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
#### Breakout vs. Montezuma's Revenge
|
||||||
|
|
||||||
|
| Property | Breakout | Montezuma's Revenge |
|
||||||
|
|----------|----------|--------------------|
|
||||||
|
| **Reward frequency** | Dense (every brick hit gives points) | Extremely sparse (only after collecting key or treasure) |
|
||||||
|
| **State space** | Simple (ball, paddle, bricks) | Complex (many rooms, objects, ladders, timing) |
|
||||||
|
| **Action relevance** | Almost any action affects reward soon | Most actions have no immediate feedback |
|
||||||
|
| **Exploration depth** | Shallow (few steps to reward) | Deep (dozens/hundreds of steps before reward) |
|
||||||
|
| **Determinism** | Mostly deterministic dynamics | Deterministic but requires long sequences of precise actions |
|
||||||
|
| **Credit assignment** | Easy — short time gap | Very hard — long delay from cause to effect |
|
||||||
|
|
||||||
|
#### Motivation
|
||||||
|
|
||||||
|
- Motivation: "Forces" that energize an organism to act and that direct its activity
|
||||||
|
- Extrinsic Motivation: being motivated to do something because of some external reward ($, a prize, food, water, etc.)
|
||||||
|
- Intrinsic Motivation: being motivated to do something because it is inherently enjoyable (curiosity, exploration, novelty, surprise, incongruity, complexity…)
|
||||||
|
|
||||||
|
### Intuitive Exploration Strategy
|
||||||
|
|
||||||
|
- Intrinsic motivation drives the exploration for unknowns
|
||||||
|
- Intuitively, we explore efficiently once we know what we do not know, and target our exploration efforts to the unknown part of the space.
|
||||||
|
- All non-naive exploration methods consider some form of uncertainty estimation, regarding the states (or state-actions) visited, the transition dynamics, or the Q-functions.
|
||||||
|
|
||||||
|
- Methods that are optimal in small settings don't work directly in large ones, but can inspire methods for larger settings
|
||||||
|
- May use some hacks
|
||||||
|
|
||||||
|
### Classes of Exploration Methods in Deep RL
|
||||||
|
|
||||||
|
- Optimistic exploration
|
||||||
|
- Uncertainty about states
|
||||||
|
- Visiting novel states (state visitation counting)
|
||||||
|
- Information state search
|
||||||
|
- Uncertainty about state transitions or dynamics
|
||||||
|
- Dynamics prediction error or Information gain for dynamics learning
|
||||||
|
- Posterior sampling
|
||||||
|
- Uncertainty about Q-value functions or policies
|
||||||
|
- Selecting actions according to the probability they are best
|
||||||
|
|
||||||
|
### Optimistic Exploration
|
||||||
|
|
||||||
|
#### Count-Based Exploration in Small MDPs
|
||||||
|
|
||||||
|
Book-keep state visitation counts $N(s)$
|
||||||
|
Add exploration reward bonuses that encourage policies that visit states with fewer counts.
|
||||||
|
|
||||||
|
$$
|
||||||
|
R(s,a,s') = r(s,a,s') + \mathcal{B}(N(s))
|
||||||
|
$$
|
||||||
|
|
||||||
|
where $\mathcal{B}(N(s))$ is the intrinsic exploration reward bonus.
|
||||||
|
|
||||||
|
- UCB: $\mathcal{B}(N(s)) = \sqrt{\frac{2\ln n}{N(s)}}$ (more aggressive exploration)
|
||||||
|
- MBIE-EB (Strehl & Littman): $\mathcal{B}(N(s)) = \sqrt{\frac{1}{N(s)}}$
|
||||||
|
- BEB (Kolter & Ng): $\mathcal{B}(N(s)) = \frac{1}{N(s)}$
|
||||||
|
|
||||||
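A sketch of reward shaping with these bonuses (a plain counter of tabular states; states must be hashable, and all names are illustrative):

```python
import math
from collections import Counter

N = Counter()   # state -> visit count N(s)

def shaped_reward(r, s, n, kind="MBIE-EB"):
    """Return r + B(N(s)) using the bonuses listed above; n is the total step count."""
    N[s] += 1
    if kind == "UCB":
        return r + math.sqrt(2 * math.log(n) / N[s])
    if kind == "MBIE-EB":
        return r + math.sqrt(1 / N[s])
    return r + 1 / N[s]   # BEB
```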
|
- We want to come up with something that rewards states that we have not visited often.
|
||||||
|
- But in large MDPs, we rarely visit a state twice!
|
||||||
|
- We need to capture a notion of state similarity, and reward states that are most dissimilar to what we have seen so far
|
||||||
|
- as opposed to different (as they will always be different).
|
||||||
|
|
||||||
|
#### Fitting Generative Models
|
||||||
|
|
||||||
|
Idea: fit a density model $p_\theta(s)$ (or $p_\theta(s,a)$)
|
||||||
|
|
||||||
|
$p_\theta(s)$ might be high even for a new $s$.
|
||||||
|
|
||||||
|
If $s$ is similar to previously seen states, can we use $p_\theta(s)$ to get a "pseudo-count" for $s$?
|
||||||
|
|
||||||
|
In a small MDP, the true visit probability is
|
||||||
|
|
||||||
|
$$
|
||||||
|
P(s)=\frac{N(s)}{n}
|
||||||
|
$$
|
||||||
|
|
||||||
|
where $N(s)$ is the number of times $s$ has been visited and $n$ is the total number of states visited.
|
||||||
|
|
||||||
|
after we visit $s$, then
|
||||||
|
|
||||||
|
$$
|
||||||
|
P'(s)=\frac{N(s)+1}{n+1}
|
||||||
|
$$
|
||||||
|
|
||||||
|
1. fit model $p_\theta(s)$ to all states $\mathcal{D}$ so far.
|
||||||
|
2. take a step $i$ and observe $s_i$.
|
||||||
|
3. fit new model $p_\theta'(s)$ to all states $\mathcal{D} \cup \{s_i\}$.
|
||||||
|
4. use $p_\theta(s_i)$ and $p_\theta'(s_i)$ to estimate the "pseudo-count" $\hat{N}(s_i)$.
|
||||||
|
5. set $r_i^+=r_i+\mathcal{B}(\hat{N}(s_i))$
|
||||||
|
6. go to 1
|
||||||
|
|
||||||
|
How do we get $\hat{N}(s_i)$? Use the equations
|
||||||
|
|
||||||
|
$$
|
||||||
|
p_\theta(s_i)=\frac{\hat{N}(s_i)}{\hat{n}}\quad p_\theta'(s_i)=\frac{\hat{N}(s_i)+1}{\hat{n}+1}
|
||||||
|
$$
|
||||||
|
|
||||||
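Solving the two equations gives $\hat{n}=\frac{1-p_\theta'(s_i)}{p_\theta'(s_i)-p_\theta(s_i)}$ and $\hat{N}(s_i)=p_\theta(s_i)\,\hat{n}$. A small sketch (hypothetical helper name):

```python
def pseudo_count(p, p_new):
    """Solve p = N/n and p' = (N+1)/(n+1) for the pseudo-count N."""
    n_hat = (1.0 - p_new) / (p_new - p)
    return p * n_hat

assert abs(pseudo_count(2/4, 3/5) - 2.0) < 1e-9   # true counts N=2, n=4 are recovered
```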
|
[link to the paper](https://arxiv.org/pdf/1606.01868)
|
||||||
|
|
||||||
|
#### Density models
|
||||||
|
|
||||||
|
[link to the paper](https://arxiv.org/pdf/1703.01310)
|
||||||
|
|
||||||
|
#### State Counting with DeepHashing
|
||||||
|
|
||||||
|
- We still count states (images), but in a latent compressed space rather than in pixel space.
|
||||||
|
- Compress $s$ into a latent code, then count occurrences of the code.
|
||||||
|
- How do we get the image encoding? e.g., using autoencoders.
|
||||||
|
- There is no guarantee that such a reconstruction loss will capture the important things that make two states similar
|
||||||
@@ -22,4 +22,5 @@ export default {
|
|||||||
CSE510_L17: "CSE510 Deep Reinforcement Learning (Lecture 17)",
|
CSE510_L17: "CSE510 Deep Reinforcement Learning (Lecture 17)",
|
||||||
CSE510_L18: "CSE510 Deep Reinforcement Learning (Lecture 18)",
|
CSE510_L18: "CSE510 Deep Reinforcement Learning (Lecture 18)",
|
||||||
CSE510_L19: "CSE510 Deep Reinforcement Learning (Lecture 19)",
|
CSE510_L19: "CSE510 Deep Reinforcement Learning (Lecture 19)",
|
||||||
|
CSE510_L20: "CSE510 Deep Reinforcement Learning (Lecture 20)",
|
||||||
}
|
}
|
||||||
@@ -40,20 +40,20 @@ Let $G$ and $H$ be the generator and parity-check matrices of (any) linear code
|
|||||||
#### Lemma 1
|
#### Lemma 1
|
||||||
|
|
||||||
$$
|
$$
|
||||||
H G^T = 0
|
H G^\top = 0
|
||||||
$$
|
$$
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>Proof</summary>
|
<summary>Proof</summary>
|
||||||
|
|
||||||
By definition of the generator matrix and parity-check matrix, every row $e_i$ of $H$ satisfies $e_iG^T=0$.
|
By definition of the generator matrix and parity-check matrix, every row $e_i$ of $H$ satisfies $e_iG^\top=0$.
|
||||||
|
|
||||||
So $H G^T = 0$.
|
So $H G^\top = 0$.
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
#### Lemma 2
|
#### Lemma 2
|
||||||
|
|
||||||
Any matrix $M\in \mathbb{F}_q^{(n-k)\times n}$ such that $\operatorname{rank}(M) = n - k$ and $M G^T = 0$ is a parity-check matrix for $C$ (i.e. $C = \ker M$).
|
Any matrix $M\in \mathbb{F}_q^{(n-k)\times n}$ such that $\operatorname{rank}(M) = n - k$ and $M G^\top = 0$ is a parity-check matrix for $C$ (i.e. $C = \ker M$).
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>Proof</summary>
|
<summary>Proof</summary>
|
||||||
@@ -62,7 +62,7 @@ It is sufficient to show that the two statements
|
|||||||
|
|
||||||
1. $\forall c\in C, c=uG, u\in \mathbb{F}^k$
|
1. $\forall c\in C, c=uG, u\in \mathbb{F}^k$
|
||||||
|
|
||||||
$M c^T = M(uG)^T = M(G^T u^T) = 0$ since $M G^T = 0$.
|
$M c^\top = M(uG)^\top = M(G^\top u^\top) = 0$ since $M G^\top = 0$.
|
||||||
|
|
||||||
Thus $C \subseteq \ker M$.
|
Thus $C \subseteq \ker M$.
|
||||||
|
|
||||||
@@ -84,15 +84,15 @@ We proceed by applying the lemma 2.
|
|||||||
|
|
||||||
1. $\operatorname{rank}(H) = n - k$ since $H$ is a Vandermonde matrix times a diagonal matrix with no zero entries, so every $(n-k)\times(n-k)$ submatrix of $H$ is invertible.
|
1. $\operatorname{rank}(H) = n - k$ since $H$ is a Vandermonde matrix times a diagonal matrix with no zero entries, so every $(n-k)\times(n-k)$ submatrix of $H$ is invertible.
|
||||||
|
|
||||||
2. $H G^T = 0$.
|
2. $H G^\top = 0$.
|
||||||
|
|
||||||
note that $\forall$ row $i$ of $H$, $0\leq i\leq n-k-1$, $\forall$ column $j$ of $G^T$, $0\leq j\leq k-1$
|
note that $\forall$ row $i$ of $H$, $0\leq i\leq n-k-1$, $\forall$ column $j$ of $G^\top$, $0\leq j\leq k-1$
|
||||||
|
|
||||||
So
|
So
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\begin{aligned}
|
\begin{aligned}
|
||||||
H G^T &= \begin{bmatrix}
|
H G^\top &= \begin{bmatrix}
|
||||||
1 & 1 & \cdots & 1\\
|
1 & 1 & \cdots & 1\\
|
||||||
\alpha_1 & \alpha_2 & \cdots & \alpha_n\\
|
\alpha_1 & \alpha_2 & \cdots & \alpha_n\\
|
||||||
\alpha_1^2 & \alpha_2^2 & \cdots & \alpha_n^2\\
|
\alpha_1^2 & \alpha_2^2 & \cdots & \alpha_n^2\\
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ $$
|
|||||||
|
|
||||||
Let $\mathcal{C}=[n,k,d]_q$.
|
Let $\mathcal{C}=[n,k,d]_q$.
|
||||||
|
|
||||||
The dual code of $\mathcal{C}$ is $\mathcal{C}^\perp=\{x\in \mathbb{F}^n_q|xc^T=0\text{ for all }c\in \mathcal{C}\}$.
|
The dual code of $\mathcal{C}$ is $\mathcal{C}^\perp=\{x\in \mathbb{F}^n_q|xc^\top=0\text{ for all }c\in \mathcal{C}\}$.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>Example</summary>
|
<summary>Example</summary>
|
||||||
@@ -151,7 +151,7 @@ So $\langle f,h\rangle=0$.
|
|||||||
<details>
|
<details>
|
||||||
<summary>Proof for the theorem</summary>
|
<summary>Proof for the theorem</summary>
|
||||||
|
|
||||||
Recall that the dual code $\operatorname{RM}(r,m)^\perp=\{x\in \mathbb{F}_2^{2^m}|xc^T=0\text{ for all }c\in \operatorname{RM}(r,m)\}$.
|
Recall that the dual code $\operatorname{RM}(r,m)^\perp=\{x\in \mathbb{F}_2^{2^m}|xc^\top=0\text{ for all }c\in \operatorname{RM}(r,m)\}$.
|
||||||
|
|
||||||
So $\operatorname{RM}(m-r-1,m)\subseteq \operatorname{RM}(r,m)^\perp$.
|
So $\operatorname{RM}(m-r-1,m)\subseteq \operatorname{RM}(r,m)^\perp$.
|
||||||
|
|
||||||
|
|||||||
@@ -230,7 +230,7 @@ Step 1: Arrange the $B=\binom{k+1}{2}+k(d-k)$ symbols in a matrix $M$ follows:
|
|||||||
$$
|
$$
|
||||||
M=\begin{pmatrix}
|
M=\begin{pmatrix}
|
||||||
S & T\\
|
S & T\\
|
||||||
T^T & 0
|
T^\top & 0
|
||||||
\end{pmatrix}\in \mathbb{F}_q^{d\times d}
|
\end{pmatrix}\in \mathbb{F}_q^{d\times d}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
@@ -267,15 +267,15 @@ Repair from (any) nodes $H = \{h_1, \ldots, h_d\}$.
|
|||||||
|
|
||||||
Newcomer contacts each $h_j$: “My name is $i$, and I’m lost.”
|
Newcomer contacts each $h_j$: “My name is $i$, and I’m lost.”
|
||||||
|
|
||||||
Node $h_j$ sends $c_{h_j}M c_i^T$ (inner product).
|
Node $h_j$ sends $c_{h_j}M c_i^\top$ (inner product).
|
||||||
|
|
||||||
Newcomer assembles $C_H Mc_i^T$.
|
Newcomer assembles $C_H Mc_i^\top$.
|
||||||
|
|
||||||
$C_H$ invertible by construction!
|
$C_H$ invertible by construction!
|
||||||
|
|
||||||
- Recover $Mc_i^T$.
|
- Recover $Mc_i^\top$.
|
||||||
|
|
||||||
- Recover $c_i^T M$ ($M$ is symmetric)
|
- Recover $c_i^\top M$ ($M$ is symmetric)
|
||||||
|
|
||||||
#### Reconstruction on Product-Matrix MBR codes
|
#### Reconstruction on Product-Matrix MBR codes
|
||||||
|
|
||||||
@@ -292,9 +292,9 @@ DC assembles $C_D M$.
|
|||||||
|
|
||||||
$\Psi_D$ invertible by construction.
|
$\Psi_D$ invertible by construction.
|
||||||
|
|
||||||
- DC computes $\Psi_D^{-1}C_DM = (S+\Psi_D^{-1}\Delta_D T^T, T)$
|
- DC computes $\Psi_D^{-1}C_DM = (S+\Psi_D^{-1}\Delta_D T^\top, T)$
|
||||||
- DC obtains $T$.
|
- DC obtains $T$.
|
||||||
- Subtracts $\Psi_D^{-1}\Delta_D T^T$ from $S+\Psi_D^{-1}\Delta_D T^T$ to obtain $S$.
|
- Subtracts $\Psi_D^{-1}\Delta_D T^\top$ from $S+\Psi_D^{-1}\Delta_D T^\top$ to obtain $S$.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>Fill an example here please.</summary>
|
<summary>Fill an example here please.</summary>
|
||||||
|
|||||||
232
content/CSE5313/CSE5313_L19.md
Normal file
232
content/CSE5313/CSE5313_L19.md
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
# CSE5313 Coding and information theory for data science (Lecture 19)
|
||||||
|
|
||||||
|
## Private information retrieval
|
||||||
|
|
||||||
|
### Problem setup
|
||||||
|
|
||||||
|
Premise:
|
||||||
|
|
||||||
|
- Database $X = \{x_1, \ldots, x_m\}$, each $x_i \in \mathbb{F}_q^k$ is a "file" (e.g., medical record).
|
||||||
|
- $X$ is coded $X \mapsto \{y_1, \ldots, y_n\}$, $y_j$ stored at server $j$.
|
||||||
|
- The user (physician) wants $x_i$.
|
||||||
|
- The user sends a query $q_j \sim Q_j$ to server $j$.
|
||||||
|
- Server $j$ responds with $a_j \sim A_j$.
|
||||||
|
|
||||||
|
Decodability:
|
||||||
|
|
||||||
|
- The user can retrieve the file: $H(X_i | A_1, \ldots, A_n) = 0$.
|
||||||
|
|
||||||
|
Privacy:
|
||||||
|
|
||||||
|
- $i$ is seen as $i \sim U = U_{m}$, reflecting server's lack of knowledge.
|
||||||
|
- $i$ must be kept private: $I(Q_j; U) = 0$ for all $j \in [n]$.
|
||||||
|
|
||||||
|
> In short, we want to retrieve $x_i$ from the servers without revealing $i$ to the servers.
|
||||||
|
|
||||||
|
### Private information retrieval from Replicated Databases
|
||||||
|
|
||||||
|
#### Simple case, one server
|
||||||
|
|
||||||
|
Say $n = 1, y_1 = X$.
|
||||||
|
|
||||||
|
- All data is stored in one server.
|
||||||
|
- Simple solution:
|
||||||
|
- $q_1 =$ "send everything".
|
||||||
|
- $a_1 = y_1 = X$.
|
||||||
|
|
||||||
|
Theorem: Information Theoretic PIR with $n = 1$ can only be achieved by downloading the entire database.
|
||||||
|
|
||||||
|
- Can we do better if $n > 1$?
|
||||||
|
|
||||||
|
#### Collusion parameter
|
||||||
|
|
||||||
|
Key question for $n > 1$: Can servers collude?
|
||||||
|
|
||||||
|
- I.e., does server $j$ see any $Q_\ell$, $\ell \neq j$?
|
||||||
|
- Key assumption:
|
||||||
|
- Privacy parameter $z$.
|
||||||
|
- At most $z$ servers can collude.
|
||||||
|
- $z = 1\implies$ No collusion.
|
||||||
|
- Requirement for $z = 1$: $I(Q_j; U) = 0$ for all $j \in [n]$.
|
||||||
|
- Requirement for a general $z$:
|
||||||
|
- $I(Q_\mathcal{T}; U) = 0$ for all $\mathcal{T} \subseteq [n]$, $|\mathcal{T}| \leq z$, where $Q_\mathcal{T} = (Q_\ell)_{\ell \in \mathcal{T}}$.
|
||||||
|
- Motivation:
|
||||||
|
- Interception of communication links.
|
||||||
|
- Data breaches.
|
||||||
|
|
||||||
|
Other assumptions:
|
||||||
|
|
||||||
|
- Computational private information retrieval (even if all the servers are hacked, they still cannot get the information without solving a computationally hard problem):
|
||||||
|
- Non-zero mutual information is allowed
|
||||||
|
|
||||||
|
#### Private information retrieval from 2-replicated databases
|
||||||
|
|
||||||
|
First PIR protocol: Chor et al. FOCS ‘95.
|
||||||
|
|
||||||
|
- The data $X = \{x_1, \ldots, x_m\}$ is replicated on two servers.
|
||||||
|
- $z = 1$, i.e., no collusion.
|
||||||
|
- Protocol: User has $i \sim U_{m}$.
|
||||||
|
- User generates $r \sim U_{\mathbb{F}_q^m}$.
|
||||||
|
- $q_1 = r, q_2 = r + e_i$ ($e_i \in \mathbb{F}_q^m$ is the $i$-th unit vector; $q_2$ is a one-time pad encryption of $e_i$ with key $r$).
|
||||||
|
- $a_j = q_j X^\top = \sum_{\ell \in [m]} q_{j,\ell} x_\ell$
|
||||||
|
- Linear combination of the files according to the query vector $q_j$.
|
||||||
|
- Decoding?
|
||||||
|
- $a_2 - a_1 = (q_2 - q_1) X^\top = e_i X^\top = x_i$.
|
||||||
|
- Download?
|
||||||
|
- $a_j =$ size of file $\implies$ downloading **twice** the size of the file.
|
||||||
|
- Privacy?
|
||||||
|
- Since $z = 1$, need to show $I(U; Q_i) = 0$.
|
||||||
|
- $I(U; Q_1) = I(e_U; r) = 0$ since $U$ and $r$ are independent.
|
||||||
|
- $I(U; Q_2) = I(e_U; r + e_U) = 0$ since this is a one-time pad!
|
||||||
|
|
||||||
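A toy sketch of this protocol over $\mathbb{F}_2$ (database shape, RNG, and function name are illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)

def pir_2_servers(X, i):
    """Chor et al. 2-server PIR over F_2; X is the m x k database, one file per row."""
    m = X.shape[0]
    r = rng.integers(0, 2, m)                 # q_1 = r, uniform over F_2^m
    q2 = (r + np.eye(m, dtype=int)[i]) % 2    # q_2 = r + e_i
    a1, a2 = (r @ X) % 2, (q2 @ X) % 2        # each server returns q_j X
    return (a2 - a1) % 2                      # (q_2 - q_1) X = e_i X = x_i

X = rng.integers(0, 2, (6, 4))                # 6 files of 4 bits each
assert (pir_2_servers(X, 2) == X[2]).all()    # file 2 is retrieved
```

Each server individually sees only a uniformly random vector, so nothing about $i$ leaks.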
|
##### Parameters and notations in PIR
|
||||||
|
|
||||||
|
Parameters of the system:
|
||||||
|
|
||||||
|
- $n =$ # servers (as in storage).
|
||||||
|
- $m =$ # files.
|
||||||
|
- $k =$ size of each file (as in storage).
|
||||||
|
- $z =$ max. collusion (as in secret sharing).
|
||||||
|
- $t =$ # of answers required to obtain $x_i$ (as in secret sharing).
|
||||||
|
- $n - t$ servers are “stragglers”, i.e., might not respond.
|
||||||
|
|
||||||
|
Figures of merit:
|
||||||
|
|
||||||
|
- PIR-rate = $\#$ desired symbols / $\#$ downloaded symbols
|
||||||
|
- PIR-capacity = largest possible rate.
|
||||||
|
|
||||||
|
Notational conventions:
|
||||||
|
|
||||||
|
- The dataset $X = \{x_j\}_{j \in [m]} = \{x_{j, \ell}\}_{(j, \ell) \in [m] \times [k]}$ is seen as a vector in $\mathbb{F}_q^{mk}$.
|
||||||
|
|
||||||
|
- Index $\mathbb{F}_q^{mk}$ using $[m] \times [k]$, i.e., $x_{j, \ell}$ is the $\ell$-th symbol of the $j$-th file.
|
||||||
|
|
||||||
|
#### Private information retrieval from 4-replicated databases
|
||||||
|
|
||||||
|
Consider $n = 4$ replicated servers, file size $k = 2$, collusion $z = 1$.
|
||||||
|
|
||||||
|
Protocol: User has $i \sim U_{m}$.
|
||||||
|
|
||||||
|
- Fix distinct nonzero $\alpha_1, \ldots, \alpha_4 \in \mathbb{F}_q$.
|
||||||
|
- Choose $r \sim U_{\mathbb{F}_q^{2m}}$.
|
||||||
|
- User sends $q_j = e_{i, 1} + \alpha_j e_{i, 2} + \alpha_j^2 r$ to each server $j$.
|
||||||
|
- Server $j$ responds with
|
||||||
|
$$
|
||||||
|
a_j = q_j X^\top = e_{i, 1} X^\top + \alpha_j e_{i, 2} X^\top + \alpha_j^2 r X^\top
|
||||||
|
$$
|
||||||
|
- This is an evaluation at $\alpha_j$ of the polynomial $f_i(w) = x_{i, 1} + x_{i, 2} \cdot w + (r X^\top) \cdot w^2$.
|
||||||
|
- Where $r X^\top$ is a random combination of the entries of $X$.
|
||||||
|
- Decoding?
|
||||||
|
- Any 3 responses suffice to interpolate $f_i$ and obtain $x_i = x_{i, 1}, x_{i, 2}$.
|
||||||
|
- $\implies t = 3$, (one straggler is allowed)
|
||||||
|
- Privacy?
|
||||||
|
- Does $q_j = e_{i, 1} + \alpha_j e_{i, 2} + \alpha_j^2 r$ look familiar?
|
||||||
|
- This is a share in [ramp scheme](CSE5313_L18.md#scheme-2-ramp-secret-sharing-scheme-mceliece-sarwate-scheme) with vector messages $m_1 = e_{i, 1}, m_2 = e_{i, 2}, m_i \in \mathbb{F}_q^{2m}$.
|
||||||
|
- This is equivalent to $2m$ "parallel" ramp scheme over $\mathbb{F}_q$.
|
||||||
|
- Each one reveals nothing to any $z = 1$ shareholders $\implies$ Private!
|
||||||
|
|
||||||
|
### Private information retrieval from general replicated databases
|
||||||
|
|
||||||
|
$n$ servers, $m$ files, file size $k$, $X \in \mathbb{F}_q^{mk}$.
|
||||||
|
|
||||||
|
Server decodes $x_i$ from any $t$ responses.
|
||||||
|
|
||||||
|
Any $\leq z$ servers might collude to infer $i$ ($z < t$).
|
||||||
|
|
||||||
|
Protocol: User has $i \sim U_{m}$.
|
||||||
|
|
||||||
|
- User chooses $r_1, \ldots, r_z \sim U_{\mathbb{F}_q^{mk}}$.
|
||||||
|
- User sends $q_j = \sum_{\ell=1}^k e_{i, \ell} \alpha_j^{\ell-1} + \sum_{\ell=1}^z r_\ell \alpha_j^{k+\ell-1}$ to each server $j$.
|
||||||
|
- Server $j$ responds with $a_j = q_j X^\top = f_i(\alpha_j)$.
|
||||||
|
- $f_i(w) = \sum_{\ell=1}^k e_{i, \ell} X^\top w^{\ell-1} + \sum_{\ell=1}^z r_\ell X^\top w^{k+\ell-1}$ (random combinations of $X$).
|
||||||
|
- Caveat: must have $t = k + z$.
|
||||||
|
- $\implies \deg f_i = k + z - 1 = t - 1$.
|
||||||
|
- Decoding?
|
||||||
|
- Interpolation from any $t$ evaluations of $f_i$.
|
||||||
|
- Privacy?
|
||||||
|
- Against any $z = t - k$ colluding servers, immediate from the proof of the ramp scheme.
|
||||||
|
|
||||||
|
PIR-rate?
|
||||||
|
|
||||||
|
- Each $a_j$ is a single field element.
|
||||||
|
- Download $t = k + z$ elements in $\mathbb{F}_q$ in order to obtain $x_i \in \mathbb{F}_q^k$.
|
||||||
|
- $\implies$ PIR-rate = $\frac{k}{k+z} = \frac{k}{t}$.
|
||||||
|
|
||||||
|
#### Theorem: PIR-capacity for general replicated databases
|
||||||
|
|
||||||
|
The PIR-capacity for $n$ replicated databases with $z$ colluding servers, $n - t$ unresponsive servers, and $m$ files is $C = \frac{1-\frac{z}{t}}{1-(\frac{z}{t})^m}$.
|
||||||
|
|
||||||
|
- When $m \to \infty$, $C \to 1 - \frac{z}{t} = \frac{t-z}{t} = \frac{k}{t}$.
|
||||||
|
- The above scheme achieves PIR-capacity as $m \to \infty$
|
||||||
|
|
||||||
|
### Private information retrieval from coded databases
|
||||||
|
|
||||||
|
#### Problem setup:
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
- $n = 3$ servers, $m$ files $x_j$, $x_j = x_{j, 1}, x_{j, 2}$, $k = 2$, and $q = 2$.
|
||||||
|
- Code each file with a parity code: $x_{j, 1}, x_{j, 2} \mapsto x_{j, 1}, x_{j, 2}, x_{j, 1} + x_{j, 2}$.
|
||||||
|
- Server $j \in 3$ stores all $j$-th symbols of all coded files.
|
||||||
|
|
||||||
|
Queries, answers, decoding, and privacy must be tailored for the code at hand.
|
||||||
|
|
||||||
|
With respect to a code $C$ and parameters $n, k, t, z$, such a scheme is called a coded-PIR scheme.
|
||||||
|
|
||||||
|
- The content for server $j$ is denoted by $c_j = c_{j, 1}, \ldots, c_{j, m}$.
|
||||||
|
- $C$ is usually an MDS code.
|
||||||
|
|
||||||
|
#### Private information retrieval from parity-check codes
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
Say $z = 1$ (no collusion).
|
||||||
|
|
||||||
|
- Protocol: User has $i \sim U_{m}$.
|
||||||
|
- User chooses $r_1, r_2 \sim U_{\mathbb{F}_2^m}$.
|
||||||
|
- Two queries to each server:
|
||||||
|
- $q_{1, 1} = r_1 + e_i$, $q_{1, 2} = r_2$.
|
||||||
|
- $q_{2, 1} = r_1$, $q_{2, 2} = r_2 + e_i$.
|
||||||
|
- $q_{3, 1} = r_1$, $q_{3, 2} = r_2$.
|
||||||
|
- Server $j$ responds with $q_{j, 1} c_j^\top$ and $q_{j, 2} c_j^\top$.
|
||||||
|
- Decoding?
|
||||||
|
- $q_{1, 1} c_1^\top + q_{2, 1} c_2^\top + q_{3, 1} c_3^\top = r_1 (c_1 + c_2 + c_3)^\top + e_i c_1^\top = r_1 \cdot 0^\top + x_{i, 1} = x_{i, 1}$.
|
||||||
|
- $q_{1, 2} c_1^\top + q_{2, 2} c_2^\top + q_{3, 2} c_3^\top = r_2 (c_1 + c_2 + c_3)^\top + e_i c_2^\top = x_{i, 2}$.
|
||||||
|
- Privacy?
|
||||||
|
- Every server sees two uniformly random vectors in $\mathbb{F}_2^m$.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Proof from coding-theoretic interpretation</summary>
|
||||||
|
|
||||||
|
Let $G = [g_1^\top, g_2^\top, g_3^\top]$ be the generator matrix.
|
||||||
|
|
||||||
|
- For every file $x_j = x_{j, 1}, x_{j, 2}$ we encode $x_j G = (x_j g_1^\top, x_j g_2^\top, x_j g_3^\top) = (c_{j, 1}, c_{j, 2}, c_{j, 3})$.
|
||||||
|
- Server $j$ stores $X g_j^\top = (x_1^\top, \ldots, x_m^\top)^\top g_j^\top = (c_{j, 1}, \ldots, c_{j, m})^\top$.
|
||||||
|
|
||||||
|
- By multiplying by $r_1$, the servers together store a codeword in $C$:
|
||||||
|
- $r_1 X g_1^\top, r_1 X g_2^\top, r_1 X g_3^\top = r_1 X G$.
|
||||||
|
- By replacing one of the $r_1$’s by $r_1 + e_i$, we introduce an error in that entry:
|
||||||
|
- $\left((r_1 + e_i) X g_1^\top, r_1 X g_2^\top, r_1 X g_3^\top\right) = r_1 X G + (e_i X g_1^\top, 0,0)$.
|
||||||
|
- Downloading this “erroneous” word from the servers and multiplying by the parity-check matrix $H = [h_1^\top, h_2^\top, h_3^\top]$:
|
||||||
|
|
||||||
|
$$
|
||||||
|
\begin{aligned}
|
||||||
|
\left((r_1 + e_i) X g_1^\top, r_1 X g_2^\top, r_1 X g_3^\top\right) H^\top &= \left(r_1 X G + (e_i X g_1^\top, 0,0)\right) H^\top \\
|
||||||
|
&= r_1 X G H^\top + (e_i X g_1^\top, 0,0) H^\top \\
|
||||||
|
&= 0 + x_{i, 1} g_1^\top \\
|
||||||
|
&= x_{i, 1}.
|
||||||
|
\end{aligned}
|
||||||
|
$$
|
||||||
|
|
||||||
|
> In homework we will show that this works with any MDS code ($z=1$).
|
||||||
|
|
||||||
|
- Say we obtained $x_{i, 1} g_1^\top, \ldots, x_{i, k} g_k^\top$ ($d-1$ at a time, how?).
|
||||||
|
- $(x_{i, 1} g_1^\top, \ldots, x_{i, k} g_k^\top) = x_i B$, where $B$ is a $k \times k$ submatrix of $G$.
|
||||||
|
- $B$ is a $k \times k$ submatrix of $G$ $\implies$ invertible! $\implies$ Obtain $x_{i}$.
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
> [!TIP]
|
||||||
|
>
|
||||||
|
> error + known location $\implies$ erasure. $d = 2 \implies$ 1 erasure is correctable.
|
||||||
@@ -92,10 +92,10 @@ Two equivalent ways to constructing a linear code:
|
|||||||
|
|
||||||
- A **parity check** matrix $H\in \mathbb{F}^{(n-k)\times n}$ with $(n-k)$ rows and $n$ columns.
|
- A **parity check** matrix $H\in \mathbb{F}^{(n-k)\times n}$ with $(n-k)$ rows and $n$ columns.
|
||||||
$$
|
$$
|
||||||
\mathcal{C}=\{c\in \mathbb{F}^n:Hc^T=0\}
|
\mathcal{C}=\{c\in \mathbb{F}^n:Hc^\top=0\}
|
||||||
$$
|
$$
|
||||||
- The right kernel of $H$ is $\mathcal{C}$.
|
- The right kernel of $H$ is $\mathcal{C}$.
|
||||||
- Multiplying $c^T$ by $H$ "checks" if $c\in \mathcal{C}$.
|
- Multiplying $c^\top$ by $H$ "checks" if $c\in \mathcal{C}$.
|
||||||
|
|
||||||
### Encoding of linear codes
|
### Encoding of linear codes
|
||||||
|
|
||||||
@@ -144,7 +144,7 @@ Decoding: $(y+e)\to x$, $y=xG$.
|
|||||||
Use the **syndrome** to identify which coset $\mathcal{C}+e$ the noisy codeword belongs to.
|
Use the **syndrome** to identify which coset $\mathcal{C}+e$ the noisy codeword belongs to.
|
||||||
|
|
||||||
$$
|
$$
|
||||||
H(y+e)^T=Hy^T+He^T=He^T
|
H(y+e)^\top=Hy^\top+He^\top=He^\top
|
||||||
$$
|
$$
|
||||||
|
|
||||||
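A small sketch with the $[7,4]$ Hamming code, whose syndromes directly spell the error position (the matrix layout is one standard choice, assumed here for illustration):

```python
import numpy as np

# Parity-check matrix of the [7,4] Hamming code; column j is the binary
# expansion of j (row 0 = least significant bit).
H = np.array([[1, 0, 1, 0, 1, 0, 1],
              [0, 1, 1, 0, 0, 1, 1],
              [0, 0, 0, 1, 1, 1, 1]])

def syndrome(y):
    """H y^T over F_2; equals H e^T, independent of the transmitted codeword."""
    return (H @ y) % 2

e = np.eye(7, dtype=int)[4]          # single-bit error in position 5
print(syndrome(e))                   # [1 0 1]: bits of 5 (LSB first)
```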
### Syndrome decoding
|
### Syndrome decoding
|
||||||
@@ -215,7 +215,7 @@ Fourth row is $\mathcal{C}+(00100)$.
|
|||||||
|
|
||||||
Any two elements in a row are of the form $y_1'=y_1+e$ and $y_2'=y_2+e$ for some $e\in \mathbb{F}^n$.
|
Any two elements in a row are of the form $y_1'=y_1+e$ and $y_2'=y_2+e$ for some $e\in \mathbb{F}^n$.
|
||||||
|
|
||||||
They have the same syndrome: $Hy_1'^T=He^T=Hy_2'^T$ since $y_1,y_2\in \mathcal{C}$.
|
They have the same syndrome: $Hy_1'^\top=He^\top=Hy_2'^\top$ since $y_1,y_2\in \mathcal{C}$.
|
||||||
|
|
||||||
Entries in different rows have different syndrome.
|
Entries in different rows have different syndrome.
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ Let $\mathcal{C}= [n,k,d]_{\mathbb{F}}$ be a linear code.
|
|||||||
There are two equivalent ways to describe a linear code:
|
There are two equivalent ways to describe a linear code:
|
||||||
|
|
||||||
1. A generator matrix $G\in \mathbb{F}^{k\times n}_q$ with $k$ rows and $n$ columns, entry taken from $\mathbb{F}_q$. $\mathcal{C}=\{xG|x\in \mathbb{F}^k\}$
|
1. A generator matrix $G\in \mathbb{F}^{k\times n}_q$ with $k$ rows and $n$ columns, entry taken from $\mathbb{F}_q$. $\mathcal{C}=\{xG|x\in \mathbb{F}^k\}$
|
||||||
2. A parity check matrix $H\in \mathbb{F}^{(n-k)\times n}_q$ with $(n-k)$ rows and $n$ columns, entry taken from $\mathbb{F}_q$. $\mathcal{C}=\{c\in \mathbb{F}^n:Hc^T=0\}$
|
2. A parity check matrix $H\in \mathbb{F}^{(n-k)\times n}_q$ with $(n-k)$ rows and $n$ columns, entry taken from $\mathbb{F}_q$. $\mathcal{C}=\{c\in \mathbb{F}^n:Hc^\top=0\}$
|
||||||
|
|
||||||
### Dual code
|
### Dual code
|
||||||
|
|
||||||
@@ -21,7 +21,7 @@ $$
|
|||||||
|
|
||||||
Also, the alternative definition is:
|
Also, the alternative definition is:
|
||||||
|
|
||||||
1. $C^{\perp}=\{x\in \mathbb{F}^n:Gx^T=0\}$ (only need to check basis of $C$)
|
1. $C^{\perp}=\{x\in \mathbb{F}^n:Gx^\top=0\}$ (only need to check basis of $C$)
|
||||||
2. $C^{\perp}=\{xH|x\in \mathbb{F}^{n-k}\}$
|
2. $C^{\perp}=\{xH|x\in \mathbb{F}^{n-k}\}$
|
||||||
|
|
||||||
By the rank-nullity theorem, $\dim(C^{\perp})=n-\dim(C)=n-k$.
|
By the rank-nullity theorem, $\dim(C^{\perp})=n-\dim(C)=n-k$.
|
||||||
@@ -87,7 +87,7 @@ Assume minimum distance is $d$. Show that every $d-1$ columns of $H$ are indepen
|
|||||||
|
|
||||||
- Fact: In linear codes minimum distance is the minimum weight ($d_H(x,y)=w_H(x-y)$).
|
- Fact: In linear codes minimum distance is the minimum weight ($d_H(x,y)=w_H(x-y)$).
|
||||||
|
|
||||||
Indeed, if there exist $d-1$ columns of $H$ that are linearly dependent, then we have $Hc^T=0$ for some $c\in \mathcal{C}$ with $w_H(c)<d$.
|
Indeed, if there exist $d-1$ columns of $H$ that are linearly dependent, then we have $Hc^\top=0$ for some $c\in \mathcal{C}$ with $w_H(c)<d$.
|
||||||
|
|
||||||
The reverse direction is similar.
|
The reverse direction is similar.
|
||||||
|
|
||||||
@@ -130,7 +130,7 @@ $k=2^m-m-1$.
|
|||||||
|
|
||||||
Define the code by encoding function:
|
Define the code by encoding function:
|
||||||
|
|
||||||
$E: \mathbb{F}_2^m\to \mathbb{F}_2^{2^m}$, $E(x)=(xy_1^T,\cdots,xy_{2^m}^T)$ where $y_1,\ldots,y_{2^m}$ enumerate $\mathbb{F}_2^m$
|
$E: \mathbb{F}_2^m\to \mathbb{F}_2^{2^m}$, $E(x)=(xy_1^\top,\cdots,xy_{2^m}^\top)$ where $y_1,\ldots,y_{2^m}$ enumerate $\mathbb{F}_2^m$
|
||||||
|
|
||||||
Space of codewords is image of $E$.
|
Space of codewords is image of $E$.
|
||||||
|
|
||||||
|
|||||||
@@ -258,7 +258,7 @@ Algorithm:
|
|||||||
- Begin with $(n-k)\times (n-k)$ identity matrix.
|
- Begin with $(n-k)\times (n-k)$ identity matrix.
|
||||||
- Assume we have chosen columns $h_1,h_2,\ldots,h_{\ell-1}$ (each $h_i$ is in $\mathbb{F}^{n-k}_q$)
|
- Assume we have chosen columns $h_1,h_2,\ldots,h_{\ell-1}$ (each $h_i$ is in $\mathbb{F}^{n-k}_q$)
|
||||||
- Then the next column $h_{\ell}$ must not be in the span of any $d-2$ of the previous columns.
|
- Then the next column $h_{\ell}$ must not be in the span of any $d-2$ of the previous columns.
|
||||||
- $h_{\ell}$ cannot be written as $[h_1,h_2,\ldots,h_{\ell-1}]x^T$ for $x$ of Hamming weight at most $d-2$.
|
- $h_{\ell}$ cannot be written as $[h_1,h_2,\ldots,h_{\ell-1}]x^\top$ for $x$ of Hamming weight at most $d-2$.
|
||||||
- So the ineligible candidates for $h_{\ell}$ is:
|
- So the ineligible candidates for $h_{\ell}$ is:
|
||||||
- $B_{\ell-1}(0,d-2)=\{x\in \mathbb{F}^{\ell-1}_q: d_H(0,x)\leq d-2\}$.
|
- $B_{\ell-1}(0,d-2)=\{x\in \mathbb{F}^{\ell-1}_q: d_H(0,x)\leq d-2\}$.
|
||||||
- $|B_{\ell-1}(0,d-2)|=\sum_{i=0}^{d-2}\binom{\ell-1}{i}(q-1)^i$, denoted by $V_q(\ell-1, d-2)$.
|
- $|B_{\ell-1}(0,d-2)|=\sum_{i=0}^{d-2}\binom{\ell-1}{i}(q-1)^i$, denoted by $V_q(\ell-1, d-2)$.
|
||||||
|
|||||||
@@ -148,15 +148,15 @@ The generator matrix for Reed-Solomon code is a Vandermonde matrix $V(a_1,a_2,\l
|
|||||||
|
|
||||||
Fact: $V(a_1,a_2,\ldots,a_n)$ is invertible if and only if $a_1,a_2,\ldots,a_n$ are distinct. (that's how we choose $a_1,a_2,\ldots,a_n$)
|
Fact: $V(a_1,a_2,\ldots,a_n)$ is invertible if and only if $a_1,a_2,\ldots,a_n$ are distinct. (that's how we choose $a_1,a_2,\ldots,a_n$)
|
||||||
|
|
||||||
The parity check matrix for Reed-Solomon code is also a Vandermonde matrix $V(a_1,a_2,\ldots,a_n)^T$ with scalar multiples of the columns.
|
The parity check matrix for Reed-Solomon code is also a Vandermonde matrix $V(a_1,a_2,\ldots,a_n)^\top$ with scalar multiples of the columns.
|
||||||
|
|
||||||
Some technical lemmas:
|
Some technical lemmas:
|
||||||
|
|
||||||
Let $G$ and $H$ be the generator and parity-check matrices of (any) linear code
|
Let $G$ and $H$ be the generator and parity-check matrices of (any) linear code
|
||||||
$C = [n, k, d]_{\mathbb{F}_q}$. Then:
|
$C = [n, k, d]_{\mathbb{F}_q}$. Then:
|
||||||
|
|
||||||
I. $H G^T = 0$.
|
I. $H G^\top = 0$.
|
||||||
II. Any matrix $M \in \mathbb{F}_q^{(n-k) \times n}$ such that $\operatorname{rank}(M) = n - k$ and $M G^T = 0$ is a parity-check matrix for $C$ (i.e. $C = \ker M$).
|
II. Any matrix $M \in \mathbb{F}_q^{(n-k) \times n}$ such that $\operatorname{rank}(M) = n - k$ and $M G^\top = 0$ is a parity-check matrix for $C$ (i.e. $C = \ker M$).
|
||||||
|
|
||||||
## Reed-Muller code
|
## Reed-Muller code
|
||||||
|
|
||||||
|
|||||||
@@ -22,4 +22,5 @@ export default {
|
|||||||
CSE5313_L16: "CSE5313 Coding and information theory for data science (Exam Review)",
|
CSE5313_L16: "CSE5313 Coding and information theory for data science (Exam Review)",
|
||||||
CSE5313_L17: "CSE5313 Coding and information theory for data science (Lecture 17)",
|
CSE5313_L17: "CSE5313 Coding and information theory for data science (Lecture 17)",
|
||||||
CSE5313_L18: "CSE5313 Coding and information theory for data science (Lecture 18)",
|
CSE5313_L18: "CSE5313 Coding and information theory for data science (Lecture 18)",
|
||||||
|
CSE5313_L19: "CSE5313 Coding and information theory for data science (Lecture 19)",
|
||||||
}
|
}
|
||||||
@@ -1,2 +1,17 @@
|
|||||||
# CSE5519 Advances in Computer Vision (Topic B: 2024: Vision-Language Models)
|
# CSE5519 Advances in Computer Vision (Topic B: 2024: Vision-Language Models)
|
||||||
|
|
||||||
|
## Improved Baselines with Visual Instruction Tuning (LLaVA-1.5)
|
||||||
|
|
||||||
|
[link to the paper](https://openaccess.thecvf.com/content/CVPR2024/papers/Liu_Improved_Baselines_with_Visual_Instruction_Tuning_CVPR_2024_paper.pdf)
|
||||||
|
|
||||||
|
This paper shows that the visual instruction tuning can improve the performance of the vision-language model.
|
||||||
|
|
||||||
|
### Novelty in LLaVA-1.5
|
||||||
|
|
||||||
|
1. Scaling to high resolution images by dividing images into grids and maintaining the data efficiency.
|
||||||
|
2. Compositional ability (using long-form language reasoning together with shorter visual reasoning can improve the model's writing ability)
|
||||||
|
3. Random downsampling will not degrade the performance.
|
||||||
|
|
||||||
|
>[!TIP]
|
||||||
|
>
|
||||||
|
> This paper shows that LLaVA-1.5 obeys the scaling law and splits high-resolution images into grids to maintain data efficiency. I wonder why this method is not applicable to multi-image understanding tasks. Why can't we assign index embeddings to each image and feed the image sets to the model for better understanding? What are the technical challenges in implementing this idea?
|
||||||
@@ -469,7 +469,7 @@ $$
|
|||||||
Then we use $\mathcal{L}_{ds}$ to enforce the smoothness of the disparity map.
|
Then we use $\mathcal{L}_{ds}$ to enforce the smoothness of the disparity map.
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\mathcal{L}_{ds}=\sum_{p\in I^l}\left|\partial_x d^l_p\right|e^{-\left|\partial_x I^l_p\right|}+\left|\partial_y d^l_p\right|e^{-\left|\partial_y I^l_p\right|}=\sum_{p_t}|\nabla D(p_t)|\cdot \left(e^{-|\nabla I(p_t)|}\right)^T\tag{2}
|
\mathcal{L}_{ds}=\sum_{p\in I^l}\left|\partial_x d^l_p\right|e^{-\left|\partial_x I^l_p\right|}+\left|\partial_y d^l_p\right|e^{-\left|\partial_y I^l_p\right|}=\sum_{p_t}|\nabla D(p_t)|\cdot \left(e^{-|\nabla I(p_t)|}\right)^\top\tag{2}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
Replacing $\hat{I}^{rig}_s$ with $\hat{I}^{full}_s$, in (1) and (2), we get the $\mathcal{L}_{fw}$ and $\mathcal{L}_{fs}$ for the non-rigid motion localizer.
|
Replacing $\hat{I}^{rig}_s$ with $\hat{I}^{full}_s$, in (1) and (2), we get the $\mathcal{L}_{fw}$ and $\mathcal{L}_{fs}$ for the non-rigid motion localizer.
|
||||||
|
|||||||
@@ -1,2 +1,20 @@
|
|||||||
# CSE5519 Advances in Computer Vision (Topic H: 2024: Safety, Robustness, and Evaluation of CV Models)
|
# CSE5519 Advances in Computer Vision (Topic H: 2024: Safety, Robustness, and Evaluation of CV Models)
|
||||||
|
|
||||||
|
## Efficient Bias Mitigation Without Privileged Information
|
||||||
|
|
||||||
|
[link to the paper](https://arxiv.org/pdf/2409.17691)
|
||||||
|
|
||||||
|
TAB: Targeted Augmentation for Bias mitigation
|
||||||
|
|
||||||
|
1. Loss history embedding construction (use Helper model to generate loss history for training dataset)
|
||||||
|
2. Loss aware partitioning (partition the training dataset into groups based on the loss history, reweight the loss of each group to balance the dataset)
|
||||||
|
3. Group-balanced dataset generation (generate a new dataset by sampling from the groups based on the reweighting)
|
||||||
|
4. Robust model training (train the model on the new dataset)
|
||||||
|
|
||||||
|
> [!TIP]
|
||||||
|
>
|
||||||
|
> This paper is a good example of how to mitigate bias in a dataset without using privileged information.
|
||||||
|
>
|
||||||
|
> However, the mitigation relies heavily on the loss history, which might differ across model architectures. Thus, the produced dataset may not generalize to other models.
|
||||||
|
>
|
||||||
|
> How to evaluate the bias mitigation effect across different models and different datasets?
|
||||||
@@ -64,7 +64,7 @@ $d = \begin{bmatrix}
|
|||||||
u \\ v
|
u \\ v
|
||||||
\end{bmatrix}$
|
\end{bmatrix}$
|
||||||
|
|
||||||
The solution is $d=(A^T A)^{-1} A^T b$
|
The solution is $d=(A^\top A)^{-1} A^\top b$
|
||||||
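A sketch of this least-squares solve (an illustrative helper; `np.linalg.lstsq` computes the same solution without forming $A^\top A$ explicitly):

```python
import numpy as np

def lk_flow(Ix, Iy, It):
    """Solve the over-determined system A d = b for d = (u, v) in least squares,
    i.e. d = (A^T A)^{-1} A^T b; Ix, Iy, It are derivatives over the window."""
    A = np.stack([Ix.ravel(), Iy.ravel()], axis=1)   # n x 2
    b = -It.ravel()
    d, *_ = np.linalg.lstsq(A, b, rcond=None)
    return d                                         # (u, v)
```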
|
|
||||||
Lucas-Kanade flow:
|
Lucas-Kanade flow:
|
||||||
|
|
||||||
@@ -170,7 +170,7 @@ E = \sum_{i=1}^n (a(x_i-\bar{x})+b(y_i-\bar{y}))^2 = \left\|\begin{bmatrix}x_1-\
|
|||||||
$$
|
$$
|
||||||
|
|
||||||
We want to find $N$ that minimizes $\|UN\|^2$ subject to $\|N\|^2= 1$
|
We want to find $N$ that minimizes $\|UN\|^2$ subject to $\|N\|^2= 1$
|
||||||
Solution is given by the eigenvector of $U^T U$ associated with the smallest eigenvalue
|
Solution is given by the eigenvector of $U^\top U$ associated with the smallest eigenvalue
|
||||||
|
|
||||||
Drawbacks:
|
Drawbacks:
|
||||||
|
|
||||||
|
|||||||
@@ -178,7 +178,7 @@ $$
|
|||||||
\begin{pmatrix}a\\b\\c\end{pmatrix} \times \begin{pmatrix}a'\\b'\\c'\end{pmatrix} = \begin{pmatrix}bc'-b'c\\ca'-c'a\\ab'-a'b\end{pmatrix}
|
\begin{pmatrix}a\\b\\c\end{pmatrix} \times \begin{pmatrix}a'\\b'\\c'\end{pmatrix} = \begin{pmatrix}bc'-b'c\\ca'-c'a\\ab'-a'b\end{pmatrix}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
Let $h_1^T, h_2^T, h_3^T$ be the rows of $H$. Then
|
Let $h_1^\top, h_2^\top, h_3^\top$ be the rows of $H$. Then
|
||||||
|
|
||||||
$$
|
$$
|
||||||
x_i' \times Hx_i=\begin{pmatrix}
|
x_i' \times Hx_i=\begin{pmatrix}
|
||||||
@@ -186,15 +186,15 @@ x_i' × Hx_i=\begin{pmatrix}
|
|||||||
y_i' \\
|
y_i' \\
|
||||||
1
|
1
|
||||||
\end{pmatrix} \times \begin{pmatrix}
|
\end{pmatrix} \times \begin{pmatrix}
|
||||||
h_1^T x_i \\
|
h_1^\top x_i \\
|
||||||
h_2^T x_i \\
|
h_2^\top x_i \\
|
||||||
h_3^T x_i
|
h_3^\top x_i
|
||||||
\end{pmatrix}
|
\end{pmatrix}
|
||||||
=
|
=
|
||||||
\begin{pmatrix}
|
\begin{pmatrix}
|
||||||
y_i' h_3^T x_i-h_2^T x_i \\
|
y_i' h_3^\top x_i-h_2^\top x_i \\
|
||||||
h_1^T x_i-x_i' h_3^T x_i \\
|
h_1^\top x_i-x_i' h_3^\top x_i \\
|
||||||
x_i' h_2^T x_i-y_i' h_1^T x_i
|
x_i' h_2^\top x_i-y_i' h_1^\top x_i
|
||||||
\end{pmatrix}
|
\end{pmatrix}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
@@ -206,15 +206,15 @@ x_i' × Hx_i=\begin{pmatrix}
|
|||||||
y_i' \\
|
y_i' \\
|
||||||
1
|
1
|
||||||
\end{pmatrix} \times \begin{pmatrix}
|
\end{pmatrix} \times \begin{pmatrix}
|
||||||
h_1^T x_i \\
|
h_1^\top x_i \\
|
||||||
h_2^T x_i \\
|
h_2^\top x_i \\
|
||||||
h_3^T x_i
|
h_3^\top x_i
|
||||||
\end{pmatrix}
|
\end{pmatrix}
|
||||||
=
|
=
|
||||||
\begin{pmatrix}
|
\begin{pmatrix}
|
||||||
y_i' h_3^T x_i-h_2^T x_i \\
|
y_i' h_3^\top x_i-h_2^\top x_i \\
|
||||||
h_1^T x_i-x_i' h_3^T x_i \\
|
h_1^\top x_i-x_i' h_3^\top x_i \\
|
||||||
x_i' h_2^T x_i-y_i' h_1^T x_i
|
x_i' h_2^\top x_i-y_i' h_1^\top x_i
|
||||||
\end{pmatrix}
|
\end{pmatrix}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
@@ -222,9 +222,9 @@ Rearranging the terms:
|
|||||||
|
|
||||||
$$
|
$$
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
0^T &-x_i^T &y_i' x_i^T \\
|
0^\top &-x_i^\top &y_i' x_i^\top \\
|
||||||
x_i^T &0^T &-x_i' x_i^T \\
|
x_i^\top &0^\top &-x_i' x_i^\top \\
|
||||||
y_i' x_i^T &x_i' x_i^T &0^T
|
y_i' x_i^\top &x_i' x_i^\top &0^\top
|
||||||
\end{bmatrix}
|
\end{bmatrix}
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
h_1 \\
|
h_1 \\
|
||||||
|
|||||||
@@ -17,16 +17,16 @@ If we set the config for the first camera as the world origin and $[I|0]\begin{p
|
|||||||
Notice that $x'\cdot [t\times (Ry)]=0$
|
Notice that $x'\cdot [t\times (Ry)]=0$
|
||||||
|
|
||||||
$$
|
$$
|
||||||
x'^T E x_1 = 0
|
x'^\top E x_1 = 0
|
||||||
$$
|
$$
|
||||||
|
|
||||||
We denote the constraint defined by the Essential Matrix as $E$.
|
We denote the constraint defined by the Essential Matrix as $E$.
|
||||||
|
|
||||||
$E x$ is the epipolar line associated with $x$ ($l'=Ex$)
|
$E x$ is the epipolar line associated with $x$ ($l'=Ex$)
|
||||||
|
|
||||||
$E^T x'$ is the epipolar line associated with $x'$ ($l=E^T x'$)
|
$E^\top x'$ is the epipolar line associated with $x'$ ($l=E^\top x'$)
|
||||||
|
|
||||||
$E e=0$ and $E^T e'=0$ ($x$ and $x'$ don't matter)
|
$E e=0$ and $E^\top e'=0$ ($x$ and $x'$ don't matter)
|
||||||
|
|
||||||
$E$ is singular (rank 2) and has five degrees of freedom.
|
$E$ is singular (rank 2) and has five degrees of freedom.
|
||||||
|
|
||||||
@@ -35,13 +35,13 @@ $E$ is singular (rank 2) and have five degrees of freedom.
|
|||||||
If the calibration matrices $K$ and $K'$ are unknown, we can write the epipolar constraint in terms of unknown normalized coordinates:
|
If the calibration matrices $K$ and $K'$ are unknown, we can write the epipolar constraint in terms of unknown normalized coordinates:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
x'^T_{norm} E x_{norm} = 0
|
x'^\top_{norm} E x_{norm} = 0
|
||||||
$$
|
$$
|
||||||
|
|
||||||
where $x_{norm}=K^{-1} x$, $x'_{norm}=K'^{-1} x'$
|
where $x_{norm}=K^{-1} x$, $x'_{norm}=K'^{-1} x'$
|
||||||
|
|
||||||
$$
|
$$
|
||||||
x'^T_{norm} E x_{norm} = 0\implies x'^T Fx=0
|
x'^\top_{norm} E x_{norm} = 0\implies x'^\top Fx=0
|
||||||
$$
|
$$
|
||||||
|
|
||||||
where $F=K'^{-1}EK^{-1}$ is the **Fundamental Matrix**.
|
where $F=K'^{-1}EK^{-1}$ is the **Fundamental Matrix**.
|
||||||
@@ -60,17 +60,17 @@ Properties of $F$:
|
|||||||
|
|
||||||
$F x$ is the epipolar line associated with $x$ ($l'=F x$)
|
$F x$ is the epipolar line associated with $x$ ($l'=F x$)
|
||||||
|
|
||||||
$F^T x'$ is the epipolar line associated with $x'$ ($l=F^T x'$)
|
$F^\top x'$ is the epipolar line associated with $x'$ ($l=F^\top x'$)
|
||||||
|
|
||||||
$F e=0$ and $F^T e'=0$
|
$F e=0$ and $F^\top e'=0$
|
||||||
|
|
||||||
$F$ is singular (rank two) and has seven degrees of freedom
|
$F$ is singular (rank two) and has seven degrees of freedom
|
||||||
|
|
||||||
#### Estimating the fundamental matrix
|
#### Estimating the fundamental matrix
|
||||||
|
|
||||||
Given: correspondences $x=(x,y,1)^T$ and $x'=(x',y',1)^T$
|
Given: correspondences $x=(x,y,1)^\top$ and $x'=(x',y',1)^\top$
|
||||||
|
|
||||||
Constraint: $x'^T F x=0$
|
Constraint: $x'^\top F x=0$
|
||||||
|
|
||||||
$$
|
$$
|
||||||
(x',y',1)\begin{bmatrix}
|
(x',y',1)\begin{bmatrix}
|
||||||
@@ -95,7 +95,7 @@ F=U\begin{bmatrix}
|
|||||||
\sigma_1 & 0 \\
|
\sigma_1 & 0 \\
|
||||||
0 & \sigma_2 \\
|
0 & \sigma_2 \\
|
||||||
0 & 0
|
0 & 0
|
||||||
\end{bmatrix}V^T
|
\end{bmatrix}V^\top
|
||||||
$$
|
$$
|
||||||
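A sketch of this rank-2 projection (an illustrative helper, not from the lecture):

```python
import numpy as np

def enforce_rank2(F):
    """Closest rank-2 matrix in Frobenius norm: zero the smallest singular value."""
    U, S, Vt = np.linalg.svd(F)
    S[-1] = 0.0
    return (U * S) @ Vt   # scales the columns of U by S, then multiplies by V^T
```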
|
|
||||||
## Structure from Motion
|
## Structure from Motion
|
||||||
@@ -126,7 +126,7 @@ a_{21} & a_{22} & a_{23} & t_2 \\
|
|||||||
0 & 0 & 0 & 1
|
0 & 0 & 0 & 1
|
||||||
\end{bmatrix}=\begin{bmatrix}
|
\end{bmatrix}=\begin{bmatrix}
|
||||||
A & t \\
|
A & t \\
|
||||||
0^T & 1
|
0^\top & 1
|
||||||
\end{bmatrix}
|
\end{bmatrix}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
@@ -160,10 +160,10 @@ The reconstruction is defined up to an arbitrary affine transformation $Q$ (12 d
|
|||||||
$$
|
$$
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
A & t \\
|
A & t \\
|
||||||
0^T & 1
|
0^\top & 1
|
||||||
\end{bmatrix}\rightarrow\begin{bmatrix}
|
\end{bmatrix}\rightarrow\begin{bmatrix}
|
||||||
A & t \\
|
A & t \\
|
||||||
0^T & 1
|
0^\top & 1
|
||||||
\end{bmatrix}Q^{-1}, \quad \begin{pmatrix}X_j\\1\end{pmatrix}\rightarrow Q\begin{pmatrix}X_j\\1\end{pmatrix}
|
\end{bmatrix}Q^{-1}, \quad \begin{pmatrix}X_j\\1\end{pmatrix}\rightarrow Q\begin{pmatrix}X_j\\1\end{pmatrix}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ x\\y
|
|||||||
\end{pmatrix}
|
\end{pmatrix}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
To undo the rotation, we need to rotate the image by $-\theta$. This is equivalent to applying $R^T$ to the image.
|
To undo the rotation, we need to rotate the image by $-\theta$. This is equivalent to applying $R^\top$ to the image.
|
||||||
|
|
||||||
#### Affine transformation
|
#### Affine transformation
|
||||||
|
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ Example: Linear classification models
|
|||||||
Find a linear function that separates the data.
|
Find a linear function that separates the data.
|
||||||
|
|
||||||
$$
|
$$
|
||||||
f(x) = w^T x + b
|
f(x) = w^\top x + b
|
||||||
$$
|
$$
|
||||||
|
|
||||||
[Linear classification models](http://cs231n.github.io/linear-classify/)
|
[Linear classification models](http://cs231n.github.io/linear-classify/)
|
||||||
@@ -144,13 +144,13 @@ This is a convex function, so we can find the global minimum.
|
|||||||
The gradient is:
|
The gradient is:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\nabla_w||Xw-Y||^2 = 2X^T(Xw-Y)
|
\nabla_w||Xw-Y||^2 = 2X^\top(Xw-Y)
|
||||||
$$
|
$$
|
||||||
|
|
||||||
Set the gradient to 0, we get:
|
Set the gradient to 0, we get:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
w = (X^T X)^{-1} X^T Y
|
w = (X^\top X)^{-1} X^\top Y
|
||||||
$$
|
$$
|
||||||
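A sketch of this closed form (illustrative helper name; in practice `np.linalg.lstsq` is preferred since it avoids explicitly forming $X^\top X$):

```python
import numpy as np

def fit_linear(X, Y):
    """Normal equations: w = (X^T X)^{-1} X^T Y, via a linear solve."""
    return np.linalg.solve(X.T @ X, X.T @ Y)
```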
|
|
||||||
From the maximum likelihood perspective, we can also derive the same result.
@@ -59,7 +59,7 @@ Suppose $k=1$, $e=l(f_1(x,w_1),y)$
Example: $e=(f_1(x,w_1)-y)^2$

- So $h_1=f_1(x,w_1)=w^T_1x$, $e=l(h_1,y)=(y-h_1)^2$
+ So $h_1=f_1(x,w_1)=w^\top_1x$, $e=l(h_1,y)=(y-h_1)^2$

$$
\frac{\partial e}{\partial w_1}=\frac{\partial e}{\partial h_1}\frac{\partial h_1}{\partial w_1}
@@ -20,7 +20,7 @@ Suppose $k=1$, $e=l(f_1(x,w_1),y)$
Example: $e=(f_1(x,w_1)-y)^2$

- So $h_1=f_1(x,w_1)=w^T_1x$, $e=l(h_1,y)=(y-h_1)^2$
+ So $h_1=f_1(x,w_1)=w^\top_1x$, $e=l(h_1,y)=(y-h_1)^2$

$$
\frac{\partial e}{\partial w_1}=\frac{\partial e}{\partial h_1}\frac{\partial h_1}{\partial w_1}
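
A numeric sketch of this chain rule for the example above (values of my own; the finite-difference comparison is only a sanity check):

```python
import numpy as np

x = np.array([1.0, 2.0])
w1 = np.array([0.3, -0.1])
y = 1.0

h1 = w1 @ x                  # forward pass: h1 = w1^T x
e = (y - h1) ** 2            # loss

de_dh1 = -2 * (y - h1)       # de/dh1
dh1_dw1 = x                  # dh1/dw1
de_dw1 = de_dh1 * dh1_dw1    # chain rule: de/dw1

eps = 1e-6
numeric = np.array([((y - (w1 + eps * np.eye(2)[i]) @ x) ** 2 - e) / eps
                    for i in range(2)])
assert np.allclose(de_dw1, numeric, atol=1e-4)
```
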
@@ -262,10 +262,10 @@ Basic definitions
The special orthogonal group $SO(n)$ is the set of all **distance preserving** linear transformations on $\mathbb{R}^n$.

- It is the group of all $n\times n$ orthogonal matrices ($A^T A=I_n$) on $\mathbb{R}^n$ with determinant $1$.
+ It is the group of all $n\times n$ orthogonal matrices ($A^\top A=I_n$) on $\mathbb{R}^n$ with determinant $1$.

$$
- SO(n)=\{A\in \mathbb{R}^{n\times n}: A^T A=I_n, \det(A)=1\}
+ SO(n)=\{A\in \mathbb{R}^{n\times n}: A^\top A=I_n, \det(A)=1\}
$$
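
A small numerical membership test for this definition (the helper `in_SO_n` is my own):

```python
import numpy as np

def in_SO_n(A, tol=1e-9):
    """Check A^T A = I_n and det(A) = 1 up to floating-point tolerance."""
    n = A.shape[0]
    return (np.allclose(A.T @ A, np.eye(n), atol=tol)
            and np.isclose(np.linalg.det(A), 1.0, atol=tol))

theta = 0.7
R = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])
assert in_SO_n(R)                          # planar rotations lie in SO(2)
assert not in_SO_n(np.diag([1.0, -1.0]))   # a reflection has det -1
```
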
<details>
@@ -276,7 +276,7 @@ In [The random Matrix Theory of the Classical Compact groups](https://case.edu/a
$O(n)$ (the group of all $n\times n$ **orthogonal matrices** over $\mathbb{R}$),

$$
- O(n)=\{A\in \mathbb{R}^{n\times n}: AA^T=A^T A=I_n\}
+ O(n)=\{A\in \mathbb{R}^{n\times n}: AA^\top=A^\top A=I_n\}
$$

$U(n)$ (the group of all $n\times n$ **unitary matrices** over $\mathbb{C}$),
@@ -296,7 +296,7 @@
$Sp(2n)$ (the group of all $2n\times 2n$ symplectic matrices over $\mathbb{C}$),

$$
- Sp(2n)=\{U\in U(2n): U^T J U=UJU^T=J\}
+ Sp(2n)=\{U\in U(2n): U^\top J U=UJU^\top=J\}
$$

where $J=\begin{pmatrix}
@@ -8,10 +8,10 @@ The page's lemma is a fundamental result in quantum information theory that prov
The special orthogonal group $SO(n)$ is the set of all **distance preserving** linear transformations on $\mathbb{R}^n$.

- It is the group of all $n\times n$ orthogonal matrices ($A^T A=I_n$) on $\mathbb{R}^n$ with determinant $1$.
+ It is the group of all $n\times n$ orthogonal matrices ($A^\top A=I_n$) on $\mathbb{R}^n$ with determinant $1$.

$$
- SO(n)=\{A\in \mathbb{R}^{n\times n}: A^T A=I_n, \det(A)=1\}
+ SO(n)=\{A\in \mathbb{R}^{n\times n}: A^\top A=I_n, \det(A)=1\}
$$

<details>
@@ -22,7 +22,7 @@ In [The random Matrix Theory of the Classical Compact groups](https://case.edu/a
$O(n)$ (the group of all $n\times n$ **orthogonal matrices** over $\mathbb{R}$),

$$
- O(n)=\{A\in \mathbb{R}^{n\times n}: AA^T=A^T A=I_n\}
+ O(n)=\{A\in \mathbb{R}^{n\times n}: AA^\top=A^\top A=I_n\}
$$

$U(n)$ (the group of all $n\times n$ **unitary matrices** over $\mathbb{C}$),
@@ -42,7 +42,7 @@
$Sp(2n)$ (the group of all $2n\times 2n$ symplectic matrices over $\mathbb{C}$),

$$
- Sp(2n)=\{U\in U(2n): U^T J U=UJU^T=J\}
+ Sp(2n)=\{U\in U(2n): U^\top J U=UJU^\top=J\}
$$

where $J=\begin{pmatrix}
@@ -74,7 +74,7 @@ $c\in \mathbb{C}$.
The matrix transpose is defined by

$$
- u^T=(a_1,a_2,\cdots,a_n)^T=\begin{pmatrix}
+ u^\top=(a_1,a_2,\cdots,a_n)^\top=\begin{pmatrix}
a_1 \\
a_2 \\
\vdots \\
@@ -694,7 +694,7 @@ $$
The unitary group $U(n)$ is the group of all $n\times n$ unitary matrices.

- Such that $A^*=A$, where $A^*$ is the complex conjugate transpose of $A$. $A^*=(\overline{A})^T$.
+ These satisfy $A^* A=AA^*=I_n$, where $A^*$ is the complex conjugate transpose of $A$: $A^*=(\overline{A})^\top$.

#### Cyclic group $\mathbb{Z}_n$
|
|||||||
144
content/Math4201/Math4201_L28.md
Normal file
144
content/Math4201/Math4201_L28.md
Normal file
@@ -0,0 +1,144 @@
# Math4201 Topology I (Lecture 28)

## Compact spaces

### Extreme value theorem

#### Definition of diameter

Let $(X,d)$ be a metric space and $A\subseteq X$. The diameter of $A$ is defined as

$$
\operatorname{diam}(A) = \sup\{d(x,y):x,y\in A\}
$$
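
For instance, in $\mathbb{R}$ with the usual metric (a quick sanity check, not from the lecture), $\operatorname{diam}((0,1))=\sup\{|x-y|:x,y\in(0,1)\}=1$: the supremum need not be attained, and an open set can have the same diameter as its closure.
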
#### Lebesgue number lemma

Let $X$ be a compact metric space and $\{U_\alpha\}_{\alpha\in I}$ be an open cover of $X$. Then there is $\delta>0$ such that for every subset $A\subseteq X$ with diameter less than $\delta$, there is $\alpha\in I$ such that $A\subseteq U_\alpha$.

<details>
<summary>Proof</summary>
Consider $x\in X$; there is an element $U_\alpha$ of the open cover such that $x\in U_\alpha$.

In particular, there is $r_x>0$ such that $B_{r_x}(x)\subseteq U_\alpha$.

Then the collection $\{B_{\frac{r_x}{2}}(x)\}_{x\in X}$ is an open cover of $X$ (each $x\in X$ is contained in its own $B_{\frac{r_x}{2}}(x)$).

Since $X$ is compact, there is a finite subcover $\{B_{\frac{r_{x_i}}{2}}(x_i)\}_{i=1}^n$, so that $\bigcup_{i=1}^n B_{\frac{r_{x_i}}{2}}(x_i)=X$.

Let $\delta = \min\{\frac{r_{x_1}}{2}, ..., \frac{r_{x_n}}{2}\}>0$.

Let $A\subseteq X$ be a subset with diameter less than $\delta$.

Take $y\in A$; then $A\subseteq B_\delta(y)$.

Take $x_i$ such that $y\in B_{\frac{r_{x_i}}{2}}(x_i)$ (such an $x_i$ exists because the finite subcover covers $X$).

Then take $\alpha$ such that $B_{r_{x_i}}(x_i)\subseteq U_\alpha$ (such an $\alpha$ exists by the choice of $r_{x_i}$).

We claim that $B_\delta(y)\subseteq U_\alpha$, which would imply that $A\subseteq U_\alpha$.

We have $y\in B_{\frac{r_{x_i}}{2}}(x_i)$, and we know that $B_{r_{x_i}}(x_i)\subseteq U_\alpha$.

Since $\delta \leq \frac{r_{x_i}}{2}$, we have $B_\delta(y)\subseteq B_{\frac{r_{x_i}}{2}}(y)$, so it suffices to show that $B_{\frac{r_{x_i}}{2}}(y)\subseteq U_\alpha$.

For any $z\in B_{\frac{r_{x_i}}{2}}(y)$, we have $d(z,y)<\frac{r_{x_i}}{2}$ and $d(y,x_i)<\frac{r_{x_i}}{2}$.

So $d(z,x_i)\leq d(z,y)+d(y,x_i)<\frac{r_{x_i}}{2}+\frac{r_{x_i}}{2}=r_{x_i}$, so $z\in B_{r_{x_i}}(x_i)\subseteq U_\alpha$.

So $B_{\frac{r_{x_i}}{2}}(y)\subseteq U_\alpha$.
</details>
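
As a concrete illustration (my own example, not from the lecture): cover $[0,1]$ by $U_1=[0,0.6)$ and $U_2=(0.4,1]$. If $A\subseteq[0,1]$ has $\operatorname{diam}(A)<0.2$, then either $A\subseteq U_1$, or $A$ contains a point $\geq 0.6$ and hence $A\subseteq(0.4,1]=U_2$. So $\delta=0.2$ is a Lebesgue number for this cover.
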
#### Definition of finite intersection property

A collection $\{C_\alpha\}_{\alpha\in I}$ of subsets of a set $X$ has the finite intersection property if for every finite subcollection $\{C_{\alpha_1}, ..., C_{\alpha_n}\}$ of $\{C_\alpha\}_{\alpha\in I}$, we have $\bigcap_{i=1}^n C_{\alpha_i}\neq \emptyset$.

#### Theorem

A space $X$ is compact if and only if every collection $\{Z_\alpha\}_{\alpha\in I}$ of closed subsets of $X$ that satisfies the **finite intersection property** has a non-empty intersection:

$$
\bigcap_{\alpha\in I} Z_\alpha \neq \emptyset
$$
<details>
<summary>Non-example</summary>

Consider $X=(0,1)$, which is not compact with the standard topology.

Consider $Z_n=(0,\frac{1}{n}]$; each interval is closed in $X$. This collection satisfies the finite intersection property because $\bigcap_{i=1}^k Z_{n_i}\neq \emptyset$ for any finite subcollection $\{Z_{n_1}, ..., Z_{n_k}\}$. _Any finite subcollection has intersection $(0, \frac{1}{\max_i n_i}]$, which is non-empty._

But $\bigcap_{n=1}^\infty Z_n = \emptyset$.

</details>
<details>
<summary>Proof</summary>

$\implies$

Let $U_\alpha=X-Z_\alpha$, which is open for each $\alpha\in I$. By contradiction, suppose that $\bigcap_{\alpha\in I} Z_\alpha = \emptyset$.

Then $\bigcup_{\alpha\in I} U_\alpha = \bigcup_{\alpha\in I} (X-Z_\alpha) = X-\bigcap_{\alpha\in I} Z_\alpha = X$.

So $\{U_\alpha\}_{\alpha\in I}$ is an open cover of $X$. Since $X$ is compact, there is a finite subcover $\{U_{\alpha_1}, ..., U_{\alpha_n}\}$.

So $\bigcup_{i=1}^n U_{\alpha_i} = X$, hence $\bigcap_{i=1}^n Z_{\alpha_i} = X-\bigcup_{i=1}^n U_{\alpha_i} = \emptyset$. This contradicts the finite intersection property.

$\impliedby$

Proof is similar.

</details>
#### Definition of isolated point

A point $x\in X$ is an isolated point if $\{x\}$ is an open subset of $X$.

<details>
<summary>Example of isolated point</summary>

$X=[0,1]\cup \{2\}$ with the subspace topology from $\mathbb{R}$.

Then $2$ is an isolated point, since $\{2\}=X\cap (2-\frac{1}{2}, 2+\frac{1}{2})$ is open in $X$.

</details>
#### Theorem of compact Hausdorff spaces without isolated points

Any non-empty compact Hausdorff space without an isolated point is uncountable.

<details>
<summary>Proof</summary>

Proof by contradiction.

Suppose $X=\{x_n\}_{n\in\mathbb{N}}$ is countable.

Since $x_1$ is not an isolated point, there exists $y_1\in X$ such that $y_1\neq x_1$. Applying the Hausdorff property, there exist disjoint open neighborhoods $U_1$ and $V_1$ such that $x_1\in U_1$ and $y_1\in V_1$.

In particular $\overline{V_1}$ does not contain $x_1$, but it contains $y_1$ (this follows because $U_1$ is an open neighborhood of $x_1$ disjoint from $V_1$).

Since $x_2$ is not an isolated point, there exists $y_2\in X$ such that $y_2\neq x_2$. Applying the Hausdorff property, there exist disjoint open neighborhoods $U_2$ and $V_2$ such that $x_2\in U_2$ and $y_2\in V_2$.

If $x_2\notin V_1$, then we define $V_2$ as $V_1$.

If $x_2\in V_1$, then by the assumption, there is another point $y_2$ in $V_1$ which is not the same as $x_2$.

CONTINUE NEXT TIME.

</details>
#### Theorem: the real numbers are uncountable

$\mathbb{R}$ is uncountable, and any interval in $\mathbb{R}$ is uncountable.

<details>
<summary>Proof</summary>

It suffices to prove this for a closed interval $[a,b]$ with $a<b$, because any interval contains such a closed interval.

The claim for a closed interval $[a,b]$ follows from the preceding theorem, because $[a,b]$ is a non-empty compact Hausdorff space without an isolated point.

</details>
@@ -31,4 +31,5 @@ export default {
Math4201_L25: "Topology I (Lecture 25)",
Math4201_L26: "Topology I (Lecture 26)",
Math4201_L27: "Topology I (Lecture 27)",
+ Math4201_L28: "Topology I (Lecture 28)",
}
@@ -25,7 +25,7 @@ Let $A$ be an $m \times n$ matrix, then
* The column rank of $A$ is the dimension of the span of the columns in $\mathbb{F}^{m,1}$.
* The row rank of $A$ is the dimension of the span of the rows in $\mathbb{F}^{1,n}$.

- > Transpose: $A^t=A^T$ refers to swapping rows and columns
+ > Transpose: $A^t=A^\top$ refers to swapping rows and columns

#### Theorem 3.56 (Column-Row Factorization)
@@ -64,7 +64,7 @@ Proof:
Note that by **Theorem 3.56**, if $A$ is $m\times n$ and has column rank $c$, then $A=CR$ for some $m\times c$ matrix $C$ and $c\times n$ matrix $R$. But the rows of $CR$ are linear combinations of the rows of $R$, and the row rank of $R$ is at most $c$. So row rank of $A\leq$ column rank of $A$.

- Taking a transpose of matrix, then row rank of $A^T$ (column rank of $A$) $\leq$ column rank of $A^T$ (row rank $A$).
+ Taking the transpose, the row rank of $A^\top$ (column rank of $A$) $\leq$ column rank of $A^\top$ (row rank of $A$).

So column rank is equal to row rank.
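
A quick numerical check of this equality (illustrative matrix of my own):

```python
import numpy as np

A = np.array([[1., 2., 3.],
              [2., 4., 6.],
              [1., 0., 1.]])
# Column rank of A equals column rank of A^T, i.e. the row rank of A.
assert np.linalg.matrix_rank(A) == np.linalg.matrix_rank(A.T) == 2
```
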
@@ -39,13 +39,13 @@ $T$ is surjective $\iff range\ T=W\iff null\ T'=0\iff T'$ injective
Let $V,W$ be finite dimensional vector spaces, $T\in \mathscr{L}(V,W)$

- Then $M(T')=(M(T))^T$. Where the basis for $M(T)'$ are the dual basis to the ones for $M(T)$
+ Then $M(T')=(M(T))^\top$, where the bases for $M(T')$ are the dual bases to the ones for $M(T)$

#### Theorem 3.133

$col\ rank\ A=row\ rank\ A$

- Proof: $col\ rank\ A=col\ rank\ (M(T))=dim\ range\ T=dim\ range\ T'=dim\ range\ T'=col\ rank\ (M(T'))=col\ rank\ (M(T)^T)=row\ rank\ (M(T))$
+ Proof: $col\ rank\ A=col\ rank\ (M(T))=dim\ range\ T=dim\ range\ T'=col\ rank\ (M(T'))=col\ rank\ (M(T)^\top)=row\ rank\ (M(T))$

## Chapter V Eigenvalues and Eigenvectors
docker/Jenkinsfile (vendored, 4 lines changed)
@@ -34,7 +34,7 @@ pipeline {
steps {
script {
echo "Building docker image ${registry}-math:${version}.${env.BUILD_ID}"
- def customImage = docker.build("${registry}-math:v${version}.${env.BUILD_ID}","-f ./docker/math/Dockerfile --no-cache --progress=plain -t notenextra-math:latest .")
+ def customImage = docker.build("${registry}-math:v${version}.${env.BUILD_ID}","-f ./docker/math/Dockerfile --no-cache --progress=plain .")
echo "Logging in to docker hub"
// docker.withRegistry('https://registry.hub.docker.com', 'docker-hub-creds') {
// echo "Pushing docker image ${registry}:v${version}.${env.BUILD_ID}"
@@ -48,7 +48,7 @@ pipeline {
steps {
script {
echo "Building docker image ${registry}-cse:${version}.${env.BUILD_ID}"
- def customImage = docker.build("${registry}-cse:v${version}.${env.BUILD_ID}","-f ./docker/cse/Dockerfile --no-cache --progress=plain -t notenextra-cse:latest .")
+ def customImage = docker.build("${registry}-cse:v${version}.${env.BUILD_ID}","-f ./docker/cse/Dockerfile --no-cache --progress=plain .")
echo "Logging in to docker hub"
// docker.withRegistry('https://registry.hub.docker.com', 'docker-hub-creds') {
// echo "Pushing docker image ${registry}:v${version}.${env.BUILD_ID}"
@@ -1,5 +1,4 @@
name: notenextra

services:
# you may need to update relative directories if you move this file
# default created directory is /docker/docker-compose.yaml
package.json (10 lines changed)
@@ -8,20 +8,22 @@
"start": "next start"
},
"dependencies": {
- "@docsearch/react": "^4.0.0-beta.2",
+ "@docsearch/css": "^4.2.0",
+ "@docsearch/react": "^4.2.0",
"@napi-rs/simple-git": "^0.1.22",
"@next/bundle-analyzer": "^15.3.5",
"@vercel/analytics": "^1.5.0",
"@vercel/speed-insights": "^1.2.0",
"cross-env": "^7.0.3",
+ "eslint-config-next": "^16.0.1",
"katex": "^0.16.22",
- "next": "^15.5.2",
+ "next": "^16.0.1",
"next-sitemap": "^4.2.3",
"nextra": "^4.2.17",
"nextra-theme-docs": "^4.2.17",
"pagefind": "^1.4.0",
- "react": "^19.1.0",
+ "react": "^19.2.0",
- "react-dom": "^19.1.0"
+ "react-dom": "^19.2.0"
},
"devDependencies": {
"@types/node": "24.0.10",