From 8d909a31331da7cd2b7d5b48bfd91e1a628181ba Mon Sep 17 00:00:00 2001
From: Zheyuan Wu <60459821+Trance-0@users.noreply.github.com>
Date: Tue, 4 Feb 2025 15:14:00 -0600
Subject: [PATCH] update

---
 pages/CSE559A/CSE559A_L7.md | 228 ++++++++++++++++++++++++++++++++++++
 pages/CSE559A/_meta.js      |   1 +
 2 files changed, 229 insertions(+)
 create mode 100644 pages/CSE559A/CSE559A_L7.md

diff --git a/pages/CSE559A/CSE559A_L7.md b/pages/CSE559A/CSE559A_L7.md
new file mode 100644
index 0000000..252ad04
--- /dev/null
+++ b/pages/CSE559A/CSE559A_L7.md
@@ -0,0 +1,228 @@
+# Lecture 7
+
+## Computer Vision (Artificial Neural Networks for Image Understanding)
+
+An early example of image understanding with neural networks: backpropagation applied to handwritten zip code recognition (LeCun et al., 1989).
+
+Central idea: representation change; each layer computes a new feature representation of the previous layer's output.
+
+Plan for the next few weeks:
+
+1. How do we train such models?
+2. What are the building blocks?
+3. How should we combine those building blocks?
+
+## How do we train such models?
+
+Computer vision is finally useful for tasks such as:
+
+1. Image classification
+2. Image segmentation
+3. Object detection
+
+ImageNet Large Scale Visual Recognition Challenge (ILSVRC):
+
+- 1000 classes
+- 1.2 million training images
+- 10000 test images
+
+### Deep Learning (just neural networks)
+
+Bigger datasets, larger models, faster computers, and lots of incremental improvements.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+# LeNet-style convolutional network; expects 1x32x32 inputs
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        # two convolutional layers followed by three fully connected layers
+        self.conv1 = nn.Conv2d(1, 6, 5)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        # conv -> ReLU -> 2x2 max pool, twice
+        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
+        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
+        # flatten and classify
+        x = x.view(-1, self.num_flat_features(x))
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+    def num_flat_features(self, x):
+        size = x.size()[1:]  # all dimensions except the batch dimension
+        num_features = 1
+        for s in size:
+            num_features *= s
+        return num_features
+
+# create a PyTorch dataset and dataloader
+# random 1x32x32 images and random labels (the network above expects 32x32 inputs)
+dataset = torch.utils.data.TensorDataset(torch.randn(1000, 1, 32, 32), torch.randint(10, (1000,)))
+dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)
+
+# training process
+
+net = Net()
+optimizer = optim.Adam(net.parameters(), lr=0.001)
+criterion = nn.CrossEntropyLoss()
+
+# loop over the dataset multiple times
+for epoch in range(2):
+    for i, data in enumerate(dataloader, 0):
+        inputs, labels = data
+        optimizer.zero_grad()
+        outputs = net(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+
+print("Finished Training")
+```
+
+The code above is generated example code.
+
+### Supervised Learning
+
+Training: given a dataset, learn a mapping from inputs to outputs.
+
+Testing: given a new input, predict its output.
+
+Example: linear classification models.
+
+Find a linear function that separates the data:
+
+$$
+f(x) = w^T x + b
+$$
+
+[Linear classification models](http://cs231n.github.io/linear-classify/) gives a simple representation of a linear classifier.
+
+### Empirical loss minimization framework
+
+Given a training set, find a model that minimizes the loss function.
+
+Assume the samples are i.i.d.
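+
+As a quick sketch of the framework (not from the lecture: the toy data, the choice of an L2 loss, and the names `X`, `y`, `w`, `b` are illustrative assumptions), the empirical loss is just the average per-sample loss over the training set:
+
+```python
+import torch
+
+# toy training set: n samples with d features and real-valued targets (made up for illustration)
+n, d = 100, 5
+X = torch.randn(n, d)
+y = torch.randn(n)
+
+# parameters of the linear model f(x; w) = w^T x + b
+w = torch.randn(d)
+b = torch.zeros(1)
+
+def f(X, w, b):
+    # apply the linear model to every sample at once
+    return X @ w + b
+
+# empirical loss: average of the per-sample (L2) losses over the training set
+per_sample_loss = (f(X, w, b) - y) ** 2
+empirical_loss = per_sample_loss.mean()
+print(empirical_loss.item())
+```
+
+Concrete choices of the per-sample loss follow below.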
+
+Examples of loss functions:
+
+L1 loss:
+
+$$
+\ell(f(x; w), y) = |f(x; w) - y|
+$$
+
+L2 loss:
+
+$$
+\ell(f(x; w), y) = (f(x; w) - y)^2
+$$
+
+### Linear classification models
+
+$$
+\hat{L}(w) = \frac{1}{n} \sum_{i=1}^n \ell(f(x_i; w), y_i)
+$$
+
+In general, it is hard to find the global minimum of this objective.
+
+#### Linear regression
+
+However, if we use the L2 loss with a linear model, we can find the global minimum.
+
+$$
+\hat{L}(w) = \frac{1}{n} \sum_{i=1}^n (f(x_i; w) - y_i)^2
+$$
+
+This is a convex function, so we can find the global minimum.
+
+The gradient is:
+
+$$
+\nabla_w \|Xw - Y\|^2 = 2 X^T (Xw - Y)
+$$
+
+Setting the gradient to 0 (and assuming $X^T X$ is invertible), we get:
+
+$$
+w = (X^T X)^{-1} X^T Y
+$$
+
+From the maximum likelihood perspective (a linear model with Gaussian noise), we can derive the same result.
+
+#### Logistic regression
+
+Logistic regression predicts $P(y = 1 \mid x) = \sigma(w^T x)$ using the sigmoid function:
+
+$$
+\sigma(x) = \frac{1}{1 + e^{-x}}
+$$
+
+and is trained with the cross-entropy (negative log-likelihood) loss. This loss is still convex, but setting its gradient to zero has no closed-form solution, so we cannot use a normal equation; instead we minimize it iteratively.
+
+#### Gradient Descent
+
+Full-batch gradient descent:
+
+$$
+w \leftarrow w - \eta \nabla_w \hat{L}(w)
+$$
+
+Stochastic gradient descent (one randomly chosen sample $i$ per step):
+
+$$
+w \leftarrow w - \eta \nabla_w \ell(f(x_i; w), y_i)
+$$
+
+Mini-batch gradient descent (a randomly chosen batch $B$ per step):
+
+$$
+w \leftarrow w - \eta \frac{1}{|B|} \sum_{i \in B} \nabla_w \ell(f(x_i; w), y_i)
+$$
+
+At each step, we update the weights using the average gradient over the mini-batch; the mini-batch is selected randomly from the training set.
+
+#### Multi-class classification
+
+Use the softmax function to convert the outputs $z \in \mathbb{R}^K$ into a probability distribution over the $K$ classes:
+
+$$
+\text{softmax}(z)_k = \frac{e^{z_k}}{\sum_{j=1}^K e^{z_j}}
+$$
+
+## Neural Networks
+
+From linear to non-linear.
+
+- Shallow approach:
+  - Use a feature transformation to make the data linearly separable.
+- Deep approach:
+  - Stack multiple layers of linear models, with non-linearities in between (stacking purely linear layers would still give a linear model).
+
+Common non-linear functions:
+
+- ReLU:
+  - $$
+    \text{ReLU}(x) = \max(0, x)
+    $$
+- Sigmoid:
+  - $$
+    \text{Sigmoid}(x) = \frac{1}{1 + e^{-x}}
+    $$
+- Tanh:
+  - $$
+    \text{Tanh}(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}
+    $$
+
+### Backpropagation
\ No newline at end of file
diff --git a/pages/CSE559A/_meta.js b/pages/CSE559A/_meta.js
index 649aa52..90cbcf9 100644
--- a/pages/CSE559A/_meta.js
+++ b/pages/CSE559A/_meta.js
@@ -9,4 +9,5 @@ export default {
   CSE559A_L4: "Computer Vision (Lecture 4)",
   CSE559A_L5: "Computer Vision (Lecture 5)",
   CSE559A_L6: "Computer Vision (Lecture 6)",
+  CSE559A_L7: "Computer Vision (Lecture 7)",
 }