update notations and fix typos
This commit is contained in:
@@ -2,6 +2,58 @@
|
||||
|
||||
## Positional Encodings
|
||||
|
||||
### Fixed Positional Encodings
|
||||
|
||||
Set of sinusoids of different frequencies.
|
||||
|
||||
$$
|
||||
f(p,2i)=\sin\left(\frac{p}{10000^{2i/d}}\right)\quad f(p,2i+1)=\cos\left(\frac{p}{10000^{2i/d}}\right)
|
||||
$$
|
||||
|
||||
[source](https://kazemnejad.com/blog/transformer_architecture_positional_encoding/)
|
||||
|
||||
### Positional Encodings in Reconstruction
|
||||
|
||||
It is hard for an MLP to learn high-frequency information from scalar input $(x,y)$.
|
||||
|
||||
Example: network mapping from $(x,y)$ to $(r,g,b)$.
|
||||
|
||||
### Generalized Positional Encodings
|
||||
|
||||
- Dependence on location, scaler, metadata, etc.
|
||||
- Can just be fully learned (use `nn.Embedding` and optimize based on a categorical input).
|
||||
|
||||
## Vision Transformer (ViT)
|
||||
|
||||
### Class Token
|
||||
|
||||
In Vision Transformers, a special token called the class token is added to the input sequence to aggregate information for classification tasks.
|
||||
|
||||
### Hidden CNN Modules
|
||||
|
||||
- PxP convolution with stride P (split the image into patches and use positional encoding)
|
||||
|
||||
### ViT + ResNet Hybrid
|
||||
|
||||
Build a hybrid model that places the vision transformer after 50 layers of ResNet.
|
||||
|
||||
## Moving Forward
|
||||
|
||||
At least for now, CNN and ViT architectures have similar performance, at least on ImageNet.
|
||||
|
||||
- General Consensus: once the architecture is big enough, and not designed terribly, it can do well.
|
||||
- Differences remain:
|
||||
- Computational efficiency
|
||||
- Ease of use in other tasks and with other input data
|
||||
- Ease of training
|
||||
|
||||
## Wrap up
|
||||
|
||||
Self attention as a key building block
|
||||
|
||||
Flexible input specification using tokens with positional encodings
|
||||
|
||||
A wide variety of architectural styles
|
||||
|
||||
Up Next:
|
||||
Training deep neural networks
|
||||
82
pages/CSE559A/mlp_image_reconstruction.py
Normal file
82
pages/CSE559A/mlp_image_reconstruction.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import torch
|
||||
from torchvision import transforms
|
||||
from PIL import Image
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
class MLPScalar(torch.nn.Module):
    """Two-layer MLP that maps raw 2-D coordinates directly to RGB values.

    The network is ``Linear(2, hidden_dim) -> ReLU -> Linear(hidden_dim, 3)
    -> sigmoid``. No positional encoding is applied to the input.

    Args:
        hidden_dim: Width of the hidden layer. Defaults to 128.
    """

    def __init__(self, hidden_dim=128):
        super().__init__()
        self.fc1 = torch.nn.Linear(2, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 3)  # Outputs RGB

    def forward(self, x):
        """Return RGB predictions for coordinates ``x``.

        Args:
            x: Tensor whose last dimension has size 2 (the input width of
                ``fc1``).

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 3; the sigmoid keeps every value in [0, 1].
        """
        hidden = torch.nn.functional.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))  # Normalize output to [0, 1]
|
||||
|
||||
class MLPPositional(torch.nn.Module):
    """Two-layer MLP that maps sinusoidally encoded 2-D coordinates to RGB.

    Each coordinate is expanded into ``num_frequencies`` sine features,
    ``sin(x * 2**i)`` for ``i`` in ``range(num_frequencies)``. The encoded
    features (optionally concatenated with the raw coordinates) are fed
    through ``Linear -> ReLU -> Linear -> sigmoid``.

    Args:
        num_frequencies: Number of frequency bands in the encoding.
        include_input: If true, the raw coordinates are concatenated in front
            of the encoded features; if false, only the encoding is used.

    Raises:
        ValueError: If ``num_frequencies`` is negative, or if the chosen
            configuration would leave the network with no input features.
    """

    def __init__(self, num_frequencies=10, include_input=True):
        super().__init__()
        if num_frequencies < 0:
            raise ValueError("num_frequencies must be non-negative")
        self.num_frequencies = num_frequencies
        self.include_input = include_input

        # The first layer's width must match exactly what forward() builds:
        # 2 sine features per frequency band, plus the 2 raw coordinates
        # when they are included.
        in_features = 2 * num_frequencies + (2 if include_input else 0)
        if in_features == 0:
            raise ValueError(
                "num_frequencies=0 with include_input=False leaves no input features"
            )
        self.fc1 = torch.nn.Linear(in_features, 128)
        self.fc2 = torch.nn.Linear(128, 3)  # Outputs RGB

    def forward(self, x):
        """Return RGB predictions for coordinates ``x``.

        Args:
            x: Tensor whose last dimension has size 2.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 3; the sigmoid keeps every value in [0, 1].
        """
        features = self.positional_encoding(x)
        if self.include_input:
            features = torch.cat([x, features], dim=-1)
        hidden = torch.nn.functional.relu(self.fc1(features))
        return torch.sigmoid(self.fc2(hidden))  # Normalize output to [0, 1]

    def positional_encoding(self, x):
        """Encode ``x`` with sines at power-of-two frequencies.

        Returns the concatenation along the last dimension of
        ``sin(x * 2**i)`` for ``i`` in ``range(self.num_frequencies)``, so the
        last dimension grows by a factor of ``num_frequencies``.
        """
        bands = [torch.sin(x * (2 ** i)) for i in range(self.num_frequencies)]
        if not bands:
            # torch.cat rejects an empty list; return a zero-width slice so
            # the result can still be concatenated with the raw input.
            return x[..., :0]
        return torch.cat(bands, dim=-1)
|
||||
|
||||
if __name__ == '__main__':
    # Read the image path from stdin. Paths pasted from a shell or file
    # manager are often wrapped in quotes, so strip one matching pair if
    # present, but leave unquoted paths untouched.
    image_path = input().strip()
    if (
        len(image_path) >= 2
        and image_path[0] == image_path[-1]
        and image_path[0] in "\"'"
    ):
        image_path = image_path[1:-1]

    # Load the image; the context manager releases the file handle once the
    # RGB copy has been made.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB')

    # Normalize and resize the image
    transform = transforms.Compose([
        transforms.Resize((256, 256)),  # Resize image to desired dimensions
        transforms.ToTensor(),  # Convert to Tensor and normalize to [0,1]
    ])

    # NOTE: image_tensor is not consumed by the demo below, which only runs
    # untrained models on random coordinates.
    image_tensor = transform(image)

    # Create dummy normalized coordinates (assume image coordinates normalized to [0,1])
    num_points = 10
    coords = torch.rand(num_points, 2)  # random coordinate pairs
    print("Input coordinates:")
    print(coords)

    # Inference only: no_grad keeps the outputs free of autograd history so
    # they can be converted to NumPy arrays for plotting.
    with torch.no_grad():
        # Test MLP with scalar input
        model_scalar = MLPScalar()
        out_scalar = model_scalar(coords)
        print("\nMLPScalar output (RGB):")
        print(out_scalar)

        # Test MLP with positional encoding
        model_positional = MLPPositional(num_frequencies=10, include_input=True)
        out_positional = model_positional(coords)
        print("\nMLPPositional output (RGB):")
        print(out_positional)

    # Scale the predicted colors to 8-bit RGB: one row of 3 bytes per point.
    output_image = (out_positional * 255).byte().numpy()

    # Visualize the output: one color swatch per predicted point.
    num_cols = 5
    num_rows = -(-num_points // num_cols)  # ceiling division
    plt.figure(figsize=(10, 2))
    for i, rgb in enumerate(output_image):
        plt.subplot(num_rows, num_cols, i + 1)
        # A (1, 1, 3) array is drawn as a single RGB pixel; a (1, 3) array
        # would be colormapped as three scalar values instead.
        plt.imshow(rgb.reshape(1, 1, 3), aspect='auto')
        plt.axis('off')
    plt.show()
|
||||
Reference in New Issue
Block a user