Trance-0
2025-11-03 23:56:22 -06:00
parent bc44c59707
commit 71ada8d498
2 changed files with 19 additions and 3 deletions


@@ -1,2 +1,17 @@
# CSE5519 Advances in Computer Vision (Topic B: 2024: Vision-Language Models)
## Improved Baselines with Visual Instruction Tuning (LLaVA-1.5)
[link to the paper](https://openaccess.thecvf.com/content/CVPR2024/papers/Liu_Improved_Baselines_with_Visual_Instruction_Tuning_CVPR_2024_paper.pdf)
This paper shows that visual instruction tuning can improve the performance of vision-language models.
### Novelty in LLaVA-1.5
1. Scaling to high-resolution images by dividing each image into a grid of tiles while maintaining data efficiency (see the sketch after this list).
2. Compositional ability: combining long-form language reasoning with shorter visual reasoning improves the model's writing ability.
3. Random downsampling of the training data does not degrade performance.
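
A minimal sketch of the grid-splitting idea, assuming a PIL image as input; the 336-pixel tile size and the prepended low-resolution global view are illustrative choices, not the paper's exact configuration:

```python
from PIL import Image

def split_into_grid(image: Image.Image, tile: int = 336) -> list[Image.Image]:
    """Split a high-resolution image into fixed-size tiles plus a global view.

    Each tile (and the downscaled full image) can be encoded independently by
    the vision encoder, so input resolution grows without retraining the encoder.
    """
    # Round the image up/down to a whole number of tiles (illustrative choice).
    w, h = image.size
    cols = max(1, round(w / tile))
    rows = max(1, round(h / tile))
    resized = image.resize((cols * tile, rows * tile))

    # Crop row-major tiles covering the whole resized image.
    tiles = [
        resized.crop((c * tile, r * tile, (c + 1) * tile, (r + 1) * tile))
        for r in range(rows)
        for c in range(cols)
    ]

    # Prepend a low-resolution global view so the model keeps overall context.
    return [image.resize((tile, tile))] + tiles
```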
>[!TIP]
>
> This paper shows that LLaVA-1.5 obeys the scaling law and splits high-resolution images into grids to maintain data efficiency. I wonder why this method is not applicable to multi-image understanding tasks. Why can't we assign index embeddings to each image and feed the image set to the model for better understanding?


@@ -15,14 +15,15 @@
"@vercel/analytics": "^1.5.0",
"@vercel/speed-insights": "^1.2.0",
"cross-env": "^7.0.3",
"eslint-config-next": "^16.0.1",
"katex": "^0.16.22",
"next": "^15.5.2",
"next": "^16.0.1",
"next-sitemap": "^4.2.3",
"nextra": "^4.2.17",
"nextra-theme-docs": "^4.2.17",
"pagefind": "^1.4.0",
"react": "^19.1.0",
"react-dom": "^19.1.0"
"react": "^19.2.0",
"react-dom": "^19.2.0"
},
"devDependencies": {
"@types/node": "24.0.10",