From 71ada8d4986f2801a70a4c8cced1b1fe03d2dde5 Mon Sep 17 00:00:00 2001 From: Trance-0 <60459821+Trance-0@users.noreply.github.com> Date: Mon, 3 Nov 2025 23:56:22 -0600 Subject: [PATCH] sad --- content/CSE5519/CSE5519_B4.md | 15 +++++++++++++++ package.json | 7 ++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/content/CSE5519/CSE5519_B4.md b/content/CSE5519/CSE5519_B4.md index e6037b4..74ad181 100644 --- a/content/CSE5519/CSE5519_B4.md +++ b/content/CSE5519/CSE5519_B4.md @@ -1,2 +1,17 @@ # CSE5519 Advances in Computer Vision (Topic B: 2024: Vision-Language Models) +## Improved Baselines with Visual Instruction Tuning (LLaVA-1.5) + +[link to the paper](https://openaccess.thecvf.com/content/CVPR2024/papers/Liu_Improved_Baselines_with_Visual_Instruction_Tuning_CVPR_2024_paper.pdf) + +This paper shows that visual instruction tuning can improve the performance of vision-language models. + +### Novelty in LLaVA-1.5 + +1. Scaling to high-resolution images by dividing images into grids and maintaining the data efficiency. +2. Compositional ability (using long-form language reasoning together with shorter visual reasoning can improve the model's writing ability). +3. Random downsampling will not degrade the performance. + +>[!TIP] +> +> This paper shows that LLaVA-1.5 obeys the scaling law and splits high-resolution images into grids to maintain data efficiency. I wonder why this method is not applicable to multi-image understanding tasks. Why can't we assign index embeddings to each image and push the image sets to the model for better understanding? 
\ No newline at end of file diff --git a/package.json b/package.json index d741347..6714dcb 100644 --- a/package.json +++ b/package.json @@ -15,14 +15,15 @@ "@vercel/analytics": "^1.5.0", "@vercel/speed-insights": "^1.2.0", "cross-env": "^7.0.3", + "eslint-config-next": "^16.0.1", "katex": "^0.16.22", - "next": "^15.5.2", + "next": "^16.0.1", "next-sitemap": "^4.2.3", "nextra": "^4.2.17", "nextra-theme-docs": "^4.2.17", "pagefind": "^1.4.0", - "react": "^19.1.0", - "react-dom": "^19.1.0" + "react": "^19.2.0", + "react-dom": "^19.2.0" }, "devDependencies": { "@types/node": "24.0.10",