From c74816ad22b7def9c4588dd22bf8d20c1db04a58 Mon Sep 17 00:00:00 2001
From: charlescxk
Date: Wed, 29 Jan 2025 16:02:52 +0800
Subject: [PATCH] fix typo

---
 README.md    |  6 ++++--
 inference.py | 27 ++-------------------------
 2 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 8e4a851..c7591b0 100644
--- a/README.md
+++ b/README.md
@@ -119,6 +119,8 @@ vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_pat
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
 ## single image conversation example
+## Please note that <|ref|> and <|/ref|> are designed specifically for the object localization feature. These special tokens are not required for normal conversations.
+## If you would like to experience the grounded captioning functionality (responses that include both object localization and reasoning), you need to add the special token <|grounding|> at the beginning of the prompt. Examples can be found in Figure 9 of our paper.
 conversation = [
     {
         "role": "<|User|>",
@@ -334,10 +336,10 @@ This is image_3: <image>
 
 ### Full Inference Example
 ```shell
 # without incremental prefilling
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2"
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2"
 
 # with incremental prefilling, when using 40G GPU for vl2-small
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2-small" --chunck_size 512
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2-small" --chunk_size 512
 ```
 
 
diff --git a/inference.py b/inference.py
index 9f94aa6..86bc10e 100644
--- a/inference.py
+++ b/inference.py
@@ -76,7 +76,8 @@ def main(args):
     )
     vl_gpt = vl_gpt.cuda().eval()
 
-    # single image conversation example
+    # multiple images conversation example
+    # Please note that the <|grounding|> token is specifically designed for the grounded caption feature. It is not needed for normal conversations.
     conversation = [
         {
             "role": "<|User|>",
@@ -89,28 +90,11 @@ def main(args):
         {"role": "<|Assistant|>", "content": ""},
     ]
 
-    # conversation = [
-    #     {
-    #         "role": "<|User|>",
-    #         "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
-    #         "images": ["./images/visual_grounding_1.jpeg"],
-    #     },
-    #     {"role": "<|Assistant|>", "content": ""},
-    # ]
 
     # load images and prepare for inputs
     pil_images = load_pil_images(conversation)
     print(f"len(pil_images) = {len(pil_images)}")
 
-    # input_ids = batched_input_ids,
-    # attention_mask = batched_attention_mask,
-    # labels = batched_labels,
-    # images_tiles = batched_images,
-    # images_seq_mask = batched_images_seq_mask,
-    # images_spatial_crop = batched_images_spatial_crop,
-    # sft_format = batched_sft_format,
-    # seq_lens = seq_lens
-
     prepare_inputs = vl_chat_processor.__call__(
         conversations=conversation,
         images=pil_images,
@@ -118,13 +102,6 @@ def main(args):
         system_prompt=""
     ).to(vl_gpt.device, dtype=dtype)
 
-    # for key in prepare_inputs.keys():
-    #     value = prepare_inputs[key]
-    #     if isinstance(value, list):
-    #         print(key, len(value), type(value))
-    #     elif isinstance(value, torch.Tensor):
-    #         print(key, value.shape, type(value))
-
     with torch.no_grad():
 
         if args.chunk_size == -1:
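
For quick reference, below is a minimal sketch of the two prompt styles described by the comments this patch adds. It is illustrative only, not part of the patch: the conversation structure (roles, `content`, `images`) and the `<|ref|>` example are taken from the snippets visible above, while the grounded-captioning prompt text and its image path are placeholders.

```python
# Illustrative sketch only -- not part of the patch above.
# Structure follows the conversation format shown in the patched README/inference.py.

# Grounded captioning: prefix the prompt with <|grounding|> so the response
# includes object localization alongside the description.
grounded_captioning = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|grounding|>Describe this image in detail.",  # placeholder prompt
        "images": ["./images/example.jpeg"],  # placeholder path
    },
    {"role": "<|Assistant|>", "content": ""},
]

# Object localization: wrap the referring expression in <|ref|> ... <|/ref|>
# (this example appears in the commented-out block removed by the patch).
object_localization = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
        "images": ["./images/visual_grounding_1.jpeg"],
    },
    {"role": "<|Assistant|>", "content": ""},
]
```

Either list can be fed to `load_pil_images` and `vl_chat_processor` exactly as `inference.py` does with its `conversation` variable; neither special token is needed for ordinary conversations, as the new comments note.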