Mirror of https://github.com/deepseek-ai/DeepSeek-VL2.git (synced 2025-02-22 05:39:07 -05:00)
fix typo

commit c74816ad22 (parent: ff23960c5c)
@@ -119,6 +119,8 @@ vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_pat
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

## single image conversation example
## Please note that <|ref|> and <|/ref|> are designed specifically for the object localization feature. These special tokens are not required for normal conversations.
## If you would like to experience the grounded captioning functionality (responses that include both object localization and reasoning), you need to add the special token <|grounding|> at the beginning of the prompt. Examples could be found in Figure 9 of our paper.
conversation = [
    {
        "role": "<|User|>",
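As a quick illustration of the notes above (a sketch, not part of this commit): the same conversation format with the localization and grounding tokens filled in. The image path is reused from the commented-out example in inference.py further down; the grounded-captioning prompt wording is an assumption.

```python
# Sketch only: object localization, wrapping the referring phrase in <|ref|> ... <|/ref|>.
localization_conversation = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
        "images": ["./images/visual_grounding_1.jpeg"],
    },
    {"role": "<|Assistant|>", "content": ""},
]

# Sketch only: grounded captioning, prepending <|grounding|> to the prompt
# (the prompt text itself is illustrative).
grounded_caption_conversation = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|grounding|>Describe this image.",
        "images": ["./images/visual_grounding_1.jpeg"],
    },
    {"role": "<|Assistant|>", "content": ""},
]
```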
@@ -334,10 +336,10 @@ This is image_3: <image>

### Full Inference Example

```shell
# without incremental prefilling
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2"
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2"

# with incremental prefilling, when using 40G GPU for vl2-small
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2-small" --chunck_size 512
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2-small" --chunk_size 512
```
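The commands above exercise the two flags this commit corrects. As a sketch (not taken from the repository), here is how those flags might be declared; the -1 default for --chunk_size is an assumption based on the `if args.chunk_size == -1:` check visible in the inference.py hunks below.

```python
import argparse

# Sketch only: hypothetical flag declarations matching the commands above.
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, default="deepseek-ai/deepseek-vl2",
                    help="Hugging Face model id or local checkpoint path")
parser.add_argument("--chunk_size", type=int, default=-1,
                    help="prefill chunk length; -1 means no incremental prefilling (assumed)")
args = parser.parse_args()
```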
inference.py (27 changed lines)
@@ -76,7 +76,8 @@ def main(args):
    )
    vl_gpt = vl_gpt.cuda().eval()

    # single image conversation example
    # multiple images conversation example
    # Please note that <|grounding|> token is specifically designed for the grounded caption feature. It is not needed for normal conversations.
    conversation = [
        {
            "role": "<|User|>",
@@ -89,28 +90,11 @@ def main(args):
        {"role": "<|Assistant|>", "content": ""},
    ]

    # conversation = [
    #     {
    #         "role": "<|User|>",
    #         "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
    #         "images": ["./images/visual_grounding_1.jpeg"],
    #     },
    #     {"role": "<|Assistant|>", "content": ""},
    # ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    print(f"len(pil_images) = {len(pil_images)}")

    # input_ids = batched_input_ids,
    # attention_mask = batched_attention_mask,
    # labels = batched_labels,
    # images_tiles = batched_images,
    # images_seq_mask = batched_images_seq_mask,
    # images_spatial_crop = batched_images_spatial_crop,
    # sft_format = batched_sft_format,
    # seq_lens = seq_lens

    prepare_inputs = vl_chat_processor.__call__(
        conversations=conversation,
        images=pil_images,
@@ -118,13 +102,6 @@ def main(args):
        system_prompt=""
    ).to(vl_gpt.device, dtype=dtype)

    # for key in prepare_inputs.keys():
    #     value = prepare_inputs[key]
    #     if isinstance(value, list):
    #         print(key, len(value), type(value))
    #     elif isinstance(value, torch.Tensor):
    #         print(key, value.shape, type(value))

    with torch.no_grad():

        if args.chunk_size == -1:
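The hunk ends inside the `if args.chunk_size == -1:` branch. For orientation, a sketch of the non-chunked generation path based on the inference example in the project README; the helper names (`prepare_inputs_embeds`, `vl_gpt.language.generate`) and the `tokenizer = vl_chat_processor.tokenizer` accessor are taken from that README and may differ slightly from the actual inference.py.

```python
# Sketch only, continuing from prepare_inputs above; based on the README example,
# not on the code changed by this commit.
tokenizer = vl_chat_processor.tokenizer  # assumed accessor

with torch.no_grad():
    if args.chunk_size == -1:
        # Fuse image features and text tokens into one embedding sequence.
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # Plain autoregressive decoding over the prepared embeddings.
        outputs = vl_gpt.language.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

        answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)
        print(answer)
```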