From c74816ad22b7def9c4588dd22bf8d20c1db04a58 Mon Sep 17 00:00:00 2001
From: charlescxk
Date: Wed, 29 Jan 2025 16:02:52 +0800
Subject: [PATCH] fix typo

---
 README.md    |  6 ++++--
 inference.py | 27 ++-------------------------
 2 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 8e4a851..c7591b0 100644
--- a/README.md
+++ b/README.md
@@ -119,6 +119,8 @@ vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_pat
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
 ## single image conversation example
+## Please note that <|ref|> and <|/ref|> are designed specifically for the object localization feature. These special tokens are not required for normal conversations.
+## If you would like to experience the grounded captioning functionality (responses that include both object localization and reasoning), you need to add the special token <|grounding|> at the beginning of the prompt. Examples can be found in Figure 9 of our paper.
 conversation = [
     {
         "role": "<|User|>",
@@ -334,10 +336,10 @@ This is image_3: <image>
 
 ### Full Inference Example
 ```shell
 # without incremental prefilling
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2"
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2"
 
 # with incremental prefilling, when using 40G GPU for vl2-small
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2-small" --chunck_size 512
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2-small" --chunk_size 512
 ```
 
 
diff --git a/inference.py b/inference.py
index 9f94aa6..86bc10e 100644
--- a/inference.py
+++ b/inference.py
@@ -76,7 +76,8 @@ def main(args):
     )
     vl_gpt = vl_gpt.cuda().eval()
 
-    # single image conversation example
+    # multiple images conversation example
+    # Please note that the <|grounding|> token is specifically designed for the grounded caption feature. It is not needed for normal conversations.
     conversation = [
         {
             "role": "<|User|>",
@@ -89,28 +90,11 @@ def main(args):
         {"role": "<|Assistant|>", "content": ""},
     ]
 
-    # conversation = [
-    #     {
-    #         "role": "<|User|>",
-    #         "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
-    #         "images": ["./images/visual_grounding_1.jpeg"],
-    #     },
-    #     {"role": "<|Assistant|>", "content": ""},
-    # ]
 
     # load images and prepare for inputs
     pil_images = load_pil_images(conversation)
     print(f"len(pil_images) = {len(pil_images)}")
 
-    # input_ids = batched_input_ids,
-    # attention_mask = batched_attention_mask,
-    # labels = batched_labels,
-    # images_tiles = batched_images,
-    # images_seq_mask = batched_images_seq_mask,
-    # images_spatial_crop = batched_images_spatial_crop,
-    # sft_format = batched_sft_format,
-    # seq_lens = seq_lens
-
     prepare_inputs = vl_chat_processor.__call__(
         conversations=conversation,
         images=pil_images,
@@ -118,13 +102,6 @@ def main(args):
         system_prompt=""
     ).to(vl_gpt.device, dtype=dtype)
 
-    # for key in prepare_inputs.keys():
-    #     value = prepare_inputs[key]
-    #     if isinstance(value, list):
-    #         print(key, len(value), type(value))
-    #     elif isinstance(value, torch.Tensor):
-    #         print(key, value.shape, type(value))
-
     with torch.no_grad():
 
         if args.chunk_size == -1:
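
For quick reference, below is a minimal sketch of the two prompt styles described by the comments this patch adds. It is illustrative only, not part of the patch: the conversation structure (roles, `content`, `images`) and the `<|ref|>` example are taken from the snippets visible above, while the grounded-captioning prompt text and its image path are placeholders.

```python
# Illustrative sketch only -- not part of the patch above.
# Structure follows the conversation format shown in the patched README/inference.py.

# Grounded captioning: prefix the prompt with <|grounding|> so the response
# includes object localization alongside the description.
grounded_captioning = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|grounding|>Describe this image in detail.",  # placeholder prompt
        "images": ["./images/example.jpeg"],  # placeholder path
    },
    {"role": "<|Assistant|>", "content": ""},
]

# Object localization: wrap the referring expression in <|ref|> ... <|/ref|>
# (this example appears in the commented-out block removed by the patch).
object_localization = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
        "images": ["./images/visual_grounding_1.jpeg"],
    },
    {"role": "<|Assistant|>", "content": ""},
]
```

Either list can be fed to `load_pil_images` and `vl_chat_processor` exactly as `inference.py` does with its `conversation` variable; neither special token is needed for ordinary conversations, as the new comments note.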