charlescxk 2025-01-29 16:02:52 +08:00
parent ff23960c5c
commit c74816ad22
2 changed files with 6 additions and 27 deletions

View File

@@ -119,6 +119,8 @@ vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_pat
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

 ## single image conversation example
+## Please note that <|ref|> and <|/ref|> are designed specifically for the object localization feature. These special tokens are not required for normal conversations.
+## If you would like to try the grounded captioning functionality (responses that include both object localization and reasoning), add the special token <|grounding|> at the beginning of the prompt. Examples can be found in Figure 9 of our paper.
 conversation = [
     {
         "role": "<|User|>",
@@ -334,10 +336,10 @@ This is image_3: <image>

 ### Full Inference Example

 ```shell
 # without incremental prefilling
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2"
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2"

 # with incremental prefilling, when using 40G GPU for vl2-small
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2-small" --chunck_size 512
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2-small" --chunk_size 512
 ```
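
The note added above introduces the `<|ref|>`, `<|/ref|>`, and `<|grounding|>` special tokens. Below is a minimal sketch of how a grounded-captioning request might look with the conversation format used in this README; the image path, prompt text, and the `force_batchify` argument are assumptions for illustration, not part of this commit.

```python
# Sketch only: assumes vl_chat_processor, vl_gpt, and load_pil_images are already
# set up exactly as in the README snippet above.
conversation = [
    {
        "role": "<|User|>",
        # <|grounding|> is placed at the beginning of the prompt text, per the note
        # above, so the response includes object localization alongside reasoning.
        "content": "<image>\n<|grounding|>Describe this image in detail.",
        "images": ["./images/example.jpeg"],  # placeholder path
    },
    {"role": "<|Assistant|>", "content": ""},
]

pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True,  # assumed to match the README's other examples
    system_prompt=""
).to(vl_gpt.device)
# prepare_inputs can then be fed to vl_gpt as in the README's generation snippet.
```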

View File

@@ -76,7 +76,8 @@ def main(args):
     )
     vl_gpt = vl_gpt.cuda().eval()

-    # single image conversation example
+    # multiple images conversation example
+    # Please note that the <|grounding|> token is specifically designed for the grounded captioning feature. It is not needed for normal conversations.
     conversation = [
         {
             "role": "<|User|>",
@@ -89,28 +90,11 @@ def main(args):
         {"role": "<|Assistant|>", "content": ""},
     ]

-    # conversation = [
-    #     {
-    #         "role": "<|User|>",
-    #         "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
-    #         "images": ["./images/visual_grounding_1.jpeg"],
-    #     },
-    #     {"role": "<|Assistant|>", "content": ""},
-    # ]

     # load images and prepare for inputs
     pil_images = load_pil_images(conversation)
     print(f"len(pil_images) = {len(pil_images)}")

-    # input_ids = batched_input_ids,
-    # attention_mask = batched_attention_mask,
-    # labels = batched_labels,
-    # images_tiles = batched_images,
-    # images_seq_mask = batched_images_seq_mask,
-    # images_spatial_crop = batched_images_spatial_crop,
-    # sft_format = batched_sft_format,
-    # seq_lens = seq_lens

     prepare_inputs = vl_chat_processor.__call__(
         conversations=conversation,
         images=pil_images,
@@ -118,13 +102,6 @@ def main(args):
         system_prompt=""
     ).to(vl_gpt.device, dtype=dtype)

-    # for key in prepare_inputs.keys():
-    #     value = prepare_inputs[key]
-    #     if isinstance(value, list):
-    #         print(key, len(value), type(value))
-    #     elif isinstance(value, torch.Tensor):
-    #         print(key, value.shape, type(value))

     with torch.no_grad():
         if args.chunk_size == -1:
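
For reference, the `--model_path` and `--chunk_size` flags exercised by the README commands and by the `args.chunk_size == -1` check above could be declared roughly as follows. This is a hypothetical sketch, not this file's actual argument parser; only the flag names and the `-1` sentinel come from the diff, while the defaults and help strings are assumptions.

```python
# Hypothetical argparse setup matching the CLI flags used in this commit.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="DeepSeek-VL2 inference (sketch)")
    parser.add_argument("--model_path", type=str,
                        default="deepseek-ai/deepseek-vl2",  # assumed default
                        help="Hugging Face model id or local checkpoint path")
    parser.add_argument("--chunk_size", type=int, default=-1,  # assumed default
                        help="prefill chunk size; -1 runs a single full prefill, "
                             "a positive value (e.g. 512) enables incremental prefilling")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()
    print(args.model_path, args.chunk_size)
```

The `-1` sentinel mirrors the branch above: a full prefill when the flag is unset, and chunked (incremental) prefilling otherwise, which trades some latency for lower peak memory, e.g. when running vl2-small on a 40G GPU.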