Mirror of https://github.com/deepseek-ai/DeepSeek-VL2.git (synced 2025-02-22 05:39:07 -05:00)
fix typo

commit c74816ad22 (parent: ff23960c5c)
@@ -119,6 +119,8 @@ vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_pat
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

## single image conversation example
## Please note that <|ref|> and <|/ref|> are designed specifically for the object localization feature. These special tokens are not required for normal conversations.
## If you would like to experience the grounded captioning functionality (responses that include both object localization and reasoning), you need to add the special token <|grounding|> at the beginning of the prompt. Examples could be found in Figure 9 of our paper.
conversation = [
    {
        "role": "<|User|>",
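As a quick illustration of the notes above (a sketch, not part of this commit): the same conversation format with the localization and grounding tokens filled in. The image path is reused from the commented-out example in inference.py further down; the grounded-captioning prompt wording is an assumption.

```python
# Sketch only: object localization, wrapping the referring phrase in <|ref|> ... <|/ref|>.
localization_conversation = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
        "images": ["./images/visual_grounding_1.jpeg"],
    },
    {"role": "<|Assistant|>", "content": ""},
]

# Sketch only: grounded captioning, prepending <|grounding|> to the prompt
# (the prompt text itself is illustrative).
grounded_caption_conversation = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|grounding|>Describe this image.",
        "images": ["./images/visual_grounding_1.jpeg"],
    },
    {"role": "<|Assistant|>", "content": ""},
]
```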
@@ -334,10 +336,10 @@ This is image_3: <image>

### Full Inference Example

```shell
# without incremental prefilling
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2"
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2"

# with incremental prefilling, when using 40G GPU for vl2-small
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2-small" --chunck_size 512
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2-small" --chunk_size 512
```
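The commands above exercise the two flags this commit corrects. As a sketch (not taken from the repository), here is how those flags might be declared; the -1 default for --chunk_size is an assumption based on the `if args.chunk_size == -1:` check visible in the inference.py hunks below.

```python
import argparse

# Sketch only: hypothetical flag declarations matching the commands above.
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, default="deepseek-ai/deepseek-vl2",
                    help="Hugging Face model id or local checkpoint path")
parser.add_argument("--chunk_size", type=int, default=-1,
                    help="prefill chunk length; -1 means no incremental prefilling (assumed)")
args = parser.parse_args()
```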
inference.py (27 changed lines)
@@ -76,7 +76,8 @@ def main(args):
    )
    vl_gpt = vl_gpt.cuda().eval()

    # single image conversation example
    # multiple images conversation example
    # Please note that <|grounding|> token is specifically designed for the grounded caption feature. It is not needed for normal conversations.
    conversation = [
        {
            "role": "<|User|>",
@@ -89,28 +90,11 @@ def main(args):
        {"role": "<|Assistant|>", "content": ""},
    ]

    # conversation = [
    #     {
    #         "role": "<|User|>",
    #         "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
    #         "images": ["./images/visual_grounding_1.jpeg"],
    #     },
    #     {"role": "<|Assistant|>", "content": ""},
    # ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    print(f"len(pil_images) = {len(pil_images)}")

    # input_ids = batched_input_ids,
    # attention_mask = batched_attention_mask,
    # labels = batched_labels,
    # images_tiles = batched_images,
    # images_seq_mask = batched_images_seq_mask,
    # images_spatial_crop = batched_images_spatial_crop,
    # sft_format = batched_sft_format,
    # seq_lens = seq_lens

    prepare_inputs = vl_chat_processor.__call__(
        conversations=conversation,
        images=pil_images,
@@ -118,13 +102,6 @@ def main(args):
        system_prompt=""
    ).to(vl_gpt.device, dtype=dtype)

    # for key in prepare_inputs.keys():
    #     value = prepare_inputs[key]
    #     if isinstance(value, list):
    #         print(key, len(value), type(value))
    #     elif isinstance(value, torch.Tensor):
    #         print(key, value.shape, type(value))

    with torch.no_grad():

        if args.chunk_size == -1:
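The hunk ends inside the `if args.chunk_size == -1:` branch. For orientation, a sketch of the non-chunked generation path based on the inference example in the project README; the helper names (`prepare_inputs_embeds`, `vl_gpt.language.generate`) and the `tokenizer = vl_chat_processor.tokenizer` accessor are taken from that README and may differ slightly from the actual inference.py.

```python
# Sketch only, continuing from prepare_inputs above; based on the README example,
# not on the code changed by this commit.
tokenizer = vl_chat_processor.tokenizer  # assumed accessor

with torch.no_grad():
    if args.chunk_size == -1:
        # Fuse image features and text tokens into one embedding sequence.
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # Plain autoregressive decoding over the prepared embeddings.
        outputs = vl_gpt.language.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

        answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)
        print(answer)
```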