mirror of https://github.com/deepseek-ai/DeepSeek-VL2.git
synced 2025-02-22 13:49:00 -05:00
fix typo
This commit is contained in:
parent ff23960c5c
commit c74816ad22
@@ -119,6 +119,8 @@ vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_pat
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

 ## single image conversation example
+## Please note that <|ref|> and <|/ref|> are designed specifically for the object localization feature. These special tokens are not required for normal conversations.
+## If you would like to experience the grounded captioning functionality (responses that include both object localization and reasoning), you need to add the special token <|grounding|> at the beginning of the prompt. Examples could be found in Figure 9 of our paper.
 conversation = [
     {
         "role": "<|User|>",
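To make the two added notes concrete, here is a minimal sketch of the prompt patterns they describe, reusing the conversation schema (role / content / images) visible in this hunk. The image paths and prompt wording are placeholders, the localization entry mirrors the commented-out example removed from inference.py later in this commit, and the placement of `<|grounding|>` after the `<image>` placeholder is an assumption modeled on that `<|ref|>` example.

```python
# Sketch of the two prompt patterns described in the notes above.
# Assumptions: conversation schema as in this hunk; image paths are placeholders.

# Object localization: wrap the referred object in <|ref|> ... <|/ref|>.
localization_conversation = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
        "images": ["./images/visual_grounding_1.jpeg"],
    },
    {"role": "<|Assistant|>", "content": ""},
]

# Grounded captioning: put <|grounding|> at the beginning of the prompt so the
# response includes both object localization and reasoning.
grounding_conversation = [
    {
        "role": "<|User|>",
        "content": "<image>\n<|grounding|>Describe this image in detail.",
        "images": ["./images/example.jpeg"],
    },
    {"role": "<|Assistant|>", "content": ""},
]
```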
@@ -334,10 +336,10 @@ This is image_3: <image>
 ### Full Inference Example
 ```shell
 # without incremental prefilling
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2"
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2"

 # with incremental prefilling, when using 40G GPU for vl2-small
-CUDA_VISIBLE_DEVICES=0 python inference.py --model_patn "deepseek-ai/deepseek-vl2-small" --chunck_size 512
+CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2-small" --chunk_size 512

 ```

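The two commands above are the target of the typo fix (`--model_patn` → `--model_path`, `--chunck_size` → `--chunk_size`). For orientation only, a sketch of how such flags might be declared with argparse: the flag names come from the README commands, while the defaults and help strings are assumptions; treating `-1` as "no incremental prefilling" matches the `if args.chunk_size == -1:` branch visible further down in this commit.

```python
import argparse

# Sketch only: flag names taken from the README commands; defaults and help
# text are assumptions, not copied from the repo's inference.py.
parser = argparse.ArgumentParser(description="DeepSeek-VL2 inference")
parser.add_argument("--model_path", type=str,
                    default="deepseek-ai/deepseek-vl2",
                    help="Hugging Face model id or local checkpoint path")
parser.add_argument("--chunk_size", type=int, default=-1,
                    help="prefill chunk size; -1 disables incremental prefilling")
args = parser.parse_args()
```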
inference.py (27 changed lines)
@@ -76,7 +76,8 @@ def main(args):
     )
     vl_gpt = vl_gpt.cuda().eval()

-    # single image conversation example
+    # multiple images conversation example
+    # Please note that <|grounding|> token is specifically designed for the grounded caption feature. It is not needed for normal conversations.
     conversation = [
         {
             "role": "<|User|>",
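The comment rename above ("multiple images conversation example") lines up with the README context shown earlier in this commit ("This is image_3: <image>"). A minimal sketch of such a conversation, assuming one `<image>` placeholder per path listed under `images`; the file names and the question text are placeholders.

```python
# Sketch: multi-image conversation, one <image> placeholder per image path.
# File names and the question text are placeholders (assumptions).
conversation = [
    {
        "role": "<|User|>",
        "content": "This is image_1: <image>\n"
                   "This is image_2: <image>\n"
                   "This is image_3: <image>\n"
                   "Describe what the three images have in common.",
        "images": [
            "./images/image_1.jpeg",
            "./images/image_2.jpeg",
            "./images/image_3.jpeg",
        ],
    },
    {"role": "<|Assistant|>", "content": ""},
]
```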
@@ -89,28 +90,11 @@ def main(args):
         {"role": "<|Assistant|>", "content": ""},
     ]

-    # conversation = [
-    #     {
-    #         "role": "<|User|>",
-    #         "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
-    #         "images": ["./images/visual_grounding_1.jpeg"],
-    #     },
-    #     {"role": "<|Assistant|>", "content": ""},
-    # ]

     # load images and prepare for inputs
     pil_images = load_pil_images(conversation)
     print(f"len(pil_images) = {len(pil_images)}")

-    # input_ids = batched_input_ids,
-    # attention_mask = batched_attention_mask,
-    # labels = batched_labels,
-    # images_tiles = batched_images,
-    # images_seq_mask = batched_images_seq_mask,
-    # images_spatial_crop = batched_images_spatial_crop,
-    # sft_format = batched_sft_format,
-    # seq_lens = seq_lens

     prepare_inputs = vl_chat_processor.__call__(
         conversations=conversation,
         images=pil_images,
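This hunk leaves the image-loading call (`pil_images = load_pil_images(conversation)`) and the `vl_chat_processor.__call__` invocation untouched. For readers without the repo at hand, a hypothetical stand-in sketching what `load_pil_images` presumably does, based only on how it is used here (takes the conversation, returns a list whose length is printed); the function body and the `_sketch` name are assumptions, not the repo's implementation.

```python
from typing import Dict, List
from PIL import Image

def load_pil_images_sketch(conversation: List[Dict]) -> List[Image.Image]:
    """Hypothetical stand-in (our name, not the repo's code).

    Assumption: collect every path listed under the "images" key of each
    conversation turn, in order, and open it as a PIL image.
    """
    pil_images = []
    for message in conversation:
        for image_path in message.get("images", []):
            pil_images.append(Image.open(image_path).convert("RGB"))
    return pil_images
```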
@@ -118,13 +102,6 @@ def main(args):
         system_prompt=""
     ).to(vl_gpt.device, dtype=dtype)

-    # for key in prepare_inputs.keys():
-    #     value = prepare_inputs[key]
-    #     if isinstance(value, list):
-    #         print(key, len(value), type(value))
-    #     elif isinstance(value, torch.Tensor):
-    #         print(key, value.shape, type(value))
-
     with torch.no_grad():

         if args.chunk_size == -1:
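The commented-out lines deleted above were a quick way to inspect what the chat processor returns. If that check is still wanted, here is the same logic reconstructed from the removed lines as a standalone helper; only the function name is ours. The surviving context also confirms that `args.chunk_size == -1` selects the "without incremental prefilling" path named in the README commands earlier in this commit.

```python
import torch

def describe_prepare_inputs(prepare_inputs) -> None:
    """Print the length or shape of every field the chat processor produced.

    Reconstructed from the debug loop removed in this hunk; the helper name
    is ours, not from the repo.
    """
    for key in prepare_inputs.keys():
        value = prepare_inputs[key]
        if isinstance(value, list):
            print(key, len(value), type(value))
        elif isinstance(value, torch.Tensor):
            print(key, value.shape, type(value))
```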