Mirror of https://github.com/deepseek-ai/DeepSeek-VL.git (synced 2025-04-19 18:19:03 -04:00)
add multiple images (or in-context learning) conversation example
commit dd980e3429
parent 3c02b24219
README.md | 26
@@ -132,18 +132,34 @@ tokenizer = vl_chat_processor.tokenizer
 vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+## single image conversation example
 conversation = [
     {
         "role": "User",
         "content": "<image_placeholder>Describe each stage of this image.",
-        "images": ["./images/training_pipelines.jpg"]
+        "images": ["./images/training_pipelines.jpg"],
     },
-    {
-        "role": "Assistant",
-        "content": ""
-    }
+    {"role": "Assistant", "content": ""},
 ]
 
+## multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#                    "<image_placeholder>a dog wearing a santa hat, "
+#                    "<image_placeholder>a dog wearing a wizard outfit, and "
+#                    "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
+
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)
 prepare_inputs = vl_chat_processor(
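The README hunk is truncated mid-call at prepare_inputs = vl_chat_processor(. For orientation, here is a minimal sketch of how either conversation (single-image or the commented-out multi-image one) is typically consumed downstream, following the pattern used elsewhere in this repo's README; the argument names and the prepare_inputs_embeds / language_model.generate calls reflect that pattern and should be checked against the repo rather than taken as part of this commit.

# sketch of the remainder of the inference pipeline (not part of this diff)
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True
).to(vl_gpt.device)

# run the image encoder to get input embeddings with image features merged in
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

# run the language model to generate a response
outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True,
)

answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(answer)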
images/dog_a.png | BIN (new file, 204 KiB)
images/dog_b.png | BIN (new file, 356 KiB)
images/dog_c.png | BIN (new file, 418 KiB)
images/dog_d.png | BIN (new file, 363 KiB)
inference.py | 21
@@ -24,7 +24,8 @@ from deepseek_vl.models import MultiModalityCausalLM, VLChatProcessor
 from deepseek_vl.utils.io import load_pil_images
 
 # specify the path to the model
-model_path = "deepseek-ai/deepseek-vl-7b-chat"
+# model_path = "deepseek-ai/deepseek-vl-7b-chat"
+model_path = "/hf3fs-jd/prod/deepseek/shared/liuwen/ckpts/deepseek-vl-7b-chat"
 vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
 
@@ -33,6 +34,7 @@ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
 )
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+# single image conversation example
 conversation = [
     {
         "role": "User",
@@ -42,6 +44,23 @@ conversation = [
     {"role": "Assistant", "content": ""},
 ]
 
+# multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#                    "<image_placeholder>a dog wearing a santa hat, "
+#                    "<image_placeholder>a dog wearing a wizard outfit, and "
+#                    "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
+
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)
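A note on why the commented-out multi-image example requires no other code changes: load_pil_images walks the conversation and collects every path listed under the "images" key of each message, so a turn that references four images simply yields four PIL images in order. The following is an illustrative re-implementation of that behavior, not the repo's actual code; load_pil_images_sketch is a hypothetical name used only for this sketch.

from typing import Dict, List

import PIL.Image


def load_pil_images_sketch(conversations: List[Dict]) -> List[PIL.Image.Image]:
    """Collect every image referenced by a conversation, in message order."""
    pil_images = []
    for message in conversations:
        for image_path in message.get("images", []):
            # open each referenced file and normalize to RGB
            pil_images.append(PIL.Image.open(image_path).convert("RGB"))
    return pil_images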