mirror of
https://github.com/deepseek-ai/DeepSeek-VL.git
synced 2025-04-19 01:59:13 -04:00
add multiple images (or in-context learning) conversation example
This commit is contained in:
parent
3c02b24219
commit
dd980e3429
26
README.md
26
README.md
@@ -132,18 +132,34 @@ tokenizer = vl_chat_processor.tokenizer
|
||||
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
|
||||
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
|
||||
|
||||
## single image conversation example
|
||||
conversation = [
|
||||
{
|
||||
"role": "User",
|
||||
"content": "<image_placeholder>Describe each stage of this image.",
|
||||
"images": ["./images/training_pipelines.jpg"]
|
||||
"images": ["./images/training_pipelines.jpg"],
|
||||
},
|
||||
{
|
||||
"role": "Assistant",
|
||||
"content": ""
|
||||
}
|
||||
{"role": "Assistant", "content": ""},
|
||||
]
|
||||
|
||||
## multiple images (or in-context learning) conversation example
|
||||
# conversation = [
|
||||
# {
|
||||
# "role": "User",
|
||||
# "content": "<image_placeholder>A dog wearing nothing in the foreground, "
|
||||
# "<image_placeholder>a dog wearing a santa hat, "
|
||||
# "<image_placeholder>a dog wearing a wizard outfit, and "
|
||||
# "<image_placeholder>what's the dog wearing?",
|
||||
# "images": [
|
||||
# "images/dog_a.png",
|
||||
# "images/dog_b.png",
|
||||
# "images/dog_c.png",
|
||||
# "images/dog_d.png",
|
||||
# ],
|
||||
# },
|
||||
# {"role": "Assistant", "content": ""}
|
||||
# ]
|
||||
|
||||
# load images and prepare for inputs
|
||||
pil_images = load_pil_images(conversation)
|
||||
prepare_inputs = vl_chat_processor(
|
||||
|
BIN
images/dog_a.png
Normal file
BIN
images/dog_a.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 204 KiB |
BIN
images/dog_b.png
Normal file
BIN
images/dog_b.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 356 KiB |
BIN
images/dog_c.png
Normal file
BIN
images/dog_c.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 418 KiB |
BIN
images/dog_d.png
Normal file
BIN
images/dog_d.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 363 KiB |
21
inference.py
21
inference.py
@@ -24,7 +24,8 @@ from deepseek_vl.models import MultiModalityCausalLM, VLChatProcessor
|
||||
from deepseek_vl.utils.io import load_pil_images
|
||||
|
||||
# specify the path to the model
|
||||
model_path = "deepseek-ai/deepseek-vl-7b-chat"
|
||||
# model_path = "deepseek-ai/deepseek-vl-7b-chat"
|
||||
model_path = "/hf3fs-jd/prod/deepseek/shared/liuwen/ckpts/deepseek-vl-7b-chat"
|
||||
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
|
||||
tokenizer = vl_chat_processor.tokenizer
|
||||
|
||||
@@ -33,6 +34,7 @@ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
|
||||
)
|
||||
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
|
||||
|
||||
# single image conversation example
|
||||
conversation = [
|
||||
{
|
||||
"role": "User",
|
||||
@@ -42,6 +44,23 @@ conversation = [
|
||||
{"role": "Assistant", "content": ""},
|
||||
]
|
||||
|
||||
# multiple images (or in-context learning) conversation example
|
||||
# conversation = [
|
||||
# {
|
||||
# "role": "User",
|
||||
# "content": "<image_placeholder>A dog wearing nothing in the foreground, "
|
||||
# "<image_placeholder>a dog wearing a santa hat, "
|
||||
# "<image_placeholder>a dog wearing a wizard outfit, and "
|
||||
# "<image_placeholder>what's the dog wearing?",
|
||||
# "images": [
|
||||
# "images/dog_a.png",
|
||||
# "images/dog_b.png",
|
||||
# "images/dog_c.png",
|
||||
# "images/dog_d.png",
|
||||
# ],
|
||||
# },
|
||||
# {"role": "Assistant", "content": ""}
|
||||
# ]
|
||||
|
||||
# load images and prepare for inputs
|
||||
pil_images = load_pil_images(conversation)
|
||||
|
Loading…
Reference in New Issue
Block a user