add multiple images (or in-context learning) conversation example

StevenLiuWen 2024-04-16 12:34:19 +08:00
parent 3c02b24219
commit dd980e3429
6 changed files with 41 additions and 6 deletions

README.md

@@ -132,18 +132,34 @@ tokenizer = vl_chat_processor.tokenizer
 vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+## single image conversation example
 conversation = [
     {
         "role": "User",
         "content": "<image_placeholder>Describe each stage of this image.",
-        "images": ["./images/training_pipelines.jpg"]
+        "images": ["./images/training_pipelines.jpg"],
     },
-    {
-        "role": "Assistant",
-        "content": ""
-    }
+    {"role": "Assistant", "content": ""},
 ]
 
+## multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#         "<image_placeholder>a dog wearing a santa hat, "
+#         "<image_placeholder>a dog wearing a wizard outfit, and "
+#         "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
+
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)
 prepare_inputs = vl_chat_processor(
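
The hunk cuts off mid-call. For context, here is a minimal sketch of how the truncated vl_chat_processor(...) call continues in the repo's example code; the generation settings (max_new_tokens, greedy decoding) are illustrative assumptions, not part of this diff:

prepare_inputs = vl_chat_processor(
    conversations=conversation,  # the list of role/content/images dicts above
    images=pil_images,           # PIL images collected by load_pil_images
    force_batchify=True,
).to(vl_gpt.device)

# run the image encoder to get image embeddings, then generate a response
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,  # assumed setting
    do_sample=False,     # greedy decoding, assumed
    use_cache=True,
)
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(answer)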

BIN  images/dog_a.png (new file, 204 KiB, binary file not shown)
BIN  images/dog_b.png (new file, 356 KiB, binary file not shown)
BIN  images/dog_c.png (new file, 418 KiB, binary file not shown)
BIN  images/dog_d.png (new file, 363 KiB, binary file not shown)

inference.py

@@ -24,7 +24,8 @@ from deepseek_vl.models import MultiModalityCausalLM, VLChatProcessor
 from deepseek_vl.utils.io import load_pil_images
 
 # specify the path to the model
-model_path = "deepseek-ai/deepseek-vl-7b-chat"
+# model_path = "deepseek-ai/deepseek-vl-7b-chat"
+model_path = "/hf3fs-jd/prod/deepseek/shared/liuwen/ckpts/deepseek-vl-7b-chat"
 
 vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
@@ -33,6 +34,7 @@ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
 )
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+# single image conversation example
 conversation = [
     {
         "role": "User",
@@ -42,6 +44,23 @@ conversation = [
     {"role": "Assistant", "content": ""},
 ]
 
+# multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#         "<image_placeholder>a dog wearing a santa hat, "
+#         "<image_placeholder>a dog wearing a wizard outfit, and "
+#         "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)
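
For reference, load_pil_images walks the conversation and opens every path listed under an "images" key, which is why both the single-image and multi-image examples work unchanged downstream. A minimal standalone sketch of that behavior (the real helper lives in deepseek_vl.utils.io; this version is an approximation, not the library's code):

from typing import Dict, List

import PIL.Image

def load_pil_images(conversations: List[Dict]) -> List[PIL.Image.Image]:
    """Collect each path under an "images" key and open it as an RGB PIL image."""
    pil_images = []
    for message in conversations:
        for image_path in message.get("images", []):
            pil_images.append(PIL.Image.open(image_path).convert("RGB"))
    return pil_images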