add multiple images (or in-context learning) conversation example

StevenLiuWen 2024-04-16 12:34:19 +08:00
parent 3c02b24219
commit dd980e3429
6 changed files with 41 additions and 6 deletions


@@ -132,18 +132,34 @@ tokenizer = vl_chat_processor.tokenizer
 vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+## single image conversation example
 conversation = [
     {
         "role": "User",
         "content": "<image_placeholder>Describe each stage of this image.",
-        "images": ["./images/training_pipelines.jpg"]
+        "images": ["./images/training_pipelines.jpg"],
     },
-    {
-        "role": "Assistant",
-        "content": ""
-    }
+    {"role": "Assistant", "content": ""},
 ]
+
+## multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#                    "<image_placeholder>a dog wearing a santa hat, "
+#                    "<image_placeholder>a dog wearing a wizard outfit, and "
+#                    "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
 
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)
 prepare_inputs = vl_chat_processor(
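The hunk above ends mid-statement at the prepare_inputs = vl_chat_processor( call. For orientation, a minimal sketch of how the quick-start example completes that call and generates the Assistant turn, based on the code surrounding this commit; the exact generate settings shown here are assumptions and may differ from the repository's current example:

prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True
).to(vl_gpt.device)

# run the image encoder to get image embeddings aligned with the text tokens
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

# autoregressively fill in the empty Assistant turn
outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True,
)

answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(answer)

The same flow works unchanged for the commented-out multi-image conversation: the placeholders are matched to the loaded images in order, so the i-th <image_placeholder> refers to the i-th path in "images".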

BIN  images/dog_a.png  (new binary file, 204 KiB)
BIN  images/dog_b.png  (new binary file, 356 KiB)
BIN  images/dog_c.png  (new binary file, 418 KiB)
BIN  images/dog_d.png  (new binary file, 363 KiB)


@@ -24,7 +24,8 @@ from deepseek_vl.models import MultiModalityCausalLM, VLChatProcessor
 from deepseek_vl.utils.io import load_pil_images
 
 # specify the path to the model
-model_path = "deepseek-ai/deepseek-vl-7b-chat"
+# model_path = "deepseek-ai/deepseek-vl-7b-chat"
+model_path = "/hf3fs-jd/prod/deepseek/shared/liuwen/ckpts/deepseek-vl-7b-chat"
 
 vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
@@ -33,6 +34,7 @@ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
 )
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+# single image conversation example
 conversation = [
     {
         "role": "User",
@@ -42,6 +44,23 @@ conversation = [
     {"role": "Assistant", "content": ""},
 ]
 
+# multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#                    "<image_placeholder>a dog wearing a santa hat, "
+#                    "<image_placeholder>a dog wearing a wizard outfit, and "
+#                    "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
 
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)
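
Both changed files rely on load_pil_images from deepseek_vl.utils.io to resolve the "images" paths in the conversation into PIL images. A minimal sketch of the expected behavior, assuming the documented contract; this is an illustrative re-implementation, not the library's actual code:

from typing import Dict, List

import PIL.Image


def load_pil_images_sketch(conversations: List[Dict]) -> List[PIL.Image.Image]:
    # walk the conversation in order and open every listed image as RGB,
    # so the i-th <image_placeholder> lines up with the i-th returned image
    pil_images = []
    for message in conversations:
        for image_path in message.get("images", []):
            pil_images.append(PIL.Image.open(image_path).convert("RGB"))
    return pil_images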