diff --git a/README.md b/README.md
index 40e8a43..98b60a1 100644
--- a/README.md
+++ b/README.md
@@ -132,18 +132,34 @@ tokenizer = vl_chat_processor.tokenizer
 vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+## single image conversation example
 conversation = [
     {
         "role": "User",
         "content": "<image_placeholder>Describe each stage of this image.",
-        "images": ["./images/training_pipelines.jpg"]
+        "images": ["./images/training_pipelines.jpg"],
     },
-    {
-        "role": "Assistant",
-        "content": ""
-    }
+    {"role": "Assistant", "content": ""},
 ]
 
+## multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#                    "<image_placeholder>a dog wearing a santa hat, "
+#                    "<image_placeholder>a dog wearing a wizard outfit, and "
+#                    "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
+
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)
 prepare_inputs = vl_chat_processor(
diff --git a/images/dog_a.png b/images/dog_a.png
new file mode 100644
index 0000000..956caab
Binary files /dev/null and b/images/dog_a.png differ
diff --git a/images/dog_b.png b/images/dog_b.png
new file mode 100644
index 0000000..221f1d1
Binary files /dev/null and b/images/dog_b.png differ
diff --git a/images/dog_c.png b/images/dog_c.png
new file mode 100644
index 0000000..283a182
Binary files /dev/null and b/images/dog_c.png differ
diff --git a/images/dog_d.png b/images/dog_d.png
new file mode 100644
index 0000000..d9ff5d6
Binary files /dev/null and b/images/dog_d.png differ
diff --git a/inference.py b/inference.py
index 94da96e..fcf98ce 100644
--- a/inference.py
+++ b/inference.py
@@ -24,7 +24,8 @@ from deepseek_vl.models import MultiModalityCausalLM, VLChatProcessor
 from deepseek_vl.utils.io import load_pil_images
 
 # specify the path to the model
-model_path = "deepseek-ai/deepseek-vl-7b-chat"
+# model_path = "deepseek-ai/deepseek-vl-7b-chat"
+model_path = "/hf3fs-jd/prod/deepseek/shared/liuwen/ckpts/deepseek-vl-7b-chat"
 vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
 
@@ -33,6 +34,7 @@ vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
 )
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+# single image conversation example
 conversation = [
     {
         "role": "User",
@@ -42,6 +44,23 @@ conversation = [
     {"role": "Assistant", "content": ""},
 ]
 
+# multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#                    "<image_placeholder>a dog wearing a santa hat, "
+#                    "<image_placeholder>a dog wearing a wizard outfit, and "
+#                    "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)