diff --git a/README.md b/README.md index 69a8c03..bc7e2f6 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,23 @@ conversation = [ {"role": "<|Assistant|>", "content": ""}, ] + +# multiple images/interleaved image-text +conversation_multi_images = [ + { + "role": "<|User|>", + "content": "This is image_1: <image>\n" + "This is image_2: <image>\n" + "This is image_3: <image>\n If I am a vegetarian, what can I cook with these ingredients?", + "images": [ + "images/multi_image_1.png", + "images/multi_image_2.jpg", + "images/multi_image_3.jpg", + ], + }, + {"role": "<|Assistant|>", "content": ""} +] + # load images and prepare for inputs pil_images = load_pil_images(conversation) prepare_inputs = vl_chat_processor( @@ -148,7 +165,7 @@ outputs = vl_gpt.language.generate( use_cache=True ) -answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True) +answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False) print(f"{prepare_inputs['sft_format'][0]}", answer) ``` diff --git a/images/multi_image_1.png b/images/multi_image_1.png new file mode 100644 index 0000000..1d619d3 Binary files /dev/null and b/images/multi_image_1.png differ diff --git a/images/multi_image_2.jpg b/images/multi_image_2.jpg new file mode 100644 index 0000000..5777d19 Binary files /dev/null and b/images/multi_image_2.jpg differ diff --git a/images/multi_image_3.jpg b/images/multi_image_3.jpg new file mode 100644 index 0000000..24eb26b Binary files /dev/null and b/images/multi_image_3.jpg differ diff --git a/requirements.txt b/requirements.txt index 37ed33c..c7acd26 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ torch==2.0.1 -transformers>=4.38.2 +transformers==4.38.2 timm>=0.9.16 accelerate sentencepiece