Make multimodal understanding run on Mac M3

Vadzim Belski 2025-01-29 09:01:52 +04:00
parent ffaac90408
commit 7628bb8174


@@ -33,6 +33,12 @@ vl_gpt = AutoModelForCausalLM.from_pretrained(
 )
 vl_gpt = vl_gpt.float().to(device)
+for name, module in vl_gpt.named_modules():
+    if isinstance(module, torch.nn.Module):
+        module.float()
+vl_gpt.to(device)
 vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
 cuda_device = device
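The first hunk forces the whole model into float32 before it lands on the MPS device: the Metal backend has no float64 support and patchy half-precision coverage, so float32 is the safe dtype. A minimal, self-contained sketch of the same setup (the toy nn.Sequential standing in for vl_gpt is an assumption; note that .float() already casts every submodule recursively, so the named_modules() loop in the diff is redundant but harmless):

import torch
import torch.nn as nn

# Prefer Apple's Metal backend (Apple Silicon, e.g. M3) when available.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Stand-in for vl_gpt; any nn.Module behaves the same way.
model = nn.Sequential(nn.Linear(8, 8), nn.GELU(), nn.Linear(8, 8))

# .float() recursively casts all parameters and buffers to float32.
model = model.float().to(device)
print(next(model.parameters()).dtype, next(model.parameters()).device)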
@@ -57,10 +63,17 @@ def multimodal_understanding(image, question, seed, top_p, temperature):
     ]
     pil_images = [Image.fromarray(image)]
     prepare_inputs = vl_chat_processor(
         conversations=conversation, images=pil_images, force_batchify=True
-    ).to(cuda_device)
+    ).to(cuda_device, dtype=torch.float32)
+
+    # Option 1: just remove the autocast context entirely
+    # with torch.autocast("mps", dtype=torch.float32):
+    #     inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    # OR Option 2: explicitly disable autocast
-    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    with torch.autocast("mps", enabled=False):
+        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
     outputs = vl_gpt.language_model.generate(
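The second hunk moves the inputs to float32 as well and wraps prepare_inputs_embeds in an explicitly disabled autocast context, so no op silently drops to float16 on MPS. A short sketch of the dtype behaviour under the same assumptions (a plain Linear stands in for the embedding path):

import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
layer = torch.nn.Linear(8, 8).to(device)
x = torch.randn(4, 8, device=device)

# With autocast disabled, the matmul stays in the float32 the model was
# cast to; an enabled autocast("mps") would run it in float16 instead.
with torch.autocast(device_type=device.type, enabled=False):
    y = layer(x)
assert y.dtype == torch.float32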
@@ -244,4 +257,4 @@ with gr.Blocks() as demo:
         outputs=image_output
     )
-demo.launch(share=True)
+demo.launch(share=False)
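The last hunk flips demo.launch(share=True) to share=False, which keeps the Gradio app on localhost instead of opening a public gradio.live tunnel, a sensible default for a demo pinned to local Apple hardware. For completeness, a hedged sketch of how the device could be picked once at the top of the script (pick_device is a hypothetical helper, not part of the original file):

import torch

# Hypothetical helper: prefer CUDA, then MPS (Apple Silicon), then CPU.
def pick_device() -> torch.device:
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")

device = pick_device()
print(f"running on {device}")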