# DeepSeek-VL2/client.py

import json
import time
import sys
import os
from gradio_client import Client, handle_file

# Address of the Gradio server; change these if the server runs elsewhere
ip = "127.0.0.1"
port = "8080"

# Caption prompt sent along with each image
user_prompt = "Thoroughly and carefully describe this image."

# Sampling hyperparameters
max_tokens = 100
temperature = 0.6
top_p = 0.9
top_k = 50

# Inputs to process and the JSON file that receives the responses
output_file = "output.json"
files = []

# Index of the first entry of `files` to process
startAt = 0

# File extensions (compared case-insensitively) picked up from a --directory
INPUT_EXTENSIONS = ('.jpg', '.png', '.jpeg', '.txt')

# Option flag -> (module-level setting it overrides, converter for its value)
_CLI_FLAGS = {
    "--ip": ("ip", str),
    "--port": ("port", str),
    "--start": ("startAt", int),
    "--prompt": ("user_prompt", str),
    "--temperature": ("temperature", float),
    "--top_k": ("top_k", int),
    "--top_p": ("top_p", float),
    "--max_tokens": ("max_tokens", int),
    "--output": ("output_file", str),
    "-o": ("output_file", str),
}


def parse_cli_options(argv):
    """Extract the recognised ``flag value`` pairs from *argv*.

    The value following a recognised flag is consumed together with the flag,
    so it is never itself interpreted as another flag. Unrecognised arguments
    are skipped.

    Args:
        argv: Full argument vector with the program name at index 0.

    Returns:
        A ``(settings, directories, argument_start)`` tuple where *settings*
        maps module-level setting names to their converted values,
        *directories* lists every ``--directory`` value in order, and
        *argument_start* is ``1 + 2 * <number of options consumed>``. The
        rest of the script treats ``argv[argument_start:]`` as positional
        input paths, so options are expected to precede those paths.

    Raises:
        ValueError: If a flag is the last argument (no value follows it) or
            its value cannot be converted to the expected numeric type.
    """
    settings = {}
    directories = []
    argument_start = 1
    position = 1  # index 0 is the program name
    while position < len(argv):
        flag = argv[position]
        if flag != "--directory" and flag not in _CLI_FLAGS:
            position += 1
            continue
        if position + 1 >= len(argv):
            raise ValueError(f"Option '{flag}' requires a value.")
        raw_value = argv[position + 1]
        if flag == "--directory":
            directories.append(raw_value)
        else:
            setting, convert = _CLI_FLAGS[flag]
            try:
                settings[setting] = convert(raw_value)
            except ValueError:
                raise ValueError(
                    f"Invalid value '{raw_value}' for option '{flag}'."
                ) from None
        argument_start += 2
        position += 2
    return settings, directories, argument_start


def list_directory_inputs(directory):
    """Return the sorted paths of the supported input files in *directory*.

    A file is supported when its lower-cased name ends with one of
    ``INPUT_EXTENSIONS``. Each returned path is *directory* joined with the
    file name.
    """
    return [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.lower().endswith(INPUT_EXTENSIONS)
    ]


try:
    _cli_settings, _cli_directories, argumentStart = parse_cli_options(sys.argv)
except ValueError as error:
    print(f"Error: {error}")
    sys.exit(1)

# Overwrite only the module-level settings that were given on the command
# line; every key in _cli_settings is the name of one of those settings.
globals().update(_cli_settings)

# Add the supported files of every requested directory to the input list
for directory in _cli_directories:
    if not os.path.isdir(directory):
        print(f"Error: Directory '{directory}' does not exist.")
        sys.exit(1)
    files.extend(list_directory_inputs(directory))

# Connect the Gradio client to the configured server
client = Client(f"http://{ip}:{port}")

# The output maps each input path to its response; the prompt is stored too
results = dict(prompt=user_prompt)

# Every argument after the consumed options is an additional input path
files.extend(sys.argv[argumentStart:])

# Keep the inputs in sorted order
files.sort()

# Process the inputs in order, optionally resuming from index `startAt`.
# Any failure stops the run; the responses gathered so far are kept in
# `results` and `output_file` is renamed to mark the output as partial.
for i in range(startAt, len(files)):
    # Grab the next input path
    image_path = files[i]

    # Count start time
    start = time.time()

    try:
        # A .txt input supplies its own prompt and no image; any other input
        # is uploaded as the image and paired with the shared prompt. The
        # extension check is case-insensitive, like the directory filter.
        imageFile = None
        this_user_prompt = user_prompt
        if image_path.lower().endswith('.txt'):
            # Explicit encoding so prompt files read the same on every platform
            with open(image_path, 'r', encoding='utf-8') as txt_file:
                this_user_prompt = txt_file.read().strip()
        else:
            imageFile = handle_file(image_path)

        # Reset the server-side state before each query
        client.predict(api_name="/reset_state")

        # Send the image and the prompt to the Gradio app for processing
        client.predict(
            input_images=[imageFile],
            input_text=this_user_prompt,
            api_name="/transfer_input",
        )

        # NOTE(review): top_k is not forwarded -- presumably the /predict
        # endpoint does not accept it; confirm against the server.
        result = client.predict(
            chatbot=[],
            temperature=temperature,
            top_p=top_p,
            max_length_tokens=max_tokens,
            repetition_penalty=1.1,
            max_context_length_tokens=4096,
            api_name="/predict",
        )

        # Extracted inside the try so an unexpected response shape is handled
        # like any other failure instead of discarding the collected results.
        response = result[0][0][1]

    except Exception as e:
        print(f"Failed to complete job at index {i}: {e}")
        # Prefix only the file name so an output path that includes a
        # directory still points inside that directory.
        output_dir, output_name = os.path.split(output_file)
        output_file = os.path.join(output_dir, f"partial_until_{i}_{output_name}")
        break

    # Estimate the time left from the duration of this item; the item that
    # just finished no longer counts as remaining.
    seconds = time.time() - start
    remaining = (len(files) - i - 1) * seconds
    hz = 1 / (seconds + 0.0001)  # small offset avoids division by zero

    # Print on screen
    print(f"Processing {1 + i}/{len(files)} | {hz:.2f} Hz / remaining {remaining / 60:.2f} minutes")
    print(f"Image: {image_path}\nResponse: {response}")

    # Store each path as the key pointing to each description
    results[image_path] = response

# Write the collected responses to the output file as indented JSON
print(f"\n\n\nStoring results in JSON file {output_file}")
with open(output_file, mode="w") as json_handle:
    json.dump(results, fp=json_handle, indent=4)