From 53c540ec9aced291fb56fc6122156352a4f0e17d Mon Sep 17 00:00:00 2001
From: Luis <luis@replicate.com>
Date: Mon, 11 Mar 2024 18:48:35 +0000
Subject: [PATCH] First commit

---
 .dockerignore | 17 ++++++++++++
 .gitignore    |  2 ++
 cog.yaml      | 19 +++++++++++++
 predict.py    | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 115 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 cog.yaml
 create mode 100644 predict.py

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..4522d57
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,17 @@
+# The .dockerignore file excludes files from the container build process.
+#
+# https://docs.docker.com/engine/reference/builder/#dockerignore-file
+
+# Exclude Git files
+.git
+.github
+.gitignore
+
+# Exclude Python cache files
+__pycache__
+.mypy_cache
+.pytest_cache
+.ruff_cache
+
+# Exclude Python virtual environment
+/venv
diff --git a/.gitignore b/.gitignore
index dda2140..4dbcbaf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -413,3 +413,5 @@ Sessionx.vim
 tags
 # Persistent undo
 [._]*.un~
+
+.cog
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 0000000..97a33c3
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,19 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+  python_version: "3.9"
+  python_packages:
+    - "accelerate==0.27.2"
+    - "attrdict==2.0.1"
+    - "einops==0.7.0"
+    - "sentencepiece==0.2.0"
+    - "torch==2.0.1"
+    - "torchvision==0.15.2"
+    - "transformers>=4.38.2"
+    - "timm>=0.9.16"
+    - "hf_transfer==0.1.6"
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..ab3115e
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,77 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+import os
+# Enable hf_transfer downloads; set before transformers is imported, since huggingface_hub reads it at import time
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+from cog import BasePredictor, Input, Path
+import torch
+from threading import Thread
+from transformers import AutoModelForCausalLM
+from deepseek_vl.utils.io import load_pil_images
+from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
+
+MODEL_NAME = "deepseek-ai/deepseek-vl-7b-base"
+CACHE_DIR = "checkpoints"
+
+
+class Predictor(BasePredictor):
+    """Cog predictor that generates a text answer to a prompt about one image."""
+
+    def setup(self) -> None:
+        """Load the chat processor, its tokenizer and the model (moved to CUDA) once."""
+        processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+            MODEL_NAME, cache_dir=CACHE_DIR
+        )
+        model: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME, torch_dtype=torch.bfloat16, cache_dir=CACHE_DIR
+        )
+        self.vl_chat_processor = processor
+        self.tokenizer = processor.tokenizer
+        self.vl_gpt = model.to("cuda")
+
+    @torch.inference_mode()
+    def predict(
+        self,
+        image: Path = Input(description="Input image"),
+        prompt: str = Input(description="Input prompt", default="Describe the image"),
+        max_new_tokens: int = Input(description="Maximum number of tokens to generate", default=512)
+    ) -> str:
+        """Answer ``prompt`` about ``image`` and return the decoded text.
+
+        Generation is greedy (``do_sample=False``) and capped at
+        ``max_new_tokens`` new tokens; special tokens are skipped when decoding.
+        """
+        tokenizer = self.tokenizer
+
+        # A user turn carrying the image and prompt, then an empty assistant turn.
+        user_turn = {
+            "role": "User",
+            "content": "<image_placeholder>" + prompt,
+            "images": [str(image)],
+        }
+        conversation = [user_turn, {"role": "Assistant", "content": ""}]
+
+        # Load the referenced image(s) and build the batched inputs on the GPU.
+        loaded_images = load_pil_images(conversation)
+        batch = self.vl_chat_processor(
+            conversations=conversation, images=loaded_images, force_batchify=True
+        ).to("cuda")
+
+        # Turn the batched inputs into embeddings for the language model.
+        embeddings = self.vl_gpt.prepare_inputs_embeds(**batch)
+
+        # Generate the response token ids.
+        generated = self.vl_gpt.language_model.generate(
+            inputs_embeds=embeddings,
+            attention_mask=batch.attention_mask,
+            pad_token_id=tokenizer.eos_token_id,  # EOS id is reused as the pad id
+            bos_token_id=tokenizer.bos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            max_new_tokens=max_new_tokens,
+            do_sample=False,
+            use_cache=True,
+        )
+
+        # Decode the first sequence of the generated batch.
+        return tokenizer.decode(generated[0].cpu().tolist(), skip_special_tokens=True)