First commit

Luis 2024-03-11 18:48:35 +00:00
parent f0e10dbb86
commit 53c540ec9a
4 changed files with 115 additions and 0 deletions

.dockerignore (new file, 17 additions)

@@ -0,0 +1,17 @@
# The .dockerignore file excludes files from the container build process.
#
# https://docs.docker.com/engine/reference/builder/#dockerignore-file
# Exclude Git files
.git
.github
.gitignore
# Exclude Python cache files
__pycache__
.mypy_cache
.pytest_cache
.ruff_cache
# Exclude Python virtual environment
/venv

.gitignore (vendored, 2 additions)

@@ -413,3 +413,5 @@ Sessionx.vim
tags
# Persistent undo
[._]*.un~
.cog

cog.yaml (new file, 19 additions)

@@ -0,0 +1,19 @@
# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
build:
  gpu: true
  python_version: "3.9"
  python_packages:
    - "accelerate==0.27.2"
    - "attrdict==2.0.1"
    - "einops==0.7.0"
    - "sentencepiece==0.2.0"
    - "torch==2.0.1"
    - "torchvision==0.15.2"
    - "transformers>=4.38.2"
    - "timm>=0.9.16"
    - "hf_transfer==0.1.6"
# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
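
With this configuration in place, a local test run would typically look something like cog predict -i image=@example.jpg -i prompt="Describe the image", assuming the Cog CLI and an NVIDIA GPU are available; example.jpg stands in for a real test image and is not part of this commit.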

predict.py (new file, 77 additions)

@@ -0,0 +1,77 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md
import os
from threading import Thread

import torch
from cog import BasePredictor, Input, Path
from transformers import AutoModelForCausalLM

from deepseek_vl.models import MultiModalityCausalLM, VLChatProcessor
from deepseek_vl.utils.io import load_pil_images

# Enable faster download speed
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

MODEL_NAME = "deepseek-ai/deepseek-vl-7b-base"
CACHE_DIR = "checkpoints"


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        self.vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
            MODEL_NAME,
            cache_dir=CACHE_DIR,
        )
        self.tokenizer = self.vl_chat_processor.tokenizer

        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.bfloat16,
            cache_dir=CACHE_DIR,
        )
        self.vl_gpt = vl_gpt.to("cuda")

    @torch.inference_mode()
    def predict(
        self,
        image: Path = Input(description="Input image"),
        prompt: str = Input(description="Input prompt", default="Describe the image"),
        max_new_tokens: int = Input(
            description="Maximum number of tokens to generate", default=512
        ),
    ) -> str:
        """Run a single prediction on the model"""
        conversation = [
            {
                "role": "User",
                "content": "<image_placeholder>" + prompt,
                "images": [str(image)],
            },
            {
                "role": "Assistant",
                "content": "",
            },
        ]

        # Load the image referenced in the conversation and batch everything into model inputs
        pil_images = load_pil_images(conversation)
        prepare_inputs = self.vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True,
        ).to("cuda")

        # Run the image encoder to get the multimodal input embeddings
        inputs_embeds = self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # Run the language model to generate the response
        outputs = self.vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=self.tokenizer.eos_token_id,
            bos_token_id=self.tokenizer.bos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            use_cache=True,
        )

        answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        return answer
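
As a quick sanity check outside of the Cog runtime, the class can also be driven directly from Python. A minimal sketch, assuming a CUDA GPU, the deepseek_vl package installed, and a hypothetical test image at ./test.jpg:

# smoke_test.py: hypothetical local check, not part of this commit
from cog import Path

from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads weights into ./checkpoints on first run

answer = predictor.predict(
    image=Path("./test.jpg"),  # hypothetical local image
    prompt="Describe the image",
    max_new_tokens=512,
)
print(answer)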