From c3b9858a408d63edc9d68cb27848db889541b635 Mon Sep 17 00:00:00 2001
From: stack-heap-overflow
Date: Tue, 7 May 2024 17:51:42 +0800
Subject: [PATCH] Update README.md

---
 README.md | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9ec28d8..15f23cf 100644
--- a/README.md
+++ b/README.md
@@ -189,7 +189,7 @@ We also provide OpenAI-Compatible API at DeepSeek Platform: [platform.deepseek.c
 ### Inference with Huggingface's Transformers
 You can directly employ [Huggingface's Transformers](https://github.com/huggingface/transformers) for model inference.
 
-### Text Completion
+#### Text Completion
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
@@ -210,7 +210,7 @@ result = tokenizer.decode(outputs[0], skip_special_tokens=True)
 print(result)
 ```
 
-### Chat Completion
+#### Chat Completion
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
@@ -257,6 +257,33 @@ Assistant: {assistant_message_1}<|end▁of▁sentence|>User: {user_message_2}
 
 Assistant:
 ```
 
+### Inference with vLLM (recommended)
+To utilize [vLLM](https://github.com/vllm-project/vllm) for model inference, please merge this Pull Request into your vLLM codebase: https://github.com/vllm-project/vllm/pull/4650.
+
+```python
+from transformers import AutoTokenizer
+from vllm import LLM, SamplingParams
+
+max_model_len, tp_size = 8192, 8
+model_name = "deepseek-ai/DeepSeek-V2-Chat"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+llm = LLM(model=model_name, tensor_parallel_size=tp_size, max_model_len=max_model_len, trust_remote_code=True, enforce_eager=True)
+sampling_params = SamplingParams(temperature=0.3, max_tokens=256, stop_token_ids=[tokenizer.eos_token_id])
+
+messages_list = [
+    [{"role": "user", "content": "Who are you?"}],
+    [{"role": "user", "content": "Translate the following content into Chinese directly: DeepSeek-V2 adopts innovative architectures to guarantee economical training and efficient inference."}],
+    [{"role": "user", "content": "Write a piece of quicksort code in C++."}],
+]
+
+prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True) for messages in messages_list]
+
+outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)
+
+generated_text = [output.outputs[0].text for output in outputs]
+print(generated_text)
+```
+
 ## 8. License
 This code repository is licensed under [the MIT License](LICENSE-CODE). The use of DeepSeek-V2 Base/Chat models is subject to [the Model License](LICENSE-MODEL). DeepSeek-V2 series (including Base and Chat) supports commercial use.
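
A note on the snippet this patch adds: `apply_chat_template(..., add_generation_prompt=True)` returns token IDs, which `llm.generate` consumes directly through `prompt_token_ids`. To inspect the rendered prompt text instead, here is a minimal sanity-check sketch; it assumes only the same Hugging Face tokenizer used above, and `tokenize=False` is a standard Transformers option rather than anything introduced by this patch:

```python
# Sanity-check sketch: render the chat template as text instead of token IDs.
# Uses the same tokenizer as the vLLM snippet in this patch; tokenize=False is
# a standard transformers option and is assumed here, not taken from the patch.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V2-Chat")
messages = [{"role": "user", "content": "Who are you?"}]

# With tokenize=False the template comes back as a string, so the special
# tokens (e.g. the sentence delimiters around "User:" / "Assistant:") are visible.
prompt_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt_text)
```

Comparing this output against the template shown in the README's chat-completion section is a quick way to confirm that the tokenizer and template versions match.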
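
The first hunk's context line also mentions the OpenAI-compatible API at DeepSeek Platform. For completeness, a minimal sketch of calling it with the standard `openai` Python client; the base URL `https://api.deepseek.com` and the model name `deepseek-chat` are assumptions for illustration, not part of this patch:

```python
# Hedged sketch of the OpenAI-compatible endpoint referenced in the first hunk.
# Assumes the standard `openai` (v1+) client; the base_url and model name are
# assumptions for illustration, not taken from this patch.
from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",               # key issued at platform.deepseek.com
    base_url="https://api.deepseek.com",  # assumed OpenAI-compatible base URL
)

response = client.chat.completions.create(
    model="deepseek-chat",                # assumed model identifier
    messages=[{"role": "user", "content": "Who are you?"}],
    temperature=0.3,
    max_tokens=256,
)
print(response.choices[0].message.content)
```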