diff --git a/deepseek_vl/serve/app_deepseek.py b/deepseek_vl/serve/app_deepseek.py new file mode 100755 index 0000000..3d08210 --- /dev/null +++ b/deepseek_vl/serve/app_deepseek.py @@ -0,0 +1,466 @@ +# -*- coding:utf-8 -*- + +import gradio as gr +import torch +import base64 +from io import BytesIO + +from app_modules.gradio_utils import (cancel_outputing, delete_last_conversation, reset_state, + reset_textbox, transfer_input, wrap_gen_fn) +from app_modules.overwrites import reload_javascript +from app_modules.presets import CONCURRENT_COUNT, description, description_top, title +from app_modules.utils import (configure_logger, is_variable_assigned, + strip_stop_words) +from deepseek_vl.serve.inference import convert_conversation_to_prompts, deepseek_generate, load_model +from deepseek_vl.utils.conversation import SeparatorStyle + + +def load_models(): + models = { + "DeepSeek-VL 7B": "/hf3fs-jd/prod/deepseek/shared/liuwen/ckpts/deepseek-vl-7b-chat", + } + + for model_name in models: + models[model_name] = load_model(models[model_name]) + + return models + + +logger = configure_logger() +models = load_models() +MODELS = sorted(list(models.keys())) + + +def generate_prompt_with_history(text, image, history, vl_chat_processor, tokenizer, max_length=2048): + """ + Generate a prompt with history for the deepseek application. + + Args: + text (str): The text prompt. + image (str): The image prompt. + history (list): List of previous conversation messages. + tokenizer: The tokenizer used for encoding the prompt. + max_length (int): The maximum length of the prompt. + + Returns: + tuple: A tuple containing the generated prompt, image list, conversation, and conversation copy. If the prompt could not be generated within the max_length limit, returns None. + """ + + sft_format = "deepseek" + user_role_ind = 0 + bot_role_ind = 1 + + # Initialize conversation + conversation = vl_chat_processor.new_chat_template() + + if history: + conversation.messages = history + + if image is not None: + if '' not in text: + text = '' + '\n' + text # append the in a new line after the text prompt + text = (text, image) + + conversation.append_message(conversation.roles[user_role_ind], text) + conversation.append_message(conversation.roles[bot_role_ind], "") + + # Create a copy of the conversation to avoid history truncation in the UI + conversation_copy = conversation.copy() + logger.info("=" * 80) + logger.info(get_prompt(conversation)) + + rounds = len(conversation.messages) // 2 + + for _ in range(rounds): + current_prompt = get_prompt(conversation) + current_prompt = current_prompt.replace("", "") if sft_format == "deepseek" else current_prompt + + if torch.tensor(tokenizer.encode(current_prompt)).size(-1) <= max_length: + return conversation_copy + + if len(conversation.messages) % 2 != 0: + gr.Error("The messages between user and assistant are not paired.") + return + + try: + for _ in range(2): # pop out two messages in a row + conversation.messages.pop(0) + except IndexError: + gr.Error("Input text processing failed, unable to respond in this round.") + return None + + gr.Error("Prompt could not be generated within max_length limit.") + return None + + +def to_gradio_chatbot(conv): + """Convert the conversation to gradio chatbot format.""" + ret = [] + for i, (role, msg) in enumerate(conv.messages[conv.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image = msg + if isinstance(image, str): + with open(image, 'rb') as f: + data = f.read() + img_b64_str = base64.b64encode(data).decode() + image_str 
= f'' + msg = msg.replace('\n'.join([''] * 4), image_str) + else: + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + max_len, min_len = 800, 400 + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + buffered = BytesIO() + image.save(buffered, format="JPEG") + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + img_str = f'user upload image' + msg = msg.replace('', img_str) + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + +def to_gradio_history(conv): + """Convert the conversation to gradio history state.""" + return conv.messages[conv.offset :] + + +def get_prompt(conv) -> str: + """Get the prompt for generation.""" + system_prompt = conv.system_template.format(system_message=conv.system_message) + if conv.sep_style == SeparatorStyle.DeepSeek: + seps = [conv.sep, conv.sep2] + if system_prompt == "" or system_prompt is None: + ret = "" + else: + ret = system_prompt + seps[0] + for i, (role, message) in enumerate(conv.messages): + if message: + if type(message) is tuple: # multimodal message + message, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + else: + return conv.get_prompt + + +@wrap_gen_fn +def predict( + text, + image, + chatbot, + history, + top_p, + temperature, + repetition_penalty, + max_length_tokens, + max_context_length_tokens, + model_select_dropdown, +): + """ + Function to predict the response based on the user's input and selected model. + + Parameters: + user_text (str): The input text from the user. + user_image (str): The input image from the user. + chatbot (str): The chatbot's name. + history (str): The history of the chat. + top_p (float): The top-p parameter for the model. + temperature (float): The temperature parameter for the model. + max_length_tokens (int): The maximum length of tokens for the model. + max_context_length_tokens (int): The maximum length of context tokens for the model. + model_select_dropdown (str): The selected model from the dropdown. + + Returns: + generator: A generator that yields the chatbot outputs, history, and status. + """ + print("running the prediction function") + try: + tokenizer, vl_gpt, vl_chat_processor = models[model_select_dropdown] + + if text == "": + yield chatbot, history, "Empty context." + return + except KeyError: + yield [[text, "No Model Found"]], [], "No Model Found" + return + + conversation = generate_prompt_with_history( + text, image, history, vl_chat_processor, tokenizer, max_length=max_context_length_tokens + ) + prompts = convert_conversation_to_prompts(conversation) + + stop_words = conversation.stop_str + gradio_chatbot_output = to_gradio_chatbot(conversation) + + full_response = "" + with torch.no_grad(): + for x in deepseek_generate( + prompts=prompts, + vl_gpt=vl_gpt, + vl_chat_processor=vl_chat_processor, + tokenizer=tokenizer, + stop_words=stop_words, + max_length=max_length_tokens, + temperature=temperature, + repetition_penalty=repetition_penalty, + top_p=top_p, + ): + full_response += x + response = strip_stop_words(full_response, stop_words) + conversation.update_last_message(response) + gradio_chatbot_output[-1][1] = response + yield gradio_chatbot_output, to_gradio_history(conversation), "Generating..." 
+ + print("flushed result to gradio") + torch.cuda.empty_cache() + + if is_variable_assigned("x"): + print(f"{model_select_dropdown}:\n{text}\n{'-' * 80}\n{x}\n{'=' * 80}") + print( + f"temperature: {temperature}, top_p: {top_p}, repetition_penalty: {repetition_penalty}, max_length_tokens: {max_length_tokens}" + ) + + yield gradio_chatbot_output, to_gradio_history(conversation), "Generate: Success" + + +def retry( + text, + image, + chatbot, + history, + top_p, + temperature, + repetition_penalty, + max_length_tokens, + max_context_length_tokens, + model_select_dropdown, +): + if len(history) == 0: + yield (chatbot, history, "Empty context") + return + + chatbot.pop() + history.pop() + text = history.pop()[-1] + if type(text) is tuple: + text, image = text + + yield from predict( + text, + image, + chatbot, + history, + top_p, + temperature, + repetition_penalty, + max_length_tokens, + max_context_length_tokens, + model_select_dropdown, + ) + + +def build_demo(MODELS): + with open("deepseek_vl/serve/assets/custom.css", "r", encoding="utf-8") as f: + customCSS = f.read() + + with gr.Blocks(theme=gr.themes.Soft()) as demo: + history = gr.State([]) + input_text = gr.State() + input_image = gr.State() + + with gr.Row(): + gr.HTML(title) + status_display = gr.Markdown("Success", elem_id="status_display") + gr.Markdown(description_top) + + with gr.Row(equal_height=True): + with gr.Column(scale=4): + with gr.Row(): + chatbot = gr.Chatbot( + elem_id="deepseek_chatbot", + show_share_button=True, + likeable=True, + bubble_full_width=False, + height=600, + ) + with gr.Row(): + with gr.Column(scale=4): + text_box = gr.Textbox(show_label=False, placeholder="Enter text", container=False) + with gr.Column( + min_width=70, + ): + submitBtn = gr.Button("Send") + with gr.Column( + min_width=70, + ): + cancelBtn = gr.Button("Stop") + with gr.Row(): + emptyBtn = gr.Button( + "🧹 New Conversation", + ) + retryBtn = gr.Button("🔄 Regenerate") + delLastBtn = gr.Button("🗑️ Remove Last Turn") + + with gr.Column(): + image_box = gr.Image(type="pil") + + with gr.Tab(label="Parameter Setting") as parameter_row: + top_p = gr.Slider( + minimum=-0, + maximum=1.0, + value=0.95, + step=0.05, + interactive=True, + label="Top-p", + ) + temperature = gr.Slider( + minimum=0, + maximum=1.0, + value=0.1, + step=0.1, + interactive=True, + label="Temperature", + ) + repetition_penalty = gr.Slider( + minimum=0.0, + maximum=2.0, + value=1.1, + step=0.1, + interactive=True, + label="Repetition penalty", + ) + max_length_tokens = gr.Slider( + minimum=0, + maximum=4096, + value=2048, + step=8, + interactive=True, + label="Max Generation Tokens", + ) + max_context_length_tokens = gr.Slider( + minimum=0, + maximum=4096, + value=4096, + step=128, + interactive=True, + label="Max History Tokens", + ) + model_select_dropdown = gr.Dropdown( + label="Select Models", + choices=MODELS, + multiselect=False, + value=MODELS[0], + interactive=True, + ) + + examples_list = [ + [ + 'deepseek_vl/serve/examples/rap.jpeg', + 'Can you write me a master rap song that rhymes very well based on this image?', + ], + [ + 'deepseek_vl/serve/examples/app.png', + 'What is this app about?', + ], + [ + 'deepseek_vl/serve/examples/pipeline.png', + 'Help me write a python code based on the image.', + ], + [ + 'deepseek_vl/serve/examples/chart.png', + 'Could you help me to re-draw this picture with python codes?', + ], + [ + 'deepseek_vl/serve/examples/mirror.png', + 'How many people are there in the image. 
Why?', + ], + [ + 'deepseek_vl/serve/examples/puzzle.png', + 'Can this 2 pieces combine together?', + ], + ] + gr.Examples(examples=examples_list, inputs=[image_box, text_box]) + gr.Markdown(description) + + input_widgets = [ + input_text, + input_image, + chatbot, + history, + top_p, + temperature, + repetition_penalty, + max_length_tokens, + max_context_length_tokens, + model_select_dropdown, + ] + output_widgets = [chatbot, history, status_display] + + transfer_input_args = dict( + fn=transfer_input, + inputs=[text_box, image_box], + outputs=[input_text, input_image, text_box, image_box, submitBtn], + show_progress=True, + ) + + predict_args = dict( + fn=predict, + inputs=input_widgets, + outputs=output_widgets, + show_progress=True, + ) + + retry_args = dict( + fn=retry, + inputs=input_widgets, + outputs=output_widgets, + show_progress=True, + ) + + reset_args = dict(fn=reset_textbox, inputs=[], outputs=[text_box, status_display]) + + predict_events = [ + text_box.submit(**transfer_input_args).then(**predict_args), + submitBtn.click(**transfer_input_args).then(**predict_args), + ] + + emptyBtn.click(reset_state, outputs=output_widgets, show_progress=True) + emptyBtn.click(**reset_args) + retryBtn.click(**retry_args) + + delLastBtn.click( + delete_last_conversation, + [chatbot, history], + output_widgets, + show_progress=True, + ) + + cancelBtn.click(cancel_outputing, [], [status_display], cancels=predict_events) + + return demo + + +if __name__ == "__main__": + demo = build_demo(MODELS) + demo.title = "DeepSeek-VL Chatbot" + + reload_javascript() + demo.queue(concurrency_count=CONCURRENT_COUNT).launch( + share=False, + favicon_path="deepseek_vl/serve/assets/favicon.ico", + inbrowser=False, + server_name="0.0.0.0", + server_port=8122, + ) diff --git a/deepseek_vl/serve/app_modules/gradio_utils.py b/deepseek_vl/serve/app_modules/gradio_utils.py new file mode 100755 index 0000000..5aa3c70 --- /dev/null +++ b/deepseek_vl/serve/app_modules/gradio_utils.py @@ -0,0 +1,75 @@ +from functools import wraps + +import gradio as gr + + +def wrap_gen_fn(gen_fn): + @wraps(gen_fn) + def wrapped_gen_fn(prompt, *args, **kwargs): + try: + yield from gen_fn(prompt, *args, **kwargs) + except gr.Error as g_err: + raise g_err + except Exception as e: + raise gr.Error(f'Failed to generate text: {e}') from e + + return wrapped_gen_fn + + +def delete_last_conversation(chatbot, history): + if len(history) % 2 != 0: + gr.Error("history length is not even") + return ( + chatbot, + history, + "Delete Done", + ) + + if len(chatbot) > 0: + chatbot.pop() + + if len(history) > 0 and len(history) % 2 == 0: + history.pop() + history.pop() + + return ( + chatbot, + history, + "Delete Done", + ) + + +def reset_state(): + return [], [], None, "Reset Done" + + +def reset_textbox(): + return gr.update(value=""), "" + + +def cancel_outputing(): + return "Stop Done" + + +def transfer_input(input_text, input_image): + print("transferring input text and input image") + return ( + input_text, + input_image, + gr.update(value=""), + gr.update(value=None), + gr.Button(visible=True), + ) + + +class State: + interrupted = False + + def interrupt(self): + self.interrupted = True + + def recover(self): + self.interrupted = False + + +shared_state = State() diff --git a/deepseek_vl/serve/app_modules/overwrites.py b/deepseek_vl/serve/app_modules/overwrites.py new file mode 100755 index 0000000..3e4b8f2 --- /dev/null +++ b/deepseek_vl/serve/app_modules/overwrites.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import logging 
+from typing import List, Tuple + +from app_modules.presets import * +from app_modules.utils import * + + +def compact_text_chunks(self, prompt, text_chunks: List[str]) -> List[str]: + logging.debug("Compacting text chunks...🚀🚀🚀") + combined_str = [c.strip() for c in text_chunks if c.strip()] + combined_str = [f"[{index+1}] {c}" for index, c in enumerate(combined_str)] + combined_str = "\n\n".join(combined_str) + # resplit based on self.max_chunk_overlap + text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1) + return text_splitter.split_text(combined_str) + + +def postprocess(self, y: List[Tuple[str | None, str | None]]) -> List[Tuple[str | None, str | None]]: + """ + Parameters: + y: List of tuples representing the message and response pairs. Each message and response should be a string, which may be in Markdown format. + Returns: + List of tuples representing the message and response. Each message and response will be a string of HTML. + """ + if y is None or y == []: + return [] + temp = [] + for x in y: + user, bot = x + if not detect_converted_mark(user): + user = convert_asis(user) + if not detect_converted_mark(bot): + bot = convert_mdtext(bot) + temp.append((user, bot)) + return temp + + +with open("deepseek_vl/serve/assets/custom.js", "r", encoding="utf-8") as f, open( + "deepseek_vl/serve/assets/Kelpy-Codos.js", "r", encoding="utf-8" +) as f2: + customJS = f.read() + kelpyCodos = f2.read() + + +def reload_javascript(): + print("Reloading javascript...") + js = f'' + + def template_response(*args, **kwargs): + res = GradioTemplateResponseOriginal(*args, **kwargs) + res.body = res.body.replace(b'', f'{js}'.encode("utf8")) + res.init_headers() + return res + + gr.routes.templates.TemplateResponse = template_response + + +GradioTemplateResponseOriginal = gr.routes.templates.TemplateResponse diff --git a/deepseek_vl/serve/app_modules/presets.py b/deepseek_vl/serve/app_modules/presets.py new file mode 100755 index 0000000..f99b6e3 --- /dev/null +++ b/deepseek_vl/serve/app_modules/presets.py @@ -0,0 +1,77 @@ +# -*- coding:utf-8 -*- +import gradio as gr + +title = """

<h1>Chat with DeepSeek-VL</h1>

""" +description_top = """""" +description = """""" +CONCURRENT_COUNT = 10 + + +ALREADY_CONVERTED_MARK = "" + +small_and_beautiful_theme = gr.themes.Soft( + primary_hue=gr.themes.Color( + c50="#EBFAF2", + c100="#CFF3E1", + c200="#A8EAC8", + c300="#77DEA9", + c400="#3FD086", + c500="#02C160", + c600="#06AE56", + c700="#05974E", + c800="#057F45", + c900="#04673D", + c950="#2E5541", + name="small_and_beautiful", + ), + secondary_hue=gr.themes.Color( + c50="#576b95", + c100="#576b95", + c200="#576b95", + c300="#576b95", + c400="#576b95", + c500="#576b95", + c600="#576b95", + c700="#576b95", + c800="#576b95", + c900="#576b95", + c950="#576b95", + ), + neutral_hue=gr.themes.Color( + name="gray", + c50="#f6f7f8", + # c100="#f3f4f6", + c100="#F2F2F2", + c200="#e5e7eb", + c300="#d1d5db", + c400="#B2B2B2", + c500="#808080", + c600="#636363", + c700="#515151", + c800="#393939", + # c900="#272727", + c900="#2B2B2B", + c950="#171717", + ), + radius_size=gr.themes.sizes.radius_sm, +).set( + # button_primary_background_fill="*primary_500", + button_primary_background_fill_dark="*primary_600", + # button_primary_background_fill_hover="*primary_400", + # button_primary_border_color="*primary_500", + button_primary_border_color_dark="*primary_600", + button_primary_text_color="white", + button_primary_text_color_dark="white", + button_secondary_background_fill="*neutral_100", + button_secondary_background_fill_hover="*neutral_50", + button_secondary_background_fill_dark="*neutral_900", + button_secondary_text_color="*neutral_800", + button_secondary_text_color_dark="white", + # background_fill_primary="#F7F7F7", + # background_fill_primary_dark="#1F1F1F", + # block_title_text_color="*primary_500", + block_title_background_fill_dark="*primary_900", + block_label_background_fill_dark="*primary_900", + input_background_fill="#F6F6F6", + # chatbot_code_background_color_dark="*neutral_950", +) diff --git a/deepseek_vl/serve/app_modules/utils.py b/deepseek_vl/serve/app_modules/utils.py new file mode 100755 index 0000000..3bd85cd --- /dev/null +++ b/deepseek_vl/serve/app_modules/utils.py @@ -0,0 +1,200 @@ +# -*- coding:utf-8 -*- +from __future__ import annotations + +import html +import logging +import re +import time + +import mdtex2html +from markdown import markdown +from pygments import highlight +from pygments.formatters import HtmlFormatter +from pygments.lexers import ClassNotFound, get_lexer_by_name, guess_lexer + +from app_modules.presets import ALREADY_CONVERTED_MARK + +logger = logging.getLogger('gradio_logger') + + +def configure_logger(): + logger = logging.getLogger('gradio_logger') + logger.setLevel(logging.DEBUG) + + timestr = time.strftime("%Y%m%d-%H%M%S") + file_handler = logging.FileHandler(f'deepseek_vl/serve/logs/{timestr}_gradio_log.log') + console_handler = logging.StreamHandler() + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + console_handler.setFormatter(formatter) + file_handler.setFormatter(formatter) + + console_handler.setLevel(logging.INFO) + file_handler.setLevel(logging.INFO) + + logger.addHandler(console_handler) + logger.addHandler(file_handler) + + return logger + + +def strip_stop_words(x, stop_words): + for w in stop_words: + if w in x: + return x[: x.index(w)].strip() + return x.strip() + + +def format_output(history, text, x): + updated_history = history + [[text, x]] + a = [[y[0], convert_to_markdown(y[1])] for y in updated_history] + return a, updated_history + + +def markdown_to_html_with_syntax_highlight(md_str): # deprecated + 
def replacer(match): + lang = match.group(1) or "text" + code = match.group(2) + + try: + lexer = get_lexer_by_name(lang, stripall=True) + except ValueError: + lexer = get_lexer_by_name("text", stripall=True) + + formatter = HtmlFormatter() + highlighted_code = highlight(code, lexer, formatter) + + return f'
<pre><code class="{lang}">{highlighted_code}</code></pre>
' + + code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```" + md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE) + + html_str = markdown(md_str) + return html_str + + +def normalize_markdown(md_text: str) -> str: # deprecated + lines = md_text.split("\n") + normalized_lines = [] + inside_list = False + + for i, line in enumerate(lines): + if re.match(r"^(\d+\.|-|\*|\+)\s", line.strip()): + if not inside_list and i > 0 and lines[i - 1].strip() != "": + normalized_lines.append("") + inside_list = True + normalized_lines.append(line) + elif inside_list and line.strip() == "": + if i < len(lines) - 1 and not re.match(r"^(\d+\.|-|\*|\+)\s", lines[i + 1].strip()): + normalized_lines.append(line) + continue + else: + inside_list = False + normalized_lines.append(line) + + return "\n".join(normalized_lines) + + +def convert_mdtext(md_text): + code_block_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL) + inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL) + code_blocks = code_block_pattern.findall(md_text) + non_code_parts = code_block_pattern.split(md_text)[::2] + + result = [] + for non_code, code in zip(non_code_parts, code_blocks + [""]): + if non_code.strip(): + non_code = normalize_markdown(non_code) + if inline_code_pattern.search(non_code): + result.append(markdown(non_code, extensions=["tables"])) + else: + result.append(mdtex2html.convert(non_code, extensions=["tables"])) + if code.strip(): + code = f"\n```{code}\n\n```" + code = markdown_to_html_with_syntax_highlight(code) + result.append(code) + result = "".join(result) + result += ALREADY_CONVERTED_MARK + return result + + +def convert_asis(userinput): + return f'

<p style="white-space:pre-wrap;">{html.escape(userinput)}</p>
{ALREADY_CONVERTED_MARK}' + + +def is_stop_word_or_prefix(s: str, stop_words: list) -> bool: + return any(s.endswith(stop_word) for stop_word in stop_words) + + +def detect_converted_mark(userinput): + return bool(userinput.endswith(ALREADY_CONVERTED_MARK)) + + +def detect_language(code): + first_line = "" if code.startswith("\n") else code.strip().split("\n", 1)[0] + language = first_line.lower() if first_line else "" + code_without_language = code[len(first_line) :].lstrip() if first_line else code + return language, code_without_language + + +def convert_to_markdown(text): + text = text.replace("$", "$") + text = text.replace("\r\n", "\n") + + def replace_leading_tabs_and_spaces(line): + new_line = [] + + for char in line: + if char == "\t": + new_line.append(" ") + elif char == " ": + new_line.append(" ") + else: + break + return "".join(new_line) + line[len(new_line) :] + + markdown_text = "" + lines = text.split("\n") + in_code_block = False + + for line in lines: + if in_code_block is False and line.startswith("```"): + in_code_block = True + markdown_text += f"{line}\n" + elif in_code_block is True and line.startswith("```"): + in_code_block = False + markdown_text += f"{line}\n" + elif in_code_block: + markdown_text += f"{line}\n" + else: + line = replace_leading_tabs_and_spaces(line) + line = re.sub(r"^(#)", r"\\\1", line) + markdown_text += f"{line} \n" + + return markdown_text + + +def add_language_tag(text): + def detect_language(code_block): + try: + lexer = guess_lexer(code_block) + return lexer.name.lower() + except ClassNotFound: + return "" + + code_block_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE) + + def replacement(match): + code_block = match.group(2) + if match.group(2).startswith("\n"): + language = detect_language(code_block) + return f"```{language}{code_block}```" if language else f"```\n{code_block}```" + else: + return match.group(1) + code_block + "```" + + text2 = code_block_pattern.sub(replacement, text) + return text2 + + +def is_variable_assigned(var_name: str) -> bool: + return var_name in locals() diff --git a/deepseek_vl/serve/assets/Kelpy-Codos.js b/deepseek_vl/serve/assets/Kelpy-Codos.js new file mode 100755 index 0000000..923c9f9 --- /dev/null +++ b/deepseek_vl/serve/assets/Kelpy-Codos.js @@ -0,0 +1,79 @@ +// ==UserScript== +// @name Kelpy Codos +// @namespace https://github.com/Keldos-Li/Kelpy-Codos +// @version 1.0.5 +// @author Keldos; https://keldos.me/ +// @description Add copy button to PRE tags before CODE tag, for Chuanhu ChatGPT especially. 
+// Based on Chuanhu ChatGPT version: ac04408 (2023-3-22) +// @license GPL-3.0 +// @grant none +// ==/UserScript== + +(function () { + "use strict"; + + function addCopyButton(pre) { + var code = pre.querySelector("code"); + if (!code) { + return; // 如果没有找到 元素,则不添加按钮 + } + var firstChild = code.firstChild; + if (!firstChild) { + return; // 如果 元素没有子节点,则不添加按钮 + } + var button = document.createElement("button"); + button.textContent = "\uD83D\uDCCE"; // 使用 📎 符号作为“复制”按钮的文本 + button.style.position = "relative"; + button.style.float = "right"; + button.style.fontSize = "1em"; // 可选:调整按钮大小 + button.style.background = "none"; // 可选:去掉背景颜色 + button.style.border = "none"; // 可选:去掉边框 + button.style.cursor = "pointer"; // 可选:显示指针样式 + button.addEventListener("click", function () { + var range = document.createRange(); + range.selectNodeContents(code); + range.setStartBefore(firstChild); // 将范围设置为第一个子节点之前 + var selection = window.getSelection(); + selection.removeAllRanges(); + selection.addRange(range); + + try { + var success = document.execCommand("copy"); + if (success) { + button.textContent = "\u2714"; + setTimeout(function () { + button.textContent = "\uD83D\uDCCE"; // 恢复按钮为“复制” + }, 2000); + } else { + button.textContent = "\u2716"; + } + } catch (e) { + console.error(e); + button.textContent = "\u2716"; + } + + selection.removeAllRanges(); + }); + code.insertBefore(button, firstChild); // 将按钮插入到第一个子元素之前 + } + + function handleNewElements(mutationsList, observer) { + for (var mutation of mutationsList) { + if (mutation.type === "childList") { + for (var node of mutation.addedNodes) { + if (node.nodeName === "PRE") { + addCopyButton(node); + } + } + } + } + } + + var observer = new MutationObserver(handleNewElements); + observer.observe(document.documentElement, { + childList: true, + subtree: true, + }); + + document.querySelectorAll("pre").forEach(addCopyButton); +})(); diff --git a/deepseek_vl/serve/assets/avatar.png b/deepseek_vl/serve/assets/avatar.png new file mode 100755 index 0000000..d4b742b Binary files /dev/null and b/deepseek_vl/serve/assets/avatar.png differ diff --git a/deepseek_vl/serve/assets/custom.css b/deepseek_vl/serve/assets/custom.css new file mode 100755 index 0000000..fcf0b47 --- /dev/null +++ b/deepseek_vl/serve/assets/custom.css @@ -0,0 +1,334 @@ +:root { + --chatbot-color-light: #f3f3f3; + --chatbot-color-dark: #121111; +} + +/* status_display */ +#status_display { + display: flex; + min-height: 2.5em; + align-items: flex-end; + justify-content: flex-end; +} +#status_display p { + font-size: 0.85em; + font-family: monospace; + color: var(--body-text-color-subdued); +} + +/* usage_display */ +#usage_display { + height: 1em; +} +#usage_display p { + padding: 0 1em; + font-size: 0.85em; + font-family: monospace; + color: var(--body-text-color-subdued); +} +/* list */ +ol:not(.options), +ul:not(.options) { + padding-inline-start: 2em !important; +} + +/* Thank @Keldos-Li for fixing it */ +/* Light mode (default) */ +#deepseek_chatbot { + background-color: var(--chatbot-color-light) !important; + color: #000000 !important; +} +[data-testid="bot"] { + background-color: #ffffff !important; +} +[data-testid="user"] { + background-color: #95ec69 !important; +} + +/* Dark mode */ +.dark #deepseek_chatbot { + background-color: var(--chatbot-color-dark) !important; + color: #ffffff !important; +} +.dark [data-testid="bot"] { + background-color: #2c2c2c !important; +} +.dark [data-testid="user"] { + background-color: #26b561 !important; +} + +#deepseek_chatbot { + height: 100%; + 
min-height: 800px; + flex-grow: 1; + overflow: auto; +} + +[class*="message"] { + border-radius: var(--radius-xl) !important; + border: none; + padding: var(--spacing-xl) !important; + font-size: var(--text-md) !important; + line-height: var(--line-md) !important; + min-height: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl)); + min-width: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl)); +} +[data-testid="bot"] { + max-width: 85%; + border-bottom-left-radius: 0 !important; +} +[data-testid="user"] { + max-width: 85%; + width: auto !important; + border-bottom-right-radius: 0 !important; +} +/* Table */ +table { + margin: 1em 0; + border-collapse: collapse; + empty-cells: show; +} +td, +th { + border: 1.2px solid var(--border-color-primary) !important; + padding: 0.2em; +} +thead { + background-color: rgba(175, 184, 193, 0.2); +} +thead th { + padding: 0.5em 0.2em; +} +/* Inline code */ +#deepseek_chatbot code { + display: inline; + white-space: break-spaces; + border-radius: 6px; + margin: 0 2px 0 2px; + padding: 0.2em 0.4em 0.1em 0.4em; + background-color: rgba(175, 184, 193, 0.2); +} +/* Code block */ +#deepseek_chatbot pre code { + display: block; + overflow: auto; + white-space: pre; + background-color: #1c1d1e !important; + border-radius: 10px; + padding: 1.4em 1.2em 0em 1.4em; + margin: 1.2em 2em 1.2em 0.5em; + color: #fdf8f8; + box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2); +} +/* Hightlight */ +#deepseek_chatbot .highlight { + background-color: transparent; +} +#deepseek_chatbot .highlight .hll { + background-color: #49483e; +} +#deepseek_chatbot .highlight .c { + color: #75715e; +} /* Comment */ +#deepseek_chatbot .highlight .err { + color: #960050; + background-color: #1e0010; +} /* Error */ +#deepseek_chatbot .highlight .k { + color: #66d9ef; +} /* Keyword */ +#deepseek_chatbot .highlight .l { + color: #ae81ff; +} /* Literal */ +#deepseek_chatbot .highlight .n { + color: #f8f8f2; +} /* Name */ +#deepseek_chatbot .highlight .o { + color: #f92672; +} /* Operator */ +#deepseek_chatbot .highlight .p { + color: #f8f8f2; +} /* Punctuation */ +#deepseek_chatbot .highlight .ch { + color: #75715e; +} /* Comment.Hashbang */ +#deepseek_chatbot .highlight .cm { + color: #75715e; +} /* Comment.Multiline */ +#deepseek_chatbot .highlight .cp { + color: #75715e; +} /* Comment.Preproc */ +#deepseek_chatbot .highlight .cpf { + color: #75715e; +} /* Comment.PreprocFile */ +#deepseek_chatbot .highlight .c1 { + color: #75715e; +} /* Comment.Single */ +#deepseek_chatbot .highlight .cs { + color: #75715e; +} /* Comment.Special */ +#deepseek_chatbot .highlight .gd { + color: #f92672; +} /* Generic.Deleted */ +#deepseek_chatbot .highlight .ge { + font-style: italic; +} /* Generic.Emph */ +#deepseek_chatbot .highlight .gi { + color: #a6e22e; +} /* Generic.Inserted */ +#deepseek_chatbot .highlight .gs { + font-weight: bold; +} /* Generic.Strong */ +#deepseek_chatbot .highlight .gu { + color: #75715e; +} /* Generic.Subheading */ +#deepseek_chatbot .highlight .kc { + color: #66d9ef; +} /* Keyword.Constant */ +#deepseek_chatbot .highlight .kd { + color: #66d9ef; +} /* Keyword.Declaration */ +#deepseek_chatbot .highlight .kn { + color: #f92672; +} /* Keyword.Namespace */ +#deepseek_chatbot .highlight .kp { + color: #66d9ef; +} /* Keyword.Pseudo */ +#deepseek_chatbot .highlight .kr { + color: #66d9ef; +} /* Keyword.Reserved */ +#deepseek_chatbot .highlight .kt { + color: #66d9ef; +} /* Keyword.Type */ +#deepseek_chatbot .highlight .ld { + color: #e6db74; +} /* Literal.Date */ 
+#deepseek_chatbot .highlight .m { + color: #ae81ff; +} /* Literal.Number */ +#deepseek_chatbot .highlight .s { + color: #e6db74; +} /* Literal.String */ +#deepseek_chatbot .highlight .na { + color: #a6e22e; +} /* Name.Attribute */ +#deepseek_chatbot .highlight .nb { + color: #f8f8f2; +} /* Name.Builtin */ +#deepseek_chatbot .highlight .nc { + color: #a6e22e; +} /* Name.Class */ +#deepseek_chatbot .highlight .no { + color: #66d9ef; +} /* Name.Constant */ +#deepseek_chatbot .highlight .nd { + color: #a6e22e; +} /* Name.Decorator */ +#deepseek_chatbot .highlight .ni { + color: #f8f8f2; +} /* Name.Entity */ +#deepseek_chatbot .highlight .ne { + color: #a6e22e; +} /* Name.Exception */ +#deepseek_chatbot .highlight .nf { + color: #a6e22e; +} /* Name.Function */ +#deepseek_chatbot .highlight .nl { + color: #f8f8f2; +} /* Name.Label */ +#deepseek_chatbot .highlight .nn { + color: #f8f8f2; +} /* Name.Namespace */ +#deepseek_chatbot .highlight .nx { + color: #a6e22e; +} /* Name.Other */ +#deepseek_chatbot .highlight .py { + color: #f8f8f2; +} /* Name.Property */ +#deepseek_chatbot .highlight .nt { + color: #f92672; +} /* Name.Tag */ +#deepseek_chatbot .highlight .nv { + color: #f8f8f2; +} /* Name.Variable */ +#deepseek_chatbot .highlight .ow { + color: #f92672; +} /* Operator.Word */ +#deepseek_chatbot .highlight .w { + color: #f8f8f2; +} /* Text.Whitespace */ +#deepseek_chatbot .highlight .mb { + color: #ae81ff; +} /* Literal.Number.Bin */ +#deepseek_chatbot .highlight .mf { + color: #ae81ff; +} /* Literal.Number.Float */ +#deepseek_chatbot .highlight .mh { + color: #ae81ff; +} /* Literal.Number.Hex */ +#deepseek_chatbot .highlight .mi { + color: #ae81ff; +} /* Literal.Number.Integer */ +#deepseek_chatbot .highlight .mo { + color: #ae81ff; +} /* Literal.Number.Oct */ +#deepseek_chatbot .highlight .sa { + color: #e6db74; +} /* Literal.String.Affix */ +#deepseek_chatbot .highlight .sb { + color: #e6db74; +} /* Literal.String.Backtick */ +#deepseek_chatbot .highlight .sc { + color: #e6db74; +} /* Literal.String.Char */ +#deepseek_chatbot .highlight .dl { + color: #e6db74; +} /* Literal.String.Delimiter */ +#deepseek_chatbot .highlight .sd { + color: #e6db74; +} /* Literal.String.Doc */ +#deepseek_chatbot .highlight .s2 { + color: #e6db74; +} /* Literal.String.Double */ +#deepseek_chatbot .highlight .se { + color: #ae81ff; +} /* Literal.String.Escape */ +#deepseek_chatbot .highlight .sh { + color: #e6db74; +} /* Literal.String.Heredoc */ +#deepseek_chatbot .highlight .si { + color: #e6db74; +} /* Literal.String.Interpol */ +#deepseek_chatbot .highlight .sx { + color: #e6db74; +} /* Literal.String.Other */ +#deepseek_chatbot .highlight .sr { + color: #e6db74; +} /* Literal.String.Regex */ +#deepseek_chatbot .highlight .s1 { + color: #e6db74; +} /* Literal.String.Single */ +#deepseek_chatbot .highlight .ss { + color: #e6db74; +} /* Literal.String.Symbol */ +#deepseek_chatbot .highlight .bp { + color: #f8f8f2; +} /* Name.Builtin.Pseudo */ +#deepseek_chatbot .highlight .fm { + color: #a6e22e; +} /* Name.Function.Magic */ +#deepseek_chatbot .highlight .vc { + color: #f8f8f2; +} /* Name.Variable.Class */ +#deepseek_chatbot .highlight .vg { + color: #f8f8f2; +} /* Name.Variable.Global */ +#deepseek_chatbot .highlight .vi { + color: #f8f8f2; +} /* Name.Variable.Instance */ +#deepseek_chatbot .highlight .vm { + color: #f8f8f2; +} /* Name.Variable.Magic */ +#deepseek_chatbot .highlight .il { + color: #ae81ff; +} /* Literal.Number.Integer.Long */ diff --git a/deepseek_vl/serve/assets/custom.js 
b/deepseek_vl/serve/assets/custom.js new file mode 100755 index 0000000..2196914 --- /dev/null +++ b/deepseek_vl/serve/assets/custom.js @@ -0,0 +1 @@ +// custom javascript here diff --git a/deepseek_vl/serve/assets/favicon.ico b/deepseek_vl/serve/assets/favicon.ico new file mode 100755 index 0000000..7ba49cd Binary files /dev/null and b/deepseek_vl/serve/assets/favicon.ico differ diff --git a/deepseek_vl/serve/examples/app.png b/deepseek_vl/serve/examples/app.png new file mode 100644 index 0000000..5dcd4b0 Binary files /dev/null and b/deepseek_vl/serve/examples/app.png differ diff --git a/deepseek_vl/serve/examples/chart.png b/deepseek_vl/serve/examples/chart.png new file mode 100644 index 0000000..64ad76a Binary files /dev/null and b/deepseek_vl/serve/examples/chart.png differ diff --git a/deepseek_vl/serve/examples/mirror.png b/deepseek_vl/serve/examples/mirror.png new file mode 100644 index 0000000..88f2a12 Binary files /dev/null and b/deepseek_vl/serve/examples/mirror.png differ diff --git a/deepseek_vl/serve/examples/pipeline.png b/deepseek_vl/serve/examples/pipeline.png new file mode 100644 index 0000000..7acdc57 Binary files /dev/null and b/deepseek_vl/serve/examples/pipeline.png differ diff --git a/deepseek_vl/serve/examples/puzzle.png b/deepseek_vl/serve/examples/puzzle.png new file mode 100644 index 0000000..f67b8ac Binary files /dev/null and b/deepseek_vl/serve/examples/puzzle.png differ diff --git a/deepseek_vl/serve/examples/rap.jpeg b/deepseek_vl/serve/examples/rap.jpeg new file mode 100755 index 0000000..43f2325 Binary files /dev/null and b/deepseek_vl/serve/examples/rap.jpeg differ diff --git a/deepseek_vl/serve/inference.py b/deepseek_vl/serve/inference.py new file mode 100755 index 0000000..49766ec --- /dev/null +++ b/deepseek_vl/serve/inference.py @@ -0,0 +1,139 @@ +from threading import Thread +from typing import List + +import torch +import transformers +from transformers import (AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList, + TextIteratorStreamer) + +from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM +from deepseek_vl.utils.conversation import Conversation + + +def load_model(model_path): + vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path) + tokenizer = vl_chat_processor.tokenizer + vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) + vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval() + return tokenizer, vl_gpt, vl_chat_processor + + +def convert_conversation_to_prompts(conversation: Conversation): + prompts = [] + messages = conversation.messages + + for i in range(0, len(messages), 2): + prompt = { + "role": messages[i][0], + "content": messages[i][1][0] if isinstance(messages[i][1], tuple) else messages[i][1], + "images": [messages[i][1][1]] if isinstance(messages[i][1], tuple) else [], + } + response = {"role": messages[i + 1][0], "content": messages[i + 1][1]} + prompts.extend([prompt, response]) + + return prompts + + +class StoppingCriteriaSub(StoppingCriteria): + def __init__(self, stops=[], encounters=1): + super().__init__() + self.stops = [stop.to("cuda") for stop in stops] + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs): + for stop in self.stops: + if input_ids.shape[-1] < len(stop): + continue + if torch.all((stop == input_ids[0][-len(stop) :])).item(): + return True + + return False + + +@torch.inference_mode() +def deepseek_generate( + prompts: list, + vl_gpt: torch.nn.Module, + vl_chat_processor, 
+ tokenizer: transformers.PreTrainedTokenizer, + stop_words: list, + max_length: int = 256, + temperature: float = 1.0, + top_p: float = 1.0, + repetition_penalty=1.1, +): + prompts = prompts + pil_images = list() + for message in prompts: + if "images" not in message: + continue + for pil_img in message["images"]: + pil_images.append(pil_img) + + prepare_inputs = vl_chat_processor( + conversations=prompts, + images=pil_images, + force_batchify=True + ).to(vl_gpt.device) + + return generate( + vl_gpt, + tokenizer, + prepare_inputs, + max_length, + temperature, + repetition_penalty, + top_p, + stop_words, + ) + + +@torch.inference_mode() +def generate( + vl_gpt, + tokenizer, + prepare_inputs, + max_gen_len: int = 256, + temperature: float = 0, + repetition_penalty=1.1, + top_p: float = 0.95, + stop_words: List[str] = [], +): + """Stream the text output from the multimodality model with prompt and image inputs.""" + inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs) + + streamer = TextIteratorStreamer(tokenizer) + + stop_words_ids = [ + torch.tensor(tokenizer.encode(stop_word)) for stop_word in stop_words + ] + stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) + + generation_config = dict( + inputs_embeds=inputs_embeds, + attention_mask=prepare_inputs.attention_mask, + pad_token_id=tokenizer.eos_token_id, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + max_new_tokens=max_gen_len, + do_sample=True, + use_cache=True, + streamer=streamer, + stopping_criteria=stopping_criteria, + ) + + if temperature > 0: + generation_config.update( + { + "do_sample": True, + "top_p": top_p, + "temperature": temperature, + "repetition_penalty": repetition_penalty, + } + ) + else: + generation_config["do_sample"] = False + + thread = Thread(target=vl_gpt.language_model.generate, kwargs=generation_config) + thread.start() + + yield from streamer
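
# Usage sketch: a minimal example of driving the helpers added in
# deepseek_vl/serve/inference.py without the Gradio UI. The checkpoint id,
# the local image path, and the "<image_placeholder>" token are assumptions
# based on how app_deepseek.py and the DeepSeek-VL processor are used in this
# diff; adjust them to your setup.
import PIL.Image

from deepseek_vl.serve.inference import (convert_conversation_to_prompts,
                                         deepseek_generate, load_model)

# load_model() returns (tokenizer, vl_gpt, vl_chat_processor) and moves the
# model to CUDA in bfloat16, so a GPU is required.
tokenizer, vl_gpt, vl_chat_processor = load_model("deepseek-ai/deepseek-vl-7b-chat")

# Build a single-turn conversation the same way generate_prompt_with_history()
# does: a (text, image) tuple for the user turn, then an empty assistant turn.
conversation = vl_chat_processor.new_chat_template()
image = PIL.Image.open("example.png").convert("RGB")  # hypothetical local image
text = "<image_placeholder>\nDescribe this image in one paragraph."
conversation.append_message(conversation.roles[0], (text, image))
conversation.append_message(conversation.roles[1], "")

prompts = convert_conversation_to_prompts(conversation)

# deepseek_generate() is a generator: it runs generation on a background thread
# and streams decoded text chunks until a stop word or max_length is reached.
answer = ""
for chunk in deepseek_generate(
    prompts=prompts,
    vl_gpt=vl_gpt,
    vl_chat_processor=vl_chat_processor,
    tokenizer=tokenizer,
    stop_words=conversation.stop_str,
    max_length=512,
    temperature=0.1,
    repetition_penalty=1.1,
    top_p=0.95,
):
    answer += chunk

print(answer)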