ENH: Move multimodal to LLM by codingl2k1 · Pull Request #917 · xorbitsai/inference

ENH: Move multimodal to LLM #917


Merged
merged 9 commits on Jan 24, 2024
Changes from all commits
2 changes: 2 additions & 0 deletions doc/source/models/builtin/llm/index.rst
@@ -85,6 +85,8 @@ The following is a list of built-in LLM in Xinference:

qwen-chat

qwen-vl-chat

skywork

skywork-math
45 changes: 45 additions & 0 deletions doc/source/models/builtin/llm/qwen-vl-chat.rst
@@ -0,0 +1,45 @@
.. _models_llm_qwen-vl-chat:

========================================
qwen-vl-chat
========================================

- **Context Length:** 4096
- **Model Name:** qwen-vl-chat
- **Languages:** en, zh
- **Abilities:** chat, vision
- **Description:** Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.

Specifications
^^^^^^^^^^^^^^


Model Spec 1 (pytorch, 7 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 7
- **Quantizations:** none
- **Model ID:** Qwen/Qwen-VL-Chat
- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen-VL-Chat>`_, `ModelScope <https://modelscope.cn/models/Qwen/Qwen-VL-Chat>`_

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name qwen-vl-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization}


Model Spec 2 (gptq, 7 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** gptq
- **Model Size (in billions):** 7
- **Quantizations:** Int4
- **Model ID:** Qwen/Qwen-VL-Chat-{quantization}
- **Model Hubs**: `Hugging Face <https://huggingface.co/Qwen/Qwen-VL-Chat-{quantization}>`_, `ModelScope <https://modelscope.cn/models/Qwen/Qwen-VL-Chat-{quantization}>`_

Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name qwen-vl-chat --size-in-billions 7 --model-format gptq --quantization ${quantization}
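
The same launch can also be issued from Python through the RESTful client. A minimal
sketch, assuming a local endpoint at ``http://127.0.0.1:9997`` and that the
``launch_model`` keyword arguments mirror the CLI flags above::

    from xinference.client.restful.restful_client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # endpoint is illustrative
    model_uid = client.launch_model(
        model_name="qwen-vl-chat",
        model_size_in_billions=7,
        model_format="gptq",
        quantization="Int4",
    )
    print(model_uid)  # uid to pass to get_model() later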

4 changes: 2 additions & 2 deletions examples/chat_vl.ipynb
@@ -32,9 +32,9 @@
"2023-12-29 06:14:57,079 xinference.api.restful_api 9197 INFO Starting Xinference at endpoint: http://0.0.0.0:9997\n",
"```\n",
"\n",
"Finally, we launch a ChatGLM3 model for tool calls.\n",
"Finally, we launch a qwen-vl-chat model for vision language chat.\n",
"```shell\n",
"xinference launch -u my_vl_model -n qwen-vl-chat -f pytorch -t multimodal\n",
"xinference launch -u my_vl_model -n qwen-vl-chat -f pytorch\n",
"```\n",
"\n"
]
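A companion sketch of the vision-language call the notebook then makes through the
ordinary chat handle; the content-list prompt format, endpoint, and image URL below
are assumptions for illustration:

    from xinference.client.restful.restful_client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # endpoint is illustrative
    model = client.get_model("my_vl_model")          # uid chosen with -u above

    completion = model.chat(
        prompt=[
            {"type": "text", "text": "What is in this picture?"},
            # Replace with a reachable image URL; the format follows the
            # OpenAI-style vision messages used by chat_vl.ipynb.
            {"type": "image_url", "image_url": {"url": "https://example.com/demo.jpg"}},
        ],
        generate_config={"max_tokens": 512},
    )
    print(completion["choices"][0]["message"]["content"])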
2 changes: 1 addition & 1 deletion xinference/api/restful_api.py
@@ -703,7 +703,7 @@ async def build_gradio_interface(
but calling API in async function does not return
"""
assert self._app is not None
assert body.model_type in ["LLM", "multimodal"]
assert body.model_type == "LLM"

# asyncio.Lock() behaves differently in 3.9 than 3.10+
# A event loop is required in 3.9 but not 3.10+
90 changes: 0 additions & 90 deletions xinference/client/restful/restful_client.py
@@ -400,92 +400,6 @@ def chat(
return response_data


class RESTfulMultimodalModelHandle(RESTfulModelHandle):
def chat(
self,
prompt: Any,
system_prompt: Optional[str] = None,
chat_history: Optional[List["ChatCompletionMessage"]] = None,
tools: Optional[List[Dict]] = None,
generate_config: Optional[
Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]
] = None,
) -> Union["ChatCompletion", Iterator["ChatCompletionChunk"]]:
"""
Given a list of messages comprising a conversation, the model will return a response via RESTful APIs.

Parameters
----------
prompt: str
The user's input.
system_prompt: Optional[str]
The system context provide to Model prior to any chats.
chat_history: Optional[List["ChatCompletionMessage"]]
A list of messages comprising the conversation so far.
tools: Optional[List[Dict]]
A tool list.
generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
Additional configuration for the chat generation.
"LlamaCppGenerateConfig" -> configuration for ggml model
"PytorchGenerateConfig" -> configuration for pytorch model

Returns
-------
Union["ChatCompletion", Iterator["ChatCompletionChunk"]]
Stream is a parameter in generate_config.
When stream is set to True, the function will return Iterator["ChatCompletionChunk"].
When stream is set to False, the function will return "ChatCompletion".

Raises
------
RuntimeError
Report the failure to generate the chat from the server. Detailed information provided in error message.

"""

url = f"{self._base_url}/v1/chat/completions"

if chat_history is None:
chat_history = []

if chat_history and chat_history[0]["role"] == "system":
if system_prompt is not None:
chat_history[0]["content"] = system_prompt

else:
if system_prompt is not None:
chat_history.insert(0, {"role": "system", "content": system_prompt})

chat_history.append({"role": "user", "content": prompt})

request_body: Dict[str, Any] = {
"model": self._model_uid,
"messages": chat_history,
}
if tools is not None:
raise RuntimeError("Multimodal does not support function call.")

if generate_config is not None:
for key, value in generate_config.items():
request_body[key] = value

stream = bool(generate_config and generate_config.get("stream"))
response = requests.post(
url, json=request_body, stream=stream, headers=self.auth_headers
)

if response.status_code != 200:
raise RuntimeError(
f"Failed to generate chat completion, detail: {_get_error_string(response)}"
)

if stream:
return streaming_response_iterator(response.iter_lines())

response_data = response.json()
return response_data


class RESTfulChatglmCppChatModelHandle(RESTfulModelHandle):
def chat(
self,
@@ -889,10 +803,6 @@ def get_model(self, model_uid: str) -> RESTfulModelHandle:
return RESTfulRerankModelHandle(
model_uid, self.base_url, auth_headers=self._headers
)
elif desc["model_type"] == "multimodal":
return RESTfulMultimodalModelHandle(
model_uid, self.base_url, auth_headers=self._headers
)
else:
raise ValueError(f"Unknown model type:{desc['model_type']}")

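With the dedicated multimodal handle removed, a vision model now comes back from
``get_model`` as a plain chat handle, and the only signal that it accepts images is
the ``vision`` entry in its abilities. A sketch of that check, with an illustrative
endpoint and uid, and assuming ``list_models()`` returns per-uid descriptions that
include ``model_ability``:

    from xinference.client.restful.restful_client import (
        RESTfulChatModelHandle,
        RESTfulClient,
    )

    client = RESTfulClient("http://127.0.0.1:9997")
    model = client.get_model("my_vl_model")
    assert isinstance(model, RESTfulChatModelHandle)  # no multimodal handle anymore

    # Assumed output shape: {model_uid: description_dict}.
    desc = client.list_models()["my_vl_model"]
    assert "vision" in desc["model_ability"]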
5 changes: 2 additions & 3 deletions xinference/core/chat_interface.py
@@ -27,7 +27,6 @@
RESTfulChatglmCppChatModelHandle,
RESTfulChatModelHandle,
RESTfulGenerateModelHandle,
RESTfulMultimodalModelHandle,
)
from ..types import ChatCompletionMessage

@@ -66,7 +65,7 @@ def __init__(
)

def build(self) -> "gr.Blocks":
if self.model_type == "multimodal":
if "vision" in self.model_ability:
interface = self.build_chat_vl_interface()
elif "chat" in self.model_ability:
interface = self.build_chat_interface()
@@ -191,7 +190,7 @@ def predict(history, bot):
client = RESTfulClient(self.endpoint)
client._set_token(self._access_token)
model = client.get_model(self.model_uid)
assert isinstance(model, RESTfulMultimodalModelHandle)
assert isinstance(model, RESTfulChatModelHandle)

prompt = history[-1]
assert prompt["role"] == "user"
39 changes: 0 additions & 39 deletions xinference/core/supervisor.py
@@ -40,7 +40,6 @@
from ..model.embedding import EmbeddingModelSpec
from ..model.image import ImageModelFamilyV1
from ..model.llm import LLMFamilyV1
from ..model.multimodal import LVLMFamilyV1
from ..model.rerank import RerankModelSpec
from .worker import WorkerActor

@@ -290,25 +289,6 @@ def _to_image_model_reg(
"is_builtin": is_builtin,
}

def _to_multimodal_reg(
self, model_family: "LVLMFamilyV1", is_builtin: bool
) -> Dict[str, Any]:
from ..model.llm import get_cache_status

if self.is_local_deployment():
specs = []
# TODO: does not work when the supervisor and worker are running on separate nodes.
for spec in model_family.model_specs:
cache_status = get_cache_status(model_family, spec)
specs.append({**spec.dict(), "cache_status": cache_status})
return {
**model_family.dict(),
"is_builtin": is_builtin,
"model_specs": specs,
}
else:
return {**model_family.dict(), "is_builtin": is_builtin}

@log_sync(logger=logger)
def list_model_registrations(
self, model_type: str, detailed: bool = False
@@ -389,18 +369,6 @@ def sort_helper(item):
{"model_name": model_spec.model_name, "is_builtin": False}
)

ret.sort(key=sort_helper)
return ret
elif model_type == "multimodal":
from ..model.multimodal import BUILTIN_LVLM_FAMILIES

ret = []
for family in BUILTIN_LVLM_FAMILIES:
if detailed:
ret.append(self._to_multimodal_reg(family, True))
else:
ret.append({"model_name": family.model_name, "is_builtin": True})

ret.sort(key=sort_helper)
return ret
else:
@@ -441,13 +409,6 @@ def get_model_registration(self, model_type: str, model_name: str) -> Any:
if f.model_name == model_name:
return f
raise ValueError(f"Model {model_name} not found")
elif model_type == "multimodal":
from ..model.multimodal import BUILTIN_LVLM_FAMILIES

for f in BUILTIN_LVLM_FAMILIES:
if f.model_name == model_name:
return f
raise ValueError(f"Model {model_name} not found")
else:
raise ValueError(f"Unsupported model type: {model_type}")

2 changes: 0 additions & 2 deletions xinference/core/worker.py
@@ -430,8 +430,6 @@ async def _get_model_ability(self, model: Any, model_type: str) -> List[str]:
return ["rerank"]
elif model_type == "image":
return ["text_to_image"]
elif model_type == "multimodal":
return ["multimodal"]
else:
assert model_type == "LLM"
assert isinstance(model, LLM)
16 changes: 0 additions & 16 deletions xinference/deploy/cmdline.py
@@ -499,22 +499,6 @@ def list_model_registrations(
tabulate(table, headers=["Type", "Name", "Family", "Is-built-in"]),
file=sys.stderr,
)
elif model_type == "multimodal":
for registration in registrations:
model_name = registration["model_name"]
model_family = client.get_model_registration(model_type, model_name)
table.append(
[
model_type,
model_family["model_name"],
model_family["model_lang"],
registration["is_builtin"],
]
)
print(
tabulate(table, headers=["Type", "Name", "Language", "Is-built-in"]),
file=sys.stderr,
)
else:
raise NotImplementedError(f"List {model_type} is not implemented.")

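Since the multimodal branch is gone, qwen-vl-chat is listed and fetched through the
LLM code path. A sketch using the same client call the command above relies on; the
endpoint is illustrative and the returned field names are assumed to follow
LLMFamilyV1:

    from xinference.client.restful.restful_client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    family = client.get_model_registration("LLM", "qwen-vl-chat")
    print(family["model_name"], family["model_ability"])  # e.g. ['chat', 'vision']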
13 changes: 0 additions & 13 deletions xinference/model/core.py
@@ -56,7 +56,6 @@ def create_model_instance(
from .embedding.core import create_embedding_model_instance
from .image.core import create_image_model_instance
from .llm.core import create_llm_model_instance
from .multimodal.core import create_multimodal_model_instance
from .rerank.core import create_rerank_model_instance

if model_type == "LLM":
@@ -87,17 +86,5 @@
return create_rerank_model_instance(
subpool_addr, devices, model_uid, model_name, **kwargs
)
elif model_type == "multimodal":
kwargs.pop("trust_remote_code", None)
return create_multimodal_model_instance(
subpool_addr,
devices,
model_uid,
model_name,
model_format,
model_size_in_billions,
quantization,
**kwargs,
)
else:
raise ValueError(f"Unsupported model type: {model_type}.")
2 changes: 2 additions & 0 deletions xinference/model/llm/__init__.py
@@ -56,6 +56,7 @@ def _install():
from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
from .pytorch.internlm2 import Internlm2PytorchChatModel
from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
from .pytorch.qwen_vl import QwenVLChatModel
from .pytorch.vicuna import VicunaPytorchChatModel
from .vllm.core import VLLMChatModel, VLLMModel

@@ -88,6 +89,7 @@ def _install():
PytorchChatModel,
FalconPytorchModel,
Internlm2PytorchChatModel,
QwenVLChatModel,
PytorchModel,
]
)
42 changes: 42 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -3211,5 +3211,47 @@
"[UNUSED_TOKEN_145]"
]
}
},
{
"version": 1,
"context_length": 4096,
"model_name": "qwen-vl-chat",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat",
"vision"
],
"model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 7,
"quantizations": [
"none"
],
"model_id": "Qwen/Qwen-VL-Chat",
"model_revision": "6665c780ade5ff3f08853b4262dcb9c8f9598d42"
},
{
"model_format": "gptq",
"model_size_in_billions": 7,
"quantizations": [
"Int4"
],
"model_id": "Qwen/Qwen-VL-Chat-{quantization}",
"model_revision": "5d3a5aa033ed2c502300d426c81cc5b13bcd1409"
}
],
"prompt_style": {
"style_name": "QWEN",
"system_prompt": "You are a helpful assistant.",
"roles": [
"user",
"assistant"
]
}
}
]
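
For the GPTQ spec, ``model_id`` is a template: the chosen quantization is substituted
into it when the hub repository is resolved. A tiny illustration of that expansion:

    # The gptq spec's model_id template from the entry above, resolved for the
    # only listed quantization.
    model_id = "Qwen/Qwen-VL-Chat-{quantization}".format(quantization="Int4")
    print(model_id)  # Qwen/Qwen-VL-Chat-Int4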