feat: chat completion in stream mode

jameszyao · SimsonW · commit e89583f65ee6 · 2024-01-16T21:44:18.000+08:00
diff --git a/taskingai/client/api/inference_api.py b/taskingai/client/api/inference_api.py
@@ -13,8 +13,9 @@
 # python 2 and python 3 compatibility library
 import six
 
-from taskingai.client.api_client import SyncApiClient
-
+from ..api_client import SyncApiClient
+from ..stream import Stream
+from ..models import INFERENCE_CHAT_COMPLETION_STREAM_CAST_MAP
 
 class InferenceApi(object):
 
@@ -23,38 +24,31 @@ def __init__(self, api_client=None):
             api_client = SyncApiClient()
         self.api_client = api_client
 
-    def chat_completion(self, body, **kwargs):  # noqa: E501
+    def chat_completion(self, body, stream = False, **kwargs):  # noqa: E501
         """Chat Completion  # noqa: E501
 
         Model inference for chat completion.  # noqa: E501
-        This method makes a synchronous HTTP request by default. To make an
-        asynchronous HTTP request, please pass async_req=True
-        >>> thread = api.chat_completion(body, async_req=True)
-        >>> result = thread.get()
-
-        :param async_req bool
         :param ChatCompletionRequest body: (required)
         :return: object
                  If the method is called asynchronously,
                  returns the request thread.
         """
         kwargs['_return_http_data_only'] = True
-        if kwargs.get('async_req'):
-            return self.chat_completion_with_http_info(body, **kwargs)  # noqa: E501
+        cast_map = INFERENCE_CHAT_COMPLETION_STREAM_CAST_MAP
+        response = self.chat_completion_with_http_info(body, stream, **kwargs)
+        if not stream:
+            return response
         else:
-            (data) = self.chat_completion_with_http_info(body, **kwargs)  # noqa: E501
-            return data
+            return Stream(
+                cast_map=cast_map,
+                response=response,
+                client=self.api_client
+            )
 
-    def chat_completion_with_http_info(self, body, **kwargs):  # noqa: E501
+    def chat_completion_with_http_info(self, body, stream, **kwargs):  # noqa: E501
         """Chat Completion  # noqa: E501
 
         Model inference for chat completion.  # noqa: E501
-        This method makes a synchronous HTTP request by default. To make an
-        asynchronous HTTP request, please pass async_req=True
-        >>> thread = api.chat_completion_with_http_info(body, async_req=True)
-        >>> result = thread.get()
-
-        :param async_req bool
         :param ChatCompletionRequest body: (required)
         :return: object
                  If the method is called asynchronously,
@@ -106,7 +100,7 @@ def chat_completion_with_http_info(self, body, **kwargs):  # noqa: E501
         # Authentication setting
         auth_settings = []  # noqa: E501
 
-        return self.api_client.call_api(
+        response = self.api_client.call_api(
             '/v1/inference/chat_completion', 'POST',
             path_params,
             query_params,
@@ -119,7 +113,10 @@ def chat_completion_with_http_info(self, body, **kwargs):  # noqa: E501
             _return_http_data_only=params.get('_return_http_data_only'),
             _preload_content=params.get('_preload_content', True),
             _request_timeout=params.get('_request_timeout'),
-            collection_formats=collection_formats)
+            collection_formats=collection_formats,
+            stream=stream
+        )
+        return response
 
     def text_embedding(self, body, **kwargs):  # noqa: E501
         """Text Embedding  # noqa: E501
@@ -137,11 +134,8 @@ def text_embedding(self, body, **kwargs):  # noqa: E501
                  returns the request thread.
         """
         kwargs['_return_http_data_only'] = True
-        if kwargs.get('async_req'):
-            return self.text_embedding_with_http_info(body, **kwargs)  # noqa: E501
-        else:
-            (data) = self.text_embedding_with_http_info(body, **kwargs)  # noqa: E501
-            return data
+        (data) = self.text_embedding_with_http_info(body, **kwargs)  # noqa: E501
+        return data
 
     def text_embedding_with_http_info(self, body, **kwargs):  # noqa: E501
         """Text Embedding  # noqa: E501
diff --git a/taskingai/client/api_client.py b/taskingai/client/api_client.py
@@ -437,7 +437,7 @@ def __call_api(
             query_params=None, header_params=None, body=None, post_params=None,
             files=None, response_type=None, auth_settings=None,
             _return_http_data_only=None, collection_formats=None,
-            _preload_content=True, _request_timeout=None):
+            _preload_content=True, _request_timeout=None, stream=False):
 
         config = self.configuration
 
@@ -490,11 +490,15 @@ def __call_api(
 
         # perform request and return response
         response_data = self.request(
-            method, url, query_params=query_params, headers=header_params,
+            method, url, stream=stream,
+            query_params=query_params, headers=header_params,
             post_params=post_params, body=body,
             _preload_content=_preload_content,
             _request_timeout=_request_timeout)
 
+        if stream:
+            return response_data
+
         self.last_response = response_data
 
         return_data = response_data
@@ -517,7 +521,7 @@ def call_api(self, resource_path, method,
                  body=None, post_params=None, files=None,
                  response_type=None, auth_settings=None,
                  _return_http_data_only=None, collection_formats=None,
-                 _preload_content=True, _request_timeout=None):
+                 _preload_content=True, _request_timeout=None, stream=False):
         """Makes the HTTP request (synchronous) and returns deserialized data.
 
         :param resource_path: Path to method endpoint.
@@ -551,27 +555,30 @@ def call_api(self, resource_path, method,
                                body, post_params, files,
                                response_type, auth_settings,
                                _return_http_data_only, collection_formats,
-                               _preload_content, _request_timeout)
+                               _preload_content, _request_timeout, stream)
 
 
-    def request(self, method, url, query_params=None, headers=None,
+    def request(self, method, url, stream=False, query_params=None, headers=None,
                 post_params=None, body=None, _preload_content=True,
                 _request_timeout=None):
         """Makes the HTTP request using RESTClient."""
         if method == "GET":
             return self.rest_client.GET(url,
+                                        stream=stream,
                                         query_params=query_params,
                                         _preload_content=_preload_content,
                                         _request_timeout=_request_timeout,
                                         headers=headers)
         elif method == "HEAD":
             return self.rest_client.HEAD(url,
+                                        stream=stream,
                                          query_params=query_params,
                                          _preload_content=_preload_content,
                                          _request_timeout=_request_timeout,
                                          headers=headers)
         elif method == "OPTIONS":
             return self.rest_client.OPTIONS(url,
+                                        stream=stream,
                                             query_params=query_params,
                                             headers=headers,
                                             post_params=post_params,
@@ -580,6 +587,7 @@ def request(self, method, url, query_params=None, headers=None,
                                             body=body)
         elif method == "POST":
             return self.rest_client.POST(url,
+                                        stream=stream,
                                          query_params=query_params,
                                          headers=headers,
                                          post_params=post_params,
@@ -588,6 +596,7 @@ def request(self, method, url, query_params=None, headers=None,
                                          body=body)
         elif method == "PUT":
             return self.rest_client.PUT(url,
+                                        stream=stream,
                                         query_params=query_params,
                                         headers=headers,
                                         post_params=post_params,
@@ -596,6 +605,7 @@ def request(self, method, url, query_params=None, headers=None,
                                         body=body)
         elif method == "PATCH":
             return self.rest_client.PATCH(url,
+                                          stream=stream,
                                           query_params=query_params,
                                           headers=headers,
                                           post_params=post_params,
@@ -604,6 +614,7 @@ def request(self, method, url, query_params=None, headers=None,
                                           body=body)
         elif method == "DELETE":
             return self.rest_client.DELETE(url,
+                                           stream=stream,
                                            query_params=query_params,
                                            headers=headers,
                                            _preload_content=_preload_content,
diff --git a/taskingai/client/models/entity/inference/chat_completion.py b/taskingai/client/models/entity/inference/chat_completion.py
@@ -16,6 +16,8 @@
     "ChatCompletionFunctionCall",
     "ChatCompletionFunction",
     "ChatCompletionFinishReason",
+    "ChatCompletionChunk",
+    "INFERENCE_CHAT_COMPLETION_STREAM_CAST_MAP"
 ]
 
 class ChatCompletionRole(str, Enum):
@@ -71,3 +73,17 @@ class ChatCompletion(TaskingaiBaseModel):
     finish_reason: ChatCompletionFinishReason
     message: ChatCompletionAssistantMessage
     created_timestamp: int
+
+
+class ChatCompletionChunk(TaskingaiBaseModel):
+    object: str
+    role: ChatCompletionRole
+    index: int
+    delta: str
+    created_timestamp: int
+
+
+INFERENCE_CHAT_COMPLETION_STREAM_CAST_MAP = {
+    "ChatCompletion": ChatCompletion,
+    "ChatCompletionChunk": ChatCompletionChunk
+}
diff --git a/taskingai/client/rest.py b/taskingai/client/rest.py
@@ -41,40 +41,39 @@ def getheader(self, name, default=None):
 class RESTSyncClientObject(object):
 
     def __init__(self, configuration, pools_size=4, maxsize=None):
-        # 设置连接池的最大并发连接数
+        # set the maximum number of concurrent connections for the pool
         if maxsize is None:
             maxsize = configuration.connection_pool_maxsize if configuration.connection_pool_maxsize is not None else 4
 
-        # 设置连接限制
         limits = httpx.Limits(max_connections=maxsize, max_keepalive_connections=pools_size)
 
-        # 设置 SSL 配置
+        # SSL configuration
         verify = configuration.ssl_ca_cert or True  # 如果提供了自定义 CA 证书则使用，否则默认启用 SSL 验证
         if not configuration.verify_ssl:
             verify = False  # 如果明确指定不进行 SSL 验证，则设置为 False
 
-        # 设置代理
+        # proxy configuration
         proxies = None
         if configuration.proxy:
             proxies = {
                 'http://': configuration.proxy,
                 'https://': configuration.proxy,
             }
 
-        # 创建 httpx 客户端
+        # create httpx client
         self.client = httpx.Client(
             limits=limits,
             verify=verify,
             proxies=proxies,
         )
 
-        # 如果有提供客户端证书，设置之
+        # set client cert if provided
         if configuration.cert_file and configuration.key_file:
             self.client.cert = (configuration.cert_file, configuration.key_file)
 
-    def request(self, method, url, query_params=None, headers=None,
+    def request(self, method, url, stream = False, query_params=None, headers=None,
                 body=None, post_params=None, _preload_content=True,
-                _request_timeout=None):
+                _request_timeout=None) -> RESTResponse | httpx.Response:
         """
             Perform asynchronous HTTP requests.
 
@@ -110,13 +109,23 @@ def request(self, method, url, query_params=None, headers=None,
         request_body = json.dumps(body) if body is not None else None
 
         try:
-            r = self.client.request(
-                method, url,
-                params=query_params,
-                headers=headers,
-                content=request_body,
-                timeout=_request_timeout
-            )
+            if stream:
+                # Return an open response: leaving a `with client.stream()`
+                # block closes it before the caller can read; caller closes.
+                return self.client.send(self.client.build_request(
+                    method, url,
+                    params=query_params,
+                    headers=headers,
+                    content=request_body,
+                    timeout=_request_timeout), stream=True)
+            else:
+                r = self.client.request(
+                    method, url,
+                    params=query_params,
+                    headers=headers,
+                    content=request_body,
+                    timeout=_request_timeout
+                )
         except HTTPError as e:
             msg = "{0}\n{1}".format(type(e).__name__, str(e))
             raise ApiException(status=0, reason=msg)
@@ -129,72 +138,78 @@ def request(self, method, url, query_params=None, headers=None,
 
         return r
 
-    def GET(self, url, headers=None, query_params=None, _preload_content=True,
+    def GET(self, url, stream=False, headers=None, query_params=None, _preload_content=True,
             _request_timeout=None):
         return self.request("GET", url,
+                            stream=stream,
                             headers=headers,
                             _preload_content=_preload_content,
                             _request_timeout=_request_timeout,
                             query_params=query_params)
 
-    def HEAD(self, url, headers=None, query_params=None, _preload_content=True,
+    def HEAD(self, url, stream=False, headers=None, query_params=None, _preload_content=True,
              _request_timeout=None):
         return self.request("HEAD", url,
+                            stream=stream,
                             headers=headers,
                             _preload_content=_preload_content,
                             _request_timeout=_request_timeout,
                             query_params=query_params)
 
-    def OPTIONS(self, url, headers=None, query_params=None, post_params=None,
+    def OPTIONS(self, url, stream=False, headers=None, query_params=None, post_params=None,
                 body=None, _preload_content=True, _request_timeout=None):
         return self.request("OPTIONS", url,
+                            stream=stream,
                             headers=headers,
                             query_params=query_params,
                             post_params=post_params,
                             _preload_content=_preload_content,
                             _request_timeout=_request_timeout,
                             body=body)
 
-    def DELETE(self, url, headers=None, query_params=None, body=None,
+    def DELETE(self, url, stream=False, headers=None, query_params=None, body=None,
                _preload_content=True, _request_timeout=None):
         return self.request("DELETE", url,
+                            stream=stream,
                             headers=headers,
                             query_params=query_params,
                             _preload_content=_preload_content,
                             _request_timeout=_request_timeout,
                             body=body)
 
-    def POST(self, url, headers=None, query_params=None, post_params=None,
+    def POST(self, url, stream=False, headers=None, query_params=None, post_params=None,
              body=None, _preload_content=True, _request_timeout=None):
         return self.request("POST", url,
+                            stream=stream,
                             headers=headers,
                             query_params=query_params,
                             post_params=post_params,
                             _preload_content=_preload_content,
                             _request_timeout=_request_timeout,
                             body=body)
 
-    def PUT(self, url, headers=None, query_params=None, post_params=None,
+    def PUT(self, url, stream=False, headers=None, query_params=None, post_params=None,
             body=None, _preload_content=True, _request_timeout=None):
         return self.request("PUT", url,
                             headers=headers,
+                            stream=stream,
                             query_params=query_params,
                             post_params=post_params,
                             _preload_content=_preload_content,
                             _request_timeout=_request_timeout,
                             body=body)
 
-    def PATCH(self, url, headers=None, query_params=None, post_params=None,
+    def PATCH(self, url, stream=False, headers=None, query_params=None, post_params=None,
               body=None, _preload_content=True, _request_timeout=None):
         return self.request("PATCH", url,
+                            stream=stream,
                             headers=headers,
                             query_params=query_params,
                             post_params=post_params,
                             _preload_content=_preload_content,
                             _request_timeout=_request_timeout,
                             body=body)
 
-
 class RESTAsyncClientObject(object):
 
     def __init__(self, configuration, pools_size=4, maxsize=None):
diff --git a/taskingai/client/stream.py b/taskingai/client/stream.py
diff --git a/taskingai/inference/chat_completion.py b/taskingai/inference/chat_completion.py