
Support LLM Microservice with Streaming Response (opea-project#51)
* support llm with streaming response

Signed-off-by: letonghan <[email protected]>

---------

Signed-off-by: letonghan <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
letonghan and pre-commit-ci[bot] authored May 13, 2024
1 parent a9399ed commit 931770d
Showing 3 changed files with 79 additions and 4 deletions.
21 changes: 17 additions & 4 deletions comps/llms/README.md
@@ -63,7 +63,7 @@ docker build -t opea/gen-ai-comps:llm-tgi-server --build-arg https_proxy=$https_
## Run Docker with CLI

```bash
-docker run -d --name="llm-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/gen-ai-comps:llm-tgi-server
+docker run -d --name="llm-tgi-server" -p 9000:9000 -p 9001:9001 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/gen-ai-comps:llm-tgi-server
```

## Run Docker with Docker Compose
@@ -78,16 +78,29 @@ docker compose -f docker_compose_llm.yaml up -d
## Check Service Status

```bash
-curl http://localhost:9000/v1/health_check\
+curl http://${your_ip}:9000/v1/health_check\
  -X GET \
  -H 'Content-Type: application/json'

curl http://${your_ip}:9001/v1/health_check\
  -X GET \
  -H 'Content-Type: application/json'
```

## Consume LLM Service

```bash
-curl http://localhost:9000/v1/chat/completions\
+curl http://${your_ip}:9000/v1/chat/completions\
  -X POST \
  -d '{"query":"What is Deep Learning?","doc":{"text":"Deep Learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured or unlabeled."}}' \
  -H 'Content-Type: application/json'
```

## Consume LLM Stream Service

```bash
curl http://${your_ip}:9001/v1/chat/completions_stream\
  -X POST \
-d '{"input":{"query":"What is Deep Learning?","doc":{"text":"Deep Learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured or unlabeled."}},"params":{"max_new_tokens":128}}' \
-d '{"query":"What is Deep Learning?","doc":{"text":"Deep Learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured or unlabeled."}}' \
  -H 'Content-Type: application/json'
```
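
For reference, a minimal Python client for the stream endpoint might look like the sketch below. It is not part of this commit; it assumes the `requests` package is available and relies on the `@#$` / `<br/>` escaping applied by `post_process_text` in `llm_tgi.py` further down.

```python
# Illustrative SSE client for /v1/chat/completions_stream (assumes the `requests` package).
import requests

url = "http://localhost:9001/v1/chat/completions_stream"  # replace localhost with your host IP
payload = {
    "query": "What is Deep Learning?",
    "doc": {"text": "Deep Learning is a subset of machine learning in artificial intelligence (AI)."},
}

with requests.post(url, json=payload, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip the blank separator lines between SSE events
        chunk = line[len("data: "):]
        if chunk == "[DONE]":
            break
        # Undo the server-side escaping: '@#$' stands for a space, '<br/>' for a newline.
        print(chunk.replace("@#$", " ").replace("<br/>", "\n"), end="", flush=True)
print()
```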
1 change: 1 addition & 0 deletions comps/llms/langchain/docker/docker_compose_llm.yaml
@@ -29,6 +29,7 @@ services:
    container_name: llm-tgi-server
    ports:
      - "9000:9000"
      - "9001:9001"
    ipc: host
    environment:
      http_proxy: ${http_proxy}
61 changes: 61 additions & 0 deletions comps/llms/langchain/llm_tgi.py
@@ -15,6 +15,7 @@
import os
from typing import Union

from fastapi.responses import StreamingResponse
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
@@ -55,5 +56,65 @@ def llm_generate(input: Union[TextDoc, RerankedDoc]) -> GeneratedDoc:
    return res


def post_process_text(text: str):
    # Frame a streamed chunk as an SSE "data:" line; spaces become '@#$' and newlines '<br/>'.
    if text == " ":
        return "data: @#$\n\n"
    if text == "\n":
        return "data: <br/>\n\n"
    if text.isspace():
        return None
    new_text = text.replace(" ", "@#$")
    return f"data: {new_text}\n\n"


@register_microservice(
    name="opea_service@llm_tgi_stream", expose_endpoint="/v1/chat/completions_stream", host="0.0.0.0", port=9001
)
def llm_generate_stream(input: Union[TextDoc, RerankedDoc]):
    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
    params = LLMParamsDoc()
    llm = HuggingFaceEndpoint(
        endpoint_url=llm_endpoint,
        max_new_tokens=params.max_new_tokens,
        top_k=params.top_k,
        top_p=params.top_p,
        typical_p=params.typical_p,
        temperature=params.temperature,
        repetition_penalty=params.repetition_penalty,
        streaming=params.streaming,
    )
    if isinstance(input, RerankedDoc):
        # RAG-style input: build a prompt from the retrieved context and the user question.
        template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
        prompt = ChatPromptTemplate.from_template(template)
        chain = prompt | llm | StrOutputParser()
        final_input = {"question": input.query, "context": input.doc.text}
    elif isinstance(input, TextDoc):
        # Plain text input: send it to the LLM as-is.
        chain = llm
        final_input = input.text
    else:
        raise TypeError("Invalid input type. Expected TextDoc or RerankedDoc.")

    def stream_generator():
        # Re-emit each streamed chunk as a server-sent event; stop at the "</s>" end-of-sequence token.
        chat_response = ""
        for text in chain.stream(final_input):
            chat_response += text
            processed_text = post_process_text(text)
            if text and processed_text:
                if "</s>" in text:
                    res = text.split("</s>")[0]
                    if res != "":
                        yield res
                    break
                yield processed_text
        print(f"[llm - chat_stream] stream response: {chat_response}")
        yield "data: [DONE]\n\n"

    return StreamingResponse(stream_generator(), media_type="text/event-stream")
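# Illustrative only: if the model streamed the chunks ["Deep", " ", "Learning", "</s>"],
# stream_generator would yield "data: Deep\n\n", "data: @#$\n\n", "data: Learning\n\n",
# stop at the "</s>" end-of-sequence marker, and finish with "data: [DONE]\n\n".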


if __name__ == "__main__":
    opea_microservices["opea_service@llm_tgi"].start()
    opea_microservices["opea_service@llm_tgi_stream"].start()
