diff --git a/comps/__init__.py b/comps/__init__.py
index 55db01b3d1..465eb247a6 100644
--- a/comps/__init__.py
+++ b/comps/__init__.py
@@ -20,7 +20,6 @@
     EmbedDoc1024,
     GeneratedDoc,
     LLMParamsDoc,
-    RerankedDoc,
     SearchedDoc,
     TextDoc,
 )
diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py
index be92148ef5..05c2556c6d 100644
--- a/comps/cores/proto/docarray.py
+++ b/comps/cores/proto/docarray.py
@@ -62,17 +62,13 @@ class Config:
         json_encoders = {np.ndarray: lambda x: x.tolist()}
 
 
-class RerankedDoc(BaseDoc):
-    query: str
-    doc: TextDoc
-
-
 class GeneratedDoc(BaseDoc):
     text: str
     prompt: str
 
 
 class LLMParamsDoc(BaseDoc):
+    query: str
     max_new_tokens: int = 1024
     top_k: int = 10
     top_p: float = 0.95
diff --git a/comps/llms/README.md b/comps/llms/README.md
index 43606de6d2..79ed6d0cea 100644
--- a/comps/llms/README.md
+++ b/comps/llms/README.md
@@ -63,7 +63,7 @@ docker build -t opea/gen-ai-comps:llm-tgi-server --build-arg https_proxy=$https_
 ## Run Docker with CLI
 
 ```bash
-docker run -d --name="llm-tgi-server" -p 9000:9000 -p 9001:9001 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/gen-ai-comps:llm-tgi-server
+docker run -d --name="llm-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/gen-ai-comps:llm-tgi-server
 ```
 
 ## Run Docker with Docker Compose
@@ -81,26 +81,24 @@ docker compose -f docker_compose_llm.yaml up -d
 curl http://${your_ip}:9000/v1/health_check\
   -X GET \
   -H 'Content-Type: application/json'
-
-curl http://${your_ip}:9001/v1/health_check\
-  -X GET \
-  -H 'Content-Type: application/json'
 ```
 
 ## Consume LLM Service
 
+You can set model parameters such as `max_new_tokens` and `streaming` according to your actual needs.
+
+The `streaming` parameter determines the format of the data returned by the API: with `streaming=false` the service returns the full generated text in a single response, while `streaming=true` returns the text as a streaming flow.
+
 ```bash
+# non-streaming mode
 curl http://${your_ip}:9000/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","doc":{"text":"Deep Learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured or unlabeled."}}' \
+  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
   -H 'Content-Type: application/json'
-```
-
-## Consume LLM Stream Service
 
-```bash
-curl http://${your_ip}:9001/v1/chat/completions_stream\
+# streaming mode
+curl http://${your_ip}:9000/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","doc":{"text":"Deep Learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured or unlabeled."}}' \
+  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
   -H 'Content-Type: application/json'
 ```
diff --git a/comps/llms/langchain/docker/docker_compose_llm.yaml b/comps/llms/langchain/docker/docker_compose_llm.yaml
index a30a0393c0..579ee267e8 100644
--- a/comps/llms/langchain/docker/docker_compose_llm.yaml
+++ b/comps/llms/langchain/docker/docker_compose_llm.yaml
@@ -29,7 +29,6 @@ services:
     container_name: llm-tgi-server
     ports:
       - "9000:9000"
-      - "9001:9001"
     ipc: host
    environment:
       http_proxy: ${http_proxy}
diff --git a/comps/llms/langchain/llm_tgi.py b/comps/llms/langchain/llm_tgi.py
index 8925587193..175f4b5ac6 100644
--- a/comps/llms/langchain/llm_tgi.py
+++ b/comps/llms/langchain/llm_tgi.py
@@ -13,47 +13,11 @@
 # limitations under the License.
 
 import os
-from typing import Union
 
 from fastapi.responses import StreamingResponse
 from langchain_community.llms import HuggingFaceEndpoint
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.prompts import ChatPromptTemplate
 
-from comps import GeneratedDoc, LLMParamsDoc, RerankedDoc, TextDoc, opea_microservices, register_microservice
-
-
-@register_microservice(name="opea_service@llm_tgi", expose_endpoint="/v1/chat/completions", host="0.0.0.0", port=9000)
-def llm_generate(input: Union[TextDoc, RerankedDoc]) -> GeneratedDoc:
-    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
-    params = LLMParamsDoc()
-    llm = HuggingFaceEndpoint(
-        endpoint_url=llm_endpoint,
-        max_new_tokens=params.max_new_tokens,
-        top_k=params.top_k,
-        top_p=params.top_p,
-        typical_p=params.typical_p,
-        temperature=params.temperature,
-        repetition_penalty=params.repetition_penalty,
-        streaming=params.streaming,
-    )
-    final_prompt = None
-    if isinstance(input, RerankedDoc):
-        template = """Answer the question based only on the following context:
-        {context}
-        Question: {question}
-        """
-        prompt = ChatPromptTemplate.from_template(template)
-        chain = prompt | llm | StrOutputParser()
-        final_prompt = input.query
-        response = chain.invoke({"question": input.query, "context": input.doc.text})
-    elif isinstance(input, TextDoc):
-        final_prompt = input.text
-        response = llm.invoke(input.text)
-    else:
-        raise TypeError("Invalid input type. Expected TextDoc or RerankedDoc.")
-    res = GeneratedDoc(text=response, prompt=final_prompt)
-    return res
+from comps import GeneratedDoc, LLMParamsDoc, opea_microservices, register_microservice
 
 
 def post_process_text(text: str):
@@ -67,54 +31,42 @@ def post_process_text(text: str):
     return f"data: {new_text}\n\n"
 
 
-@register_microservice(
-    name="opea_service@llm_tgi_stream", expose_endpoint="/v1/chat/completions_stream", host="0.0.0.0", port=9001
-)
-def llm_generate_stream(input: Union[TextDoc, RerankedDoc]):
+@register_microservice(name="opea_service@llm_tgi", expose_endpoint="/v1/chat/completions", host="0.0.0.0", port=9000)
+def llm_generate(input: LLMParamsDoc):
     llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
-    params = LLMParamsDoc()
     llm = HuggingFaceEndpoint(
         endpoint_url=llm_endpoint,
-        max_new_tokens=params.max_new_tokens,
-        top_k=params.top_k,
-        top_p=params.top_p,
-        typical_p=params.typical_p,
-        temperature=params.temperature,
-        repetition_penalty=params.repetition_penalty,
-        streaming=params.streaming,
+        max_new_tokens=input.max_new_tokens,
+        top_k=input.top_k,
+        top_p=input.top_p,
+        typical_p=input.typical_p,
+        temperature=input.temperature,
+        repetition_penalty=input.repetition_penalty,
+        streaming=input.streaming,
     )
-    if isinstance(input, RerankedDoc):
-        template = """Answer the question based only on the following context:
-        {context}
-        Question: {question}
-        """
-        prompt = ChatPromptTemplate.from_template(template)
-        chain = prompt | llm | StrOutputParser()
-        final_input = {"question": input.query, "context": input.doc.text}
-    elif isinstance(input, TextDoc):
-        chain = llm
-        final_input = input.text
-    else:
-        raise TypeError("Invalid input type. Expected TextDoc or RerankedDoc.")
 
-    def stream_generator():
-        chat_response = ""
-        for text in chain.stream(final_input):
-            chat_response += text
-            processed_text = post_process_text(text)
-            if text and processed_text:
-                if "</s>" in text:
-                    res = text.split("</s>")[0]
-                    if res != "":
-                        yield res
-                    break
-                yield processed_text
-        print(f"[llm - chat_stream] stream response: {chat_response}")
-        yield "data: [DONE]\n\n"
+    if input.streaming:
+
+        def stream_generator():
+            chat_response = ""
+            for text in llm.stream(input.query):
+                chat_response += text
+                processed_text = post_process_text(text)
+                if text and processed_text:
+                    if "</s>" in text:
+                        res = text.split("</s>")[0]
+                        if res != "":
+                            yield res
+                        break
+                    yield processed_text
+            print(f"[llm - chat_stream] stream response: {chat_response}")
+            yield "data: [DONE]\n\n"
 
-    return StreamingResponse(stream_generator(), media_type="text/event-stream")
+        return StreamingResponse(stream_generator(), media_type="text/event-stream")
+    else:
+        response = llm.invoke(input.query)
+        return GeneratedDoc(text=response, prompt=input.query)
 
 
 if __name__ == "__main__":
     opea_microservices["opea_service@llm_tgi"].start()
-    opea_microservices["opea_service@llm_tgi_stream"].start()
diff --git a/comps/reranks/README.md b/comps/reranks/README.md
index 8caaea7b60..581db52655 100644
--- a/comps/reranks/README.md
+++ b/comps/reranks/README.md
@@ -48,7 +48,7 @@ If you start an Reranking microservice with docker, the `docker_compose_rerankin
 
 ```bash
 cd ../../
-docker build -t opea/gen-ai-comps:reranking-tei-xeon-server --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/docker/Dockerfile .
+docker build -t opea/gen-ai-comps:reranking-tei-xeon-server --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/langchain/docker/Dockerfile .
 ```
 
 ## Run Docker with CLI
@@ -60,7 +60,7 @@ docker run -d --name="reranking-tei-server" -p 8000:8000 --ipc=host -e http_prox
 ## Run Docker with Docker Compose
 
 ```bash
-cd docker
+cd langchain/docker
 docker compose -f docker_compose_reranking.yaml up -d
 ```
 
diff --git a/comps/reranks/__init__.py b/comps/reranks/langchain/__init__.py
similarity index 100%
rename from comps/reranks/__init__.py
rename to comps/reranks/langchain/__init__.py
diff --git a/comps/reranks/docker/Dockerfile b/comps/reranks/langchain/docker/Dockerfile
similarity index 100%
rename from comps/reranks/docker/Dockerfile
rename to comps/reranks/langchain/docker/Dockerfile
diff --git a/comps/reranks/docker/docker_compose_reranking.yaml b/comps/reranks/langchain/docker/docker_compose_reranking.yaml
similarity index 100%
rename from comps/reranks/docker/docker_compose_reranking.yaml
rename to comps/reranks/langchain/docker/docker_compose_reranking.yaml
diff --git a/comps/reranks/local_reranking.py b/comps/reranks/langchain/local_reranking.py
similarity index 100%
rename from comps/reranks/local_reranking.py
rename to comps/reranks/langchain/local_reranking.py
diff --git a/comps/reranks/reranking_tei_xeon.py b/comps/reranks/langchain/reranking_tei_xeon.py
similarity index 70%
rename from comps/reranks/reranking_tei_xeon.py
rename to comps/reranks/langchain/reranking_tei_xeon.py
index 34167dd545..25b63f39f3 100644
--- a/comps/reranks/reranking_tei_xeon.py
+++ b/comps/reranks/langchain/reranking_tei_xeon.py
@@ -12,12 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 import json
 import os
 
 import requests
+from langchain_core.prompts import ChatPromptTemplate
 
-from comps import RerankedDoc, SearchedDoc, opea_microservices, register_microservice
+from comps import LLMParamsDoc, SearchedDoc, opea_microservices, register_microservice
 
 
 @register_microservice(
@@ -26,9 +28,9 @@
     host="0.0.0.0",
     port=8000,
     input_datatype=SearchedDoc,
-    output_datatype=RerankedDoc,
+    output_datatype=LLMParamsDoc,
 )
-def reranking(input: SearchedDoc) -> RerankedDoc:
+def reranking(input: SearchedDoc) -> LLMParamsDoc:
     docs = [doc.text for doc in input.retrieved_docs]
     url = tei_reranking_endpoint + "/rerank"
     data = {"query": input.initial_query, "texts": docs}
@@ -36,8 +38,14 @@ def reranking(input: SearchedDoc) -> LLMParamsDoc:
     response = requests.post(url, data=json.dumps(data), headers=headers)
     response_data = response.json()
     best_response = max(response_data, key=lambda response: response["score"])
-    res = RerankedDoc(query=input.initial_query, doc=input.retrieved_docs[best_response["index"]])
-    return res
+    template = """Answer the question based only on the following context:
+    {context}
+    Question: {question}
+    """
+    prompt = ChatPromptTemplate.from_template(template)
+    doc = input.retrieved_docs[best_response["index"]]
+    final_prompt = prompt.format(context=doc.text, question=input.initial_query)
+    return LLMParamsDoc(query=final_prompt.strip())
 
 
 if __name__ == "__main__":
diff --git a/comps/reranks/requirements.txt b/comps/reranks/requirements.txt
index 7411b66116..648e842520 100644
--- a/comps/reranks/requirements.txt
+++ b/comps/reranks/requirements.txt
@@ -1,5 +1,6 @@
 docarray[full]
 fastapi
+langchain
 opentelemetry-api
 opentelemetry-exporter-otlp
 opentelemetry-sdk
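Below is a quick client-side sketch of the unified `/v1/chat/completions` endpoint introduced by this patch, mirroring the curl commands in the updated `comps/llms/README.md` and the prompt handoff now performed by `reranking_tei_xeon.py`. It is illustrative only: it assumes the `llm-tgi-server` container is reachable at `http://localhost:9000`, and the sampling values are arbitrary examples rather than part of the change.

```python
# Illustrative client for the unified /v1/chat/completions endpoint (sketch only).
# Assumptions not contained in the patch: the llm-tgi microservice is running
# locally on port 9000, and the parameter values below are arbitrary examples.
import json

import requests

LLM_ENDPOINT = "http://localhost:9000/v1/chat/completions"  # assumed host/port

# Mirror what reranking_tei_xeon.py now does: the retrieved context is folded
# directly into the `query` field of the LLMParamsDoc payload.
context = (
    "Deep Learning is a subset of machine learning in artificial intelligence (AI) "
    "that has networks capable of learning unsupervised from data that is "
    "unstructured or unlabeled."
)
question = "What is Deep Learning?"
query = f"Answer the question based only on the following context:\n{context}\nQuestion: {question}"

payload = {
    "query": query,
    "max_new_tokens": 128,
    "top_k": 10,
    "top_p": 0.95,
    "typical_p": 0.95,
    "temperature": 0.01,
    "repetition_penalty": 1.03,
    "streaming": False,
}

# Non-streaming mode: the service responds with a serialized GeneratedDoc.
resp = requests.post(LLM_ENDPOINT, json=payload, timeout=120)
print(json.dumps(resp.json(), indent=2))

# Streaming mode: the same endpoint emits "data: ..." chunks ending in "data: [DONE]".
payload["streaming"] = True
with requests.post(LLM_ENDPOINT, json=payload, stream=True, timeout=120) as stream:
    for line in stream.iter_lines(decode_unicode=True):
        if line:
            print(line)
```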