Refactor LLM & Rerank Microservice (opea-project#56)
* refactor llm & rerank service

Signed-off-by: letonghan <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add langchain folder for rerank

Signed-off-by: letonghan <[email protected]>

---------

Signed-off-by: letonghan <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
letonghan and pre-commit-ci[bot] authored May 14, 2024
1 parent a1337fd commit 39a9d53
Showing 12 changed files with 57 additions and 104 deletions.
1 change: 0 additions & 1 deletion comps/__init__.py
@@ -20,7 +20,6 @@
EmbedDoc1024,
GeneratedDoc,
LLMParamsDoc,
RerankedDoc,
SearchedDoc,
TextDoc,
)
6 changes: 1 addition & 5 deletions comps/cores/proto/docarray.py
@@ -62,17 +62,13 @@ class Config:
json_encoders = {np.ndarray: lambda x: x.tolist()}


class RerankedDoc(BaseDoc):
query: str
doc: TextDoc


class GeneratedDoc(BaseDoc):
text: str
prompt: str


class LLMParamsDoc(BaseDoc):
query: str
max_new_tokens: int = 1024
top_k: int = 10
top_p: float = 0.95
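For orientation, here is a minimal sketch of `LLMParamsDoc` as it stands after this change. The fields below `top_p` are collapsed in the diff above, so the `typical_p`, `temperature`, `repetition_penalty`, and `streaming` defaults shown here are assumptions inferred from how `llm_tgi.py` and the README examples use them:

```python
# Hypothetical reconstruction of LLMParamsDoc -- defaults below top_p are assumed,
# not taken from the diff.
from docarray import BaseDoc


class LLMParamsDoc(BaseDoc):
    query: str
    max_new_tokens: int = 1024
    top_k: int = 10
    top_p: float = 0.95
    typical_p: float = 0.95           # assumed default
    temperature: float = 0.01         # assumed default
    repetition_penalty: float = 1.03  # assumed default
    streaming: bool = True            # assumed default
```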
22 changes: 10 additions & 12 deletions comps/llms/README.md
@@ -63,7 +63,7 @@ docker build -t opea/gen-ai-comps:llm-tgi-server --build-arg https_proxy=$https_
## Run Docker with CLI

```bash
docker run -d --name="llm-tgi-server" -p 9000:9000 -p 9001:9001 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/gen-ai-comps:llm-tgi-server
docker run -d --name="llm-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/gen-ai-comps:llm-tgi-server
```

## Run Docker with Docker Compose
@@ -81,26 +81,24 @@ docker compose -f docker_compose_llm.yaml up -d
curl http://${your_ip}:9000/v1/health_check\
-X GET \
-H 'Content-Type: application/json'

curl http://${your_ip}:9001/v1/health_check\
-X GET \
-H 'Content-Type: application/json'
```

## Consume LLM Service

You can set the following model parameters according to your actual needs, such as `max_new_tokens`, `streaming`.

The `streaming` parameter determines the format of the data returned by the API: with `streaming=false` the API returns a complete text string, and with `streaming=true` it returns a text stream.

```bash
# non-streaming mode
curl http://${your_ip}:9000/v1/chat/completions\
-X POST \
-d '{"query":"What is Deep Learning?","doc":{"text":"Deep Learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured or unlabeled."}}' \
-d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
-H 'Content-Type: application/json'
```

## Consume LLM Stream Service

```bash
curl http://${your_ip}:9001/v1/chat/completions_stream\
# streaming mode
curl http://${your_ip}:9000/v1/chat/completions\
-X POST \
-d '{"query":"What is Deep Learning?","doc":{"text":"Deep Learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured or unlabeled."}}' \
-d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-H 'Content-Type: application/json'
```
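For readers who prefer Python over curl, the sketch below consumes the same consolidated endpoint in both modes. It assumes the `data: ...` / `data: [DONE]` server-sent-events framing implied by `post_process_text` and `stream_generator` in `llm_tgi.py`; adjust the host and port for your deployment.

```python
# Minimal Python client sketch for the consolidated /v1/chat/completions endpoint.
import requests

URL = "http://localhost:9000/v1/chat/completions"  # replace with ${your_ip}:9000

# Non-streaming mode: the service answers with a single JSON document (GeneratedDoc).
resp = requests.post(URL, json={"query": "What is Deep Learning?", "streaming": False})
print(resp.json())

# Streaming mode: the response is a text/event-stream of "data: ..." chunks.
with requests.post(
    URL, json={"query": "What is Deep Learning?", "streaming": True}, stream=True
) as stream:
    for line in stream.iter_lines(decode_unicode=True):
        if not line:
            continue
        chunk = line.removeprefix("data: ")
        if chunk == "[DONE]":
            break
        print(chunk, end="", flush=True)
```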
1 change: 0 additions & 1 deletion comps/llms/langchain/docker/docker_compose_llm.yaml
@@ -29,7 +29,6 @@ services:
container_name: llm-tgi-server
ports:
- "9000:9000"
- "9001:9001"
ipc: host
environment:
http_proxy: ${http_proxy}
108 changes: 30 additions & 78 deletions comps/llms/langchain/llm_tgi.py
@@ -13,47 +13,11 @@
# limitations under the License.

import os
from typing import Union

from fastapi.responses import StreamingResponse
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from comps import GeneratedDoc, LLMParamsDoc, RerankedDoc, TextDoc, opea_microservices, register_microservice


@register_microservice(name="opea_service@llm_tgi", expose_endpoint="/v1/chat/completions", host="0.0.0.0", port=9000)
def llm_generate(input: Union[TextDoc, RerankedDoc]) -> GeneratedDoc:
llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
params = LLMParamsDoc()
llm = HuggingFaceEndpoint(
endpoint_url=llm_endpoint,
max_new_tokens=params.max_new_tokens,
top_k=params.top_k,
top_p=params.top_p,
typical_p=params.typical_p,
temperature=params.temperature,
repetition_penalty=params.repetition_penalty,
streaming=params.streaming,
)
final_prompt = None
if isinstance(input, RerankedDoc):
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | llm | StrOutputParser()
final_prompt = input.query
response = chain.invoke({"question": input.query, "context": input.doc.text})
elif isinstance(input, TextDoc):
final_prompt = input.text
response = llm.invoke(input.text)
else:
raise TypeError("Invalid input type. Expected TextDoc or RerankedDoc.")
res = GeneratedDoc(text=response, prompt=final_prompt)
return res
from comps import GeneratedDoc, LLMParamsDoc, opea_microservices, register_microservice


def post_process_text(text: str):
@@ -67,54 +67,31 @@ def post_process_text(text: str):
return f"data: {new_text}\n\n"


@register_microservice(
name="opea_service@llm_tgi_stream", expose_endpoint="/v1/chat/completions_stream", host="0.0.0.0", port=9001
)
def llm_generate_stream(input: Union[TextDoc, RerankedDoc]):
@register_microservice(name="opea_service@llm_tgi", expose_endpoint="/v1/chat/completions", host="0.0.0.0", port=9000)
def llm_generate(input: LLMParamsDoc):
llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
params = LLMParamsDoc()
llm = HuggingFaceEndpoint(
endpoint_url=llm_endpoint,
max_new_tokens=params.max_new_tokens,
top_k=params.top_k,
top_p=params.top_p,
typical_p=params.typical_p,
temperature=params.temperature,
repetition_penalty=params.repetition_penalty,
streaming=params.streaming,
max_new_tokens=input.max_new_tokens,
top_k=input.top_k,
top_p=input.top_p,
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.streaming,
)
if isinstance(input, RerankedDoc):
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | llm | StrOutputParser()
final_input = {"question": input.query, "context": input.doc.text}
elif isinstance(input, TextDoc):
chain = llm
final_input = input.text
else:
raise TypeError("Invalid input type. Expected TextDoc or RerankedDoc.")

def stream_generator():
chat_response = ""
for text in chain.stream(final_input):
chat_response += text
processed_text = post_process_text(text)
if text and processed_text:
if "</s>" in text:
res = text.split("</s>")[0]
if res != "":
yield res
break
yield processed_text
print(f"[llm - chat_stream] stream response: {chat_response}")
yield "data: [DONE]\n\n"
if input.streaming:

def stream_generator():
chat_response = ""
for text in llm.stream(input.query):
chat_response += text
processed_text = post_process_text(text)
if text and processed_text:
if "</s>" in text:
res = text.split("</s>")[0]
if res != "":
yield res
break
yield processed_text
print(f"[llm - chat_stream] stream response: {chat_response}")
yield "data: [DONE]\n\n"

return StreamingResponse(stream_generator(), media_type="text/event-stream")
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = llm.invoke(input.query)
return GeneratedDoc(text=response, prompt=input.query)


if __name__ == "__main__":
opea_microservices["opea_service@llm_tgi"].start()
opea_microservices["opea_service@llm_tgi_stream"].start()
4 changes: 2 additions & 2 deletions comps/reranks/README.md
@@ -48,7 +48,7 @@ If you start an Reranking microservice with docker, the `docker_compose_rerankin

```bash
cd ../../
docker build -t opea/gen-ai-comps:reranking-tei-xeon-server --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/docker/Dockerfile .
docker build -t opea/gen-ai-comps:reranking-tei-xeon-server --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/langchain/docker/Dockerfile .
```

## Run Docker with CLI
@@ -60,7 +60,7 @@ docker run -d --name="reranking-tei-server" -p 8000:8000 --ipc=host -e http_prox
## Run Docker with Docker Compose

```bash
cd docker
cd langchain/docker
docker compose -f docker_compose_reranking.yaml up -d
```

File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -12,12 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import os

import requests
from langchain_core.prompts import ChatPromptTemplate

from comps import RerankedDoc, SearchedDoc, opea_microservices, register_microservice
from comps import LLMParamsDoc, SearchedDoc, opea_microservices, register_microservice


@register_microservice(
@@ -26,18 +28,24 @@
host="0.0.0.0",
port=8000,
input_datatype=SearchedDoc,
output_datatype=RerankedDoc,
output_datatype=LLMParamsDoc,
)
def reranking(input: SearchedDoc) -> RerankedDoc:
def reranking(input: SearchedDoc) -> LLMParamsDoc:
docs = [doc.text for doc in input.retrieved_docs]
url = tei_reranking_endpoint + "/rerank"
data = {"query": input.initial_query, "texts": docs}
headers = {"Content-Type": "application/json"}
response = requests.post(url, data=json.dumps(data), headers=headers)
response_data = response.json()
best_response = max(response_data, key=lambda response: response["score"])
res = RerankedDoc(query=input.initial_query, doc=input.retrieved_docs[best_response["index"]])
return res
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
doc = input.retrieved_docs[best_response["index"]]
final_prompt = prompt.format(context=doc.text, question=input.initial_query)
return LLMParamsDoc(query=final_prompt.strip())


if __name__ == "__main__":
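Taken together with the LLM change above, the rerank service now emits an `LLMParamsDoc` whose `query` already contains the formatted prompt, so the two microservices chain directly. Below is a rough end-to-end sketch; it assumes the rerank endpoint is exposed at `/v1/reranking` (the `expose_endpoint` line is collapsed in the diff) and uses the default ports from the compose files.

```python
# Hand-off sketch: rerank output (LLMParamsDoc) feeds the LLM service unchanged.
import requests

rerank_out = requests.post(
    "http://localhost:8000/v1/reranking",  # assumed endpoint path
    json={
        "initial_query": "What is Deep Learning?",
        "retrieved_docs": [
            {"text": "Deep Learning is a subset of machine learning ..."},
            {"text": "Deep Learning is a programming language ..."},
        ],
    },
).json()

# The best-ranked document is already folded into the prompt template,
# so the "query" field can be posted to the LLM service as-is.
llm_out = requests.post(
    "http://localhost:9000/v1/chat/completions",
    json={"query": rerank_out["query"], "streaming": False},
).json()
print(llm_out["text"])
```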
1 change: 1 addition & 0 deletions comps/reranks/requirements.txt
@@ -1,5 +1,6 @@
docarray[full]
fastapi
langchain
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
