diff --git a/comps/llms/text-generation/llamacpp/Dockerfile b/comps/llms/text-generation/llamacpp/Dockerfile
new file mode 100644
index 0000000000..70500e35d7
--- /dev/null
+++ b/comps/llms/text-generation/llamacpp/Dockerfile
@@ -0,0 +1,22 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM python:3.11-slim
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+# Assumes the build is run from the GenAIComps directory, with this Dockerfile at comps/llms/text-generation/llamacpp
+COPY comps /home/user/comps
+
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir -r /home/user/comps/llms/text-generation/llamacpp/requirements.txt
+
+ENV PYTHONPATH=$PYTHONPATH:/home/user
+
+WORKDIR /home/user/comps/llms/text-generation/llamacpp/
+
+ENTRYPOINT ["python", "llm.py"]
\ No newline at end of file
diff --git a/comps/llms/text-generation/llamacpp/README.md b/comps/llms/text-generation/llamacpp/README.md
new file mode 100644
index 0000000000..00b8e0b778
--- /dev/null
+++ b/comps/llms/text-generation/llamacpp/README.md
@@ -0,0 +1,90 @@
+# Introduction
+
+[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud".
+
+This OPEA component wraps the llama.cpp server so that it can interface with other OPEA components or be used to create OPEA Megaservices.
+
+llama.cpp supports a wide range of [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends); this component has only been tested on CPU.
+
+To use a CUDA server, please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly.
+
+## TL;DR
+
+```bash
+cd GenAIComps/
+docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up
+```
+
+Please note that it is instructive to run and validate both the llama.cpp server and the OPEA component individually, as described below.
+
+## 1. Run the llama.cpp server
+
+```bash
+cd GenAIComps
+docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-server
+```
+
+Notes:
+
+i) If you prefer to run the above in the background without screen output, use `up -d`.
+
+ii) To stop the llama.cpp server:
+
+`docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml stop llamacpp-server`
+
+iii) For [llama.cpp settings](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md), please specify them in the docker_compose_llm.yaml file.
+
+#### Verify the llama.cpp Service:
+
+```bash
+curl --request POST \
+  --url http://localhost:8080/completion \
+  --header "Content-Type: application/json" \
+  --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
+```
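+
+The same check can also be run from Python against the server's OpenAI-compatible `/v1` endpoint. This is a minimal sketch rather than part of the component itself: it assumes the server is reachable at `localhost:8080`, that the `openai` package is installed locally, and that the model name is informational only (llama.cpp serves whichever model it was started with).
+
+```python
+import openai
+
+# Point the OpenAI client at the local llama.cpp server (OpenAI-compatible /v1 routes).
+client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")
+
+completion = client.chat.completions.create(
+    model="phi-3-mini-4k-instruct",  # hypothetical label; the server ignores it and uses its loaded model
+    messages=[{"role": "user", "content": "Building a website can be done in 10 simple steps:"}],
+    max_tokens=128,
+    temperature=0.01,
+)
+print(completion.choices[0].message.content)
+```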
+
+## 2. Run the llama.cpp OPEA Service
+
+This component is essentially a wrapper around the llama.cpp server. OPEA standardizes and validates LLM inputs with the LLMParamsDoc class (see llm.py).
+
+### 2.1 Build and run the llama.cpp OPEA image:
+
+```bash
+cd GenAIComps/
+docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-opea-llm --force-recreate
+```
+
+### 2.2 Build and run directly from the Dockerfile
+
+Equivalently, the above can be achieved with `docker build` and `docker run`. Build:
+
+```bash
+cd GenAIComps/
+docker build --no-cache -t opea/llm-llamacpp:latest \
+  --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \
+  -f comps/llms/text-generation/llamacpp/Dockerfile .
+```
+
+And run:
+
+```bash
+docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy \
+  opea/llm-llamacpp:latest
+```
+
+### 2.3 Consume the llama.cpp Microservice:
+
+```bash
+curl http://127.0.0.1:9000/v1/chat/completions -X POST \
+  -d '{"query":"What is Deep Learning?","max_tokens":32,"top_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
+  -H 'Content-Type: application/json'
+```
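+
+The same request can be sent from Python. The sketch below is illustrative only; it assumes the OPEA service is reachable on port 9000 and that the `requests` package is available. The streaming branch consumes the server-sent events emitted by `stream_generator()` in llm.py (`data: ...` lines, terminated by `data: [DONE]`).
+
+```python
+import json
+
+import requests
+
+OPEA_ENDPOINT = "http://127.0.0.1:9000/v1/chat/completions"  # assumed service address
+
+# Non-streaming request, mirroring the curl example above.
+payload = {
+    "query": "What is Deep Learning?",
+    "max_tokens": 32,
+    "top_p": 0.95,
+    "temperature": 0.01,
+    "repetition_penalty": 1.03,
+    "streaming": False,
+}
+print(requests.post(OPEA_ENDPOINT, json=payload, timeout=120).json())
+
+# Streaming request: iterate over the SSE lines until "[DONE]" is received.
+payload["streaming"] = True
+with requests.post(OPEA_ENDPOINT, json=payload, stream=True, timeout=120) as resp:
+    for line in resp.iter_lines():
+        if not line:
+            continue
+        data = line.decode("utf-8").removeprefix("data: ")
+        if data == "[DONE]":
+            break
+        chunk = json.loads(data)
+        # Each chunk follows the OpenAI streaming format forwarded from the llama.cpp server.
+        print(chunk["choices"][0]["delta"].get("content") or "", end="", flush=True)
+```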
+
+### Notes
+
+Stopping services:
+
+```bash
+cd GenAIComps/
+docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml stop
+```
+
+`down` may be used instead of `stop` if you would also like to delete the containers and networks.
diff --git a/comps/llms/text-generation/llamacpp/__init__.py b/comps/llms/text-generation/llamacpp/__init__.py
new file mode 100644
index 0000000000..916f3a44b2
--- /dev/null
+++ b/comps/llms/text-generation/llamacpp/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml
new file mode 100644
index 0000000000..d66d93afd5
--- /dev/null
+++ b/comps/llms/text-generation/llamacpp/docker_compose_llm.yaml
@@ -0,0 +1,38 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  llamacpp-server:
+    image: ghcr.io/ggerganov/llama.cpp:server-b4419
+    ports:
+      - 8080:8080
+    environment:
+      # Refer to settings here: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md
+      # llama.cpp uses models in .gguf format; Hugging Face hosts many .gguf models.
+      LLAMA_ARG_MODEL_URL: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf
+      LLAMA_ARG_CTX_SIZE: 4096
+      LLAMA_ARG_N_PARALLEL: 2
+      LLAMA_ARG_ENDPOINT_METRICS: 1
+      LLAMA_ARG_PORT: 8080
+
+  llamacpp-opea-llm:
+    image: opea/llm-llamacpp:latest
+    build:
+      # This context is to allow the 'COPY comps' command in the Dockerfile.
+      # When using docker compose with -f, the build context (GenAIComps) is four levels up from docker_compose_llm.yaml.
+      context: ../../../../
+      dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile
+    depends_on:
+      - llamacpp-server
+    ports:
+      - "9000:9000"
+    network_mode: "host" # equivalent to: docker run --network host ...
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+    restart: unless-stopped
+
+networks:
+  default:
+    driver: bridge
diff --git a/comps/llms/text-generation/llamacpp/llm.py b/comps/llms/text-generation/llamacpp/llm.py
new file mode 100644
index 0000000000..5612199eb0
--- /dev/null
+++ b/comps/llms/text-generation/llamacpp/llm.py
@@ -0,0 +1,65 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+import openai
+from fastapi.responses import StreamingResponse
+
+from comps import CustomLogger, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
+
+logger = CustomLogger("llm_llamacpp")
+logflag = os.getenv("LOGFLAG", False)
+llamacpp_endpoint = os.getenv("LLAMACPP_ENDPOINT", "http://localhost:8080/")
+
+
+# OPEA microservice wrapper of llama.cpp.
+# The llama.cpp server uses the OpenAI API format: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md
+@register_microservice(
+    name="opea_service@llm_llamacpp",
+    service_type=ServiceType.LLM,
+    endpoint="/v1/chat/completions",
+    host="0.0.0.0",
+    port=9000,
+)
+async def llm_generate(input: LLMParamsDoc):
+    if logflag:
+        logger.info(input)
+        logger.info(llamacpp_endpoint)
+
+    # The llama.cpp server does not require an API key; base_url has the form "http://host:port".
+    client = openai.OpenAI(base_url=llamacpp_endpoint, api_key="sk-no-key-required")
+
+    # llama.cpp works with the OpenAI API format.
+    # The OpenAI API doesn't have a top_k parameter:
+    # https://community.openai.com/t/which-openai-gpt-models-if-any-allow-specifying-top-k/777982/2
+    chat_completion = client.chat.completions.create(
+        model=input.model,
+        messages=[{"role": "user", "content": input.query}],
+        max_tokens=input.max_tokens,
+        temperature=input.temperature,
+        top_p=input.top_p,
+        frequency_penalty=input.frequency_penalty,
+        presence_penalty=input.presence_penalty,
+        stream=input.streaming,
+    )
+
+    if input.streaming:
+
+        def stream_generator():
+            for c in chat_completion:
+                if logflag:
+                    logger.info(c)
+                yield f"data: {c.model_dump_json()}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(stream_generator(), media_type="text/event-stream")
+    else:
+        if logflag:
+            logger.info(chat_completion)
+        return chat_completion
+
+
+if __name__ == "__main__":
+    opea_microservices["opea_service@llm_llamacpp"].start()
diff --git a/comps/llms/text-generation/llamacpp/requirements.txt b/comps/llms/text-generation/llamacpp/requirements.txt
new file mode 100644
index 0000000000..fdb5f5a016
--- /dev/null
+++ b/comps/llms/text-generation/llamacpp/requirements.txt
@@ -0,0 +1,12 @@
+aiohttp
+docarray[full]
+fastapi
+huggingface_hub
+openai
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+prometheus-fastapi-instrumentator
+shortuuid
+transformers
+uvicorn