opea-project · chensuyue · Jan 20, 2025 · Jan 17, 2025 · Jan 17, 2025 · Jan 17, 2025
@@ -40,8 +40,8 @@ export EMBEDDING_SERVICE_HOST_IP=${host_ip}
 export RETRIEVER_SERVICE_HOST_IP=${host_ip}
 export RERANK_SERVICE_HOST_IP=${host_ip}
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8889/v1/retrievaltool"
-export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_file"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete_file"
+export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
+export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get"
+export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete"
 
 docker compose -f compose.yaml up -d
@@ -41,6 +41,6 @@ export EMBEDDING_SERVICE_HOST_IP=${host_ip}
 export RETRIEVER_SERVICE_HOST_IP=${host_ip}
 export RERANK_SERVICE_HOST_IP=${host_ip}
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8889/v1/retrievaltool"
-export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_file"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete_file"
+export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
+export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get"
+export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete"
@@ -53,7 +53,7 @@ def main():
     host_ip = args.host_ip
     port = args.port
     proxies = {"http": ""}
-    url = "http://{host_ip}:{port}/v1/dataprep".format(host_ip=host_ip, port=port)
+    url = "http://{host_ip}:{port}/v1/dataprep/ingest".format(host_ip=host_ip, port=port)
 
     # Split jsonl file into json files
     files = split_jsonl_into_txts(os.path.join(args.filedir, args.filename))

@@ -19,8 +19,8 @@ export EMBEDDING_SERVICE_HOST_IP=${host_ip}
 export RETRIEVER_SERVICE_HOST_IP=${host_ip}
 export RERANK_SERVICE_HOST_IP=${host_ip}
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8889/v1/retrievaltool"
-export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep"
-export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get_file"
-export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6009/v1/dataprep/delete_file"
+export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
+export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get"
+export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6009/v1/dataprep/delete"
 
 docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml up -d
@@ -21,7 +21,7 @@ function build_docker_images_for_retrieval_tool(){
     # git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
     get_genai_comps
     echo "Build all the images with --no-cache..."
-    service_list="doc-index-retriever dataprep-redis embedding retriever reranking"
+    service_list="doc-index-retriever dataprep embedding retriever reranking"
     docker compose -f build.yaml build ${service_list} --no-cache
     docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
 

@@ -202,8 +202,8 @@ Gaudi default compose.yaml
 | Embedding | Langchain | Xeon | 6000 | /v1/embeddings |
 | Retriever | Langchain, Redis | Xeon | 7000 | /v1/retrieval |
 | Reranking | Langchain, TEI | Gaudi | 8000 | /v1/reranking |
-| LLM | Langchain, vLLM | Gaudi | 9000 | /v1/chat/completions |
-| Dataprep | Redis, Langchain | Xeon | 6007 | /v1/dataprep |
+| LLM | Langchain, TGI | Gaudi | 9000 | /v1/chat/completions |
+| Dataprep | Redis, Langchain | Xeon | 6007 | /v1/dataprep/ingest |
 
 ### Required Models
 
@@ -294,7 +294,7 @@ Here is an example of `Nike 2023` pdf.
 # download pdf file
 wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf
 # upload pdf file with dataprep
-curl -X POST "http://${host_ip}:6007/v1/dataprep" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
     -H "Content-Type: multipart/form-data" \
     -F "files=@./nke-10k-2023.pdf"
 ```

@@ -72,14 +72,14 @@ python eval_multihop.py --docs_path MultiHop-RAG/dataset/corpus.json  --dataset_
 If you are using Kubernetes manifest/helm to deploy the `ChatQnA` system, you must specify more arguments as follows:
 
 ```bash
-python eval_multihop.py --docs_path MultiHop-RAG/dataset/corpus.json  --dataset_path MultiHop-RAG/dataset/MultiHopRAG.json --ingest_docs --retrieval_metrics --ragas_metrics --llm_endpoint http://{llm_as_judge_ip}:{llm_as_judge_port}/generate --database_endpoint http://{your_dataprep_ip}:{your_dataprep_port}/v1/dataprep --embedding_endpoint http://{your_embedding_ip}:{your_embedding_port}/v1/embeddings --tei_embedding_endpoint http://{your_tei_embedding_ip}:{your_tei_embedding_port} --retrieval_endpoint http://{your_retrieval_ip}:{your_retrieval_port}/v1/retrieval --service_url http://{your_chatqna_ip}:{your_chatqna_port}/v1/chatqna
+python eval_multihop.py --docs_path MultiHop-RAG/dataset/corpus.json  --dataset_path MultiHop-RAG/dataset/MultiHopRAG.json --ingest_docs --retrieval_metrics --ragas_metrics --llm_endpoint http://{llm_as_judge_ip}:{llm_as_judge_port}/generate --database_endpoint http://{your_dataprep_ip}:{your_dataprep_port}/v1/dataprep/ingest --embedding_endpoint http://{your_embedding_ip}:{your_embedding_port}/v1/embeddings --tei_embedding_endpoint http://{your_tei_embedding_ip}:{your_tei_embedding_port} --retrieval_endpoint http://{your_retrieval_ip}:{your_retrieval_port}/v1/retrieval --service_url http://{your_chatqna_ip}:{your_chatqna_port}/v1/chatqna
 ```
 
 The default values for arguments are:
 |Argument|Default value|
 |--------|-------------|
 |service_url|http://localhost:8888/v1/chatqna|
-|database_endpoint|http://localhost:6007/v1/dataprep|
+|database_endpoint|http://localhost:6007/v1/dataprep/ingest|
 |embedding_endpoint|http://localhost:6000/v1/embeddings|
 |tei_embedding_endpoint|http://localhost:8090|
 |retrieval_endpoint|http://localhost:7000/v1/retrieval|
@@ -139,14 +139,14 @@ python eval_crud.py --dataset_path ./data/split_merged.json --docs_path ./data/8
 If you are using Kubernetes manifest/helm to deploy the `ChatQnA` system, you must specify more arguments as follows:
 
 ```bash
-python eval_crud.py --dataset_path ./data/split_merged.json --docs_path ./data/80000_docs --ingest_docs --database_endpoint http://{your_dataprep_ip}:{your_dataprep_port}/v1/dataprep --embedding_endpoint http://{your_embedding_ip}:{your_embedding_port}/v1/embeddings --retrieval_endpoint http://{your_retrieval_ip}:{your_retrieval_port}/v1/retrieval --service_url http://{your_chatqna_ip}:{your_chatqna_port}/v1/chatqna
+python eval_crud.py --dataset_path ./data/split_merged.json --docs_path ./data/80000_docs --ingest_docs --database_endpoint http://{your_dataprep_ip}:{your_dataprep_port}/v1/dataprep/ingest --embedding_endpoint http://{your_embedding_ip}:{your_embedding_port}/v1/embeddings --retrieval_endpoint http://{your_retrieval_ip}:{your_retrieval_port}/v1/retrieval --service_url http://{your_chatqna_ip}:{your_chatqna_port}/v1/chatqna
 ```
 
 The default values for arguments are:
 |Argument|Default value|
 |--------|-------------|
 |service_url|http://localhost:8888/v1/chatqna|
-|database_endpoint|http://localhost:6007/v1/dataprep|
+|database_endpoint|http://localhost:6007/v1/dataprep/ingest|
 |embedding_endpoint|http://localhost:6000/v1/embeddings|
 |retrieval_endpoint|http://localhost:7000/v1/retrieval|
 |reranking_endpoint|http://localhost:8000/v1/reranking|

@@ -149,7 +149,7 @@ def args_parser():
     parser.add_argument("--tasks", default=["question_answering"], nargs="+", help="Task to perform")
     parser.add_argument("--ingest_docs", action="store_true", help="Whether to ingest documents to vector database")
     parser.add_argument(
-        "--database_endpoint", type=str, default="http://localhost:6007/v1/dataprep", help="Service URL address."
+        "--database_endpoint", type=str, default="http://localhost:6007/v1/dataprep/ingest", help="Service URL address."
     )
     parser.add_argument(
         "--embedding_endpoint", type=str, default="http://localhost:6000/v1/embeddings", help="Service URL address."

@@ -211,7 +211,7 @@ def args_parser():
     parser.add_argument("--ragas_metrics", action="store_true", help="Whether to compute ragas metrics.")
     parser.add_argument("--limits", type=int, default=100, help="Number of examples to be evaluated by llm-as-judge")
     parser.add_argument(
-        "--database_endpoint", type=str, default="http://localhost:6007/v1/dataprep", help="Service URL address."
+        "--database_endpoint", type=str, default="http://localhost:6007/v1/dataprep/ingest", help="Service URL address."
     )
     parser.add_argument(
         "--embedding_endpoint", type=str, default="http://localhost:6000/v1/embeddings", help="Service URL address."

@@ -164,7 +164,7 @@ Use the following `cURL` command to upload file:
 
 ```bash
 cd GenAIEval/evals/benchmark/data
-curl -X POST "http://${cluster_ip}:6007/v1/dataprep" \
+curl -X POST "http://${cluster_ip}:6007/v1/dataprep/ingest" \
      -H "Content-Type: multipart/form-data" \
      -F "chunk_size=3800" \
      -F "files=@./upload_file.txt"

@@ -65,7 +65,7 @@ Prepare and upload test document
 # download pdf file
 wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf
 # upload pdf file with dataprep
-curl -X POST "http://${host_ip}:6007/v1/dataprep" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
     -H "Content-Type: multipart/form-data" \
     -F "files=@./nke-10k-2023.pdf"
 ```
@@ -100,7 +100,7 @@ docker build --no-cache -t opea/retriever:latest --build-arg https_proxy=$https_
 ### 3. Build Dataprep Image
 
 ```bash
-docker build --no-cache -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain/Dockerfile .
+docker build --no-cache -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
 ```
 
 ### 4. Build MegaService Docker Image
@@ -144,7 +144,7 @@ docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-a
 Then run the command `docker images`, you will have the following 5 Docker Images:
 
 1. `opea/retriever:latest`
-2. `opea/dataprep-redis:latest`
+2. `opea/dataprep:latest`
 3. `opea/chatqna:latest`
 4. `opea/chatqna-ui:latest` or `opea/chatqna-react-ui:latest`
 5. `opea/nginx:latest`
@@ -192,9 +192,9 @@ Change the `xxx_MODEL_ID` below for your needs.
    export CHATQNA_MEGA_SERVICE_HOST_IP=${HOST_IP}
    export CHATQNA_RETRIEVER_SERVICE_HOST_IP=${HOST_IP}
    export CHATQNA_BACKEND_SERVICE_ENDPOINT="http://127.0.0.1:${CHATQNA_BACKEND_SERVICE_PORT}/v1/chatqna"
-   export CHATQNA_DATAPREP_SERVICE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep"
-   export CHATQNA_DATAPREP_GET_FILE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/get_file"
-   export CHATQNA_DATAPREP_DELETE_FILE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/delete_file"
+   export CHATQNA_DATAPREP_SERVICE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/ingest"
+   export CHATQNA_DATAPREP_GET_FILE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/get"
+   export CHATQNA_DATAPREP_DELETE_FILE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/delete"
    export CHATQNA_FRONTEND_SERVICE_IP=${HOST_IP}
    export CHATQNA_FRONTEND_SERVICE_PORT=5173
    export CHATQNA_BACKEND_SERVICE_NAME=chatqna
@@ -331,7 +331,7 @@ If you want to update the default knowledge base, you can use the following comm
 Update Knowledge Base via Local File Upload:
 
 ```bash
-curl -X POST "http://${host_ip}:6007/v1/dataprep" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
      -H "Content-Type: multipart/form-data" \
      -F "files=@./nke-10k-2023.pdf"
 ```
@@ -341,7 +341,7 @@ This command updates a knowledge base by uploading a local file for processing.
 Add Knowledge Base via HTTP Links:
 
 ```bash
-curl -X POST "http://${host_ip}:6007/v1/dataprep" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
      -H "Content-Type: multipart/form-data" \
      -F 'link_list=["https://opea.dev"]'
 ```
@@ -351,25 +351,25 @@ This command updates a knowledge base by submitting a list of HTTP links for pro
 Also, you are able to get the file list that you uploaded:
 
 ```bash
-curl -X POST "http://${host_ip}:6007/v1/dataprep/get_file" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/get" \
      -H "Content-Type: application/json"
 ```
 
 To delete the file/link you uploaded:
 
 ```bash
 # delete link
-curl -X POST "http://${host_ip}:6007/v1/dataprep/delete_file" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
      -d '{"file_path": "https://opea.dev"}' \
      -H "Content-Type: application/json"
 
 # delete file
-curl -X POST "http://${host_ip}:6007/v1/dataprep/delete_file" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
      -d '{"file_path": "nke-10k-2023.pdf"}' \
      -H "Content-Type: application/json"
 
 # delete all uploaded files and links
-curl -X POST "http://${host_ip}:6007/v1/dataprep/delete_file" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
      -d '{"file_path": "all"}' \
      -H "Content-Type: application/json"
 ```

@@ -9,13 +9,13 @@ services:
       - "${CHATQNA_REDIS_VECTOR_PORT}:6379"
       - "${CHATQNA_REDIS_VECTOR_INSIGHT_PORT}:8001"
   chatqna-dataprep-redis-service:
-    image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest}
+    image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-redis-server
     depends_on:
       - chatqna-redis-vector-db
       - chatqna-tei-embedding-service
     ports:
-      - "${CHATQNA_REDIS_DATAPREP_PORT}:6007"
+      - "${CHATQNA_REDIS_DATAPREP_PORT}:5000"
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}

@@ -19,9 +19,9 @@ export CHATQNA_INDEX_NAME="rag-redis"
 export CHATQNA_MEGA_SERVICE_HOST_IP=${HOST_IP}
 export CHATQNA_RETRIEVER_SERVICE_HOST_IP=${HOST_IP}
 export CHATQNA_BACKEND_SERVICE_ENDPOINT="http://127.0.0.1:${CHATQNA_BACKEND_SERVICE_PORT}/v1/chatqna"
-export CHATQNA_DATAPREP_SERVICE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep"
-export CHATQNA_DATAPREP_GET_FILE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/get_file"
-export CHATQNA_DATAPREP_DELETE_FILE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/delete_file"
+export CHATQNA_DATAPREP_SERVICE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/ingest"
+export CHATQNA_DATAPREP_GET_FILE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/get"
+export CHATQNA_DATAPREP_DELETE_FILE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/delete"
 export CHATQNA_FRONTEND_SERVICE_IP=${HOST_IP}
 export CHATQNA_FRONTEND_SERVICE_PORT=15173
 export CHATQNA_BACKEND_SERVICE_NAME=chatqna

@@ -27,7 +27,7 @@ docker build --no-cache -t opea/retriever:latest --build-arg https_proxy=$https_
 ### 2. Build Dataprep Image
 
 ```bash
-docker build --no-cache -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain/Dockerfile .
+docker build --no-cache -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
 cd ..
 ```
 
@@ -60,7 +60,7 @@ docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-a
 
 Then run the command `docker images`, you will have the following Docker Images:
 
-1. `opea/dataprep-redis:latest`
+1. `opea/dataprep:latest`
 2. `opea/retriever:latest`
 3. `opea/chatqna:latest`
 4. `opea/chatqna-ui:latest`
@@ -191,7 +191,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
 wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf
 
 # upload pdf file with dataprep
-curl -X POST "http://${host_ip}:6007/v1/dataprep" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
      -H "Content-Type: multipart/form-data" \
      -F "files=@./nke-10k-2023.pdf"
 ```
@@ -201,7 +201,7 @@ This command updates a knowledge base by uploading a local file for processing.
 Alternatively, you can add knowledge base via HTTP Links:
 
 ```bash
-curl -X POST "http://${host_ip}:6007/v1/dataprep" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
      -H "Content-Type: multipart/form-data" \
      -F 'link_list=["https://opea.dev"]'
 ```
@@ -211,7 +211,7 @@ This command updates a knowledge base by submitting a list of HTTP links for pro
 To check the uploaded files, you can get the list of files that you uploaded:
 
 ```bash
-curl -X POST "http://${host_ip}:6007/v1/dataprep/get_file" \
+curl -X POST "http://${host_ip}:6007/v1/dataprep/get" \
      -H "Content-Type: application/json"
 ```
 

@@ -9,13 +9,13 @@ services:
       - "6379:6379"
       - "8001:8001"
   dataprep-redis-service:
-    image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest}
+    image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-redis-server
     depends_on:
       - redis-vector-db
       - tei-embedding-service
     ports:
-      - "6007:6007"
+      - "6007:5000"
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
@@ -146,7 +146,7 @@ services:
       - BACKEND_SERVICE_IP=chatqna-aipc-backend-server
       - BACKEND_SERVICE_PORT=8888
       - DATAPREP_SERVICE_IP=dataprep-redis-service
-      - DATAPREP_SERVICE_PORT=6007
+      - DATAPREP_SERVICE_PORT=5000
     ipc: host
     restart: always