Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with docStoreConn
- Renamed deleteByQuery to delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py

### Type of change

- [x] Refactoring
infiniflow · Nov 12, 2024 · f4c5237 · f4c5237
1 parent 00b6000
commit f4c5237
Show file tree

Hide file tree

Showing 42 changed files with 2,621 additions and 1,852 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -78,7 +78,7 @@ jobs:
             echo "Waiting for service to be available..."
             sleep 5
           done
-          cd sdk/python && poetry install && source .venv/bin/activate && cd test && pytest t_dataset.py t_chat.py t_session.py t_document.py t_chunk.py
+          cd sdk/python && poetry install && source .venv/bin/activate && cd test && pytest --tb=short t_dataset.py t_chat.py t_session.py t_document.py t_chunk.py
 
       - name: Stop ragflow:dev
         if: always()  # always run this step even if previous steps failed

diff --git a/README.md b/README.md
@@ -285,7 +285,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
    git clone https://github.com/infiniflow/ragflow.git
    cd ragflow/
    export POETRY_VIRTUALENVS_CREATE=true POETRY_VIRTUALENVS_IN_PROJECT=true
-   ~/.local/bin/poetry install --sync --no-root # install RAGFlow dependent python modules
+   ~/.local/bin/poetry install --sync --no-root --with=full # install RAGFlow dependent python modules
    ```
 
 3. Launch the dependent services (MinIO, Elasticsearch, Redis, and MySQL) using Docker Compose:
@@ -295,7 +295,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
 
    Add the following line to `/etc/hosts` to resolve all hosts specified in **docker/service_conf.yaml** to `127.0.0.1`:
    ```
-   127.0.0.1       es01 mysql minio redis
+   127.0.0.1       es01 infinity mysql minio redis
    ```  
    In **docker/service_conf.yaml**, update mysql port to `5455` and es port to `1200`, as specified in **docker/.env**.
 

diff --git a/README_ja.md b/README_ja.md
@@ -250,7 +250,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
 
    `/etc/hosts` に以下の行を追加して、**docker/service_conf.yaml** に指定されたすべてのホストを `127.0.0.1` に解決します:  
    ```
-   127.0.0.1       es01 mysql minio redis
+   127.0.0.1       es01 infinity mysql minio redis
    ```  
    **docker/service_conf.yaml** で mysql のポートを `5455` に、es のポートを `1200` に更新します（**docker/.env** に指定された通り）.
 

diff --git a/README_ko.md b/README_ko.md
@@ -254,7 +254,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
 
    `/etc/hosts` 에 다음 줄을 추가하여 **docker/service_conf.yaml** 에 지정된 모든 호스트를 `127.0.0.1` 로 해결합니다:  
    ```
-   127.0.0.1       es01 mysql minio redis
+   127.0.0.1       es01 infinity mysql minio redis
    ```  
    **docker/service_conf.yaml** 에서 mysql 포트를 `5455` 로, es 포트를 `1200` 으로 업데이트합니다( **docker/.env** 에 지정된 대로).
 

diff --git a/README_zh.md b/README_zh.md
@@ -252,7 +252,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
 
    在 `/etc/hosts` 中添加以下代码，将 **docker/service_conf.yaml** 文件中的所有 host 地址都解析为 `127.0.0.1`：  
    ```
-   127.0.0.1       es01 mysql minio redis
+   127.0.0.1       es01 infinity mysql minio redis
    ```  
    在文件 **docker/service_conf.yaml** 中，对照 **docker/.env** 的配置将 mysql 端口更新为 `5455`，es 端口更新为 `1200`。
 

diff --git a/api/apps/api_app.py b/api/apps/api_app.py
@@ -529,13 +529,14 @@ def list_chunks():
             return get_json_result(
                 data=False, message="Can't find doc_name or doc_id"
             )
+        kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
 
-        res = retrievaler.chunk_list(doc_id=doc_id, tenant_id=tenant_id)
+        res = retrievaler.chunk_list(doc_id, tenant_id, kb_ids)
         res = [
             {
                 "content": res_item["content_with_weight"],
                 "doc_name": res_item["docnm_kwd"],
-                "img_id": res_item["img_id"]
+                "image_id": res_item["img_id"]
             } for res_item in res
         ]
 

diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
@@ -18,25 +18,22 @@
 
 from flask import request
 from flask_login import login_required, current_user
-from elasticsearch_dsl import Q
 
 from api.db.services.dialog_service import keyword_extraction
 from rag.app.qa import rmPrefix, beAdoc
 from rag.nlp import search, rag_tokenizer
-from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils import rmSpace
 from api.db import LLMType, ParserType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMBundle
 from api.db.services.user_service import UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.db.services.document_service import DocumentService
-from api.settings import RetCode, retrievaler, kg_retrievaler
+from api.settings import RetCode, retrievaler, kg_retrievaler, docStoreConn
 from api.utils.api_utils import get_json_result
 import hashlib
 import re
 
-
 @manager.route('/list', methods=['POST'])
 @login_required
 @validate_request("doc_id")
@@ -53,12 +50,13 @@ def list_chunk():
         e, doc = DocumentService.get_by_id(doc_id)
         if not e:
             return get_data_error_result(message="Document not found!")
+        kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
         query = {
             "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
         }
         if "available_int" in req:
             query["available_int"] = int(req["available_int"])
-        sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
+        sres = retrievaler.search(query, search.index_name(tenant_id), kb_ids, highlight=True)
         res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
         for id in sres.ids:
             d = {
@@ -69,16 +67,12 @@ def list_chunk():
                 "doc_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
                 "important_kwd": sres.field[id].get("important_kwd", []),
-                "img_id": sres.field[id].get("img_id", ""),
+                "image_id": sres.field[id].get("img_id", ""),
                 "available_int": sres.field[id].get("available_int", 1),
-                "positions": sres.field[id].get("position_int", "").split("\t")
+                "positions": json.loads(sres.field[id].get("position_list", "[]")),
             }
-            if len(d["positions"]) % 5 == 0:
-                poss = []
-                for i in range(0, len(d["positions"]), 5):
-                    poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
-                                 float(d["positions"][i + 3]), float(d["positions"][i + 4])])
-                d["positions"] = poss
+            assert isinstance(d["positions"], list)
+            assert len(d["positions"])==0 or (isinstance(d["positions"][0], list) and len(d["positions"][0]) == 5)
             res["chunks"].append(d)
         return get_json_result(data=res)
     except Exception as e:
@@ -96,22 +90,20 @@ def get():
         tenants = UserTenantService.query(user_id=current_user.id)
         if not tenants:
             return get_data_error_result(message="Tenant not found!")
-        res = ELASTICSEARCH.get(
-            chunk_id, search.index_name(
-                tenants[0].tenant_id))
-        if not res.get("found"):
+        tenant_id = tenants[0].tenant_id
+
+        kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
+        chunk = docStoreConn.get(chunk_id, search.index_name(tenant_id), kb_ids)
+        if chunk is None:
             return server_error_response("Chunk not found")
-        id = res["_id"]
-        res = res["_source"]
-        res["chunk_id"] = id
         k = []
-        for n in res.keys():
+        for n in chunk.keys():
             if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
                 k.append(n)
         for n in k:
-            del res[n]
+            del chunk[n]
 
-        return get_json_result(data=res)
+        return get_json_result(data=chunk)
     except Exception as e:
         if str(e).find("NotFoundError") >= 0:
             return get_json_result(data=False, message='Chunk not found!',
@@ -162,7 +154,7 @@ def set():
         v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
         v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
         d["q_%d_vec" % len(v)] = v.tolist()
-        ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
+        docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)
@@ -174,11 +166,11 @@ def set():
 def switch():
     req = request.json
     try:
-        tenant_id = DocumentService.get_tenant_id(req["doc_id"])
-        if not tenant_id:
-            return get_data_error_result(message="Tenant not found!")
-        if not ELASTICSEARCH.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]],
-                                    search.index_name(tenant_id)):
+        e, doc = DocumentService.get_by_id(req["doc_id"])
+        if not e:
+            return get_data_error_result(message="Document not found!")
+        if not docStoreConn.update({"id": req["chunk_ids"]}, {"available_int": int(req["available_int"])},
+                                    search.index_name(doc.tenant_id), doc.kb_id):
             return get_data_error_result(message="Index updating failure")
         return get_json_result(data=True)
     except Exception as e:
@@ -191,12 +183,11 @@ def switch():
 def rm():
     req = request.json
     try:
-        if not ELASTICSEARCH.deleteByQuery(
-                Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)):
-            return get_data_error_result(message="Index updating failure")
         e, doc = DocumentService.get_by_id(req["doc_id"])
         if not e:
             return get_data_error_result(message="Document not found!")
+        if not docStoreConn.delete({"id": req["chunk_ids"]}, search.index_name(current_user.id), doc.kb_id):
+            return get_data_error_result(message="Index updating failure")
         deleted_chunk_ids = req["chunk_ids"]
         chunk_number = len(deleted_chunk_ids)
         DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
@@ -239,7 +230,7 @@ def create():
         v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
         v = 0.1 * v[0] + 0.9 * v[1]
         d["q_%d_vec" % len(v)] = v.tolist()
-        ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
+        docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
 
         DocumentService.increment_chunk_num(
             doc.id, doc.kb_id, c, 1, 0)
@@ -256,26 +247,27 @@ def retrieval_test():
     page = int(req.get("page", 1))
     size = int(req.get("size", 30))
     question = req["question"]
-    kb_id = req["kb_id"]
-    if isinstance(kb_id, str): kb_id = [kb_id]
+    kb_ids = req["kb_id"]
+    if isinstance(kb_ids, str):
+        kb_ids = [kb_ids]
     doc_ids = req.get("doc_ids", [])
     similarity_threshold = float(req.get("similarity_threshold", 0.0))
     vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
     top = int(req.get("top_k", 1024))
 
     try:
         tenants = UserTenantService.query(user_id=current_user.id)
-        for kid in kb_id:
+        for kb_id in kb_ids:
             for tenant in tenants:
                 if KnowledgebaseService.query(
-                        tenant_id=tenant.tenant_id, id=kid):
+                        tenant_id=tenant.tenant_id, id=kb_id):
                     break
             else:
                 return get_json_result(
                     data=False, message='Only owner of knowledgebase authorized for this operation.',
                     code=RetCode.OPERATING_ERROR)
 
-        e, kb = KnowledgebaseService.get_by_id(kb_id[0])
+        e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
         if not e:
             return get_data_error_result(message="Knowledgebase not found!")
 
@@ -290,7 +282,7 @@ def retrieval_test():
             question += keyword_extraction(chat_mdl, question)
 
         retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
-        ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_id, page, size,
+        ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, page, size,
                                similarity_threshold, vector_similarity_weight, top,
                                doc_ids, rerank_mdl=rerank_mdl, highlight=req.get("highlight"))
         for c in ranks["chunks"]:
@@ -309,12 +301,16 @@ def retrieval_test():
 @login_required
 def knowledge_graph():
     doc_id = request.args["doc_id"]
+    e, doc = DocumentService.get_by_id(doc_id)
+    if not e:
+        return get_data_error_result(message="Document not found!")
+    tenant_id = DocumentService.get_tenant_id(doc_id)
+    kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
     req = {
         "doc_ids":[doc_id],
         "knowledge_graph_kwd": ["graph", "mind_map"]
     }
-    tenant_id = DocumentService.get_tenant_id(doc_id)
-    sres = retrievaler.search(req, search.index_name(tenant_id))
+    sres = retrievaler.search(req, search.index_name(tenant_id), kb_ids, doc.kb_id)
     obj = {"graph": {}, "mind_map": {}}
     for id in sres.ids[:2]:
         ty = sres.field[id]["knowledge_graph_kwd"]

diff --git a/api/apps/document_app.py b/api/apps/document_app.py
@@ -17,7 +17,6 @@
 import re
 
 import flask
-from elasticsearch_dsl import Q
 from flask import request
 from flask_login import login_required, current_user
 
@@ -27,14 +26,13 @@
 from api.db.services.task_service import TaskService, queue_tasks
 from api.db.services.user_service import UserTenantService
 from rag.nlp import search
-from rag.utils.es_conn import ELASTICSEARCH
 from api.db.services import duplicate_name
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid
 from api.db import FileType, TaskStatus, ParserType, FileSource
 from api.db.services.document_service import DocumentService, doc_upload_and_parse
-from api.settings import RetCode
+from api.settings import RetCode, docStoreConn
 from api.utils.api_utils import get_json_result
 from rag.utils.storage_factory import STORAGE_IMPL
 from api.utils.file_utils import filename_type, thumbnail
@@ -275,18 +273,8 @@ def change_status():
             return get_data_error_result(
                 message="Database error (Document update)!")
 
-        if str(req["status"]) == "0":
-            ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
-                                              scripts="ctx._source.available_int=0;",
-                                              idxnm=search.index_name(
-                                                  kb.tenant_id)
-                                              )
-        else:
-            ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
-                                              scripts="ctx._source.available_int=1;",
-                                              idxnm=search.index_name(
-                                                  kb.tenant_id)
-                                              )
+        status = int(req["status"])
+        docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)
@@ -365,8 +353,11 @@ def run():
             tenant_id = DocumentService.get_tenant_id(id)
             if not tenant_id:
                 return get_data_error_result(message="Tenant not found!")
-            ELASTICSEARCH.deleteByQuery(
-                Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+            e, doc = DocumentService.get_by_id(id)
+            if not e:
+                return get_data_error_result(message="Document not found!")
+            if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
+                docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
 
             if str(req["run"]) == TaskStatus.RUNNING.value:
                 TaskService.filter_delete([Task.doc_id == id])
@@ -490,8 +481,8 @@ def change_parser():
             tenant_id = DocumentService.get_tenant_id(req["doc_id"])
             if not tenant_id:
                 return get_data_error_result(message="Tenant not found!")
-            ELASTICSEARCH.deleteByQuery(
-                Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+            if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
+                docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
 
         return get_json_result(data=True)
     except Exception as e:

diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py
@@ -28,6 +28,8 @@
 from api.db.db_models import File
 from api.settings import RetCode
 from api.utils.api_utils import get_json_result
+from api.settings import docStoreConn
+from rag.nlp import search
 
 
 @manager.route('/create', methods=['post'])
@@ -166,6 +168,9 @@ def rm():
         if not KnowledgebaseService.delete_by_id(req["kb_id"]):
             return get_data_error_result(
                 message="Database error (Knowledgebase removal)!")
+        tenants = UserTenantService.query(user_id=current_user.id)
+        for tenant in tenants:
+            docStoreConn.deleteIdx(search.index_name(tenant.tenant_id), req["kb_id"])
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)