Skip to content

Commit

Permalink
Passed knowledge base retrieval testing
Browse files Browse the repository at this point in the history
Renamed deleteByQuery with delete
Renamed bulk to upsertBulk
getHighlight
Replaced ELASTICSEARCH with dataStoreConn
Fix KGSearch.search
Moved Dealer.sql_retrieval to es_conn.py
getAggregation
  • Loading branch information
yuzhichang committed Oct 18, 2024
1 parent c760f05 commit 21376ec
Show file tree
Hide file tree
Showing 22 changed files with 1,078 additions and 766 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
git clone https://github.com/infiniflow/ragflow.git
cd ragflow/
export POETRY_VIRTUALENVS_CREATE=true POETRY_VIRTUALENVS_IN_PROJECT=true
~/.local/bin/poetry install --sync --no-root # install RAGFlow dependent python modules
~/.local/bin/poetry install --sync --no-root --with=full # install RAGFlow dependent python modules
```
3. Launch the dependent services (MinIO, Elasticsearch, Redis, and MySQL) using Docker Compose:
Expand Down
25 changes: 11 additions & 14 deletions api/apps/chunk_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,17 @@

from flask import request
from flask_login import login_required, current_user
from elasticsearch_dsl import Q

from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import search, rag_tokenizer, keyword_extraction
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils import rmSpace
from api.db import LLMType, ParserType
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api.db.services.user_service import UserTenantService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.db.services.document_service import DocumentService
from api.settings import RetCode, retrievaler, kg_retrievaler
from api.settings import RetCode, retrievaler, kg_retrievaler, docStoreConn
from api.utils.api_utils import get_json_result
import hashlib
import re
Expand Down Expand Up @@ -83,7 +81,7 @@ def list_chunk():
return get_json_result(data=res)
except Exception as e:
if str(e).find("not_found") > 0:
return get_json_result(data=False, retmsg=f'No chunk found!',
return get_json_result(data=False, retmsg='No chunk found!',
retcode=RetCode.DATA_ERROR)
return server_error_response(e)

Expand All @@ -96,7 +94,7 @@ def get():
tenants = UserTenantService.query(user_id=current_user.id)
if not tenants:
return get_data_error_result(retmsg="Tenant not found!")
res = ELASTICSEARCH.get(
res = docStoreConn.get(
chunk_id, search.index_name(
tenants[0].tenant_id))
if not res.get("found"):
Expand All @@ -114,7 +112,7 @@ def get():
return get_json_result(data=res)
except Exception as e:
if str(e).find("NotFoundError") >= 0:
return get_json_result(data=False, retmsg=f'Chunk not found!',
return get_json_result(data=False, retmsg='Chunk not found!',
retcode=RetCode.DATA_ERROR)
return server_error_response(e)

Expand Down Expand Up @@ -162,7 +160,7 @@ def set():
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.upsert([d], search.index_name(tenant_id))
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
Expand All @@ -177,7 +175,7 @@ def switch():
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(retmsg="Tenant not found!")
if not ELASTICSEARCH.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]],
if not docStoreConn.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]],
search.index_name(tenant_id)):
return get_data_error_result(retmsg="Index updating failure")
return get_json_result(data=True)
Expand All @@ -191,8 +189,7 @@ def switch():
def rm():
req = request.json
try:
if not ELASTICSEARCH.deleteByQuery(
Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)):
if not docStoreConn.delete({"_id": req["chunk_ids"]}, search.index_name(current_user.id)):
return get_data_error_result(retmsg="Index updating failure")
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
Expand Down Expand Up @@ -239,7 +236,7 @@ def create():
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.upsert([d], search.index_name(tenant_id))

DocumentService.increment_chunk_num(
doc.id, doc.kb_id, c, 1, 0)
Expand Down Expand Up @@ -272,7 +269,7 @@ def retrieval_test():
break
else:
return get_json_result(
data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.',
data=False, retmsg='Only owner of knowledgebase authorized for this operation.',
retcode=RetCode.OPERATING_ERROR)

e, kb = KnowledgebaseService.get_by_id(kb_id[0])
Expand Down Expand Up @@ -300,7 +297,7 @@ def retrieval_test():
return get_json_result(data=ranks)
except Exception as e:
if str(e).find("not_found") > 0:
return get_json_result(data=False, retmsg=f'No chunk found! Check the chunk status please!',
return get_json_result(data=False, retmsg='No chunk found! Check the chunk status please!',
retcode=RetCode.DATA_ERROR)
return server_error_response(e)

Expand All @@ -320,7 +317,7 @@ def knowledge_graph():
ty = sres.field[id]["knowledge_graph_kwd"]
try:
obj[ty] = json.loads(sres.field[id]["content_with_weight"])
except Exception as e:
except Exception:
print(traceback.format_exc(), flush=True)

return get_json_result(data=obj)
Expand Down
15 changes: 6 additions & 9 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,13 @@
from api.db.services.task_service import TaskService, queue_tasks
from api.db.services.user_service import UserTenantService
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from api.db.services import duplicate_name
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from api.db import FileType, TaskStatus, ParserType, FileSource
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.settings import RetCode
from api.settings import RetCode, docStoreConn
from api.utils.api_utils import get_json_result
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.file_utils import filename_type, thumbnail
Expand Down Expand Up @@ -187,7 +186,7 @@ def list_docs():
break
else:
return get_json_result(
data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.',
data=False, retmsg='Only owner of knowledgebase authorized for this operation.',
retcode=RetCode.OPERATING_ERROR)
keywords = request.args.get("keywords", "")

Expand Down Expand Up @@ -276,13 +275,13 @@ def change_status():
retmsg="Database error (Document update)!")

if str(req["status"]) == "0":
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
docStoreConn.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
scripts="ctx._source.available_int=0;",
idxnm=search.index_name(
kb.tenant_id)
)
else:
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
docStoreConn.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
scripts="ctx._source.available_int=1;",
idxnm=search.index_name(
kb.tenant_id)
Expand Down Expand Up @@ -365,8 +364,7 @@ def run():
tenant_id = DocumentService.get_tenant_id(id)
if not tenant_id:
return get_data_error_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id))

if str(req["run"]) == TaskStatus.RUNNING.value:
TaskService.filter_delete([Task.doc_id == id])
Expand Down Expand Up @@ -490,8 +488,7 @@ def change_parser():
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id))

return get_json_result(data=True)
except Exception as e:
Expand Down
4 changes: 0 additions & 4 deletions api/apps/file2document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
#
from elasticsearch_dsl import Q

from api.db.db_models import File2Document
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService

Expand All @@ -28,8 +26,6 @@
from api.db.services.document_service import DocumentService
from api.settings import RetCode
from api.utils.api_utils import get_json_result
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH


@manager.route('/convert', methods=['POST'])
Expand Down
3 changes: 0 additions & 3 deletions api/apps/file_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import re

import flask
from elasticsearch_dsl import Q
from flask import request
from flask_login import login_required, current_user

Expand All @@ -32,8 +31,6 @@
from api.settings import RetCode
from api.utils.api_utils import get_json_result
from api.utils.file_utils import filename_type
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.storage_factory import STORAGE_IMPL


Expand Down
6 changes: 3 additions & 3 deletions api/apps/kb_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def update():
if not KnowledgebaseService.query(
created_by=current_user.id, id=req["kb_id"]):
return get_json_result(
data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.', retcode=RetCode.OPERATING_ERROR)
data=False, retmsg='Only owner of knowledgebase authorized for this operation.', retcode=RetCode.OPERATING_ERROR)

e, kb = KnowledgebaseService.get_by_id(req["kb_id"])
if not e:
Expand Down Expand Up @@ -110,7 +110,7 @@ def detail():
break
else:
return get_json_result(
data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.',
data=False, retmsg='Only owner of knowledgebase authorized for this operation.',
retcode=RetCode.OPERATING_ERROR)
kb = KnowledgebaseService.get_detail(kb_id)
if not kb:
Expand Down Expand Up @@ -153,7 +153,7 @@ def rm():
created_by=current_user.id, id=req["kb_id"])
if not kbs:
return get_json_result(
data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.', retcode=RetCode.OPERATING_ERROR)
data=False, retmsg='Only owner of knowledgebase authorized for this operation.', retcode=RetCode.OPERATING_ERROR)

for doc in DocumentService.query(kb_id=req["kb_id"]):
if not DocumentService.remove_document(doc, kbs[0].tenant_id):
Expand Down
58 changes: 14 additions & 44 deletions api/apps/sdk/doc.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,36 @@
import pathlib
import re
import datetime
import json
import traceback

from botocore.docs.method import document_model_driven_method
from flask import request
from flask_login import login_required, current_user
from elasticsearch_dsl import Q
from pygments import highlight
from sphinx.addnodes import document

from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import search, rag_tokenizer, keyword_extraction
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils import rmSpace
from api.db import LLMType, ParserType
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import TenantLLMService
from api.db.services.user_service import UserTenantService
from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
from api.utils.api_utils import server_error_response, get_error_data_result
from api.db.services.document_service import DocumentService
from api.settings import RetCode, retrievaler, kg_retrievaler
from api.settings import RetCode, retrievaler, kg_retrievaler, docStoreConn
from api.utils.api_utils import get_result
import hashlib
import re
from api.utils.api_utils import get_result, token_required, get_error_data_result
from api.utils.api_utils import token_required

from api.db.db_models import Task, File

from api.db.services.task_service import TaskService, queue_tasks
from api.db.services.user_service import TenantService, UserTenantService

from api.utils.api_utils import server_error_response, get_error_data_result, validate_request

from api.utils.api_utils import get_result, get_result, get_error_data_result

from functools import partial
from io import BytesIO

from elasticsearch_dsl import Q
from flask import request, send_file
from flask_login import login_required
from flask import send_file

from api.db import FileSource, TaskStatus, FileType
from api.db.db_models import File
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.settings import RetCode, retrievaler
from api.utils.api_utils import construct_json_result, construct_error_response
from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
from rag.nlp import search
from rag.utils import rmSpace
from rag.utils.es_conn import ELASTICSEARCH
from api.utils.api_utils import construct_json_result
from rag.utils.storage_factory import STORAGE_IMPL

MAXIMUM_OF_UPLOADING_FILES = 256
Expand Down Expand Up @@ -142,8 +118,7 @@ def update_doc(tenant_id, dataset_id, document_id):
tenant_id = DocumentService.get_tenant_id(req["id"])
if not tenant_id:
return get_error_data_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id))

return get_result()

Expand Down Expand Up @@ -265,8 +240,7 @@ def parse(tenant_id,dataset_id):
info["token_num"] = 0
DocumentService.update_by_id(id, info)
# if str(req["run"]) == TaskStatus.CANCEL.value:
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id))
TaskService.filter_delete([Task.doc_id == id])
e, doc = DocumentService.get_by_id(id)
doc = doc.to_dict()
Expand All @@ -293,8 +267,7 @@ def stop_parsing(tenant_id,dataset_id):
DocumentService.update_by_id(id, info)
# if str(req["run"]) == TaskStatus.CANCEL.value:
tenant_id = DocumentService.get_tenant_id(id)
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id))
return get_result()


Expand Down Expand Up @@ -402,7 +375,7 @@ def create(tenant_id,dataset_id,document_id):
v, c = embd_mdl.encode([doc.name, req["content"]])
v = 0.1 * v[0] + 0.9 * v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.upsert([d], search.index_name(tenant_id))

DocumentService.increment_chunk_num(
doc.id, doc.kb_id, c, 1, 0)
Expand Down Expand Up @@ -445,8 +418,7 @@ def rm_chunk(tenant_id,dataset_id,document_id):
for chunk_id in req.get("chunk_ids"):
if chunk_id not in sres.ids:
return get_error_data_result(f"Chunk {chunk_id} not found")
if not ELASTICSEARCH.deleteByQuery(
Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
if not docStoreConn.delete({"_id": req["chunk_ids"]}, search.index_name(tenant_id)):
return get_error_data_result(retmsg="Index updating failure")
deleted_chunk_ids = req["chunk_ids"]
chunk_number = len(deleted_chunk_ids)
Expand All @@ -459,10 +431,8 @@ def rm_chunk(tenant_id,dataset_id,document_id):
@token_required
def set(tenant_id,dataset_id,document_id,chunk_id):
try:
res = ELASTICSEARCH.get(
chunk_id, search.index_name(
tenant_id))
except Exception as e:
res = docStoreConn.get(chunk_id, search.index_name(tenant_id))
except Exception:
return get_error_data_result(f"Can't find this chunk {chunk_id}")
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
Expand Down Expand Up @@ -508,7 +478,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id):
v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.upsert([d], search.index_name(tenant_id))
return get_result()


Expand Down Expand Up @@ -580,6 +550,6 @@ def retrieval_test(tenant_id):
return get_result(data=ranks)
except Exception as e:
if str(e).find("not_found") > 0:
return get_result(retmsg=f'No chunk found! Check the chunk statu s please!',
return get_result(retmsg='No chunk found! Check the chunk status please!',
retcode=RetCode.DATA_ERROR)
return server_error_response(e)
Loading

0 comments on commit 21376ec

Please sign in to comment.