Skip to content

Commit

Permalink
Passed knowledge base retrieval testing
Browse files Browse the repository at this point in the history
Renamed deleteByQuery with delete
Renamed bulk to upsertBulk
getHighlight
Replaced ELASTICSEARCH with dataStoreConn
Fix KGSearch.search
Moved Dealer.sql_retrieval to es_conn.py
getAggregation
update numpy to ^1.26.0
Integrated Infinity
Handle ElasticSearch keyword
Replaced image_id with img_id
fix _id
img_id -> image_id
Added infinity-sdk
Renamed chunk_id to id
Fixed unit tests
Updated benchmark.py
Dealer.search kb_ids
updated benchmark.py
Update infinity to v0.5.0-dev2
Added infinity health
rm_chunk invoke decrement_chunk_num
Moved infinity table schema into a file
Improved benchmark.py
invoke deleteIdx
  • Loading branch information
yuzhichang committed Nov 12, 2024
1 parent db23d62 commit b7e1020
Show file tree
Hide file tree
Showing 42 changed files with 2,621 additions and 1,852 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ jobs:
echo "Waiting for service to be available..."
sleep 5
done
cd sdk/python && poetry install && source .venv/bin/activate && cd test && pytest t_dataset.py t_chat.py t_session.py t_document.py t_chunk.py
cd sdk/python && poetry install && source .venv/bin/activate && cd test && pytest --tb=short t_dataset.py t_chat.py t_session.py t_document.py t_chunk.py
- name: Stop ragflow:dev
if: always() # always run this step even if previous steps failed
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
git clone https://github.com/infiniflow/ragflow.git
cd ragflow/
export POETRY_VIRTUALENVS_CREATE=true POETRY_VIRTUALENVS_IN_PROJECT=true
~/.local/bin/poetry install --sync --no-root # install RAGFlow dependent python modules
~/.local/bin/poetry install --sync --no-root --with=full # install RAGFlow dependent python modules
```
3. Launch the dependent services (MinIO, Elasticsearch, Redis, and MySQL) using Docker Compose:
Expand All @@ -295,7 +295,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
Add the following line to `/etc/hosts` to resolve all hosts specified in **docker/service_conf.yaml** to `127.0.0.1`:
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
In **docker/service_conf.yaml**, update mysql port to `5455` and es port to `1200`, as specified in **docker/.env**.
Expand Down
2 changes: 1 addition & 1 deletion README_ja.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
`/etc/hosts` に以下の行を追加して、**docker/service_conf.yaml** に指定されたすべてのホストを `127.0.0.1` に解決します:
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
**docker/service_conf.yaml** で mysql のポートを `5455` に、es のポートを `1200` に更新します(**docker/.env** に指定された通り).
Expand Down
2 changes: 1 addition & 1 deletion README_ko.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
`/etc/hosts` 에 다음 줄을 추가하여 **docker/service_conf.yaml** 에 지정된 모든 호스트를 `127.0.0.1` 로 해결합니다:
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
**docker/service_conf.yaml** 에서 mysql 포트를 `5455` 로, es 포트를 `1200` 으로 업데이트합니다( **docker/.env** 에 지정된 대로).
Expand Down
2 changes: 1 addition & 1 deletion README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
`/etc/hosts` 中添加以下代码,将 **docker/service_conf.yaml** 文件中的所有 host 地址都解析为 `127.0.0.1`
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
在文件 **docker/service_conf.yaml** 中,对照 **docker/.env** 的配置将 mysql 端口更新为 `5455`,es 端口更新为 `1200`
Expand Down
5 changes: 3 additions & 2 deletions api/apps/api_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,13 +529,14 @@ def list_chunks():
return get_json_result(
data=False, message="Can't find doc_name or doc_id"
)
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)

res = retrievaler.chunk_list(doc_id=doc_id, tenant_id=tenant_id)
res = retrievaler.chunk_list(doc_id, tenant_id, kb_ids)
res = [
{
"content": res_item["content_with_weight"],
"doc_name": res_item["docnm_kwd"],
"img_id": res_item["img_id"]
"image_id": res_item["img_id"]
} for res_item in res
]

Expand Down
78 changes: 37 additions & 41 deletions api/apps/chunk_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,22 @@

from flask import request
from flask_login import login_required, current_user
from elasticsearch_dsl import Q

from api.db.services.dialog_service import keyword_extraction
from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import search, rag_tokenizer
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils import rmSpace
from api.db import LLMType, ParserType
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api.db.services.user_service import UserTenantService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.db.services.document_service import DocumentService
from api.settings import RetCode, retrievaler, kg_retrievaler
from api.settings import RetCode, retrievaler, kg_retrievaler, docStoreConn
from api.utils.api_utils import get_json_result
import hashlib
import re


@manager.route('/list', methods=['POST'])
@login_required
@validate_request("doc_id")
Expand All @@ -53,12 +50,13 @@ def list_chunk():
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_data_error_result(message="Document not found!")
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
query = {
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
}
if "available_int" in req:
query["available_int"] = int(req["available_int"])
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
sres = retrievaler.search(query, search.index_name(tenant_id), kb_ids, highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
for id in sres.ids:
d = {
Expand All @@ -69,16 +67,12 @@ def list_chunk():
"doc_id": sres.field[id]["doc_id"],
"docnm_kwd": sres.field[id]["docnm_kwd"],
"important_kwd": sres.field[id].get("important_kwd", []),
"img_id": sres.field[id].get("img_id", ""),
"image_id": sres.field[id].get("img_id", ""),
"available_int": sres.field[id].get("available_int", 1),
"positions": sres.field[id].get("position_int", "").split("\t")
"positions": json.loads(sres.field[id].get("position_list", "[]")),
}
if len(d["positions"]) % 5 == 0:
poss = []
for i in range(0, len(d["positions"]), 5):
poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
float(d["positions"][i + 3]), float(d["positions"][i + 4])])
d["positions"] = poss
assert isinstance(d["positions"], list)
assert len(d["positions"])==0 or (isinstance(d["positions"][0], list) and len(d["positions"][0]) == 5)
res["chunks"].append(d)
return get_json_result(data=res)
except Exception as e:
Expand All @@ -96,22 +90,20 @@ def get():
tenants = UserTenantService.query(user_id=current_user.id)
if not tenants:
return get_data_error_result(message="Tenant not found!")
res = ELASTICSEARCH.get(
chunk_id, search.index_name(
tenants[0].tenant_id))
if not res.get("found"):
tenant_id = tenants[0].tenant_id

kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
chunk = docStoreConn.get(chunk_id, search.index_name(tenant_id), kb_ids)
if chunk is None:
return server_error_response("Chunk not found")
id = res["_id"]
res = res["_source"]
res["chunk_id"] = id
k = []
for n in res.keys():
for n in chunk.keys():
if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
k.append(n)
for n in k:
del res[n]
del chunk[n]

return get_json_result(data=res)
return get_json_result(data=chunk)
except Exception as e:
if str(e).find("NotFoundError") >= 0:
return get_json_result(data=False, message='Chunk not found!',
Expand Down Expand Up @@ -162,7 +154,7 @@ def set():
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
Expand All @@ -174,11 +166,11 @@ def set():
def switch():
req = request.json
try:
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
if not ELASTICSEARCH.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]],
search.index_name(tenant_id)):
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if not docStoreConn.update({"id": req["chunk_ids"]}, {"available_int": int(req["available_int"])},
search.index_name(doc.tenant_id), doc.kb_id):
return get_data_error_result(message="Index updating failure")
return get_json_result(data=True)
except Exception as e:
Expand All @@ -191,12 +183,11 @@ def switch():
def rm():
req = request.json
try:
if not ELASTICSEARCH.deleteByQuery(
Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)):
return get_data_error_result(message="Index updating failure")
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if not docStoreConn.delete({"id": req["chunk_ids"]}, search.index_name(current_user.id), doc.kb_id):
return get_data_error_result(message="Index updating failure")
deleted_chunk_ids = req["chunk_ids"]
chunk_number = len(deleted_chunk_ids)
DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
Expand Down Expand Up @@ -239,7 +230,7 @@ def create():
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)

DocumentService.increment_chunk_num(
doc.id, doc.kb_id, c, 1, 0)
Expand All @@ -256,26 +247,27 @@ def retrieval_test():
page = int(req.get("page", 1))
size = int(req.get("size", 30))
question = req["question"]
kb_id = req["kb_id"]
if isinstance(kb_id, str): kb_id = [kb_id]
kb_ids = req["kb_id"]
if isinstance(kb_ids, str):
kb_ids = [kb_ids]
doc_ids = req.get("doc_ids", [])
similarity_threshold = float(req.get("similarity_threshold", 0.0))
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
top = int(req.get("top_k", 1024))

try:
tenants = UserTenantService.query(user_id=current_user.id)
for kid in kb_id:
for kb_id in kb_ids:
for tenant in tenants:
if KnowledgebaseService.query(
tenant_id=tenant.tenant_id, id=kid):
tenant_id=tenant.tenant_id, id=kb_id):
break
else:
return get_json_result(
data=False, message='Only owner of knowledgebase authorized for this operation.',
code=RetCode.OPERATING_ERROR)

e, kb = KnowledgebaseService.get_by_id(kb_id[0])
e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
if not e:
return get_data_error_result(message="Knowledgebase not found!")

Expand All @@ -290,7 +282,7 @@ def retrieval_test():
question += keyword_extraction(chat_mdl, question)

retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_id, page, size,
ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, page, size,
similarity_threshold, vector_similarity_weight, top,
doc_ids, rerank_mdl=rerank_mdl, highlight=req.get("highlight"))
for c in ranks["chunks"]:
Expand All @@ -309,12 +301,16 @@ def retrieval_test():
@login_required
def knowledge_graph():
doc_id = request.args["doc_id"]
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_data_error_result(message="Document not found!")
tenant_id = DocumentService.get_tenant_id(doc_id)
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
req = {
"doc_ids":[doc_id],
"knowledge_graph_kwd": ["graph", "mind_map"]
}
tenant_id = DocumentService.get_tenant_id(doc_id)
sres = retrievaler.search(req, search.index_name(tenant_id))
sres = retrievaler.search(req, search.index_name(tenant_id), kb_ids, doc.kb_id)
obj = {"graph": {}, "mind_map": {}}
for id in sres.ids[:2]:
ty = sres.field[id]["knowledge_graph_kwd"]
Expand Down
29 changes: 10 additions & 19 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import re

import flask
from elasticsearch_dsl import Q
from flask import request
from flask_login import login_required, current_user

Expand All @@ -27,14 +26,13 @@
from api.db.services.task_service import TaskService, queue_tasks
from api.db.services.user_service import UserTenantService
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from api.db.services import duplicate_name
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from api.db import FileType, TaskStatus, ParserType, FileSource
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.settings import RetCode
from api.settings import RetCode, docStoreConn
from api.utils.api_utils import get_json_result
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.file_utils import filename_type, thumbnail
Expand Down Expand Up @@ -275,18 +273,8 @@ def change_status():
return get_data_error_result(
message="Database error (Document update)!")

if str(req["status"]) == "0":
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
scripts="ctx._source.available_int=0;",
idxnm=search.index_name(
kb.tenant_id)
)
else:
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
scripts="ctx._source.available_int=1;",
idxnm=search.index_name(
kb.tenant_id)
)
status = int(req["status"])
docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
Expand Down Expand Up @@ -365,8 +353,11 @@ def run():
tenant_id = DocumentService.get_tenant_id(id)
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
e, doc = DocumentService.get_by_id(id)
if not e:
return get_data_error_result(message="Document not found!")
if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)

if str(req["run"]) == TaskStatus.RUNNING.value:
TaskService.filter_delete([Task.doc_id == id])
Expand Down Expand Up @@ -490,8 +481,8 @@ def change_parser():
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)

return get_json_result(data=True)
except Exception as e:
Expand Down
5 changes: 5 additions & 0 deletions api/apps/kb_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
from api.db.db_models import File
from api.settings import RetCode
from api.utils.api_utils import get_json_result
from api.settings import docStoreConn
from rag.nlp import search


@manager.route('/create', methods=['post'])
Expand Down Expand Up @@ -166,6 +168,9 @@ def rm():
if not KnowledgebaseService.delete_by_id(req["kb_id"]):
return get_data_error_result(
message="Database error (Knowledgebase removal)!")
tenants = UserTenantService.query(user_id=current_user.id)
for tenant in tenants:
docStoreConn.deleteIdx(search.index_name(tenant.tenant_id), req["kb_id"])
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
Loading

0 comments on commit b7e1020

Please sign in to comment.