Skip to content

Commit

Permalink
Passed knowledge base retrieval testing
Browse files Browse the repository at this point in the history
Renamed deleteByQuery with delete
Renamed bulk to upsertBulk
getHighlight
Replaced ELASTICSEARCH with dataStoreConn
Fix KGSearch.search
Moved Dealer.sql_retrieval to es_conn.py
getAggregation
update numpy to ^1.26.0
Integrated Infinity
Handle ElasticSearch keyword
  • Loading branch information
yuzhichang committed Oct 29, 2024
1 parent c5a3146 commit 0b8edaa
Show file tree
Hide file tree
Showing 46 changed files with 2,086 additions and 1,496 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
git clone https://github.com/infiniflow/ragflow.git
cd ragflow/
export POETRY_VIRTUALENVS_CREATE=true POETRY_VIRTUALENVS_IN_PROJECT=true
~/.local/bin/poetry install --sync --no-root # install RAGFlow dependent python modules
~/.local/bin/poetry install --sync --no-root --with=full # install RAGFlow dependent python modules
```
3. Launch the dependent services (MinIO, Elasticsearch, Redis, and MySQL) using Docker Compose:
Expand All @@ -264,7 +264,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
Add the following line to `/etc/hosts` to resolve all hosts specified in **docker/service_conf.yaml** to `127.0.0.1`:
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
In **docker/service_conf.yaml**, update mysql port to `5455` and es port to `1200`, as specified in **docker/.env**.
Expand Down
2 changes: 1 addition & 1 deletion README_ja.md
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
`/etc/hosts` に以下の行を追加して、**docker/service_conf.yaml** に指定されたすべてのホストを `127.0.0.1` に解決します:
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
**docker/service_conf.yaml** で mysql のポートを `5455` に、es のポートを `1200` に更新します(**docker/.env** に指定された通り).
Expand Down
2 changes: 1 addition & 1 deletion README_ko.md
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
`/etc/hosts` 에 다음 줄을 추가하여 **docker/service_conf.yaml** 에 지정된 모든 호스트를 `127.0.0.1` 로 해결합니다:
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
**docker/service_conf.yaml** 에서 mysql 포트를 `5455` 로, es 포트를 `1200` 으로 업데이트합니다( **docker/.env** 에 지정된 대로).
Expand Down
2 changes: 1 addition & 1 deletion README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ docker build -f Dockerfile -t infiniflow/ragflow:dev .
`/etc/hosts` 中添加以下代码,将 **docker/service_conf.yaml** 文件中的所有 host 地址都解析为 `127.0.0.1`
```
127.0.0.1 es01 mysql minio redis
127.0.0.1 es01 infinity mysql minio redis
```
在文件 **docker/service_conf.yaml** 中,对照 **docker/.env** 的配置将 mysql 端口更新为 `5455`,es 端口更新为 `1200`
Expand Down
15 changes: 8 additions & 7 deletions api/apps/api_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,13 +528,14 @@ def list_chunks():
return get_json_result(
data=False, retmsg="Can't find doc_name or doc_id"
)
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)

res = retrievaler.chunk_list(doc_id=doc_id, tenant_id=tenant_id)
res = retrievaler.chunk_list(doc_id, tenant_id, kb_ids)
res = [
{
"content": res_item["content_with_weight"],
"doc_name": res_item["docnm_kwd"],
"img_id": res_item["img_id"]
"image_id": res_item["image_id"]
} for res_item in res
]

Expand Down Expand Up @@ -734,9 +735,9 @@ def fillin_conv(ans):

chunk_idxs = [int(match[2]) for match in re.findall(r'##\d\$\$', ans["answer"])]
for chunk_idx in chunk_idxs[:1]:
if ans["reference"]["chunks"][chunk_idx]["img_id"]:
if ans["reference"]["chunks"][chunk_idx]["image_id"]:
try:
bkt, nm = ans["reference"]["chunks"][chunk_idx]["img_id"].split("-")
bkt, nm = ans["reference"]["chunks"][chunk_idx]["image_id"].split("-")
response = STORAGE_IMPL.get(bkt, nm)
data_type_picture["url"] = base64.b64encode(response).decode('utf-8')
data.append(data_type_picture)
Expand Down Expand Up @@ -779,9 +780,9 @@ def fillin_conv(ans):

chunk_idxs = [int(match[2]) for match in re.findall(r'##\d\$\$', ans["answer"])]
for chunk_idx in chunk_idxs[:1]:
if ans["reference"]["chunks"][chunk_idx]["img_id"]:
if ans["reference"]["chunks"][chunk_idx]["image_id"]:
try:
bkt, nm = ans["reference"]["chunks"][chunk_idx]["img_id"].split("-")
bkt, nm = ans["reference"]["chunks"][chunk_idx]["image_id"].split("-")
response = STORAGE_IMPL.get(bkt, nm)
data_type_picture["url"] = base64.b64encode(response).decode('utf-8')
data.append(data_type_picture)
Expand Down Expand Up @@ -840,6 +841,6 @@ def retrieval():
return get_json_result(data=ranks)
except Exception as e:
if str(e).find("not_found") > 0:
return get_json_result(data=False, retmsg=f'No chunk found! Check the chunk status please!',
return get_json_result(data=False, retmsg='No chunk found! Check the chunk status please!',
retcode=RetCode.DATA_ERROR)
return server_error_response(e)
70 changes: 34 additions & 36 deletions api/apps/chunk_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,21 @@
#
import datetime
import json
import traceback

from flask import request
from flask_login import login_required, current_user
from elasticsearch_dsl import Q

from api.db.services.dialog_service import keyword_extraction
from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import search, rag_tokenizer
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils import rmSpace
from api.db import LLMType, ParserType
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api.db.services.user_service import UserTenantService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.db.services.document_service import DocumentService
from api.settings import RetCode, retrievaler, kg_retrievaler
from api.settings import RetCode, retrievaler, kg_retrievaler, docStoreConn
from api.utils.api_utils import get_json_result
import hashlib
import re
Expand Down Expand Up @@ -70,21 +67,17 @@ def list_chunk():
"doc_id": sres.field[id]["doc_id"],
"docnm_kwd": sres.field[id]["docnm_kwd"],
"important_kwd": sres.field[id].get("important_kwd", []),
"img_id": sres.field[id].get("img_id", ""),
"image_id": sres.field[id].get("image_id", ""),
"available_int": sres.field[id].get("available_int", 1),
"positions": sres.field[id].get("position_int", "").split("\t")
"positions": json.loads(sres.field[id].get("position_list", "[]")),
}
if len(d["positions"]) % 5 == 0:
poss = []
for i in range(0, len(d["positions"]), 5):
poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
float(d["positions"][i + 3]), float(d["positions"][i + 4])])
d["positions"] = poss
assert isinstance(d["positions"], list)
assert len(d["positions"])==0 or (isinstance(d["positions"][0], list) and len(d["positions"][0]) == 5)
res["chunks"].append(d)
return get_json_result(data=res)
except Exception as e:
if str(e).find("not_found") > 0:
return get_json_result(data=False, retmsg=f'No chunk found!',
return get_json_result(data=False, retmsg='No chunk found!',
retcode=RetCode.DATA_ERROR)
return server_error_response(e)

Expand All @@ -97,9 +90,11 @@ def get():
tenants = UserTenantService.query(user_id=current_user.id)
if not tenants:
return get_data_error_result(retmsg="Tenant not found!")
res = ELASTICSEARCH.get(

kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
res = docStoreConn.get(
chunk_id, search.index_name(
tenants[0].tenant_id))
tenants[0].tenant_id), kb_ids)
if not res.get("found"):
return server_error_response("Chunk not found")
id = res["_id"]
Expand All @@ -115,7 +110,7 @@ def get():
return get_json_result(data=res)
except Exception as e:
if str(e).find("NotFoundError") >= 0:
return get_json_result(data=False, retmsg=f'Chunk not found!',
return get_json_result(data=False, retmsg='Chunk not found!',
retcode=RetCode.DATA_ERROR)
return server_error_response(e)

Expand Down Expand Up @@ -163,7 +158,7 @@ def set():
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.upsert([d], search.index_name(tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
Expand All @@ -175,11 +170,11 @@ def set():
def switch():
req = request.json
try:
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(retmsg="Tenant not found!")
if not ELASTICSEARCH.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]],
search.index_name(tenant_id)):
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(retmsg="Document not found!")
if not docStoreConn.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]],
search.index_name(doc.tenant_id), doc.kb_id):
return get_data_error_result(retmsg="Index updating failure")
return get_json_result(data=True)
except Exception as e:
Expand All @@ -192,12 +187,11 @@ def switch():
def rm():
req = request.json
try:
if not ELASTICSEARCH.deleteByQuery(
Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)):
return get_data_error_result(retmsg="Index updating failure")
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(retmsg="Document not found!")
if not docStoreConn.delete({"_id": req["chunk_ids"]}, search.index_name(current_user.id), doc.kb_id):
return get_data_error_result(retmsg="Index updating failure")
deleted_chunk_ids = req["chunk_ids"]
chunk_number = len(deleted_chunk_ids)
DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
Expand Down Expand Up @@ -240,7 +234,7 @@ def create():
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.upsert([d], search.index_name(tenant_id), doc.kb_id)

DocumentService.increment_chunk_num(
doc.id, doc.kb_id, c, 1, 0)
Expand All @@ -257,26 +251,27 @@ def retrieval_test():
page = int(req.get("page", 1))
size = int(req.get("size", 30))
question = req["question"]
kb_id = req["kb_id"]
if isinstance(kb_id, str): kb_id = [kb_id]
kb_ids = req["kb_id"]
if isinstance(kb_ids, str):
kb_ids = [kb_ids]
doc_ids = req.get("doc_ids", [])
similarity_threshold = float(req.get("similarity_threshold", 0.0))
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
top = int(req.get("top_k", 1024))

try:
tenants = UserTenantService.query(user_id=current_user.id)
for kid in kb_id:
for kb_id in kb_ids:
for tenant in tenants:
if KnowledgebaseService.query(
tenant_id=tenant.tenant_id, id=kid):
tenant_id=tenant.tenant_id, id=kb_id):
break
else:
return get_json_result(
data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.',
data=False, retmsg='Only owner of knowledgebase authorized for this operation.',
retcode=RetCode.OPERATING_ERROR)

e, kb = KnowledgebaseService.get_by_id(kb_id[0])
e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
if not e:
return get_data_error_result(retmsg="Knowledgebase not found!")

Expand All @@ -291,7 +286,7 @@ def retrieval_test():
question += keyword_extraction(chat_mdl, question)

retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_id, page, size,
ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, page, size,
similarity_threshold, vector_similarity_weight, top,
doc_ids, rerank_mdl=rerank_mdl, highlight=req.get("highlight"))
for c in ranks["chunks"]:
Expand All @@ -301,7 +296,7 @@ def retrieval_test():
return get_json_result(data=ranks)
except Exception as e:
if str(e).find("not_found") > 0:
return get_json_result(data=False, retmsg=f'No chunk found! Check the chunk status please!',
return get_json_result(data=False, retmsg='No chunk found! Check the chunk status please!',
retcode=RetCode.DATA_ERROR)
return server_error_response(e)

Expand All @@ -315,13 +310,16 @@ def knowledge_graph():
"knowledge_graph_kwd": ["graph", "mind_map"]
}
tenant_id = DocumentService.get_tenant_id(doc_id)
sres = retrievaler.search(req, search.index_name(tenant_id))
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(retmsg="Document not found!")
sres = retrievaler.search(req, search.index_name(tenant_id), doc.kb_id)
obj = {"graph": {}, "mind_map": {}}
for id in sres.ids[:2]:
ty = sres.field[id]["knowledge_graph_kwd"]
try:
content_json = json.loads(sres.field[id]["content_with_weight"])
except Exception as e:
except Exception:
continue

if ty == 'mind_map':
Expand Down
31 changes: 11 additions & 20 deletions api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import re

import flask
from elasticsearch_dsl import Q
from flask import request
from flask_login import login_required, current_user

Expand All @@ -27,14 +26,13 @@
from api.db.services.task_service import TaskService, queue_tasks
from api.db.services.user_service import UserTenantService
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from api.db.services import duplicate_name
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from api.db import FileType, TaskStatus, ParserType, FileSource
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.settings import RetCode
from api.settings import RetCode, docStoreConn
from api.utils.api_utils import get_json_result
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.file_utils import filename_type, thumbnail
Expand Down Expand Up @@ -187,7 +185,7 @@ def list_docs():
break
else:
return get_json_result(
data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.',
data=False, retmsg='Only owner of knowledgebase authorized for this operation.',
retcode=RetCode.OPERATING_ERROR)
keywords = request.args.get("keywords", "")

Expand Down Expand Up @@ -275,18 +273,8 @@ def change_status():
return get_data_error_result(
retmsg="Database error (Document update)!")

if str(req["status"]) == "0":
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
scripts="ctx._source.available_int=0;",
idxnm=search.index_name(
kb.tenant_id)
)
else:
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
scripts="ctx._source.available_int=1;",
idxnm=search.index_name(
kb.tenant_id)
)
status = int(req["status"])
docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
Expand Down Expand Up @@ -365,8 +353,11 @@ def run():
tenant_id = DocumentService.get_tenant_id(id)
if not tenant_id:
return get_data_error_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
e, doc = DocumentService.get_by_id(id)
if not e:
return get_data_error_result(retmsg="Document not found!")
if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)

if str(req["run"]) == TaskStatus.RUNNING.value:
TaskService.filter_delete([Task.doc_id == id])
Expand Down Expand Up @@ -490,8 +481,8 @@ def change_parser():
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)

return get_json_result(data=True)
except Exception as e:
Expand Down
4 changes: 0 additions & 4 deletions api/apps/file2document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
#
from elasticsearch_dsl import Q

from api.db.db_models import File2Document
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService

Expand All @@ -28,8 +26,6 @@
from api.db.services.document_service import DocumentService
from api.settings import RetCode
from api.utils.api_utils import get_json_result
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH


@manager.route('/convert', methods=['POST'])
Expand Down
Loading

0 comments on commit 0b8edaa

Please sign in to comment.