Skip to content

Commit

Permalink
perf(ManualUpload): add metadata exclusion for embed and LLM
Browse files Browse the repository at this point in the history
  • Loading branch information
janaka committed Oct 16, 2023
1 parent 0522a1c commit 653b111
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 1 deletion.
12 changes: 12 additions & 0 deletions source/docq/data_source/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,18 @@ def _load_document_list(self, persist_path: str, filename: str) -> List[Document
document_list = [DocumentListItem(**item) for item in data]
return document_list

def _add_exclude_metadata_keys(
self, documents: List[Document], embed_keys: List[str], llm_keys: List[str]
) -> List[Document]:
"""Exclude metadata keys from embedding and LLM."""
if documents is None:
raise ValueError("`documents` cannot be None")
doc_count = len(documents)
for i in range(doc_count):
documents[i].excluded_embed_metadata_keys = embed_keys
documents[i].excluded_llm_metadata_keys = llm_keys
return documents


class SpaceDataSourceWebBased(SpaceDataSourceFileBased):
"""Abstract definition of a web-based data source for a space. To be extended by concrete data sources."""
30 changes: 29 additions & 1 deletion source/docq/data_source/manual_upload.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Data source for documents uploaded manually."""

import logging
import os
from datetime import datetime
from typing import List
Expand Down Expand Up @@ -37,7 +38,34 @@ def lambda_metadata(x: str) -> dict:
str(DocumentMetadata.INDEXED_ON.name).lower(): datetime.timestamp(datetime.now().utcnow()),
}

return SimpleDirectoryReader(get_upload_dir(space), file_metadata=lambda_metadata).load_data()
_documents = SimpleDirectoryReader(get_upload_dir(space), file_metadata=lambda_metadata).load_data()

pdfreader_metadata_keys = ["page_label", "file_name"]
exclude_embed_metadata_keys_ = [
str(DocumentMetadata.FILE_PATH.name).lower(),
str(DocumentMetadata.SPACE_ID.name).lower(),
str(DocumentMetadata.SPACE_TYPE.name).lower(),
str(DocumentMetadata.DATA_SOURCE_NAME.name).lower(),
str(DocumentMetadata.SOURCE_URI.name).lower(),
str(DocumentMetadata.INDEXED_ON.name).lower(),
]
exclude_embed_metadata_keys_.extend(pdfreader_metadata_keys)

excluded_llm_metadata_keys_ = [
str(DocumentMetadata.FILE_PATH.name).lower(),
str(DocumentMetadata.SPACE_ID.name).lower(),
str(DocumentMetadata.SPACE_TYPE.name).lower(),
str(DocumentMetadata.DATA_SOURCE_NAME.name).lower(),
str(DocumentMetadata.INDEXED_ON.name).lower(),
]
# logging.debug("exclude_embed_metadata_keys_: %s", exclude_embed_metadata_keys_)
# logging.debug("excluded_llm_metadata_keys_: %s", excluded_llm_metadata_keys_)
# Exclude all meta-metadata from embedding to improve retrieval. The LLM still needs some of it (source_uri, page_label, file_name) for referencing, so those keys are only excluded from embedding.
# for i in range(len(documents_)):
# documents_[i].excluded_embed_metadata_keys = exclude_embed_metadata_keys_
# documents_[i].excluded_llm_metadata_keys = excluded_llm_metadata_keys_

return self._add_exclude_metadata_keys(_documents, exclude_embed_metadata_keys_, excluded_llm_metadata_keys_)

def get_document_list(self, space: SpaceKey, configs: dict) -> List[DocumentListItem]:
"""Returns a list of tuples containing the name, creation time, and size (Mb) of each document in the specified space's configured data source.
Expand Down

0 comments on commit 653b111

Please sign in to comment.