perf(ManualUpload): add metadata exclusion for embed and LLM

docqai · Oct 16, 2023 · 653b111 · 653b111
1 parent 0522a1c
commit 653b111
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 1 deletion.
diff --git a/source/docq/data_source/main.py b/source/docq/data_source/main.py
@@ -94,6 +94,18 @@ def _load_document_list(self, persist_path: str, filename: str) -> List[Document
             document_list = [DocumentListItem(**item) for item in data]
             return document_list
 
+    def _add_exclude_metadata_keys(
+        self, documents: List[Document], embed_keys: List[str], llm_keys: List[str]
+    ) -> List[Document]:
+        """Set the embedding and LLM metadata exclusion lists on every document.
+
+        The documents are modified in place. Each document is given a reference to
+        the same `embed_keys` and `llm_keys` list objects; no copies are made.
+
+        Args:
+            documents: The documents to update.
+            embed_keys: Value assigned to `excluded_embed_metadata_keys` on each document.
+            llm_keys: Value assigned to `excluded_llm_metadata_keys` on each document.
+
+        Returns:
+            The same `documents` list that was passed in.
+
+        Raises:
+            ValueError: If `documents` is None.
+        """
+        if documents is None:
+            raise ValueError("`documents` cannot be None")
+        for document in documents:
+            document.excluded_embed_metadata_keys = embed_keys
+            document.excluded_llm_metadata_keys = llm_keys
+        return documents
+
 
 class SpaceDataSourceWebBased(SpaceDataSourceFileBased):
     """Abstract definition of a web-based data source for a space. To be extended by concrete data sources."""
diff --git a/source/docq/data_source/manual_upload.py b/source/docq/data_source/manual_upload.py
@@ -1,5 +1,6 @@
 """Data source for documents uploaded manually."""
 
+import logging
 import os
 from datetime import datetime
 from typing import List
@@ -37,7 +38,34 @@ def lambda_metadata(x: str) -> dict:
                 str(DocumentMetadata.INDEXED_ON.name).lower(): datetime.timestamp(datetime.now().utcnow()),
             }
 
-        return SimpleDirectoryReader(get_upload_dir(space), file_metadata=lambda_metadata).load_data()
+        _documents = SimpleDirectoryReader(get_upload_dir(space), file_metadata=lambda_metadata).load_data()
+
+        pdfreader_metadata_keys = ["page_label", "file_name"]
+        exclude_embed_metadata_keys_ = [
+            str(DocumentMetadata.FILE_PATH.name).lower(),
+            str(DocumentMetadata.SPACE_ID.name).lower(),
+            str(DocumentMetadata.SPACE_TYPE.name).lower(),
+            str(DocumentMetadata.DATA_SOURCE_NAME.name).lower(),
+            str(DocumentMetadata.SOURCE_URI.name).lower(),
+            str(DocumentMetadata.INDEXED_ON.name).lower(),
+        ]
+        exclude_embed_metadata_keys_.extend(pdfreader_metadata_keys)
+
+        excluded_llm_metadata_keys_ = [
+            str(DocumentMetadata.FILE_PATH.name).lower(),
+            str(DocumentMetadata.SPACE_ID.name).lower(),
+            str(DocumentMetadata.SPACE_TYPE.name).lower(),
+            str(DocumentMetadata.DATA_SOURCE_NAME.name).lower(),
+            str(DocumentMetadata.INDEXED_ON.name).lower(),
+        ]
+        # logging.debug("exclude_embed_metadata_keys_: %s", exclude_embed_metadata_keys_)
+        # logging.debug("excluded_llm_metadata_keys_: %s", excluded_llm_metadata_keys_)
+        # Exclude all meta-metadata from the embedding text to improve retrieval. The LLM still sees source_uri, page_label and file_name so it can reference sources.
+        # for i in range(len(documents_)):
+        #     documents_[i].excluded_embed_metadata_keys = exclude_embed_metadata_keys_
+        #     documents_[i].excluded_llm_metadata_keys = excluded_llm_metadata_keys_
+
+        return self._add_exclude_metadata_keys(_documents, exclude_embed_metadata_keys_, excluded_llm_metadata_keys_)
 
     def get_document_list(self, space: SpaceKey, configs: dict) -> List[DocumentListItem]:
         """Returns a list of tuples containing the name, creation time, and size (Mb) of each document in the specified space's configured data source.