diff --git a/source/docq/data_source/main.py b/source/docq/data_source/main.py index 54cfbe37..1146fb1d 100644 --- a/source/docq/data_source/main.py +++ b/source/docq/data_source/main.py @@ -94,6 +94,18 @@ def _load_document_list(self, persist_path: str, filename: str) -> List[Document document_list = [DocumentListItem(**item) for item in data] return document_list + def _add_exclude_metadata_keys( + self, documents: List[Document], embed_keys: List[str], llm_keys: List[str] + ) -> List[Document]: + """Exclude metadata keys from embedding and LLM.""" + if documents is None: + raise ValueError("`documents` cannot be None") + doc_count = len(documents) + for i in range(doc_count): + documents[i].excluded_embed_metadata_keys = embed_keys + documents[i].excluded_llm_metadata_keys = llm_keys + return documents + class SpaceDataSourceWebBased(SpaceDataSourceFileBased): """Abstract definition of a web-based data source for a space. To be extended by concrete data sources.""" diff --git a/source/docq/data_source/manual_upload.py b/source/docq/data_source/manual_upload.py index c2fccd92..9c24355d 100644 --- a/source/docq/data_source/manual_upload.py +++ b/source/docq/data_source/manual_upload.py @@ -1,5 +1,6 @@ """Data source for documents uploaded manually.""" +import logging import os from datetime import datetime from typing import List @@ -37,7 +38,34 @@ def lambda_metadata(x: str) -> dict: str(DocumentMetadata.INDEXED_ON.name).lower(): datetime.timestamp(datetime.now().utcnow()), } - return SimpleDirectoryReader(get_upload_dir(space), file_metadata=lambda_metadata).load_data() + _documents = SimpleDirectoryReader(get_upload_dir(space), file_metadata=lambda_metadata).load_data() + + pdfreader_metadata_keys = ["page_label", "file_name"] + exclude_embed_metadata_keys_ = [ + str(DocumentMetadata.FILE_PATH.name).lower(), + str(DocumentMetadata.SPACE_ID.name).lower(), + str(DocumentMetadata.SPACE_TYPE.name).lower(), + str(DocumentMetadata.DATA_SOURCE_NAME.name).lower(), + str(DocumentMetadata.SOURCE_URI.name).lower(), + str(DocumentMetadata.INDEXED_ON.name).lower(), + ] + exclude_embed_metadata_keys_.extend(pdfreader_metadata_keys) + + excluded_llm_metadata_keys_ = [ + str(DocumentMetadata.FILE_PATH.name).lower(), + str(DocumentMetadata.SPACE_ID.name).lower(), + str(DocumentMetadata.SPACE_TYPE.name).lower(), + str(DocumentMetadata.DATA_SOURCE_NAME.name).lower(), + str(DocumentMetadata.INDEXED_ON.name).lower(), + ] + # logging.debug("exclude_embed_metadata_keys_: %s", exclude_embed_metadata_keys_) + # logging.debug("excluded_llm_metadata_keys_: %s", excluded_llm_metadata_keys_) + # exclude all meta-metadata from embedding to improve retrieval. The LLM needs some for referencing. + # for i in range(len(documents_)): + # documents_[i].excluded_embed_metadata_keys = exclude_embed_metadata_keys_ + # documents_[i].excluded_llm_metadata_keys = excluded_llm_metadata_keys_ + + return self._add_exclude_metadata_keys(_documents, exclude_embed_metadata_keys_, excluded_llm_metadata_keys_) def get_document_list(self, space: SpaceKey, configs: dict) -> List[DocumentListItem]: """Returns a list of tuples containing the name, creation time, and size (Mb) of each document in the specified space's configured data source.