diff --git a/pyproject.toml b/pyproject.toml
index 7ef0282fa..e1d846352 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,6 +87,7 @@ dependencies = [
     "cron-descriptor == 1.4.3",
     "django_apscheduler == 0.6.2",
     "anthropic == 0.26.1",
+    "docx2txt == 0.8"
 ]
 dynamic = ["version"]

diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
index 92415f590..380dec22b 100644
--- a/src/khoj/database/models/__init__.py
+++ b/src/khoj/database/models/__init__.py
@@ -306,6 +306,7 @@ class EntryType(models.TextChoices):
         NOTION = "notion"
         GITHUB = "github"
         CONVERSATION = "conversation"
+        DOCX = "docx"

     class EntrySource(models.TextChoices):
         COMPUTER = "computer"
diff --git a/src/khoj/interface/web/assets/icons/docx.svg b/src/khoj/interface/web/assets/icons/docx.svg
new file mode 100644
index 000000000..7e588b767
--- /dev/null
+++ b/src/khoj/interface/web/assets/icons/docx.svg
@@ -0,0 +1,7 @@
+<svg xmlns="http://www.w3.org/2000/svg" fill="#FFF" stroke-miterlimit="10" stroke-width="2" viewBox="0 0 96 96">
+  <path stroke="#979593" d="M67.1716 7H27c-1.1046 0-2 .8954-2 2v78c0 1.1046.8954 2 2 2h58c1.1046 0 2-.8954 2-2V26.8284c0-.5304-.2107-1.0391-.5858-1.4142L68.5858 7.5858C68.2107 7.2107 67.702 7 67.1716 7z"/>
+  <path fill="none" stroke="#979593" d="M67 7v18c0 1.1046.8954 2 2 2h18"/>
+  <path fill="#C8C6C4" d="M79 61H48v-2h31c.5523 0 1 .4477 1 1s-.4477 1-1 1zm0-6H48v-2h31c.5523 0 1 .4477 1 1s-.4477 1-1 1zm0-6H48v-2h31c.5523 0 1 .4477 1 1s-.4477 1-1 1zm0-6H48v-2h31c.5523 0 1 .4477 1 1s-.4477 1-1 1zm0 24H48v-2h31c.5523 0 1 .4477 1 1s-.4477 1-1 1z"/>
+  <path fill="#185ABD" d="M12 74h32c2.2091 0 4-1.7909 4-4V38c0-2.2091-1.7909-4-4-4H12c-2.2091 0-4 1.7909-4 4v32c0 2.2091 1.7909 4 4 4z"/>
+  <path d="M21.6245 60.6455c.0661.522.109.9769.1296 1.3657h.0762c.0306-.3685.0889-.8129.1751-1.3349.0862-.5211.1703-.961.2517-1.319L25.7911 44h4.5702l3.6562 15.1272c.183.7468.3353 1.6973.457 2.8532h.0608c.0508-.7979.1777-1.7184.3809-2.7615L37.8413 44H42l-5.1183 22h-4.86l-3.4885-14.5744c-.1016-.4197-.2158-.9663-.3428-1.6417-.127-.6745-.2057-1.1656-.236-1.4724h-.0608c-.0407.358-.1195.8896-.2364 1.595-.1169.7062-.211 1.2273-.2819 1.565L24.1 66h-4.9357L14 44h4.2349l3.1843 15.3882c.0709.3165.1392.7362.2053 1.2573z"/>
+</svg>
diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html
index 639c0642c..71ccea074 100644
--- a/src/khoj/interface/web/chat.html
+++ b/src/khoj/interface/web/chat.html
@@ -48,8 +48,8 @@
             To get started, just start typing below. You can also type / to see a list of commands.
             `.trim()
-            const allowedExtensions = ['text/org', 'text/markdown', 'text/plain', 'text/html', 'application/pdf'];
-            const allowedFileEndings = ['org', 'md', 'txt', 'html', 'pdf'];
+            const allowedExtensions = ['text/org', 'text/markdown', 'text/plain', 'text/html', 'application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'];
+            const allowedFileEndings = ['org', 'md', 'txt', 'html', 'pdf', 'docx'];
             let chatOptions = [];

             function createCopyParentText(message) {
                 return function(event) {
diff --git a/src/khoj/interface/web/content_source_computer_input.html b/src/khoj/interface/web/content_source_computer_input.html
index 8dc4d7dc1..77816f353 100644
--- a/src/khoj/interface/web/content_source_computer_input.html
+++ b/src/khoj/interface/web/content_source_computer_input.html
@@ -73,6 +73,8 @@ <h2 class="section-title">
                     image_name = "pdf.svg"
                 else if (fileExtension === "markdown" || fileExtension === "md")
                     image_name = "markdown.svg"
+                else if (fileExtension === "docx")
+                    image_name = "docx.svg"
                 else
                     image_name = "plaintext.svg"

diff --git a/src/khoj/processor/content/docx/__init__.py b/src/khoj/processor/content/docx/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/khoj/processor/content/docx/docx_to_entries.py b/src/khoj/processor/content/docx/docx_to_entries.py
new file mode 100644
index 000000000..ab28066d3
--- /dev/null
+++ b/src/khoj/processor/content/docx/docx_to_entries.py
@@ -0,0 +1,110 @@
+import logging
+import os
+from datetime import datetime
+from typing import Dict, List, Tuple
+
+from langchain_community.document_loaders import Docx2txtLoader
+
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import KhojUser
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import Entry
+
+logger = logging.getLogger(__name__)
+
+
+class DocxToEntries(TextToEntries):
+    def __init__(self):
+        super().__init__()
+
+    # Define Functions
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        # Extract required fields from config
+        if not full_corpus:
+            deletion_file_names = set([file for file in files if files[file] == b""])
+            files_to_process = set(files) - deletion_file_names
+            files = {file: files[file] for file in files_to_process}
+        else:
+            deletion_file_names = None
+
+        # Extract Entries from specified Docx files
+        with timer("Extract entries from specified DOCX files", logger):
+            file_to_text_map, current_entries = DocxToEntries.extract_docx_entries(files)
+
+        # Split entries by max tokens supported by model
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                current_entries,
+                DbEntry.EntryType.DOCX,
+                DbEntry.EntrySource.COMPUTER,
+                "compiled",
+                logger,
+                deletion_file_names,
+                user,
+                regenerate=regenerate,
+                file_to_text_map=file_to_text_map,
+            )
+
+        return num_new_embeddings, num_deleted_embeddings
+
+    @staticmethod
+    def extract_docx_entries(docx_files) -> Tuple[Dict, List[Entry]]:
+        """Extract entries from specified DOCX files"""
+
+        entries: List[str] = []
+        entry_to_location_map: List[Tuple[str, str]] = []
+
+        file_to_text_map = dict()
+        for docx_file in docx_files:
+            try:
+                timestamp_now = datetime.utcnow().timestamp()
+                tmp_file = f"tmp_docx_file_{timestamp_now}.docx"
+                with open(tmp_file, "wb") as f:
+                    bytes_content = docx_files[docx_file]
+                    f.write(bytes_content)
+
+                # Load the content using Docx2txtLoader
+                loader = Docx2txtLoader(tmp_file)
+                docx_entries_per_file = loader.load()
+
+                # Convert the loaded entries into the desired format
+                docx_texts = [page.page_content for page in docx_entries_per_file]
+
+                entry_to_location_map += zip(docx_texts, [docx_file] * len(docx_texts))
+                entries.extend(docx_texts)
+                file_to_text_map[docx_file] = docx_texts
+            except Exception as e:
+                logger.warning(f"Unable to process file: {docx_file}. This file will not be indexed.")
+                logger.warning(e, exc_info=True)
+            finally:
+                if os.path.exists(f"{tmp_file}"):
+                    os.remove(f"{tmp_file}")
+        return file_to_text_map, DocxToEntries.convert_docx_entries_to_maps(entries, dict(entry_to_location_map))
+
+    @staticmethod
+    def convert_docx_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
+        """Convert each DOCX entry into a dictionary"""
+        entries = []
+        for parsed_entry in parsed_entries:
+            entry_filename = entry_to_file_map[parsed_entry]
+            # Append base filename to compiled entry for context to model
+            heading = f"{entry_filename}\n"
+            compiled_entry = f"{heading}{parsed_entry}"
+            entries.append(
+                Entry(
+                    compiled=compiled_entry,
+                    raw=parsed_entry,
+                    heading=heading,
+                    file=f"{entry_filename}",
+                )
+            )
+
+        logger.debug(f"Converted {len(parsed_entries)} DOCX entries to dictionaries")
+
+        return entries
diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py
index 1e0184cf0..4391b1ece 100644
--- a/src/khoj/routers/indexer.py
+++ b/src/khoj/routers/indexer.py
@@ -7,6 +7,7 @@
 from starlette.authentication import requires

 from khoj.database.models import GithubConfig, KhojUser, NotionConfig
+from khoj.processor.content.docx.docx_to_entries import DocxToEntries
 from khoj.processor.content.github.github_to_entries import GithubToEntries
 from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
 from khoj.processor.content.notion.notion_to_entries import NotionToEntries
@@ -40,6 +41,7 @@ class IndexerInput(BaseModel):
     markdown: Optional[dict[str, str]] = None
     pdf: Optional[dict[str, bytes]] = None
     plaintext: Optional[dict[str, str]] = None
+    docx: Optional[dict[str, bytes]] = None


 @indexer.post("/update")
@@ -63,7 +65,7 @@ async def update(
     ),
 ):
     user = request.user.object
-    index_files: Dict[str, Dict[str, str]] = {"org": {}, "markdown": {}, "pdf": {}, "plaintext": {}}
+    index_files: Dict[str, Dict[str, str]] = {"org": {}, "markdown": {}, "pdf": {}, "plaintext": {}, "docx": {}}
     try:
         logger.info(f"📬 Updating content index via API call by {client} client")
         for file in files:
@@ -79,6 +81,7 @@ async def update(
             markdown=index_files["markdown"],
             pdf=index_files["pdf"],
             plaintext=index_files["plaintext"],
+            docx=index_files["docx"],
         )

         if state.config == None:
@@ -93,6 +96,7 @@ async def update(
                 org=None,
                 markdown=None,
                 pdf=None,
+                docx=None,
                 image=None,
                 github=None,
                 notion=None,
@@ -129,6 +133,7 @@ async def update(
             "num_markdown": len(index_files["markdown"]),
             "num_pdf": len(index_files["pdf"]),
             "num_plaintext": len(index_files["plaintext"]),
+            "num_docx": len(index_files["docx"]),
         }

         update_telemetry_state(
@@ -295,6 +300,20 @@ def configure_content(
             logger.error(f"🚨 Failed to setup Notion: {e}", exc_info=True)
             success = False

+    try:
+        if (search_type == state.SearchType.All.value or search_type == state.SearchType.Docx.value) and files["docx"]:
+            logger.info("📄 Setting up search for docx")
+            text_search.setup(
+                DocxToEntries,
+                files.get("docx"),
+                regenerate=regenerate,
+                full_corpus=full_corpus,
+                user=user,
+            )
+    except Exception as e:
+        logger.error(f"🚨 Failed to setup docx: {e}", exc_info=True)
+        success = False
+
     # Invalidate Query Cache
     if user:
         state.query_cache[user.uuid] = LRU()
diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py
index 1732271a7..0e88075f8 100644
--- a/src/khoj/utils/config.py
+++ b/src/khoj/utils/config.py
@@ -28,6 +28,7 @@ class SearchType(str, Enum):
     Github = "github"
     Notion = "notion"
     Plaintext = "plaintext"
+    Docx = "docx"


 class ProcessorType(str, Enum):
diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py
index 59327b0df..de6b72857 100644
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -115,6 +115,8 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
         return "org", encoding
     elif file_type in ["application/pdf"]:
         return "pdf", encoding
+    elif file_type in ["application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]:
+        return "docx", encoding
     elif file_type in ["image/jpeg"]:
         return "jpeg", encoding
     elif file_type in ["image/png"]:
diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
index fbc873f69..b4bdcbeaa 100644
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -65,6 +65,7 @@ class ContentConfig(ConfigBase):
     plaintext: Optional[TextContentConfig] = None
     github: Optional[GithubContentConfig] = None
     notion: Optional[NotionContentConfig] = None
+    docx: Optional[TextContentConfig] = None


 class ImageSearchConfig(ConfigBase):
diff --git a/tests/data/docx/bangalore.docx b/tests/data/docx/bangalore.docx
new file mode 100644
index 000000000..df1dd7612
Binary files /dev/null and b/tests/data/docx/bangalore.docx differ
diff --git a/tests/data/docx/iceland.docx b/tests/data/docx/iceland.docx
new file mode 100644
index 000000000..cfdbf2a8e
Binary files /dev/null and b/tests/data/docx/iceland.docx differ
diff --git a/tests/test_client.py b/tests/test_client.py
index 13f226c1a..d3c180302 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -61,7 +61,7 @@ def test_search_with_invalid_content_type(client):
 @pytest.mark.django_db(transaction=True)
 def test_search_with_valid_content_type(client):
     headers = {"Authorization": "Bearer kk-secret"}
-    for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plaintext"]:
+    for content_type in ["all", "org", "markdown", "image", "pdf", "github", "notion", "plaintext", "docx"]:
         # Act
         response = client.get(f"/api/search?q=random&t={content_type}", headers=headers)
         # Assert
@@ -480,6 +480,14 @@ def get_sample_files_data():
         ("files", ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain")),
         ("files", ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain")),
         ("files", ("path/to/filename.md", "# Notes from client call", "text/markdown")),
+        (
+            "files",
+            (
+                "path/to/filename.docx",
+                "## Studying anthropological records from the Fatimid caliphate",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            ),
+        ),
         (
             "files",
             ("path/to/filename1.md", "## Studying anthropological records from the Fatimid caliphate", "text/markdown"),
diff --git a/tests/test_docx_to_entries.py b/tests/test_docx_to_entries.py
new file mode 100644
index 000000000..089c7fec7
--- /dev/null
+++ b/tests/test_docx_to_entries.py
@@ -0,0 +1,37 @@
+import os
+
+from khoj.processor.content.docx.docx_to_entries import DocxToEntries
+
+
+def test_single_page_docx_to_jsonl():
+    "Convert single page DOCX file to jsonl."
+    # Act
+    # Extract Entries from specified Docx files
+    # Read singlepage.docx into memory as bytes
+    with open("tests/data/docx/iceland.docx", "rb") as f:
+        docx_bytes = f.read()
+
+    data = {"tests/data/docx/iceland.docx": docx_bytes}
+    entries = DocxToEntries.extract_docx_entries(docx_files=data)
+
+    # Assert
+    assert "The Icelandic horse" in entries[0]["tests/data/docx/iceland.docx"][0]
+    assert len(entries) == 2
+    assert len(entries[1]) == 1
+
+
+def test_multi_page_docx_to_jsonl():
+    "Convert multi page DOCX file to jsonl."
+    # Act
+    # Extract Entries from specified Docx files
+    # Read multipage.docx into memory as bytes
+    with open("tests/data/docx/bangalore.docx", "rb") as f:
+        docx_bytes = f.read()
+
+    data = {"tests/data/docx/bangalore.docx": docx_bytes}
+    entries = DocxToEntries.extract_docx_entries(docx_files=data)
+
+    # Assert
+    assert "Bangalore" in entries[0]["tests/data/docx/bangalore.docx"][0]
+    assert len(entries) == 2
+    assert len(entries[1]) == 1
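
Note (not part of the patch): a minimal sketch of how the new extractor is exercised, mirroring the tests above. It assumes the khoj package from this branch is importable and reuses the iceland.docx sample added in this diff.

    # Read a DOCX file into memory as bytes, keyed by its path.
    from khoj.processor.content.docx.docx_to_entries import DocxToEntries

    with open("tests/data/docx/iceland.docx", "rb") as f:
        docx_files = {"tests/data/docx/iceland.docx": f.read()}

    # extract_docx_entries returns a (file_to_text_map, entries) tuple:
    # file_to_text_map maps each file path to its extracted text chunks,
    # entries is the list of Entry objects built from those chunks.
    file_to_text_map, entries = DocxToEntries.extract_docx_entries(docx_files)
    print(f"Extracted {len(entries)} entries from {list(file_to_text_map)}")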