diff --git a/examples/sample_files/sample.md b/examples/sample_files/sample.md new file mode 100644 index 00000000..e02535b7 --- /dev/null +++ b/examples/sample_files/sample.md @@ -0,0 +1,2 @@ + +## A test document for markdown diff --git a/examples/sample_files/sample_docx.docx b/examples/sample_files/sample_docx.docx new file mode 100644 index 00000000..3a740ac9 Binary files /dev/null and b/examples/sample_files/sample_docx.docx differ diff --git a/examples/sample_files/sample_pdf_1.pdf b/examples/sample_files/sample_pdf_1.pdf new file mode 100644 index 00000000..87259b89 Binary files /dev/null and b/examples/sample_files/sample_pdf_1.pdf differ diff --git a/examples/sample_files/sample_pptx.pptx b/examples/sample_files/sample_pptx.pptx new file mode 100644 index 00000000..1f6e3791 Binary files /dev/null and b/examples/sample_files/sample_pptx.pptx differ diff --git a/examples/super_components.ipynb b/examples/super_components.ipynb new file mode 100644 index 00000000..135753ee --- /dev/null +++ b/examples/super_components.ipynb @@ -0,0 +1,497 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-11T09:03:33.257832Z", + "start_time": "2025-02-11T09:03:31.598703Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: python-docx in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (1.1.2)\r\n", + "Requirement already satisfied: trafilatura in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (2.0.0)\r\n", + "Requirement already satisfied: markdown-it-py in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (3.0.0)\r\n", + "Requirement already satisfied: mdit_plain in 
/Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (1.0.1)\r\n", + "Requirement already satisfied: pypdf in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (5.2.0)\r\n", + "Requirement already satisfied: python-pptx in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (1.0.2)\r\n", + "Requirement already satisfied: openpyxl in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (3.1.5)\r\n", + "Requirement already satisfied: nltk in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (3.9.1)\r\n", + "Requirement already satisfied: lxml>=3.1.0 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from python-docx) (5.3.0)\r\n", + "Requirement already satisfied: typing-extensions>=4.9.0 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from python-docx) (4.12.2)\r\n", + "Requirement already satisfied: certifi in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from trafilatura) (2024.12.14)\r\n", + "Requirement already satisfied: charset_normalizer>=3.4.0 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from trafilatura) (3.4.1)\r\n", + "Requirement already satisfied: courlan>=1.3.2 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from trafilatura) (1.3.2)\r\n", + 
"Requirement already satisfied: htmldate>=1.9.2 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from trafilatura) (1.9.3)\r\n", + "Requirement already satisfied: justext>=3.0.1 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from trafilatura) (3.0.1)\r\n", + "Requirement already satisfied: urllib3<3,>=1.26 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from trafilatura) (2.3.0)\r\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from markdown-it-py) (0.1.2)\r\n", + "Requirement already satisfied: Pillow>=3.3.2 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from python-pptx) (11.1.0)\r\n", + "Requirement already satisfied: XlsxWriter>=0.5.7 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from python-pptx) (3.2.2)\r\n", + "Requirement already satisfied: et-xmlfile in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from openpyxl) (2.0.0)\r\n", + "Requirement already satisfied: click in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from nltk) (8.1.8)\r\n", + "Requirement already satisfied: joblib in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from nltk) (1.4.2)\r\n", + "Requirement already satisfied: regex>=2021.8.3 in /Users/mathislucka/Library/Application 
Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from nltk) (2024.11.6)\r\n", + "Requirement already satisfied: tqdm in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from nltk) (4.67.1)\r\n", + "Requirement already satisfied: babel>=2.16.0 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from courlan>=1.3.2->trafilatura) (2.16.0)\r\n", + "Requirement already satisfied: tld>=0.13 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from courlan>=1.3.2->trafilatura) (0.13)\r\n", + "Requirement already satisfied: dateparser>=1.1.2 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from htmldate>=1.9.2->trafilatura) (1.2.0)\r\n", + "Requirement already satisfied: python-dateutil>=2.9.0.post0 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from htmldate>=1.9.2->trafilatura) (2.9.0.post0)\r\n", + "Requirement already satisfied: pytz in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from dateparser>=1.1.2->htmldate>=1.9.2->trafilatura) (2024.2)\r\n", + "Requirement already satisfied: tzlocal in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from dateparser>=1.1.2->htmldate>=1.9.2->trafilatura) (5.2)\r\n", + "Requirement already satisfied: lxml-html-clean in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from lxml[html_clean]>=4.4.2->justext>=3.0.1->trafilatura) 
(0.4.1)\r\n", + "Requirement already satisfied: six>=1.5 in /Users/mathislucka/Library/Application Support/hatch/env/virtual/haystack-experimental/0WxCzKa9/test/lib/python3.13/site-packages (from python-dateutil>=2.9.0.post0->htmldate>=1.9.2->trafilatura) (1.17.0)\r\n" + ] + } + ], + "source": [ + "!pip install python-docx trafilatura markdown-it-py mdit_plain pypdf python-pptx openpyxl nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-11T09:12:12.253083Z", + "start_time": "2025-02-11T09:12:12.250834Z" + } + }, + "outputs": [], + "source": [ + "from haystack import Pipeline\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "\n", + "from haystack_experimental.super_components.converters import MultiFileConverter\n", + "from haystack_experimental.super_components.indexers import SentenceTransformersDocumentIndexer\n", + "from haystack_experimental.super_components.preprocessors import DocumentPreProcessor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SuperComponents\n", + "\n", + "Supercomponents in general behave like any other component. They have init params, optional from_dict() and to_dict() methods as usual. The init params typically determine how the internal pipeline is constructed (e.g. 
which components are used).\n", + "\n", + "Supercomponents can make it easier to build common pipeline patterns.\n", + "\n", + "Here, we are building an indexing pipeline for a RAG system.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-11T09:12:13.829578Z", + "start_time": "2025-02-11T09:12:13.822121Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "🚅 Components\n", + " - converter: MultiFileConverter\n", + " - preprocessor: DocumentPreProcessor\n", + " - indexer: DocumentIndexer\n", + "🛤️ Connections\n", + " - converter.documents -> preprocessor.documents (List[Document])\n", + " - preprocessor.documents -> indexer.documents (List[Document])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "file_converter = MultiFileConverter()\n", + "preprocessor = DocumentPreProcessor()\n", + "document_store = InMemoryDocumentStore()\n", + "indexer = SentenceTransformersDocumentIndexer(document_store=document_store)\n", + "\n", + "pipe = Pipeline()\n", + "\n", + "pipe.add_component(\"converter\", file_converter)\n", + "pipe.add_component(\"preprocessor\", preprocessor)\n", + "pipe.add_component(\"indexer\", indexer)\n", + "\n", + "pipe.connect(\"converter.documents\", \"preprocessor.documents\")\n", + "pipe.connect(\"preprocessor.documents\", \"indexer.documents\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-11T09:12:15.424871Z", + "start_time": "2025-02-11T09:12:14.822152Z" + } + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pipe.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our pipeline handles CSV, DOCX, HTML, JSON, Markdown, TXT, PDF, PPTX, and XLSX.\n", + "\n", + "Let's test it out!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-11T09:12:20.299682Z", + "start_time": "2025-02-11T09:12:19.697516Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Converting markdown files to Documents: 100%|██████████| 1/1 [00:00<00:00, 234.69it/s]\n", + "Batches: 100%|██████████| 1/1 [00:00<00:00, 1.80it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wrote 6 documents to the document store.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "example_files = [\n", + " \"sample_files/sample_docx.docx\",\n", + " \"sample_files/sample_pptx.pptx\",\n", + " \"sample_files/sample.md\",\n", + " \"sample_files/sample_pdf_1.pdf\",\n", + "]\n", + "\n", + "result = pipe.run({\"sources\": example_files})\n", + "\n", + "print(f\"Wrote {result['indexer']['documents_written']} documents to the document store.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's view the documents!\n", + "\n", + "You can see that they are split, and embedded." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-11T09:12:54.651275Z", + "start_time": "2025-02-11T09:12:54.645947Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(id=575e058e31109793279bb32e9eca01b0ecef8a4a687a8eb235d77b11f1746956, content: 'Sample Docx File The US has \"passed the peak\" on new coronavirus cases, President Donald Trump said ...', meta: {'file_path': 'sample_docx.docx', 'docx': DOCXMetadata(author='Saha, Anirban', category='', comments='', content_status='', created='2020-07-14T08:14:00+00:00', identifier='', keywords='', language='', last_modified_by='Saha, Anirban', last_printed=None, modified='2020-07-14T08:16:00+00:00', revision=1, subject='', title='', version=''), 'source_id': '86ba10e1b023f2dfdd576079468a060522f31be54cf6e41515d2311522002b06', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}, embedding: vector of size 768),\n", + " Document(id=afce88a93fde936bd049139ff5802dcbe4e1af65b9040e88523ee868b50aec16, content: 'type: intro\n", + " date: 1.1.2023 pip install farm-haystack What to build with Haystack Ask questions in na...', meta: {'file_path': 'sample.md', 'source_id': '9a881cceb8042a3db81c9390bf8c7ba28979030823bf0382788430bf6ee73af6', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}, embedding: vector of size 768),\n", + " Document(id=a5bdfffc7637bc6156e319f760655196093577d8478f98f78d4993b4a338e12e, content: 'production & improve your models continuously | | |\n", + " |-|-|\n", + " | :ledger: Docs | Usage, Guides, API docum...', meta: {'file_path': 'sample.md', 'source_id': '9a881cceb8042a3db81c9390bf8c7ba28979030823bf0382788430bf6ee73af6', 'page_number': 1, 'split_id': 1, 'split_idx_start': 1495}, embedding: vector of size 768),\n", + " Document(id=a3b622a10f684a51dd5edb79557eada73bc5ae29e64f94b99284b964d943cb21, content: 'A sample PDF le History and standardization\n", + " Format (PDF) Adobe Systems made the PDF specication 
av...', meta: {'file_path': 'sample_pdf_1.pdf', 'source_id': 'e7779175a0a5a7841df3f6153651ea94c895e8f31d65b2c269bf17cb6bea39a7', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}, embedding: vector of size 768),\n", + " Document(id=7c3d052b5d6514d9c23d920c9cbfaa549be434cf755f4a029cb45d4508eded14, content: 'Many of them are also not supported by popular third-party implementations of PDF.\n", + " Column 1 Column 2...', meta: {'file_path': 'sample_pdf_1.pdf', 'source_id': 'e7779175a0a5a7841df3f6153651ea94c895e8f31d65b2c269bf17cb6bea39a7', 'page_number': 1, 'split_id': 1, 'split_idx_start': 1317}, embedding: vector of size 768),\n", + " Document(id=3de510d8a1b6e3d8fc9dc1127b6b358caf2f5a2ac962de3a267fd14a9e86c1bc, content: 'Sample Title Slide\n", + " Jane Doe\n", + " Title of First Slide\n", + " This is a bullet point\n", + " This is another bullet point...', meta: {'file_path': 'sample_pptx.pptx', 'source_id': '163edc48d112b11d55c6b809fb4f632e0d1f9a73f44f3343b40fdd9317e2ea9a', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}, embedding: vector of size 768)]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "document_store.filter_documents()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Expanding SuperComponents\n", + "What makes SuperComponents special is the ability to expand them by calling their `_to_super_component_dict()` method. This converts the component to a generic `SuperComponent` that contains the pipeline constructed by the SuperComponent. From there on, the pipeline can be changed in any way.\n", + "\n", + "Let's use this feature to check what this pipeline would look like without using SuperComponents."
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-11T09:23:10.530152Z", + "start_time": "2025-02-11T09:23:10.521099Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "🚅 Components\n", + " - router: FileTypeRouter\n", + " - docx: DOCXToDocument\n", + " - html: HTMLToDocument\n", + " - json: JSONConverter\n", + " - md: MarkdownToDocument\n", + " - txt: TextFileToDocument\n", + " - pdf: PyPDFToDocument\n", + " - pptx: PPTXToDocument\n", + " - xlsx: XLSXToDocument\n", + " - joiner: DocumentJoiner\n", + " - csv: CSVToDocument\n", + "🛤️ Connections\n", + " - router.text/csv -> csv.sources (List[Union[str, Path, ByteStream]])\n", + " - router.application/vnd.openxmlformats-officedocument.wordprocessingml.document -> docx.sources (List[Union[str, Path, ByteStream]])\n", + " - router.text/html -> html.sources (List[Union[str, Path, ByteStream]])\n", + " - router.application/json -> json.sources (List[Union[str, Path, ByteStream]])\n", + " - router.text/markdown -> md.sources (List[Union[str, Path, ByteStream]])\n", + " - router.text/plain -> txt.sources (List[Union[str, Path, ByteStream]])\n", + " - router.application/pdf -> pdf.sources (List[Union[str, Path, ByteStream]])\n", + " - router.application/vnd.openxmlformats-officedocument.presentationml.presentation -> pptx.sources (List[Union[str, Path, ByteStream]])\n", + " - router.application/vnd.openxmlformats-officedocument.spreadsheetml.sheet -> xlsx.sources (List[Union[str, Path, ByteStream]])\n", + " - docx.documents -> joiner.documents (List[Document])\n", + " - html.documents -> joiner.documents (List[Document])\n", + " - json.documents -> joiner.documents (List[Document])\n", + " - md.documents -> joiner.documents (List[Document])\n", + " - txt.documents -> joiner.documents (List[Document])\n", + " - pdf.documents -> joiner.documents (List[Document])\n", + " - pptx.documents -> joiner.documents (List[Document])\n", + " - 
xlsx.documents -> joiner.documents (List[Document])\n", + " - csv.documents -> joiner.documents (List[Document])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "file_converter_serialized = file_converter._to_super_component_dict()\n", + "Pipeline.from_dict(file_converter_serialized[\"init_parameters\"][\"pipeline\"])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's a lot of components just for file conversion.\n", + "\n", + "\n", + "The full pipeline with the same functionality would look like this. 😵" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-11T09:40:41.120319Z", + "start_time": "2025-02-11T09:40:41.102739Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "🚅 Components\n", + " - file_classifier: FileTypeRouter\n", + " - text_converter: TextFileToDocument\n", + " - pdf_converter: PyPDFToDocument\n", + " - markdown_converter: TextFileToDocument\n", + " - html_converter: HTMLToDocument\n", + " - docx_converter: DOCXToDocument\n", + " - pptx_converter: PPTXToDocument\n", + " - xlsx_converter: XLSXToDocument\n", + " - csv_converter: CSVToDocument\n", + " - joiner: DocumentJoiner\n", + " - joiner_xlsx: DocumentJoiner\n", + " - splitter: DocumentSplitter\n", + " - cleaner: DocumentCleaner\n", + " - document_embedder: SentenceTransformersDocumentEmbedder\n", + " - writer: DocumentWriter\n", + "🛤️ Connections\n", + " - file_classifier.text/plain -> text_converter.sources (List[Union[str, Path, ByteStream]])\n", + " - file_classifier.application/pdf -> pdf_converter.sources (List[Union[str, Path, ByteStream]])\n", + " - file_classifier.text/markdown -> markdown_converter.sources (List[Union[str, Path, ByteStream]])\n", + " - file_classifier.text/html -> html_converter.sources (List[Union[str, Path, ByteStream]])\n", + " -
file_classifier.application/vnd.openxmlformats-officedocument.wordprocessingml.document -> docx_converter.sources (List[Union[str, Path, ByteStream]])\n", + " - file_classifier.application/vnd.openxmlformats-officedocument.presentationml.presentation -> pptx_converter.sources (List[Union[str, Path, ByteStream]])\n", + " - file_classifier.application/vnd.openxmlformats-officedocument.spreadsheetml.sheet -> xlsx_converter.sources (List[Union[str, Path, ByteStream]])\n", + " - file_classifier.text/csv -> csv_converter.sources (List[Union[str, Path, ByteStream]])\n", + " - text_converter.documents -> joiner.documents (List[Document])\n", + " - pdf_converter.documents -> joiner.documents (List[Document])\n", + " - markdown_converter.documents -> joiner.documents (List[Document])\n", + " - html_converter.documents -> joiner.documents (List[Document])\n", + " - docx_converter.documents -> joiner.documents (List[Document])\n", + " - pptx_converter.documents -> joiner.documents (List[Document])\n", + " - xlsx_converter.documents -> joiner.documents (List[Document])\n", + " - csv_converter.documents -> joiner.documents (List[Document])\n", + " - joiner.documents -> cleaner.documents (List[Document])\n", + " - cleaner.documents -> splitter.documents (List[Document])\n", + " - document_embedder.documents -> writer.documents (List[Document])" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from haystack import Pipeline\n", + "from haystack.components.converters.csv import CSVToDocument\n", + "from haystack.components.converters.docx import DOCXToDocument\n", + "from haystack.components.converters.html import HTMLToDocument\n", + "from haystack.components.converters.pptx import PPTXToDocument\n", + "from haystack.components.converters.pypdf import PyPDFToDocument\n", + "from haystack.components.converters.txt import TextFileToDocument\n", + "from haystack.components.converters.xlsx import XLSXToDocument\n", + "from 
haystack.components.embedders.sentence_transformers_document_embedder import SentenceTransformersDocumentEmbedder\n", + "from haystack.components.joiners.document_joiner import DocumentJoiner\n", + "from haystack.components.preprocessors.document_cleaner import DocumentCleaner\n", + "from haystack.components.preprocessors.document_splitter import DocumentSplitter\n", + "from haystack.components.routers.file_type_router import FileTypeRouter\n", + "from haystack.components.writers.document_writer import DocumentWriter\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "\n", + "file_classifier = FileTypeRouter(\n", + " mime_types=[\n", + " \"text/plain\",\n", + " \"application/pdf\",\n", + " \"text/markdown\",\n", + " \"text/html\",\n", + " \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\",\n", + " \"application/vnd.openxmlformats-officedocument.presentationml.presentation\",\n", + " \"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\",\n", + " \"text/csv\"\n", + " ])\n", + "text_converter = TextFileToDocument(encoding=\"utf-8\")\n", + "pdf_converter = PyPDFToDocument()\n", + "markdown_converter = TextFileToDocument(encoding=\"utf-8\")\n", + "html_converter = HTMLToDocument()\n", + "docx_converter = DOCXToDocument()\n", + "pptx_converter = PPTXToDocument()\n", + "xlsx_converter = XLSXToDocument()\n", + "csv_converter = CSVToDocument(encoding=\"utf-8\")\n", + "joiner = DocumentJoiner(join_mode=\"concatenate\", sort_by_score=False)\n", + "joiner_xlsx = DocumentJoiner(join_mode=\"concatenate\", sort_by_score=False)\n", + "splitter = DocumentSplitter(split_by=\"word\", split_length=250)\n", + "cleaner = DocumentCleaner()\n", + "document_embedder = SentenceTransformersDocumentEmbedder()\n", + "document_store = InMemoryDocumentStore()\n", + "writer = DocumentWriter(document_store=document_store)\n", + "\n", + "pipeline = Pipeline()\n", + "pipeline.add_component(\"file_classifier\", 
file_classifier)\n", + "pipeline.add_component(\"text_converter\", text_converter)\n", + "pipeline.add_component(\"pdf_converter\", pdf_converter)\n", + "pipeline.add_component(\"markdown_converter\", markdown_converter)\n", + "pipeline.add_component(\"html_converter\", html_converter)\n", + "pipeline.add_component(\"docx_converter\", docx_converter)\n", + "pipeline.add_component(\"pptx_converter\", pptx_converter)\n", + "pipeline.add_component(\"xlsx_converter\", xlsx_converter)\n", + "pipeline.add_component(\"csv_converter\", csv_converter)\n", + "pipeline.add_component(\"joiner\", joiner)\n", + "pipeline.add_component(\"joiner_xlsx\", joiner_xlsx)\n", + "pipeline.add_component(\"splitter\", splitter)\n", + "pipeline.add_component(\"cleaner\", cleaner)\n", + "pipeline.add_component(\"document_embedder\", document_embedder)\n", + "pipeline.add_component(\"writer\", writer)\n", + "pipeline.connect(\"file_classifier.text/plain\", \"text_converter.sources\")\n", + "pipeline.connect(\"file_classifier.application/pdf\", \"pdf_converter.sources\")\n", + "pipeline.connect(\"file_classifier.text/markdown\", \"markdown_converter.sources\")\n", + "pipeline.connect(\"file_classifier.text/html\", \"html_converter.sources\")\n", + "pipeline.connect(\"file_classifier.application/vnd.openxmlformats-officedocument.wordprocessingml.document\", \"docx_converter.sources\")\n", + "pipeline.connect(\"file_classifier.application/vnd.openxmlformats-officedocument.presentationml.presentation\", \"pptx_converter.sources\")\n", + "pipeline.connect(\"file_classifier.application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\", \"xlsx_converter.sources\")\n", + "pipeline.connect(\"file_classifier.text/csv\", \"csv_converter.sources\")\n", + "pipeline.connect(\"text_converter.documents\", \"joiner.documents\")\n", + "pipeline.connect(\"pdf_converter.documents\", \"joiner.documents\")\n", + "pipeline.connect(\"markdown_converter.documents\", \"joiner.documents\")\n", +
"pipeline.connect(\"html_converter.documents\", \"joiner.documents\")\n", + "pipeline.connect(\"docx_converter.documents\", \"joiner.documents\")\n", + "pipeline.connect(\"pptx_converter.documents\", \"joiner.documents\")\n", + "pipeline.connect(\"joiner.documents\", \"cleaner.documents\")\n", + "pipeline.connect(\"cleaner.documents\", \"splitter.documents\")\n", + "pipeline.connect(\"xlsx_converter.documents\", \"joiner.documents\")\n", + "pipeline.connect(\"csv_converter.documents\", \"joiner.documents\")\n", + "pipeline.connect(\"document_embedder.documents\", \"writer.documents\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/haystack_experimental/super_components/__init__.py b/haystack_experimental/super_components/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/haystack_experimental/super_components/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/haystack_experimental/super_components/converters/__init__.py b/haystack_experimental/super_components/converters/__init__.py new file mode 100644 index 00000000..7ff8a4a1 --- /dev/null +++ b/haystack_experimental/super_components/converters/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_experimental.super_components.converters.multi_file_converter import MultiFileConverter + +__all__ = ["MultiFileConverter"] diff --git a/haystack_experimental/super_components/converters/multi_file_converter.py
b/haystack_experimental/super_components/converters/multi_file_converter.py new file mode 100644 index 00000000..9faf1155 --- /dev/null +++ b/haystack_experimental/super_components/converters/multi_file_converter.py @@ -0,0 +1,185 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from enum import Enum +from typing import Any, Dict + +from haystack import Pipeline, component, default_from_dict, default_to_dict +from haystack.components.converters import ( + CSVToDocument, + DOCXToDocument, + HTMLToDocument, + JSONConverter, + MarkdownToDocument, + PPTXToDocument, + PyPDFToDocument, + TextFileToDocument, + XLSXToDocument, +) +from haystack.components.joiners import DocumentJoiner +from haystack.components.routers import FileTypeRouter + +from haystack_experimental.core.super_component import SuperComponent + + +class ConverterMimeType(str, Enum): + """ + MIME types of the file formats that MultiFileConverter routes to a dedicated converter. + """ + + CSV = "text/csv" + DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + HTML = "text/html" + JSON = "application/json" + MD = "text/markdown" + TEXT = "text/plain" + PDF = "application/pdf" + PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation" + XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + + +@component +class MultiFileConverter(SuperComponent): + """ + A file converter that handles conversion of multiple file types. + + The MultiFileConverter handles the following file types: + - CSV + - DOCX + - HTML + - JSON + - MD + - TEXT + - PDF (no OCR) + - PPTX + - XLSX + + Usage: + ``` + converter = MultiFileConverter() + converter.run(sources=["test.txt", "test.pdf"], meta={}) + ``` + """ + + def __init__( # noqa: PLR0915 + self, + encoding: str = "utf-8", + json_content_key: str = "content", + ) -> None: + """ + Initialize the MultiFileConverter. + + :param encoding: The encoding to use when reading files.
+ :param json_content_key: The key to use as content-field in a document when converting json-files. + """ + self.encoding = encoding + self.json_content_key = json_content_key + + # initialize components + router = FileTypeRouter( + mime_types=[ + ConverterMimeType.CSV.value, + ConverterMimeType.DOCX.value, + ConverterMimeType.HTML.value, + ConverterMimeType.JSON.value, + ConverterMimeType.MD.value, + ConverterMimeType.TEXT.value, + ConverterMimeType.PDF.value, + ConverterMimeType.PPTX.value, + ConverterMimeType.XLSX.value, + ], + # Ensure common extensions are registered. Tests on Windows fail otherwise. + additional_mimetypes={ + ConverterMimeType.DOCX.value: ".docx", + ConverterMimeType.XLSX.value: ".xlsx", + ConverterMimeType.PPTX.value: ".pptx", + }, + ) + + csv = CSVToDocument(encoding=self.encoding) + docx = DOCXToDocument() + html = HTMLToDocument() + json = JSONConverter(content_key=self.json_content_key) + md = MarkdownToDocument() + txt = TextFileToDocument(encoding=self.encoding) + pdf = PyPDFToDocument() + pptx = PPTXToDocument() + xlsx = XLSXToDocument() + + joiner = DocumentJoiner() + + # Create pipeline and add components + pp = Pipeline() + + pp.add_component("router", router) + + pp.add_component("docx", docx) + pp.add_component("html", html) + pp.add_component("json", json) + pp.add_component("md", md) + pp.add_component("txt", txt) + pp.add_component("pdf", pdf) + pp.add_component("pptx", pptx) + pp.add_component("xlsx", xlsx) + pp.add_component("joiner", joiner) + pp.add_component("csv", csv) + + # Connect every MIME-type output of the router to its converter (receiver socket left implicit). + pp.connect(f"router.{ConverterMimeType.CSV.value}", "csv") + pp.connect(f"router.{ConverterMimeType.DOCX.value}", "docx") + pp.connect(f"router.{ConverterMimeType.HTML.value}", "html") + pp.connect(f"router.{ConverterMimeType.JSON.value}", "json") + pp.connect(f"router.{ConverterMimeType.MD.value}", "md") +
pp.connect(f"router.{ConverterMimeType.TEXT.value}", "txt") + pp.connect(f"router.{ConverterMimeType.PDF.value}", "pdf") + pp.connect(f"router.{ConverterMimeType.PPTX.value}", "pptx") + pp.connect(f"router.{ConverterMimeType.XLSX.value}", "xlsx") + + # Feed the documents produced by each converter into the joiner. + pp.connect("docx.documents", "joiner.documents") + pp.connect("html.documents", "joiner.documents") + pp.connect("json.documents", "joiner.documents") + pp.connect("md.documents", "joiner.documents") + pp.connect("txt.documents", "joiner.documents") + pp.connect("pdf.documents", "joiner.documents") + pp.connect("pptx.documents", "joiner.documents") + + pp.connect("csv.documents", "joiner.documents") + pp.connect("xlsx.documents", "joiner.documents") + + # Socket-name mappings handed to SuperComponent: `joiner.documents` <-> `documents`, `sources`/`meta` <-> the router's inputs. + output_mapping = {"joiner.documents": "documents"} + input_mapping = { + "sources": ["router.sources"], + "meta": ["router.meta"] + } + + # NOTE(review): explicit two-argument super() - presumably deliberate because of the @component decorator; confirm before simplifying. + super(MultiFileConverter, self).__init__( + pipeline=pp, + output_mapping=output_mapping, + input_mapping=input_mapping + ) + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this instance to a dictionary. + + :returns: The result of `default_to_dict` called with this instance and its `encoding` and `json_content_key` values. + """ + return default_to_dict( + self, + encoding=self.encoding, + json_content_key=self.json_content_key, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "MultiFileConverter": + """ + Load this instance from a dictionary. + + :param data: The dictionary handed to `default_from_dict`. + :returns: The object returned by `default_from_dict(cls, data)`.
+ """ + return default_from_dict(cls, data) diff --git a/haystack_experimental/super_components/indexers/__init__.py b/haystack_experimental/super_components/indexers/__init__.py new file mode 100644 index 00000000..0f617cbd --- /dev/null +++ b/haystack_experimental/super_components/indexers/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_experimental.super_components.indexers.sentence_transformers_document_indexer import ( + SentenceTransformersDocumentIndexer, +) + +__all__ = [ + "SentenceTransformersDocumentIndexer", +] diff --git a/haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py b/haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py new file mode 100644 index 00000000..ead1db18 --- /dev/null +++ b/haystack_experimental/super_components/indexers/sentence_transformers_document_indexer.py @@ -0,0 +1,202 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, List, Literal, Optional + +from haystack import Pipeline, component, default_from_dict, default_to_dict +from haystack.components.embedders import SentenceTransformersDocumentEmbedder +from haystack.components.writers import DocumentWriter +from haystack.document_stores.types import DocumentStore, DuplicatePolicy +from haystack.utils import ( + ComponentDevice, + Secret, + deserialize_document_store_in_init_params_inplace, + deserialize_secrets_inplace, +) +from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs + +from haystack_experimental.core.super_component import SuperComponent + + +@component +class SentenceTransformersDocumentIndexer(SuperComponent): + """ + A document indexer that takes a list of documents, embeds them using SentenceTransformers, and stores them.
+ + Usage: + + ```python + >>> from haystack import Document + >>> from haystack.document_stores.in_memory import InMemoryDocumentStore + >>> document_store = InMemoryDocumentStore() + >>> doc = Document(content="I love pizza!") + >>> indexer = SentenceTransformersDocumentIndexer(document_store=document_store) + >>> indexer.warm_up() + >>> result = indexer.run(documents=[doc]) + >>> print(result) + {'documents_written': 1} + >>> document_store.count_documents() + 1 + ``` + """ + + def __init__( # pylint: disable=R0917 + self, + document_store: DocumentStore, + model: str = "sentence-transformers/all-mpnet-base-v2", + device: Optional[ComponentDevice] = None, + token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), + prefix: str = "", + suffix: str = "", + batch_size: int = 32, + progress_bar: bool = True, + normalize_embeddings: bool = False, + meta_fields_to_embed: Optional[List[str]] = None, + embedding_separator: str = "\n", + trust_remote_code: bool = False, + truncate_dim: Optional[int] = None, + model_kwargs: Optional[Dict[str, Any]] = None, + tokenizer_kwargs: Optional[Dict[str, Any]] = None, + config_kwargs: Optional[Dict[str, Any]] = None, + precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32", + duplicate_policy: DuplicatePolicy = DuplicatePolicy.OVERWRITE, + ) -> None: + """ + Initialize the SentenceTransformersDocumentIndexer component. + + :param document_store: The document store where the documents should be stored. + :param model: The embedding model to use (local path or Hugging Face model ID). + :param device: The device to use for loading the model. + :param token: The API token to download private models from Hugging Face. + :param prefix: String to add at the beginning of each document text. + :param suffix: String to add at the end of each document text. + :param batch_size: Number of documents to embed at once. 
+ :param progress_bar: If True, shows a progress bar when embedding documents. + :param normalize_embeddings: If True, embeddings are L2 normalized. + :param meta_fields_to_embed: List of metadata fields to embed along with the document text. + :param embedding_separator: Separator used to concatenate metadata fields to document text. + :param trust_remote_code: If True, allows custom models and scripts. + :param truncate_dim: Dimension to truncate sentence embeddings to. + :param model_kwargs: Additional keyword arguments for model initialization. + :param tokenizer_kwargs: Additional keyword arguments for tokenizer initialization. + :param config_kwargs: Additional keyword arguments for model configuration. + :param precision: The precision to use for the embeddings. + :param duplicate_policy: The duplicate policy to use when writing documents. + """ + self.document_store = document_store + self.model = model + self.device = device + self.token = token + self.prefix = prefix + self.suffix = suffix + self.batch_size = batch_size + self.progress_bar = progress_bar + self.normalize_embeddings = normalize_embeddings + self.meta_fields_to_embed = meta_fields_to_embed + self.embedding_separator = embedding_separator + self.trust_remote_code = trust_remote_code + self.truncate_dim = truncate_dim + self.model_kwargs = model_kwargs + self.tokenizer_kwargs = tokenizer_kwargs + self.config_kwargs = config_kwargs + self.precision = precision + self.duplicate_policy = duplicate_policy + + pipeline = Pipeline() + + pipeline.add_component( + "embedder", + SentenceTransformersDocumentEmbedder( + model=self.model, + device=self.device, + token=self.token, + prefix=self.prefix, + suffix=self.suffix, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + meta_fields_to_embed=self.meta_fields_to_embed, + embedding_separator=self.embedding_separator, + trust_remote_code=self.trust_remote_code, + 
truncate_dim=self.truncate_dim, + model_kwargs=self.model_kwargs, + tokenizer_kwargs=self.tokenizer_kwargs, + config_kwargs=self.config_kwargs, + precision=self.precision, + ), + ) + pipeline.add_component( + "writer", + DocumentWriter( + document_store=self.document_store, + policy=self.duplicate_policy, + ), + ) + + pipeline.connect("embedder.documents", "writer.documents") + + super(SentenceTransformersDocumentIndexer, self).__init__( + pipeline=pipeline, + input_mapping={"documents": ["embedder.documents"]}, + output_mapping={"writer.documents_written": "documents_written"}, + ) + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this instance to a dictionary. + """ + serialization_dict = default_to_dict( + self, + document_store=self.document_store.to_dict(), + model=self.model, + device=self.device.to_dict() if self.device else None, + token=self.token.to_dict() if self.token else None, + prefix=self.prefix, + suffix=self.suffix, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + normalize_embeddings=self.normalize_embeddings, + meta_fields_to_embed=self.meta_fields_to_embed, + embedding_separator=self.embedding_separator, + trust_remote_code=self.trust_remote_code, + truncate_dim=self.truncate_dim, + model_kwargs=self.model_kwargs, + tokenizer_kwargs=self.tokenizer_kwargs, + config_kwargs=self.config_kwargs, + precision=self.precision, + duplicate_policy=self.duplicate_policy.value, + ) + + if serialization_dict["init_parameters"].get("model_kwargs") is not None: + serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"]) + + return serialization_dict + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SentenceTransformersDocumentIndexer": + """ + Load an instance of this component from a dictionary. 
+ """ + deserialize_document_store_in_init_params_inplace(data) + init_params = data.get("init_parameters", {}) + + # Handle device deserialization + if init_params.get("device") is not None: + init_params["device"] = ComponentDevice.from_dict(init_params["device"]) + + # Handle secrets deserialization + deserialize_secrets_inplace(init_params, keys=["token"]) + + + + # Handle model kwargs deserialization + if init_params.get("model_kwargs") is not None: + deserialize_hf_model_kwargs(init_params["model_kwargs"]) + + # Handle duplicate policy deserialization + if policy_value := init_params.get("duplicate_policy"): + init_params["duplicate_policy"] = DuplicatePolicy(policy_value) + + data["init_parameters"] = init_params + return default_from_dict(cls, data) diff --git a/haystack_experimental/super_components/preprocessors/__init__.py b/haystack_experimental/super_components/preprocessors/__init__.py new file mode 100644 index 00000000..2e905683 --- /dev/null +++ b/haystack_experimental/super_components/preprocessors/__init__.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_experimental.super_components.preprocessors.document_preprocessor import DocumentPreProcessor + +__all__ = [ + "DocumentPreProcessor", +] diff --git a/haystack_experimental/super_components/preprocessors/document_preprocessor.py b/haystack_experimental/super_components/preprocessors/document_preprocessor.py new file mode 100644 index 00000000..a0ba3b62 --- /dev/null +++ b/haystack_experimental/super_components/preprocessors/document_preprocessor.py @@ -0,0 +1,188 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Callable, Dict, List, Literal, Optional + +from haystack import Pipeline, component, default_from_dict, default_to_dict +from haystack.components.preprocessors.document_cleaner import DocumentCleaner +from 
haystack.components.preprocessors.document_splitter import DocumentSplitter, Language +from haystack.utils import deserialize_callable, serialize_callable + +from haystack_experimental.core.super_component import SuperComponent + + +@component +class DocumentPreProcessor(SuperComponent): + """ + A SuperComponent that cleans documents and then splits them. + + This component composes a DocumentCleaner followed by a DocumentSplitter in a single pipeline. + It takes a list of documents as input and returns a processed list of documents. + + Usage: + ```python + from haystack import Document + doc = Document(content="I love pizza!") + preprocessor = DocumentPreProcessor() + results = preprocessor.run(documents=[doc]) + print(results["documents"]) + ``` + """ + + def __init__( # pylint: disable=R0917 + self, + # --- DocumentCleaner arguments --- + remove_empty_lines: bool = True, + remove_extra_whitespaces: bool = True, + remove_repeated_substrings: bool = False, + keep_id: bool = False, + remove_substrings: Optional[List[str]] = None, + remove_regex: Optional[str] = None, + unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None, + ascii_only: bool = False, + # --- DocumentSplitter arguments --- + split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word", + split_length: int = 250, + split_overlap: int = 0, + split_threshold: int = 0, + splitting_function: Optional[Callable[[str], List[str]]] = None, + respect_sentence_boundary: bool = False, + language: Language = "en", + use_split_rules: bool = True, + extend_abbreviations: bool = True, + ) -> None: + """ + Initialize a DocumentPreProcessor that first cleans documents and then splits them. + + **Cleaner Params**: + :param remove_empty_lines: If `True`, removes empty lines. + :param remove_extra_whitespaces: If `True`, removes extra whitespaces. + :param remove_repeated_substrings: If `True`, remove repeated substrings like headers/footers across pages. 
+ :param keep_id: If `True`, keeps the original document IDs. + :param remove_substrings: A list of strings to remove from the document content. + :param remove_regex: A regex pattern whose matches will be removed from the document content. + :param unicode_normalization: Unicode normalization form to apply to the text, e.g. `"NFC"`. + :param ascii_only: If `True`, convert text to ASCII only. + + **Splitter Params**: + :param split_by: The unit of splitting: "function", "page", "passage", "period", "word", "line", or "sentence". + :param split_length: The maximum number of units (words, lines, pages, etc.) in each split. + :param split_overlap: The number of overlapping units between consecutive splits. + :param split_threshold: The minimum number of units per split. If a split is smaller than this, it's merged + with the previous split. + :param splitting_function: A custom function for splitting if `split_by="function"`. + :param respect_sentence_boundary: If `True`, splits by words but tries not to break inside a sentence. + :param language: Language used by the sentence tokenizer if `split_by="sentence"` or + `respect_sentence_boundary=True`. + :param use_split_rules: Whether to apply additional splitting heuristics for the sentence splitter. + :param extend_abbreviations: Whether to extend the sentence splitter with curated abbreviations for certain + languages. 
+ """ + # Store arguments for serialization + self.remove_empty_lines = remove_empty_lines + self.remove_extra_whitespaces = remove_extra_whitespaces + self.remove_repeated_substrings = remove_repeated_substrings + self.keep_id = keep_id + self.remove_substrings = remove_substrings + self.remove_regex = remove_regex + self.unicode_normalization = unicode_normalization + self.ascii_only = ascii_only + + self.split_by = split_by + self.split_length = split_length + self.split_overlap = split_overlap + self.split_threshold = split_threshold + self.splitting_function = splitting_function + self.respect_sentence_boundary = respect_sentence_boundary + self.language = language + self.use_split_rules = use_split_rules + self.extend_abbreviations = extend_abbreviations + + # Instantiate sub-components + cleaner = DocumentCleaner( + remove_empty_lines=self.remove_empty_lines, + remove_extra_whitespaces=self.remove_extra_whitespaces, + remove_repeated_substrings=self.remove_repeated_substrings, + keep_id=self.keep_id, + remove_substrings=self.remove_substrings, + remove_regex=self.remove_regex, + unicode_normalization=self.unicode_normalization, + ascii_only=self.ascii_only, + ) + + splitter = DocumentSplitter( + split_by=self.split_by, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + splitting_function=self.splitting_function, + respect_sentence_boundary=self.respect_sentence_boundary, + language=self.language, + use_split_rules=self.use_split_rules, + extend_abbreviations=self.extend_abbreviations, + ) + + # Build the Pipeline + pp = Pipeline() + pp.add_component("cleaner", cleaner) + pp.add_component("splitter", splitter) + + # Connect the cleaner output to splitter + pp.connect("cleaner.documents", "splitter.documents") + + # Define how pipeline inputs/outputs map to sub-component inputs/outputs + input_mapping = { + # The pipeline input "documents" feeds into "cleaner.documents" + "documents": 
["cleaner.documents"] + } + # The pipeline output "documents" comes from "splitter.documents" + output_mapping = {"splitter.documents": "documents"} + + # Initialize the SuperComponent + super(DocumentPreProcessor, self).__init__( + pipeline=pp, + input_mapping=input_mapping, + output_mapping=output_mapping + ) + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this instance to a dictionary. + """ + data = default_to_dict( + self, + remove_empty_lines=self.remove_empty_lines, + remove_extra_whitespaces=self.remove_extra_whitespaces, + remove_repeated_substrings=self.remove_repeated_substrings, + keep_id=self.keep_id, + remove_substrings=self.remove_substrings, + remove_regex=self.remove_regex, + unicode_normalization=self.unicode_normalization, + ascii_only=self.ascii_only, + split_by=self.split_by, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + respect_sentence_boundary=self.respect_sentence_boundary, + language=self.language, + use_split_rules=self.use_split_rules, + extend_abbreviations=self.extend_abbreviations, + ) + + if self.splitting_function: + data["init_parameters"]["splitting_function"] = serialize_callable(self.splitting_function) + + return data + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "DocumentPreProcessor": + """ + Load this instance from a dictionary. 
+ """ + if "splitting_function" in data["init_parameters"]: + data["init_parameters"]["splitting_function"] = deserialize_callable( + data["init_parameters"]["splitting_function"] + ) + + return default_from_dict(cls, data) diff --git a/pyproject.toml b/pyproject.toml index 5f513beb..13e53b83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,15 @@ extra-dependencies = [ "amazon-bedrock-haystack>=1.1.1", "google-vertex-haystack>=2.0.0", # HierachicalSplitter w/ split_by="sentence" - "nltk" + "nltk", + # Tests for MultiFileConverter + "python-docx", + "trafilatura", + "markdown-it-py", + "mdit_plain", + "pypdf", + "python-pptx", + "openpyxl" ] [tool.hatch.envs.test.scripts] diff --git a/test/super_components/__init__.py b/test/super_components/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/test/super_components/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/test/super_components/converters/__init__.py b/test/super_components/converters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/super_components/converters/test_multi_file_converter.py b/test/super_components/converters/test_multi_file_converter.py new file mode 100644 index 00000000..0b9dc32f --- /dev/null +++ b/test/super_components/converters/test_multi_file_converter.py @@ -0,0 +1,129 @@ +import pytest +from pathlib import Path +from unittest.mock import Mock, patch + +from haystack import Document +from haystack.dataclasses import ByteStream +from haystack_experimental.core.super_component import SuperComponent +from haystack_experimental.super_components.converters.multi_file_converter import MultiFileConverter + +@pytest.fixture +def converter(): + return MultiFileConverter() + + +class TestMultiFileConverter: + def test_init_default_params(self, converter): + """Test initialization with default parameters""" + assert converter.encoding == "utf-8" + 
assert converter.json_content_key == "content" + assert isinstance(converter, SuperComponent) + + def test_init_custom_params(self, converter): + """Test initialization with custom parameters""" + converter = MultiFileConverter( + encoding="latin-1", + json_content_key="text" + ) + assert converter.encoding == "latin-1" + assert converter.json_content_key == "text" + + def test_to_dict(self, converter): + """Test serialization to dictionary""" + data = converter.to_dict() + assert data == { + "type": "haystack_experimental.super_components.converters.multi_file_converter.MultiFileConverter", + "init_parameters": { + "encoding": "utf-8", + "json_content_key": "content" + } + } + + def test_from_dict(self): + """Test deserialization from dictionary""" + data = { + "type": "haystack_experimental.super_components.converters.multi_file_converter.MultiFileConverter", + "init_parameters": { + "encoding": "latin-1", + "json_content_key": "text" + } + } + conv = MultiFileConverter.from_dict(data) + assert conv.encoding == "latin-1" + assert conv.json_content_key == "text" + + @pytest.mark.parametrize( + "suffix,file_path", + [ + ("csv", "csv/sample_1.csv"), + ("docx", "docx/sample_docx.docx"), + ("html", "html/what_is_haystack.html"), + ("json", "json/json_conversion_testfile.json"), + ("md", "markdown/sample.md"), + ("pdf", "pdf/sample_pdf_1.pdf"), + ("pptx", "pptx/sample_pptx.pptx"), + ("txt", "txt/doc_1.txt"), + ("xlsx", "xlsx/table_empty_rows_and_columns.xlsx"), + ] + ) + @pytest.mark.integration + def test_run(self, test_files_path, converter, suffix, file_path): + paths = [test_files_path / file_path] + output = converter.run(sources=paths) + docs = output["documents"] + + assert len(docs) == 1 + assert isinstance(docs[0], Document) + assert docs[0].content is not None + assert docs[0].meta["file_path"].endswith(suffix) + + def test_run_with_meta(self, test_files_path, converter): + """Test conversion with metadata""" + paths = [test_files_path / "txt" / "doc_1.txt"] 
+ meta = {"language": "en", "author": "test"} + output = converter.run(sources=paths, meta=meta) + docs = output["documents"] + assert docs[0].meta["language"] == "en" + assert docs[0].meta["author"] == "test" + + def test_run_with_bytestream(self, test_files_path, converter): + """Test converting ByteStream input""" + bytestream = ByteStream( + data=b"test content", + mime_type="text/plain", + meta={"file_path": "test.txt"} + ) + output = converter.run(sources=[bytestream]) + docs = output["documents"] + assert len(docs) == 1 + assert docs[0].content == "test content" + assert docs[0].meta["file_path"] == "test.txt" + + def test_run_error_handling(self, test_files_path, converter, caplog): + """Test error handling for non-existent files""" + paths = [test_files_path / "non_existent.txt"] + with caplog.at_level("WARNING"): + output = converter.run(sources=paths) + assert "Could not read" in caplog.text + assert len(output["documents"]) == 0 + + @pytest.mark.integration + def test_run_all_file_types(self, test_files_path, converter): + """Test converting all supported file types in parallel""" + paths = [ + test_files_path / "csv" / "sample_1.csv", + test_files_path / "docx" / "sample_docx.docx", + test_files_path / "html" / "what_is_haystack.html", + test_files_path / "json" / "json_conversion_testfile.json", + test_files_path / "markdown" / "sample.md", + test_files_path / "txt" / "doc_1.txt", + test_files_path / "pdf" / "sample_pdf_1.pdf", + test_files_path / "pptx" / "sample_pptx.pptx", + test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx" + ] + output = converter.run(sources=paths) + docs = output["documents"] + + # Verify we got a document for each file + assert len(docs) == len(paths) + assert all(isinstance(doc, Document) for doc in docs) diff --git a/test/super_components/indexers/__init__.py b/test/super_components/indexers/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/test/super_components/indexers/__init__.py @@ 
-0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/test/super_components/indexers/test_document_indexer.py b/test/super_components/indexers/test_document_indexer.py new file mode 100644 index 00000000..19442c1d --- /dev/null +++ b/test/super_components/indexers/test_document_indexer.py @@ -0,0 +1,126 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import Mock, ANY +from uuid import UUID +import pytest + +from haystack_experimental.super_components.indexers import SentenceTransformersDocumentIndexer + +from haystack import Document, Pipeline +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.components.embedders import SentenceTransformersDocumentEmbedder +from haystack.components.writers import DocumentWriter + + +class TestDocumentIndexer: + @pytest.fixture + def indexer(self) -> SentenceTransformersDocumentIndexer: + return SentenceTransformersDocumentIndexer(document_store=InMemoryDocumentStore()) + + @pytest.fixture + def embedding_backend(self, indexer: SentenceTransformersDocumentIndexer, monkeypatch: pytest.MonkeyPatch) -> Mock: + backend = Mock() + backend.embed.return_value = [ + [0.3, 0.4, 0.01, 0.7], + [0.1, 0.9, 0.87, 0.3], + ] + + embedder = indexer.pipeline.get_component("embedder") + monkeypatch.setattr(embedder, "embedding_backend", backend) + + return backend + + def test_init(self, indexer: SentenceTransformersDocumentIndexer) -> None: + assert isinstance(indexer.pipeline, Pipeline) + assert indexer.input_mapping == {"documents": ["embedder.documents"]} + assert indexer.output_mapping == {"writer.documents_written": "documents_written"} + + embedder = indexer.pipeline.get_component("embedder") + assert isinstance(embedder, SentenceTransformersDocumentEmbedder) + + writer = indexer.pipeline.get_component("writer") + assert isinstance(writer, DocumentWriter) + assert 
isinstance(writer.document_store, InMemoryDocumentStore) + assert writer.document_store.bm25_tokenization_regex == r"(?u)\b\w\w+\b" + assert writer.document_store.bm25_algorithm == "BM25L" + assert writer.document_store.bm25_parameters == {} + assert writer.document_store.embedding_similarity_function == "dot_product" + assert UUID(writer.document_store.index, version=4) + + def test_from_dict(self) -> None: + indexer = SentenceTransformersDocumentIndexer.from_dict( + { + "init_parameters": { + 'document_store': { + 'init_parameters': { + 'bm25_algorithm': 'BM25L', + 'bm25_parameters': {}, + 'bm25_tokenization_regex': '(?u)\\b\\w\\w+\\b', + 'embedding_similarity_function': 'dot_product', + 'index': '28f84766-11b7-4eac-bb75-3ee4e8d56958' + }, + 'type': 'haystack.document_stores.in_memory.document_store.InMemoryDocumentStore' + }, + "prefix": "", + "suffix": "", + "batch_size": 32, + "embedding_separator": "\n", + "meta_fields_to_embed": None, + "duplicate_policy": "overwrite", + }, + "type": "haystack_experimental.super_components.indexers.sentence_transformers_document_indexer.SentenceTransformersDocumentIndexer", + } + ) + assert isinstance(indexer, SentenceTransformersDocumentIndexer) + + def test_to_dict(self, indexer: SentenceTransformersDocumentIndexer) -> None: + expected = {'init_parameters': {'batch_size': 32, + 'config_kwargs': None, + 'device': None, + 'document_store': {'init_parameters': {'bm25_algorithm': 'BM25L', + 'bm25_parameters': {}, + 'bm25_tokenization_regex': '(?u)\\b\\w\\w+\\b', + 'embedding_similarity_function': 'dot_product', + 'index': ANY}, + 'type': 'haystack.document_stores.in_memory.document_store.InMemoryDocumentStore'}, + 'duplicate_policy': 'overwrite', + 'embedding_separator': '\n', + 'meta_fields_to_embed': None, + 'model': 'sentence-transformers/all-mpnet-base-v2', + 'model_kwargs': None, + 'normalize_embeddings': False, + 'precision': 'float32', + 'prefix': '', + 'progress_bar': True, + 'suffix': '', + 'token': {'env_vars': 
['HF_API_TOKEN', 'HF_TOKEN'], + 'strict': False, + 'type': 'env_var'}, + 'tokenizer_kwargs': None, + 'truncate_dim': None, + 'trust_remote_code': False}, + 'type': 'haystack_experimental.super_components.indexers.sentence_transformers_document_indexer.SentenceTransformersDocumentIndexer' + } + assert indexer.to_dict() == expected + + def test_warm_up(self, indexer: SentenceTransformersDocumentIndexer, monkeypatch: pytest.MonkeyPatch) -> None: + with monkeypatch.context() as m: + m.setattr(indexer.pipeline, "warm_up", Mock()) + + indexer.warm_up() + + indexer.pipeline.warm_up.assert_called_once() + + def test_run(self, indexer: SentenceTransformersDocumentIndexer, embedding_backend: Mock) -> None: + documents = [ + Document(content="Test document"), + Document(content="Another test document"), + ] + + indexer.warm_up() + result = indexer.run(documents=documents) + + embedding_backend.embed.assert_called_once + assert result == {"documents_written": len(documents)} diff --git a/test/test_files/csv/sample_1.csv b/test/test_files/csv/sample_1.csv new file mode 100644 index 00000000..7cf88f29 --- /dev/null +++ b/test/test_files/csv/sample_1.csv @@ -0,0 +1,4 @@ +Name,Age +John Doe,27 +Jane Smith,37 +Mike Johnson,47 diff --git a/test/test_files/docx/sample_docx.docx b/test/test_files/docx/sample_docx.docx new file mode 100644 index 00000000..3a740ac9 Binary files /dev/null and b/test/test_files/docx/sample_docx.docx differ diff --git a/test/test_files/html/what_is_haystack.html b/test/test_files/html/what_is_haystack.html new file mode 100644 index 00000000..2d62b206 --- /dev/null +++ b/test/test_files/html/what_is_haystack.html @@ -0,0 +1,1634 @@ + + + + + + + + + + What is Haystack? | Haystack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ 🎃 We're participating in Hacktoberfest 2023! + + + + + +
+
+ + + +
+ +
+
+
+ + + + + +
+

What is Haystack?

+

Haystack is the open source Python framework by deepset for building custom apps with large language models (LLMs). It lets you quickly try out the latest models in natural language processing (NLP) while being flexible and easy to use. Our inspiring community of users and builders has helped shape Haystack into what it is today: a complete framework for building production-ready NLP apps.

+

Building with Haystack

+

Haystack offers comprehensive tooling for developing state-of-the-art NLP systems that use LLMs (such as GPT-4, Falcon and similar) and Transformer models. With Haystack, you can effortlessly experiment with various models hosted on platforms like Hugging Face, OpenAI, Cohere, or even models deployed on SageMaker and your local models to find the perfect fit for your use case.

+ + + + + + + + + + + + + + + + + + + + + Model Providers + + +

Some examples of what you can build include:

+
    +
  • Semantic search on a large collection of documents in any language
  • +
  • Generative question answering on a knowledge base containing mixed types of information: images, text, and tables.
  • +
  • Natural language chatbots powered by cutting-edge generative models like GPT-4
  • +
  • An LLM-based Haystack Agent capable of resolving complex queries
  • +
  • Information extraction from documents to populate your database or build a knowledge graph
  • +
+

This is just a small subset of the kinds of systems that can be created in Haystack.

+

Functionality for all stages of an NLP project

+

A successful NLP project requires more than just the language models. As an end-to-end framework, Haystack assists you in building your system every step of the way, offering tooling for each stage of the NLP project life cycle:

+ +

But that’s not all: +metadata filtering, +model distillation, or the prompt hub, whatever your NLP heart desires, you’re likely to find it in Haystack. And if not? We’ll build it together.

+ + + + + + + + + + + + + + + + + + + + + + + Rest API + + +

Building blocks

+

Haystack uses a few simple but effective concepts to help you build fully functional and customized end-to-end NLP systems.

+

Components

+

At the core of Haystack are its components—fundamental building blocks that can perform tasks like document retrieval, text generation, or summarization. A single component is already quite powerful. It can manage local language models or communicate with a hosted model through an API.

+

While Haystack offers a bunch of components you can use out of the box, it also lets you create your own custom components. Explore the +collection of integrations that includes custom components developed by our community, which you can freely use.

+

You can chain components together to build pipelines, which are the foundation of the NLP app architecture in Haystack.

+

Pipelines

+

Pipelines are powerful structures made up of components, such as a Retriever and Reader, connected to infrastructure building blocks, such as a DocumentStore (for example, Elasticsearch or Weaviate), to form complex systems.

+

Haystack offers ready-made pipelines for most common tasks, such as question answering, document retrieval, or summarization. But it’s just as easy to design and create a custom pipeline for NLP scenarios that are way more complex than question answering.

+

Agents

+

The Haystack Agent makes use of a large language model to resolve complex tasks. When initializing the Agent, you give it a set of tools, which can be pipeline components or whole pipelines. The Agent can use those tools iteratively to arrive at an answer. When given a query, the Agent determines which tools are useful to answer this query and calls them in a loop until it gets the answer. This way, it can achieve much more than extractive or generative question answering pipelines.

+ + + + + + + + + + + + + + + + + + + + + Agent Tools + + +

Who’s it for?

+

Haystack is for everyone looking to build natural language apps—NLP enthusiasts and newbies alike. You don’t need to understand how the models work under the hood. With Haystack’s modular and flexible components, pipelines, and agents, all you need is some basic knowledge of Python to dive right in.

+

Our community

+

At the heart of Haystack is the vibrant open source community that thrives on the diverse backgrounds and skill sets of its members. We value collaboration greatly and encourage our users to shape Haystack actively through GitHub contributions. Our Discord channel is a space where community members can connect, seek help, and learn from each other.

+

We also organize live online and in-person events, webinars, and office hours, which are an opportunity to learn and grow.

+ + + + + + + + +
+ + + +
+ Join Discord +
+ + + +
+
+ +

Enter the Haystack universe

+ + + + +
+ + + +
+ +
+
+
+
+
+ + + + + + + + + + + + + + + +
+
+
+ +
+ + + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + diff --git a/test/test_files/json/json_conversion_testfile.json b/test/test_files/json/json_conversion_testfile.json new file mode 100644 index 00000000..04620d3a --- /dev/null +++ b/test/test_files/json/json_conversion_testfile.json @@ -0,0 +1 @@ +{"content": "Content from a json file"} \ No newline at end of file diff --git a/test/test_files/markdown/sample.md b/test/test_files/markdown/sample.md new file mode 100644 index 00000000..e02535b7 --- /dev/null +++ b/test/test_files/markdown/sample.md @@ -0,0 +1,2 @@ + +## A test document for markdown diff --git a/test/test_files/pdf/sample_pdf_1.pdf b/test/test_files/pdf/sample_pdf_1.pdf new file mode 100644 index 00000000..87259b89 Binary files /dev/null and b/test/test_files/pdf/sample_pdf_1.pdf differ diff --git a/test/test_files/pptx/sample_pptx.pptx b/test/test_files/pptx/sample_pptx.pptx new file mode 100644 index 00000000..1f6e3791 Binary files /dev/null and b/test/test_files/pptx/sample_pptx.pptx differ diff --git a/test/test_files/txt/doc_1.txt b/test/test_files/txt/doc_1.txt new file mode 100644 index 00000000..41218908 --- /dev/null +++ b/test/test_files/txt/doc_1.txt @@ -0,0 +1,2 @@ +Some text for testing. +Two lines in here. diff --git a/test/test_files/xlsx/table_empty_rows_and_columns.xlsx b/test/test_files/xlsx/table_empty_rows_and_columns.xlsx new file mode 100644 index 00000000..f599b8c3 Binary files /dev/null and b/test/test_files/xlsx/table_empty_rows_and_columns.xlsx differ