From bf44e89c341b70c55fa0a3d24e36a43b0da3570d Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 8 Nov 2024 20:48:21 +0100 Subject: [PATCH] fix docs and linting Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- .../document_loaders/docling.ipynb | 266 +++++++++++++----- .../document_loaders/docling.py | 62 +++- 2 files changed, 238 insertions(+), 90 deletions(-) diff --git a/docs/docs/integrations/document_loaders/docling.ipynb b/docs/docs/integrations/document_loaders/docling.ipynb index eed0e9b6400210..b84fedbb2cf87c 100644 --- a/docs/docs/integrations/document_loaders/docling.ipynb +++ b/docs/docs/integrations/document_loaders/docling.ipynb @@ -4,20 +4,34 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Docling" + "# Docling Loader" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "[Docling](https://github.com/DS4SD/docling) parses PDF, DOCX, PPTX, HTML, and other formats into a rich unified representation including document layout, tables etc., making them ready for generative AI workflows like RAG.\n", + "## Overview\n", "\n", - "Docling Loader, presented in this notebook, seamlessly integrates Docling into LangChain, enabling you to:\n", - "- use various document types in your LLM applications with ease and speed, and\n", - "- leverage Docling's rich representation for advanced, document-native grounding.\n", + "> [Docling](https://github.com/DS4SD/docling) parses PDF, DOCX, PPTX, HTML, and other formats into a rich unified representation including document layout, tables etc., making them ready for generative AI workflows like RAG.\n", + ">\n", + "> Docling Loader, presented in this notebook, seamlessly integrates Docling into LangChain, enabling you to:\n", + "> - use various document types in your LLM applications with ease and speed, and\n", + "> - leverage Docling's rich representation for advanced, document-native grounding.\n", + ">\n", + "> In the sections below, we showcase Docling Loader's usage, covering document loading specifics but also demonstrating an end-to-end RAG pipeline.\n", "\n", - "In the sections below, we showcase Docling Loader's usage, covering document loading specifics but also demonstrating an end-to-end RAG pipeline." + "This notebook provides a quick overview for getting started with Docling [document loader](https://python.langchain.com/docs/concepts/#document-loaders). 
For detailed documentation of all Docling Loader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.docling.DoclingLoader.html).\n", + "\n", + "### Integration details\n", + "\n", + "| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/docs/integrations/document_loaders/web_loaders/__module_name___loader)|\n", + "| :--- | :--- | :---: | :---: | :---: |\n", + "| [DoclingLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.docling.DoclingLoader.html) | [langchain_community](https://python.langchain.com/api_reference/community/index.html) | ✅ | ❌ | ❌ | \n", + "### Loader features\n", + "| Source | Document Lazy Loading | Native Async Support\n", + "| :---: | :---: | :---: | \n", + "| DoclingLoader | ✅ | ❌ | " ] }, { @@ -27,6 +41,20 @@ "## Setup" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the Docling document loader you will need to have `docling` installed besides `langchain-community`:" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -41,113 +69,164 @@ } ], "source": [ - "%pip install -qU docling langchain-community langchain langchain-text-splitters langchain-huggingface langchain-milvus pip" + "%pip install -qU docling langchain-community" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from langchain_community.document_loaders import DoclingLoader\n", - "\n", - "FILE_PATH = \"https://arxiv.org/pdf/2408.09869\"\n", + "## Initialization\n", "\n", - "def clip_text(text, threshold=100):\n", - " return f\"{text[:threshold]}[...]\" if len(text) > threshold else text\n", + "Now we can instantiate our loader and load documents.\n", "\n", - "def print_docs(docs):\n", - " for d in docs:\n", - " print(f\"metadata={d.metadata}, page_content={repr(clip_text(d.page_content))}\")" + "By default, `DoclingLoader` loads each input document as a LangChain `Document` with Markdown content." ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 2, "metadata": {}, + "outputs": [], "source": [ - "## Document loading\n", + "from langchain_community.document_loaders import DoclingLoader\n", + "\n", + "FILE_PATH = \"https://arxiv.org/pdf/2408.09869\"\n", "\n", - "Docling Loader can be used in two different modes, based on the export type:\n", - "- **Markdown** mode: for each input doc, outputs a LangChain `Document` with the Markdown representation of the input doc\n", - "- **Doc-chunks** mode: for each input doc, outputs the doc chunks (using Docling layout-aware chunking) as LangChain `Document`s " + "loader = DoclingLoader(file_path=FILE_PATH)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Using Markdown mode" + "## Load" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 3, "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using CPU. 
Note: This module is much faster with a GPU.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "docs[0].page_content[:200]='## Docling Technical Report\\n\\nVersion 1.0\\n\\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla '\n" + ] + } + ], "source": [ - "The markdown mode (default mode) returns the markdown export of the input documents.\n", - "\n", - "For customizing the markdown export, the user can pass the Docling markdown export kwargs (via keyword argument `md_export_kwargs`).\n", - "\n", - "Advanced tip: for customizing the conversion initialization and/or execution, the user can pass a Docling `DocumentConverter` object (via keyword argument `converter`) and/or the conversion kwargs (via keyword argument `convert_kwargs`) respectively." + "docs = loader.load()\n", + "print(f\"{docs[0].page_content[:200]=}\")" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "metadata={'source': 'https://arxiv.org/pdf/2408.09869'}, page_content='## Docling Technical Report\\n\\nVersion 1.0\\n\\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nik[...]'\n" + "{'source': 'https://arxiv.org/pdf/2408.09869'}\n" ] } ], "source": [ - "loader = DoclingLoader(\n", - " file_path=FILE_PATH,\n", - " export_type=DoclingLoader.ExportType.MARKDOWN,\n", - ")\n", - "docs = loader.load()\n", + "print(docs[0].metadata)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lazy Load\n", "\n", - "print_docs(docs)" + "Documents can also be loaded in a lazy fashion:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "doc_iter = loader.lazy_load()\n", + "for doc in doc_iter:\n", + " pass # you can operate on `doc` here" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "## Deep Dive\n", + "\n", + "### Initialization\n", "\n", - "Now that the `docs` have been loaded, any built-in (or custom) LangChain splitter can be used to split them.\n", + "The general syntax of `DoclingLoader` initialization is as follows (also see [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.docling.DoclingLoader.html)):\n", "\n", - "\n", - "For illustrating one option, below we show a possible splitting with a `MarkdownHeaderTextSplitter`:\n", + "### Document preparation using Markdown mode\n", "\n", - "" + "Following up on the steps further above, given that the `docs` have been loaded, any built-in (or custom) LangChain splitter can be used to split them. 
For example, below we show a possible splitting with a `MarkdownHeaderTextSplitter`:" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "metadata={'Header_2': 'Docling Technical Report'}, page_content='Version 1.0 \\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagen[...]'\n", - "metadata={'Header_2': 'Abstract'}, page_content='This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source p[...]'\n", - "metadata={'Header_2': '1 Introduction'}, page_content='Converting PDF documents back into a machine-processable format has been a major challenge for decad[...]'\n" + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -qU langchain-text-splitters" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "d.metadata={'Header_2': 'Docling Technical Report'}, d.page_content='Version 1.0 \\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar \\nAI4K Group, IBM Research Ruschlikon, Switzerland'\n", + "d.metadata={'Header_2': 'Abstract'}, d.page_content='This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.'\n" ] } ], @@ -159,37 +238,37 @@ ")\n", "md_splits = [split for doc in docs for split in splitter.split_text(doc.page_content)]\n", "\n", - "print_docs(md_splits[:3])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using doc chunks mode" + "for d in md_splits[:2]:\n", + " print(f\"{d.metadata=}, {d.page_content=}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The doc-chunks mode directly returns the document chunks including rich metadata such as the page number and the bounding box info.\n", + "### Document preparation using doc chunks mode\n", "\n", - "For custom chunking, the user can pass a Docling `BaseChunker` object (via keyword argument `chunker`)." + "The doc-chunks mode directly returns the document chunks including rich metadata such as the page number and the bounding box info." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using CPU. 
Note: This module is much faster with a GPU.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "metadata={'source': 'https://arxiv.org/pdf/2408.09869', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/0', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'page_header', 'prov': [{'page_no': 1, 'bbox': {'l': 17.088111877441406, 't': 583.2296752929688, 'r': 36.339778900146484, 'b': 231.99996948242188, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 38]}]}], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}}, page_content='arXiv:2408.09869v3 [cs.CL] 30 Aug 2024'\n", - "metadata={'source': 'https://arxiv.org/pdf/2408.09869', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/2', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 282.772216796875, 't': 512.7218017578125, 'r': 328.8624572753906, 'b': 503.340087890625, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 11]}]}], 'headings': ['Docling Technical Report'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}}, page_content='Version 1.0'\n", - "metadata={'source': 'https://arxiv.org/pdf/2408.09869', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/3', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 113.4512939453125, 't': 482.4101257324219, 'r': 498.396728515625, 'b': 439.45928955078125, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 295]}]}], 'headings': ['Docling Technical Report'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}}, page_content='Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berro[...]'\n" + "d.metadata={'source': 'https://arxiv.org/pdf/2408.09869', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/0', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'page_header', 'prov': [{'page_no': 1, 'bbox': {'l': 17.088111877441406, 't': 583.2296752929688, 'r': 36.339778900146484, 'b': 231.99996948242188, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 38]}]}], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}}, d.page_content='arXiv:2408.09869v3 [cs.CL] 30 Aug 2024'\n", + "d.metadata={'source': 'https://arxiv.org/pdf/2408.09869', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/2', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 282.772216796875, 't': 512.7218017578125, 'r': 328.8624572753906, 'b': 503.340087890625, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 11]}]}], 'headings': ['Docling Technical Report'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}}, d.page_content='Version 1.0'\n" ] } ], @@ -200,14 +279,15 @@ ")\n", "doc_splits = loader.load()\n", "\n", - "print_docs(doc_splits[:3])" + "for d in doc_splits[:2]:\n", + " print(f\"{d.metadata=}, {d.page_content=}\")" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ - "## RAG" + "### RAG example" ] }, { @@ -219,7 +299,24 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -qU langchain langchain-huggingface langchain-milvus" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -250,6 +347,9 @@ "\n", "\n", "def run_rag(documents, embedding, llm, question, prompt):\n", + " def clip_text(text, threshold=100):\n", + " return f\"{text[:threshold]}[...]\" if len(text) > threshold else text\n", + "\n", " milvus_uri = str(Path(mkdtemp()) / \"docling.db\") # or set as needed\n", " vectorstore = Milvus.from_documents(\n", " documents,\n", @@ -279,14 +379,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Using Markdown mode\n", + "#### RAG using Markdown mode\n", "\n", "Below we run the RAG pipeline passing it the output of the Markdown mode (after splitting):" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -297,7 +397,7 @@ "Which are the main AI models in Docling?\n", "\n", "Answer:\n", - "\"The main AI models in Docling are a layout analysis model called DocLayNet and a table structure recognition model called TableFormer. DocLayNet is an accurate object-detector for page elements, while[...]\"\n", + "\"The main AI models in Docling are a layout analysis model, which is an accurate object-detector for page elements, and TableFormer, a state-of-the-art table structure recognition model.\"\n", "\n", "Source 1:\n", " text: \"As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis m[...]\"\n", @@ -331,7 +431,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Using doc-chunk mode\n", + "#### RAG using doc-chunk mode\n", "\n", "Below we run the RAG pipeline passing it the output of the doc-chunk mode.\n", "\n", @@ -340,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -351,7 +451,7 @@ "Which are the main AI models in Docling?\n", "\n", "Answer:\n", - "\"The main AI models in Docling are:\\n\\n1. A layout analysis model, an accurate object-detector for page elements.\\n2. TableFormer, a state-of-the-art table structure recognition model.\"\n", + "\"The main AI models in Docling are a layout analysis model, which is an accurate object-detector for page elements, and TableFormer, a state-of-the-art table structure recognition model. These models a[...]\"\n", "\n", "Source 1:\n", " text: \"As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis m[...]\"\n", @@ -384,6 +484,20 @@ " prompt=PROMPT,\n", ")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all `DoclingLoader` features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.docling.DoclingLoader.html)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/libs/community/langchain_community/document_loaders/docling.py b/libs/community/langchain_community/document_loaders/docling.py index 209c808e773600..7709b864a480c4 100644 --- a/libs/community/langchain_community/document_loaders/docling.py +++ b/libs/community/langchain_community/document_loaders/docling.py @@ -6,31 +6,65 @@ class DoclingLoader(BaseLoader): - """Load PDF, HTML, DOCX, PPTX, Markdown, and more document formats using Docling. + """ + Docling document loader integration + + Setup: + Install ``docling`` besides ``langchain-community``. + + .. code-block:: bash - Example of markdown mode (default mode): + pip install -U docling langchain-community + + Instantiate: .. code-block:: python from langchain_community.document_loaders import DoclingLoader loader = DoclingLoader( - file_path="https://arxiv.org/pdf/2408.09869", - export_type=DoclingLoader.ExportType.MARKDOWN, + file_path = "https://arxiv.org/pdf/2408.09869", + # converter=..., + # convert_kwargs=..., + # export_type=..., + # md_export_kwargs=..., + # chunker=..., ) - documents = loader.load() - # # or directly get the splits: - # splits = loader.load_and_split() - Example of doc chunks mode: + Load: .. code-block:: python - from langchain_community.document_loaders import DoclingLoader + docs = loader.load() + print(docs[0].page_content[:100]) + print(docs[0].metadata) - loader = DoclingLoader( - file_path="https://arxiv.org/pdf/2408.09869", - export_type=DoclingLoader.ExportType.DOC_CHUNKS, - ) - splits = loader.load() + .. code-block:: python + + ## Docling Technical Report + + Version 1.0 + + Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nik + {'source': 'https://arxiv.org/pdf/2408.09869'} + + Lazy load: + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + ## Docling Technical Report + + Version 1.0 + + Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nik + {'source': 'https://arxiv.org/pdf/2408.09869'} """ class ExportType(str, Enum):