diff --git a/docs/docs/integrations/document_loaders/docling.ipynb b/docs/docs/integrations/document_loaders/docling.ipynb new file mode 100644 index 00000000000000..eed0e9b6400210 --- /dev/null +++ b/docs/docs/integrations/document_loaders/docling.ipynb @@ -0,0 +1,410 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Docling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Docling](https://github.com/DS4SD/docling) parses PDF, DOCX, PPTX, HTML, and other formats into a rich unified representation including document layout, tables etc., making them ready for generative AI workflows like RAG.\n", + "\n", + "Docling Loader, presented in this notebook, seamlessly integrates Docling into LangChain, enabling you to:\n", + "- use various document types in your LLM applications with ease and speed, and\n", + "- leverage Docling's rich representation for advanced, document-native grounding.\n", + "\n", + "In the sections below, we showcase Docling Loader's usage, covering document loading specifics but also demonstrating an end-to-end RAG pipeline." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -qU docling langchain-community langchain langchain-text-splitters langchain-huggingface langchain-milvus pip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import DoclingLoader\n", + "\n", + "FILE_PATH = \"https://arxiv.org/pdf/2408.09869\"\n", + "\n", + "def clip_text(text, threshold=100):\n", + " return f\"{text[:threshold]}[...]\" if len(text) > threshold else text\n", + "\n", + "def print_docs(docs):\n", + " for d in docs:\n", + " print(f\"metadata={d.metadata}, page_content={repr(clip_text(d.page_content))}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Document loading\n", + "\n", + "Docling Loader can be used in two different modes, based on the export type:\n", + "- **Markdown** mode: for each input doc, outputs a LangChain `Document` with the Markdown representation of the input doc\n", + "- **Doc-chunks** mode: for each input doc, outputs the doc chunks (using Docling layout-aware chunking) as LangChain `Document`s " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Markdown mode" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The markdown mode (default mode) returns the markdown export of the input documents.\n", + "\n", + "For customizing the markdown export, the user can pass the Docling markdown export kwargs (via keyword argument `md_export_kwargs`).\n", + "\n", + "Advanced tip: for customizing the conversion initialization and/or execution, the user can pass a Docling `DocumentConverter` object (via keyword argument `converter`) and/or the conversion kwargs (via keyword argument `convert_kwargs`) respectively." 
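For instance, a minimal customization sketch could look as follows (the specific kwargs shown, e.g. `max_num_pages` for the conversion call and the `image_placeholder` value for the Markdown export, are illustrative only and should be checked against the installed Docling version):

```python
from docling.document_converter import DocumentConverter

loader = DoclingLoader(
    file_path=FILE_PATH,
    converter=DocumentConverter(),  # optional: pass a preconfigured converter
    convert_kwargs={"max_num_pages": 100},  # illustrative; forwarded to the converter's convert() call
    export_type=DoclingLoader.ExportType.MARKDOWN,
    md_export_kwargs={"image_placeholder": "<!-- image -->"},  # forwarded to the Markdown export
)
```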
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "metadata={'source': 'https://arxiv.org/pdf/2408.09869'}, page_content='## Docling Technical Report\\n\\nVersion 1.0\\n\\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nik[...]'\n" + ] + } + ], + "source": [ + "loader = DoclingLoader(\n", + " file_path=FILE_PATH,\n", + " export_type=DoclingLoader.ExportType.MARKDOWN,\n", + ")\n", + "docs = loader.load()\n", + "\n", + "print_docs(docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Now that the `docs` have been loaded, any built-in (or custom) LangChain splitter can be used to split them.\n", + "\n", + "\n", + "For illustrating one option, below we show a possible splitting with a `MarkdownHeaderTextSplitter`:\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "metadata={'Header_2': 'Docling Technical Report'}, page_content='Version 1.0 \\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagen[...]'\n", + "metadata={'Header_2': 'Abstract'}, page_content='This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source p[...]'\n", + "metadata={'Header_2': '1 Introduction'}, page_content='Converting PDF documents back into a machine-processable format has been a major challenge for decad[...]'\n" + ] + } + ], + "source": [ + "from langchain_text_splitters import MarkdownHeaderTextSplitter\n", + "\n", + "splitter = MarkdownHeaderTextSplitter(\n", + " headers_to_split_on=[(\"#\", \"Header_1\"), (\"##\", \"Header_2\"), (\"###\", \"Header_3\")],\n", + ")\n", + "md_splits = [split for doc in docs for split in splitter.split_text(doc.page_content)]\n", + "\n", + "print_docs(md_splits[:3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using doc chunks mode" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The doc-chunks mode directly returns the document chunks including rich metadata such as the page number and the bounding box info.\n", + "\n", + "For custom chunking, the user can pass a Docling `BaseChunker` object (via keyword argument `chunker`)." 
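For example, a chunker can be passed explicitly as sketched below (here simply instantiating `HierarchicalChunker`, which is also the chunker the loader falls back to internally; any other `BaseChunker` implementation could be substituted):

```python
from docling_core.transforms.chunker import HierarchicalChunker

loader = DoclingLoader(
    file_path=FILE_PATH,
    export_type=DoclingLoader.ExportType.DOC_CHUNKS,
    chunker=HierarchicalChunker(),  # any docling_core BaseChunker implementation works here
)
```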
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "metadata={'source': 'https://arxiv.org/pdf/2408.09869', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/0', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'page_header', 'prov': [{'page_no': 1, 'bbox': {'l': 17.088111877441406, 't': 583.2296752929688, 'r': 36.339778900146484, 'b': 231.99996948242188, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 38]}]}], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}}, page_content='arXiv:2408.09869v3 [cs.CL] 30 Aug 2024'\n", + "metadata={'source': 'https://arxiv.org/pdf/2408.09869', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/2', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 282.772216796875, 't': 512.7218017578125, 'r': 328.8624572753906, 'b': 503.340087890625, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 11]}]}], 'headings': ['Docling Technical Report'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}}, page_content='Version 1.0'\n", + "metadata={'source': 'https://arxiv.org/pdf/2408.09869', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/3', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 113.4512939453125, 't': 482.4101257324219, 'r': 498.396728515625, 'b': 439.45928955078125, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 295]}]}], 'headings': ['Docling Technical Report'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}}, page_content='Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berro[...]'\n" + ] + } + ], + "source": [ + "loader = DoclingLoader(\n", + " file_path=FILE_PATH,\n", + " export_type=DoclingLoader.ExportType.DOC_CHUNKS,\n", + ")\n", + "doc_splits = loader.load()\n", + "\n", + "print_docs(doc_splits[:3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RAG" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section we put together a demo RAG pipeline and run it using the documents loaded above." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "from pathlib import Path\n", + "from tempfile import mkdtemp\n", + "\n", + "from langchain.chains import create_retrieval_chain\n", + "from langchain.chains.combine_documents import create_stuff_documents_chain\n", + "from langchain_core.prompts import PromptTemplate\n", + "from langchain_huggingface import HuggingFaceEndpoint\n", + "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n", + "from langchain_milvus import Milvus\n", + "\n", + "# https://github.com/huggingface/transformers/issues/5486:\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", + "\n", + "QUESTION = \"Which are the main AI models in Docling?\"\n", + "PROMPT = PromptTemplate.from_template(\n", + " \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {input}\\nAnswer:\\n\",\n", + ")\n", + "HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n", + "HF_LLM_MODEL_ID = \"mistralai/Mixtral-8x7B-Instruct-v0.1\"\n", + "\n", + "embedding = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)\n", + "llm = HuggingFaceEndpoint(repo_id=HF_LLM_MODEL_ID)\n", + "\n", + "\n", + "def run_rag(documents, embedding, llm, question, prompt):\n", + " milvus_uri = str(Path(mkdtemp()) / \"docling.db\") # or set as needed\n", + " vectorstore = Milvus.from_documents(\n", + " documents,\n", + " embedding,\n", + " connection_args={\"uri\": milvus_uri},\n", + " drop_old=True,\n", + " )\n", + " retriever = vectorstore.as_retriever()\n", + " question_answer_chain = create_stuff_documents_chain(llm, prompt)\n", + " rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n", + " resp_dict = rag_chain.invoke({\"input\": question})\n", + "\n", + " answer = clip_text(resp_dict[\"answer\"], threshold=200)\n", + " print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{json.dumps(answer)}\")\n", + " for i, doc in enumerate(resp_dict[\"context\"]):\n", + " print()\n", + " print(f\"Source {i+1}:\")\n", + " print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=200))}\")\n", + " for key in doc.metadata:\n", + " if key != \"pk\":\n", + " val = doc.metadata.get(key)\n", + " clipped_val = clip_text(val) if isinstance(val, str) else val\n", + " print(f\" {key}: {clipped_val}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Markdown mode\n", + "\n", + "Below we run the RAG pipeline passing it the output of the Markdown mode (after splitting):" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question:\n", + "Which are the main AI models in Docling?\n", + "\n", + "Answer:\n", + "\"The main AI models in Docling are a layout analysis model called DocLayNet and a table structure recognition model called TableFormer. DocLayNet is an accurate object-detector for page elements, while[...]\"\n", + "\n", + "Source 1:\n", + " text: \"As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. 
The first model is a layout analysis m[...]\"\n", + " Header_2: 3.2 AI models\n", + "\n", + "Source 2:\n", + " text: \"This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layo[...]\"\n", + " Header_2: Abstract\n", + "\n", + "Source 3:\n", + " text: \"Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed e[...]\"\n", + " Header_2: 5 Applications\n", + "\n", + "Source 4:\n", + " text: \"Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecogni[...]\"\n", + " Header_2: 6 Future work and contributions\n" + ] + } + ], + "source": [ + "run_rag(\n", + " documents=md_splits,\n", + " embedding=embedding,\n", + " llm=llm,\n", + " question=QUESTION,\n", + " prompt=PROMPT,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using doc-chunk mode\n", + "\n", + "Below we run the RAG pipeline passing it the output of the doc-chunk mode.\n", + "\n", + "Notice how the sources now also contain document-level grounding (e.g. page number or bounding box information):" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Question:\n", + "Which are the main AI models in Docling?\n", + "\n", + "Answer:\n", + "\"The main AI models in Docling are:\\n\\n1. A layout analysis model, an accurate object-detector for page elements.\\n2. TableFormer, a state-of-the-art table structure recognition model.\"\n", + "\n", + "Source 1:\n", + " text: \"As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. 
The first model is a layout analysis m[...]\"\n", + " dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/34', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 3, 'bbox': {'l': 107.07593536376953, 't': 406.1695251464844, 'r': 504.1148681640625, 'b': 330.2677307128906, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 608]}]}], 'headings': ['3.2 AI models'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}\n", + " source: https://arxiv.org/pdf/2408.09869\n", + "\n", + "Source 2:\n", + " text: \"With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition[...]\"\n", + " dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/9', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 107.0031967163086, 't': 136.7283935546875, 'r': 504.04998779296875, 'b': 83.30133056640625, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 488]}]}], 'headings': ['1 Introduction'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}\n", + " source: https://arxiv.org/pdf/2408.09869\n", + "\n", + "Source 3:\n", + " text: \"Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecogni[...]\"\n", + " dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/60', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 106.92281341552734, 't': 323.5386657714844, 'r': 504.00347900390625, 'b': 258.76641845703125, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 543]}]}], 'headings': ['6 Future work and contributions'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}\n", + " source: https://arxiv.org/pdf/2408.09869\n", + "\n", + "Source 4:\n", + " text: \"This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. 
It is powered by state-of-the-art specialized AI models for layo[...]\"\n", + " dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/6', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 142.92593383789062, 't': 364.814697265625, 'r': 468.3847351074219, 'b': 300.651123046875, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 431]}]}], 'headings': ['Abstract'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 14981478401387673002, 'filename': '2408.09869v3.pdf'}}\n", + " source: https://arxiv.org/pdf/2408.09869\n" + ] + } + ], + "source": [ + "run_rag(\n", + " documents=doc_splits,\n", + " embedding=embedding,\n", + " llm=llm,\n", + " question=QUESTION,\n", + " prompt=PROMPT,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/src/theme/FeatureTables.js b/docs/src/theme/FeatureTables.js index 7217c245a12c9d..325b29b5b6ae9c 100644 --- a/docs/src/theme/FeatureTables.js +++ b/docs/src/theme/FeatureTables.js @@ -27,7 +27,6 @@ const FEATURE_TABLES = { "multimodal": true, "local": false, "apiLink": "https://python.langchain.com/api_reference/anthropic/chat_models/langchain_anthropic.chat_models.ChatAnthropic.html" - }, { "name": "ChatMistralAI", @@ -200,7 +199,7 @@ const FEATURE_TABLES = { "link": "upstage", "structured_output": true, "tool_calling": true, - "json_mode": false, + "json_mode": false, "multimodal": false, "local": false, "apiLink": "https://python.langchain.com/api_reference/upstage/chat_models/langchain_upstage.chat_models.ChatUpstage.html" @@ -211,7 +210,7 @@ const FEATURE_TABLES = { "link": "databricks", "structured_output": true, "tool_calling": true, - "json_mode": false, + "json_mode": false, "multimodal": false, "local": false, "apiLink": "https://python.langchain.com/api_reference/upstage/chat_models/langchain_databricks.chat_models.ChatDatabricks.html" @@ -222,7 +221,7 @@ const FEATURE_TABLES = { "link": "ibm_watsonx", "structured_output": true, "tool_calling": true, - "json_mode": true, + "json_mode": true, "multimodal": false, "local": false, "apiLink": "https://python.langchain.com/api_reference/ibm/chat_models/langchain_ibm.chat_models.ChatWatsonx.html" @@ -609,7 +608,7 @@ const FEATURE_TABLES = { partnerPackage: false, loaderName: "SharePointLoader", apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.sharepoint.SharePointLoader.html" - + }, { name: "Tencent COS Directory", @@ -743,7 +742,7 @@ const FEATURE_TABLES = { link: "twitter", loaderName: "TwitterTweetLoader", apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.twitter.TwitterTweetLoader.html" - + }, { name: "Reddit", @@ -777,6 +776,13 @@ const FEATURE_TABLES = { api: "Package", apiLink: "https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html" }, + { + name: "Docling", + link: "docling", + source: "Uses Docling to load and parse web pages", + api: "Package", + apiLink: 
"https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html" + }, { name: "RecursiveURL", link: "recursive_url", @@ -831,6 +837,13 @@ const FEATURE_TABLES = { api: "API", apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html" }, + { + name: "Docling", + link: "docling", + source: "Uses Docling to load PDFs", + api: "Package", + apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html" + }, { name: "MathPix", link: "mathpix", @@ -902,6 +915,12 @@ const FEATURE_TABLES = { source: "Many file types (see https://docs.unstructured.io/platform/supported-file-types)", apiLink: "https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html" }, + { + name: "Docling", + link: "docling", + source: "Various file types (see https://ds4sd.github.io/docling/)", + apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html" + }, { name: "JSONLoader", link: "json", diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt index 5be87606873d42..ef3e4d6023a2c1 100644 --- a/libs/community/extended_testing_deps.txt +++ b/libs/community/extended_testing_deps.txt @@ -17,6 +17,7 @@ cohere>=4,<6 databricks-vectorsearch>=0.21,<0.22 datasets>=2.15.0,<3 dgml-utils>=0.3.0,<0.4 +docling>=2.4.0,<3 elasticsearch>=8.12.0,<9 esprima>=4.0.1,<5 faiss-cpu>=1,<2 diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 2576093d3d48b3..0ae25e39468cd6 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -158,6 +158,9 @@ from langchain_community.document_loaders.doc_intelligence import ( AzureAIDocumentIntelligenceLoader, ) + from langchain_community.document_loaders.docling import ( + DoclingLoader, + ) from langchain_community.document_loaders.docugami import ( DocugamiLoader, ) @@ -585,6 +588,7 @@ "DiffbotLoader": "langchain_community.document_loaders.diffbot", "DirectoryLoader": "langchain_community.document_loaders.directory", "DiscordChatLoader": "langchain_community.document_loaders.discord", + "DoclingLoader": "langchain_community.document_loaders.docling", "DocugamiLoader": "langchain_community.document_loaders.docugami", "DocusaurusLoader": "langchain_community.document_loaders.docusaurus", "Docx2txtLoader": "langchain_community.document_loaders.word_document", @@ -791,6 +795,7 @@ def __getattr__(name: str) -> Any: "DiffbotLoader", "DirectoryLoader", "DiscordChatLoader", + "DoclingLoader", "DocugamiLoader", "DocusaurusLoader", "Docx2txtLoader", diff --git a/libs/community/langchain_community/document_loaders/docling.py b/libs/community/langchain_community/document_loaders/docling.py new file mode 100644 index 00000000000000..209c808e773600 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/docling.py @@ -0,0 +1,131 @@ +from enum import Enum +from typing import Any, Dict, Iterable, Iterator, Optional, Union + +from langchain_core.document_loaders import BaseLoader +from langchain_core.documents import Document + + +class DoclingLoader(BaseLoader): + """Load PDF, HTML, DOCX, PPTX, Markdown, and 
more document formats using Docling. + + Example of markdown mode (default mode): + .. code-block:: python + + from langchain_community.document_loaders import DoclingLoader + + loader = DoclingLoader( + file_path="https://arxiv.org/pdf/2408.09869", + export_type=DoclingLoader.ExportType.MARKDOWN, + ) + documents = loader.load() + # # or directly get the splits: + # splits = loader.load_and_split() + + Example of doc chunks mode: + .. code-block:: python + + from langchain_community.document_loaders import DoclingLoader + + loader = DoclingLoader( + file_path="https://arxiv.org/pdf/2408.09869", + export_type=DoclingLoader.ExportType.DOC_CHUNKS, + ) + splits = loader.load() + """ + + class ExportType(str, Enum): + """Enumeration of available export types.""" + + MARKDOWN = "markdown" + DOC_CHUNKS = "doc_chunks" + + def __init__( + self, + file_path: Union[str, Iterable[str]], + *, + converter: Any = None, + convert_kwargs: Optional[Dict[str, Any]] = None, + export_type: ExportType = ExportType.MARKDOWN, + md_export_kwargs: Optional[Dict[str, Any]] = None, + chunker: Any = None, + ): + """Initialize with a file path. + + Args: + file_path (Union[str, Iterable[str]]): File source as single str (URL or + local file) or Iterable thereof. + converter (Union[docling.document_converter.DocumentConverter, None], + optional): Any specific `DocumentConverter` to use. Defaults to `None` + (i.e. converter defined internally). + convert_kwargs (Union[Dict[str, Any], None], optional): Any specific kwargs + to pass to conversion invocation. Defaults to `None` (i.e. behavior + defined internally). + export_type (ExportType, optional): The type to export to: either + `ExportType.MARKDOWN` (outputs Markdown of whole input file) or + `ExportType.DOC_CHUNKS` (outputs chunks based on chunker). Defaults to + `ExportType.MARKDOWN`. + md_export_kwargs (Union[Dict[str, Any], None], optional): Any specific + kwargs to pass to Markdown export (in case of `ExportType.MARKDOWN`). + Defaults to `None` (i.e. behavior defined internally). + chunker (Union[docling_core.transforms.chunker.BaseChunker, None], + optional): Any specific `BaseChunker` to use (in case of + `ExportType.DOC_CHUNKS`). Defaults to `None` (i.e. chunker defined + internally). + + Raises: + ImportError: In case `docling` is not installed. 
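+
+        Example (a minimal sketch of passing multiple sources at once; the
+        second entry below is a hypothetical local file path, shown purely
+        for illustration):
+
+        .. code-block:: python
+
+            loader = DoclingLoader(
+                file_path=[
+                    "https://arxiv.org/pdf/2408.09869",
+                    "./my-local-report.docx",  # hypothetical local file
+                ],
+            )
+            # with the default MARKDOWN export type, one Document per source:
+            docs = loader.load()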
+ """ + + try: + from docling.document_converter import DocumentConverter + from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker + except ImportError: + raise ImportError( + "docling package not found, please install it with `pip install docling`" # noqa + ) + + self._file_paths = ( + file_path + if isinstance(file_path, Iterable) and not isinstance(file_path, str) + else [file_path] + ) + + self._converter: DocumentConverter = converter or DocumentConverter() + self._convert_kwargs = convert_kwargs if convert_kwargs is not None else {} + self._export_type = export_type + self._md_export_kwargs = ( + md_export_kwargs + if md_export_kwargs is not None + else {"image_placeholder": ""} + ) + self._chunker: BaseChunker = chunker or HierarchicalChunker() + + def lazy_load( + self, + ) -> Iterator[Document]: + """Lazy load documents.""" + + for file_path in self._file_paths: + conv_res = self._converter.convert( + source=file_path, + **self._convert_kwargs, + ) + dl_doc = conv_res.document + if self._export_type == self.ExportType.MARKDOWN: + yield Document( + page_content=dl_doc.export_to_markdown(**self._md_export_kwargs), + metadata={"source": file_path}, + ) + elif self._export_type == self.ExportType.DOC_CHUNKS: + chunk_iter = self._chunker.chunk(dl_doc) + for chunk in chunk_iter: + yield Document( + page_content=chunk.text, + metadata={ + "source": file_path, + "dl_meta": chunk.meta.export_json_dict(), + }, + ) + + else: + raise ValueError(f"Unexpected export type: {self._export_type}") diff --git a/libs/community/tests/integration_tests/document_loaders/test_docling.py b/libs/community/tests/integration_tests/document_loaders/test_docling.py new file mode 100644 index 00000000000000..4e991066cc362a --- /dev/null +++ b/libs/community/tests/integration_tests/document_loaders/test_docling.py @@ -0,0 +1,18 @@ +from pathlib import Path + +import pytest + +from langchain_community.document_loaders import DoclingLoader + +HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf" + + +@pytest.mark.requires("docling") +def test_docling_load_as_markdown() -> None: + loader = DoclingLoader( + file_path=str(HELLO_PDF.absolute()), + export_type=DoclingLoader.ExportType.MARKDOWN, + ) + docs = loader.load() + assert len(docs) == 1 + assert "Hello world!" 
in docs[0].page_content diff --git a/libs/community/tests/unit_tests/document_loaders/test_docling.py b/libs/community/tests/unit_tests/document_loaders/test_docling.py new file mode 100644 index 00000000000000..99c123be3bf936 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/test_docling.py @@ -0,0 +1,180 @@ +import json +from unittest.mock import MagicMock + +import pytest + +from langchain_community.document_loaders import DoclingLoader + +in_json_str = json.dumps( + { + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "sample", + "origin": { + "mimetype": "text/html", + "binary_hash": 42, + "filename": "sample.html", + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified", + }, + "body": { + "self_ref": "#/body", + "children": [{"$ref": "#/texts/0"}, {"$ref": "#/texts/1"}], + "name": "_root_", + "label": "unspecified", + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": {"$ref": "#/body"}, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Some text", + "text": "Some text", + }, + { + "self_ref": "#/texts/1", + "parent": {"$ref": "#/body"}, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Another paragraph", + "text": "Another paragraph", + }, + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "pages": {}, + } +) + + +out_json_obj = { + "root": [ + { + "id": None, + "metadata": { + "source": "https://example.com/foo.pdf", + "dl_meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/0", + "parent": {"$ref": "#/body"}, + "children": [], + "label": "paragraph", + "prov": [], + } + ], + "origin": { + "mimetype": "text/html", + "binary_hash": 42, + "filename": "sample.html", + }, + }, + }, + "page_content": "Some text", + "type": "Document", + }, + { + "id": None, + "metadata": { + "source": "https://example.com/foo.pdf", + "dl_meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/1", + "parent": {"$ref": "#/body"}, + "children": [], + "label": "paragraph", + "prov": [], + } + ], + "origin": { + "mimetype": "text/html", + "binary_hash": 42, + "filename": "sample.html", + }, + }, + }, + "page_content": "Another paragraph", + "type": "Document", + }, + ] +} + +out_md_obj = { + "root": [ + { + "id": None, + "metadata": {"source": "https://example.com/foo.pdf"}, + "page_content": "Some text\n\nAnother paragraph", + "type": "Document", + } + ] +} + + +@pytest.mark.requires("docling") +def test_load_as_markdown(monkeypatch: pytest.MonkeyPatch) -> None: + from docling_core.types import DoclingDocument as DLDocument + + mock_dl_doc = DLDocument.model_validate_json(in_json_str) + mock_response = MagicMock() + mock_response.document = mock_dl_doc + + monkeypatch.setattr( + "docling.document_converter.DocumentConverter.__init__", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + "docling.document_converter.DocumentConverter.convert", + lambda *args, **kwargs: mock_response, + ) + + loader = DoclingLoader(file_path="https://example.com/foo.pdf") + lc_doc_iter = loader.lazy_load() + act_lc_docs = list(lc_doc_iter) + assert len(act_lc_docs) == 1 + + act_data = {"root": [lc_doc.model_dump() for lc_doc in act_lc_docs]} + assert act_data == out_md_obj + + +@pytest.mark.requires("docling") +def test_load_as_doc_chunks(monkeypatch: pytest.MonkeyPatch) -> None: + from 
docling_core.types import DoclingDocument as DLDocument + + mock_dl_doc = DLDocument.model_validate_json(in_json_str) + mock_response = MagicMock() + mock_response.document = mock_dl_doc + + monkeypatch.setattr( + "docling.document_converter.DocumentConverter.__init__", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + "docling.document_converter.DocumentConverter.convert", + lambda *args, **kwargs: mock_response, + ) + + loader = DoclingLoader( + file_path="https://example.com/foo.pdf", + export_type=DoclingLoader.ExportType.DOC_CHUNKS, + ) + lc_doc_iter = loader.lazy_load() + act_lc_docs = list(lc_doc_iter) + assert len(act_lc_docs) == 2 + + act_data = {"root": [lc_doc.model_dump() for lc_doc in act_lc_docs]} + assert act_data == out_json_obj diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index b49a1b7cc4a2e1..f9e74be89e125f 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -59,6 +59,7 @@ "DiffbotLoader", "DirectoryLoader", "DiscordChatLoader", + "DoclingLoader", "DocugamiLoader", "DocusaurusLoader", "Docx2txtLoader",