community: add Docling document loader

Signed-off-by: Panos Vagenas <[email protected]>
langchain-ai · Nov 8, 2024 · 5ae6ed4 · 5ae6ed4
1 parent b509747
commit 5ae6ed4
Show file tree

Hide file tree

Showing 8 changed files with 771 additions and 6 deletions.
diff --git a/docs/docs/integrations/document_loaders/docling.ipynb b/docs/docs/integrations/document_loaders/docling.ipynb
diff --git a/docs/src/theme/FeatureTables.js b/docs/src/theme/FeatureTables.js
@@ -27,7 +27,6 @@ const FEATURE_TABLES = {
                 "multimodal": true,
                 "local": false,
                 "apiLink": "https://python.langchain.com/api_reference/anthropic/chat_models/langchain_anthropic.chat_models.ChatAnthropic.html"
-
             },
             {
                 "name": "ChatMistralAI",
@@ -200,7 +199,7 @@ const FEATURE_TABLES = {
                 "link": "upstage",
                 "structured_output": true,
                 "tool_calling": true,
-                "json_mode": false, 
+                "json_mode": false,
                 "multimodal": false,
                 "local": false,
                 "apiLink": "https://python.langchain.com/api_reference/upstage/chat_models/langchain_upstage.chat_models.ChatUpstage.html"
@@ -211,7 +210,7 @@ const FEATURE_TABLES = {
                 "link": "databricks",
                 "structured_output": true,
                 "tool_calling": true,
-                "json_mode": false, 
+                "json_mode": false,
                 "multimodal": false,
                 "local": false,
                 "apiLink": "https://python.langchain.com/api_reference/upstage/chat_models/langchain_databricks.chat_models.ChatDatabricks.html"
@@ -222,7 +221,7 @@ const FEATURE_TABLES = {
                 "link": "ibm_watsonx",
                 "structured_output": true,
                 "tool_calling": true,
-                "json_mode": true, 
+                "json_mode": true,
                 "multimodal": false,
                 "local": false,
                 "apiLink": "https://python.langchain.com/api_reference/ibm/chat_models/langchain_ibm.chat_models.ChatWatsonx.html"
@@ -609,7 +608,7 @@ const FEATURE_TABLES = {
                 partnerPackage: false,
                 loaderName: "SharePointLoader",
                 apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.sharepoint.SharePointLoader.html"
-                
+
             },
             {
                 name: "Tencent COS Directory",
@@ -743,7 +742,7 @@ const FEATURE_TABLES = {
                 link: "twitter",
                 loaderName: "TwitterTweetLoader",
                 apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.twitter.TwitterTweetLoader.html"
-                
+
             },
             {
                 name: "Reddit",
@@ -777,6 +776,13 @@ const FEATURE_TABLES = {
                 api: "Package",
                 apiLink: "https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
             },
+            {
+                name: "Docling",
+                link: "docling",
+                source: "Uses Docling to load and parse web pages",
+                api: "Package",
+                apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html"
+            },
             {
                 name: "RecursiveURL",
                 link: "recursive_url",
@@ -831,6 +837,13 @@ const FEATURE_TABLES = {
                 api: "API",
                 apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
             },
+            {
+                name: "Docling",
+                link: "docling",
+                source: "Uses Docling to load PDFs",
+                api: "Package",
+                apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html"
+            },
             {
                 name: "MathPix",
                 link: "mathpix",
@@ -902,6 +915,12 @@ const FEATURE_TABLES = {
                 source: "Many file types (see https://docs.unstructured.io/platform/supported-file-types)",
                 apiLink: "https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
             },
+            {
+                name: "Docling",
+                link: "docling",
+                source: "Various file types (see https://ds4sd.github.io/docling/)",
+                apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html"
+            },
             {
                 name: "JSONLoader",
                 link: "json",

diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt
@@ -17,6 +17,7 @@ cohere>=4,<6
 databricks-vectorsearch>=0.21,<0.22
 datasets>=2.15.0,<3
 dgml-utils>=0.3.0,<0.4
+docling>=2.4.0,<3
 elasticsearch>=8.12.0,<9
 esprima>=4.0.1,<5
 faiss-cpu>=1,<2

diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py
@@ -158,6 +158,9 @@
     from langchain_community.document_loaders.doc_intelligence import (
         AzureAIDocumentIntelligenceLoader,
     )
+    from langchain_community.document_loaders.docling import (
+        DoclingLoader,
+    )
     from langchain_community.document_loaders.docugami import (
         DocugamiLoader,
     )
@@ -585,6 +588,7 @@
     "DiffbotLoader": "langchain_community.document_loaders.diffbot",
     "DirectoryLoader": "langchain_community.document_loaders.directory",
     "DiscordChatLoader": "langchain_community.document_loaders.discord",
+    "DoclingLoader": "langchain_community.document_loaders.docling",
     "DocugamiLoader": "langchain_community.document_loaders.docugami",
     "DocusaurusLoader": "langchain_community.document_loaders.docusaurus",
     "Docx2txtLoader": "langchain_community.document_loaders.word_document",
@@ -791,6 +795,7 @@ def __getattr__(name: str) -> Any:
     "DiffbotLoader",
     "DirectoryLoader",
     "DiscordChatLoader",
+    "DoclingLoader",
     "DocugamiLoader",
     "DocusaurusLoader",
     "Docx2txtLoader",

diff --git a/libs/community/langchain_community/document_loaders/docling.py b/libs/community/langchain_community/document_loaders/docling.py
@@ -0,0 +1,152 @@
+from enum import Enum
+from typing import Any, Dict, Iterable, Iterator, Optional, Union
+
+from langchain_core.document_loaders import BaseLoader
+from langchain_core.documents import Document
+
+
+class DoclingLoader(BaseLoader):
+    """Load PDF, HTML, DOCX, PPTX, Markdown, and more document formats using Docling.
+
+    Each source is converted with a Docling `DocumentConverter` and exported either
+    as one Markdown document per source or as one document per chunk.
+
+    Example of markdown mode (default mode):
+        .. code-block:: python
+
+            from langchain_community.document_loaders import DoclingLoader
+
+            loader = DoclingLoader(
+                file_path="https://arxiv.org/pdf/2408.09869",
+                export_type=DoclingLoader.ExportType.MARKDOWN,
+            )
+            documents = loader.load()
+            # # or directly get the splits:
+            # splits = loader.load_and_split()
+
+    Example of doc chunks mode:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import DoclingLoader
+
+            loader = DoclingLoader(
+                file_path="https://arxiv.org/pdf/2408.09869",
+                export_type=DoclingLoader.ExportType.DOC_CHUNKS,
+            )
+            splits = loader.load()
+    """
+
+    class ExportType(str, Enum):
+        """Enumeration of available export types."""
+
+        MARKDOWN = "markdown"
+        DOC_CHUNKS = "doc_chunks"
+
+    def __init__(
+        self,
+        file_path: Union[str, Iterable[str]],
+        *,
+        converter: Any = None,
+        convert_kwargs: Optional[Dict[str, Any]] = None,
+        export_type: ExportType = ExportType.MARKDOWN,
+        md_export_kwargs: Optional[Dict[str, Any]] = None,
+        chunker: Any = None,
+    ):
+        """Initialize with a file path.
+
+        Args:
+            file_path (Union[str, Iterable[str]]): File source as single str (URL or
+                local file) or Iterable thereof. A non-str iterable is copied into
+                a list so the loader can be loaded more than once.
+            converter (Union[docling.document_converter.DocumentConverter, None],
+                optional): Any specific `DocumentConverter` to use. Defaults to `None`
+                (i.e. converter defined internally).
+            convert_kwargs (Union[Dict[str, Any], None], optional): Any specific kwargs
+                to pass to conversion invocation. Defaults to `None` (i.e. behavior
+                defined internally).
+            export_type (ExportType, optional): The type to export to: either
+                `ExportType.MARKDOWN` (outputs Markdown of whole input file) or
+                `ExportType.DOC_CHUNKS` (outputs chunks based on chunker). Defaults to
+                `ExportType.MARKDOWN`.
+            md_export_kwargs (Union[Dict[str, Any], None], optional): Any specific
+                kwargs to pass to Markdown export (in case of `ExportType.MARKDOWN`).
+                Defaults to `None` (i.e. behavior defined internally).
+            chunker (Union[docling_core.transforms.chunker.BaseChunker, None],
+                optional): Any specific `BaseChunker` to use (in case of
+                `ExportType.DOC_CHUNKS`). Defaults to `None` (i.e. chunker defined
+                internally).
+
+        Raises:
+            ImportError: In case `docling` is not installed.
+        """
+
+        try:
+            from docling.document_converter import DocumentConverter
+            from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker
+        except ImportError as exc:
+            raise ImportError(
+                "docling package not found, please install it with `pip install docling`"  # noqa
+            ) from exc
+
+        # Copy non-str iterables into a list: a one-shot iterator (e.g. a generator)
+        # would otherwise be exhausted by the first `lazy_load()` pass.
+        self._file_paths = (
+            list(file_path)
+            if isinstance(file_path, Iterable) and not isinstance(file_path, str)
+            else [file_path]
+        )
+
+        self._converter: DocumentConverter = converter or DocumentConverter()
+        self._convert_kwargs = convert_kwargs if convert_kwargs is not None else {}
+        self._export_type = export_type
+        # Default export kwargs set `image_placeholder` to an empty string.
+        self._md_export_kwargs = (
+            md_export_kwargs
+            if md_export_kwargs is not None
+            else {"image_placeholder": ""}
+        )
+        self._chunker: BaseChunker = chunker or HierarchicalChunker()
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazily convert each source and yield the resulting documents.
+
+        Yields:
+            Document: For `ExportType.MARKDOWN`, one document per source holding its
+                Markdown export. For `ExportType.DOC_CHUNKS`, one document per chunk
+                with the exported chunk meta stored under `"dl_meta"`. The `"source"`
+                metadata entry is the originating file path in both modes.
+
+        Raises:
+            ValueError: In case the configured export type is not supported.
+        """
+
+        # Reject an unsupported export type up front, before spending time on a
+        # conversion whose result could not be exported anyway.
+        if self._export_type not in (
+            self.ExportType.MARKDOWN,
+            self.ExportType.DOC_CHUNKS,
+        ):
+            raise ValueError(f"Unexpected export type: {self._export_type}")
+
+        for file_path in self._file_paths:
+            conv_res = self._converter.convert(
+                source=file_path,
+                **self._convert_kwargs,
+            )
+            dl_doc = conv_res.document
+            if self._export_type == self.ExportType.MARKDOWN:
+                yield Document(
+                    page_content=dl_doc.export_to_markdown(**self._md_export_kwargs),
+                    metadata={"source": file_path},
+                )
+            else:
+                for chunk in self._chunker.chunk(dl_doc):
+                    yield Document(
+                        page_content=chunk.text,
+                        metadata={
+                            "source": file_path,
+                            "dl_meta": chunk.meta.export_json_dict(),
+                        },
+                    )
diff --git a/libs/community/tests/integration_tests/document_loaders/test_docling.py b/libs/community/tests/integration_tests/document_loaders/test_docling.py
@@ -0,0 +1,20 @@
+from pathlib import Path
+
+import pytest
+
+from langchain_community.document_loaders import DoclingLoader
+
+HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"
+
+
+@pytest.mark.requires("docling")
+def test_docling_load_as_markdown() -> None:
+    """Load the sample PDF as Markdown and expect exactly one document."""
+    pdf_location = str(HELLO_PDF.absolute())
+    markdown_loader = DoclingLoader(
+        file_path=pdf_location,
+        export_type=DoclingLoader.ExportType.MARKDOWN,
+    )
+    loaded = markdown_loader.load()
+    assert len(loaded) == 1
+    assert "Hello world!" in loaded[0].page_content