community: add Docling document loader #27987

Closed
481 changes: 481 additions & 0 deletions docs/docs/integrations/document_loaders/docling.ipynb

Large diffs are not rendered by default.
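Since the notebook diff is not rendered, here is a minimal usage sketch of the loader this PR introduces, mirroring the docstring shown further down in the diff (the arXiv URL is the same illustrative source used there; output shape assumed from the loader code below):

from langchain_community.document_loaders import DoclingLoader

# Illustrative source; any local path or URL supported by Docling should work.
loader = DoclingLoader(file_path="https://arxiv.org/pdf/2408.09869")

docs = loader.load()  # defaults to ExportType.MARKDOWN: one Document per input file
print(docs[0].page_content[:100])
print(docs[0].metadata)  # e.g. {'source': 'https://arxiv.org/pdf/2408.09869'}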

31 changes: 25 additions & 6 deletions docs/src/theme/FeatureTables.js
@@ -27,7 +27,6 @@ const FEATURE_TABLES = {
"multimodal": true,
"local": false,
"apiLink": "https://python.langchain.com/api_reference/anthropic/chat_models/langchain_anthropic.chat_models.ChatAnthropic.html"

},
{
"name": "ChatMistralAI",
@@ -200,7 +199,7 @@ const FEATURE_TABLES = {
"link": "upstage",
"structured_output": true,
"tool_calling": true,
"json_mode": false,
"json_mode": false,
"multimodal": false,
"local": false,
"apiLink": "https://python.langchain.com/api_reference/upstage/chat_models/langchain_upstage.chat_models.ChatUpstage.html"
@@ -211,7 +210,7 @@
"link": "databricks",
"structured_output": true,
"tool_calling": true,
"json_mode": false,
"json_mode": false,
"multimodal": false,
"local": false,
"apiLink": "https://python.langchain.com/api_reference/upstage/chat_models/langchain_databricks.chat_models.ChatDatabricks.html"
@@ -222,7 +221,7 @@
"link": "ibm_watsonx",
"structured_output": true,
"tool_calling": true,
"json_mode": true,
"json_mode": true,
"multimodal": false,
"local": false,
"apiLink": "https://python.langchain.com/api_reference/ibm/chat_models/langchain_ibm.chat_models.ChatWatsonx.html"
@@ -620,7 +619,7 @@ const FEATURE_TABLES = {
partnerPackage: false,
loaderName: "SharePointLoader",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.sharepoint.SharePointLoader.html"

},
{
name: "Tencent COS Directory",
@@ -754,7 +753,7 @@ const FEATURE_TABLES = {
link: "twitter",
loaderName: "TwitterTweetLoader",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.twitter.TwitterTweetLoader.html"

},
{
name: "Reddit",
@@ -788,6 +787,13 @@ const FEATURE_TABLES = {
api: "Package",
apiLink: "https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
},
{
name: "Docling",
link: "docling",
source: "Uses Docling to load and parse web pages",
api: "Package",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html"
},
{
name: "RecursiveURL",
link: "recursive_url",
@@ -842,6 +848,13 @@ const FEATURE_TABLES = {
api: "API",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
},
{
name: "Docling",
link: "docling",
source: "Uses Docling to load PDFs",
api: "Package",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html"
},
{
name: "MathPix",
link: "mathpix",
@@ -913,6 +926,12 @@ const FEATURE_TABLES = {
source: "Many file types (see https://docs.unstructured.io/platform/supported-file-types)",
apiLink: "https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
},
{
name: "Docling",
link: "docling",
source: "Various file types (see https://ds4sd.github.io/docling/)",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html"
},
{
name: "JSONLoader",
link: "json",
1 change: 1 addition & 0 deletions libs/community/extended_testing_deps.txt
@@ -17,6 +17,7 @@ cohere>=4,<6
databricks-vectorsearch>=0.21,<0.22
datasets>=2.15.0,<3
dgml-utils>=0.3.0,<0.4
docling>=2.4.0,<3
elasticsearch>=8.12.0,<9
esprima>=4.0.1,<5
faiss-cpu>=1,<2
@@ -158,6 +158,9 @@
from langchain_community.document_loaders.doc_intelligence import (
AzureAIDocumentIntelligenceLoader,
)
from langchain_community.document_loaders.docling import (
DoclingLoader,
)
from langchain_community.document_loaders.docugami import (
DocugamiLoader,
)
@@ -585,6 +588,7 @@
"DiffbotLoader": "langchain_community.document_loaders.diffbot",
"DirectoryLoader": "langchain_community.document_loaders.directory",
"DiscordChatLoader": "langchain_community.document_loaders.discord",
"DoclingLoader": "langchain_community.document_loaders.docling",
"DocugamiLoader": "langchain_community.document_loaders.docugami",
"DocusaurusLoader": "langchain_community.document_loaders.docusaurus",
"Docx2txtLoader": "langchain_community.document_loaders.word_document",
@@ -791,6 +795,7 @@ def __getattr__(name: str) -> Any:
"DiffbotLoader",
"DirectoryLoader",
"DiscordChatLoader",
"DoclingLoader",
"DocugamiLoader",
"DocusaurusLoader",
"Docx2txtLoader",
165 changes: 165 additions & 0 deletions libs/community/langchain_community/document_loaders/docling.py
@@ -0,0 +1,165 @@
from enum import Enum
from typing import Any, Dict, Iterable, Iterator, Optional, Union

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class DoclingLoader(BaseLoader):
"""
Docling document loader integration

Setup:
Install ``docling`` besides ``langchain-community``.

.. code-block:: bash

pip install -U docling langchain-community

Instantiate:
.. code-block:: python

from langchain_community.document_loaders import DoclingLoader

loader = DoclingLoader(
file_path = "https://arxiv.org/pdf/2408.09869",
# converter=...,
# convert_kwargs=...,
# export_type=...,
# md_export_kwargs=...,
# chunker=...,
)

Load:
.. code-block:: python

docs = loader.load()
print(docs[0].page_content[:100])
print(docs[0].metadata)

.. code-block:: python

## Docling Technical Report

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nik
{'source': 'https://arxiv.org/pdf/2408.09869'}

Lazy load:
.. code-block:: python

docs = []
docs_lazy = loader.lazy_load()

for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)

.. code-block:: python

## Docling Technical Report

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nik
{'source': 'https://arxiv.org/pdf/2408.09869'}
"""

    class ExportType(str, Enum):
        """Enumeration of available export types."""

        MARKDOWN = "markdown"
        DOC_CHUNKS = "doc_chunks"

    def __init__(
        self,
        file_path: Union[str, Iterable[str]],
        *,
        converter: Any = None,
        convert_kwargs: Optional[Dict[str, Any]] = None,
        export_type: ExportType = ExportType.MARKDOWN,
        md_export_kwargs: Optional[Dict[str, Any]] = None,
        chunker: Any = None,
    ):
        """Initialize with a file path.

        Args:
            file_path (Union[str, Iterable[str]]): File source as single str (URL or
                local file) or Iterable thereof.
            converter (Union[docling.document_converter.DocumentConverter, None],
                optional): Any specific `DocumentConverter` to use. Defaults to `None`
                (i.e. converter defined internally).
            convert_kwargs (Union[Dict[str, Any], None], optional): Any specific kwargs
                to pass to conversion invocation. Defaults to `None` (i.e. behavior
                defined internally).
            export_type (ExportType, optional): The type to export to: either
                `ExportType.MARKDOWN` (outputs Markdown of whole input file) or
                `ExportType.DOC_CHUNKS` (outputs chunks based on chunker). Defaults to
                `ExportType.MARKDOWN`.
            md_export_kwargs (Union[Dict[str, Any], None], optional): Any specific
                kwargs to pass to Markdown export (in case of `ExportType.MARKDOWN`).
                Defaults to `None` (i.e. behavior defined internally).
            chunker (Union[docling_core.transforms.chunker.BaseChunker, None],
                optional): Any specific `BaseChunker` to use (in case of
                `ExportType.DOC_CHUNKS`). Defaults to `None` (i.e. chunker defined
                internally).

        Raises:
            ImportError: In case `docling` is not installed.
        """
        try:
            from docling.document_converter import DocumentConverter
            from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker
        except ImportError:
            raise ImportError(
                "docling package not found, please install it with `pip install docling`"  # noqa
            )

        self._file_paths = (
            file_path
            if isinstance(file_path, Iterable) and not isinstance(file_path, str)
            else [file_path]
        )

        self._converter: DocumentConverter = converter or DocumentConverter()
        self._convert_kwargs = convert_kwargs if convert_kwargs is not None else {}
        self._export_type = export_type
        self._md_export_kwargs = (
            md_export_kwargs
            if md_export_kwargs is not None
            else {"image_placeholder": ""}
        )
        self._chunker: BaseChunker = chunker or HierarchicalChunker()

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load documents."""
        for file_path in self._file_paths:
            conv_res = self._converter.convert(
                source=file_path,
                **self._convert_kwargs,
            )
            dl_doc = conv_res.document
            if self._export_type == self.ExportType.MARKDOWN:
                yield Document(
                    page_content=dl_doc.export_to_markdown(**self._md_export_kwargs),
                    metadata={"source": file_path},
                )
            elif self._export_type == self.ExportType.DOC_CHUNKS:
                chunk_iter = self._chunker.chunk(dl_doc)
                for chunk in chunk_iter:
                    yield Document(
                        page_content=chunk.text,
                        metadata={
                            "source": file_path,
                            "dl_meta": chunk.meta.export_json_dict(),
                        },
                    )
            else:
                raise ValueError(f"Unexpected export type: {self._export_type}")
@@ -0,0 +1,18 @@
from pathlib import Path

import pytest

from langchain_community.document_loaders import DoclingLoader

HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"


@pytest.mark.requires("docling")
def test_docling_load_as_markdown() -> None:
    loader = DoclingLoader(
        file_path=str(HELLO_PDF.absolute()),
        export_type=DoclingLoader.ExportType.MARKDOWN,
    )
    docs = loader.load()
    assert len(docs) == 1
    assert "Hello world!" in docs[0].page_content