Skip to content

Commit

Permalink
community: add Docling document loader
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Nov 8, 2024
1 parent b509747 commit 5ae6ed4
Show file tree
Hide file tree
Showing 8 changed files with 771 additions and 6 deletions.
410 changes: 410 additions & 0 deletions docs/docs/integrations/document_loaders/docling.ipynb

Large diffs are not rendered by default.

31 changes: 25 additions & 6 deletions docs/src/theme/FeatureTables.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ const FEATURE_TABLES = {
"multimodal": true,
"local": false,
"apiLink": "https://python.langchain.com/api_reference/anthropic/chat_models/langchain_anthropic.chat_models.ChatAnthropic.html"

},
{
"name": "ChatMistralAI",
Expand Down Expand Up @@ -200,7 +199,7 @@ const FEATURE_TABLES = {
"link": "upstage",
"structured_output": true,
"tool_calling": true,
"json_mode": false,
"json_mode": false,
"multimodal": false,
"local": false,
"apiLink": "https://python.langchain.com/api_reference/upstage/chat_models/langchain_upstage.chat_models.ChatUpstage.html"
Expand All @@ -211,7 +210,7 @@ const FEATURE_TABLES = {
"link": "databricks",
"structured_output": true,
"tool_calling": true,
"json_mode": false,
"json_mode": false,
"multimodal": false,
"local": false,
"apiLink": "https://python.langchain.com/api_reference/upstage/chat_models/langchain_databricks.chat_models.ChatDatabricks.html"
Expand All @@ -222,7 +221,7 @@ const FEATURE_TABLES = {
"link": "ibm_watsonx",
"structured_output": true,
"tool_calling": true,
"json_mode": true,
"json_mode": true,
"multimodal": false,
"local": false,
"apiLink": "https://python.langchain.com/api_reference/ibm/chat_models/langchain_ibm.chat_models.ChatWatsonx.html"
Expand Down Expand Up @@ -609,7 +608,7 @@ const FEATURE_TABLES = {
partnerPackage: false,
loaderName: "SharePointLoader",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.sharepoint.SharePointLoader.html"

},
{
name: "Tencent COS Directory",
Expand Down Expand Up @@ -743,7 +742,7 @@ const FEATURE_TABLES = {
link: "twitter",
loaderName: "TwitterTweetLoader",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.twitter.TwitterTweetLoader.html"

},
{
name: "Reddit",
Expand Down Expand Up @@ -777,6 +776,13 @@ const FEATURE_TABLES = {
api: "Package",
apiLink: "https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
},
{
name: "Docling",
link: "docling",
source: "Uses Docling to load and parse web pages",
api: "Package",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html"
},
{
name: "RecursiveURL",
link: "recursive_url",
Expand Down Expand Up @@ -831,6 +837,13 @@ const FEATURE_TABLES = {
api: "API",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
},
{
name: "Docling",
link: "docling",
source: "Uses Docling to load PDFs",
api: "Package",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html"
},
{
name: "MathPix",
link: "mathpix",
Expand Down Expand Up @@ -902,6 +915,12 @@ const FEATURE_TABLES = {
source: "Many file types (see https://docs.unstructured.io/platform/supported-file-types)",
apiLink: "https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
},
{
name: "Docling",
link: "docling",
source: "Various file types (see https://ds4sd.github.io/docling/)",
apiLink: "https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.DoclingLoader.html"
},
{
name: "JSONLoader",
link: "json",
Expand Down
1 change: 1 addition & 0 deletions libs/community/extended_testing_deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ cohere>=4,<6
databricks-vectorsearch>=0.21,<0.22
datasets>=2.15.0,<3
dgml-utils>=0.3.0,<0.4
docling>=2.4.0,<3
elasticsearch>=8.12.0,<9
esprima>=4.0.1,<5
faiss-cpu>=1,<2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@
from langchain_community.document_loaders.doc_intelligence import (
AzureAIDocumentIntelligenceLoader,
)
from langchain_community.document_loaders.docling import (
DoclingLoader,
)
from langchain_community.document_loaders.docugami import (
DocugamiLoader,
)
Expand Down Expand Up @@ -585,6 +588,7 @@
"DiffbotLoader": "langchain_community.document_loaders.diffbot",
"DirectoryLoader": "langchain_community.document_loaders.directory",
"DiscordChatLoader": "langchain_community.document_loaders.discord",
"DoclingLoader": "langchain_community.document_loaders.docling",
"DocugamiLoader": "langchain_community.document_loaders.docugami",
"DocusaurusLoader": "langchain_community.document_loaders.docusaurus",
"Docx2txtLoader": "langchain_community.document_loaders.word_document",
Expand Down Expand Up @@ -791,6 +795,7 @@ def __getattr__(name: str) -> Any:
"DiffbotLoader",
"DirectoryLoader",
"DiscordChatLoader",
"DoclingLoader",
"DocugamiLoader",
"DocusaurusLoader",
"Docx2txtLoader",
Expand Down
131 changes: 131 additions & 0 deletions libs/community/langchain_community/document_loaders/docling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
from enum import Enum
from typing import Any, Dict, Iterable, Iterator, Optional, Union

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class DoclingLoader(BaseLoader):
    """Load PDF, HTML, DOCX, PPTX, Markdown, and more document formats using Docling.

    Example of markdown mode (default mode):

    .. code-block:: python

        from langchain_community.document_loaders import DoclingLoader

        loader = DoclingLoader(
            file_path="https://arxiv.org/pdf/2408.09869",
            export_type=DoclingLoader.ExportType.MARKDOWN,
        )
        documents = loader.load()
        # # or directly get the splits:
        # splits = loader.load_and_split()

    Example of doc chunks mode:

    .. code-block:: python

        from langchain_community.document_loaders import DoclingLoader

        loader = DoclingLoader(
            file_path="https://arxiv.org/pdf/2408.09869",
            export_type=DoclingLoader.ExportType.DOC_CHUNKS,
        )
        splits = loader.load()
    """

    class ExportType(str, Enum):
        """Enumeration of available export types."""

        MARKDOWN = "markdown"
        DOC_CHUNKS = "doc_chunks"

    def __init__(
        self,
        file_path: Union[str, Iterable[str]],
        *,
        converter: Any = None,
        convert_kwargs: Optional[Dict[str, Any]] = None,
        export_type: ExportType = ExportType.MARKDOWN,
        md_export_kwargs: Optional[Dict[str, Any]] = None,
        chunker: Any = None,
    ):
        """Initialize with a file path.

        Args:
            file_path (Union[str, Iterable[str]]): File source as single str (URL or
                local file) or Iterable thereof.
            converter (Union[docling.document_converter.DocumentConverter, None],
                optional): Any specific `DocumentConverter` to use. Defaults to `None`
                (i.e. converter defined internally).
            convert_kwargs (Union[Dict[str, Any], None], optional): Any specific kwargs
                to pass to conversion invocation. Defaults to `None` (i.e. behavior
                defined internally).
            export_type (ExportType, optional): The type to export to: either
                `ExportType.MARKDOWN` (outputs Markdown of whole input file) or
                `ExportType.DOC_CHUNKS` (outputs chunks based on chunker). Defaults to
                `ExportType.MARKDOWN`.
            md_export_kwargs (Union[Dict[str, Any], None], optional): Any specific
                kwargs to pass to Markdown export (in case of `ExportType.MARKDOWN`).
                Defaults to `None` (i.e. behavior defined internally).
            chunker (Union[docling_core.transforms.chunker.BaseChunker, None],
                optional): Any specific `BaseChunker` to use (in case of
                `ExportType.DOC_CHUNKS`). Defaults to `None` (i.e. chunker defined
                internally).

        Raises:
            ImportError: In case `docling` is not installed.
        """

        try:
            from docling.document_converter import DocumentConverter
            from docling_core.transforms.chunker import BaseChunker, HierarchicalChunker
        except ImportError as e:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "docling package not found, please install it with `pip install docling`"  # noqa
            ) from e

        # Materialize the sources into a list: a one-shot iterable (e.g. a
        # generator) would otherwise be exhausted by the first load, making any
        # subsequent `load()` / `lazy_load()` silently return nothing.
        if isinstance(file_path, Iterable) and not isinstance(file_path, str):
            self._file_paths = list(file_path)
        else:
            self._file_paths = [file_path]

        # Explicit `None` checks so that a caller-supplied object is never
        # replaced by the default just because it evaluates as falsy.
        self._converter: DocumentConverter = (
            converter if converter is not None else DocumentConverter()
        )
        self._convert_kwargs = convert_kwargs if convert_kwargs is not None else {}
        self._export_type = export_type
        self._md_export_kwargs = (
            md_export_kwargs
            if md_export_kwargs is not None
            else {"image_placeholder": ""}
        )
        self._chunker: BaseChunker = (
            chunker if chunker is not None else HierarchicalChunker()
        )

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load documents.

        Each source is converted with the configured converter. In
        `ExportType.MARKDOWN` mode one `Document` is yielded per source; in
        `ExportType.DOC_CHUNKS` mode one `Document` is yielded per chunk produced
        by the configured chunker, with the chunk metadata stored under the
        `dl_meta` metadata key.

        Yields:
            Document: The loaded documents, each with its `source` in the metadata.

        Raises:
            ValueError: In case the configured export type is not supported.
        """

        supported_types = (self.ExportType.MARKDOWN, self.ExportType.DOC_CHUNKS)
        # Reject a bad export type up front, before paying for any conversion.
        if self._export_type not in supported_types:
            raise ValueError(f"Unexpected export type: {self._export_type}")

        for file_path in self._file_paths:
            conv_res = self._converter.convert(
                source=file_path,
                **self._convert_kwargs,
            )
            dl_doc = conv_res.document
            if self._export_type == self.ExportType.MARKDOWN:
                yield Document(
                    page_content=dl_doc.export_to_markdown(**self._md_export_kwargs),
                    metadata={"source": file_path},
                )
            else:
                # Only DOC_CHUNKS can reach this branch (validated above).
                for chunk in self._chunker.chunk(dl_doc):
                    yield Document(
                        page_content=chunk.text,
                        metadata={
                            "source": file_path,
                            "dl_meta": chunk.meta.export_json_dict(),
                        },
                    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from pathlib import Path

import pytest

from langchain_community.document_loaders import DoclingLoader

# Sample PDF fixture, resolved relative to this test file: three directory
# levels up, then `examples/hello.pdf`.
HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"


@pytest.mark.requires("docling")
def test_docling_load_as_markdown() -> None:
    """Loading the sample PDF in Markdown mode yields a single document."""
    pdf_location = str(HELLO_PDF.absolute())
    markdown_loader = DoclingLoader(
        file_path=pdf_location,
        export_type=DoclingLoader.ExportType.MARKDOWN,
    )

    loaded = markdown_loader.load()

    # Markdown mode exports the whole input file as one document.
    assert len(loaded) == 1
    first_doc = loaded[0]
    assert "Hello world!" in first_doc.page_content
Loading

0 comments on commit 5ae6ed4

Please sign in to comment.