Improved docling parser (#8255)
GitOrigin-RevId: 3dcc905c5f8fea803674bbfff36c914248f3fc65
tryptofanik authored and Manul from Pathway committed Feb 21, 2025
1 parent 916c0b0 commit 325d453
Showing 5 changed files with 403 additions and 54 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]

### Added
- Added structure-aware chunking for `DoclingParser`
- Added `table_parsing_strategy` for `DoclingParser`

### Changed
- **BREAKING**: Changed the `DoclingParser` argument `parse_images` (bool) to `image_parsing_strategy` (Literal["llm"] | None).

### Removed


## [0.19.0] - 2025-02-20

### Added
@@ -24,6 +34,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
- **BREAKING**: `pw.stdlib.utils.AsyncTransformer.output_table` now returns a table with columns with Future data type.
- `pw.io.deltalake.read` can now read append-only tables without requiring explicit specification of primary key fields.


## [0.18.0] - 2025-02-07

### Added
14 changes: 12 additions & 2 deletions docs/2.developers/4.user-guide/50.llm-xpack/.parsers/parsers.md
@@ -89,9 +89,19 @@ Unstructured chunking is character-based rather than token-based, meaning you do

It is recommended to use this parser when extracting **text**, **tables**, and **images** from PDFs.

`DoclingParser` offers structure-aware chunking. It separates tables and images into distinct chunks, merges consecutive list items into a single chunk, and wraps each chunk with the relevant Markdown headings at the top and, when available, captions at the bottom (e.g., for tables and images).
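
For illustration only (the exact content depends on the parsed document), the text of a single table chunk produced this way could look roughly like this, shown here as a Python string:

```python
# Hypothetical example of one structure-aware chunk: heading path on top,
# the table rendered as Markdown, and its caption at the bottom.
chunk_text = (
    "## 3. Results\n"
    "\n"
    "| Model | Accuracy |\n"
    "|-------|----------|\n"
    "| A     | 0.91     |\n"
    "\n"
    "Table 2: Accuracy of the evaluated models."
)
```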

### Table parsing

There are two main approaches to parsing tables: (1) using the Docling engine or (2) using a multimodal LLM. The first runs Docling OCR on each table found in the PDF and converts it into Markdown. The second renders the table as an image, sends it to a multimodal LLM, and asks the model to parse it. For now, only LLMs exposing the same API interface as OpenAI are supported.

To choose between these two, set `table_parsing_strategy` to either `llm` or `docling`.
If you don't want to parse tables at all, set this argument to `None`.
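
A minimal sketch of this configuration (assuming `DoclingParser` is exposed as `pathway.xpacks.llm.parsers.DoclingParser`; the import path is not shown in this excerpt):

```python
from pathway.xpacks.llm.parsers import DoclingParser

# Use the Docling engine for tables; pass "llm" to parse tables with a
# multimodal LLM instead, or None to skip table parsing entirely.
parser = DoclingParser(table_parsing_strategy="docling")
```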


### Image parsing

- If `parse_images=True`, the parser detects images within the document, processes them with a multimodal LLM (such as OpenAI's GPT-4o), and embeds its descriptions in the Markdown output. If disabled, images are replaced with placeholders.
+ If `image_parsing_strategy="llm"`, the parser detects images within the document, processes them with a multimodal LLM (such as OpenAI's GPT-4o), and embeds their descriptions in the Markdown output. If set to `None`, images are replaced with placeholders.

::if{path="/llm-xpack/"}
Example:
@@ -103,7 +113,7 @@ from pathway.xpacks.llm.llms import OpenAIChat
multimodal_llm = OpenAIChat(model="gpt-4o-mini")
parser = DoclingParser(
-    parse_images=True,
+    image_parsing_strategy="llm",
    multimodal_llm=multimodal_llm,
    pdf_pipeline_options={  # use it to override our default options for parsing pdfs with docling
        "do_formula_enrichment": True,
157 changes: 156 additions & 1 deletion python/pathway/xpacks/llm/_parser_utils.py
@@ -8,7 +8,7 @@
import subprocess
import tempfile
from collections.abc import Callable
- from typing import Literal
+ from typing import Any, Iterable, Iterator, Literal

import PIL.Image
from pydantic import BaseModel
@@ -18,6 +18,22 @@
from pathway.optional_import import optional_imports
from pathway.xpacks.llm.constants import DEFAULT_VISION_MODEL

with optional_imports("xpack-llm-docs"):
    from docling.chunking import HierarchicalChunker, HybridChunker
    from docling_core.transforms.chunker import BaseChunk, DocChunk, DocMeta
    from docling_core.types.doc.document import (
        CodeItem,
        DocItem,
        DoclingDocument,
        LevelNumber,
        ListItem,
        PictureItem,
        SectionHeaderItem,
        TableItem,
        TextItem,
    )
    from docling_core.types.doc.labels import DocItemLabel

logger = logging.getLogger(__name__)


@@ -280,3 +296,142 @@ def _convert_pptx_to_pdf(contents: bytes) -> bytes:
    os.remove(pdf_temp_path)

    return pdf_contents


class _HierarchicalChunker(HierarchicalChunker):
    """
    This class is a wrapper for Docling's `HierarchicalChunker` that changes the behavior of
    the `chunk` method. In particular, it handles PictureItem and TableItem differently to allow
    our custom parsing of these items using a multimodal LLM.
    The majority of the code is copied from the original `HierarchicalChunker` with minor changes.
    Here is a link to the reference Docling code:
    https://github.com/DS4SD/docling-core/blob/main/docling_core/transforms/chunker/hierarchical_chunker.py#L113
    """

    def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[DocChunk]:
        r"""Chunk the provided document.

        Args:
            dl_doc (DLDocument): document to chunk

        Yields:
            Iterator[Chunk]: iterator over extracted chunks
        """
        heading_by_level: dict[LevelNumber, str] = {}
        list_items: list[TextItem] = []
        for item, level in dl_doc.iterate_items():
            captions = None
            if isinstance(item, DocItem):

                # first handle any merging needed
                if self.merge_list_items:
                    if isinstance(
                        item, ListItem
                    ) or (  # TODO remove when all captured as ListItem:
                        isinstance(item, TextItem)
                        and item.label == DocItemLabel.LIST_ITEM  # type: ignore
                    ):
                        list_items.append(item)
                        continue
                    elif list_items:  # need to yield
                        yield DocChunk(
                            text=self.delim.join([i.text for i in list_items]),
                            meta=DocMeta(
                                doc_items=list_items,  # type: ignore
                                headings=[
                                    heading_by_level[k]
                                    for k in sorted(heading_by_level)
                                ]
                                or None,
                                origin=dl_doc.origin,
                            ),
                        )
                        list_items = []  # reset

                if isinstance(item, SectionHeaderItem) or (
                    isinstance(item, TextItem)
                    and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]
                ):
                    level = (
                        item.level
                        if isinstance(item, SectionHeaderItem)
                        else (0 if item.label == DocItemLabel.TITLE else 1)
                    )
                    heading_by_level[level] = item.text

                    # remove headings of higher level as they just went out of scope
                    keys_to_del = [k for k in heading_by_level if k > level]
                    for k in keys_to_del:
                        heading_by_level.pop(k, None)
                    continue

                if (
                    isinstance(item, TextItem)
                    or ((not self.merge_list_items) and isinstance(item, ListItem))
                    or isinstance(item, CodeItem)
                ):
                    # we skip captions as they are handled separately by their parents
                    # we also skip other smaller elements like page footers, footnotes etc. (TBD)
                    if item.label in ["caption", "page_footer", "footnote"]:
                        continue
                    text = item.text
                # these two following elifs are Pathway custom behavior
                elif isinstance(item, TableItem):
                    table_df = item.export_to_dataframe()
                    text = table_df.to_markdown(index=False)
                    captions = [
                        c.text for c in [r.resolve(dl_doc) for r in item.captions]
                    ] or None
                elif isinstance(item, PictureItem):
                    text = ""
                    captions = [
                        cap.resolve(dl_doc).text for cap in item.captions
                    ] or None

                else:
                    continue
                c = DocChunk(
                    text=text,
                    meta=DocMeta(
                        doc_items=[item],
                        headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                        or None,
                        captions=captions,
                        origin=dl_doc.origin,
                    ),
                )
                yield c

        if self.merge_list_items and list_items:  # need to yield
            yield DocChunk(
                text=self.delim.join([i.text for i in list_items]),
                meta=DocMeta(
                    doc_items=list_items,  # type: ignore
                    headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                    or None,
                    origin=dl_doc.origin,
                ),
            )

class _HybridChunker(HybridChunker):
    """
    Wrapper for Docling's `HybridChunker` that uses the custom `_HierarchicalChunker` with changed
    behavior for serializing tables. This wrapper also changes the behavior of the `chunk` method
    slightly: it chunks hierarchically and then merges chunks with similar metadata.
    Chunking based on the length of the text is disabled for now.
    Here is a link to the reference Docling code:
    https://github.com/DS4SD/docling-core/blob/main/docling_core/transforms/chunker/hybrid_chunker.py#L34
    """

    _inner_chunker: _HierarchicalChunker = _HierarchicalChunker()

    def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
        res: Iterable[DocChunk]
        res = self._inner_chunker.chunk(dl_doc=dl_doc, **kwargs)
        # res = [x for c in res for x in self._split_by_doc_items(c)]
        # res = [x for c in res for x in self._split_using_plain_text(c)]
        res = list(res)
        if self.merge_peers:
            res = self._merge_chunks_with_matching_metadata(res)
        return iter(res)
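
For context, a minimal sketch of how these chunkers could be driven on a Docling document (assuming Docling's `DocumentConverter` API and a local `report.pdf`):

```python
from docling.document_converter import DocumentConverter

# Convert a PDF into a DoclingDocument, then chunk it with the wrapper above.
doc = DocumentConverter().convert("report.pdf").document
for chunk in _HybridChunker().chunk(dl_doc=doc):
    # Each DocChunk carries its section headings (and captions, if any) in its meta.
    print(chunk.meta.headings, chunk.text[:80])
```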
