Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sub-line page sequence calculation #103

Merged
merged 2 commits into from
Dec 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions pdfannots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,13 +217,30 @@ def receive_layout(self, ltpage: LTPage) -> None:

self.page = None

def update_pageseq(self, component: LTComponent) -> bool:
    """Assign sequence numbers for objects on the page based on the nearest line of text.

    Increments ``self.compseq`` and offers the new number to every annotation and outline on
    the current page, counting how many of them adopted it.

    Returns True if we need to recurse on smaller sub-components (e.g. characters).
    """
    assert self.page is not None
    self.compseq += 1

    # Each object must be offered the component exactly once: a second call for the same
    # component would not report a hit again, and the count below would be wrong.
    hits = 0
    for obj in itertools.chain(self.page.annots, self.page.outlines):
        if obj.update_pageseq(component, self.compseq):
            hits += 1

    # If we have assigned the same sequence number to multiple objects, and there exist smaller
    # sub-components (e.g. characters within a line), we'll recurse on those assigning sequence
    # numbers to sub-components to disambiguate the hits, but first we must forget about the
    # current sequence number.
    # NB: This could be done more efficiently -- we really only need to disambiguate conflicts
    # that still exist after processing *all* the line-level components on the same page, but
    # that would require multiple rendering passes.
    if hits > 1 and isinstance(component, LTContainer) and len(component) > 1:
        for obj in itertools.chain(self.page.annots, self.page.outlines):
            obj.discard_pageseq(self.compseq)
        return True

    return False

def test_boxes(self, item: LTComponent) -> None:
"""Update the set of annotations whose boxes intersect with the area of the given item."""
Expand Down Expand Up @@ -288,21 +305,22 @@ def capture_char(self, text: str) -> None:
# Subscribe this annotation for post-context.
self.context_subscribers.append((self.charseq, a))

def render(self, item: LTItem) -> None:
def render(self, item: LTItem, pageseq_nested: bool = False) -> None:
"""
Helper for receive_layout, called recursively for every item on a page, in layout order.

Ref: https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html
"""
# Assign sequence numbers to items on the page based on their proximity to lines of text or
# to figures (which may contain bare LTChar elements).
if isinstance(item, (LTTextLine, LTFigure)):
self.update_pageseq(item)
if isinstance(item, (LTTextLine, LTFigure)) or (
pageseq_nested and isinstance(item, LTComponent)):
pageseq_nested = self.update_pageseq(item)

# If it's a container, recurse on nested items.
if isinstance(item, LTContainer):
for child in item:
self.render(child)
self.render(child, pageseq_nested)

# After the children of a text box, capture the end of the final
# line (logic derived from pdfminer.converter.TextConverter).
Expand Down
25 changes: 21 additions & 4 deletions pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ def get_height(self) -> float:
"""Return the height of the box."""
return self.y1 - self.y0

def get_area(self) -> float:
    """Return the area of the box (its height multiplied by its width)."""
    return self.get_height() * self.get_width()

def get_overlap(self, other: Box) -> float:
"""Compute the overlapping area (if any) with the provided box."""
x_overlap = max(0, min(other.x1, self.x1) - max(other.x0, self.x0))
Expand Down Expand Up @@ -218,18 +222,27 @@ def item_hit(self, item: LTComponent) -> bool:
and self.y >= item.y0
and self.y <= item.y1)

def update_pageseq(self, component: LTComponent, pageseq: int) -> bool:
    """If close enough to the given component, adopt its sequence number and return True.

    A position inside the component always adopts ``pageseq`` with a recorded distance of
    zero. Otherwise ``pageseq`` is adopted only when no sequence number has been assigned
    yet (``_pageseq == 0``) or the component is strictly closer than the one currently
    recorded; the recorded distance is the squared distance to the component's box.

    Returns False when the current assignment is kept.
    """
    assert pageseq > 0
    if self.item_hit(component):
        # This pos is inside the component area
        self._pageseq = pageseq
        self._pageseq_distance = 0.0
        return True

    d = Box.from_item(component).square_of_distance_to_closest_point((self.x, self.y))
    if self._pageseq == 0 or self._pageseq_distance > d:
        self._pageseq = pageseq
        self._pageseq_distance = d
        return True
    return False

def discard_pageseq(self, pageseq: int) -> None:
    """If we have been assigned the specified pageseq, forget about it.

    Resets both the sequence number and the recorded distance to zero, so that the next
    component offered will be adopted. Any other assignment is left untouched.
    """
    if self._pageseq == pageseq:
        self._pageseq = 0
        self._pageseq_distance = 0.0


@functools.total_ordering
Expand All @@ -246,10 +259,14 @@ def __lt__(self, other: object) -> bool:
return self.pos < other.pos
return NotImplemented

def update_pageseq(self, component: LTComponent, pageseq: int) -> bool:
    """Delegates to Pos.update_pageseq; returns False when this object has no position."""
    if self.pos is None:
        return False
    return self.pos.update_pageseq(component, pageseq)

def discard_pageseq(self, pageseq: int) -> None:
    """Delegates to Pos.discard_pageseq; does nothing when this object has no position."""
    if self.pos is not None:
        self.pos.discard_pageseq(pageseq)


class AnnotationType(enum.Enum):
Expand Down
20 changes: 10 additions & 10 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ def test(self) -> None:
(AnnotationType.Highlight, 'short highlight', 'not working'),
(AnnotationType.Text, None, None),
(AnnotationType.Highlight, None, 'Some more text'),
(AnnotationType.Text, 's', None),
(AnnotationType.Text, 'dual\n\npara note', None)]
(AnnotationType.Text, 'dual\n\npara note', None),
(AnnotationType.Text, 's', None)]
self.assertEqual(len(self.annots), len(EXPECTED))
for a, expected in zip(self.annots, EXPECTED):
self.assertEqual((a.subtype, a.contents, a.gettext()), expected)
Expand Down Expand Up @@ -270,14 +270,14 @@ class CaretAnnotations(ExtractionTestBase):

def test(self) -> None:
    """Check the order and reply linkage of the strike-out / caret annotation pair."""
    self.assertEqual(len(self.annots), 5)

    # The strike-out is expected first and the caret fourth in the extracted order.
    self.assertEqual(self.annots[0].subtype, AnnotationType.StrikeOut)
    self.assertEqual(self.annots[0].gettext(), 'Adobe Acrobat Reader')
    self.assertEqual(self.annots[3].subtype, AnnotationType.Caret)
    self.assertEqual(self.annots[3].contents, 'Google Chrome')

    # The strike-out is a reply to the caret, and the link is visible from both sides.
    self.assertEqual(self.annots[0].in_reply_to, self.annots[3])
    self.assertEqual(self.annots[3].replies, [self.annots[0]])
    self.assertEqual(self.annots[0].replies, [])
    self.assertEqual(self.annots[3].in_reply_to, None)


class PrinterTestBase(unittest.TestCase):
Expand Down
Loading