diff --git a/pyproject.toml b/pyproject.toml index 6b136f3..ba03b71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pdf-document-layout-analysis" -version = "2025.02.04.01" +version = "2025.02.04.02" description = "This tool is for PDF document layout analysis" license = { file = "LICENSE" } authors = [{ name = "HURIDOCS" }] diff --git a/src/pdf_features/Rectangle.py b/src/pdf_features/Rectangle.py index 3e6e2c3..ba457f0 100644 --- a/src/pdf_features/Rectangle.py +++ b/src/pdf_features/Rectangle.py @@ -2,19 +2,18 @@ import sys from lxml.etree import ElementBase +from pydantic import BaseModel sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -class Rectangle: - def __init__(self, left: int, top: int, right: int, bottom: int): - self.left = left - self.top = top - self.right = right - self.bottom = bottom - self.fix_wrong_areas() - self.width = self.right - self.left - self.height = self.bottom - self.top +class Rectangle(BaseModel): + left: int + top: int + right: int + bottom: int + width: int + height: int @staticmethod def from_poppler_tag_etree(tag: ElementBase) -> "Rectangle": @@ -26,7 +25,7 @@ def from_poppler_tag_etree(tag: ElementBase) -> "Rectangle": y_max = y_min + int(tag.attrib["height"]) if len(content) <= 1: - return Rectangle(x_min, y_min, x_max, y_max) + return Rectangle.from_coordinates(x_min, y_min, x_max, y_max) one_character_length = max(int((x_max - x_min) / len(content)), 2) if content[0] == " ": @@ -35,22 +34,7 @@ def from_poppler_tag_etree(tag: ElementBase) -> "Rectangle": if content[-1] == " ": x_max -= one_character_length - return Rectangle(x_min, y_min, x_max, y_max) - - def fix_wrong_areas(self): - if self.right == self.left: - self.left -= 1 - self.right += 1 - - if self.top == self.bottom: - self.top -= 1 - self.bottom += 1 - - if self.right < self.left: - self.right, self.left = self.left, self.right - - if self.bottom < self.top: - self.top, self.bottom = self.bottom, self.top + return Rectangle.from_coordinates(x_min, y_min, x_max, y_max) def get_intersection_percentage(self, rectangle: "Rectangle") -> float: x1 = max(self.left, rectangle.left) @@ -91,8 +75,33 @@ def merge_rectangles(rectangles: list["Rectangle"]) -> "Rectangle": right = max([rectangle.right for rectangle in rectangles]) bottom = max([rectangle.bottom for rectangle in rectangles]) - return Rectangle(left, top, right, bottom) + return Rectangle.from_coordinates(left, top, right, bottom) @staticmethod def from_width_height(left: int, top: int, width: int, height: int): - return Rectangle(left, top, left + width, top + height) + return Rectangle.from_coordinates(left, top, left + width, top + height) + + @staticmethod + def from_coordinates(left: float, top: float, right: float, bottom: float): + left, top, right, bottom = Rectangle.fix_wrong_areas(left, top, right, bottom) + width = right - left + height = bottom - top + return Rectangle(left=left, top=top, right=right, bottom=bottom, width=width, height=height) + + @staticmethod + def fix_wrong_areas(left: float, top: float, right: float, bottom: float): + if right == left: + left -= 1 + right += 1 + + if top == bottom: + top -= 1 + bottom += 1 + + if right < left: + right, left = left, right + + if bottom < top: + top, bottom = bottom, top + + return int(left), int(top), int(right), int(bottom) diff --git a/src/pdf_token_type_labels/Label.py b/src/pdf_token_type_labels/Label.py index 2677fde..64b2f56 100644 --- a/src/pdf_token_type_labels/Label.py +++ b/src/pdf_token_type_labels/Label.py @@ -13,7 +13,7 @@ class Label(BaseModel): metadata: str = "" def intersection_percentage(self, token_bounding_box: Rectangle): - label_bounding_box = Rectangle( + label_bounding_box = Rectangle.from_coordinates( left=self.left, top=self.top, right=self.left + self.width, bottom=self.top + self.height ) return label_bounding_box.get_intersection_percentage(token_bounding_box) diff --git a/src/pdf_tokens_type_trainer/PdfTrainer.py b/src/pdf_tokens_type_trainer/PdfTrainer.py index f811f1b..b50a6c1 100644 --- a/src/pdf_tokens_type_trainer/PdfTrainer.py +++ b/src/pdf_tokens_type_trainer/PdfTrainer.py @@ -66,7 +66,7 @@ def get_padding_token(segment_number: int, page_number: int): "", PdfFont("pad_font_id", False, False, 0.0, "#000000"), segment_number, - Rectangle(0, 0, 0, 0), + Rectangle.from_coordinates(0, 0, 0, 0), TokenType.TEXT, ) diff --git a/src/test_end_to_end.py b/src/test_end_to_end.py index 9c89544..2f431c4 100644 --- a/src/test_end_to_end.py +++ b/src/test_end_to_end.py @@ -269,9 +269,9 @@ def test_ocr_english(self): self.assertEqual(1, len(results_list)) self.assertEqual("Test text OCR", results_list[0]["text"]) self.assertEqual(248, results_list[0]["left"]) - self.assertEqual(263, results_list[0]["top"]) + self.assertEqual(264, results_list[0]["top"]) self.assertEqual(313, results_list[0]["width"]) - self.assertEqual(52, results_list[0]["height"]) + self.assertEqual(50, results_list[0]["height"]) self.assertEqual(1, results_list[0]["page_number"]) self.assertEqual(842, results_list[0]["page_width"]) self.assertEqual(595, results_list[0]["page_height"]) diff --git a/src/vgt/create_word_grid.py b/src/vgt/create_word_grid.py index 93d9872..8cc7e32 100644 --- a/src/vgt/create_word_grid.py +++ b/src/vgt/create_word_grid.py @@ -24,14 +24,14 @@ def get_words_positions(text: str, rectangle: Rectangle): width_per_letter = rectangle.width / text_len - words_bboxes = [Rectangle(rectangle.left, rectangle.top, rectangle.left + 5, rectangle.bottom)] + words_bboxes = [Rectangle.from_coordinates(rectangle.left, rectangle.top, rectangle.left + 5, rectangle.bottom)] words_bboxes[-1].width = 0 words_bboxes[-1].right = words_bboxes[-1].left for letter in text: if letter == " ": left = words_bboxes[-1].right + width_per_letter - words_bboxes.append(Rectangle(left, words_bboxes[-1].top, left + 5, words_bboxes[-1].bottom)) + words_bboxes.append(Rectangle.from_coordinates(left, words_bboxes[-1].top, left + 5, words_bboxes[-1].bottom)) words_bboxes[-1].width = 0 words_bboxes[-1].right = words_bboxes[-1].left else: @@ -52,11 +52,11 @@ def get_subwords_positions(word: str, rectangle: Rectangle): ids = [x[-2] for x in tokenizer(word_tokens)["input_ids"]] right = rectangle.left + len(word_tokens[0]) * width_per_letter - bboxes = [Rectangle(rectangle.left, rectangle.top, right, rectangle.bottom)] + bboxes = [Rectangle.from_coordinates(rectangle.left, rectangle.top, right, rectangle.bottom)] for subword in word_tokens[1:]: right = bboxes[-1].right + len(subword) * width_per_letter - bboxes.append(Rectangle(bboxes[-1].right, rectangle.top, right, rectangle.bottom)) + bboxes.append(Rectangle.from_coordinates(bboxes[-1].right, rectangle.top, right, rectangle.bottom)) return ids, bboxes