From 8e3017e1c011db01e79eefd13792524fc0dfa4e9 Mon Sep 17 00:00:00 2001 From: paolobettelini Date: Mon, 4 Nov 2024 19:27:05 +0100 Subject: [PATCH] Fixed rare bug --- scripts/pdfextract.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/pdfextract.py b/scripts/pdfextract.py index bd01066..faec87e 100755 --- a/scripts/pdfextract.py +++ b/scripts/pdfextract.py @@ -35,7 +35,10 @@ def extract_text_from_pdf(pdf_file): # Get coordinates and text x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text().strip() - if text.startswith("!"): + # The '!' characters should be at coordinate 0 (actually, very very close) + # I encountered a situation where there were '!' symbols at absurd coordinates, + # so we add the < 0.001 condition just to be sure. + if text.startswith("!") and x < 0.001: # Clean characters. # TODO: ' needs to be replace, but the other characters should be supported