Skip to content

Commit

Permalink
fix regression in extract_texkeys_and_urls_from_pdf
Browse files (browse the repository at this point in the history)
  • Loading branch information
MJedr committed Sep 13, 2023
1 parent ee93261 commit de987f5
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 27 deletions.
63 changes: 38 additions & 25 deletions refextract/references/pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -55,24 +55,28 @@ def extract_texkeys_and_urls_from_pdf(pdf_file):
destinations = pdf.getNamedDestinations()
urls = extract_urls(pdf)
except Exception:
LOGGER.debug(u"PDF: Internal PyPDF2 error, no TeXkeys returned.")
LOGGER.debug("PDF: Internal PyPDF2 error, no TeXkeys returned.")
return []
# not all named destinations point to references
refs = []
for destination in destinations.items():
destination_key = destination[0].decode("utf-8") if isinstance(destination[0], ByteStringObject) else destination[0]
destination_key = (
destination[0].decode("utf-8")
if isinstance(destination[0], ByteStringObject)
else destination[0]
)
match = re_reference_in_dest.match(destination_key)
if match:
refs.append(destination)
try:
if _destinations_in_two_columns(pdf, refs):
LOGGER.debug(u"PDF: Using two-column layout")
LOGGER.debug("PDF: Using two-column layout")

def sortfunc(dest_couple):
return dest_couple[1]

else:
LOGGER.debug(u"PDF: Using single-column layout")
LOGGER.debug("PDF: Using single-column layout")

def sortfunc(dest_couple):
page, _, ypos, xpos = dest_couple[1]
Expand Down Expand Up @@ -102,24 +106,31 @@ def sortfunc(dest_couple):
texkey_url_list.append(current_texkey_urls_dict)
return texkey_url_list
except Exception:
LOGGER.debug(u"PDF: Impossible to determine layout, no TeXkeys returned")
LOGGER.debug("PDF: Impossible to determine layout, no TeXkeys returned")
return []


def _match_urls_with_reference(urls_to_match, reference, next_reference=None):
def _match_urls_with_reference(
urls_to_match, reference, next_reference=None, previous_reference=None
):
ref_page_number, ref_column, ref_y, _ = reference[1]
if next_reference:
next_ref_page_number, next_ref_col, next_ref_y, _ = next_reference[1]
urls_for_reference = set()
for (url_index, url) in enumerate(urls_to_match):
url_page_number, url_col, url_y, _ = url[1]
is_url_under_texkey = ref_y <= url_y
is_url_in_same_col = ref_column == url_col
is_url_in_next_col = url_col > ref_column
is_reference_on_same_page_as_url = ref_page_number == url_page_number
is_reference_on_previous_page_than_url = ref_page_number + 1 == url_page_number
if not next_reference:
if (
is_reference_on_same_page_as_url or
is_reference_on_previous_page_than_url
(
is_reference_on_same_page_as_url
and (is_url_in_same_col or is_url_in_next_col)
)
or is_reference_on_previous_page_than_url
) and is_url_under_texkey:
urls_for_reference.add(url[0])
continue
Expand All @@ -128,29 +139,31 @@ def _match_urls_with_reference(urls_to_match, reference, next_reference=None):
) and (ref_y <= url_y <= next_ref_y)
is_next_reference_on_the_same_page = next_ref_page_number == url_page_number
is_last_reference_in_page = (
is_reference_on_same_page_as_url and
(next_ref_page_number > url_page_number) and
is_url_under_texkey
is_reference_on_same_page_as_url
and (next_ref_page_number > url_page_number)
and is_url_under_texkey
)
is_last_reference_in_page_two_col_layout = (
is_reference_on_same_page_as_url and
is_next_reference_on_the_same_page and
is_url_under_texkey and
(next_ref_col > url_col) and
next_ref_y < url_y
is_reference_on_same_page_as_url
and is_next_reference_on_the_same_page
and is_url_under_texkey
and (next_ref_col > url_col)
and next_ref_y < url_y
and ref_y <= url_y
and (is_url_in_same_col or is_url_in_next_col)
)
is_in_new_column = (
is_reference_on_same_page_as_url and
is_next_reference_on_the_same_page and
ref_y > url_y and
(next_ref_col > ref_column) and
(next_ref_y > url_y)
is_reference_on_same_page_as_url
and is_next_reference_on_the_same_page
and ref_y > url_y
and (next_ref_col > ref_column)
and (next_ref_y > url_y)
)
is_url_for_other_reference_in_new_column = (
is_reference_on_same_page_as_url and
(next_ref_page_number == url_page_number) and
(next_ref_col == ref_column < url_col) and
(next_ref_y > url_y)
is_reference_on_same_page_as_url
and (next_ref_page_number == url_page_number)
and (next_ref_col == ref_column < url_col)
and (next_ref_y > url_y)
)
is_url_unrelated_to_references = ref_page_number > url_page_number
is_url_for_next_reference = url_y >= next_ref_y
Expand Down
2 changes: 1 addition & 1 deletion run-tests.sh
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,5 +23,5 @@

set -e

flake8 refextract tests
flake8 refextract tests --ignore W503
py.test tests
Binary file added tests/data/2303.03819.pdf
Binary file not shown.
34 changes: 33 additions & 1 deletion tests/test_pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -795,6 +795,38 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files):

assert two_col_2 == expected_two_col_keys_2

two_col_with_one_url_only = extract_texkeys_and_urls_from_pdf(pdf_files[6])
expected_two_col_with_one_url_only = [
{"texkey": "Hees-Rapp"},
{"texkey": "He:2022ywp", "urls": {"http://arxiv.org/abs/2204.09299"}},
{"texkey": "Das-Alam-Mohanty"},
{"texkey": "Svetitsky:1987gq"},
{"texkey": "Tsallis"},
{"texkey": "Marques-Cleymans-Deppman-2015"},
{"texkey": "Marques-Andrade-Deppman-2013"},
{"texkey": "WilkWlodarkzyk-multiparticle"},
{"texkey": "TsallisBook"},
{"texkey": "PLASTINO1995347"},
{"texkey": "Muskat"},
{"texkey": "Schwammle"},
{"texkey": "Schwammle2009"},
{"texkey": "WaltonRafelski"},
{"texkey": "Wong:2015mba"},
{"texkey": "Deppman:2019yno"},
{"texkey": "PasechnikSumbera"},
{"texkey": "Adolfsson:2020dhm"},
{"texkey": "Qin:2015srf"},
{"texkey": "Apolinario:2015bfm"},
{"texkey": "Casalderrey-Solana:2018wrw"},
{"texkey": "CORADDU2003473"},
{"texkey": "Curilef"},
{"texkey": "Annala:2019puf"},
{"texkey": "Annala:2020rgx"},
{"texkey": "Cardoso2017"},
{"texkey": "Sen:2021tdu"},
]
assert two_col_with_one_url_only == expected_two_col_with_one_url_only


def test_extract_texkeys_and_urls_from_pdf_no_crash_on_incomplete_dest_coordinates(
pdf_files,
Expand All @@ -814,6 +846,6 @@ def test_extract_texkeys_from_pdf_no_crash_on_pydpf2_error(pdf_files):

def test_extract_texkeys_from_pdf_no_crash_on_other_exceptions(pdf_files):
expected = []
result = extract_texkeys_and_urls_from_pdf(pdf_files[6])
result = extract_texkeys_and_urls_from_pdf(pdf_files[7])

assert result == expected

0 comments on commit de987f5

Please sign in to comment.