diff --git a/refextract/references/pdf.py b/refextract/references/pdf.py index 92a95af..6620554 100644 --- a/refextract/references/pdf.py +++ b/refextract/references/pdf.py @@ -55,24 +55,28 @@ def extract_texkeys_and_urls_from_pdf(pdf_file): destinations = pdf.getNamedDestinations() urls = extract_urls(pdf) except Exception: - LOGGER.debug(u"PDF: Internal PyPDF2 error, no TeXkeys returned.") + LOGGER.debug("PDF: Internal PyPDF2 error, no TeXkeys returned.") return [] # not all named destinations point to references refs = [] for destination in destinations.items(): - destination_key = destination[0].decode("utf-8") if isinstance(destination[0], ByteStringObject) else destination[0] + destination_key = ( + destination[0].decode("utf-8") + if isinstance(destination[0], ByteStringObject) + else destination[0] + ) match = re_reference_in_dest.match(destination_key) if match: refs.append(destination) try: if _destinations_in_two_columns(pdf, refs): - LOGGER.debug(u"PDF: Using two-column layout") + LOGGER.debug("PDF: Using two-column layout") def sortfunc(dest_couple): return dest_couple[1] else: - LOGGER.debug(u"PDF: Using single-column layout") + LOGGER.debug("PDF: Using single-column layout") def sortfunc(dest_couple): page, _, ypos, xpos = dest_couple[1] @@ -102,11 +106,13 @@ def sortfunc(dest_couple): texkey_url_list.append(current_texkey_urls_dict) return texkey_url_list except Exception: - LOGGER.debug(u"PDF: Impossible to determine layout, no TeXkeys returned") + LOGGER.debug("PDF: Impossible to determine layout, no TeXkeys returned") return [] -def _match_urls_with_reference(urls_to_match, reference, next_reference=None): +def _match_urls_with_reference( + urls_to_match, reference, next_reference=None, previous_reference=None +): ref_page_number, ref_column, ref_y, _ = reference[1] if next_reference: next_ref_page_number, next_ref_col, next_ref_y, _ = next_reference[1] @@ -114,12 +120,17 @@ def _match_urls_with_reference(urls_to_match, reference, next_reference=None): for (url_index, url) in enumerate(urls_to_match): url_page_number, url_col, url_y, _ = url[1] is_url_under_texkey = ref_y <= url_y + is_url_in_same_col = ref_column == url_col + is_url_in_next_col = url_col > ref_column is_reference_on_same_page_as_url = ref_page_number == url_page_number is_reference_on_previous_page_than_url = ref_page_number + 1 == url_page_number if not next_reference: if ( - is_reference_on_same_page_as_url or - is_reference_on_previous_page_than_url + ( + is_reference_on_same_page_as_url + and (is_url_in_same_col or is_url_in_next_col) + ) + or is_reference_on_previous_page_than_url ) and is_url_under_texkey: urls_for_reference.add(url[0]) continue @@ -128,29 +139,31 @@ def _match_urls_with_reference(urls_to_match, reference, next_reference=None): ) and (ref_y <= url_y <= next_ref_y) is_next_reference_on_the_same_page = next_ref_page_number == url_page_number is_last_reference_in_page = ( - is_reference_on_same_page_as_url and - (next_ref_page_number > url_page_number) and - is_url_under_texkey + is_reference_on_same_page_as_url + and (next_ref_page_number > url_page_number) + and is_url_under_texkey ) is_last_reference_in_page_two_col_layout = ( - is_reference_on_same_page_as_url and - is_next_reference_on_the_same_page and - is_url_under_texkey and - (next_ref_col > url_col) and - next_ref_y < url_y + is_reference_on_same_page_as_url + and is_next_reference_on_the_same_page + and is_url_under_texkey + and (next_ref_col > url_col) + and next_ref_y < url_y + and ref_y <= url_y + and (is_url_in_same_col or is_url_in_next_col) ) is_in_new_column = ( - is_reference_on_same_page_as_url and - is_next_reference_on_the_same_page and - ref_y > url_y and - (next_ref_col > ref_column) and - (next_ref_y > url_y) + is_reference_on_same_page_as_url + and is_next_reference_on_the_same_page + and ref_y > url_y + and (next_ref_col > ref_column) + and (next_ref_y > url_y) ) is_url_for_other_reference_in_new_column = ( - is_reference_on_same_page_as_url and - (next_ref_page_number == url_page_number) and - (next_ref_col == ref_column < url_col) and - (next_ref_y > url_y) + is_reference_on_same_page_as_url + and (next_ref_page_number == url_page_number) + and (next_ref_col == ref_column < url_col) + and (next_ref_y > url_y) ) is_url_unrelated_to_references = ref_page_number > url_page_number is_url_for_next_reference = url_y >= next_ref_y diff --git a/run-tests.sh b/run-tests.sh index 5d27abd..670e6d4 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -23,5 +23,5 @@ set -e -flake8 refextract tests +flake8 refextract tests --ignore W503 py.test tests diff --git a/tests/data/2303.03819.pdf b/tests/data/2303.03819.pdf new file mode 100644 index 0000000..e99f93f Binary files /dev/null and b/tests/data/2303.03819.pdf differ diff --git a/tests/test_pdf.py b/tests/test_pdf.py index 1876b3d..8ffd2cc 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -795,6 +795,38 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files): assert two_col_2 == expected_two_col_keys_2 + two_col_with_one_url_only = extract_texkeys_and_urls_from_pdf(pdf_files[6]) + expected_two_col_with_one_url_only = [ + {"texkey": "Hees-Rapp"}, + {"texkey": "He:2022ywp", "urls": {"http://arxiv.org/abs/2204.09299"}}, + {"texkey": "Das-Alam-Mohanty"}, + {"texkey": "Svetitsky:1987gq"}, + {"texkey": "Tsallis"}, + {"texkey": "Marques-Cleymans-Deppman-2015"}, + {"texkey": "Marques-Andrade-Deppman-2013"}, + {"texkey": "WilkWlodarkzyk-multiparticle"}, + {"texkey": "TsallisBook"}, + {"texkey": "PLASTINO1995347"}, + {"texkey": "Muskat"}, + {"texkey": "Schwammle"}, + {"texkey": "Schwammle2009"}, + {"texkey": "WaltonRafelski"}, + {"texkey": "Wong:2015mba"}, + {"texkey": "Deppman:2019yno"}, + {"texkey": "PasechnikSumbera"}, + {"texkey": "Adolfsson:2020dhm"}, + {"texkey": "Qin:2015srf"}, + {"texkey": "Apolinario:2015bfm"}, + {"texkey": "Casalderrey-Solana:2018wrw"}, + {"texkey": "CORADDU2003473"}, + {"texkey": "Curilef"}, + {"texkey": "Annala:2019puf"}, + {"texkey": "Annala:2020rgx"}, + {"texkey": "Cardoso2017"}, + {"texkey": "Sen:2021tdu"}, + ] + assert two_col_with_one_url_only == expected_two_col_with_one_url_only + def test_extract_texkeys_and_urls_from_pdf_no_crash_on_incomplete_dest_coordinates( pdf_files, @@ -814,6 +846,6 @@ def test_extract_texkeys_from_pdf_no_crash_on_pydpf2_error(pdf_files): def test_extract_texkeys_from_pdf_no_crash_on_other_exceptions(pdf_files): expected = [] - result = extract_texkeys_and_urls_from_pdf(pdf_files[6]) + result = extract_texkeys_and_urls_from_pdf(pdf_files[7]) assert result == expected