diff --git a/refextract/references/pdf.py b/refextract/references/pdf.py index 92a95af..a30992c 100644 --- a/refextract/references/pdf.py +++ b/refextract/references/pdf.py @@ -55,24 +55,30 @@ def extract_texkeys_and_urls_from_pdf(pdf_file): destinations = pdf.getNamedDestinations() urls = extract_urls(pdf) except Exception: - LOGGER.debug(u"PDF: Internal PyPDF2 error, no TeXkeys returned.") + LOGGER.debug("PDF: Internal PyPDF2 error, no TeXkeys returned.") return [] # not all named destinations point to references refs = [] for destination in destinations.items(): - destination_key = destination[0].decode("utf-8") if isinstance(destination[0], ByteStringObject) else destination[0] + destination_key = ( + destination[0].decode("utf-8") + if isinstance(destination[0], ByteStringObject) + else destination[0] + ) match = re_reference_in_dest.match(destination_key) if match: refs.append(destination) + two_column_layout = False try: if _destinations_in_two_columns(pdf, refs): - LOGGER.debug(u"PDF: Using two-column layout") + two_column_layout = True + LOGGER.debug("PDF: Using two-column layout") def sortfunc(dest_couple): return dest_couple[1] else: - LOGGER.debug(u"PDF: Using single-column layout") + LOGGER.debug("PDF: Using single-column layout") def sortfunc(dest_couple): page, _, ypos, xpos = dest_couple[1] @@ -91,22 +97,24 @@ def sortfunc(dest_couple): if nb < len(refs) - 1: next_reference_data = refs[nb + 1] matched_urls_for_reference, urls = _match_urls_with_reference( - urls, ref, next_reference_data + urls, ref, next_reference_data, two_column_layout=two_column_layout ) else: matched_urls_for_reference, urls = _match_urls_with_reference( - urls, ref + urls, ref, two_column_layout=two_column_layout ) if matched_urls_for_reference: current_texkey_urls_dict["urls"] = matched_urls_for_reference texkey_url_list.append(current_texkey_urls_dict) return texkey_url_list except Exception: - LOGGER.debug(u"PDF: Impossible to determine layout, no TeXkeys returned") + LOGGER.debug("PDF: Impossible to determine layout, no TeXkeys returned") return [] -def _match_urls_with_reference(urls_to_match, reference, next_reference=None): +def _match_urls_with_reference( + urls_to_match, reference, next_reference=None, two_column_layout=False +): ref_page_number, ref_column, ref_y, _ = reference[1] if next_reference: next_ref_page_number, next_ref_col, next_ref_y, _ = next_reference[1] @@ -114,11 +122,16 @@ def _match_urls_with_reference(urls_to_match, reference, next_reference=None): for (url_index, url) in enumerate(urls_to_match): url_page_number, url_col, url_y, _ = url[1] is_url_under_texkey = ref_y <= url_y + is_url_in_same_col = ref_column == url_col + is_url_in_next_col = url_col > ref_column is_reference_on_same_page_as_url = ref_page_number == url_page_number is_reference_on_previous_page_than_url = ref_page_number + 1 == url_page_number if not next_reference: if ( - is_reference_on_same_page_as_url or + ( + is_reference_on_same_page_as_url and + (is_url_in_same_col or is_url_in_next_col) + ) or is_reference_on_previous_page_than_url ) and is_url_under_texkey: urls_for_reference.add(url[0]) @@ -137,7 +150,9 @@ def _match_urls_with_reference(urls_to_match, reference, next_reference=None): is_next_reference_on_the_same_page and is_url_under_texkey and (next_ref_col > url_col) and - next_ref_y < url_y + next_ref_y < url_y and + ref_y <= url_y and + (is_url_in_same_col or is_url_in_next_col) ) is_in_new_column = ( is_reference_on_same_page_as_url and @@ -155,8 +170,9 @@ def _match_urls_with_reference(urls_to_match, reference, next_reference=None): is_url_unrelated_to_references = ref_page_number > url_page_number is_url_for_next_reference = url_y >= next_ref_y if is_url_between_texkeys: - urls_for_reference.add(url[0]) - continue + if not two_column_layout or (two_column_layout and url_col == ref_column): + urls_for_reference.add(url[0]) + continue elif is_last_reference_in_page or is_last_reference_in_page_two_col_layout: urls_for_reference.add(url[0]) continue diff --git a/tests/data/2301.05883.pdf b/tests/data/2301.05883.pdf new file mode 100644 index 0000000..fc3cfcf Binary files /dev/null and b/tests/data/2301.05883.pdf differ diff --git a/tests/data/2303.03819.pdf b/tests/data/2303.03819.pdf new file mode 100644 index 0000000..e99f93f Binary files /dev/null and b/tests/data/2303.03819.pdf differ diff --git a/tests/data/2304.10117.pdf b/tests/data/2304.10117.pdf new file mode 100644 index 0000000..2acf488 Binary files /dev/null and b/tests/data/2304.10117.pdf differ diff --git a/tests/data/packed_pdf.pdf b/tests/data/packed_pdf.pdf index 39ef755..d96f6db 100644 Binary files a/tests/data/packed_pdf.pdf and b/tests/data/packed_pdf.pdf differ diff --git a/tests/test_api.py b/tests/test_api.py index 05b940a..5f5c17e 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -162,7 +162,7 @@ def test_extract_references_from_url(pdf_files): def test_long_registrant_dois(pdf_files): """ DOIs with 5 digit registrant code """ - r = extract_references_from_file(pdf_files[8]) + r = extract_references_from_file(pdf_files[11]) assert len(r) == 6 for ref in r[1:]: assert 'doi' in ref diff --git a/tests/test_engine.py b/tests/test_engine.py index 679e5ec..0eeb5d4 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -438,7 +438,7 @@ def test_reference_split_handles_semicolon(): def test_clean_pdf_before_run(tmp_path, pdf_files): tmp_file_path = tmp_path / "packed.pdf" - pdf = pdf_files[7] + pdf = pdf_files[10] with open(pdf, 'rb') as input, open(tmp_file_path, 'wb') as tmp_out: tmp_out.write(input.read()) diff --git a/tests/test_pdf.py b/tests/test_pdf.py index 1876b3d..61f6b01 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -795,6 +795,115 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files): assert two_col_2 == expected_two_col_keys_2 + two_col_with_one_url_only = extract_texkeys_and_urls_from_pdf(pdf_files[6]) + expected_two_col_with_one_url_only = [ + {"texkey": "Gr20"}, + {"texkey": "Au18"}, + {"texkey": "Ue35"}, + {"texkey": "SM88"}, + {"texkey": "Sh00"}, + {"texkey": "Jaku21"}, + {"texkey": "T60"}, + {"texkey": "MT00"}, + {"texkey": "Jaku21b", "urls": {"http://arxiv.org/abs/2102.08069"}}, + {"texkey": "Mo64"}, + {"texkey": "Ko21"}, + {"texkey": "Lan"}, + {"texkey": "Sch55"}, + {"texkey": "Le56"}, + {"texkey": "FR74"}, + {"texkey": "BS19"}, + {"texkey": "Va00"}, + {"texkey": "T61"}, + {"texkey": "BD64"}, + {"texkey": "Ye61"}, + {"texkey": "Kl77"}, + {"texkey": "VJ"}, + {"texkey": "Sal"}, + {"texkey": "YRW"}, + {"texkey": "Ma69"}, + {"texkey": "MG64"}, + {"texkey": "We65"}, + {"texkey": "Lo58"}, + {"texkey": "Reu82"}, + {"texkey": "Off91"}, + {"texkey": "Jaku22"}, + {"texkey": "Fr72"}, + {"texkey": "Jo62"}, + ] + assert two_col_with_one_url_only == expected_two_col_with_one_url_only + + two_col_with_one_url_only_1 = extract_texkeys_and_urls_from_pdf(pdf_files[7]) + expected_two_col_with_one_url_only_1 = [ + {"texkey": "Hees-Rapp"}, + {"texkey": "He:2022ywp", "urls": {"http://arxiv.org/abs/2204.09299"}}, + {"texkey": "Das-Alam-Mohanty"}, + {"texkey": "Svetitsky:1987gq"}, + {"texkey": "Tsallis"}, + {"texkey": "Marques-Cleymans-Deppman-2015"}, + {"texkey": "Marques-Andrade-Deppman-2013"}, + {"texkey": "WilkWlodarkzyk-multiparticle"}, + {"texkey": "TsallisBook"}, + {"texkey": "PLASTINO1995347"}, + {"texkey": "Muskat"}, + {"texkey": "Schwammle"}, + {"texkey": "Schwammle2009"}, + {"texkey": "WaltonRafelski"}, + {"texkey": "Wong:2015mba"}, + {"texkey": "Deppman:2019yno"}, + {"texkey": "PasechnikSumbera"}, + {"texkey": "Adolfsson:2020dhm"}, + {"texkey": "Qin:2015srf"}, + {"texkey": "Apolinario:2015bfm"}, + {"texkey": "Casalderrey-Solana:2018wrw"}, + {"texkey": "CORADDU2003473"}, + {"texkey": "Curilef"}, + {"texkey": "Annala:2019puf"}, + {"texkey": "Annala:2020rgx"}, + {"texkey": "Cardoso2017"}, + {"texkey": "Sen:2021tdu"}, + ] + assert two_col_with_one_url_only_1 == expected_two_col_with_one_url_only_1 + + two_col_with_one_url_only_2 = extract_texkeys_and_urls_from_pdf(pdf_files[8]) + two_col_with_one_url_only_2_expected = [ + {"texkey": "Penrose:1964wq"}, + {"texkey": "Penrose:1969pc"}, + {"texkey": "Hawking:1976ra"}, + {"texkey": "wald2001thermodynamics"}, + {"texkey": "abbott2016observation"}, + {"texkey": "isi2021testing"}, + {"texkey": "isi2019testing"}, + {"texkey": "PhysRevD.7.2333"}, + {"texkey": "PhysRevLett.30.71"}, + {"texkey": "bardeen1973four"}, + {"texkey": "wald1994quantum"}, + {"texkey": "abbott2021gwtc", "urls": {"http://arxiv.org/abs/2111.03606"}}, + {"texkey": "cabero2018observational"}, + {"texkey": "samples2"}, + {"texkey": "scientific2016tests"}, + {"texkey": "lalsimulation"}, + {"texkey": "abbott2021gwtc1"}, + {"texkey": "samples1"}, + {"texkey": "schutz1999gravitational"}, + {"texkey": "martynov2016sensitivity"}, + {"texkey": "bersanetti2021advanced"}, + {"texkey": "kagra2019kagra"}, + {"texkey": "shaddock2008space"}, + {"texkey": "hu2017taiji"}, + {"texkey": "ruan2020taiji"}, + {"texkey": "luo2020brief"}, + {"texkey": "luo2016tianqin"}, + {"texkey": "Gong:2021gvw"}, + {"texkey": "amaro2018relativistic"}, + {"texkey": "berry2013observing"}, + {"texkey": "babak2017science"}, + {"texkey": "berry2013expectations"}, + {"texkey": "gwosc"}, + ] + + assert two_col_with_one_url_only_2 == two_col_with_one_url_only_2_expected + def test_extract_texkeys_and_urls_from_pdf_no_crash_on_incomplete_dest_coordinates( pdf_files, @@ -814,6 +923,6 @@ def test_extract_texkeys_from_pdf_no_crash_on_pydpf2_error(pdf_files): def test_extract_texkeys_from_pdf_no_crash_on_other_exceptions(pdf_files): expected = [] - result = extract_texkeys_and_urls_from_pdf(pdf_files[6]) + result = extract_texkeys_and_urls_from_pdf(pdf_files[9]) assert result == expected