From fb6919e171a89786f7ceb06e22d2109e947713de Mon Sep 17 00:00:00 2001 From: MJedr Date: Fri, 4 Aug 2023 15:40:38 +0200 Subject: [PATCH] fix extract_texkeys_and_urls_from_pdf --- refextract/references/pdf.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/refextract/references/pdf.py b/refextract/references/pdf.py index 73580d2..92a95af 100644 --- a/refextract/references/pdf.py +++ b/refextract/references/pdf.py @@ -24,6 +24,7 @@ import logging from PyPDF2 import PdfFileReader +from PyPDF2.generic import ByteStringObject from .regexs import re_reference_in_dest @@ -57,9 +58,12 @@ def extract_texkeys_and_urls_from_pdf(pdf_file): LOGGER.debug(u"PDF: Internal PyPDF2 error, no TeXkeys returned.") return [] # not all named destinations point to references - refs = [ - dest for dest in destinations.items() if re_reference_in_dest.match(dest[0]) - ] + refs = [] + for destination in destinations.items(): + destination_key = destination[0].decode("utf-8") if isinstance(destination[0], ByteStringObject) else destination[0] + match = re_reference_in_dest.match(destination_key) + if match: + refs.append(destination) try: if _destinations_in_two_columns(pdf, refs): LOGGER.debug(u"PDF: Using two-column layout")