Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

articles: fix PDF reading from stream #237

Merged
merged 1 commit into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions scoap3/articles/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,14 +163,15 @@ def check_contains_funded_by_scoap3(article):
  return False, "No files found for the given article."

  for article_file in article_files:
- try:
- if is_string_in_pdf(article_file, "Funded by SCOAP3"):
- return (
- True,
- f"Files contain the required text: 'Funded by SCOAP3'. File: {article_file.file.path}",
- )
- except FileNotFoundError:
- return False, f"File not found: {article_file.file.path}"
+ if article_file.filetype in ["pdf", "pdf/a"]:
+ try:
+ if is_string_in_pdf(article_file, "Funded by SCOAP3"):
+ return (
+ True,
+ f"Files contain the required text: 'Funded by SCOAP3'. File: {article_file.file.url}",
+ )
+ except FileNotFoundError:
+ return False, f"File not found: {article_file.file.url}"

  return False, "Files do not contain the required text: 'Funded by SCOAP3'"
  except Exception as e:
Expand Down
8 changes: 6 additions & 2 deletions scoap3/articles/tests/test_funded_by_scoap3.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ def _create_pdf_with_text(text):

  @pytest.fixture
  def attach_file_to_article(db):
- def _attach_file_to_article(article, content, file_name):
+ def _attach_file_to_article(article, content, file_name, filetype="pdf"):
  file = File(content, name=file_name)
- ArticleFile.objects.create(article_id=article, file=file)
+ ArticleFile.objects.create(article_id=article, file=file, filetype=filetype)

  return _attach_file_to_article

Expand Down Expand Up @@ -111,8 +111,12 @@ def test_mixed_files(

  file_with_text = create_pdf_with_text("Funded by SCOAP3")
  file_without_text = create_pdf_with_text("Other text")
+ file_without_text_xml = create_pdf_with_text("<>Other textXML</>")
  attach_file_to_article(article, file_with_text, "file_with_text.pdf")
  attach_file_to_article(article, file_without_text, "file_without_text.pdf")
+ attach_file_to_article(
+ article, file_without_text_xml, "file_without_text.xml", filetype="xml"
+ )

  result, message = check_contains_funded_by_scoap3(article)
  assert result is True
Expand Down
30 changes: 17 additions & 13 deletions scoap3/articles/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,23 @@ def parse_string_to_date_object(date_string):

  def is_string_in_pdf(article_file, search_string):
  try:
- pdf_file = article_file.file.read()
- document = fitz.open(stream=pdf_file)
- search_string_lower = search_string.lower()
-
- for page_num in range(document.page_count):
- page = document[page_num]
- page_text = page.get_text().lower()
- if search_string_lower in page_text:
- document.close()
- return True
-
- document.close()
- return False
+ with article_file.file.open(mode="rb") as _file:
+ file_content = _file.read()
+
+ if article_file.file.name.endswith(".pdf"):
+ document = fitz.open(stream=file_content, filetype="pdf")
+ else:
+ document = fitz.open(stream=file_content, filetype="txt")
+ search_string_lower = search_string.lower()
+ for page_num in range(document.page_count):
+ page = document[page_num]
+ page_text = page.get_text().lower()
+ if search_string_lower in page_text:
+ document.close()
+ return True
+
+ document.close()
+ return False
  except FileNotFoundError:
  raise FileNotFoundError(f"File not found: {article_file}")
  except Exception as e:
Expand Down
Loading