From 98c46e06c57649971d53c68828e086a7c6c6b73c Mon Sep 17 00:00:00 2001 From: pamfilos Date: Thu, 31 Oct 2024 14:44:41 +0100 Subject: [PATCH] articles: fix PDF reading from stream Signed-off-by: pamfilos --- scoap3/articles/tasks.py | 17 ++++++----- .../articles/tests/test_funded_by_scoap3.py | 8 +++-- scoap3/articles/util.py | 30 +++++++++++-------- 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/scoap3/articles/tasks.py b/scoap3/articles/tasks.py index 097011acf..917669474 100644 --- a/scoap3/articles/tasks.py +++ b/scoap3/articles/tasks.py @@ -163,14 +163,15 @@ def check_contains_funded_by_scoap3(article): return False, "No files found for the given article." for article_file in article_files: - try: - if is_string_in_pdf(article_file, "Funded by SCOAP3"): - return ( - True, - f"Files contain the required text: 'Funded by SCOAP3'. File: {article_file.file.path}", - ) - except FileNotFoundError: - return False, f"File not found: {article_file.file.path}" + if article_file.filetype in ["pdf", "pdf/a"]: + try: + if is_string_in_pdf(article_file, "Funded by SCOAP3"): + return ( + True, + f"Files contain the required text: 'Funded by SCOAP3'. File: {article_file.file.url}", + ) + except FileNotFoundError: + return False, f"File not found: {article_file.file.url}" return False, "Files do not contain the required text: 'Funded by SCOAP3'" except Exception as e: diff --git a/scoap3/articles/tests/test_funded_by_scoap3.py b/scoap3/articles/tests/test_funded_by_scoap3.py index 8a8cceb97..61abc2ef1 100644 --- a/scoap3/articles/tests/test_funded_by_scoap3.py +++ b/scoap3/articles/tests/test_funded_by_scoap3.py @@ -40,9 +40,9 @@ def _create_pdf_with_text(text): @pytest.fixture def attach_file_to_article(db): - def _attach_file_to_article(article, content, file_name): + def _attach_file_to_article(article, content, file_name, filetype="pdf"): file = File(content, name=file_name) - ArticleFile.objects.create(article_id=article, file=file) + ArticleFile.objects.create(article_id=article, file=file, filetype=filetype) return _attach_file_to_article @@ -111,8 +111,12 @@ def test_mixed_files( file_with_text = create_pdf_with_text("Funded by SCOAP3") file_without_text = create_pdf_with_text("Other text") + file_without_text_xml = create_pdf_with_text("<>Other textXML") attach_file_to_article(article, file_with_text, "file_with_text.pdf") attach_file_to_article(article, file_without_text, "file_without_text.pdf") + attach_file_to_article( + article, file_without_text_xml, "file_without_text.xml", filetype="xml" + ) result, message = check_contains_funded_by_scoap3(article) assert result is True diff --git a/scoap3/articles/util.py b/scoap3/articles/util.py index 703e57384..be687b92e 100644 --- a/scoap3/articles/util.py +++ b/scoap3/articles/util.py @@ -31,19 +31,23 @@ def parse_string_to_date_object(date_string): def is_string_in_pdf(article_file, search_string): try: - pdf_file = article_file.file.read() - document = fitz.open(stream=pdf_file) - search_string_lower = search_string.lower() - - for page_num in range(document.page_count): - page = document[page_num] - page_text = page.get_text().lower() - if search_string_lower in page_text: - document.close() - return True - - document.close() - return False + with article_file.file.open(mode="rb") as _file: + file_content = _file.read() + + if article_file.file.name.endswith(".pdf"): + document = fitz.open(stream=file_content, filetype="pdf") + else: + document = fitz.open(stream=file_content, filetype="txt") + search_string_lower = search_string.lower() + for page_num in range(document.page_count): + page = document[page_num] + page_text = page.get_text().lower() + if search_string_lower in page_text: + document.close() + return True + + document.close() + return False except FileNotFoundError: raise FileNotFoundError(f"File not found: {article_file}") except Exception as e: