Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#318 Added article compliance check for Funded by SCOAP3 #214

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
335 changes: 96 additions & 239 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ django-lifecycle = "^1.1.2"
country-converter = "^1.2"
numpy = "<2"
django-select2 = "^8.2.1"
pymupdf = "^1.24.10"

[tool.poetry.dev-dependencies]
Werkzeug = {extras = ["watchdog"], version = "^2.3.4"}
Expand Down
13 changes: 13 additions & 0 deletions scoap3/articles/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class ComplianceReportAdmin(admin.ModelAdmin):
"check_article_type",
"check_doi_registration_time",
"check_authors_affiliation",
"check_contains_funded_by_scoap3",
"get_is_compliant",
"report_date",
]
Expand All @@ -53,6 +54,8 @@ class ComplianceReportAdmin(admin.ModelAdmin):
"check_doi_registration_time_description",
"check_authors_affiliation",
"check_authors_affiliation_description",
"check_contains_funded_by_scoap3",
"check_contains_funded_by_scoap3_description",
]
readonly_fields = [
"article",
Expand All @@ -70,6 +73,8 @@ class ComplianceReportAdmin(admin.ModelAdmin):
"check_doi_registration_time_description",
"check_authors_affiliation",
"check_authors_affiliation_description",
"check_contains_funded_by_scoap3",
"check_contains_funded_by_scoap3_description",
]

list_filter = [
Expand All @@ -84,6 +89,7 @@ class ComplianceReportAdmin(admin.ModelAdmin):
"article_id__report__check_article_type",
"article_id__report__check_doi_registration_time",
"article_id__report__check_authors_affiliation",
"article_id__report__check_contains_funded_by_scoap3",
]

actions = ["export_as_csv"]
Expand Down Expand Up @@ -125,6 +131,7 @@ def export_as_csv(self, request, queryset):
"Check DOI Registration": "check_doi_registration_time_description",
}


response = HttpResponse(content_type="text/csv")
response["Content-Disposition"] = f"attachment; filename={filename}"
writer = csv.writer(response)
Expand Down Expand Up @@ -164,6 +171,8 @@ class ArticleComplianceReportInline(admin.StackedInline):
"check_doi_registration_time_description",
"check_authors_affiliation",
"check_authors_affiliation_description",
"check_contains_funded_by_scoap3",
"check_contains_funded_by_scoap3_description",
]
can_delete = False
can_create = False
Expand All @@ -186,6 +195,10 @@ class ArticleComplianceReportInline(admin.StackedInline):
"check_authors_affiliation",
"check_authors_affiliation_description",
),
(
"check_contains_funded_by_scoap3",
"check_contains_funded_by_scoap3_description",
),
]
},
),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 4.2.5 on 2024-09-17 11:31

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the "Funded by SCOAP3" check columns to the compliance report.

    Two fields are added to ``compliancereport``: a boolean flag for the
    check outcome and a free-text description of that outcome.
    """

    # Builds on the previous ``articles`` migration (0014).
    dependencies = [
        (
            "articles",
            "0014_compliancereport_check_authors_affiliation_and_more",
        ),
    ]

    operations = [
        # Outcome flag; rows that predate this migration get False.
        migrations.AddField(
            "compliancereport",
            "check_contains_funded_by_scoap3",
            models.BooleanField(default=False),
        ),
        # Optional explanation of the outcome; empty string by default.
        migrations.AddField(
            "compliancereport",
            "check_contains_funded_by_scoap3_description",
            models.TextField(blank=True, default=""),
        ),
    ]
4 changes: 4 additions & 0 deletions scoap3/articles/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ class ComplianceReport(models.Model):
check_doi_registration_time_description = models.TextField(blank=True, default="")
check_authors_affiliation = models.BooleanField(default=False)
check_authors_affiliation_description = models.TextField(blank=True, default="")
check_contains_funded_by_scoap3 = models.BooleanField(default=False)
check_contains_funded_by_scoap3_description = models.TextField(
blank=True, default=""
)

def __str__(self):
return f"Compliance Report for {self.article.title} on {self.report_date.strftime('%Y-%m-%d')}"
Expand Down
32 changes: 31 additions & 1 deletion scoap3/articles/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from django.core.paginator import Paginator
from django_opensearch_dsl.registries import registry

from scoap3.articles.models import Article, ComplianceReport
from scoap3.articles.models import Article, ArticleFile, ComplianceReport
from scoap3.articles.util import is_string_in_pdf
from scoap3.authors.models import Author
from scoap3.misc.models import Affiliation
from scoap3.misc.utils import fetch_doi_registration_date
Expand Down Expand Up @@ -133,6 +134,29 @@ def check_authors_affiliation(article):
return True, "Authors' affiliations are compliant"


def check_contains_funded_by_scoap3(article):
    """Check whether any file attached to ``article`` mentions SCOAP3 funding.

    Every ``ArticleFile`` linked to the article is scanned with
    ``is_string_in_pdf`` for the phrase "Funded by SCOAP3". The first file
    that contains the phrase makes the article compliant.

    Args:
        article: The article whose files are inspected; it is used as the
            ``article_id`` filter on ``ArticleFile``.

    Returns:
        A ``(is_compliant, description)`` tuple. ``is_compliant`` is True
        only when at least one file contains the phrase; ``description`` is
        a human-readable explanation of the outcome. This function never
        raises: unexpected errors are reported through the description.
    """
    required_text = "Funded by SCOAP3"
    try:
        article_files = ArticleFile.objects.filter(article_id=article)

        if not article_files.exists():
            return False, "No files found for the given article."

        missing_paths = []
        for article_file in article_files:
            file_path = article_file.file.path
            try:
                found = is_string_in_pdf(file_path, required_text)
            except FileNotFoundError:
                # A missing file must not hide a compliant sibling: remember
                # it and keep scanning the remaining files.
                missing_paths.append(file_path)
                continue
            if found:
                return (
                    True,
                    f"Files contain the required text: '{required_text}'. File: {file_path}",
                )

        if missing_paths:
            # Nothing matched and at least one file could not be read, so the
            # result is inconclusive rather than a clean "not found".
            return False, f"File not found: {', '.join(missing_paths)}"

        return False, f"Files do not contain the required text: '{required_text}'"
    except Exception as e:
        # Deliberate best-effort boundary: the outcome is stored in the
        # compliance report, so failures are surfaced as text, not raised.
        return False, f"An unexpected error occurred: {str(e)}"


@shared_task(name="compliance_checks", acks_late=True)
def compliance_checks(article_id):
try:
Expand Down Expand Up @@ -160,6 +184,10 @@ def compliance_checks(article_id):
check_affiliations_compliance,
check_affiliations_description,
) = check_authors_affiliation(article)
(
check_funded_by_scoap3_compliance,
check_funded_by_scoap3_description,
) = check_contains_funded_by_scoap3(article)

article.report.all().delete()

Expand All @@ -177,6 +205,8 @@ def compliance_checks(article_id):
check_license_description=check_license_description,
check_authors_affiliation=check_affiliations_compliance,
check_authors_affiliation_description=check_affiliations_description,
check_contains_funded_by_scoap3=check_funded_by_scoap3_compliance,
check_contains_funded_by_scoap3_description=check_funded_by_scoap3_description,
)
report.save()
logger.info("Compliance checks completed for article %s", article_id)
Expand Down
126 changes: 126 additions & 0 deletions scoap3/articles/tests/test_funded_by_scoap3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from io import BytesIO

import fitz # PyMuPDF
import pytest
from django.core.files import File
from django.core.files.base import ContentFile

from scoap3.articles.models import Article, ArticleFile
from scoap3.articles.tasks import check_contains_funded_by_scoap3

# The fixtures in this module create Article / ArticleFile rows, so database
# access is enabled for every test in the file.
pytestmark = pytest.mark.django_db


@pytest.fixture
def create_article(db):
    """Provide a factory that inserts a minimal ``Article`` and returns it."""

    def _make():
        fields = {
            "title": "Test Article",
            "subtitle": "Subtitle",
            "abstract": "Abstract",
        }
        return Article.objects.create(**fields)

    return _make


@pytest.fixture
def create_pdf_with_text():
    """Provide a factory that renders ``text`` into a one-page in-memory PDF.

    The factory returns a Django ``ContentFile`` named ``test_file.pdf``
    holding the serialized PDF bytes.
    """

    def _build(text):
        buffer = BytesIO()
        pdf = fitz.open()
        first_page = pdf.new_page()
        first_page.insert_text((72, 72), text)
        pdf.save(buffer)
        pdf.close()
        # getvalue() yields the whole buffer regardless of the stream position.
        return ContentFile(buffer.getvalue(), "test_file.pdf")

    return _build


@pytest.fixture
def attach_file_to_article(db):
    """Provide a helper that stores ``content`` as an ``ArticleFile`` of ``article``."""

    def _attach(article, content, file_name):
        ArticleFile.objects.create(
            article_id=article,
            file=File(content, name=file_name),
        )

    return _attach


class TestCheckContainsFundedBySCOAP3:
    """Exercise ``check_contains_funded_by_scoap3`` against generated PDFs."""

    # Expected descriptions, kept in one place so each test states intent only.
    FOUND_PREFIX = "Files contain the required text: 'Funded by SCOAP3'"
    NOT_FOUND = "Files do not contain the required text: 'Funded by SCOAP3'"
    NO_FILES = "No files found for the given article."

    def test_contains_funded_by(
        self, create_article, create_pdf_with_text, attach_file_to_article
    ):
        """A single file carrying the phrase makes the article compliant."""
        article = create_article()
        attach_file_to_article(
            article, create_pdf_with_text("Funded by SCOAP3"), "file_with_text.pdf"
        )

        passed, description = check_contains_funded_by_scoap3(article)

        assert passed is True
        assert description.startswith(self.FOUND_PREFIX)

    def test_contains_funded_by_multiple_files(
        self, create_article, create_pdf_with_text, attach_file_to_article
    ):
        """Several files that all carry the phrase are still compliant."""
        article = create_article()
        attachments = [
            ("file_with_text_1.pdf", "Funded by SCOAP3"),
            ("file_with_text_2.pdf", "Some other content. Funded by SCOAP3 again."),
        ]
        for file_name, text in attachments:
            attach_file_to_article(article, create_pdf_with_text(text), file_name)

        passed, description = check_contains_funded_by_scoap3(article)

        assert passed is True
        assert description.startswith(self.FOUND_PREFIX)

    def test_does_not_contain_funded_by(
        self, create_article, create_pdf_with_text, attach_file_to_article
    ):
        """A single file without the phrase is reported as non-compliant."""
        article = create_article()
        attach_file_to_article(
            article, create_pdf_with_text("Other text"), "file_without_text.pdf"
        )

        passed, description = check_contains_funded_by_scoap3(article)

        assert passed is False
        assert description == self.NOT_FOUND

    def test_does_not_contain_funded_by_multiple_files(
        self, create_article, create_pdf_with_text, attach_file_to_article
    ):
        """Several files, none carrying the phrase, are non-compliant."""
        article = create_article()
        attachments = [
            ("file_without_text_1.pdf", "This is some random text."),
            ("file_without_text_2.pdf", "Some other random content."),
        ]
        for file_name, text in attachments:
            attach_file_to_article(article, create_pdf_with_text(text), file_name)

        passed, description = check_contains_funded_by_scoap3(article)

        assert passed is False
        assert description == self.NOT_FOUND

    def test_mixed_files(
        self, create_article, create_pdf_with_text, attach_file_to_article
    ):
        """One matching file among non-matching ones is enough."""
        article = create_article()
        attachments = [
            ("file_with_text.pdf", "Funded by SCOAP3"),
            ("file_without_text.pdf", "Other text"),
        ]
        for file_name, text in attachments:
            attach_file_to_article(article, create_pdf_with_text(text), file_name)

        passed, description = check_contains_funded_by_scoap3(article)

        assert passed is True
        assert description.startswith(self.FOUND_PREFIX)

    def test_no_files(self, create_article):
        """An article without any attached file is non-compliant."""
        article = create_article()

        passed, description = check_contains_funded_by_scoap3(article)

        assert passed is False
        assert description == self.NO_FILES
22 changes: 22 additions & 0 deletions scoap3/articles/util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from datetime import datetime

import fitz

from scoap3.articles.models import ArticleIdentifierType


Expand All @@ -25,3 +27,23 @@ def get_arxiv_primary_category(article_document):

def parse_string_to_date_object(date_string):
return datetime.fromisoformat(date_string.replace("Z", "+00:00"))


def is_string_in_pdf(pdf_path, search_string):
    """Return whether ``search_string`` occurs in the text of a PDF file.

    The comparison is case-insensitive and is performed page by page, so a
    phrase that is split across two pages is not matched.

    Args:
        pdf_path: Filesystem path of the PDF to inspect.
        search_string: Text to look for.

    Returns:
        True if any page's extracted text contains ``search_string``
        (ignoring case), otherwise False.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        Exception: If the document cannot be opened or read; the original
            error is attached as the cause.
    """
    import os  # local import keeps this function self-contained

    # Raise the builtin FileNotFoundError ourselves so callers catching it
    # work regardless of how PyMuPDF signals a missing path.
    # NOTE(review): PyMuPDF documents its own FileNotFoundError as a
    # RuntimeError subclass, which the builtin handler below would not
    # catch - confirm against the installed version.
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    try:
        needle = search_string.lower()
        # The context manager closes the document on every exit path,
        # including when text extraction raises part-way through.
        with fitz.open(pdf_path) as document:
            return any(needle in page.get_text().lower() for page in document)
    except FileNotFoundError as err:
        # The file may disappear between the existence check and the open.
        raise FileNotFoundError(f"File not found: {pdf_path}") from err
    except Exception as err:
        raise Exception(
            f"An error occurred while reading the PDF: {str(err)}"
        ) from err
23 changes: 23 additions & 0 deletions scoap3/authors/migrations/0005_alter_author_article_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 4.2.5 on 2024-09-17 11:31

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Redefine ``author.article_id`` as a cascading FK to ``articles.article``.

    The reverse accessor on the article side is named ``authors``.
    """

    dependencies = [
        (
            "articles",
            "0015_compliancereport_check_contains_funded_by_scoap3_and_more",
        ),
        ("authors", "0004_alter_authoridentifier_author_id"),
    ]

    operations = [
        migrations.AlterField(
            "author",
            "article_id",
            # Positional arguments are the target model and the on_delete rule.
            models.ForeignKey(
                "articles.article",
                django.db.models.deletion.CASCADE,
                related_name="authors",
            ),
        ),
    ]
Loading