Skip to content

Commit

Permalink
fix: Filtered author export based on Country
Browse files Browse the repository at this point in the history
Signed-off-by: Lorenzo Vagliano

Also avoided duplicated entries in output by filtering DOIs
  • Loading branch information
Lorenzovagliano committed Dec 3, 2024
1 parent 236aa07 commit 387e0ae
Show file tree
Hide file tree
Showing 2 changed files with 333 additions and 23 deletions.
304 changes: 304 additions & 0 deletions scoap3/utils/tests/test_author_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
import pytest
from django.test import TestCase

from scoap3.articles.models import Article, ArticleIdentifier
from scoap3.authors.models import Author
from scoap3.misc.models import Affiliation, Country, PublicationInfo, Publisher
from scoap3.utils.tools import author_export


@pytest.mark.django_db
@pytest.mark.vcr
class TestAuthorExport(TestCase):
    """Check ``author_export`` output for different year/country filters.

    Every test calls ``author_export(year, country_code)`` and compares the
    returned dict (``{"header": [...], "data": [...]}``) against an expected
    value built with ``_expected_result``.
    """

    # Column names expected in the "header" entry of every export, in order.
    # Kept in one place so a column change only has to be mirrored here once.
    EXPECTED_HEADER = (
        "year",
        "journal",
        "doi",
        "arxiv number",
        "primary arxiv category",
        "author",
        "country",
        "affiliation",
        "total number of authors",
    )

    def setUp(self):
        """Create the publishers and countries shared by the tests."""
        self.publisher_1 = Publisher.objects.create(name="Elsevier")
        self.publisher_2 = Publisher.objects.create(name="Springer")

        self.country_gb = Country.objects.create(code="GB", name="United Kingdom")
        self.country_fr = Country.objects.create(code="FR", name="France")
        self.country_jp = Country.objects.create(code="JP", name="Japan")
        self.country_be = Country.objects.create(code="BE", name="Belgium")
        self.country_br = Country.objects.create(code="BR", name="Brazil")
        self.country_it = Country.objects.create(code="IT", name="Italy")
        self.country_es = Country.objects.create(code="ES", name="Spain")

    def _expected_result(self, *rows):
        """Return the export dict expected for the given data *rows*.

        Each row is the list of cell values for one exported line, in the
        order of ``EXPECTED_HEADER``. With no rows the "data" entry is empty.
        """
        return {
            "header": list(self.EXPECTED_HEADER),
            "data": [list(row) for row in rows],
        }

    def create_article(
        self,
        title,
        subtitle,
        abstract,
        publication_date,
        doi_value,
        publisher,
        journal_title,
        author_data,
        country,
        affiliation_value,
    ):
        """Create an article with one DOI, one publication info and one author.

        The author is built from ``author_data`` (passed as keyword arguments
        to ``Author.objects.create``) and linked to a single new affiliation
        in ``country`` whose value is ``affiliation_value``.

        Returns the created ``Article``.
        """
        article = Article.objects.create(
            title=title,
            subtitle=subtitle,
            abstract=abstract,
            publication_date=publication_date,
        )

        ArticleIdentifier.objects.create(
            article_id=article,
            identifier_type="DOI",
            identifier_value=doi_value,
        )

        PublicationInfo.objects.create(
            journal_title=journal_title,
            article_id=article,
            publisher=publisher,
        )

        author = Author.objects.create(article_id=article, **author_data)

        affiliation = Affiliation.objects.create(
            country=country,
            value=affiliation_value,
            organization="Example Organization",
        )
        affiliation.author_id.add(author)

        return article

    def test_author_export_no_data(self):
        """Without any article the export contains only the header."""
        result = author_export("2024", "IN")

        assert result == self._expected_result()

    def test_author_export_correct_year_wrong_country(self):
        """An article from the requested year but another country is left out."""
        self.create_article(
            title="Test Article",
            subtitle="Test Subtitle",
            abstract="Test Abstract",
            publication_date="2024-01-01",
            doi_value="TestDOI2",
            publisher=self.publisher_1,
            journal_title="Adv. High Energy Phys.",
            author_data={
                "last_name": "ExampleSurname",
                "first_name": "ExampleName",
                "email": "[email protected]",
                "author_order": 100,
            },
            country=self.country_fr,
            affiliation_value="Example",
        )

        result = author_export("2024", "JP")

        assert result == self._expected_result()

    def test_author_export_wrong_year_correct_country(self):
        """An article from the requested country but another year is left out."""
        self.create_article(
            title="Test Article",
            subtitle="Test Subtitle",
            abstract="Test Abstract",
            publication_date="2024-01-01",
            doi_value="TestDOI3",
            publisher=self.publisher_1,
            journal_title="Adv. High Energy Phys.",
            author_data={
                "last_name": "ExampleSurname",
                "first_name": "ExampleName",
                "email": "[email protected]",
                "author_order": 100,
            },
            country=self.country_be,
            affiliation_value="Example",
        )

        result = author_export("2023", "BE")

        assert result == self._expected_result()

    def test_author_export_filtering(self):
        """Only the article whose affiliation country matches is exported."""
        self.create_article(
            title="Test Article",
            subtitle="Test Subtitle",
            abstract="Test Abstract",
            publication_date="2024-01-01",
            doi_value="TestDOI4",
            publisher=self.publisher_1,
            journal_title="Adv. High Energy Phys.",
            author_data={
                "last_name": "ExampleSurname",
                "first_name": "ExampleName",
                "email": "[email protected]",
                "author_order": 100,
            },
            country=self.country_br,
            affiliation_value="Example",
        )

        self.create_article(
            title="Test Article 2",
            subtitle="Test Subtitle 2",
            abstract="Test Abstract 2",
            publication_date="2024-02-02",
            doi_value="TestDOI5",
            publisher=self.publisher_2,
            journal_title="Adv. High Energy Phys.",
            author_data={
                "last_name": "ExampleSurname2",
                "first_name": "ExampleName2",
                "email": "[email protected]",
                "author_order": 100,
            },
            country=self.country_es,
            affiliation_value="Example2",
        )

        result = author_export("2024", "BR")

        assert result == self._expected_result(
            [
                2024,
                "Adv. High Energy Phys.",
                "TestDOI4",
                None,
                None,
                "ExampleName ExampleSurname",
                "BR",
                "Example",
                1,
            ],
        )

    def test_author_export_multiple(self):
        """Two matching articles each produce one row in the export."""
        self.create_article(
            title="Test Article",
            subtitle="Test Subtitle",
            abstract="Test Abstract",
            publication_date="2024-01-01",
            doi_value="TestDOI6",
            publisher=self.publisher_1,
            journal_title="Adv. High Energy Phys.",
            author_data={
                "last_name": "ExampleSurname",
                "first_name": "ExampleName",
                "email": "[email protected]",
                "author_order": 100,
            },
            country=self.country_it,
            affiliation_value="Example",
        )

        self.create_article(
            title="Test Article 2",
            subtitle="Test Subtitle 2",
            abstract="Test Abstract 2",
            publication_date="2024-02-02",
            doi_value="TestDOI7",
            publisher=self.publisher_2,
            journal_title="Adv. High Energy Phys.",
            author_data={
                "last_name": "ExampleSurname2",
                "first_name": "ExampleName2",
                "email": "[email protected]",
                "author_order": 100,
            },
            country=self.country_it,
            affiliation_value="Example2",
        )

        result = author_export("2024", "IT")

        # NOTE(review): the comparison is order-sensitive and expects the
        # later article (TestDOI7) first — confirm the export order is stable.
        assert result == self._expected_result(
            [
                2024,
                "Adv. High Energy Phys.",
                "TestDOI7",
                None,
                None,
                "ExampleName2 ExampleSurname2",
                "IT",
                "Example2",
                1,
            ],
            [
                2024,
                "Adv. High Energy Phys.",
                "TestDOI6",
                None,
                None,
                "ExampleName ExampleSurname",
                "IT",
                "Example",
                1,
            ],
        )
52 changes: 29 additions & 23 deletions scoap3/utils/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,22 @@ def author_export(search_year, search_country):
if search_year:
search = search.filter("match", publication_date=f"{search_year}-01-01||/y")

if search_country:
search = search.filter("term", countries=search_country)
seen_dois = set()

for article in search.scan():
doi = get_first_doi(article)

if doi in seen_dois or doi is None:
continue

seen_dois.add(doi)

year = article.publication_date.year
journal = article.publication_info[0].journal_title
journal = (
article.publication_info[0].journal_title
if article.publication_info
else None
)
doi = get_first_doi(article)
arxiv = get_first_arxiv(article)
arxiv_category = get_arxiv_primary_category(article)
Expand All @@ -124,41 +134,37 @@ def author_export(search_year, search_country):
missing_author_affiliations = 0

for author in authors:
# if there are no affiliations, we cannot add this author
# (this also means the record is not valid according to the schema)
if not author.affiliations:
missing_author_affiliations += 1
continue

author_first_name = author.get("first_name", "UNKNOWN")
author_last_name = author.get("last_name", "UNKNOWN")
# add extracted information to result list
for affiliation in author.affiliations:
if not affiliation.country:
aff_country = "UNKNOWN"
else:
if affiliation.country.code == search_country:
aff_country = affiliation.country.code
aff_value = affiliation.get("value", "UNKNOWN")
result_data.append(
[
year,
journal,
doi,
arxiv,
arxiv_category,
author_first_name + " " + author_last_name,
aff_country,
aff_value,
total_authors,
]
)
aff_value = affiliation.get("value", "UNKNOWN")
result_data.append(
[
year,
journal,
doi,
arxiv,
arxiv_category,
author_first_name + " " + author_last_name,
aff_country,
aff_value,
total_authors,
]
)

if missing_author_affiliations:
logger.warn(
"Article with DOI: {} had missing affiliations in {} / {} authors".format(
doi, missing_author_affiliations, total_authors
)
)

return {"header": result_headers, "data": result_data}


Expand Down

0 comments on commit 387e0ae

Please sign in to comment.