Skip to content

Commit

Permalink
feat(export): Added year export for extraction
Browse files Browse the repository at this point in the history
Signed-off-by: Lorenzo Vagliano
  • Loading branch information
Lorenzovagliano committed Nov 15, 2024
1 parent 5c123f2 commit 188662e
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 0 deletions.
40 changes: 40 additions & 0 deletions scoap3/management/commands/year_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import csv
import datetime
import logging

from django.core.files.storage import storages
from django.core.management.base import BaseCommand, CommandParser

from scoap3.utils.tools import year_export

logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Management command that writes the per-article year export to a CSV file.

    The rows are produced by ``year_export`` and written through the default
    Django storage backend.
    """

    help = "Export article information by year"

    def add_arguments(self, parser: CommandParser) -> None:
        """Register the optional ``--start`` and ``--end`` date filters."""
        parser.add_argument(
            "--start",
            type=str,
            required=False,
            help="Lower bound for the publication date, format YYYY-MM-DD.",
        )

        parser.add_argument(
            "--end",
            type=str,
            required=False,
            help="Upper bound for the publication date, format YYYY-MM-DD.",
        )

    def handle(self, *args, **options):
        """Run the export and store the result as a timestamped CSV file."""
        storage = storages["default"]
        result = year_export(options["start"], options["end"])

        # NOTE(review): str(datetime.now()) puts a space and colons into the
        # file name; kept unchanged here so existing consumers of the export
        # naming scheme are not affected -- confirm whether a sanitized
        # timestamp would be preferable for the configured storage backend.
        filename = f"scoap3_export_years_{datetime.datetime.now()}.csv"

        with storage.open(filename, "w") as f:
            writer = csv.writer(f)
            writer.writerow(result["header"])
            writer.writerows(result["data"])

        # Tell the operator where the export ended up and how large it is.
        self.stdout.write(
            self.style.SUCCESS(
                f"Exported {len(result['data'])} rows to {filename}"
            )
        )
8 changes: 8 additions & 0 deletions scoap3/misc/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ def get_ror(self, obj):
else:
return None

def to_representation(self, instance):
    """Serialize ``instance``, leaving out the ``ror`` key when it has no value.

    The parent serializer builds the mapping; an entry whose ``ror`` is
    ``None`` (or absent) is returned without a ``ror`` key at all.
    """
    payload = super().to_representation(instance)

    ror_is_unset = "ror" not in payload or payload["ror"] is None
    if ror_is_unset:
        # pop with a default so an already-absent key is not an error
        payload.pop("ror", None)

    return payload


class InstitutionIdentifierSerializer(serializers.ModelSerializer):
class Meta:
Expand Down
78 changes: 78 additions & 0 deletions scoap3/utils/tools.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
from collections import Counter
from datetime import datetime

from django.db import connection
from django.db.models import Max
Expand Down Expand Up @@ -162,6 +163,83 @@ def author_export(search_year, search_country):
return {"header": result_headers, "data": result_data}


def year_export(start_date=None, end_date=None):
    """Build one row of summary statistics per article for the year export.

    Every article document returned by the search index is scanned, optionally
    restricted to a publication-date window, and reduced to a row of counts
    (authors, ORCIDs, affiliations, RORs, related datasets and software).

    Articles with missing publication info or publication date no longer abort
    the export; the corresponding cells are left empty (``None``).

    Args:
        start_date: Optional ``YYYY-MM-DD`` string used as the ``gte`` bound
            of the ``publication_date`` range filter.
        end_date: Optional ``YYYY-MM-DD`` string used as the ``lte`` bound of
            the ``publication_date`` range filter.

    Returns:
        A dict with ``"header"`` (list of column titles) and ``"data"`` (list
        of rows, each ordered like the header).

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not ``YYYY-MM-DD``.
    """
    result_headers = [
        "year",
        "journal",
        "doi",
        "arxiv number",
        "primary arxiv category",
        "total number of authors",
        "total number of ORCIDs linked to the authors",
        "total number of affiliations",
        "total number of ROR linked with the affiliations",
        "total number of related materials, type dataset",
        "total number of related materials, type software",
    ]

    # Parse the bounds before touching the search backend so that a malformed
    # date fails fast with a ValueError.
    # NOTE(review): the end bound is parsed to midnight of that day; if
    # publication_date carries a time component, later timestamps on the end
    # day would be excluded -- confirm against the index mapping.
    date_range = {}
    if start_date:
        date_range["gte"] = datetime.strptime(start_date, "%Y-%m-%d")
    if end_date:
        date_range["lte"] = datetime.strptime(end_date, "%Y-%m-%d")

    search = ArticleDocument.search()
    if date_range:
        search = search.filter("range", publication_date=date_range)

    result_data = []
    for article in search.scan():
        # A single incomplete record must not abort the whole export.
        publication_date = getattr(article, "publication_date", None)
        year = publication_date.year if publication_date else None

        publication_info = getattr(article, "publication_info", None)
        journal = (
            getattr(publication_info[0], "journal_title", None)
            if publication_info
            else None
        )

        doi = get_first_doi(article)
        arxiv = get_first_arxiv(article)
        arxiv_category = get_arxiv_primary_category(article)

        article_data = article.to_dict()
        # "or []" also covers keys that are present but explicitly null.
        authors = article_data.get("authors") or []
        total_authors = len(authors)
        total_orcid = sum(1 for author in authors if author.get("orcid"))

        total_affiliations = 0
        total_ror = 0
        for author in authors:
            affiliations = author.get("affiliations") or []
            total_affiliations += len(affiliations)
            total_ror += sum(
                1 for affiliation in affiliations if affiliation.get("ror")
            )

        related_materials = getattr(article, "related_materials", None) or []
        material_types = [
            getattr(related_material, "related_material_type", None)
            for related_material in related_materials
        ]
        total_related_materials_dataset = material_types.count("dataset")
        total_related_materials_software = material_types.count("software")

        result_data.append(
            [
                year,
                journal,
                doi,
                arxiv,
                arxiv_category,
                total_authors,
                total_orcid,
                total_affiliations,
                total_ror,
                total_related_materials_dataset,
                total_related_materials_software,
            ]
        )

    return {"header": result_headers, "data": result_data}


def update_article_db_model_sequence(new_start_sequence):
max_id = Article.objects.aggregate(max_id=Max("id"))["max_id"] or 0
if new_start_sequence <= max_id:
Expand Down

0 comments on commit 188662e

Please sign in to comment.