Skip to content

Commit

Permalink
annual-reports: add dedicated API
Browse files Browse the repository at this point in the history
  • Loading branch information
drjova committed Nov 10, 2023
1 parent ac796c2 commit 673d1c8
Show file tree
Hide file tree
Showing 10 changed files with 1,399 additions and 775 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[flake8]
max-line-length = 88
extend-ignore = E203, E704
extend-ignore = E203, E704, E501
274 changes: 94 additions & 180 deletions annual-reports/src/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import datetime
import os
import re
import xml.etree.ElementTree as ET

import backoff
import requests
Expand All @@ -9,15 +9,8 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# CDS search URL templates. Each one carries a ``{year}`` placeholder that the
# ``get_*_per_year`` helpers in this module fill in with ``str.format``.
# The lines are far longer than the flake8 limit, hence the per-line noqa.

# Passed to get_number_of_records() by get_publications_per_year()
# (key "publications").
PUBLICATIONS_PER_YEAR = "https://cds.cern.ch/search?p=(980:ARTICLE+or+980:BOOK+or+980:PROCEEDINGS+or+690:'YELLOW+REPORT'+or+980:REPORT)+and+year:{year}+and+(affiliation:CERN+or+260:CERN+or+595:'For+annual+report')+not+595:'Not+for+annual+report'&of=xm&rg=1" # noqa: E501
# Fetched by get_journals_per_year(), which counts identical response lines.
# NOTE(review): presumably ``ot=773__p`` restricts the output to the journal
# title field and ``rg=2000`` caps the number of returned records - confirm.
JOURNALS_PER_YEAR = "https://cds.cern.ch/search?ln=en&cc=Published+Articles&p=(affiliation:CERN+or+595:'For+annual+report')+and+year:{year}+not+980:ConferencePaper+not+980:BookChapter+not+595:'Not+for+annual+report'&action_search=Search&op1=a&m1=a&p1=&f1=&c=Published+Articles&c=&sf=&so=d&rm=&rg=2000&sc=0&of=tb&ot=773__p" # noqa: E501
# Passed to get_number_of_records() (key "published_articles").
PUBLISHED_ARTICLES_PER_YEAR = "https://cds.cern.ch/search?ln=en&cc=Published+Articles&p=%28affiliation%3ACERN+or+595%3A%27For+annual+report%27%29+and+year%3A{year}+not+980%3AConferencePaper+not+980%3ABookChapter+not+595%3A%27Not+for+annual+report%27&action_search=Search&op1=a&m1=a&p1=&f1=&c=Published+Articles&c=&sf=&so=d&rm=&rg=1&sc=0&of=xm" # noqa: E501
# Passed to get_number_of_records()
# (key "contributions_to_conference_proceedings").
CONTRIBUTIONS_TO_CONFERENCE_PROCEEDINGS_PER_YEAR = "https://cds.cern.ch/search?wl=0&ln=en&cc=Published+Articles&p=980%3AARTICLE+and+%28affiliation%3ACERN+or+595%3A%27For+annual+report%27%29+and+year%3A{year}+and+980%3AConferencePaper+not+595%3A%27Not+for+annual+report%27&f=&action_search=Search&c=Published+Articles&c=&sf=author&so=a&rm=&rg=1&sc=1&of=xm" # noqa: E501
# Passed to get_number_of_records() (key "reports_books_and_book_chapters").
REPORTS_BOOKS_AND_BOOK_CHAPTERS_PER_YEAR = "https://cds.cern.ch/search?ln=en&p=affiliation%3ACERN+or+260%3ACERN+and+260%3A{year}++and+%28980%3ABOOK+or+980%3APROCEEDINGS+or+690%3A%27YELLOW+REPORT%27+or+980%3ABookChapter+or+980%3AREPORT%29+not+595%3A%27Not+for+annual+report%27&action_search=Search&op1=a&m1=a&p1=&f1=&c=Articles+%26+Preprints&c=Books+%26+Proceedings&sf=&so=d&rm=&rg=1&sc=1&of=xm" # noqa: E501
# Passed to get_number_of_records() (key "theses").
THESES_PER_YEAR = "https://cds.cern.ch/search?wl=0&ln=en&cc=CERN+Theses&p=502%3A%27{year}%27+and+502%3Aphd&f=&action_search=Search&c=CERN+Theses&c=&sf=&so=d&rm=&rg=1&sc=1&of=xm" # noqa: E501,E261
# Fetched by get_subject_categories_per_year(), which keeps only the response
# lines containing "SzGeCERN" and reads the text following "$$a" on each.
SUBJECT_CATEGORIES_PER_YEAR = "https://cds.cern.ch/search?ln=en&cc=Published+Articles&p=(affiliation:CERN+or+595:'For+annual+report')+and+year:{year}+not+980:ConferencePaper+not+980:BookChapter+not+595:'Not+for+annual+report'&action_search=Search&op1=a&m1=a&p1=&f1=&c=Published+Articles&c=&sf=&so=d&rm=&rg=2000&sc=0&of=tb&ot=65017" # noqa: E501

# URL template for the CDS ``custom_query_summary.py`` endpoint. ``{year}`` is
# used for both ``start`` and ``end`` and ``{cds_token}`` for ``apikey``; both
# are filled in with ``str.format`` by the request_*_from_cds methods.
PUBLICATIONS = "https://cds.cern.ch/tools/custom_query_summary.py?start={year}&end={year}&apikey={cds_token}&refresh=1&repeated_values=0"
# Same query with an ``otag`` parameter appended.
# NOTE(review): presumably 65017a selects the subject-category subfield -
# confirm against the CDS endpoint documentation.
SUBJECTS = PUBLICATIONS + "&otag=65017a"

# Module-wide structlog logger.
LOGGING = structlog.get_logger("Annual_Report_API")

Expand All @@ -30,147 +23,6 @@ def _backoff_handler(details):
)


def get_number_of_records(url):
    """
    Fetch ``url`` and extract the total number of results it reports.
    Parameters
    ----------
    url : str
        The URL to request.
    Returns
    -------
    int or None
        The integer following ``Search-Engine-Total-Number-Of-Results: `` in
        the response text, or None when that marker is not present.
    Raises
    ------
    requests.exceptions.HTTPError
        If the response carries an HTTP error status.
    """
    reply = requests.get(url)
    reply.raise_for_status()
    found = re.search(r"Search-Engine-Total-Number-Of-Results: (\d+)", reply.text)
    return int(found.group(1)) if found else None


@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,
    max_tries=5,
    on_backoff=_backoff_handler,
)
def get_publications_per_year(year):
    """
    Count the records of every publication type for one year.
    Parameters
    ----------
    year : int
        The year substituted into each search URL template.
    Returns
    -------
    dict
        Maps each publication type name to the value returned by
        ``get_number_of_records`` for that type's URL.
    """
    # Order matters only in that it fixes the order of the HTTP requests and
    # of the keys in the returned dict.
    templates = (
        ("publications", PUBLICATIONS_PER_YEAR),
        ("published_articles", PUBLISHED_ARTICLES_PER_YEAR),
        (
            "contributions_to_conference_proceedings",
            CONTRIBUTIONS_TO_CONFERENCE_PROCEEDINGS_PER_YEAR,
        ),
        ("reports_books_and_book_chapters", REPORTS_BOOKS_AND_BOOK_CHAPTERS_PER_YEAR),
        ("theses", THESES_PER_YEAR),
    )
    return {
        label: get_number_of_records(template.format(year=year))
        for label, template in templates
    }


@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,
    max_tries=5,
    on_backoff=_backoff_handler,
)
def get_journals_per_year(year):
    """
    Count how often each journal line occurs in the search output of a year.
    Parameters
    ----------
    year : int
        The year substituted into ``JOURNALS_PER_YEAR``.
    Returns
    -------
    dict
        Maps every non-empty line of the response text to the number of
        times that exact line appears.
    """
    reply = requests.get(JOURNALS_PER_YEAR.format(year=year))
    reply.raise_for_status()

    tally = {}
    for line in reply.text.split("\n"):
        # Empty lines are not journals and are never counted.
        if not line:
            continue
        tally[line] = tally.get(line, 0) + 1
    return tally


@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,
    max_tries=5,
    on_backoff=_backoff_handler,
)
def get_subject_categories_per_year(year):
    """
    Count the subject categories found in the search output of a year.
    Parameters
    ----------
    year : int
        The year substituted into ``SUBJECT_CATEGORIES_PER_YEAR``.
    Returns
    -------
    dict
        Maps each category name to the number of response lines carrying it.
    """
    reply = requests.get(SUBJECT_CATEGORIES_PER_YEAR.format(year=year))
    reply.raise_for_status()

    tally = {}
    for line in reply.text.split("\n"):
        # Only lines mentioning "SzGeCERN" are considered.
        if "SzGeCERN" not in line:
            continue

        pieces = line.split("$$a")
        if len(pieces) < 2:
            # No "$$a" marker on this line: malformed, skip it.
            continue

        # The name runs from the first "$$a" up to the next "$$".
        name = pieces[1].split("$$")[0]
        tally[name] = tally.get(name, 0) + 1

    return tally


class AnnualReportsAPI:
def __init__(
self,
Expand All @@ -180,19 +32,22 @@ def __init__(
db_name: str = "",
db_port: str = "",
years: list = None,
cds_token: str = "",
) -> None:
self.db_user = db_user or os.environ.get("DB_USER")
self.db_password = db_password or os.environ.get("DB_PASSWORD")
self.db_host = db_host or os.environ.get("DB_HOST")
self.db_name = db_name or os.environ.get("MATOMO_DB_NAME")
self.db_port = db_port or os.environ.get("DB_PORT")
self.cds_token = cds_token or os.environ.get("CDS_TOKEN")
if not all(
[
self.db_user,
self.db_password,
self.db_host,
self.db_name,
self.db_port,
self.cds_token,
]
):
raise ValueError("All the required attributes must be passed!")
Expand All @@ -206,18 +61,84 @@ def __init__(
def create_tables(self):
    """Create the tables registered on ``Base.metadata`` using ``self.engine``.

    ``checkfirst=True`` is passed through to ``create_all``.
    """
    schema = Base.metadata
    schema.create_all(self.engine, checkfirst=True)

def get_categories(self):
def drop_tables(self):
    """Drop the tables registered on ``Base.metadata`` using ``self.engine``.

    ``checkfirst=True`` is passed through to ``drop_all``.
    """
    schema = Base.metadata
    schema.drop_all(self.engine, checkfirst=True)

@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,
    max_tries=5,
    on_backoff=_backoff_handler,
)
def request_publications_from_cds(self, year):
    """
    Request the publications summary of a year and parse it as XML.
    Parameters
    ----------
    year : int
        The year substituted into the ``PUBLICATIONS`` URL template.
    Returns
    -------
    xml.etree.ElementTree.Element
        The root element of the parsed response body.
    """
    query_url = PUBLICATIONS.format(cds_token=self.cds_token, year=year)
    reply = requests.get(query_url)
    reply.raise_for_status()
    return ET.fromstring(reply.content)

@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,
    max_tries=5,
    on_backoff=_backoff_handler,
)
def request_subjects_from_cds(self, year):
    """
    Request the subjects summary of a year and parse it as XML.
    Parameters
    ----------
    year : int
        The year substituted into the ``SUBJECTS`` URL template.
    Returns
    -------
    xml.etree.ElementTree.Element
        The root element of the parsed response body.
    """
    query_url = SUBJECTS.format(cds_token=self.cds_token, year=year)
    reply = requests.get(query_url)
    reply.raise_for_status()
    return ET.fromstring(reply.content)

def get_publications_by_year(self, year):
    """
    Get the publication counters and the per-journal counts for a given year.
    Parameters
    ----------
    year : int
        The year to request the publications summary for.
    Returns
    -------
    tuple of (dict, dict)
        The first dict holds the attributes of the ``yearly_report`` element
        without its ``year`` entry. The second maps the ``result`` text of
        each ``line`` element to its ``nb`` text. All values are the strings
        found in the XML; callers convert them as needed. Lines whose name
        contains ``TOTAL`` are left out, as are lines lacking a name or a
        count.
    Raises
    ------
    ValueError
        If the response has no ``yearly_report`` element.
    """
    root = self.request_publications_from_cds(year)
    yearly_report = root.find("yearly_report")
    if yearly_report is None:
        raise ValueError(
            "No yearly_report element in the CDS response for year {}".format(year)
        )

    # Build a new dict rather than deleting from ``attrib``: the parsed tree
    # stays untouched and a missing ``year`` attribute is tolerated.
    publication_report_count = {
        key: value for key, value in yearly_report.attrib.items() if key != "year"
    }

    journals = {}
    for line in yearly_report.findall("line"):
        name = line.findtext("result")
        count = line.findtext("nb")
        # An absent or empty <result>/<nb> cannot be stored as a journal row.
        if not name or not count:
            continue
        # Summary rows are not journals.
        if "TOTAL" in name:
            continue
        journals[name] = count
    return publication_report_count, journals

def get_subjects_by_year(self, year):
    """
    Get the per-subject counts for a given year.
    Parameters
    ----------
    year : int
        The year to request the subjects summary for.
    Returns
    -------
    dict
        Maps the ``result`` text of each ``line`` element to its ``nb`` text.
        Values are the strings found in the XML; callers convert them as
        needed. Lines whose name contains ``TOTAL`` are left out, as are
        lines lacking a name or a count.
    Raises
    ------
    ValueError
        If the response has no ``yearly_report`` element.
    """
    root = self.request_subjects_from_cds(year)
    yearly_report = root.find("yearly_report")
    if yearly_report is None:
        raise ValueError(
            "No yearly_report element in the CDS response for year {}".format(year)
        )

    subjects = {}
    for line in yearly_report.findall("line"):
        name = line.findtext("result")
        count = line.findtext("nb")
        # An absent or empty <result>/<nb> cannot be stored as a subject row.
        if not name or not count:
            continue
        # Summary rows are not subjects.
        if "TOTAL" in name:
            continue
        subjects[name] = count
    return subjects

def get_subjects(self):
for year in self.years:
year = int(year)
LOGGING.info("Getting categories", year=year)
results = get_subject_categories_per_year(year)
results = self.get_subjects_by_year(year)
year_to_date = datetime.date(year, 1, 1)
with Session(self.engine) as session:
try:
LOGGING.info("Deleting categories", year=year)
session.query(Categories).filter_by(year=year_to_date).delete()
records = [
Categories(year=year_to_date, category=key, count=value)
Categories(year=year_to_date, category=key, count=int(value))
for key, value in results.items()
]
LOGGING.info(
Expand All @@ -231,46 +152,39 @@ def get_categories(self):
else:
LOGGING.info("Populate categories success")

def get_journals(self):
def get_publications(self):
for year in self.years:
year = int(year)
LOGGING.info("Getting journals", year=year)
results = get_journals_per_year(year)
LOGGING.info("Getting publications", year=year)
publications, journals = self.get_publications_by_year(year)
year_to_date = datetime.date(year, 1, 1)
with Session(self.engine) as session:
try:
LOGGING.info("Deleting publications", year=year)
session.query(Publications).filter_by(year=year_to_date).delete()
LOGGING.info("Populate publications", publications=publications)
records = Publications(year=year_to_date, **publications)
session.add(records)
session.commit()
except Exception as e:
print("ERROR: " + str(e))
LOGGING.exception("Populate publications")
else:
LOGGING.info("Populate publications success")
with Session(self.engine) as session:
try:
LOGGING.info("Deleting journals", year=year)
session.query(Journals).filter_by(year=year_to_date).delete()
records = [
Journals(year=year_to_date, journal=key, count=value)
for key, value in results.items()
Journals(year=year_to_date, journal=key, count=int(value))
for key, value in journals.items()
]
LOGGING.info(
"Populate journals", count=len(records), journals=results
"Populate journals", count=len(records), journals=journals
)
session.add_all(records)
session.commit()
except Exception as e: # noqa: F841
LOGGING.exception("Populate journals")
else:
LOGGING.info("Populate journals success")

def get_publications(self):
for year in self.years:
year = int(year)
LOGGING.info("Getting publications", year=year)
results = get_publications_per_year(year)
year_to_date = datetime.date(year, 1, 1)
with Session(self.engine) as session:
try:
LOGGING.info("Deleting publications", year=year)
session.query(Publications).filter_by(year=year_to_date).delete()
LOGGING.info("Populate publications", publications=results)
records = Publications(year=year_to_date, **results)
session.add(records)
session.commit()
except Exception as e:
print("ERROR: " + str(e))
LOGGING.exception("Populate publications")
else:
LOGGING.info("Populate publications success")
7 changes: 2 additions & 5 deletions annual-reports/src/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,8 @@ def fetch_annual_reports(years):
click.echo("Create tables if missing")
annual_reports.create_tables()

click.echo("Fetching categories")
annual_reports.get_categories()

click.echo("Fetching journals")
annual_reports.get_journals()
click.echo("Fetching subjects")
annual_reports.get_subjects()

click.echo("Fetching publications")
annual_reports.get_publications()
Expand Down
6 changes: 3 additions & 3 deletions annual-reports/src/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ class Publications(Base):

id = Column(Integer, primary_key=True)
publications = Column(Integer, nullable=False)
published_articles = Column(Integer, nullable=False)
contributions_to_conference_proceedings = Column(Integer, nullable=False)
reports_books_and_book_chapters = Column(Integer, nullable=False)
journals = Column(Integer, nullable=False)
contributions = Column(Integer, nullable=False)
theses = Column(Integer, nullable=False)
rest = Column(Integer, nullable=False)
year = Column(Date, nullable=False)


Expand Down
Loading

0 comments on commit 673d1c8

Please sign in to comment.