Skip to content

Commit

Permalink
OA: changed golden access calculations
Browse files Browse the repository at this point in the history
  • Loading branch information
ErnestaP committed Apr 23, 2024
1 parent 8bd2528 commit 9b65a91
Show file tree
Hide file tree
Showing 7 changed files with 21,162 additions and 0 deletions.
4 changes: 4 additions & 0 deletions dags/open_access/open_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ def fetch_data_task(query, **kwargs):
url = utils.get_url(f"{base_query}+{query[type_of_query]}")
data = utils.get_data(url)
total = utils.get_total_results_count(data.text)
if type_of_query == "gold":
total = utils.get_gold_access_count(total, url)
if type_of_query == "green":
total = total - utils.get_gold_access_count(total, url)
return {type_of_query: total}

@task(multiple_outputs=True, executor_config=kubernetes_executor_config)
Expand Down
34 changes: 34 additions & 0 deletions dags/open_access/parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import xml.etree.ElementTree as ET
from io import StringIO


def parse_without_names_spaces(xml):
    """Parse an XML document and strip XML namespaces from every tag.

    Args:
        xml: either an XML string, or a bytes buffer (anything with a
            ``getvalue()`` returning UTF-8 encoded bytes, e.g. ``io.BytesIO``).

    Returns:
        The root ``xml.etree.ElementTree.Element`` with namespace prefixes
        removed from all tags, or ``None`` for an empty document.
    """
    # Use isinstance rather than type() comparison (PEP 8).
    if isinstance(xml, str):
        source = StringIO(xml)
    else:
        source = StringIO(xml.getvalue().decode("utf-8"))
    root = None
    # Capture the root from the first "start" event instead of relying on
    # the undocumented, CPython-only ``it.root`` attribute of iterparse.
    for event, element in ET.iterparse(source, events=("start", "end")):
        # rpartition is idempotent, so stripping on both events is safe.
        element.tag = element.tag.rpartition("}")[-1]
        if root is None:
            root = element
    return root


def get_golden_access_records_ids(data):
    """Extract ids of records carrying a gold (CC BY publication) license.

    Args:
        data: XML search results (string or bytes buffer), as accepted by
            ``parse_without_names_spaces``.

    Returns:
        list[str]: the controlfield-001 values of records whose 540
        datafield has subfield 3 == "publication" and a subfield a license
        containing both "CC" and "BY".
    """
    root = parse_without_names_spaces(data)
    golden_access = []
    for record in root.findall(".record"):
        datafield = record.find("datafield/[@tag='540']")
        # Guard: some records carry no 540 (license) datafield at all;
        # the original code crashed with AttributeError here.
        if datafield is None:
            continue
        record_type = datafield.find("subfield/[@code='3']")
        # Renamed from ``license`` to avoid shadowing the builtin.
        license_subfield = datafield.find("subfield/[@code='a']")
        if record_type is None or license_subfield is None:
            continue
        if (
            "CC" in license_subfield.text
            and "BY" in license_subfield.text
            and record_type.text == "publication"
        ):
            record_id = record.find("controlfield/[@tag='001']")
            if record_id is not None:
                golden_access.append(record_id.text)
    return golden_access
16 changes: 16 additions & 0 deletions dags/open_access/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import math
import re

import backoff
import requests
from open_access.parsers import get_golden_access_records_ids


def get_url(query, current_collection="Published+Articles"):
Expand All @@ -13,6 +15,20 @@ def get_url(query, current_collection="Published+Articles"):
return url


def get_gold_access_count(total, url, page_size=100):
    """Count gold-access records by paging through a search endpoint.

    Args:
        total: total number of records reported by the search engine;
            used only to compute how many pages to fetch.
        url: base search URL; a 1-based ``jrec`` offset is appended per page.
        page_size: records per page served by the endpoint (default 100,
            matching the previous hard-coded value).

    Returns:
        int: number of gold-access record ids found across all pages.
    """
    pages = math.ceil(total / page_size)
    count = 0
    for page in range(pages):
        # jrec is the 1-based index of the first record on this page.
        jrec = page * page_size + 1
        data = get_data(f"{url}&jrec={jrec}")
        count += len(get_golden_access_records_ids(data))
    return count


def get_total_results_count(data):
TOTAL_RECORDS_COUNT = re.compile(
r"Search-Engine-Total-Number-Of-Results" + r":\s(\d*)\s"
Expand Down
1 change: 1 addition & 0 deletions requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ pre-commit==3.6.2
pytest==7.4.4
coverage==7.4.3
pytest-cov==4.1.0
pytest-datadir==1.5.0
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
apache-airflow[celery, postgres, redis, cncf.kubernetes]==2.8.3
alembic==1.13.1
airflow-provider-alembic==1.0.0
elementpath==4.4.0
21,087 changes: 21,087 additions & 0 deletions tests/open_access/data/search.xml

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions tests/open_access/test_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from open_access.parsers import get_golden_access_records_ids

# Record ids (controlfield 001 values) that the parser is expected to
# classify as gold access in the data/search.xml fixture.
expected = [
    "2891488",
    "2888511",
    "2884471",
    "2884470",
    "2883672",
    "2882429",
    "2882335",
    "2882324",
    "2882311",
]


def test_get_golden_access_records_dois(shared_datadir):
    """Gold-access ids parsed from the XML fixture match the expected list."""
    xml_payload = (shared_datadir / "search.xml").read_text()
    assert get_golden_access_records_ids(xml_payload) == expected

0 comments on commit 9b65a91

Please sign in to comment.