Skip to content

Commit

Permalink
OA: changed golden access calculations
Browse files Browse the repository at this point in the history
  • Loading branch information
ErnestaP committed Apr 23, 2024
1 parent 8bd2528 commit 9b65a91
Show file tree
Hide file tree
Showing 7 changed files with 21,162 additions and 0 deletions.
4 changes: 4 additions & 0 deletions dags/open_access/open_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ def fetch_data_task(query, **kwargs):
url = utils.get_url(f"{base_query}+{query[type_of_query]}")
data = utils.get_data(url)
total = utils.get_total_results_count(data.text)
if type_of_query == "gold":
total = utils.get_gold_access_count(total, url)
if type_of_query == "green":
total = total - utils.get_gold_access_count(total, url)
return {type_of_query: total}

@task(multiple_outputs=True, executor_config=kubernetes_executor_config)
Expand Down
34 changes: 34 additions & 0 deletions dags/open_access/parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import xml.etree.ElementTree as ET
from io import StringIO


def parse_without_names_spaces(xml):
    """Parse an XML document and strip XML namespaces from every tag.

    Args:
        xml: either an XML string, or a bytes buffer (anything with a
            ``getvalue()`` returning UTF-8 encoded bytes, e.g. ``io.BytesIO``).

    Returns:
        The root ``xml.etree.ElementTree.Element`` with namespace prefixes
        removed from all tags, or ``None`` for an empty document.
    """
    # Use isinstance rather than type() comparison (PEP 8).
    if isinstance(xml, str):
        source = StringIO(xml)
    else:
        source = StringIO(xml.getvalue().decode("utf-8"))
    root = None
    # Capture the root from the first "start" event instead of relying on
    # the undocumented, CPython-only ``it.root`` attribute of iterparse.
    for event, element in ET.iterparse(source, events=("start", "end")):
        # rpartition is idempotent, so stripping on both events is safe.
        element.tag = element.tag.rpartition("}")[-1]
        if root is None:
            root = element
    return root


def get_golden_access_records_ids(data):
    """Extract ids of records carrying a gold (CC BY publication) license.

    Args:
        data: XML search results (string or bytes buffer), as accepted by
            ``parse_without_names_spaces``.

    Returns:
        list[str]: the controlfield-001 values of records whose 540
        datafield has subfield 3 == "publication" and a subfield a license
        containing both "CC" and "BY".
    """
    root = parse_without_names_spaces(data)
    golden_access = []
    for record in root.findall(".record"):
        datafield = record.find("datafield/[@tag='540']")
        # Guard: some records carry no 540 (license) datafield at all;
        # the original code crashed with AttributeError here.
        if datafield is None:
            continue
        record_type = datafield.find("subfield/[@code='3']")
        # Renamed from ``license`` to avoid shadowing the builtin.
        license_subfield = datafield.find("subfield/[@code='a']")
        if record_type is None or license_subfield is None:
            continue
        if (
            "CC" in license_subfield.text
            and "BY" in license_subfield.text
            and record_type.text == "publication"
        ):
            record_id = record.find("controlfield/[@tag='001']")
            if record_id is not None:
                golden_access.append(record_id.text)
    return golden_access
16 changes: 16 additions & 0 deletions dags/open_access/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import math
import re

import backoff
import requests
from open_access.parsers import get_golden_access_records_ids


def get_url(query, current_collection="Published+Articles"):
Expand All @@ -13,6 +15,20 @@ def get_url(query, current_collection="Published+Articles"):
return url


def get_gold_access_count(total, url, page_size=100):
    """Count gold-access records by paging through a search endpoint.

    Args:
        total: total number of records reported by the search engine;
            used only to compute how many pages to fetch.
        url: base search URL; a 1-based ``jrec`` offset is appended per page.
        page_size: records per page served by the endpoint (default 100,
            matching the previous hard-coded value).

    Returns:
        int: number of gold-access record ids found across all pages.
    """
    pages = math.ceil(total / page_size)
    count = 0
    for page in range(pages):
        # jrec is the 1-based index of the first record on this page.
        jrec = page * page_size + 1
        data = get_data(f"{url}&jrec={jrec}")
        count += len(get_golden_access_records_ids(data))
    return count


def get_total_results_count(data):
TOTAL_RECORDS_COUNT = re.compile(
r"Search-Engine-Total-Number-Of-Results" + r":\s(\d*)\s"
Expand Down
1 change: 1 addition & 0 deletions requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ pre-commit==3.6.2
pytest==7.4.4
coverage==7.4.3
pytest-cov==4.1.0
pytest-datadir==1.5.0
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
apache-airflow[celery, postgres, redis, cncf.kubernetes]==2.8.3
alembic==1.13.1
airflow-provider-alembic==1.0.0
elementpath==4.4.0
21,087 changes: 21,087 additions & 0 deletions tests/open_access/data/search.xml

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions tests/open_access/test_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from open_access.parsers import get_golden_access_records_ids

# Record ids (controlfield 001 values) that the parser is expected to
# classify as gold access in the data/search.xml fixture.
expected = [
    "2891488",
    "2888511",
    "2884471",
    "2884470",
    "2883672",
    "2882429",
    "2882335",
    "2882324",
    "2882311",
]


def test_get_golden_access_records_dois(shared_datadir):
    """Gold-access ids parsed from the XML fixture match the expected list."""
    xml_payload = (shared_datadir / "search.xml").read_text()
    assert get_golden_access_records_ids(xml_payload) == expected

0 comments on commit 9b65a91

Please sign in to comment.