diff --git a/dags/common/cleanup.py b/dags/common/cleanup.py
index 229351de..595141ed 100644
--- a/dags/common/cleanup.py
+++ b/dags/common/cleanup.py
@@ -15,28 +15,27 @@ def convert_html_subscripts_to_latex(input):
     input = re.sub("<sup>(.*?)</sup>", r"$^{\1}$", input)
return input
+
def clean_inline_expressions(input):
input = re.sub(
- r"(.*?)",
- r"\1",
- input,
- flags=re.DOTALL
+ r"(.*?)", r"\1", input, flags=re.DOTALL
)
input = re.sub(
r"",
r"\1",
- input
+ input,
)
input = re.sub(
r".*?",
"",
input,
- flags=re.DOTALL
+ flags=re.DOTALL,
)
- input = input.replace('\n', '').replace('\r', '')
+ input = input.replace("\n", "").replace("\r", "")
return input
+
def convert_html_italics_to_latex(input):
input = re.sub(r"]*>(.*?)", r"$\\textit{\1}$", input)
return input
diff --git a/dags/springer/springer_process_file.py b/dags/springer/springer_process_file.py
index 97f446df..6c173eee 100644
--- a/dags/springer/springer_process_file.py
+++ b/dags/springer/springer_process_file.py
@@ -4,18 +4,18 @@
import pendulum
import requests
from airflow.decorators import dag, task
+from common.cleanup import (
+ clean_inline_expressions,
+ clean_whitespace_characters,
+ convert_html_italics_to_latex,
+ convert_html_subscripts_to_latex,
+ replace_cdata_format,
+)
from common.enhancer import Enhancer
from common.enricher import Enricher
from common.exceptions import EmptyOutputFromPreviousTask
from common.scoap3_s3 import Scoap3Repository
from common.utils import create_or_update_article, upload_json_to_s3
-from common.cleanup import (
- replace_cdata_format,
- convert_html_subscripts_to_latex,
- convert_html_italics_to_latex,
- clean_whitespace_characters,
- clean_inline_expressions,
-)
from inspire_utils.record import get_value
from jsonschema import validate
from springer.parser import SpringerParser
@@ -24,22 +24,24 @@
logger = get_logger()
+
def process_xml(input):
input = convert_html_subscripts_to_latex(input)
input = convert_html_italics_to_latex(input)
input = replace_cdata_format(input)
input = clean_inline_expressions(input)
- input = input.replace('\n', '').replace('\r', '').lstrip().rstrip()
+ input = input.replace("\n", "").replace("\r", "").lstrip().rstrip()
input = clean_whitespace_characters(input.strip())
return input
+
def springer_parse_file(**kwargs):
if "params" in kwargs and "file" in kwargs["params"]:
encoded_xml = kwargs["params"]["file"]
file_name = kwargs["params"]["file_name"]
xml_bytes = base64.b64decode(encoded_xml)
if isinstance(xml_bytes, bytes):
- xml_bytes = xml_bytes.decode('utf-8')
+ xml_bytes = xml_bytes.decode("utf-8")
xml_bytes = process_xml(xml_bytes)
xml = ET.fromstring(xml_bytes.decode("utf-8"))
diff --git a/tests/units/aps/test_aps_parser.py b/tests/units/aps/test_aps_parser.py
index b7650dfe..9986f764 100644
--- a/tests/units/aps/test_aps_parser.py
+++ b/tests/units/aps/test_aps_parser.py
@@ -1,8 +1,8 @@
import json
import pytest
-from aps.parser import APSParser
from aps.aps_process_file import enhance_aps
+from aps.parser import APSParser
@pytest.fixture(scope="module")
@@ -203,6 +203,7 @@ def test_aps_parsing(parsed_articles, expected, key):
assert key in article
assert article[key] == expected_value
+
def test_aps_country_parsing(parsed_articles):
for article in parsed_articles:
enhance_aps(article)
diff --git a/tests/units/springer/test_parser.py b/tests/units/springer/test_parser.py
index 3cebb4f0..9a47990c 100644
--- a/tests/units/springer/test_parser.py
+++ b/tests/units/springer/test_parser.py
@@ -30,15 +30,17 @@ def parsed_articles(parser, articles):
def test_weird_titles(parsed_articles):
parsed_titles = sorted([a.get("title") for a in parsed_articles])
- expected_results = sorted([
- " $$(g-2)_{e,\\mu }$$ anomalies and decays $$h\\rightarrow e_a e_b$$ , "
+ expected_results = sorted(
+ [
+ " $$(g-2)_{e,\\mu }$$ anomalies and decays $$h\\rightarrow e_a e_b$$ , "
"$$Z\\rightarrow e_ae_b$$ , and $$e_b\\rightarrow e_a \\gamma $$ in a two "
"Higgs doublet model with inverse seesaw neutrinos",
- " $$\\Lambda $$ polarization in very high energy heavy ion collisions as a probe of the quark–gluon plasma formation and properties",
- "A strategy for a general search for new phenomena using data-derived signal regions and its application within the ATLAS experiment",
- "Revisiting the mechanical properties of the nucleon",
- "Symmetry breaking in quantum curves and super Chern-Simons matrix models"
- ])
+ " $$\\Lambda $$ polarization in very high energy heavy ion collisions as a probe of the quark–gluon plasma formation and properties",
+ "A strategy for a general search for new phenomena using data-derived signal regions and its application within the ATLAS experiment",
+ "Revisiting the mechanical properties of the nucleon",
+ "Symmetry breaking in quantum curves and super Chern-Simons matrix models",
+ ]
+ )
assert expected_results == parsed_titles
@@ -351,7 +353,7 @@ def test_abstract(parsed_articles):
"experimental data of $$(g-2)_{e,\\mu }$$ as well as the "
"promising LFV signals corresponding to the future experimental "
"sensitivities.",
- None
+ None,
)
for abstract, article in zip(abstracts, parsed_articles):
if abstract is None:
diff --git a/tests/units/springer/test_parser/weird.title.Meta b/tests/units/springer/test_parser/weird.title.Meta
index 31607de5..158b942c 100644
--- a/tests/units/springer/test_parser/weird.title.Meta
+++ b/tests/units/springer/test_parser/weird.title.Meta
@@ -220,4 +220,4 @@
-
\ No newline at end of file
+