springer: fix xml math expressions

Signed-off-by: pamfilos <[email protected]>
cern-sis · Nov 27, 2024 · f738fc6 · f738fc6
1 parent 4331dbe
commit f738fc6
Show file tree

Hide file tree

Showing 4 changed files with 350 additions and 2 deletions.
diff --git a/dags/common/cleanup.py b/dags/common/cleanup.py
@@ -15,6 +15,27 @@ def convert_html_subscripts_to_latex(input):
     input = re.sub("<sup>(.*?)</sup>", r"$^{\1}$", input)
     return input
 
+def clean_inline_expressions(input):
+    """Reduce inline-equation markup in an XML string to its TeX source.
+
+    The rewrites are applied in this order:
+
+    1. ``<InlineEquation ...>...</InlineEquation>`` is replaced by its
+       inner content.
+    2. ``<EquationSource Format="TEX"><![CDATA[...]]></EquationSource>`` is
+       replaced by the CDATA payload.
+    3. ``<EquationSource Format="MATHML">...</EquationSource>`` is removed.
+    4. Every ``\\n`` and ``\\r`` character is deleted.
+
+    Args:
+        input: XML text to clean (a ``str``).
+
+    Returns:
+        The cleaned text, on a single line.
+    """
+    # Drop the <InlineEquation> wrapper but keep whatever it encloses.
+    input = re.sub(
+        r"<InlineEquation.*?>(.*?)</InlineEquation>",
+        r"\1",
+        input,
+        flags=re.DOTALL,
+    )
+    # Keep only the TeX payload. DOTALL lets "." cross line breaks, so a
+    # payload spread over several lines is unwrapped as well; without it
+    # the wrapper tags would survive and only lose their newlines below.
+    input = re.sub(
+        r"<EquationSource Format=\"TEX\"><!\[CDATA\[(.*?)\]\]></EquationSource>",
+        r"\1",
+        input,
+        flags=re.DOTALL,
+    )
+    # The MathML rendering duplicates the TeX one, so discard it entirely.
+    input = re.sub(
+        r"<EquationSource Format=\"MATHML\">.*?</EquationSource>",
+        "",
+        input,
+        flags=re.DOTALL,
+    )
+    input = input.replace('\n', '').replace('\r', '')
+
+    return input
 
 def convert_html_italics_to_latex(input):
     input = re.sub(r"<italic\b[^>]*>(.*?)</italic>", r"$\\textit{\1}$", input)

diff --git a/dags/springer/springer_process_file.py b/dags/springer/springer_process_file.py
@@ -9,6 +9,13 @@
 from common.exceptions import EmptyOutputFromPreviousTask
 from common.scoap3_s3 import Scoap3Repository
 from common.utils import create_or_update_article, upload_json_to_s3
+from common.cleanup import (
+    replace_cdata_format,
+    convert_html_subscripts_to_latex,
+    convert_html_italics_to_latex,
+    clean_whitespace_characters,
+    clean_inline_expressions,
+)
 from inspire_utils.record import get_value
 from jsonschema import validate
 from springer.parser import SpringerParser
@@ -17,12 +24,23 @@
 
 logger = get_logger()
 
+def process_xml(input):
+    """Run the raw XML text through the cleanup helpers before parsing.
+
+    The helpers imported from ``common.cleanup`` are applied one after the
+    other in a fixed order. The result then has its line breaks removed and
+    its surrounding whitespace trimmed, and is finally handed to
+    ``clean_whitespace_characters``.
+
+    Args:
+        input: XML document as text.
+
+    Returns:
+        Whatever ``clean_whitespace_characters`` returns for the cleaned
+        text (presumably a ``str`` -- confirm against ``common.cleanup``).
+    """
+    # Order matters: each helper consumes the previous helper's output.
+    cleanup_steps = (
+        convert_html_subscripts_to_latex,
+        convert_html_italics_to_latex,
+        replace_cdata_format,
+        clean_inline_expressions,
+    )
+    for step in cleanup_steps:
+        input = step(input)
+
+    single_line = input.replace('\n', '').replace('\r', '')
+    # One strip() trims both ends, same as lstrip().rstrip().
+    return clean_whitespace_characters(single_line.strip())
 
 def springer_parse_file(**kwargs):
     if "params" in kwargs and "file" in kwargs["params"]:
         encoded_xml = kwargs["params"]["file"]
         file_name = kwargs["params"]["file_name"]
         xml_bytes = base64.b64decode(encoded_xml)
+        if isinstance(xml_bytes, bytes):
+            xml_bytes = xml_bytes.decode('utf-8')
+        xml_bytes = process_xml(xml_bytes)
         xml = ET.fromstring(xml_bytes.decode("utf-8"))
 
         parser = SpringerParser(file_name)

diff --git a/tests/units/springer/test_parser.py b/tests/units/springer/test_parser.py
@@ -3,6 +3,7 @@
 
 from pytest import fixture
 from springer.parser import SpringerParser
+from springer.springer_process_file import process_xml
 from common.enhancer import Enhancer
 
 
@@ -17,8 +18,8 @@ def articles(datadir):
     articles = []
     for filename in sorted(listdir(datadir)):
         with open(datadir / filename) as file:
-            articles.append(ET.fromstring(file.read()))
-
+            xml = process_xml(file.read())
+            articles.append(ET.fromstring(xml))
     return articles
 
 
@@ -27,6 +28,21 @@ def parsed_articles(parser, articles):
     return [parser._publisher_specific_parsing(article) for article in articles]
 
 
+def test_weird_titles(parsed_articles):
+    """Check parsed titles, including those with inline ``$$...$$`` TeX.
+
+    Both lists are sorted before comparing, so the order in which the
+    fixture yields the articles does not matter.
+    """
+    title_with_decays = (
+        " $$(g-2)_{e,\\mu }$$ anomalies and decays $$h\\rightarrow e_a e_b$$ , "
+        "$$Z\\rightarrow e_ae_b$$ , and $$e_b\\rightarrow e_a \\gamma $$ in a two "
+        "Higgs doublet model with inverse seesaw neutrinos"
+    )
+    expected_results = [
+        title_with_decays,
+        " $$\\Lambda $$ polarization in very high energy heavy ion collisions as a probe of the quark–gluon plasma formation and properties",
+        "A strategy for a general search for new phenomena using data-derived signal regions and its application within the ATLAS experiment",
+        "Revisiting the mechanical properties of the nucleon",
+        "Symmetry breaking in quantum curves and super Chern-Simons matrix models",
+    ]
+    expected_results.sort()
+
+    parsed_titles = [article.get("title") for article in parsed_articles]
+    parsed_titles.sort()
+
+    assert expected_results == parsed_titles
+
+
 def test_authors(parsed_articles):
     expected_results = (
         [