cern-sis · pamfilos · Dec 5, 2024 · Nov 27, 2024 · Nov 28, 2024 · Nov 28, 2024
diff --git a/dags/common/cleanup.py b/dags/common/cleanup.py
@@ -16,6 +16,26 @@ def convert_html_subscripts_to_latex(input):
     return input
 
 
+def clean_inline_expressions(input):
+    """Reduce inline-equation markup in ``input`` to its TeX source.
+
+    Unwraps ``<InlineEquation>``, replaces each TEX ``<EquationSource>`` by
+    its CDATA payload, drops MATHML sources, then deletes every CR/LF.
+    """
+    tex_source = (
+        r"<EquationSource Format=\"TEX\"><!\[CDATA\[(.*?)\]\]></EquationSource>"
+    )
+    # DOTALL everywhere: markup may span lines; breaks are only removed below.
+    rules = (
+        (r"<InlineEquation.*?>(.*?)</InlineEquation>", r"\1"),
+        (tex_source, r"\1"),
+        (r"<EquationSource Format=\"MATHML\">.*?</EquationSource>", ""),
+    )
+    for pattern, replacement in rules:
+        input = re.sub(pattern, replacement, input, flags=re.DOTALL)
+    return input.replace("\n", "").replace("\r", "")
+
+
 def convert_html_italics_to_latex(input):
     input = re.sub(r"<italic\b[^>]*>(.*?)</italic>", r"$\\textit{\1}$", input)
     return input

diff --git a/dags/common/utils.py b/dags/common/utils.py
@@ -320,7 +320,7 @@ def parse_country_from_value(affiliation_value):
 
         if len(mapped_countries) > 1 or len(mapped_countries) == 0:
             raise FoundMoreThanOneMatchOrNone(affiliation_value)
-        return mapped_countries[0].name
+        return mapped_countries[0].get("name", "")
     except (LookupError, FoundMoreThanOneMatchOrNone):
         return find_country_match_from_mapping(affiliation_value)
 

diff --git a/dags/elsevier/parser.py b/dags/elsevier/parser.py
@@ -78,6 +78,7 @@ def __init__(self):
             TextExtractor(
                 destination="copyright_holder",
                 source="item-info/copyright",
+                required=False,
             ),
             AttributeExtractor(
                 destination="copyright_year",
@@ -87,6 +88,7 @@ def __init__(self):
             TextExtractor(
                 destination="copyright_statement",
                 source="item-info/copyright",
+                required=False,
             ),
             CustomExtractor(
                 destination="journal_doctype",

diff --git a/dags/springer/parser.py b/dags/springer/parser.py
@@ -138,6 +138,9 @@ def is_latex_node(node):
             "./Journal/Volume/Issue/Article/ArticleHeader/Abstract/Para"
         )
 
+        if paragraph is None:
+            return ""
+
         text_to_skip_flatten = [
             child_node.text
             for child in paragraph

diff --git a/dags/springer/springer_process_file.py b/dags/springer/springer_process_file.py
@@ -4,6 +4,13 @@
 import pendulum
 import requests
 from airflow.decorators import dag, task
+from common.cleanup import (
+    clean_inline_expressions,
+    clean_whitespace_characters,
+    convert_html_italics_to_latex,
+    convert_html_subscripts_to_latex,
+    replace_cdata_format,
+)
 from common.enhancer import Enhancer
 from common.enricher import Enricher
 from common.exceptions import EmptyOutputFromPreviousTask
@@ -18,11 +25,24 @@
 logger = get_logger()
 
 
+def process_xml(input):
+    """Pass ``input`` through the cleanup helpers; also drop CR/LF and trim it."""
+    for step in (convert_html_subscripts_to_latex, convert_html_italics_to_latex):
+        input = step(input)
+    input = clean_inline_expressions(replace_cdata_format(input))
+    # Line breaks are deleted outright, then both ends are trimmed once.
+    flattened = input.replace("\n", "").replace("\r", "").strip()
+    return clean_whitespace_characters(flattened)
+
+
 def springer_parse_file(**kwargs):
     if "params" in kwargs and "file" in kwargs["params"]:
         encoded_xml = kwargs["params"]["file"]
         file_name = kwargs["params"]["file_name"]
         xml_bytes = base64.b64decode(encoded_xml)
+        if isinstance(xml_bytes, bytes):
+            xml_bytes = xml_bytes.decode("utf-8")
+        xml_bytes = process_xml(xml_bytes)
         xml = ET.fromstring(xml_bytes.decode("utf-8"))
 
         parser = SpringerParser(file_name)