common: update XML cleanup for HTML tags, fix "subsripts" -> "subscripts" typo

Signed-off-by: pamfilos <[email protected]>
cern-sis · Aug 23, 2024 · 80156a3
1 parent a2512d7
commit 80156a3
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 18 deletions.
diff --git a/dags/common/cleanup.py b/dags/common/cleanup.py
@@ -9,7 +9,7 @@ def clean_whitespace_characters(input):
     return " ".join(input.split())
 
 
-def convert_html_subsripts_to_latex(input):
+def convert_html_subscripts_to_latex(input):
     input = re.sub("<sub>(.*?)</sub>", r"$_{\1}$", input)
     input = re.sub("<inf>(.*?)</inf>", r"$_{\1}$", input)
     input = re.sub("<sup>(.*?)</sup>", r"$^{\1}$", input)

diff --git a/dags/iop/iop_process_file.py b/dags/iop/iop_process_file.py
@@ -10,7 +10,7 @@
 from common.utils import create_or_update_article, upload_json_to_s3
 from common.cleanup import (
     replace_cdata_format,
-    convert_html_subsripts_to_latex,
+    convert_html_subscripts_to_latex,
     convert_html_italics_to_latex,
 )
 from inspire_utils.record import get_value
@@ -20,20 +20,22 @@
 
 logger = get_logger()
 
-def process_xml(xml_file):
-    xml_file = convert_html_subsripts_to_latex(xml_file)
-    xml_file = convert_html_italics_to_latex(xml_file)
-    xml_file = replace_cdata_format(xml_file)
-    return xml_file
+def process_xml(input):
+    input = convert_html_subscripts_to_latex(input)
+    input = convert_html_italics_to_latex(input)
+    input = replace_cdata_format(input)
+    return input
 
 def iop_parse_file(**kwargs):
     if "params" not in kwargs or "file" not in kwargs["params"]:
         raise Exception("There was no 'file' parameter. Exiting run.")
     encoded_xml = kwargs["params"]["file"]
     file_name = kwargs["params"]["file_name"]
     xml_bytes = base64.b64decode(encoded_xml)
+    if isinstance(xml_bytes, bytes):
+        xml_bytes = xml_bytes.decode('utf-8')
     xml_bytes = process_xml(xml_bytes)
-    xml = ET.fromstring(xml_bytes.decode("utf-8"))
+    xml = ET.fromstring(xml_bytes)
 
     parser = IOPParser(file_path=file_name)
     parsed = parser.parse(xml)

diff --git a/tests/units/common/test_cleanup.py b/tests/units/common/test_cleanup.py
@@ -4,7 +4,7 @@
     clean_all_affiliations_for_author,
     clean_collaboration,
     clean_whitespace_characters,
-    convert_html_subsripts_to_latex,
+    convert_html_subscripts_to_latex,
     remove_orcid_prefix,
     remove_specific_tags,
     remove_unnecessary_fields,
@@ -64,36 +64,36 @@ def test_clean_whitespace_characters(test_input, expected):
     "test_input, expected",
     [
         pytest.param(
-            sup, expected_string_sup, id="test_convert_html_subsripts_to_latex_sup"
+            sup, expected_string_sup, id="test_convert_html_subscripts_to_latex_sup"
         ),
         pytest.param(
-            sub, expected_string_sub, id="test_convert_html_subsripts_to_latex_sub"
+            sub, expected_string_sub, id="test_convert_html_subscripts_to_latex_sub"
         ),
         pytest.param(
-            inf, expected_string_inf, id="test_convert_html_subsripts_to_latex_inf"
+            inf, expected_string_inf, id="test_convert_html_subscripts_to_latex_inf"
         ),
         pytest.param(
             sup_extended,
             expected_string_sup_extended,
-            id="test_convert_html_subsripts_to_latex_sup_extended",
+            id="test_convert_html_subscripts_to_latex_sup_extended",
         ),
         pytest.param(
             sub_extended,
             expected_string_sub_extended,
-            id="test_convert_html_subsripts_to_latex_sub_extended",
+            id="test_convert_html_subscripts_to_latex_sub_extended",
         ),
         pytest.param(
             inf_extended,
             expected_string_inf_extended,
-            id="test_convert_html_subsripts_to_latex_inf_extended",
+            id="test_convert_html_subscripts_to_latex_inf_extended",
         ),
         pytest.param(
-            no_tags, no_tags, id="test_convert_html_subsripts_to_latex_no_tags"
+            no_tags, no_tags, id="test_convert_html_subscripts_to_latex_no_tags"
         ),
     ],
 )
-def test_convert_html_subsripts_to_latex(test_input, expected):
-    assert convert_html_subsripts_to_latex(test_input) == expected
+def test_convert_html_subscripts_to_latex(test_input, expected):
+    assert convert_html_subscripts_to_latex(test_input) == expected
 
 
 xml = "<div><p>example<h1> h1 example</h1></p></div>"