Skip to content

Commit

Permalink
common: updates xml cleanup for html tags, fixes typo
Browse files (browse the repository at this point in the history)
Signed-off-by: pamfilos <[email protected]>
  • Loading branch information
pamfilos committed Aug 23, 2024
1 parent a2512d7 commit 80156a3
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 18 deletions.
2 changes: 1 addition & 1 deletion dags/common/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def clean_whitespace_characters(input):
return " ".join(input.split())


def convert_html_subsripts_to_latex(input):
def convert_html_subscripts_to_latex(input):
input = re.sub("<sub>(.*?)</sub>", r"$_{\1}$", input)
input = re.sub("<inf>(.*?)</inf>", r"$_{\1}$", input)
input = re.sub("<sup>(.*?)</sup>", r"$^{\1}$", input)
Expand Down
16 changes: 9 additions & 7 deletions dags/iop/iop_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from common.utils import create_or_update_article, upload_json_to_s3
from common.cleanup import (
replace_cdata_format,
convert_html_subsripts_to_latex,
convert_html_subscripts_to_latex,
convert_html_italics_to_latex,
)
from inspire_utils.record import get_value
Expand All @@ -20,20 +20,22 @@

logger = get_logger()

def process_xml(xml_file):
xml_file = convert_html_subsripts_to_latex(xml_file)
xml_file = convert_html_italics_to_latex(xml_file)
xml_file = replace_cdata_format(xml_file)
return xml_file
def process_xml(input):
input = convert_html_subscripts_to_latex(input)
input = convert_html_italics_to_latex(input)
input = replace_cdata_format(input)
return input

def iop_parse_file(**kwargs):
if "params" not in kwargs or "file" not in kwargs["params"]:
raise Exception("There was no 'file' parameter. Exiting run.")
encoded_xml = kwargs["params"]["file"]
file_name = kwargs["params"]["file_name"]
xml_bytes = base64.b64decode(encoded_xml)
if isinstance(xml_bytes, bytes):
xml_bytes = xml_bytes.decode('utf-8')
xml_bytes = process_xml(xml_bytes)
xml = ET.fromstring(xml_bytes.decode("utf-8"))
xml = ET.fromstring(xml_bytes)

parser = IOPParser(file_path=file_name)
parsed = parser.parse(xml)
Expand Down
20 changes: 10 additions & 10 deletions tests/units/common/test_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
clean_all_affiliations_for_author,
clean_collaboration,
clean_whitespace_characters,
convert_html_subsripts_to_latex,
convert_html_subscripts_to_latex,
remove_orcid_prefix,
remove_specific_tags,
remove_unnecessary_fields,
Expand Down Expand Up @@ -64,36 +64,36 @@ def test_clean_whitespace_characters(test_input, expected):
"test_input, expected",
[
pytest.param(
sup, expected_string_sup, id="test_convert_html_subsripts_to_latex_sup"
sup, expected_string_sup, id="test_convert_html_subscripts_to_latex_sup"
),
pytest.param(
sub, expected_string_sub, id="test_convert_html_subsripts_to_latex_sub"
sub, expected_string_sub, id="test_convert_html_subscripts_to_latex_sub"
),
pytest.param(
inf, expected_string_inf, id="test_convert_html_subsripts_to_latex_inf"
inf, expected_string_inf, id="test_convert_html_subscripts_to_latex_inf"
),
pytest.param(
sup_extended,
expected_string_sup_extended,
id="test_convert_html_subsripts_to_latex_sup_extended",
id="test_convert_html_subscripts_to_latex_sup_extended",
),
pytest.param(
sub_extended,
expected_string_sub_extended,
id="test_convert_html_subsripts_to_latex_sub_extended",
id="test_convert_html_subscripts_to_latex_sub_extended",
),
pytest.param(
inf_extended,
expected_string_inf_extended,
id="test_convert_html_subsripts_to_latex_inf_extended",
id="test_convert_html_subscripts_to_latex_inf_extended",
),
pytest.param(
no_tags, no_tags, id="test_convert_html_subsripts_to_latex_no_tags"
no_tags, no_tags, id="test_convert_html_subscripts_to_latex_no_tags"
),
],
)
def test_convert_html_subsripts_to_latex(test_input, expected):
assert convert_html_subsripts_to_latex(test_input) == expected
def test_convert_html_subscripts_to_latex(test_input, expected):
assert convert_html_subscripts_to_latex(test_input) == expected


xml = "<div><p>example<h1> h1 example</h1></p></div>"
Expand Down

0 comments on commit 80156a3

Please sign in to comment.