Skip to content

Commit

Permalink
springer: fix xml math expressions
Browse files Browse the repository at this point in the history
Signed-off-by: pamfilos <[email protected]>
  • Loading branch information
pamfilos committed Nov 27, 2024
1 parent 4331dbe commit f738fc6
Show file tree
Hide file tree
Showing 4 changed files with 350 additions and 2 deletions.
21 changes: 21 additions & 0 deletions dags/common/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,27 @@ def convert_html_subscripts_to_latex(input):
input = re.sub("<sup>(.*?)</sup>", r"$^{\1}$", input)
return input

def clean_inline_expressions(input):
    """Unwrap inline-equation markup, keeping only the TeX source.

    Args:
        input: Text that may contain ``<InlineEquation>`` elements holding
            ``<EquationSource>`` children in TEX and/or MATHML format.

    Returns:
        The text with every ``<InlineEquation>`` wrapper removed, every TEX
        ``<EquationSource>`` replaced by the content of its CDATA section,
        every MATHML ``<EquationSource>`` deleted, and all newline and
        carriage-return characters removed.
    """
    # Drop the <InlineEquation ...> wrapper but keep everything inside it.
    input = re.sub(
        r"<InlineEquation.*?>(.*?)</InlineEquation>",
        r"\1",
        input,
        flags=re.DOTALL,
    )
    # Keep only the CDATA payload of the TeX source. DOTALL is needed because
    # line breaks are stripped only at the very end of this function, so a
    # TeX expression spanning several lines must still match at this point.
    # NOTE(review): the payload is inserted verbatim; TeX containing "<" or
    # "&" would presumably no longer be well-formed XML - confirm whether
    # callers that parse the result need it escaped.
    input = re.sub(
        r"<EquationSource Format=\"TEX\"><!\[CDATA\[(.*?)\]\]></EquationSource>",
        r"\1",
        input,
        flags=re.DOTALL,
    )
    # The MathML rendering duplicates the TeX one, so remove it entirely.
    input = re.sub(
        r"<EquationSource Format=\"MATHML\">.*?</EquationSource>",
        "",
        input,
        flags=re.DOTALL,
    )
    return input.replace('\n', '').replace('\r', '')

def convert_html_italics_to_latex(input):
input = re.sub(r"<italic\b[^>]*>(.*?)</italic>", r"$\\textit{\1}$", input)
Expand Down
18 changes: 18 additions & 0 deletions dags/springer/springer_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
from common.exceptions import EmptyOutputFromPreviousTask
from common.scoap3_s3 import Scoap3Repository
from common.utils import create_or_update_article, upload_json_to_s3
from common.cleanup import (
replace_cdata_format,
convert_html_subscripts_to_latex,
convert_html_italics_to_latex,
clean_whitespace_characters,
clean_inline_expressions,
)
from inspire_utils.record import get_value
from jsonschema import validate
from springer.parser import SpringerParser
Expand All @@ -17,12 +24,23 @@

logger = get_logger()

def process_xml(input):
    """Apply the XML clean-up helpers to ``input`` and return the result.

    The helpers run in a fixed order: subscript/superscript conversion,
    italics conversion, CDATA format replacement and inline-expression
    clean-up. Afterwards newline and carriage-return characters are removed,
    surrounding whitespace is stripped, and the text is handed to
    ``clean_whitespace_characters``.

    Args:
        input: The XML document as text.

    Returns:
        Whatever ``clean_whitespace_characters`` returns for the cleaned text.
    """
    cleanup_steps = (
        convert_html_subscripts_to_latex,
        convert_html_italics_to_latex,
        replace_cdata_format,
        clean_inline_expressions,
    )
    xml_text = input
    for step in cleanup_steps:
        xml_text = step(xml_text)

    # Remove line breaks first, then trim both ends in a single pass.
    without_line_breaks = xml_text.replace('\n', '').replace('\r', '')
    return clean_whitespace_characters(without_line_breaks.strip())

def springer_parse_file(**kwargs):
if "params" in kwargs and "file" in kwargs["params"]:
encoded_xml = kwargs["params"]["file"]
file_name = kwargs["params"]["file_name"]
xml_bytes = base64.b64decode(encoded_xml)
if isinstance(xml_bytes, bytes):
xml_bytes = xml_bytes.decode('utf-8')
xml_bytes = process_xml(xml_bytes)
xml = ET.fromstring(xml_bytes.decode("utf-8"))

parser = SpringerParser(file_name)
Expand Down
20 changes: 18 additions & 2 deletions tests/units/springer/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from pytest import fixture
from springer.parser import SpringerParser
from springer.springer_process_file import process_xml
from common.enhancer import Enhancer


Expand All @@ -17,8 +18,8 @@ def articles(datadir):
articles = []
for filename in sorted(listdir(datadir)):
with open(datadir / filename) as file:
articles.append(ET.fromstring(file.read()))

xml = process_xml(file.read())
articles.append(ET.fromstring(xml))
return articles


Expand All @@ -27,6 +28,21 @@ def parsed_articles(parser, articles):
return [parser._publisher_specific_parsing(article) for article in articles]


def test_weird_titles(parsed_articles):
    """Check that the parsed titles, once sorted, equal the expected titles.

    Several of the expected titles embed TeX expressions delimited by ``$$``.
    """
    expected_results = [
        " $$(g-2)_{e,\\mu }$$ anomalies and decays $$h\\rightarrow e_a e_b$$ , "
        "$$Z\\rightarrow e_ae_b$$ , and $$e_b\\rightarrow e_a \\gamma $$ in a two "
        "Higgs doublet model with inverse seesaw neutrinos",
        " $$\\Lambda $$ polarization in very high energy heavy ion collisions as a probe of the quark–gluon plasma formation and properties",
        "A strategy for a general search for new phenomena using data-derived signal regions and its application within the ATLAS experiment",
        "Revisiting the mechanical properties of the nucleon",
        "Symmetry breaking in quantum curves and super Chern-Simons matrix models",
    ]
    # Order is irrelevant to the check, so both sides are sorted before comparing.
    parsed_titles = sorted(article.get("title") for article in parsed_articles)
    expected_results.sort()

    assert expected_results == parsed_titles


def test_authors(parsed_articles):
expected_results = (
[
Expand Down
Loading

0 comments on commit f738fc6

Please sign in to comment.