From f738fc62d61ad16e0f9cb4ea95fef84462ca4d51 Mon Sep 17 00:00:00 2001 From: pamfilos Date: Wed, 27 Nov 2024 16:23:20 +0100 Subject: [PATCH] springer: fix xml math expressions Signed-off-by: pamfilos --- dags/common/cleanup.py | 21 ++ dags/springer/springer_process_file.py | 18 ++ tests/units/springer/test_parser.py | 20 +- .../springer/test_parser/weird.title.Meta | 293 ++++++++++++++++++ 4 files changed, 350 insertions(+), 2 deletions(-) create mode 100644 tests/units/springer/test_parser/weird.title.Meta diff --git a/dags/common/cleanup.py b/dags/common/cleanup.py index a2b4eb8f..68568d90 100644 --- a/dags/common/cleanup.py +++ b/dags/common/cleanup.py @@ -15,6 +15,27 @@ def convert_html_subscripts_to_latex(input): input = re.sub("(.*?)", r"$^{\1}$", input) return input +def clean_inline_expressions(input): + input = re.sub( + r"(.*?)", + r"\1", + input, + flags=re.DOTALL + ) + input = re.sub( + r"", + r"\1", + input + ) + input = re.sub( + r".*?", + "", + input, + flags=re.DOTALL + ) + input = input.replace('\n', '').replace('\r', '') + + return input def convert_html_italics_to_latex(input): input = re.sub(r"]*>(.*?)", r"$\\textit{\1}$", input) diff --git a/dags/springer/springer_process_file.py b/dags/springer/springer_process_file.py index 83d82565..97f446df 100644 --- a/dags/springer/springer_process_file.py +++ b/dags/springer/springer_process_file.py @@ -9,6 +9,13 @@ from common.exceptions import EmptyOutputFromPreviousTask from common.scoap3_s3 import Scoap3Repository from common.utils import create_or_update_article, upload_json_to_s3 +from common.cleanup import ( + replace_cdata_format, + convert_html_subscripts_to_latex, + convert_html_italics_to_latex, + clean_whitespace_characters, + clean_inline_expressions, +) from inspire_utils.record import get_value from jsonschema import validate from springer.parser import SpringerParser @@ -17,12 +24,23 @@ logger = get_logger() +def process_xml(input): + input = convert_html_subscripts_to_latex(input) + input = convert_html_italics_to_latex(input) + input = replace_cdata_format(input) + input = clean_inline_expressions(input) + input = input.replace('\n', '').replace('\r', '').lstrip().rstrip() + input = clean_whitespace_characters(input.strip()) + return input def springer_parse_file(**kwargs): if "params" in kwargs and "file" in kwargs["params"]: encoded_xml = kwargs["params"]["file"] file_name = kwargs["params"]["file_name"] xml_bytes = base64.b64decode(encoded_xml) + if isinstance(xml_bytes, bytes): + xml_bytes = xml_bytes.decode('utf-8') + xml_bytes = process_xml(xml_bytes) xml = ET.fromstring(xml_bytes.decode("utf-8")) parser = SpringerParser(file_name) diff --git a/tests/units/springer/test_parser.py b/tests/units/springer/test_parser.py index 77146de0..ce096089 100644 --- a/tests/units/springer/test_parser.py +++ b/tests/units/springer/test_parser.py @@ -3,6 +3,7 @@ from pytest import fixture from springer.parser import SpringerParser +from springer.springer_process_file import process_xml from common.enhancer import Enhancer @@ -17,8 +18,8 @@ def articles(datadir): articles = [] for filename in sorted(listdir(datadir)): with open(datadir / filename) as file: - articles.append(ET.fromstring(file.read())) - + xml = process_xml(file.read()) + articles.append(ET.fromstring(xml)) return articles @@ -27,6 +28,21 @@ def parsed_articles(parser, articles): return [parser._publisher_specific_parsing(article) for article in articles] +def test_weird_titles(parsed_articles): + parsed_titles = sorted([a.get("title") for a in parsed_articles]) + expected_results = sorted([ + " $$(g-2)_{e,\\mu }$$ anomalies and decays $$h\\rightarrow e_a e_b$$ , " + "$$Z\\rightarrow e_ae_b$$ , and $$e_b\\rightarrow e_a \\gamma $$ in a two " + "Higgs doublet model with inverse seesaw neutrinos", + " $$\\Lambda $$ polarization in very high energy heavy ion collisions as a probe of the quark–gluon plasma formation and properties", + "A strategy for a general search for new phenomena using data-derived signal regions and its application within the ATLAS experiment", + "Revisiting the mechanical properties of the nucleon", + "Symmetry breaking in quantum curves and super Chern-Simons matrix models" + ]) + + assert expected_results == parsed_titles + + def test_authors(parsed_articles): expected_results = ( [ diff --git a/tests/units/springer/test_parser/weird.title.Meta b/tests/units/springer/test_parser/weird.title.Meta new file mode 100644 index 00000000..17511897 --- /dev/null +++ b/tests/units/springer/test_parser/weird.title.Meta @@ -0,0 +1,293 @@ + + + + + Springer Berlin Heidelberg + Berlin/Heidelberg + Springer + + + + 10052 + 10.1007/10052.1434-6052 + 1434-6052 + 30312819 + The European Physical Journal C + Particles and Fields + Eur. Phys. J. C + + Physics + Elementary Particles, Quantum Field Theory + Nuclear Physics, Heavy Ions, Hadrons + Quantum Field Theories, String Theory + Measurement Science and Instrumentation + Astronomy, Astrophysics and Cosmology + Nuclear Energy + Physics and Astronomy + + + + + 84 + 84 + 12 + + + + 9 + 9 + 114 + + + 2024 + 11 + 13 + + + 2024 + 9 + + 2024 + + + EDP Sciences, Societa Italiana di Fisica (SIF) and Springer-Verlag GmbH, DE, part of Springer Nature + 2024 + + +
+ + 13229 + 10.1140/epjc/s10052-024-13229-z + 920 + 42 + + + + + + Λ + + + polarization in very high energy heavy ion collisions as a probe of the quark–gluon plasma formation and properties + Regular Article –Theoretical Physics + 1 + 14 + + + 2024 + 8 + 14 + + + 2024 + 4 + 29 + + + 2024 + 8 + 10 + + + 2024 + 9 + 11 + + + + + Ministero dell’Università e della Ricerca + Advanced Probes of the Quark Gluon Plasma + + + Basic Energy Sciences + DE-FG88ER40388 + + + European Commission + NextGenerationEU + + + Grantová Agentura České Republiky + 22-25026S + + + + The Author(s) + 2024 + + + Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made. The images or other third party material in this article are included in the article’s Creative Commons licence, unless indicated otherwise in a credit line to the material. If material is not included in the article’s Creative Commons licence and your intended use is not permitted by statutory regulation or exceeds the permitted use, you will need to obtain permission directly from the copyright holder. To view a copy of this licence, visit + http://creativecommons.org/licenses/by/4.0/ + + . + Funded by SCOAP3. + + + + + + + + + + + + 10052 + 84 + 84 + 9 + 9 + + + + + + + Andrea + Palermo + + + andrea.palermo@stonybrook.edu + + + + + Eduardo + Grossi + + + + + Iurii + Karpenko + + + + + Francesco + Becattini + + + + https://ror.org/05qghxh33 + grid.36425.36 + 0000 0001 2216 9681 + Department of Physics and Astronomy, Center for Nuclear Theory + Stony Brook University + + Stony Brook + NY + 11794-3800 + USA + + + + grid.470204.5 + 0000 0001 2231 4148 + Università di Firenze and INFN Sezione di Firenze + + Via G. Sansone 1 + 50019 + Sesto Fiorentino + Florence + Italy + + + + https://ror.org/03kqpb082 + grid.6652.7 + 0000 0001 2173 8213 + Faculty of Nuclear Sciences and Physical Engineering + Czech Technical University in Prague + + Břehová 7 + 11519 + Prague 1 + Czech Republic + + + + + Abstract + We have studied the spin polarization of + + + + Λ + + + hyperons in heavy ion collisions at center-of-mass energies + + + + + + + s + NN + + + = + 200 + + + + GeV and + + + + + + + s + NN + + + = + 5.02 + + + + TeV carried out at RHIC and LHC colliders. We have calculated the mean spin vector at local thermodynamic equilibrium, including all known first-order terms in the gradients of the thermo-hydrodynamic fields, assuming that the hadronization hypersurface has a uniform temperature. We have also included the feed-down contributions to the polarization of + + + + Λ + + + stemming from the decays of polarized + + + + + Σ + + + + + and + + + + + Σ + 0 + + + + hyperons. The obtained results are in good agreement with the data. In general, the component of the spin vector along the global angular momentum, orthogonal to the reaction plane, shows strong sensitivity to the initial longitudinal flow velocity. Furthermore, the longitudinal component of the spin vector turns out to be very sensitive to the bulk viscosity of the plasma at the highest LHC energy. Therefore, the azimuthal dependence of spin polarization can effectively constrain the initial hydrodynamic conditions and the transport coefficients of the quark gluon plasma. + + + +
+
+
+
+
\ No newline at end of file