diff --git a/mltb2/text.py b/mltb2/text.py
index d998c4e..8eae3d4 100644
--- a/mltb2/text.py
+++ b/mltb2/text.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 Philip May
+# Copyright (c) 2023-2024 Philip May
 # This software is distributed under the terms of the MIT license
 # which is available at https://opensource.org/licenses/MIT
 
@@ -54,6 +54,29 @@
 
 MULTI_SPACE_PATTERN: Pattern = re.compile(r" {2,}")
 
+XML_TAG_PATTERN: Pattern = re.compile(r"<\/?[\w:]+( \/|\/|)>")
+
+
+def has_xml_tag(text: str) -> bool:
+    """Check if text contains XML tags (one or multiple).
+
+    These are some XML tags we detect:
+
+    - ``<tag>``
+    - ``</tag>``
+    - ``<tag/>``
+    - ``<tag />``
+    - ``<ns:tag>``
+
+    While we do not detect ``a < b but x > y``.
+
+    Args:
+        text: The text to check.
+    Returns:
+        ``True`` if the text contains XML tags, ``False`` otherwise.
+    """
+    return re.search(XML_TAG_PATTERN, text) is not None
+
 
 def remove_invisible_characters(text: str) -> str:
     """Remove invisible characters from text.
diff --git a/tests/test_text.py b/tests/test_text.py
index 9c1d949..2d6551f 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -17,6 +17,7 @@
     clean_all_invisible_chars_and_whitespaces,
     has_invisible_characters,
     has_special_whitespaces,
+    has_xml_tag,
     remove_invisible_characters,
     replace_multiple_whitespaces,
     replace_special_whitespaces,
@@ -229,3 +230,38 @@ def test_normalize_counter_to_defaultdict_empty_counter():
 
     assert isinstance(normalized_counter, defaultdict)
     assert len(normalized_counter) == 0
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "Some text<tag>more text",
+        "Some text</tag>more text",
+        "Some text<tag/>more text",
+        "Some text<tag />more text",
+        "Some text<ns:tag>more text",
+    ],
+)
+def test_has_xml_tag_with_tags(text: str):
+    assert has_xml_tag(text)
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "Some text",
+        "",
+        "a < b but x > y",
+    ],
+)
+def test_has_xml_tag_without_tags(text: str):
+    assert not has_xml_tag(text)
+
+
+@settings(max_examples=1000)
+@given(text())
+def test_has_xml_tag_hypothesis(text: str):
+    result = has_xml_tag(text)
+    if result:
+        assert "<" in text
+        assert ">" in text