Skip to content

Commit

Permalink
Add XML tag detection function. (#139)
Browse files Browse the repository at this point in the history
* Add XML tag detection function

* Add detection of XML tags in text.py

* Add clarification in has_xml_tag function

* Update copyright year in text.py
  • Loading branch information
PhilipMay authored Jan 5, 2024
1 parent 7b88367 commit 1f6c9ac
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 1 deletion.
25 changes: 24 additions & 1 deletion mltb2/text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023 Philip May
# Copyright (c) 2023-2024 Philip May
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

Expand Down Expand Up @@ -54,6 +54,29 @@

MULTI_SPACE_PATTERN: Pattern = re.compile(r" {2,}")

# Matches a bare XML tag: "<", optional "/", a tag name, an optional " /" or
# "/" (self-closing), then ">".
# The name starts with a word character or ":" and may continue with word
# characters, ":", "." or "-" ("-" and "." are legal XML name characters, so
# ``<my-tag>`` and ``<a.b>`` must be recognised).
# Whitespace directly after "<" never matches, which keeps plain comparisons
# such as ``a < b but x > y`` from being reported as tags.
XML_TAG_PATTERN: Pattern = re.compile(r"</?[\w:][\w:.\-]*(?: /|/)?>")


def has_xml_tag(text: str) -> bool:
    """Check if text contains XML tags (one or multiple).

    These are some XML tags we detect:

    - ``<xml_tag>``
    - ``<xml:tag>``
    - ``<xml-tag>``
    - ``<xml.tag>``
    - ``</xml_tag>``
    - ``<xml_tag/>``
    - ``<xml_tag />``

    While we do not detect ``a < b but x > y``.
    Tags that carry attributes (``<tag attr="value">``) are not detected
    either, because only the tag name and an optional self-closing slash are
    allowed between the angle brackets.

    Args:
        text: The text to check.

    Returns:
        ``True`` if the text contains XML tags, ``False`` otherwise.
    """
    return XML_TAG_PATTERN.search(text) is not None


def remove_invisible_characters(text: str) -> str:
"""Remove invisible characters from text.
Expand Down
36 changes: 36 additions & 0 deletions tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
clean_all_invisible_chars_and_whitespaces,
has_invisible_characters,
has_special_whitespaces,
has_xml_tag,
remove_invisible_characters,
replace_multiple_whitespaces,
replace_special_whitespaces,
Expand Down Expand Up @@ -229,3 +230,38 @@ def test_normalize_counter_to_defaultdict_empty_counter():

assert isinstance(normalized_counter, defaultdict)
assert len(normalized_counter) == 0


@pytest.mark.parametrize(
    "text",
    (
        "Some text<ta_g>more text",
        "Some text<ta:g>more text",
        "Some text</tag>more text",
        "Some text<tag/>more text",
        "Some text<tag />more text",
    ),
)
def test_has_xml_tag_with_tags(text: str):
    """Each sample embeds one supported tag form and must be flagged."""
    tag_found = has_xml_tag(text)
    assert tag_found


@pytest.mark.parametrize(
    "text",
    (
        "Some text",
        "",
        "a < b but x > y",
    ),
)
def test_has_xml_tag_without_tags(text: str):
    """Plain text, the empty string and bare comparisons must not be flagged."""
    tag_found = has_xml_tag(text)
    assert not tag_found


@settings(max_examples=1000)
@given(text())
def test_has_xml_tag_hypothesis(text: str):
    """Property check: a positive result implies both angle brackets occur."""
    if not has_xml_tag(text):
        # Nothing to verify for inputs reported as tag-free.
        return
    assert "<" in text
    assert ">" in text

0 comments on commit 1f6c9ac

Please sign in to comment.