From 436800f49ab0f423811d9b08ef16ccd884c6bee9 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sun, 10 Dec 2023 20:29:11 +0100 Subject: [PATCH 1/2] Add remove_multiple_whitespaces function to remove multiple whitespaces from text --- mltb2/text.py | 18 +++++++++++++++++- tests/test_text.py | 25 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/mltb2/text.py b/mltb2/text.py index ed61399..2caa681 100644 --- a/mltb2/text.py +++ b/mltb2/text.py @@ -4,7 +4,8 @@ """Text specific module.""" -from typing import Dict, Final, Tuple +import re +from typing import Dict, Final, Pattern, Tuple INVISIBLE_CHARACTERS: Final[Tuple[str, ...]] = ( "\u200b", # Zero Width Space (ZWSP) https://www.compart.com/en/unicode/U+200b @@ -37,6 +38,9 @@ SPECIAL_WHITESPACES_TRANS: Final[Dict[int, str]] = str.maketrans({char: " " for char in SPECIAL_WHITESPACES}) +MULTI_SPACE_PATTERN: Pattern = re.compile(r" {2,}") + + def remove_invisible_characters(text: str) -> str: """Remove invisible characters from text. @@ -91,3 +95,15 @@ def has_special_whitespaces(text: str) -> bool: ``True`` if the text contains special whitespaces, ``False`` otherwise. """ return any(char in text for char in SPECIAL_WHITESPACES) + + +def remove_multiple_whitespaces(text: str) -> str: + """Remove multiple whitespaces from text. + + Args: + text: The text from which the multiple whitespaces are to be removed. + + Returns: + The cleaned text. + """ + return MULTI_SPACE_PATTERN.sub(" ", text) diff --git a/tests/test_text.py b/tests/test_text.py index bd56674..e11630c 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -10,6 +10,7 @@ has_invisible_characters, has_special_whitespaces, remove_invisible_characters, + remove_multiple_whitespaces, replace_special_whitespaces, ) @@ -74,3 +75,27 @@ def test_has_special_whitespaces_false(): text = "Hello you!" result = has_special_whitespaces(text) assert not result + + +def test_remove_multiple_whitespaces(): + text = "Hello World !" 
+ result = remove_multiple_whitespaces(text) + assert result == "Hello World !" + + +def test_remove_multiple_whitespaces_empty(): + text = "" + result = remove_multiple_whitespaces(text) + assert result == "" + + +def test_remove_multiple_whitespaces_empty_result(): + text = " " + result = remove_multiple_whitespaces(text) + assert result == " " + + +def test_remove_multiple_whitespaces_one_space(): + text = " " + result = remove_multiple_whitespaces(text) + assert result == " " From de0ed6f2d3d77f8192114252b1bad05267452aec Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sun, 10 Dec 2023 21:03:56 +0100 Subject: [PATCH 2/2] add clean_all_invisible_chars_and_whitespaces --- mltb2/text.py | 38 +++++++++++++++++++++++++++++++------- tests/test_text.py | 31 ++++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/mltb2/text.py b/mltb2/text.py index 2caa681..2138347 100644 --- a/mltb2/text.py +++ b/mltb2/text.py @@ -37,6 +37,7 @@ SPECIAL_WHITESPACES_TRANS: Final[Dict[int, str]] = str.maketrans({char: " " for char in SPECIAL_WHITESPACES}) +INVISIBLE_CHARACTERS_AND_SPECIAL_WHITESPACES_TRANS = {**SPECIAL_WHITESPACES_TRANS, **INVISIBLE_CHARACTERS_TRANS} MULTI_SPACE_PATTERN: Pattern = re.compile(r" {2,}") @@ -44,7 +45,7 @@ def remove_invisible_characters(text: str) -> str: """Remove invisible characters from text. - The invisible characters are defined in the constant `INVISIBLE_CHARACTERS`. + The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``. Args: text: The text from which the invisible characters are to be removed. @@ -58,7 +59,7 @@ def remove_invisible_characters(text: str) -> str: def has_invisible_characters(text: str) -> bool: """Check if text contains invisible characters. - The invisible characters are defined in the constant `INVISIBLE_CHARACTERS`. + The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``. Args: text: The text to check. 
@@ -72,7 +73,7 @@ def has_invisible_characters(text: str) -> bool: def replace_special_whitespaces(text: str) -> str: """Replace special whitespaces with normal whitespaces. - The special whitespaces are defined in the constant `SPECIAL_WHITESPACES`. + The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``. Args: text: The text from which the special whitespaces are to be replaced. @@ -86,7 +87,7 @@ def replace_special_whitespaces(text: str) -> str: def has_special_whitespaces(text: str) -> bool: """Check if text contains special whitespaces. - The special whitespaces are defined in the constant `SPECIAL_WHITESPACES`. + The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``. Args: text: The text to check. @@ -97,13 +98,36 @@ def has_special_whitespaces(text: str) -> bool: return any(char in text for char in SPECIAL_WHITESPACES) -def remove_multiple_whitespaces(text: str) -> str: - """Remove multiple whitespaces from text. +def replace_multiple_whitespaces(text: str) -> str: + """Replace multiple whitespaces with single whitespace. Args: - text: The text from which the multiple whitespaces are to be removed. + text: The text from which the multiple whitespaces are to be replaced. Returns: The cleaned text. """ return MULTI_SPACE_PATTERN.sub(" ", text) + + +def clean_all_invisible_chars_and_whitespaces(text: str) -> str: + """Clean text from invisible characters and whitespaces. + + - Remove invisible characters from text. + - Replace special whitespaces with normal whitespaces. + - Replace multiple whitespaces with single whitespace. + - Remove leading and trailing whitespaces. + + The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``. + The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``. + + Args: + text: The text to clean. + + Returns: + The cleaned text. 
+ """ + text = text.translate(INVISIBLE_CHARACTERS_AND_SPECIAL_WHITESPACES_TRANS) + text = replace_multiple_whitespaces(text) + text = text.strip() + return text diff --git a/tests/test_text.py b/tests/test_text.py index e11630c..708e987 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -7,10 +7,11 @@ from mltb2.text import ( INVISIBLE_CHARACTERS, SPECIAL_WHITESPACES, + clean_all_invisible_chars_and_whitespaces, has_invisible_characters, has_special_whitespaces, remove_invisible_characters, - remove_multiple_whitespaces, + replace_multiple_whitespaces, replace_special_whitespaces, ) @@ -77,25 +78,37 @@ def test_has_special_whitespaces_false(): assert not result -def test_remove_multiple_whitespaces(): +def test_replace_multiple_whitespaces(): text = "Hello World !" - result = remove_multiple_whitespaces(text) + result = replace_multiple_whitespaces(text) assert result == "Hello World !" -def test_remove_multiple_whitespaces_empty(): +def test_replace_multiple_whitespaces_empty(): text = "" - result = remove_multiple_whitespaces(text) + result = replace_multiple_whitespaces(text) assert result == "" -def test_remove_multiple_whitespaces_empty_result(): +def test_replace_multiple_whitespaces_empty_result(): text = " " - result = remove_multiple_whitespaces(text) + result = replace_multiple_whitespaces(text) assert result == " " -def test_remove_multiple_whitespaces_one_space(): +def test_replace_multiple_whitespaces_one_space(): text = " " - result = remove_multiple_whitespaces(text) + result = replace_multiple_whitespaces(text) assert result == " " + + +def test_clean_all_invisible_chars_and_whitespaces(): + text = " Hello\u200bWorld\u00ad! How\u2007 are you? " + result = clean_all_invisible_chars_and_whitespaces(text) + assert result == "HelloWorld! How are you?" + + +def test_clean_all_invisible_chars_and_whitespaces_empty_result(): + text = " \u200b\u00ad\u2007 " + result = clean_all_invisible_chars_and_whitespaces(text) + assert result == ""