Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add whitespace cleaner. #110

Merged
merged 2 commits into from
Dec 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 45 additions & 5 deletions mltb2/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

"""Text specific module."""

from typing import Dict, Final, Tuple
import re
from typing import Dict, Final, Pattern, Tuple

INVISIBLE_CHARACTERS: Final[Tuple[str, ...]] = (
"\u200b", # Zero Width Space (ZWSP) https://www.compart.com/en/unicode/U+200b
Expand Down Expand Up @@ -36,11 +37,15 @@

# Translation table for ``str.translate`` that maps every character listed in
# ``SPECIAL_WHITESPACES`` to a normal space.
SPECIAL_WHITESPACES_TRANS: Final[Dict[int, str]] = str.maketrans({char: " " for char in SPECIAL_WHITESPACES})

# Merged table so that both clean-ups can be applied in one ``str.translate`` pass.
# On duplicate keys the later mapping wins, i.e. ``INVISIBLE_CHARACTERS_TRANS`` takes precedence.
INVISIBLE_CHARACTERS_AND_SPECIAL_WHITESPACES_TRANS = {**SPECIAL_WHITESPACES_TRANS, **INVISIBLE_CHARACTERS_TRANS}

# Matches a run of two or more consecutive space characters.
MULTI_SPACE_PATTERN: Pattern = re.compile(r" {2,}")


def remove_invisible_characters(text: str) -> str:
"""Remove invisible characters from text.

The invisible characters are defined in the constant `INVISIBLE_CHARACTERS`.
The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``.

Args:
text: The text from which the invisible characters are to be removed.
Expand All @@ -54,7 +59,7 @@ def remove_invisible_characters(text: str) -> str:
def has_invisible_characters(text: str) -> bool:
"""Check if text contains invisible characters.

The invisible characters are defined in the constant `INVISIBLE_CHARACTERS`.
The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``.

Args:
text: The text to check.
Expand All @@ -68,7 +73,7 @@ def has_invisible_characters(text: str) -> bool:
def replace_special_whitespaces(text: str) -> str:
"""Replace special whitespaces with normal whitespaces.

The special whitespaces are defined in the constant `SPECIAL_WHITESPACES`.
The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``.

Args:
text: The text from which the special whitespaces are to be replaced.
Expand All @@ -82,7 +87,7 @@ def replace_special_whitespaces(text: str) -> str:
def has_special_whitespaces(text: str) -> bool:
"""Check if text contains special whitespaces.

The special whitespaces are defined in the constant `SPECIAL_WHITESPACES`.
The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``.

Args:
text: The text to check.
Expand All @@ -91,3 +96,38 @@ def has_special_whitespaces(text: str) -> bool:
``True`` if the text contains special whitespaces, ``False`` otherwise.
"""
return any(char in text for char in SPECIAL_WHITESPACES)


def replace_multiple_whitespaces(text: str) -> str:
    """Collapse each run of multiple spaces into a single space.

    Runs are detected with the module-level ``MULTI_SPACE_PATTERN``.

    Args:
        text: The text in which runs of spaces are to be collapsed.

    Returns:
        The text with every matched run replaced by one space.
    """
    collapsed = MULTI_SPACE_PATTERN.sub(" ", text)
    return collapsed


def clean_all_invisible_chars_and_whitespaces(text: str) -> str:
    """Clean text from invisible characters and whitespaces.

    - Remove invisible characters from text.
    - Replace special whitespaces with normal whitespaces.
    - Replace multiple whitespaces with single whitespace.
    - Remove leading and trailing whitespaces.

    The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``.
    The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    # A single translate pass applies the merged table for invisible
    # characters and special whitespaces.
    text = text.translate(INVISIBLE_CHARACTERS_AND_SPECIAL_WHITESPACES_TRANS)
    # Collapse only after translating: replaced special whitespaces and removed
    # invisible characters can create new runs of adjacent spaces.
    text = replace_multiple_whitespaces(text)
    return text.strip()
38 changes: 38 additions & 0 deletions tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
from mltb2.text import (
INVISIBLE_CHARACTERS,
SPECIAL_WHITESPACES,
clean_all_invisible_chars_and_whitespaces,
has_invisible_characters,
has_special_whitespaces,
remove_invisible_characters,
replace_multiple_whitespaces,
replace_special_whitespaces,
)

Expand Down Expand Up @@ -74,3 +76,39 @@ def test_has_special_whitespaces_false():
text = "Hello you!"
result = has_special_whitespaces(text)
assert not result


def test_replace_multiple_whitespaces():
    """Check ``replace_multiple_whitespaces`` on a short sentence."""
    sentence = "Hello World !"
    expected = "Hello World !"
    assert replace_multiple_whitespaces(sentence) == expected


def test_replace_multiple_whitespaces_empty():
    """Check that an empty string is returned unchanged."""
    assert replace_multiple_whitespaces("") == ""


def test_replace_multiple_whitespaces_empty_result():
    """Check the result for a whitespace-only input."""
    blank = " "
    expected = " "
    assert replace_multiple_whitespaces(blank) == expected


def test_replace_multiple_whitespaces_one_space():
    """Check that a single space is left as it is."""
    single_space = " "
    assert replace_multiple_whitespaces(single_space) == " "


def test_clean_all_invisible_chars_and_whitespaces():
    """Check the full clean-up on text with invisible characters and a special whitespace."""
    dirty = " Hello\u200bWorld\u00ad! How\u2007 are you? "
    expected = "HelloWorld! How are you?"
    assert clean_all_invisible_chars_and_whitespaces(dirty) == expected


def test_clean_all_invisible_chars_and_whitespaces_empty_result():
    """Check that input made only of removable characters and whitespace cleans to an empty string."""
    dirty = " \u200b\u00ad\u2007 "
    assert clean_all_invisible_chars_and_whitespaces(dirty) == ""