cleaned up regex function by introducing constants for patterns #143

Draft
wants to merge 20 commits into base: master
Changes from 17 commits
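The idea of the change, shown in miniature below, is to hoist regex literals that were previously written inline in each function into named module-level constants. This sketch is illustrative only (the function names here are made up, and regex=True is passed explicitly for newer pandas versions); the actual constants and call sites follow in the diff.

import pandas as pd

# Before: the pattern is buried inside the function body.
def replace_digits_inline(s: pd.Series, symbol: str = " ") -> pd.Series:
    return s.str.replace(r"\b\d+\b", symbol, regex=True)

# After: the pattern is a named, documented module-level constant that every
# caller (and every test) can reference.
DIGITS_BLOCK = r"\b\d+\b"

def replace_digits_with_constant(s: pd.Series, symbol: str = " ") -> pd.Series:
    return s.str.replace(DIGITS_BLOCK, symbol, regex=True)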
40 changes: 40 additions & 0 deletions tests/test_preprocessing.py
@@ -114,6 +114,46 @@ def test_pipeline_stopwords(self):
pipeline = [preprocessing.lowercase, preprocessing.remove_stopwords]
self.assertEqual(preprocessing.clean(s, pipeline=pipeline), s_true)

"""
Test clean
"""

def _get_default_clean_pipeline(self):
"""
Return a list containing all the methods used in the default cleaning pipeline.

The returned list holds the following functions:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
"""

return [
preprocessing.fillna,
preprocessing.lowercase,
preprocessing.remove_digits,
preprocessing.remove_punctuation,
preprocessing.remove_diacritics,
preprocessing.remove_stopwords,
preprocessing.remove_whitespace,
]

def test_clean(self):
s = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
s_true = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)
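
For reference, a standalone sketch of what this test asserts (not part of the PR): folding the seven default steps over a Series by hand should match preprocessing.clean called without a pipeline argument.

import numpy as np
import pandas as pd
from texthero import preprocessing

s = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n", np.nan])

manual = s
for step in [
    preprocessing.fillna,
    preprocessing.lowercase,
    preprocessing.remove_digits,
    preprocessing.remove_punctuation,
    preprocessing.remove_diacritics,
    preprocessing.remove_stopwords,
    preprocessing.remove_whitespace,
]:
    manual = step(manual)

# Should not raise: the default pipeline and the manual fold agree.
pd.testing.assert_series_equal(manual, preprocessing.clean(s))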

"""
Test stopwords.
"""
81 changes: 46 additions & 35 deletions texthero/preprocessing.py
@@ -11,14 +11,44 @@
import numpy as np
import pandas as pd
import unidecode

# Ignore gensim annoying warnings
import warnings
from nltk.stem import PorterStemmer, SnowballStemmer

from texthero import stopwords as _stopwords

from typing import List, Callable

# Ignore gensim annoying warnings
import warnings
"""
Define all regex pattern, which will be used in the functions below. They define different charateristics, on how to clean
henrifroese marked this conversation as resolved.
Show resolved Hide resolved
a text
"""

DIGITS_BLOCK = r"\b\d+\b"
PUNCTUATION = rf"([{string.punctuation}])+"
STOPWORD_TOKENIZER = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""
ROUND_BRACKETS = r"\([^()]*\)"
CURLY_BRACKETS = r"\{[^{}]*\}"
SQUARE_BRACKETS = r"\[[^\[\]]*\]"
ANGLE_BRACKETS = r"<[^<>]*>"
HTML_TAGS = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""
URLS = r"http\S+"
TAGS = r"@[a-zA-Z0-9]+"
HASHTAGS = r"#[a-zA-Z0-9_]+"

# In regex, the metacharacter \w matches "a-z, A-Z, 0-9, and the _ (underscore) character".
# We therefore remove the underscore from the punctuation string, as it is already covered by \w.
punct = string.punctuation.replace("_", "")
TOKENIZE = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"  # The standard tokenisation separates all "regex words" (\w)
# from each other and puts each punctuation character in its own token


warnings.filterwarnings(action="ignore", category=UserWarning, module="gensim")
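
Because the patterns now live at module level, they can be exercised directly with the re module, independent of the Series helpers. A quick illustrative check (not part of the PR; the sample string is made up), assuming the constants are importable from texthero.preprocessing as defined above:

import re

from texthero.preprocessing import DIGITS_BLOCK, HASHTAGS, URLS

text = "Release 2 of texthero is out: https://texthero.org #nlp #python"

# DIGITS_BLOCK only matches digits forming a whole token ("2" here), not digits
# glued to letters such as the "2" in "v2".
print(re.sub(DIGITS_BLOCK, "", text))

# URLS matches "http" followed by any run of non-whitespace characters.
print(re.findall(URLS, text))       # ['https://texthero.org']

# HASHTAGS matches '#' followed by letters, digits or underscores.
print(re.findall(HASHTAGS, text))   # ['#nlp', '#python']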

@@ -91,8 +121,7 @@ def replace_digits(s: pd.Series, symbols: str = " ", only_blocks=True) -> pd.Ser
"""

if only_blocks:
pattern = r"\b\d+\b"
return s.str.replace(pattern, symbols)
return s.str.replace(DIGITS_BLOCK, symbols)
else:
return s.str.replace(r"\d+", symbols)

@@ -157,7 +186,7 @@ def replace_punctuation(s: pd.Series, symbol: str = " ") -> pd.Series:
dtype: object
"""

return s.str.replace(rf"([{string.punctuation}])+", symbol)
return s.str.replace(PUNCTUATION, symbol)


def remove_punctuation(s: pd.Series) -> pd.Series:
@@ -266,13 +295,9 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str:

"""

pattern = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""

return "".join(t if t not in words else symbol for t in re.findall(pattern, text))
return "".join(
t if t not in words else symbol for t in re.findall(STOPWORD_TOKENIZER, text)
)
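
STOPWORD_TOKENIZER splits a string into word, whitespace and symbol tokens, which is what lets the replacement above drop stopwords while keeping the original spacing and punctuation. A small illustration (not part of the PR; the stopword set here is made up):

import re

from texthero.preprocessing import STOPWORD_TOKENIZER

stopwords = {"not"}
text = "state-of-the-art, not bad!"

# Tokens are words (with optional internal hyphens), whitespace runs and
# single symbols; stopword tokens are dropped, everything else is kept as-is.
cleaned = "".join(
    t if t not in stopwords else "" for t in re.findall(STOPWORD_TOKENIZER, text)
)
print(cleaned)  # the word "not" is removed, commas and spaces survive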


def replace_stopwords(
@@ -525,7 +550,7 @@ def remove_round_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"\([^()]*\)", "")
return s.str.replace(ROUND_BRACKETS, "")


def remove_curly_brackets(s: pd.Series) -> pd.Series:
@@ -549,7 +574,7 @@ def remove_curly_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"\{[^{}]*\}", "")
return s.str.replace(CURLY_BRACKETS, "")


def remove_square_brackets(s: pd.Series) -> pd.Series:
@@ -574,7 +599,7 @@ def remove_square_brackets(s: pd.Series) -> pd.Series:


"""
return s.str.replace(r"\[[^\[\]]*\]", "")
return s.str.replace(SQUARE_BRACKETS, "")


def remove_angle_brackets(s: pd.Series) -> pd.Series:
@@ -598,7 +623,7 @@ def remove_angle_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"<[^<>]*>", "")
return s.str.replace(ANGLE_BRACKETS, "")


def remove_brackets(s: pd.Series) -> pd.Series:
@@ -651,12 +676,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series:

"""

pattern = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""

return s.str.replace(pattern, "")
return s.str.replace(HTML_TAGS, "")


def tokenize(s: pd.Series) -> pd.Series:
@@ -680,12 +700,7 @@ def tokenize(s: pd.Series) -> pd.Series:

"""

punct = string.punctuation.replace("_", "")
# In regex, the metacharacter 'w' is "a-z, A-Z, 0-9, including the _ (underscore) character." We therefore remove it from the punctuation string as this is already included in \w

pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"

return s.str.replace(pattern, r"\2 \3 \4 \5").str.split()
return s.str.replace(TOKENIZE, r"\2 \3 \4 \5").str.split()
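
The TOKENIZE pattern only pads punctuation that sits at a word edge (that is what the \B checks are for), so sentence punctuation becomes its own token while intra-word hyphens and apostrophes stay attached. A short illustration (the expected output is indicative, not copied from the PR's tests):

import pandas as pd

from texthero import preprocessing

s = pd.Series(["Hello, world! A state-of-the-art example."])

print(preprocessing.tokenize(s))
# 0    [Hello, ,, world, !, A, state-of-the-art, example, .]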


# Warning message for not-tokenized inputs
@@ -775,9 +790,7 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series:

"""

pattern = r"http\S+"

return s.str.replace(pattern, symbol)
return s.str.replace(URLS, symbol)


def remove_urls(s: pd.Series) -> pd.Series:
@@ -826,8 +839,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series:

"""

pattern = r"@[a-zA-Z0-9]+"
return s.str.replace(pattern, symbol)
return s.str.replace(TAGS, symbol)


def remove_tags(s: pd.Series) -> pd.Series:
@@ -873,8 +885,7 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series:
dtype: object

"""
pattern = r"#[a-zA-Z0-9_]+"
return s.str.replace(pattern, symbol)
return s.str.replace(HASHTAGS, symbol)


def remove_hashtags(s: pd.Series) -> pd.Series: