cleaned up regex function by introducing constants for patterns #143

Draft
wants to merge 20 commits into base: master
Changes from 17 commits
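The idea of the change, shown in miniature below, is to hoist regex literals that were previously written inline in each function into named module-level constants. This sketch is illustrative only (the function names here are made up, and regex=True is passed explicitly for newer pandas versions); the actual constants and call sites follow in the diff.

import pandas as pd

# Before: the pattern is buried inside the function body.
def replace_digits_inline(s: pd.Series, symbol: str = " ") -> pd.Series:
    return s.str.replace(r"\b\d+\b", symbol, regex=True)

# After: the pattern is a named, documented module-level constant that every
# caller (and every test) can reference.
DIGITS_BLOCK = r"\b\d+\b"

def replace_digits_with_constant(s: pd.Series, symbol: str = " ") -> pd.Series:
    return s.str.replace(DIGITS_BLOCK, symbol, regex=True)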
40 changes: 40 additions & 0 deletions tests/test_preprocessing.py
@@ -114,6 +114,46 @@ def test_pipeline_stopwords(self):
pipeline = [preprocessing.lowercase, preprocessing.remove_stopwords]
self.assertEqual(preprocessing.clean(s, pipeline=pipeline), s_true)

"""
Test clean
"""

def _get_default_clean_pipeline(self):
"""
Return a list containing all the methods used in the default cleaning pipeline.

The returned list holds the following functions:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
"""

return [
preprocessing.fillna,
preprocessing.lowercase,
preprocessing.remove_digits,
preprocessing.remove_punctuation,
preprocessing.remove_diacritics,
preprocessing.remove_stopwords,
preprocessing.remove_whitespace,
]

def test_clean(self):
s = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
s_true = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)
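
For reference, a standalone sketch of what this test asserts (not part of the PR): folding the seven default steps over a Series by hand should match preprocessing.clean called without a pipeline argument.

import numpy as np
import pandas as pd
from texthero import preprocessing

s = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n", np.nan])

manual = s
for step in [
    preprocessing.fillna,
    preprocessing.lowercase,
    preprocessing.remove_digits,
    preprocessing.remove_punctuation,
    preprocessing.remove_diacritics,
    preprocessing.remove_stopwords,
    preprocessing.remove_whitespace,
]:
    manual = step(manual)

# Should not raise: the default pipeline and the manual fold agree.
pd.testing.assert_series_equal(manual, preprocessing.clean(s))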

"""
Test stopwords.
"""
81 changes: 46 additions & 35 deletions texthero/preprocessing.py
@@ -11,14 +11,44 @@
import numpy as np
import pandas as pd
import unidecode

# Ignore gensim annoying warnings
import warnings
from nltk.stem import PorterStemmer, SnowballStemmer

from texthero import stopwords as _stopwords

from typing import List, Callable

# Ignore gensim annoying warnings
import warnings
"""
Define all regex pattern, which will be used in the functions below. They define different charateristics, on how to clean
henrifroese marked this conversation as resolved.
Show resolved Hide resolved
a text
"""

DIGITS_BLOCK = r"\b\d+\b"
PUNCTUATION = rf"([{string.punctuation}])+"
STOPWORD_TOKENIZER = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""
ROUND_BRACKETS = r"\([^()]*\)"
CURLY_BRACKETS = r"\{[^{}]*\}"
SQUARE_BRACKETS = r"\[[^\[\]]*\]"
ANGLE_BRACKETS = r"<[^<>]*>"
HTML_TAGS = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""
URLS = r"http\S+"
TAGS = r"@[a-zA-Z0-9]+"
HASHTAGS = r"#[a-zA-Z0-9_]+"

# In regex, the metacharacter \w matches "a-z, A-Z, 0-9, and the _ (underscore) character".
# We therefore remove the underscore from the punctuation string, as it is already covered by \w.
punct = string.punctuation.replace("_", "")
TOKENIZE = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"  # The standard tokenisation separates all "regex words" (\w)
# from each other and puts each punctuation character in its own token


warnings.filterwarnings(action="ignore", category=UserWarning, module="gensim")
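
Because the patterns now live at module level, they can be exercised directly with the re module, independent of the Series helpers. A quick illustrative check (not part of the PR; the sample string is made up), assuming the constants are importable from texthero.preprocessing as defined above:

import re

from texthero.preprocessing import DIGITS_BLOCK, HASHTAGS, URLS

text = "Release 2 of texthero is out: https://texthero.org #nlp #python"

# DIGITS_BLOCK only matches digits forming a whole token ("2" here), not digits
# glued to letters such as the "2" in "v2".
print(re.sub(DIGITS_BLOCK, "", text))

# URLS matches "http" followed by any run of non-whitespace characters.
print(re.findall(URLS, text))       # ['https://texthero.org']

# HASHTAGS matches '#' followed by letters, digits or underscores.
print(re.findall(HASHTAGS, text))   # ['#nlp', '#python']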

@@ -91,8 +121,7 @@ def replace_digits(s: pd.Series, symbols: str = " ", only_blocks=True) -> pd.Ser
"""

if only_blocks:
pattern = r"\b\d+\b"
return s.str.replace(pattern, symbols)
return s.str.replace(DIGITS_BLOCK, symbols)
else:
return s.str.replace(r"\d+", symbols)

@@ -157,7 +186,7 @@ def replace_punctuation(s: pd.Series, symbol: str = " ") -> pd.Series:
dtype: object
"""

return s.str.replace(rf"([{string.punctuation}])+", symbol)
return s.str.replace(PUNCTUATION, symbol)


def remove_punctuation(s: pd.Series) -> pd.Series:
@@ -266,13 +295,9 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str:

"""

pattern = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""

return "".join(t if t not in words else symbol for t in re.findall(pattern, text))
return "".join(
t if t not in words else symbol for t in re.findall(STOPWORD_TOKENIZER, text)
)
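
STOPWORD_TOKENIZER splits a string into word, whitespace and symbol tokens, which is what lets the replacement above drop stopwords while keeping the original spacing and punctuation. A small illustration (not part of the PR; the stopword set here is made up):

import re

from texthero.preprocessing import STOPWORD_TOKENIZER

stopwords = {"not"}
text = "state-of-the-art, not bad!"

# Tokens are words (with optional internal hyphens), whitespace runs and
# single symbols; stopword tokens are dropped, everything else is kept as-is.
cleaned = "".join(
    t if t not in stopwords else "" for t in re.findall(STOPWORD_TOKENIZER, text)
)
print(cleaned)  # the word "not" is removed, commas and spaces survive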


def replace_stopwords(
@@ -525,7 +550,7 @@ def remove_round_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"\([^()]*\)", "")
return s.str.replace(ROUND_BRACKETS, "")


def remove_curly_brackets(s: pd.Series) -> pd.Series:
@@ -549,7 +574,7 @@ def remove_curly_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"\{[^{}]*\}", "")
return s.str.replace(CURLY_BRACKETS, "")


def remove_square_brackets(s: pd.Series) -> pd.Series:
@@ -574,7 +599,7 @@ def remove_square_brackets(s: pd.Series) -> pd.Series:


"""
return s.str.replace(r"\[[^\[\]]*\]", "")
return s.str.replace(SQUARE_BRACKETS, "")


def remove_angle_brackets(s: pd.Series) -> pd.Series:
@@ -598,7 +623,7 @@ def remove_angle_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"<[^<>]*>", "")
return s.str.replace(ANGLE_BRACKETS, "")


def remove_brackets(s: pd.Series) -> pd.Series:
@@ -651,12 +676,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series:

"""

pattern = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""

return s.str.replace(pattern, "")
return s.str.replace(HTML_TAGS, "")


def tokenize(s: pd.Series) -> pd.Series:
@@ -680,12 +700,7 @@ def tokenize(s: pd.Series) -> pd.Series:

"""

punct = string.punctuation.replace("_", "")
# In regex, the metacharacter 'w' is "a-z, A-Z, 0-9, including the _ (underscore) character." We therefore remove it from the punctuation string as this is already included in \w

pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"

return s.str.replace(pattern, r"\2 \3 \4 \5").str.split()
return s.str.replace(TOKENIZE, r"\2 \3 \4 \5").str.split()
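
The TOKENIZE pattern only pads punctuation that sits at a word edge (that is what the \B checks are for), so sentence punctuation becomes its own token while intra-word hyphens and apostrophes stay attached. A short illustration (the expected output is indicative, not copied from the PR's tests):

import pandas as pd

from texthero import preprocessing

s = pd.Series(["Hello, world! A state-of-the-art example."])

print(preprocessing.tokenize(s))
# 0    [Hello, ,, world, !, A, state-of-the-art, example, .]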


# Warning message for not-tokenized inputs
@@ -775,9 +790,7 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series:

"""

pattern = r"http\S+"

return s.str.replace(pattern, symbol)
return s.str.replace(URLS, symbol)


def remove_urls(s: pd.Series) -> pd.Series:
@@ -826,8 +839,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series:

"""

pattern = r"@[a-zA-Z0-9]+"
return s.str.replace(pattern, symbol)
return s.str.replace(TAGS, symbol)


def remove_tags(s: pd.Series) -> pd.Series:
@@ -873,8 +885,7 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series:
dtype: object

"""
pattern = r"#[a-zA-Z0-9_]+"
return s.str.replace(pattern, symbol)
return s.str.replace(HASHTAGS, symbol)


def remove_hashtags(s: pd.Series) -> pd.Series: