Release 0.3.0

snguyenthanh · Nov 16, 2018 · dc088b5 · dc088b5
1 parent 9645617
commit dc088b5
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 better_profanity
 ---
-*A Python library to clean swear words in strings.*
+*A Python library to clean swear words (and their leetspeak) in strings*
 
 [![release](https://img.shields.io/badge/dynamic/json.svg?label=release&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fbetter-profanity%2Fjson&query=%24.info.version&colorB=blue)](https://github.com/snguyenthanh/better_profanity/releases/latest)
 [![Build Status](https://travis-ci.com/snguyenthanh/better_profanity.svg?branch=master)](https://travis-ci.com/snguyenthanh/better_profanity)
@@ -10,32 +10,34 @@ better_profanity
 
 Inspired from package [profanity](https://github.com/ben174/profanity) of [Ben Friedland](https://github.com/ben174), this library is much faster than the original one, by using string comparison instead of regex.
 
+It partially supports [modified spellings](https://en.wikipedia.org/wiki/Leet) (such as `p0rn`, `h4ndjob` and `handj0b`).
+
 ## Requirements
 To make use of Python static typing, this package only works with `Python 3.6+`.
 
 ## Installation
-
-### 1. *Stable* version:
 ```
 $ pip install better_profanity
 ```
 
-### 2. *Beta* version
-```
-$ pip install better-profanity==0.3b0
-```
-
 ## Unicode characters
 A huge thanks to [@Derfirm](https://github.com/Derfirm) for adding support for Unicode characters.
 
-Currently, the Unicode support is only available in [*beta* release `0.3-beta.0`](https://pypi.org/project/better-profanity/0.3b0/).
-
-For release `0.3-beta.0`, only Unicode characters from categories `Ll`, `Lu`, `Mc` and `Mn` are added. More on Unicode categories can be found [here][unicode category link].
+Only Unicode characters from categories `Ll`, `Lu`, `Mc` and `Mn` are added. More on Unicode categories can be found [here][unicode category link].
 
 [unicode category link]: https://en.wikipedia.org/wiki/Template:General_Category_(Unicode)
 
+However, this library does not yet support all languages, such as *Chinese*.
+
 ## Usage
-By default, on the first `.censor()` call, `profanity` initializes a set of words, from [profanity_wordlist.txt](./better_profanity/profanity_wordlist.txt), to be used to compare against the input texts. This set of words will be stored in memory (~5MB+).
+By default, on the first `.censor()` call, function `.load_censor_words()` generates all possible [leetspeak](https://en.wikipedia.org/wiki/Leet) words, from [profanity_wordlist.txt](./better_profanity/profanity_wordlist.txt), to be used to compare against the input texts.  The full mapping of the library can be found in [profanity.py](./better_profanity/profanity.py#L9-L18).
+
+For example, the word `handjob` would be loaded into:
+```
+'h@ndjob', 'handj0b', 'handj@b', 'h*ndj*b', 'h*ndjob', 'h@ndj0b', 'h@ndj*b', 'h4ndj*b', 'h@ndj@b', 'handjob', 'h4ndj0b', 'h4ndjob', 'h4ndj@b', 'h*ndj0b', 'handj*b', 'h*ndj@b'
+```
+
+This set of words will be stored in memory (~5MB+).
 
 ### 1. Censor swear words from a text
 By default, `profanity` replaces each swear words with 4 asterisks `****`.
@@ -51,14 +53,14 @@ if __name__ == "__main__":
     # You **** of ****.
 ```
 
-### 2. Censor doesn't care about word dividers (*beta*)
-The function `.censor()` also hide words separated not just by an empty space ` ` but also other dividers, such as `_`, `,` and `.`. Except for `@, $, *, &, ", '`.
+### 2. Censor doesn't care about word dividers
+The function `.censor()` also hides words separated not just by an empty space ` ` but also by other dividers, such as `_`, `,` and `.`, except for `@, $, *, ", '`.
 
 ```
 from better_profanity import profanity
 
 if __name__ == "__main__":
-    text = "...shit...hello_cat_fuck,,,,123"
+    text = "...sh1t...hello_cat_fuck,,,,123"
 
     censored_text = profanity.censor(text)
     print(censored_text)
@@ -67,6 +69,7 @@ if __name__ == "__main__":
 
 ### 3. Censor swear words with custom character
 4 instances of the character in second parameter in `.censor()` will be used to replace the swear words.
+
 ```
 from better_profanity import profanity
 
@@ -79,6 +82,8 @@ if __name__ == "__main__":
 ```
 
 ### 4. Check if the string contains any swear words
+Function `.contains_profanity()` returns `True` if any word in the given string exists in the wordlist.
+
 ```
 from better_profanity import profanity
 
@@ -90,15 +95,13 @@ if __name__ == "__main__":
 ```
 
 ### 5. Censor swear words with a custom wordlist
-The provided list of words will replace the default wordlist.
+Function `.load_censor_words()` takes a `List` of strings as censored words.
+The provided list will replace the default wordlist.
 
-4 instances of the character in second parameter in `.censor()` will be used to replace the swear words.
 ```
 from better_profanity import profanity
 
 if __name__ == "__main__":
-    text = "You p1ec3 of sHit."
-
     custom_badwords = ['happy', 'jolly', 'merry']
     profanity.load_censor_words(custom_badwords)
 
@@ -109,7 +112,8 @@ if __name__ == "__main__":
     # Have a **** day! :)
 ```
 
-### 6. Censor Unicode characters (*beta*)
+### 6. Censor Unicode characters
+No extra steps needed!
 
 ```
 from better_profanity import profanity

diff --git a/better_profanity/__init__.py b/better_profanity/__init__.py
@@ -1,2 +1,2 @@
 name = 'better_profanity'
-__version__ = '0.3-beta.0'
+__version__ = '0.3.0'
diff --git a/better_profanity/profanity.py b/better_profanity/profanity.py
@@ -7,7 +7,7 @@
 ## GLOBAL VARIABLES ##
 CENSOR_WORDSET = set()
 CHARS_MAPPING = {
-    'a': ('a', '@', '*', '4', '&'),
+    'a': ('a', '@', '*', '4'),
     'i': ('i', '*', 'l', '1'),
     'o': ('o', '*', '0', '@'),
     'u': ('u', '*', 'v'),

diff --git a/better_profanity/utils.py b/better_profanity/utils.py
@@ -10,7 +10,7 @@
 ## GLOBAL VARIABLES ##
 ALLOWED_CHARACTERS = set(ascii_letters)
 ALLOWED_CHARACTERS.update(set(digits))
-ALLOWED_CHARACTERS.update({'@', '$', '*', '&', '\"', '\''})
+ALLOWED_CHARACTERS.update({'@', '$', '*', '\"', '\''})
 
 
 def get_complete_path_of_file(filename: str) -> str:

diff --git a/setup.py b/setup.py
@@ -10,12 +10,14 @@
     version=__version__,
     author="Son Nguyen Thanh",
     author_email="[email protected]",
-    description="A Python library to clean English swear words in strings",
+    description="A Python library to clean swear words (and their leetspeak) in strings",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/snguyenthanh/better_profanity",
     classifiers=[
         "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],

diff --git a/tests.py b/tests.py
@@ -70,7 +70,7 @@ def test_custom_wordlist(self):
         self.assertTrue(profanity.contains_profanity("Have a merry day! :)"))
 
     def test_censorship_without_spaces(self):
-        bad_text = "...penis...hello_cat_vagina,,,,qew"
+        bad_text = "...pen1s...hello_cat_vagina,,,,qew"
         censored_text = "...****...hello_cat_****,,,,qew"
         self.assertEqual(profanity.censor(bad_text), censored_text)
 
@@ -119,9 +119,9 @@ def setUp(self):
         profanity.load_censor_words()
 
     def test_unicode_vietnamese_1(self):
-        bad_text = "Chào con cặc.Thằng lồn."
-        censored_text = "Chào con ****.Thằng ****."
-        profanity.load_censor_words(["cặc", "lồn"])
+        bad_text = "Đây là 1 câu nói bậy."
+        censored_text = "Đây là 1 **** nói ****."
+        profanity.load_censor_words(["câu", "bậy"])
         self.assertEqual(profanity.censor(bad_text), censored_text)
 
     def test_unicode_vietnamese_2(self):