Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Added train_model.py and made the necessary modifications #19

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
language: python
python:
- "3.5"
- "3.6"
- "3.7-dev"
- "3.7"
- "3.8"
script:
- pytest
notifications:
Expand Down
2 changes: 1 addition & 1 deletion profanity_check/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .profanity_check import predict, predict_prob
__version__="1.0.3"
__version__="1.0.5"
Binary file modified profanity_check/data/model.joblib
Binary file not shown.
20 changes: 20 additions & 0 deletions profanity_check/data/train_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from joblib import dump, load

# Train the profanity classifier and serialise the fitted vectorizer and model.
# All paths are relative to the current working directory, so run this script
# from the directory that contains clean_data.csv.
data = pd.read_csv("clean_data.csv")
# Force every cell to str so non-string values (presumably blank cells that
# pandas parsed as NaN -- confirm against the dataset) can still be tokenised.
texts = data["text"].astype(str)
y = data["is_offensive"]

# A float min_df is a proportion of documents: terms appearing in fewer than
# 0.01% of the rows are dropped from the vocabulary.
vectorizer = TfidfVectorizer(stop_words="english", min_df=0.0001)
X = vectorizer.fit_transform(texts)

# max_iter is an iteration count and must be an int; the float literal 1e5 is
# rejected by scikit-learn releases that validate constructor parameters.
model = LinearSVC(class_weight="balanced", dual=False, tol=1e-2, max_iter=100000)
# LinearSVC has no predict_proba; the calibration wrapper adds probability
# estimates. The estimator is passed positionally because the keyword was
# renamed (base_estimator -> estimator) in later scikit-learn releases, while
# the first positional slot works on both sides of the rename.
cclf = CalibratedClassifierCV(model)
cclf.fit(X, y)

# Persist both artefacts: the vectorizer is needed at prediction time to turn
# raw text into the feature space the model was trained on.
dump(vectorizer, "vectorizer.joblib")
dump(cclf, "model.joblib")
Binary file modified profanity_check/data/vectorizer.joblib
Binary file not shown.
8 changes: 6 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
scikit-learn>=0.20.2
joblib==0.14.1

joblib==0.15.1
numpy==1.19.0
# Used for the training script
pandas==1.0.5
scikit-learn==0.23.1
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@

setuptools.setup(
name="profanity-check",
version="1.0.3",
version="1.0.5",
author="Victor Zhou",
author_email="[email protected]",
description="A fast, robust library to check for offensive language in strings.",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/vzhou842/profanity-check",
packages=setuptools.find_packages(),
install_requires=['joblib>=0.14.1', 'scikit-learn>=0.20.2'],
install_requires=['scikit-learn>=0.23.1', 'joblib>=0.15.1'],
package_data={ 'profanity_check': ['data/model.joblib', 'data/vectorizer.joblib'] },
classifiers=[
"Development Status :: 5 - Production/Stable",
Expand Down
4 changes: 2 additions & 2 deletions tests/test_profanity_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ def test_accuracy():
'fUcK u',
'GO TO hElL, you dirty scum',
]
assert list(predict(texts)) == [0, 0, 0, 1, 1, 1]
assert list(predict(texts)) == [0, 1, 0, 1, 1, 1]

probs = predict_prob(texts)
for i in range(len(probs)):
if i < 3:
if i == 0 or i == 2:
assert probs[i] <= 0.5
else:
assert probs[i] >= 0.5
Expand Down