-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelper.py
121 lines (106 loc) · 4.27 KB
/
helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# importing libraries
import pandas as pd
import numpy as np
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import RegexpTokenizer
from nltk import PorterStemmer, WordNetLemmatizer
import pickle
import nltk
# Fetch the NLTK 'wordnet' resource when this module is imported
# (presumably required by WordNetLemmatizer in text_lemmatization -- confirm).
nltk.download('wordnet')
# preprocessing functions
# converting tweet text to lower case
def text_lower(text):
    """Lower-case every entry of a pandas Series of strings.

    Args:
        text: Series exposing the ``.str`` accessor.

    Returns:
        A new Series holding the lower-cased values.
    """
    lowered = text.str.lower()
    return lowered
# removing stopwords from the tweet text
# Stopwords excluded from the data. Built once at import time rather than on
# every call, because clean_stopwords runs once per row of the input Series.
STOPWORDS = frozenset([
    'a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
    'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
    'being', 'below', 'between', 'both', 'by', 'can', 'd', 'did', 'do',
    'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from',
    'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
    'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
    'into', 'is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'on', 'once',
    'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'own', 're',
    's', 'same', 'she', "shes", 'should', "shouldve", 'so', 'some', 'such',
    't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
    'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
    'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
    'why', 'will', 'with', 'won', 'y', 'you', "youd", "youll", "youre",
    "youve", 'your', 'yours', 'yourself', 'yourselves',
])


def clean_stopwords(text):
    """Remove stopwords from a single piece of text.

    The input is coerced with ``str()``, split on whitespace, and every token
    found in ``STOPWORDS`` is dropped. Matching is case-sensitive, so callers
    should lower-case the text first.

    Args:
        text: The text to clean (any object; ``str(text)`` is used).

    Returns:
        The remaining tokens joined by single spaces.
    """
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)
# cleaning and removing punctuations
def clean_puctuations(text):
    """Delete all ASCII punctuation characters from a string.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character of ``string.punctuation`` removed.
    """
    # A translation table mapping a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)
# cleaning and removing repeating characters
def clean_repeating_characters(text):
    """Collapse each run of a repeated character to a single occurrence.

    The previous pattern had lost its backreference backslashes, so it matched
    "any character followed by literal 1s" and substituted a literal "1"
    instead of collapsing repeats.

    NOTE(review): a vectoriser or model fitted on text produced by the old
    pattern may need re-fitting -- confirm against the training pipeline.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every run of two or more identical consecutive
        characters (newlines excepted) reduced to one character.
    """
    # Group 1 captures one character; the backreference matches its repeats.
    return re.sub(r'(.)\1+', r'\1', text)
# cleaning and removing URLs
def clean_URLs(text):
    """Remove URLs from a string.

    A URL is taken to be any whitespace-delimited token fragment starting at
    "www." or "http". The previous pattern used a character class excluding
    the letter "s" where a non-whitespace class was intended, and left the dot
    after "www" unescaped, so it truncated URLs at the first "s" and could
    swallow ordinary words and spaces.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each matched URL replaced by the empty string;
        surrounding whitespace is left untouched.
    """
    return re.sub(r"(www\.\S+)|(http\S+)", "", text)
# cleaning and removing numeric data
def clean_numeric(text):
    """Strip every run of ASCII digits from a string.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all characters 0-9 removed.
    """
    digit_runs = re.compile('[0-9]+')
    return digit_runs.sub('', text)
# Tokenization of tweet text
def tokenize_tweet(text):
    """Tokenize every entry of a pandas Series into word tokens.

    Args:
        text: Series of strings.

    Returns:
        A Series in which each value is the list of tokens produced by
        ``RegexpTokenizer.tokenize`` for the corresponding input string.
    """
    # Raw string: '\w' inside a normal literal is an invalid escape sequence,
    # which newer Python versions warn about and plan to reject.
    tokenizer = RegexpTokenizer(r'\w+')
    return text.apply(tokenizer.tokenize)
# stemming
def text_stemming(text):
    """Stem each token with NLTK's ``PorterStemmer``.

    Args:
        text: Iterable of word tokens.

    Returns:
        A list with the stemmed form of every token, in input order.
    """
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, text))
# lemmatization
def text_lemmatization(text):
    """Lemmatize each token with NLTK's ``WordNetLemmatizer``.

    Args:
        text: Iterable of word tokens.

    Returns:
        A list with the lemmatized form of every token, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, text))
# defining preprocess function
def preprocess(text):
    """Run the full cleaning pipeline over a pandas Series of tweets.

    Steps, in order: lower-case, stopword removal, punctuation removal,
    repeated-character cleanup, URL removal, digit removal, tokenization,
    stemming, lemmatization, and finally re-joining the tokens with spaces.

    NOTE(review): URL removal runs after punctuation has already been
    stripped, so URLs reach clean_URLs without their dots and slashes --
    confirm this order is intended.

    Args:
        text: Series of raw tweet strings.

    Returns:
        A Series of cleaned strings, one per input row.
    """
    cleaned = text_lower(text)

    # String-level cleaners, applied row by row in this exact order.
    for cleaner in (
        clean_stopwords,
        clean_puctuations,
        clean_repeating_characters,
        clean_URLs,
        clean_numeric,
    ):
        cleaned = cleaned.apply(cleaner)

    # From here on each row is a list of tokens rather than a string.
    tokens = tokenize_tweet(cleaned)
    for normalizer in (text_stemming, text_lemmatization):
        tokens = tokens.apply(normalizer)

    return tokens.apply(lambda words: " ".join(words))
# Function for custom input prediction
def custom_input_prediction(text):
    """Classify one piece of text with the pickled vectoriser and model.

    The text is wrapped in a Series, run through ``preprocess``, vectorised
    with the object stored in ``tdf_vectorizer`` and classified with the model
    stored in ``model.pkl``. Both files are opened relative to the current
    working directory. Only the first element is classified if ``text``
    holds several.

    Args:
        text: The raw input text (anything ``pd.Series`` accepts).

    Returns:
        The human-readable label for the predicted class, or ``None`` when
        the prediction is not one of the known class ids.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    import nltk
    # Presumably required by the WordNet lemmatizer used in preprocess -- confirm.
    nltk.download('omw-1.4')

    cleaned = preprocess(pd.Series(text))
    # Positional access, so a caller-supplied index without label 0 still works.
    documents = [cleaned.iloc[0]]

    # SECURITY: unpickling executes arbitrary code; both files must come from
    # a trusted source. Context managers ensure the handles are closed.
    with open("tdf_vectorizer", "rb") as vectoriser_file:
        vectoriser = pickle.load(vectoriser_file)
    features = vectoriser.transform(documents)

    with open("model.pkl", "rb") as model_file:
        model = pickle.load(model_file)
    prediction = model.predict(features)[0]

    # Class id -> label (presumably the LabelEncoder order used in training;
    # verify against the training code).
    interpretations = {
        0: "Age",
        1: "Ethnicity",
        2: "Gender",
        3: "Not Cyberbullying",
        4: "Other Cyberbullying",
        5: "Religion",
    }
    return interpretations.get(prediction)