DT_tfidf_pipeline.py
import re
import json
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore")
# *****Initialization*****
target = []   # labels, len = 25000
reviews = []  # shuffled training data, len = 25000
# ********Read data*********
with open("train_data.json") as fp:
    train_data = json.load(fp)
# Each entry is a (label, review_text) pair.
for label, review in train_data[:25000]:
    target.append(label)
    reviews.append(review)
# ********Preprocessing********
delete = re.compile(r"[.;:!'?,\"()\[\]]")
replace_with_space = re.compile(r"(<br\s*/><br\s*/>)|(-)|(/)")
def preprocess_reviews(reviews):
    # Lower-case, strip punctuation, and replace HTML line breaks,
    # hyphens and slashes with spaces.
    reviews = [delete.sub("", line.lower()) for line in reviews]
    reviews = [replace_with_space.sub(" ", line) for line in reviews]
    return reviews
def get_stemmed_text(corpus, name):  # 'Porter' -> PorterStemmer, else SnowballStemmer("english")
    if name == 'Porter':
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    else:
        from nltk.stem.snowball import SnowballStemmer
        stemmer = SnowballStemmer("english")
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]
def get_lemmatized_text(corpus):
    import nltk
    nltk.download('wordnet')
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]
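# Quick illustrative check (an assumption, not in the original script) of what
# the Snowball stemmer does to a review:
#   get_stemmed_text(["the movies were amazing"], 'Snow')
#   -> ['the movi were amaz']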
# *******Feature pipelines******
vect_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                          ('tfidf', TfidfTransformer()),
                          ('classifier', DecisionTreeClassifier())])
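# Note: CountVectorizer followed by TfidfTransformer is equivalent to a single
# TfidfVectorizer; the two-step form keeps the 'vect__*' and 'tfidf__*'
# parameters separately tunable in the grid search below.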
# *********Applying preprocessing*******
reviews = preprocess_reviews(reviews)
x_train, x_val, y_train, y_val = train_test_split(reviews, target, train_size=0.75, random_state=42)
# x_train = get_stemmed_text(x_train, 'Porter')
# x_val = get_stemmed_text(x_val, 'Porter')
x_train = get_stemmed_text(x_train, 'Snow')
x_val = get_stemmed_text(x_val, 'Snow')
# x_train = get_lemmatized_text(x_train)
# x_val = get_lemmatized_text(x_val)
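# Illustrative sketch (an assumption, not part of the original workflow): the
# pipeline can also be fit and scored directly, skipping the grid search:
#   vect_pipeline.fit(x_train, y_train)
#   print('Validation accuracy: ', vect_pipeline.score(x_val, y_val))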
# *********Grid Search*******
parameters_grid = {'vect__binary': (True, False),
                   'vect__min_df': (30, 50, 100, 120),
                   'tfidf__use_idf': (True, False),
                   'classifier__min_samples_split': (2, 4, 6, 8, 10),
                   'classifier__max_depth': (5, 10, 15, 20)}
# Smaller grid for quick runs:
# parameters_grid = {'vect__binary': (True, False)}
# *********Validation Pipeline*******
grid_search = GridSearchCV(vect_pipeline, parameters_grid, cv=4, n_jobs=-2, scoring='accuracy')
grid_search.fit(x_train, y_train)  # fit on the stemmed training split, keeping x_val held out
cvres = grid_search.cv_results_
for accuracy, params in zip(cvres['mean_test_score'], cvres['params']):
    print('Mean accuracy: ', accuracy, ' using: ', params)
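# Hedged addition (not in the original script): report the best parameters and
# evaluate the refit best estimator on the held-out validation split, using
# the accuracy_score import above.
print('Best parameters: ', grid_search.best_params_)
y_pred = grid_search.best_estimator_.predict(x_val)
print('Validation accuracy: ', accuracy_score(y_val, y_pred))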