This repository was archived by the owner on Aug 13, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMini-project2_no_extra.py
95 lines (71 loc) · 3.17 KB
/
Mini-project2_no_extra.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os, re, json
from random import shuffle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
import warnings
warnings.filterwarnings("ignore",category = FutureWarning)
# *****Initialization*****
filter_words = []   # stop-word list later handed to the vectorizer (empty = no filtering)
reviews_test = []
train_pos_path = 'train//pos'
train_neg_path = 'train//neg'
test_path = 'test'


def _read_reviews(folder):
    """Return the text of every file found directly inside *folder*."""
    texts = []
    for file_name in os.listdir(folder):
        with open(os.path.join(folder, file_name), "r", encoding="utf8") as f:
            texts.append(f.read())
    return texts


# ******Read data******
positive_reviews = _read_reviews(train_pos_path)
negative_reviews = _read_reviews(train_neg_path)

# Labels are derived from the number of files actually read (1 = positive,
# 0 = negative) instead of assuming exactly 12500 reviews per class.
reviews_train = positive_reviews + negative_reviews
target = [1] * len(positive_reviews) + [0] * len(negative_reviews)

# Shuffle texts and labels *together*. Shuffling only the review list would
# leave `target` in its original "positives first" order and break the
# correspondence between each review and its label.
labelled_reviews = list(zip(reviews_train, target))
shuffle(labelled_reviews)
reviews_train = [text for text, _ in labelled_reviews]
target = [label for _, label in labelled_reviews]
reviews = reviews_train
# ********Preprocessing********
# Punctuation that is removed outright. Raw strings keep the backslashes
# intact for the regex engine and avoid Python's invalid-escape warnings.
delete = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
# Double HTML line breaks, hyphens and slashes are each turned into one space.
replace_with_space = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")


def compile(reviews):
    """Normalize raw review texts.

    Each text is lower-cased, stripped of the punctuation matched by
    ``delete``, and then has every match of ``replace_with_space``
    replaced by a single space.

    NOTE: the name shadows the builtin ``compile``; it is kept because the
    rest of the script calls this function by that name.

    Args:
        reviews: iterable of strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    return [
        replace_with_space.sub(" ", delete.sub("", line.lower()))
        for line in reviews
    ]
def vectorization(train, test, words):
    """Turn two text collections into binary bag-of-words matrices.

    The vocabulary is learned from ``train`` only and then applied to both
    collections, so ``test`` never influences which terms are kept.

    Args:
        train: iterable of documents used to learn the vocabulary.
        test: iterable of documents encoded with that same vocabulary.
        words: stop-word setting forwarded to ``CountVectorizer``.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple.
    """
    # binary=True records presence/absence of a term rather than its count;
    # min_df=0.01 drops terms that appear in fewer than 1% of the documents.
    cv = CountVectorizer(binary=True, stop_words=words, min_df=0.01)
    # fit_transform learns the vocabulary and encodes `train` in one pass
    # instead of tokenizing the training set twice (fit, then transform).
    train = cv.fit_transform(train)
    test = cv.transform(test)
    return train, test
def get_stemmed_text(corpus, name):
    """Stem every whitespace-separated word of every review.

    Args:
        corpus: iterable of review strings.
        name: ``'Porter'`` selects NLTK's PorterStemmer; any other value
            selects the English SnowballStemmer.

    Returns:
        A list with one string per review, made of the stemmed words joined
        by single spaces.
    """
    # The stemmer classes are imported lazily so only the one in use is loaded.
    if name != 'Porter':
        from nltk.stem.snowball import SnowballStemmer
        stemmer = SnowballStemmer("english")
    else:
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()

    stemmed_corpus = []
    for review in corpus:
        tokens = review.split()
        stemmed_corpus.append(' '.join(map(stemmer.stem, tokens)))
    return stemmed_corpus
# ********Held-out validation********
# 75% of the cleaned reviews go to training, the rest to validation;
# random_state is fixed so the split itself is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    compile(reviews), target, train_size=0.75, random_state=42
)

# *********Applying preprocessing*******
# Stem both splits with the Snowball stemmer, then encode them with a
# vocabulary learned from the training split.
x_train = get_stemmed_text(x_train, 'Snow')
x_val = get_stemmed_text(x_val, 'Snow')
x_train, x_val = vectorization(x_train, x_val, filter_words)

# *********Classifiers*******
lr = LogisticRegression(C=0.05)
svm = LinearSVC(C=0.01)

# Fit each model in turn and report its accuracy on both splits.
for title, model in (
    ('Classifier: LogisticRegression', lr),
    ('Classifier: Support Vector Machines', svm),
):
    print(title)
    model.fit(x_train, y_train)
    train_accuracy = accuracy_score(y_train, model.predict(x_train))
    print("Accuracy on train set: %s " % (train_accuracy))
    val_accuracy = accuracy_score(y_val, model.predict(x_val))
    print("Accuracy on val set: %s " % (val_accuracy))