text_clf.py
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
import numpy as np

# Loading a local copy of the corpus is not needed here: fetch_20newsgroups
# downloads and caches the data itself.
# sklearn.datasets.load_files("C://Users/Tathagat Dasgupta/Desktop/ML Project/20news-18828")
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
# After loading, twenty_train.target_names is
# ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
print(len(twenty_train.data))
print("\n".join(twenty_train.data[0].split("\n")[:3]))
print(twenty_train.target_names[twenty_train.target[0]])
print(twenty_train.target[:10])
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])
# Preprocessing
# Tokenizing text
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(X_train_counts.shape)
print(count_vect.vocabulary_.get(u'algorithm'))
# vocabulary_ maps each term to its column index in the count matrix
# tf-idf weighting
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)
# Classifier training
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
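# Optional (not in the original script): MultinomialNB also exposes class
# probabilities via predict_proba, which shows how confident the model is
# about each new document rather than only the hard prediction.
probabilities = clf.predict_proba(X_new_tfidf)
for doc, probs in zip(docs_new, probabilities):
    print('%r => %s' % (doc, dict(zip(twenty_train.target_names, probs.round(3)))))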
# Building a pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
# Performance on the test set
twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)
doc_test = twenty_test.data
predicted = text_clf.predict(doc_test)
print("Classifier Accuracy:")
print(np.mean(predicted == twenty_test.target))
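# Optional (not in the original script): a more detailed evaluation than raw
# accuracy, using sklearn.metrics for a per-class report and confusion matrix.
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names))
print(metrics.confusion_matrix(twenty_test.target, predicted))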
# SVM implementation: a linear SVM trained with stochastic gradient descent
# (the old n_iter parameter is now max_iter in recent scikit-learn versions)
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', alpha=1e-3,
                                           max_iter=5, tol=None,
                                           random_state=42))])
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(doc_test)
print("SVM Accuracy:")
print(np.mean(predicted == twenty_test.target))
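# Optional (not in the original script): a minimal grid-search sketch for
# tuning the pipeline's hyperparameters. The parameter grid and the 400-document
# training subset below are only illustrative choices to keep the search fast,
# not tuned values.
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])
print(gs_clf.best_score_)
print(gs_clf.best_params_)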
# Language identifier (character n-gram features)
# Note: as written, this pipeline is fit on the 20 newsgroups training data,
# so its "language" predictions are really newsgroup categories; an actual
# language identifier would need a language-labelled training corpus.
class LowerCasePreprocessor(object):
    """Simple preprocessor that lower-cases the input text."""
    def __call__(self, unicode_content):
        return unicode_content.lower()

    def __repr__(self):
        return "LowerCasePreprocessor()"

# Build a vectorizer / classifier pipeline using character 1- to 3-grams.
# The old CharNGramAnalyzer API no longer exists; CountVectorizer now takes
# analyzer='char' together with ngram_range and an optional preprocessor.
clf = Pipeline([
    ('vec', CountVectorizer(analyzer='char', ngram_range=(1, 3),
                            preprocessor=LowerCasePreprocessor())),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='squared_hinge', penalty='l1', dual=False, C=100)),
])
# Fit the pipeline on the training set
clf.fit(twenty_train.data, twenty_train.target)
# Predict the outcome on the testing set
y_predicted = clf.predict(doc_test)
# Predict the result on some short new sentences:
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)
for s, p in zip(sentences, predicted):
    print(u'The language of "%s" is "%s"' % (s, twenty_train.target_names[p]))
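# Optional (not in the original script): a tiny, purely illustrative
# language-labelled corpus showing how the same character n-gram pipeline
# would be trained for actual language identification; real use needs far
# more text per language. The sentences and labels below are made up.
lang_texts = [
    u'the quick brown fox jumps over the lazy dog',
    u'le renard brun rapide saute par-dessus le chien paresseux',
    u'der schnelle braune Fuchs springt ueber den faulen Hund',
]
lang_labels = ['en', 'fr', 'de']
lang_clf = Pipeline([
    ('vec', CountVectorizer(analyzer='char', ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='squared_hinge', penalty='l1', dual=False, C=100)),
])
lang_clf.fit(lang_texts, lang_labels)
print(lang_clf.predict(sentences))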