export.py
# -*- coding: utf-8 -*-
import json
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import wordpunct_tokenize
from nltk.corpus import wordnet as wn
from functools import lru_cache
from nltk.tag.perceptron import PerceptronTagger
import matplotlib.pyplot as plt
import seaborn as sns
# Load data; products.json is expected to map 'data' to a list of product
# texts and 'target' to their category labels (see the fit call below)
with open('products.json', encoding='utf-8') as f:
    dataset = json.load(f)
# Initialize lemmatizer
wnl = WordNetLemmatizer()
# Load the pretrained averaged perceptron tagger
tagger = PerceptronTagger()
# Map the first letter of a Penn Treebank tag (noun, verb, adverb,
# adjective) to the corresponding WordNet POS constant
tags = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}
# Memoize POS tagging and lemmatization, both of which are expensive
lemmatize_mem = lru_cache(maxsize=10000)(wnl.lemmatize)
tagger_mem = lru_cache(maxsize=10000)(tagger.tag)
# POS tag each token and lemmatize it, skipping stop words
def tokenizer(text):
    for token in wordpunct_tokenize(text):
        if token not in ENGLISH_STOP_WORDS:
            # Tag the token in isolation; a one-element tuple keeps the
            # argument hashable so lru_cache can memoize the call
            tag = tagger_mem((token,))
            # Penn Treebank tags are multi-character ('NN', 'VBG', ...),
            # so look up the first letter and default to noun
            yield lemmatize_mem(token, tags.get(tag[0][1][0], wn.NOUN))
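# A minimal sanity check, assuming the NLTK wordnet and tagger models are
# installed; exact output depends on the tagger, but a lowercased input
# like this should drop the stop word 'are' and lemmatize the rest:
#   list(tokenizer('cats are running'))  ->  ['cat', 'run']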
# Pipeline definition: TF-IDF over unigrams and bigrams, fed into a linear
# classifier trained with SGD. Note that the vectorizer lowercases text
# before calling the tokenizer, and stop words are already removed inside
# the tokenizer, so the vectorizer's stop_words parameter is left unset
# (passing it as well triggers a consistency warning in recent sklearn).
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(
        tokenizer=tokenizer,
        ngram_range=(1, 2),
        sublinear_tf=True,  # 1 + log(tf) damps very frequent terms
        min_df=0.00009      # drop terms rarer than this document fraction
    )),
    ('classifier', SGDClassifier(
        alpha=1e-4, n_jobs=-1
    )),
])
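# Optional evaluation sketch before the final fit: cross-validate the
# pipeline and report per-class precision/recall plus a confusion-matrix
# heatmap. The 5-fold split is an illustrative choice, not a given.
predicted = cross_val_predict(pipeline, dataset['data'], dataset['target'], cv=5)
print(classification_report(dataset['target'], predicted))
sns.heatmap(confusion_matrix(dataset['target'], predicted), annot=True, fmt='d')
plt.show()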
# Train on the full dataset before exporting
pipeline.fit(dataset['data'], dataset['target'])
# Serialize the fitted pipeline to disk
with open('model.pkl', 'wb') as filehandler:
    pickle.dump(pipeline, filehandler)
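# Usage sketch for the exported model (assumes this script has been run and
# 'model.pkl' exists; the example product text is made up). Note that
# unpickling needs the tokenizer function importable under the same module
# path, since pickle stores it by reference:
#
#   import pickle
#   with open('model.pkl', 'rb') as f:
#       model = pickle.load(f)
#   print(model.predict(['stainless steel water bottle']))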