# -*- coding: utf-8 -*-
# @Author: pranit
# @Date: 2018-05-14 10:31:38
# @Last Modified by: pranit
# @Last Modified time: 2018-05-15 08:06:52

import string

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import sent_tokenize
from nltk import WordNetLemmatizer
from nltk import pos_tag


class NltkPreprocessor:
	"""Tokenizes, filters, and lemmatizes a document using NLTK."""

	def __init__(self, stopwords = None, punct = None, lower = True, strip = True):
		self.lower = lower
		self.strip = strip
		# Fall back to NLTK's English stopword list and ASCII punctuation
		# when no custom sets are supplied.
		self.stopwords = stopwords or set(sw.words('english'))
		self.punct = punct or set(string.punctuation)
		self.lemmatizer = WordNetLemmatizer()

	def tokenize(self, document):
		tokenized_doc = []

		for sent in sent_tokenize(document):
			for token, tag in pos_tag(wordpunct_tokenize(sent)):
				# Normalize: lowercase, trim whitespace, then strip
				# leading/trailing underscores and digits.
				token = token.lower() if self.lower else token
				token = token.strip() if self.strip else token
				token = token.strip('_0123456789') if self.strip else token

				# Discard stopwords.
				if token in self.stopwords:
					continue

				# Discard pure-punctuation tokens; this also drops tokens
				# left empty by the stripping above, since all() is True
				# for an empty string.
				if all(char in self.punct for char in token):
					continue

				# Lemmatize the token, using its POS tag to pick the
				# WordNet word category.
				lemma = self.lemmatize(token, tag)
				tokenized_doc.append(lemma)

		return ' '.join(tokenized_doc)

	def lemmatize(self, token, tag):
		# Map the first letter of the Penn Treebank POS tag to a WordNet
		# part of speech, defaulting to noun.
		tag = {
			'N': wn.NOUN,
			'V': wn.VERB,
			'R': wn.ADV,
			'J': wn.ADJ
		}.get(tag[0], wn.NOUN)

		return self.lemmatizer.lemmatize(token, tag)
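

# A minimal usage sketch, not part of the original module. It assumes the
# required NLTK data packages have already been fetched, e.g. via
# nltk.download('punkt'), nltk.download('stopwords'), nltk.download('wordnet'),
# and nltk.download('averaged_perceptron_tagger').
if __name__ == '__main__':
	preprocessor = NltkPreprocessor()
	sample = "The striped bats were hanging on their feet and eating 123 bugs."
	# Stopwords ("the", "were", ...), punctuation, and the bare number are
	# dropped; the rest is lemmatized, giving output roughly like:
	# "striped bat hang foot eat bug"
	print(preprocessor.tokenize(sample))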