-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: process.py
71 lines (62 loc) · 2.33 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
import os
import nltk
from data import get_docs
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Module-level cache of the English stopword set, built once at import time
# and shared by every Process instance. NOTE: requires the NLTK 'stopwords'
# corpus to have been downloaded (nltk.download('stopwords')).
stop_words = set(stopwords.words('english'))
class Process:
    """
    Preprocesses the text of a single file.

    Pipeline (run via __call__): tokenize and POS-aware lemmatize the raw
    text, then strip non-letter characters and remove English stopwords.

    Attributes:
        filename: base name of the input file, without extension
        filedata: the raw file text on construction; replaced by a list of
            processed tokens once lemmatization()/remove_stopwords() run

    Methods:
        get_wordnet_pos: map a word to a WordNet POS constant
        lemmatization: lemmatize the whole text into a token list
        remove_stopwords: drop stopwords/punctuation, lowercase tokens
    """

    def __init__(self, filepath):
        """
        Args:
            filepath: path to the file which has to be preprocessed
        """
        self.filename = os.path.splitext(os.path.basename(filepath))[0]
        # Use a context manager so the file handle is closed even if
        # read() raises (the original open() leaked the handle).
        with open(filepath, encoding='unicode_escape') as fh:
            self.filedata = fh.read()

    def get_wordnet_pos(self, word):
        """Return the WordNet POS constant for *word*, in the form accepted
        by WordNetLemmatizer.lemmatize(); unknown tags default to NOUN."""
        # nltk.pos_tag yields Penn Treebank tags; their first letter is
        # enough to distinguish adjective/noun/verb/adverb.
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    def lemmatization(self):
        """Tokenize self.filedata (expected to be a str) and replace it with
        a list of POS-aware lemmas."""
        lemmatizer = WordNetLemmatizer()
        # The original wrapped self.filedata in ''.join(...), an identity
        # no-op on a str; tokenize the text directly.
        self.filedata = [lemmatizer.lemmatize(w, self.get_wordnet_pos(w))
                         for w in nltk.word_tokenize(self.filedata)]

    def remove_stopwords(self):
        """
        Remove English stopwords and non-letter characters from the token
        list in self.filedata, lowercasing the surviving tokens.
        """
        # First pass: drop stopwords from the lemmatized tokens
        # (case-insensitive membership test, original casing kept).
        kept = [w for w in self.filedata if w.lower() not in stop_words]
        # Strip everything except letters, spaces, '_' and '-', then
        # re-split. Cleaning can fuse fragments (e.g. "don't" -> "dont"),
        # so a second stopword pass below is still required.
        cleaned = re.sub(r'[^a-zA-Z _-]', "", ' '.join(kept)).split()
        self.filedata = [w.lower() for w in cleaned
                         if w.lower() not in stop_words]

    def __call__(self):
        """
        Run the full preprocessing pipeline when the object is called.

        Returns:
            filename: name of the file
            filedata: list of processed (lemmatized, lowercased,
                stopword-free) words in the file
        """
        self.lemmatization()
        self.remove_stopwords()
        return self.filename, self.filedata