forked from lloistborn/tsm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
40 lines (31 loc) · 1.13 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import string
from ecsstemmer import EcsStemmer
class Preprocess():
DELIMETERS = "\"", "\'", "{", "}", "(", ")", "[", "]", ">", "<", "_", "=", "+", "|", "\\", ":", ";", " ", ",", ".", "/", "?", "~", "!", "@", "#", "$", "%", "^", "&", "*", "\r", "\n", "\t", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"
ecs = EcsStemmer()
def prep_text(self, text):
texts = self.tokenizing(text)
texts = self.stopwordremoval(texts)
texts = self.ecs.doStemming(texts)
return texts
# tokenizing, casefolding(lowercase)z
def tokenizing(self, text):
regexPattern = '|'.join(map(re.escape, self.DELIMETERS))
texts = re.split(regexPattern, text.lower())
return texts
# exclude = set(string.punctuation)
# return ''.join(x.lower() for x in text if x not in exclude)
# removing stopword
def stopwordremoval(self, text):
handle = open('E:\\Workspace\\corpus\\stopwords.txt', 'r+')
var = handle.read()
handle.close()
LIST_STOP_WORD = var.split()
result = []
# result = list(set(text).difference(set(LIST_STOP_WORD)))
for txt in text:
if txt:
if txt not in LIST_STOP_WORD:
result.append(txt)
return result