PreProcess.py
import string
import re

import hazm


class PreProcess:
    """Preprocessing pipeline for Persian text, built on the hazm toolkit."""

    def __init__(self) -> None:
        # Characters to strip: ASCII punctuation plus common Persian/Arabic marks.
        persian_punctuations = '''`÷×؛#<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        self.punctuations_list = string.punctuation + persian_punctuations
        # Arabic diacritics (harakat) and the tatweel/kashida elongation mark.
        self.arabic_diacritics = re.compile("""
            ّ | # Tashdid
            َ | # Fatha
            ً | # Tanwin Fath
            ُ | # Damma
            ٌ | # Tanwin Damm
            ِ | # Kasra
            ٍ | # Tanwin Kasr
            ْ | # Sukun
            ـ   # Tatwil/Kashida
        """, re.VERBOSE)
        # A set makes the per-token stopword lookup O(1).
        self.stop_words = set(hazm.stopwords_list())
        self.lemmatizer = hazm.Lemmatizer()

    def remove_punctuations(self, text: str) -> str:
        # Delete every character listed in punctuations_list.
        translator = str.maketrans('', '', self.punctuations_list)
        return text.translate(translator)

    def remove_diacritics(self, text: str) -> str:
        return self.arabic_diacritics.sub('', text)

    def normalize_persian(self, text: str) -> str:
        # Map Arabic letter variants to their Persian equivalents.
        text = re.sub("[إأآا]", "ا", text)
        text = re.sub("ي", "ی", text)
        text = re.sub("ؤ", "و", text)
        text = re.sub("ئ", "ی", text)
        text = re.sub("ة", "ه", text)
        text = re.sub("ك", "ک", text)
        # Replace anything outside the Persian alphabet with a space,
        # then collapse runs of horizontal whitespace into single spaces.
        text = re.sub("[^ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]", " ", text)
        text = re.sub(r"[^\S\n\t]+", " ", text)
        return text

    def remove_repeating_char(self, text: str) -> str:
        # Collapse consecutive repeats of any character, e.g. elongated
        # letters in informal text ("سلاممم" -> "سلام"). Note this also
        # shortens legitimately doubled letters.
        return re.sub(r'(.)\1+', r'\1', text)

    def tokenize(self, text: str) -> list:
        return hazm.word_tokenize(text)

    def remove_stopwords(self, tokens: list) -> list:
        return [token for token in tokens if token not in self.stop_words]

    def lemmatizer_text(self, tokens: list) -> list:
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def pipeline(self, text: str) -> list:
        # Full chain: strip punctuation and diacritics, normalize the script,
        # collapse repeated characters, tokenize, drop stopwords, lemmatize.
        text = self.remove_punctuations(text)
        text = self.remove_diacritics(text)
        text = self.normalize_persian(text)
        text = self.remove_repeating_char(text)
        tokens = self.tokenize(text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.lemmatizer_text(tokens)
        return tokens
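

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the original class: assumes hazm and its
# bundled resources are installed (pip install hazm); the sample sentence
# below is illustrative only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    pre = PreProcess()
    sample = "سلاممم، دنیای زیبای NLP!"

    # Individual steps can be called on their own:
    no_punct = pre.remove_punctuations(sample)          # strips «،» and «!»
    normalized = pre.normalize_persian(no_punct)        # drops the Latin "NLP"
    collapsed = pre.remove_repeating_char(normalized)   # "سلاممم" -> "سلام"

    # Or run the whole chain at once:
    print(pre.pipeline(sample))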