-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathReAIPreprocessor.py
210 lines (170 loc) · 8.12 KB
/
ReAIPreprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import re
from kss import split_sentences
class Preprocessor:
    """Korean-text preprocessor for the ReAI models.

    Cleans text for KoELECTRA (removes/replaces characters missing from the
    model vocabulary, strips parenthesized asides and sub-heading lines) and
    splits it into paragraphs/sentences with kss.
    """

    def __init__(self, reCompiler_dic=None, noToken_dic=None):
        '''
        @Parm
            reCompiler_dic: regexes removing sub-headings / parentheses
            noToken_dic: dictionary handling special characters that are
                missing from the BERT vocabulary
        '''
        # NOTE(review): both parameters are accepted but never used -- the
        # built-in tables from my_compiler()/my_noToken_dic() always win.
        # Kept unchanged for backward compatibility with existing callers.
        self.split_sentences = split_sentences
        self.reCompiler_dic = self.my_compiler()
        self.smallSubjectCompilers = self.reCompiler_dic['small_subject']
        self.bracketCompiler = self.reCompiler_dic['small_bracket']
        self.noBertToken_dic = self.my_noToken_dic()

    def __call__(self, text):
        # Intentionally a no-op; use the make_*_input methods instead.
        pass

    def make_Classifier_input(self, text):
        '''Build adjacent-sentence pairs for the coherence classifier.
        @parm:
            text: raw document text
        @return
            Modelinput: list of [sentence, next_sentence] pairs
            SentenceNumInfo: matching [sent_no, next_sent_no] pairs
            sentencePos_dict: sent_no -> (start, end) offsets in `text`
        '''
        paragraphList, sentencePos_dict = self.split_sentence(text)
        # Clean every sentence; drop sentences that become empty, then
        # drop paragraphs left with no sentences at all.
        preprocessedParagraphList = []
        for paragraph in paragraphList:
            paragraphTemp = []
            for sentNo, sentence in paragraph:
                cleaned = self.preprocessing_Classifier(sentence)
                if cleaned != '':
                    paragraphTemp.append((sentNo, cleaned))
            if paragraphTemp:
                preprocessedParagraphList.append(paragraphTemp)
        # Pair each sentence with its successor within the same paragraph
        # (pairs never cross a paragraph boundary).
        Modelinput = []
        SentenceNumInfo = []
        for preprocessedParagraph in preprocessedParagraphList:
            for (curNo, curSent), (nxtNo, nxtSent) in zip(
                    preprocessedParagraph, preprocessedParagraph[1:]):
                Modelinput.append([curSent, nxtSent])
                SentenceNumInfo.append([curNo, nxtNo])
        return Modelinput, SentenceNumInfo, sentencePos_dict

    def make_Recommend_input(self, text, sentence=True):
        '''Preprocess input for the recommender model.
        @parm:
            text: a sentence (sentence=True) or a paragraph (sentence=False)
        @return
            the preprocessed sentence when sentence=True; the paragraph
            branch is unfinished and returns None (same as the original).
        '''
        if sentence:
            # text is a single sentence
            return self.preprocessing_Recommend(text)
        # text is a paragraph
        textRemoveSmallBracket = self.preprocessing_Recommend(text)
        # BUGFIX: the original passed the undefined name
        # `textRemoveSmallSubject` here, raising NameError at runtime.
        paragraphList, _ = self.split_sentence(textRemoveSmallBracket)
        # TODO: finish the paragraph branch; returning None preserves the
        # original's implicit return value for existing callers.
        return None

    def preprocessing_Classifier(self, text):
        '''Full cleaning pipeline for classifier input:
        special tokens -> parentheses -> sub-headings.'''
        textRemoveSpecialToken = self.process_specialToken(text)
        textRemoveSmallBracket = self.process_smallBrackets(textRemoveSpecialToken)
        textRemoveSmallSubject = self.process_smallSubject(textRemoveSmallBracket)
        return textRemoveSmallSubject

    def preprocessing_Recommend(self, text):
        '''Cleaning pipeline for recommender input:
        special tokens -> parentheses (sub-headings are kept).'''
        textRemoveSpecialToken = self.process_specialToken(text)
        return self.process_smallBrackets(textRemoveSpecialToken)

    ######################################################################
    ######### Functions building the basic preprocessing tables ##########
    ######################################################################
    def my_compiler(self):
        '''Return {'small_subject': [regex, ...], 'small_bracket': regex}.'''
        myCompiler_dic = {}
        # Regexes matching a line that consists solely of a sub-heading,
        # one per bracket/quote style observed in the corpus.
        smallSubjectCompilers = [
            re.compile(r'(^▶▶[^\n(◀◀)]+◀◀$)'),
            re.compile(r'(^◆[^\n(◆)]+◆$)'),
            re.compile(r'(^<[^\n(>)]+>$)'),
            re.compile(r"(^\'[^\n\']+\'$)"),
            re.compile(r'(^\"[^\n\"]+\"$)'),
            re.compile(r'(^\[[^\n\]]+\]$)'),
            re.compile(r'(^“[^\n(”)]+”$)'),
            re.compile(r'(^‘[^\n’]+’$)'),
            re.compile(r'(^`[^\n`]+`$)'),
        ]
        # Regex matching a parenthesized aside, e.g. "(sic)".
        bracketCompiler = re.compile(r"\([^\)]*\)")
        myCompiler_dic['small_subject'] = smallSubjectCompilers
        myCompiler_dic['small_bracket'] = bracketCompiler
        return myCompiler_dic

    def my_noToken_dic(self):
        '''Return the replacement table for special characters absent from
        the BERT (KoELECTRA) vocabulary; '' means "delete".'''
        noBertToken_dic = {}
        noBertToken_dic['⓵'] = '1'
        noBertToken_dic['♬'] = ''  # delete
        noBertToken_dic['➂'] = '3'
        noBertToken_dic['\U000f0853'] = ''  # delete (private-use char)
        noBertToken_dic['⓷'] = '3'
        noBertToken_dic['₃'] = '3'
        noBertToken_dic['¸'] = ''
        noBertToken_dic['Å'] = 'A'
        noBertToken_dic['♪'] = ''
        noBertToken_dic['\u200b'] = ''  # delete (zero-width space)
        noBertToken_dic['#'] = '#'  # NOTE(review): identity mapping, kept as-is
        noBertToken_dic['➀'] = '1'
        noBertToken_dic['➁'] = '2'
        noBertToken_dic['∞'] = '무한'
        noBertToken_dic['⓶'] = '2'
        # BUGFIX: this key was corrupted to three bare quotes (''') in the
        # original source, which is a Python syntax error. Reconstructed
        # as U+02BC MODIFIER LETTER APOSTROPHE -- TODO confirm the exact
        # character against the original repository.
        noBertToken_dic['\u02bc'] = ''  # delete
        noBertToken_dic['Ω'] = 'o'
        noBertToken_dic['⓸'] = '4'
        noBertToken_dic['\uf09e'] = ''  # delete (private-use char)
        noBertToken_dic['˙'] = '‧'
        noBertToken_dic['\U000f0852'] = ''  # delete (private-use char)
        noBertToken_dic['ᄁ'] = '까'
        return noBertToken_dic

    ######################################################################
    ###################### Preprocessing functions #######################
    ######################################################################
    def process_specialToken(self, text):
        '''Delete no used token in KoELECTRA
        @parm:
            text: text to process
        @return
            text: text with special characters missing from KoELECTRA
                removed or replaced
        '''
        # Every key in the table is a single character, so one
        # str.translate pass is equivalent to (and faster than) the
        # original chain of str.replace calls.
        return text.translate(str.maketrans(self.noBertToken_dic))

    def process_smallBrackets(self, text):
        '''Delete small brackets in text
        @parm:
            text: text to process
        @return
            text: text with parenthesized spans removed
        '''
        return self.bracketCompiler.sub('', text)

    def process_smallSubject(self, text):
        '''Delete small subject (sub-heading) lines in text
        @parm:
            text: text to process
        @return
            text with sub-heading-only lines blanked out
        '''
        # Split into paragraphs on newlines; drop lines that were empty
        # before stripping (matches the original filter order).
        paragraphs = [p.strip() for p in text.split('\n') if len(p) > 0]
        cleanedLines = []
        for paragraph in paragraphs:
            for compiler in self.smallSubjectCompilers:
                paragraph = compiler.sub("", paragraph)
            cleanedLines.append(paragraph)
        return "\n".join(cleanedLines)

    def split_sentence(self, text):
        '''Split text into paragraphs and numbered sentences
        @parm:
            text: text to process
        @return
            paragraphTemp: per-paragraph list of (sentence_no, sentence)
            sentencePos_dict: sentence_no -> (start, end) offsets in the
                original `text`; used to locate awkward-sentence model
                results back in the source document
        '''
        # Split into paragraphs on newlines, dropping empty lines.
        paragraphs = [p.strip() for p in text.split('\n') if len(p) > 0]
        # Split each paragraph into sentences (kss) and number them
        # consecutively across the whole document.
        paragraphTemp = []
        num = 0
        for paragraph in paragraphs:
            sentences = self.split_sentences(paragraph)
            numbered = list(zip(range(num, num + len(sentences)), sentences))
            num += len(sentences)
            paragraphTemp.append(numbered)
        # Record each sentence's character span in the original text.
        sentencePos_dict = {}
        cutter = 0
        for paragraph in paragraphTemp:
            for sentNo, sentenceText in paragraph:
                # Search from `cutter` so repeated sentences map to
                # successive occurrences; str.find with a start index
                # replaces the original's '*'-masking trick. Returns -1
                # when not found, same as the original.
                startIdx = text.find(sentenceText, cutter)
                endIdx = startIdx + len(sentenceText)
                sentencePos_dict[sentNo] = (startIdx, endIdx)
                cutter = endIdx
        return paragraphTemp, sentencePos_dict