-
Notifications
You must be signed in to change notification settings - Fork 38
/
trans.py
276 lines (257 loc) · 9.77 KB
/
trans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import re
import hgtk
import hanja
import mecab
mecab = mecab.MeCab()
from infer import transformer_transliteration as tft
from utils import decide_acronym, read_acronym
from utils import readNumberKor, readNumberEng, readNumber, readBigNum, readOnlyNum
from utils import dataset, small, big, real_latin
from utils import puncs, symbols, sym_han, sym_pro, count_symbols, count_sym_han
## Dictionary from https://github.com/muik/transliteration/tree/master/data/source
# Rebinds the imported `dataset` name from the loader callable to the data it returns.
dataset = dataset()
# Lookup table consulted by trans_latin: key is field 0 of each entry with runs of
# spaces collapsed and lower-cased, value is field 1 with runs of spaces collapsed.
data_dict = {re.sub(' +', ' ',dataset[i][0]).lower(): re.sub(' +', ' ',dataset[i][1]) for i in range(len(dataset))}
## Josa set for post-processing
# josa_o and josa_x are index-aligned pairs. decide_josa picks the josa_o form when the
# preceding syllable has a non-empty third (final) sound and the josa_x form otherwise.
josa_o = ['은','이','과','을','이다']
josa_x = ['는','가','와','를','다']
## Splits a sentence into chunks using decomposer
def align_particles(sentence):
    """Group the tagger output of *sentence* by whitespace-separated word.

    Args:
        sentence: Raw input string.

    Returns:
        A tuple ``(words, particles, chunks)`` where ``words`` is
        ``sentence.split()``, ``particles`` is the flat list returned by
        ``mecab.pos(sentence)`` and ``chunks`` is a list holding, for each
        word, the list of tagger entries that were assigned to it.
    """
    words = sentence.split()
    #particles = tagger.parse(sentence) ## Available if another tagger is used
    particles = mecab.pos(sentence)
    chunks = []
    if not particles:
        return words, particles, chunks

    word_idx = 0   # index of the whitespace word currently being filled
    surfaces = []  # surface forms collected for the current word
    current = []   # tagger entries collected for the current word
    last = len(particles) - 1
    for idx, particle in enumerate(particles):
        surfaces.append(particle[0])
        current.append(particle)
        if idx == last:
            # Nothing follows: flush whatever has been collected.
            chunks.append(current)
            continue
        # Look one entry ahead: if appending its surface form no longer
        # matches the current word, the current word is complete.
        lookahead = "".join(surfaces) + particles[idx + 1][0]
        if lookahead not in words[word_idx]:
            chunks.append(current)
            word_idx += 1
            surfaces = []
            current = []
    return words, particles, chunks
## Generates a placeholder for the chunk sequence
def info_to_word(chunks):
    """Return a nested list holding only element 0 of every entry in *chunks*.

    The result mirrors the shape of ``chunks`` (list of lists) and is built
    from fresh lists, so it can be modified without touching the input.
    """
    return [[entry[0] for entry in chunk] for chunk in chunks]
## Transliterates numbers to hangul
def trans_number(n,
                 prev_term,
                 next_term):  ## Context-given number reading
    """Choose a reading for the number *n* from its neighbouring terms.

    Args:
        n: The number to read (compared against 10 and 99999 below).
        prev_term: Term used as left context.
        next_term: Term used as right context.

    Returns:
        The result of whichever ``readNumber*`` helper the context selects.
    """
    # Hangul on both sides.
    if hgtk.checker.is_hangul(prev_term) and hgtk.checker.is_hangul(next_term):
        return readNumberKor(n, next_term)

    # Latin on at least one side.
    if real_latin(prev_term) or real_latin(next_term):
        korean_reading = hgtk.checker.is_hangul(next_term) and n > 10
        return readNumberKor(n, next_term) if korean_reading else readNumberEng(n)

    # Neither Hangul-on-both-sides nor Latin ("maybe hanja" in the original note).
    if prev_term in symbols or next_term in symbols:
        return readOnlyNum(n)
    return readBigNum(n) if n > 99999 else readNumber(n)
## Transliterates symbols to hangul
def trans_symbol(symbol,
                 prev_term,
                 next_term):
    """Return the reading for *symbol* given its neighbouring terms.

    Symbols listed in ``count_symbols`` always map to the matching entry of
    ``count_sym_han``. Any other symbol is looked up by its position in
    ``symbols``: in ``sym_han`` when a neighbour is Hangul or a digit string,
    in ``sym_pro`` when a neighbour is Latin. An empty string is returned
    when the previous term is in ``puncs`` or no rule applies.
    """
    if symbol in count_symbols:
        return count_sym_han[count_symbols.index(symbol)]
    if prev_term in puncs:
        return ''

    korean_or_numeric = (
        hgtk.checker.is_hangul(prev_term)
        or hgtk.checker.is_hangul(next_term)
        or prev_term.isdigit()
        or next_term.isdigit()
    )
    if korean_or_numeric:
        return sym_han[symbols.index(symbol)]
    if real_latin(prev_term) or real_latin(next_term):
        return sym_pro[symbols.index(symbol)]
    return ''
## Transliterates hanja to hangul
def trans_hanja(term):  ## Complementary check
    """Return *term* passed through ``hanja.translate`` in 'substitution' mode."""
    converted = hanja.translate(term, 'substitution')
    return converted
## Transliterates English to hangul
def trans_latin(term):  ## Rule and training hybrid transliteration
    """Transliterate a Latin-script *term*.

    Lookup order: the ``data_dict`` table (keyed by the lower-cased term),
    then ``read_acronym`` when ``decide_acronym`` accepts the term, and
    finally the ``tft`` model as a fallback.
    """
    key = term.lower()
    if key in data_dict:
        return data_dict[key]
    if decide_acronym(term):
        return read_acronym(term)
    return tft(term)  ## Tentative
## Assigns transliteration result given the position of eojeol
def decide_context(term,
                   chunks,
                   eojeol,
                   i, j):
    """Pick the left and right context terms for the morpheme at ``chunks[i][j]``.

    The left context is the previous morpheme of the same eojeol, or the last
    morpheme of the previous eojeol when ``j`` is the first position. The
    right context is the next morpheme of the same eojeol, or the first
    morpheme of the next eojeol when ``j`` is the last position. When only
    one side exists, that neighbour is returned for both sides.

    Args:
        term: The term being resolved; only used when it has no neighbour.
        chunks: Nested list of terms (eojeols of morphemes).
        eojeol: The eojeol containing the term; only its length is used.
        i: Index of the eojeol in ``chunks``.
        j: Index of the morpheme inside the eojeol.

    Returns:
        A ``(prev_term, next_term)`` pair. NOTE: when the term is the only
        morpheme of the only eojeol, the bare result of ``readNumber(term)``
        is returned instead of a pair.
    """
    before = None
    after = None

    if j > 0:
        before = chunks[i][j - 1]
    elif i > 0:
        before = chunks[i - 1][-1]

    if j < len(eojeol) - 1:
        after = chunks[i][j + 1]
    elif i < len(chunks) - 1:
        after = chunks[i + 1][0]

    if before is None and after is None:
        return readNumber(term)
    if before is None:  ## Truly first morpheme
        return after, after
    if after is None:  ## Truly last morpheme
        return before, before
    return before, after
## Provides an eojeol-level morpheme-wise
def trans_eojeol(chunks,
                 chunks_4num,
                 metadata,
                 if_num=True,
                 if_sym=True,
                 if_han=True,
                 if_eng=True,
                 if_puncs=True,
                 if_else=True):
    """Transliterate every morpheme of *chunks* in place, eojeol by eojeol.

    Args:
        chunks: Nested list of terms; modified in place and returned.
        chunks_4num: Untouched copy of the same terms, used to look up the
            neighbours of numbers and symbols.
        metadata: Not used by this function.
        if_num: Convert decimal numbers when true, keep them otherwise.
        if_sym: Convert symbols (except a sentence-initial one) when true.
        if_han: Convert hanja when true.
        if_eng: Convert Latin-script terms when true.
        if_puncs: Keep punctuation when true, drop it otherwise.
        if_else: Keep unclassified terms when true, drop them otherwise.

    Returns:
        The same ``chunks`` object.
    """
    for i, eojeol in enumerate(chunks):
        for j, term in enumerate(eojeol):
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscripts that int() cannot parse.
            if term.isdecimal():
                if if_num:
                    number = int(term)
                    if len(chunks_4num) == 1 and len(eojeol) == 1:
                        # A number that is the whole sentence has no
                        # neighbour; decide_context yields a single reading
                        # here, not a pair, so read the number directly.
                        eojeol[j] = readNumber(number)
                    else:
                        prev_term, next_term = decide_context(number, chunks_4num, eojeol, i, j)
                        eojeol[j] = trans_number(number, prev_term, next_term)  ## Reflects context
            elif term in symbols + count_symbols and i + j > 0:  ## Symbols not sentence-first
                if if_sym:
                    prev_term, next_term = decide_context(term, chunks_4num, eojeol, i, j)
                    eojeol[j] = trans_symbol(term, prev_term, next_term)
            elif hgtk.checker.is_hanja(term):
                if if_han:
                    eojeol[j] = trans_hanja(term)  ## Double check
            elif real_latin(term):
                if if_eng:
                    eojeol[j] = trans_latin(term)  ## Transliteration (or bypassing)
            elif term in puncs:
                if not if_puncs:
                    eojeol[j] = ''
            elif hgtk.checker.is_hangul(term):
                pass  ## Bypassing by default
            elif not if_else:
                eojeol[j] = ''
    return chunks
## Modifies josa for some preceding chunks
def decide_josa(context,
                term):
    """Return the particle *term* adjusted to follow *context*.

    When ``context`` is non-empty Hangul, its last syllable is decomposed;
    a ``josa_o`` particle is swapped for its ``josa_x`` counterpart if the
    third sound is empty, and a ``josa_x`` particle for its ``josa_o``
    counterpart if it is not. In every other case ``term`` is returned as is.
    """
    if not hgtk.checker.is_hangul(context) or context == '':
        return term

    has_final = hgtk.letter.decompose(context[-1])[2] != ''  # If third sound is non-empty
    if term in josa_o and not has_final:
        return josa_x[josa_o.index(term)]
    if term in josa_x and has_final:
        return josa_o[josa_x.index(term)]
    return term
## Returns the sentence with modified josa
def check_josa(chunks,
               chunks_4num,
               metadata):
    """Re-select particles that follow a term changed by transliteration.

    Args:
        chunks: Transliterated terms; modified in place and returned.
        chunks_4num: The terms as they were before transliteration.
        metadata: Per-term tagger entries; element 1 of each entry is the tag
            string, of which the part before the first comma is inspected.

    Returns:
        The same ``chunks`` object.
    """
    for i, eojeol in enumerate(chunks):
        for j, term in enumerate(eojeol):
            tag = metadata[i][j][1].split(',')[0].lower()
            # Only tags starting with 'j' (functional particles) that are in
            # one of the josa lists are candidates.
            if tag[0] != 'j' or not (term in josa_o or term in josa_x):
                continue
            # Preceding morpheme inside the same eojeol was rewritten.
            if j > 0 and eojeol[j - 1] != chunks_4num[i][j - 1]:
                eojeol[j] = decide_josa(eojeol[j - 1], term)
            # Particle opens the eojeol: compare the previous eojeol's last term.
            if i > 0 and j == 0 and chunks[i - 1][-1] != chunks_4num[i - 1][-1]:
                eojeol[j] = decide_josa(chunks[i - 1][-1], term)
    return chunks
## Cleanses the sentence with non-hangul terms
def leftword(chunks):
    """Clean up the remaining non-Hangul terms of *chunks* in place.

    Latin terms are replaced by ``read_acronym(term)``; any other term that
    is neither Hangul nor listed in ``puncs`` is replaced by an empty string.
    Returns the same ``chunks`` object.
    """
    for eojeol in chunks:
        for pos, term in enumerate(eojeol):
            if real_latin(term):
                eojeol[pos] = read_acronym(term)
            elif not hgtk.checker.is_hangul(term) and term not in puncs:
                eojeol[pos] = ''
    return chunks
## Final function
def sentranslit(sentence,
                if_num=True,
                if_sym=True,
                if_han=True,
                if_eng=True,
                if_puncs=True,
                if_else=True):
    """Transliterate the non-Hangul parts of *sentence* and return the result.

    When ``if_han`` is true the sentence is first passed through
    ``hanja.translate``. A sentence that ``hgtk`` then reports as Hangul is
    returned unchanged. Otherwise it is chunked, transliterated term by term
    according to the ``if_*`` switches (forwarded to ``trans_eojeol``),
    particle-corrected, and re-joined with single spaces between eojeols.
    """
    if if_han:
        sentence = hanja.translate(sentence, 'substitution')  ## For word-initial rule
    if hgtk.checker.is_hangul(sentence):
        return sentence

    ## Only reached if the sentence contains non-Hangul terms
    _words, _particles, metadata = align_particles(sentence)
    chunks = info_to_word(metadata)
    chunks_4num = info_to_word(metadata)  # pristine copy for context lookups
    converted = trans_eojeol(chunks, chunks_4num, metadata,
                             if_num, if_sym, if_han, if_eng, if_puncs, if_else)
    converted = check_josa(converted, chunks_4num, metadata)
    return ' '.join(''.join(eojeol) for eojeol in converted)
# The triple-quoted blocks in this section are bare string statements with no
# runtime effect; they only record which import belongs to which backend.
''' ## If KoG2P directory is cloned
from KoG2P.g2p import runKoG2P
'''
# The KoG2P import is disabled here, so `runKoG2P` is not bound at module level.
#from KoG2P.g2p import runKoG2P
''' ## If G2pK is successfully installed
from g2pk import G2p
g2p = G2p()
'''
# G2pK is imported and instantiated when this module is imported.
from g2pk import G2p
g2p = G2p()
def mixed_g2p(sentence,
              out_type='eng'):
    """Convert *sentence* to a phonetic form with the selected backend.

    Args:
        sentence: Input string.
        out_type: ``'kor'`` transliterates the sentence with ``sentranslit``
            and passes the result to the module-level ``g2p`` object. Any
            other value (default ``'eng'``) runs ``runKoG2P`` on the raw
            sentence with the ``KoG2P/rulebook.txt`` rulebook.

    Returns:
        Whatever the selected backend returns.

    Raises:
        ImportError: On the KoG2P path when the ``KoG2P`` package cannot be
            imported.
    """
    if out_type == 'kor':
        # The module defines no `trans`; `sentranslit` is its
        # sentence-level transliteration entry point.
        return g2p(sentranslit(sentence))
    # Imported here because the module-level KoG2P import is commented out;
    # this keeps the 'kor' path usable without KoG2P.
    from KoG2P.g2p import runKoG2P
    return runKoG2P(sentence, 'KoG2P/rulebook.txt')