-
Notifications
You must be signed in to change notification settings - Fork 10
/
InspirationHub_2.py
105 lines (93 loc) · 3.44 KB
/
InspirationHub_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 01 19:47:26 2015
@author: sherlockye
"""
import re
import sys
pinyin_dict = {}
def LoadPinyinDict():
global pinyin_dict
for line in open('pinyin.txt'):
line = line.replace('\n','')
line = line.split(":")
pinyin_dict[unicode(line[0],"utf-8")] = line[1].split(",")
def UniformPinyin(pinyin):
pinyin, number = re.subn('(.)h(.*)', r'\1\2', pinyin)
pinyin, number = re.subn('(.*)ng', r'\1n', pinyin)
return pinyin
def GetPinyin(word):
if not pinyin_dict.has_key(word):
return "-"
res = pinyin_dict[word][0]
res = re.sub("[ā|á|ǎ|à]","a",res)
res = re.sub("[ō|ó|ǒ|ò]","o",res)
res = re.sub("[ē|é|ě|è]","e",res)
res = re.sub("[ī|í|ǐ|ì]","i",res)
res = re.sub("[ū|ú|ǔ|ù]","u",res)
res = re.sub("[ǖ|ǘ|ǚ|ǜ]","ü",res)
return res
def convertPinyin(word):
# word = word.replace(",","")
return map(lambda x:UniformPinyin(GetPinyin(x)),word)
def ValidCheck(wordList_py,dstWord_py):
idx = []
for wordidx,wordpy in enumerate(wordList_py):
inflag = False
for pyidx,py in enumerate(wordpy):
if py in dstWord_py:
dstidx = dstWord_py.index(py)
idx.append([wordidx,pyidx,dstidx])
inflag = True
if not inflag:
return []
return idx
def PrintValidResult(pinyinSet,oriText,wordList):
line = oriText
try:
dstWord_py = convertPinyin(line)
except:
print "Error:%s"%(oriText)
return
ReplaceRule = ValidCheck(pinyinSet,dstWord_py)
if len(ReplaceRule):
result = list(line)
for replace_idx in ReplaceRule:
dst = wordList[replace_idx[0]][replace_idx[1]]
src = result[replace_idx[2]]
line = line.replace(src,dst)
print line,'<<----',oriText
def ReadGushi(filename,pinyinSet,wordList):
for (linenum,oriText) in enumerate(open(filename)):
oriText = unicode(oriText.replace('\n',''),"utf-8")
regexp = "<<(.*)>>:(.*)$"
result = re.findall(regexp,oriText)
for (title,poet) in result:
poet = poet.replace(unicode(',',"utf-8"),'---')
poet = poet.replace(unicode('。',"utf-8"),'---')
poet = poet.replace(unicode('?',"utf-8"),'---')
poet = poet.replace(unicode('、',"utf-8"),'---')
poet = poet.replace(unicode('!',"utf-8"),'---')
poet = poet.replace(unicode(';',"utf-8"),'---')
poet = poet.replace(unicode(':',"utf-8"),'---')
poet = poet.replace(unicode(':',"utf-8"),'---')
poet = poet.replace(unicode('”',"utf-8"),'---')
poet = poet.replace(unicode('“',"utf-8"),'---')
poet = poet.split('---')
for p in poet:
PrintValidResult(pinyinSet,p,wordList)
def main(argv1,argv2):
wordList = [argv1,argv2]
wordList = map(lambda x:unicode(x,"utf-8"),wordList)
pinyinSet = map(lambda x:convertPinyin(x),wordList)
# ReadGushi("tangshi.txt",pinyinSet,wordList)
#ReadGushi("songci.txt",pinyinSet,wordList)
for (linenum,oriText) in enumerate(open('chengyu.txt')):
oriText = unicode(oriText.replace('\n',''),"utf-8")
PrintValidResult(pinyinSet,oriText,wordList)
if __name__ == "__main__":
if len(sys.argv) != 3:
print "python InspirationHub.py input1 input2"
else:
LoadPinyinDict()
main(sys.argv[1],sys.argv[2])