-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsegment.py
121 lines (98 loc) · 2.89 KB
/
segment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
'''
HMM-based word segmentation using B/M/E/S character tags.

Created on 2017-04-29
@author: heguofeng
'''
# encoding=utf-8
from algorithm.hmm import HMModel
from segment.extra import seg_stop_words
# Tag set assigned to HMMSegger.states, one tag per character:
# 'B' = first character of a word, 'M' = inner character,
# 'E' = last character, 'S' = single-character word.
STATES = {'B', 'M', 'E', 'S'}
def get_tags(src):
    """Return the B/M/E/S tag sequence for a single word.

    One tag is produced per character of ``src``:

    * empty word          -> ``[]``
    * one character       -> ``['S']``
    * two or more         -> ``['B', 'M', ..., 'M', 'E']``

    Args:
        src: The word to tag (any sized sequence of characters).

    Returns:
        A new list of tags with exactly ``len(src)`` entries.
    """
    length = len(src)
    if length == 0:
        # An empty word (e.g. produced by splitting on consecutive spaces)
        # has no characters and therefore no tags.
        return []
    if length == 1:
        return ['S']
    # A multi-character word always closes with 'E'; 'S' is reserved for
    # single-character words, which is also how cut_sent decodes it.
    return ['B'] + ['M'] * (length - 2) + ['E']
def cut_sent(src, tags):
    """Split ``src`` into words according to a per-character tag sequence.

    Tags are read left to right: 'B' opens a word, 'M' continues it, 'E'
    closes it and 'S' is a one-character word. Ill-formed sequences are
    repaired instead of dropping or duplicating characters:

    * a trailing 'B'/'M' is treated as 'S' when the previous tag closed a
      word (or there is no previous tag), otherwise as 'E';
    * 'B' or 'S' while a word is still open closes that word first;
    * 'M' or 'E' with no open word starts a new word at that position.

    Args:
        src: The sentence; must support ``len`` and slicing.
        tags: Sequence of tags, one per character of ``src``. It is not
            modified.

    Returns:
        A list of words in order, ``[]`` for empty input, or ``None`` when
        ``tags`` and ``src`` differ in length.
    """
    if len(tags) != len(src):
        return None
    if not tags:
        return []

    # Work on a copy so the caller's tag list is never altered.
    tags = list(tags)
    closing = {'S', 'E'}
    if tags[-1] not in closing:
        if len(tags) == 1 or tags[-2] in closing:
            tags[-1] = 'S'  # for tags: r".*(S|E)(B|M)" and a lone B/M
        else:
            tags[-1] = 'E'  # for tags: r".*(B|M)(B|M)"

    word_list = []
    start = None  # index where the currently open word begins, if any
    for i, tag in enumerate(tags):
        if tag == 'S':
            if start is not None:
                word_list.append(src[start:i])  # for tags: r"BM*S"
                start = None
            word_list.append(src[i])
        elif tag == 'B':
            if start is not None:
                word_list.append(src[start:i])  # for tags: r"BM*B"
            start = i
        elif tag == 'E':
            if start is None:
                start = i  # 'E' without a preceding 'B'/'M'
            word_list.append(src[start:i + 1])
            start = None
        elif tag == 'M':
            if start is None:
                start = i  # 'M' without a preceding 'B'
    return word_list
class HMMSegger(HMModel):
    """Word segmenter that tags every character with B/M/E/S via HMModel.

    Typical use: ``load_data(path)`` -> ``train()`` -> ``cut(sentence)``.
    The training corpus is read line by line; words on a line are separated
    by single spaces.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to HMModel and install the B/M/E/S tag set."""
        super(HMMSegger, self).__init__(*args, **kwargs)
        self.states = STATES
        # Iterable of corpus lines; a file handle once load_data() was called.
        self.data = None

    def _close_data(self):
        """Close ``self.data`` if it is closable and reset it to ``None``."""
        close = getattr(self.data, "close", None)
        if callable(close):
            close()
            self.data = None

    def load_data(self, filename):
        """Open ``filename`` (UTF-8 text) as the corpus for the next train().

        A handle left over from an earlier call is closed once the new file
        has been opened successfully. The new handle is closed by train().
        """
        handle = open(filename, 'r', encoding="utf-8")
        self._close_data()
        self.data = handle

    def train(self):
        """Feed every non-empty corpus line to ``do_train``.

        For each line, the observation sequence (characters) and the state
        sequence (B/M/E/S tags) are built from the same list of words, so
        the two always have equal length. Words found in ``seg_stop_words``
        and empty words caused by consecutive spaces are left out of both.
        Lines with nothing left after filtering are skipped.

        The corpus handle is closed when training finishes or fails, so
        load_data() has to be called again before another train().

        Raises:
            TypeError: if no corpus has been loaded.
        """
        if not self.inited:
            self.setup()
        if self.data is None:
            raise TypeError("no training data loaded; call load_data() first")

        try:
            for line in self.data:
                # pre processing
                line = line.strip()
                if not line:
                    continue

                observes = []
                states = []
                for word in line.split(" "):  # split words by whitespace
                    if not word or word in seg_stop_words:
                        continue
                    observes.extend(word)  # one observation per character
                    states.extend(get_tags(word))

                if not observes:
                    continue
                # resume train
                self.do_train(observes, states)
        finally:
            self._close_data()

    def cut(self, sentence):
        """Segment ``sentence`` into a list of words.

        Best effort: when prediction or decoding fails, or the predicted
        tags do not match the sentence length, the unsegmented ``sentence``
        is returned unchanged.
        """
        try:
            tags = self.do_predict(sentence)
            words = cut_sent(sentence, tags)
        except Exception:
            # Deliberate fallback; KeyboardInterrupt/SystemExit still propagate.
            return sentence
        return sentence if words is None else words

    def test(self):
        """Print the segmentation of a few sample sentences, one word per line."""
        cases = [
            "我来到北京清华大学",
            "长春市长春节讲话",
            "我们去野生动物园玩",
            "我只是做了一些微小的工作",
        ]
        for case in cases:
            result = self.cut(case)
            for word in result:
                print(word)
            print('')