-
Notifications
You must be signed in to change notification settings - Fork 0
/
query.py
executable file
·55 lines (49 loc) · 1.79 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import gl
import codecs
import time
import jieba
class Query:
    """A single question record together with its annotations.

    The four constructor arguments are stored verbatim as attributes of
    the same name; nothing is copied or validated here.
    """

    def __init__(self, query_origin, answer, tokens, mentions):
        self.query_origin, self.answer = query_origin, answer
        self.tokens, self.mentions = tokens, mentions

    def valid_pid(self, entity):
        """Strict-mode check of a KB entity against the question text.

        The entity is accepted only when every segment yielded by
        ``jieba.cut_for_search(entity)`` occurs in ``self.query_origin``
        and the entity as a whole occurs in it as well.

        Returns:
            True if both conditions hold, False otherwise.
        """
        question = self.query_origin
        segments_found = all(
            segment in question for segment in jieba.cut_for_search(entity)
        )
        return segments_found and entity in question
class QueryList:
def __init__(self):
self.query_list = []
def read_query_file(self, file_name=gl.training_data_split_file_name):
print 'read question file', file_name
t1 = time.time()
fh = codecs.open(file_name, 'r', encoding='utf-8')
lines = fh.readlines()
for i in range(0, len(lines), 5):
# print i
query = lines[i].strip()
ans = lines[i+1].strip()
tokens = lines[i+2].strip().split('\t')
mentions = lines[i+3].strip().split('\t')
query_tmp = Query(query, ans, tokens, mentions)
self.query_list.append(query_tmp)
fh.close()
t2 = time.time()
print 'Finish reading question file ', file_name, ' consumed', t2 - t1, 'seconds'
return self.query_list
# test
if __name__ == '__main__':
ql = QueryList()
ql.read_query_file()
for i in range(20):
print ql.query_list[i].query_origin
print '$$$$'.join(ql.query_list[i].tokens)
print '||||'.join(ql.query_list[i].mentions)
print '----------------------------------'