-
Notifications
You must be signed in to change notification settings - Fork 0
/
similar_chinese_search.py
109 lines (90 loc) · 2.44 KB
/
similar_chinese_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from pywubi import wubi
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def letter2gram(text):
    """Split *text* into overlapping two-character grams.

    The text is lower-cased, every run of non-word characters (regex ``\\W+``)
    is collapsed into a single ``_``, and one ``_`` is added at each end so
    that word boundaries take part in the bigrams.  Only pairs made entirely
    of ``a``-``z`` and ``_`` are kept; a pair containing any other character
    (a digit, for instance) is skipped.

    Args:
        text: The string to split.

    Returns:
        A list of two-character strings in order of appearance, or an empty
        list when *text* is empty.
    """
    if len(text) == 0:
        return []
    normalized = re.sub(r'\W+', '_', text.lower())
    padded = '_' + normalized + '_'
    # The capture group sits inside a zero-width lookahead so consecutive
    # matches may overlap: every position contributes its own bigram.
    return re.findall(r'(?=([a-z_]{2}))', padded)
def get_letter_bigram_map():
    """Build the lookup table from every bigram to its vector index.

    The alphabet is the 26 lowercase ASCII letters followed by ``_``.  The
    bigram formed by the characters at alphabet positions ``i`` and ``j`` is
    given index ``i * 27 + j``, so ``'aa'`` maps to 0 and ``'__'`` to 728.

    Returns:
        A dict mapping each of the 729 two-character strings to its index.
    """
    symbols = 'abcdefghijklmnopqrstuvwxyz_'
    base = len(symbols)
    return {
        first + second: row * base + col
        for row, first in enumerate(symbols)
        for col, second in enumerate(symbols)
    }


# Bigram -> index table, built once when the module is loaded.
letter_bigram_map = get_letter_bigram_map()
# Number of distinct bigrams; used as the length of the encoded vectors.
letter_bigram_map_len = len(letter_bigram_map)
def encode_letter_bigram(text):
    """Encode *text* as a vector of letter-bigram counts.

    The items returned by ``wubi(text)`` are joined with single spaces
    (presumably one Wubi input code per character -- confirm against the
    pywubi documentation), the joined string is split into bigrams with
    ``letter2gram``, and each bigram increments its slot in the output.

    Args:
        text: The string to encode.

    Returns:
        A 1-D ``float32`` NumPy array of length ``letter_bigram_map_len``
        holding the number of occurrences of each bigram.
    """
    wubi_codes = ' '.join(wubi(text))
    counts = np.zeros(letter_bigram_map_len, dtype=np.float32)
    for bigram in letter2gram(wubi_codes):
        slot = letter_bigram_map[bigram]
        counts[slot] += 1
    return counts
# Cosine similarity between a misspelled name and the name it resembles.
query = encode_letter_bigram('南冒')
target = encode_letter_bigram('南昌')
print(cosine_similarity(query.reshape(1, -1), target.reshape(1, -1)))

# Candidate names that the queries below are matched against.
districts = [
    '福州', '广州', '北京', '上海', '深圳', '兰州', '长沙',
    '武汉', '沈阳', '南京', '洛阳', '岳阳', '天津', '西安',
    '昆明', '拉萨', '南昌', '浏阳', '南宁', '南阳', '南海',
]
vectors = [encode_letter_bigram(name) for name in districts]

import faiss

# Flat L2 index with one dimension per bigram slot.
index = faiss.IndexFlatL2(letter_bigram_map_len)
print(index.is_trained)
index.add(np.array(vectors))
print(index.ntotal)


def _print_nearest(encoded_query, k=3):
    """Print the *k* entries of ``districts`` that ``index`` ranks nearest.

    Returns the ``(distances, indices)`` pair produced by ``index.search``.
    """
    distances, neighbours = index.search(np.array([encoded_query]), k)
    for position in neighbours[0]:
        print(districts[position])
    return distances, neighbours


# The first lookup reuses the vector encoded for the cosine check above.
print('query:', '南冒')
D, I = _print_nearest(query)
# Output recorded by the original author:
# query: 南冒
# 南昌
# 南宁
# 南京

print('query:', '刘阳')
query = encode_letter_bigram('刘阳')
D, I = _print_nearest(query)
# Output recorded by the original author:
# query: 刘阳
# 浏阳
# 沈阳
# 岳阳
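# Alternative implementation (disabled): the same nearest-neighbour lookup
# using Annoy with angular distance instead of faiss.
# from annoy import AnnoyIndex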
#
#
# index = AnnoyIndex(letter_bigram_map_len, 'angular') # Length of item vector that will be indexed
#
# for i in range(len(vectors)):
# index.add_item(i, vectors[i])
#
# index.build(10)
#
#
# print('query:', '南冒')
# D = index.get_nns_by_vector(query, 3)
# for i in D:
# print(districts[i])
#
# # query: 南冒
# # 南昌
# # 南宁
# # 南京
#
#
# print('query:', '刘阳')
# query = encode_letter_bigram('刘阳')
# D = index.get_nns_by_vector(query, 3)
# for i in D:
# print(districts[i])
# # query: 刘阳
# # 浏阳
# # 沈阳
# # 岳阳