-
Notifications
You must be signed in to change notification settings - Fork 0
/
TestLSH.py
48 lines (43 loc) · 1.55 KB
/
TestLSH.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# -*- coding: utf-8 -*-
import Mongodb as mongo
from datasketch import MinHashLSHForest, MinHash, MinHashLSH
from KeywordInput import get_top_n_words_tf
import time
from bson.objectid import ObjectId
# Handle for the "documents" collection; the code below uses it both to
# fetch the documents to index/query and to look up candidates by _id.
docs_col = mongo.get_colection("documents")
# Handle for the "vnmeseTest" collection; not referenced in the visible code.
test = mongo.get_colection("vnmeseTest")
def query_candidates(doc, topn, lsh_forest=None):
    """Print the titles of the indexed documents most similar to ``doc``.

    A MinHash signature is built from the 15 keywords that
    ``get_top_n_words_tf`` extracts from ``doc['content']`` and is used to
    query an LSH forest for up to ``topn`` candidate keys.  The query
    document's title is printed first, then the title of every candidate
    that can be found in ``docs_col``, then a separator line.

    Args:
        doc: Mapping providing at least the ``'content'`` and ``'title'``
            keys.
        topn: Maximum number of candidates requested from the forest.
        lsh_forest: Forest to query.  When omitted, the module-level
            ``forest`` (created by the ``__main__`` section) is used, so
            existing two-argument calls keep working.

    Returns:
        None.  All results are written to standard output.
    """
    index = forest if lsh_forest is None else lsh_forest

    # Same number of permutations as the signatures stored in the forest.
    signature = MinHash(num_perm=128)
    for word in get_top_n_words_tf(doc['content'], 15):
        signature.update(word.encode('utf8'))

    candidate_keys = index.query(signature, topn)

    print(doc['title'])
    for key in candidate_keys:
        match = docs_col.find_one({"_id": ObjectId(str(key))})
        if match is None:
            # find_one() yields None when no document has this _id; skip
            # the stale key instead of failing on match['title'].
            continue
        print(match['title'])
    print("=====================")
if __name__ == '__main__':
    # Build the LSH forest from the keyword lists of the English documents.
    # ``forest`` has to stay a module-level name: query_candidates() reads it.
    forest = MinHashLSHForest(num_perm=128)
    documents_en = docs_col.find({"lang": 'english'})
    for item in documents_en:
        keywords = item.get("keyword")
        if keywords is None:
            # Nothing to hash for this document; skip it rather than abort
            # the whole index build with a KeyError.
            continue
        minhash = MinHash(num_perm=128)
        for k in keywords.split(","):
            minhash.update(k.encode("utf-8"))
        forest.add(str(item["_id"]), minhash)
    # Make the added keys searchable before running any query.
    forest.index()

    # Query the forest with every Vietnamese document and time the run.
    documents_vi = docs_col.find({"lang": 'vietnamese'})
    # perf_counter() is monotonic, so the measured interval is not affected
    # by system clock adjustments the way time.time() is.
    start_time = time.perf_counter()
    for doc in documents_vi:
        query_candidates(doc, 5)
    elapsed_time = time.perf_counter() - start_time
    print(elapsed_time)