-
Notifications
You must be signed in to change notification settings - Fork 0
/
basic_es_queries.py
122 lines (104 loc) · 4.55 KB
/
basic_es_queries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from elasticsearch import Elasticsearch
from collections import defaultdict
from utils import time_function, _pprint
from test.test_words import df_to_list, emotion_to_df
from addict import Dict
# Module-level Elasticsearch client shared by every helper below.
es = Elasticsearch() # assumes localhost:9200
def _check_all(addendum=''):
    '''Useful for small indexes: list every document in the book index.'''
    list_everything = {'query': {'match_all': {}}}
    return es.search(index='book-index' + addendum, body=list_everything)
def _num_docs(addendum=''):
    '''Return the total number of documents in the book index.'''
    count_body = {'query': {'match_all': {}}}
    response = es.count(index='book-index' + addendum, body=count_body)
    return response.get('count')
def _tail(num_docs_tail, addendum=''):
    '''Get the tail, i.e. the last `num_docs_tail` documents in terms of id.

    Fix: the document count is now taken from the same index that is
    searched ('book-index' + addendum). Previously `_num_docs()` was called
    with no argument, so for any non-default `addendum` the cutoff was
    computed against the wrong (base) index.

    NOTE(review): the range filter assumes `_id` values are numeric and
    densely assigned 1..N — confirm against the indexing code.
    '''
    total = _num_docs(addendum)
    return es.search(
        index='book-index' + addendum,
        body={
            'query': {
                'bool': {
                    'should': [{'match_all': {}}],
                    'filter': [
                        {'range': {'_id': {'gt': total - num_docs_tail}}}
                    ],
                }
            }
        })
def _delete_all(addendum=''):
    '''In case we need to wipe the index: delete every document in it.'''
    match_everything = {'query': {'match_all': {}}}
    es.delete_by_query(index='book-index' + addendum, body=match_everything)
def _search(body, addendum=''):
    '''Thin wrapper: run an arbitrary query `body` against the book index.'''
    target_index = 'book-index' + addendum
    return es.search(index=target_index, body=body)
# _pprint(_check_all('-page'), depth=6) # 5 depth does not display text..
def page_get_book(book_name):
    '''Fetch all pages of the book named `book_name` (lowercase title).

    Raises:
        ValueError: if `book_name` is not among the indexed titles.
            (Previously this was a bare `assert`, which is silently
            stripped when Python runs with -O, letting bad names through.)
    '''
    known_titles = [title.lower() for title in page_get_titles()]
    if book_name not in known_titles:
        raise ValueError('unknown book: %r' % (book_name,))
    body = Dict()
    body.query.bool.must.match = {"_name": book_name}
    return es.search(index='page-index', body=body)
def page_get_titles():
    '''Return every distinct book title stored in the page index.'''
    body = Dict()
    body.size = 0
    body.aggs.titles.terms.field = '_name.keyword'
    # bucket size far above the document count, effectively "no limit"
    body.aggs.titles.terms.size = 10 * page_num_docs()
    response = es.search(index='page-index', body=body)
    buckets = response.get('aggregations').get('titles').get('buckets')
    return [bucket.get('key') for bucket in buckets]
def page_book_get_between(low, high):
    '''Find pages whose `_num_pages` field lies in [low, high] inclusive.'''
    body = Dict()
    # addict builds the nested range query from chained attribute access
    body.query.bool.must.range._num_pages.gte = low
    body.query.bool.must.range._num_pages.lte = high
    return es.search(index='page-index', body=body)
def page_num_docs():
    '''Count all documents in the page index.'''
    match_all_body = {'query': {'match_all': {}}}
    return es.count(index='page-index', body=match_all_body).get('count')
def check_get_book(name):
    '''Sanity check: the hit count for a book should equal its page count.'''
    res = page_get_book(name)
    hits_section = res.get('hits')
    _pprint(hits_section.get('hits'), depth=2)
    total_hits = hits_section.get('total')
    first_source = hits_section.get('hits')[0].get('_source')
    print('is the number of hits (=', total_hits,
          ') equal to the number of pages (=',
          first_source.get('_num_pages'), ')')
def page_match_emotion(emotion, slice=(0, None)):
    '''Fetch synonyms for `emotion` from ./data/words/<emotion>.csv and run
    a bool/should match query to find pages using any of those synonyms.

    Args:
        emotion: name of the emotion (and of its synonym csv file).
        slice: (start, stop) tuple restricting which synonyms are used;
            `stop` may be None for "to the end". The name shadows the
            builtin `slice` but is kept for caller compatibility.

    Fix: a `(start, None)` tuple previously ignored `start` entirely and
    used every synonym; Python slicing handles a None stop natively, so
    the terms are now always sliced as `terms[start:stop]`. The loop
    variable in the clause-building comprehension no longer shadows the
    `emotion` parameter.
    '''
    emotion_terms = df_to_list(emotion_to_df(emotion))
    emotion_terms = emotion_terms[slice[0]:slice[1]]
    body = Dict()
    # one `should` clause per synonym: a page matches if it uses any term
    body.query.bool.should = [{'match': {'_page': term}}
                              for term in emotion_terms]
    return es.search(index='page-index', body=body)
def get_hits(result):
    '''Extract the list of hit documents from a raw search response.'''
    hits_section = result.get('hits')
    return hits_section.get('hits')
### Left to code: bit on filtering by emotion
if __name__ == '__main__':
    # res = page_get_titles()
    # check_get_book('hamlet')
    res = page_match_emotion('fear')
    for hit in get_hits(res):
        source = hit.get('_source')
        print(source.get('_name'), source.get('_page_number'))
    # _pprint(get_hits(page_get_book('pygmalion')), depth=3)
    _pprint(res)
    # print(page_num_docs())