# basic_example.py
from dataclasses import dataclass

from opensearchpy import OpenSearch

from utils import EMBEDDING_DIM, vectorize, TextType


@dataclass
class MinimalChunk:
    content: str


@dataclass
class MinimalDoc:
    title: str
    chunks: list[MinimalChunk]


# Creating documents for testing
DOC_1 = MinimalDoc(
    title="Doc 1",
    chunks=[
        MinimalChunk(content="The weather in Florida is hot and humid"),
        MinimalChunk(content="The weather in Alaska is frigid"),
        MinimalChunk(content="The weather in the Sahara is dry")
    ]
)
DOC_2 = MinimalDoc(
    title="Doc 2",
    chunks=[
        MinimalChunk(content="My favorite animal in the world is the dog"),
        MinimalChunk(content="My favorite animal in the world is the cat"),
        MinimalChunk(content="My favorite animal in Florida is the alligator")
    ]
)
DOC_3 = MinimalDoc(
    title="Doc 3",
    chunks=[
        MinimalChunk(content="The best food is French fries"),
        MinimalChunk(content="The best food is pizza"),
        MinimalChunk(content="The best food is sushi")
    ]
)
DOCUMENTS = [DOC_1, DOC_2, DOC_3]


def get_opensearch_client():
    return OpenSearch(
        hosts=[{"host": "localhost", "port": 9200}],
        http_auth=("admin", "D@nswer_1ndex"),
        use_ssl=True,
        verify_certs=False,
        ssl_show_warn=False
    )


def create_index(client, index_name):
    hnsw_config = {
        "type": "knn_vector",
        "dimension": EMBEDDING_DIM,
        "method": {
            "name": "hnsw",
            "space_type": "cosinesimil",
            "engine": "lucene",
            "parameters": {
                "ef_construction": 200,
                "m": 48
            }
        }
    }
    schema = {
        "settings": {
            "index": {
                "number_of_shards": 1,
                "knn": True
            }
        },
        "mappings": {
            "properties": {
                "title": {"type": "text"},
                "chunks": {
                    "type": "nested",
                    "properties": {
                        "content": {"type": "text"},
                        "embedding": hnsw_config
                    }
                },
            }
        }
    }
    # Wiping just to be sure
    client.indices.delete(index=index_name, ignore=[404])
    print(f"Creating Index {index_name}")
    response = client.indices.create(index=index_name, body=schema)
    print(response)
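    # Optional sanity check (my addition, not in the original script): print
    # the mapping that was actually applied, to confirm the nested knn_vector
    # field registered as expected
    print(client.indices.get_mapping(index=index_name))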


def add_normalization_processor(client):
    pipeline_body = {
        "description": "Normalization for keyword and vector scores",
        "phase_results_processors": [
            {
                "normalization-processor": {
                    "normalization": {
                        "technique": "min_max"
                    },
                    "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": {
                            # 0.3 keyword, 0.7 vector; the weights follow the
                            # order of the sub-queries in the hybrid query
                            "weights": [0.3, 0.7]
                        }
                    }
                }
            }
        ]
    }
    client.search_pipeline.put(id="normalization_step", body=pipeline_body)
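    # Optional sanity check (my addition): confirm the pipeline registered.
    # Assumes this opensearch-py version exposes the search_pipeline
    # namespace, as recent 2.x clients do
    print(client.search_pipeline.get(id="normalization_step"))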


def index_document(client, index_name, document: MinimalDoc):
    doc = {
        "title": document.title,
        "chunks": [
            {
                "content": chunk.content,
                "embedding": vectorize(chunk.content, TextType.PASSAGE)
            } for chunk in document.chunks
        ],
    }
    print(f"Indexing {document.title} document")
    response = client.index(index=index_name, body=doc)
    print(response)


def hybrid_search_v1(client, index_name, query, max_num_results=10):
    query_vector = vectorize(query, TextType.QUERY)
    # We need to combine the nested chunk field with a hybrid query: either
    # the hybrid clause wraps the nested clauses (this function) or the nested
    # clause wraps the hybrid (v2 below). Neither works correctly, but they
    # fail differently, and nested named queries don't behave properly either.
    # Problems with this variant:
    # - Only one chunk comes back and there are no inner hits, so we have no
    #   idea what the individual chunk scores are.
    # - Document scores do look normalized at the extremes: a doc with both
    #   the highest keyword and vector scores gets 1, and one with the lowest
    #   of both gets 0.
    # - The middle scores just seem wrong, though: no reasonable normalization
    #   produces them. (print_top_hits below helps inspect this.)
    search_body_hybrid_outside = {
        "size": max_num_results,
        "query": {
            "hybrid": {
                "queries": [
                    # Chunk Keyword Score
                    {
                        "nested": {
                            "path": "chunks",
                            "query": {
                                "match": {
                                    "chunks.content": {
                                        "query": query,
                                        "_name": "chunk_keyword_score"
                                    }
                                }
                            },
                            "score_mode": "max",
                            "inner_hits": {
                                "size": 20
                            }
                        }
                    },
                    # Chunk Vector Score
                    # This way of nesting apparently doesn't give back the
                    # inner hits; it gives back documents whose scores are
                    # normalized
                    {
                        "nested": {
                            "path": "chunks",
                            "query": {
                                "knn": {
                                    "chunks.embedding": {
                                        "vector": query_vector,
                                        "k": max_num_results,
                                        "_name": "chunk_vector_score"
                                    }
                                }
                            },
                            "score_mode": "max",
                            "inner_hits": {
                                "size": 20
                            }
                        }
                    },
                ]
            }
        }
    }
    response = client.search(
        index=index_name,
        search_pipeline="normalization_step",
        body=search_body_hybrid_outside,
        include_named_queries_score=True
    )
    return response


def hybrid_search_v2(client, index_name, query, max_num_results=10):
    query_vector = vectorize(query, TextType.QUERY)
    # This variant is wrong for different reasons: the inner-hit scores are
    # definitely not normalized, and changing the normalization weights does
    # not change them at all. We can see inner hits here, but still only one
    # chunk per hit.
    search_body_hybrid_inside = {
        "size": max_num_results,
        "query": {
            "nested": {
                "path": "chunks",
                "query": {
                    "hybrid": {
                        "queries": [
                            # Keyword Score
                            {
                                "match": {
                                    "chunks.content": {
                                        "query": query,
                                        "_name": "chunk_keyword_score"
                                    }
                                }
                            },
                            # Chunk Vector Score
                            {
                                "knn": {
                                    "chunks.embedding": {
                                        "vector": query_vector,
                                        "k": max_num_results,
                                        "_name": "chunk_vector_score"
                                    }
                                }
                            },
                        ]
                    }
                },
                "score_mode": "max",
                "inner_hits": {
                    "size": 20
                }
            }
        }
    }
    response = client.search(
        index=index_name,
        search_pipeline="normalization_step",
        body=search_body_hybrid_inside,
        include_named_queries_score=True
    )
    return response
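

# A small helper (my addition, not in the original script) for eyeballing the
# scoring problems described in the comments above. The response layout
# ("hits", "matched_queries", "inner_hits") is the standard OpenSearch shape;
# adjust the keys if your cluster returns something different.
def print_top_hits(response):
    for hit in response["hits"]["hits"]:
        print(f"{hit['_source']['title']}: score={hit['_score']}")
        # Named-query scores, when the server returns them
        if "matched_queries" in hit:
            print(f"  matched_queries: {hit['matched_queries']}")
        # Per-chunk inner hits, when the query shape preserves them
        chunk_hits = (
            hit.get("inner_hits", {})
            .get("chunks", {})
            .get("hits", {})
            .get("hits", [])
        )
        for chunk_hit in chunk_hits:
            print(f"  chunk score={chunk_hit['_score']}: "
                  f"{chunk_hit['_source']['content']}")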


def main():
    client = get_opensearch_client()
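    # Not in the original flow: fail fast if the cluster is unreachable
    # (OpenSearch.ping() returns False rather than raising on connection
    # errors)
    if not client.ping():
        raise RuntimeError("Could not reach OpenSearch at localhost:9200")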
    index_name = "danswer-index"
    create_index(client, index_name)
    add_normalization_processor(client)
    for document in DOCUMENTS:
        index_document(client, index_name, document)
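    # Not in the original flow: refresh the index so the documents indexed
    # above are immediately visible to the searches below (indexing is
    # otherwise near-real-time)
    client.indices.refresh(index=index_name)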
print("Performing hybrid search")
print(hybrid_search_v1(client, index_name, "Florida"))
print(hybrid_search_v2(client, index_name, "Florida"))
if __name__ == "__main__":
main()