diff --git a/daras_ai_v2/vector_search.py b/daras_ai_v2/vector_search.py index ccc673c62..c0781de63 100644 --- a/daras_ai_v2/vector_search.py +++ b/daras_ai_v2/vector_search.py @@ -199,8 +199,8 @@ def get_top_k_references( # merge duplicate references uniques: dict[str, SearchReference] = {} - for ref in references: - key = ref["url"] + for ref_key, ref in references: + key = ref_key try: existing = uniques[key] except KeyError: @@ -217,13 +217,20 @@ def vespa_search_results_to_refs( for hit in search_result["root"].get("children", []): try: ref = EmbeddingsReference.objects.get(vespa_doc_id=hit["fields"]["id"]) + ref_key = ref.url except EmbeddingsReference.DoesNotExist: continue if "text/html" in ref.embedded_file.metadata.mime_type: # logger.debug(f"Generating fragments {ref['url']} as it is a HTML file") ref.url = generate_text_fragment_url(url=ref.url, text=ref.snippet) - yield SearchReference( - url=ref.url, title=ref.title, snippet=ref.snippet, score=hit["relevance"] + yield ( + ref_key, + SearchReference( + url=ref.url, + title=ref.title, + snippet=ref.snippet, + score=hit["relevance"], + ), )