-
Notifications
You must be signed in to change notification settings - Fork 22
/
main.go
173 lines (164 loc) · 6.88 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
package main
import (
"context"
"encoding/json"
"fmt"
"io"
"log"
"os"
"runtime"
"strings"
"time"
"github.com/philippgille/chromem-go"
)
// searchTerm is the query text that main passes to collection.Query to find
// semantically similar documents.
const searchTerm = "semantic search with vector databases"
// main opens (or creates) a persistent chromem-go DB in ./db, fills the
// "arXiv cs.CL 2023" collection from a JSON Lines file when the collection is
// still empty, then queries the collection for the 10 documents most similar
// to searchTerm and logs them. Any error aborts the program via panic.
func main() {
	ctx := context.Background()

	// Set up chromem-go with persistence, so that when the program restarts, the
	// DB's data is still available.
	log.Println("Setting up chromem-go...")
	db, err := chromem.NewPersistentDB("./db", false)
	if err != nil {
		panic(err)
	}
	// Create collection if it wasn't loaded from persistent storage yet.
	// We pass nil as embedding function to use the default (OpenAI text-embedding-3-small),
	// which is very good and cheap. It requires the OPENAI_API_KEY environment
	// variable to be set.
	collection, err := db.GetOrCreateCollection("arXiv cs.CL 2023", nil, nil)
	if err != nil {
		panic(err)
	}
	// Add docs to the collection, if the collection was just created (and not
	// loaded from persistent storage).
	if collection.Count() == 0 {
		// Here we use an arXiv metadata sample, where each line contains the metadata
		// of a paper, including its submitter, title and abstract.
		f, err := os.Open("/tmp/arxiv_cs-cl_2023.jsonl")
		if err != nil {
			panic(err)
		}
		defer f.Close()
		d := json.NewDecoder(f)
		log.Println("Reading JSON lines...")
		var docs []chromem.Document
		for {
			var paper struct {
				ID        string `json:"id"`
				Submitter string `json:"submitter"`
				Title     string `json:"title"`
				Abstract  string `json:"abstract"`
			}
			err := d.Decode(&paper)
			if err == io.EOF {
				break // reached end of file
			} else if err != nil {
				panic(err)
			}
			docs = append(docs, chromem.Document{
				ID: paper.ID,
				Metadata: map[string]string{
					"submitter": paper.Submitter,
					// Titles can be wrapped across lines; store them as a
					// single line with single spaces.
					"title": collapseWhitespace(paper.Title),
				},
				Content: strings.TrimSpace(paper.Abstract),
			})
		}
		log.Println("Read and parsed", len(docs), "documents.")
		log.Println("Adding documents to chromem-go, including creating their embeddings via OpenAI API...")
		err = collection.AddDocuments(ctx, docs, runtime.NumCPU())
		if err != nil {
			panic(err)
		}
	} else {
		log.Println("Not reading JSON lines because collection was loaded from persistent storage.")
	}

	// Search for documents that are semantically similar to the search term.
	// We ask for the 10 most similar documents, but you can use more or less depending
	// on your needs.
	// You can limit the search by filtering on content or metadata (like the paper's
	// submitter), but we don't do that in this example.
	log.Println("Querying chromem-go...")
	start := time.Now()
	docRes, err := collection.Query(ctx, searchTerm, 10, nil, nil)
	if err != nil {
		panic(err)
	}
	log.Println("Search (incl query embedding) took", time.Since(start))
	// Here you could filter out any documents whose similarity is below a certain threshold.
	// if docRes[...].Similarity < 0.5 { ...

	// Print the retrieved documents and their similarity to the question.
	buf := &strings.Builder{}
	for i, res := range docRes {
		content := strings.ReplaceAll(res.Content, "\n", " ")
		// Cut on a rune boundary so a multi-byte character is never split
		// into invalid UTF-8 in the printed preview.
		content = truncateOnRuneBoundary(content, 100) + "..."
		fmt.Fprintf(buf, "\t%d) Similarity %f:\n"+
			"\t\tURL: https://arxiv.org/abs/%s\n"+
			"\t\tSubmitter: %s\n"+
			"\t\tTitle: %s\n"+
			"\t\tAbstract: %s\n",
			i+1, res.Similarity, res.ID, res.Metadata["submitter"], res.Metadata["title"], content)
	}
	log.Printf("Search results:\n%s\n", buf.String())

	/* Output:
	2024/03/10 18:23:55 Setting up chromem-go...
	2024/03/10 18:23:55 Reading JSON lines...
	2024/03/10 18:23:55 Read and parsed 5006 documents.
	2024/03/10 18:23:55 Adding documents to chromem-go, including creating their embeddings via OpenAI API...
	2024/03/10 18:28:12 Querying chromem-go...
	2024/03/10 18:28:12 Search (incl query embedding) took 529.451163ms
	2024/03/10 18:28:12 Search results:
		1) Similarity 0.488895:
			URL: https://arxiv.org/abs/2209.15469
			Submitter: Christian Buck
			Title: Zero-Shot Retrieval with Search Agents and Hybrid Environments
			Abstract: Learning to search is the task of building artificial agents that learn to autonomously use a search...
		2) Similarity 0.480713:
			URL: https://arxiv.org/abs/2305.11516
			Submitter: Ryo Nagata Dr.
			Title: Contextualized Word Vector-based Methods for Discovering Semantic Differences with No Training nor Word Alignment
			Abstract: In this paper, we propose methods for discovering semantic differences in words appearing in two cor...
		3) Similarity 0.476079:
			URL: https://arxiv.org/abs/2310.14025
			Submitter: Maria Lymperaiou
			Title: Large Language Models and Multimodal Retrieval for Visual Word Sense Disambiguation
			Abstract: Visual Word Sense Disambiguation (VWSD) is a novel challenging task with the goal of retrieving an i...
		4) Similarity 0.474883:
			URL: https://arxiv.org/abs/2302.14785
			Submitter: Teven Le Scao
			Title: Joint Representations of Text and Knowledge Graphs for Retrieval and Evaluation
			Abstract: A key feature of neural models is that they can produce semantic vector representations of objects (...
		5) Similarity 0.470326:
			URL: https://arxiv.org/abs/2309.02403
			Submitter: Dallas Card
			Title: Substitution-based Semantic Change Detection using Contextual Embeddings
			Abstract: Measuring semantic change has thus far remained a task where methods using contextual embeddings hav...
		6) Similarity 0.466851:
			URL: https://arxiv.org/abs/2309.08187
			Submitter: Vu Tran
			Title: Encoded Summarization: Summarizing Documents into Continuous Vector Space for Legal Case Retrieval
			Abstract: We present our method for tackling a legal case retrieval task by introducing our method of encoding...
		7) Similarity 0.461783:
			URL: https://arxiv.org/abs/2307.16638
			Submitter: Maiia Bocharova Bocharova
			Title: VacancySBERT: the approach for representation of titles and skills for semantic similarity search in the recruitment domain
			Abstract: The paper focuses on deep learning semantic search algorithms applied in the HR domain. The aim of t...
		8) Similarity 0.460481:
			URL: https://arxiv.org/abs/2106.07400
			Submitter: Clara Meister
			Title: Determinantal Beam Search
			Abstract: Beam search is a go-to strategy for decoding neural sequence models. The algorithm can naturally be ...
		9) Similarity 0.460001:
			URL: https://arxiv.org/abs/2305.04049
			Submitter: Yuxia Wu
			Title: Actively Discovering New Slots for Task-oriented Conversation
			Abstract: Existing task-oriented conversational search systems heavily rely on domain ontologies with pre-defi...
		10) Similarity 0.458321:
			URL: https://arxiv.org/abs/2305.08654
			Submitter: Taichi Aida
			Title: Unsupervised Semantic Variation Prediction using the Distribution of Sibling Embeddings
			Abstract: Languages are dynamic entities, where the meanings associated with words constantly change with time...
	*/
}

// collapseWhitespace returns s with every run of whitespace (spaces, tabs,
// newlines) replaced by a single space and leading/trailing whitespace removed.
func collapseWhitespace(s string) string {
	return strings.Join(strings.Fields(s), " ")
}

// truncateOnRuneBoundary returns s unchanged if it is at most limit bytes long.
// Otherwise it returns the longest prefix of s that is at most limit bytes and
// ends at the start of a rune, so a multi-byte UTF-8 sequence is never split.
func truncateOnRuneBoundary(s string, limit int) string {
	if len(s) <= limit {
		return s
	}
	cut := 0
	// Ranging over a string yields the byte offset at which each rune starts.
	for i := range s {
		if i > limit {
			break
		}
		cut = i
	}
	return s[:cut]
}