-
Notifications
You must be signed in to change notification settings - Fork 0
/
FileVectorStoreDataLoader.java
90 lines (70 loc) · 2.89 KB
/
FileVectorStoreDataLoader.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
package nl._42.springai.hackathon.domain.file;
import static nl._42.springai.hackathon.domain.BatchUtils.runTasksMultithreaded;
import static nl._42.springai.hackathon.domain.BatchUtils.splitListIntoBatches;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import lombok.AllArgsConstructor;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.RestClient;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.TextReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.http.HttpMethod;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
/**
 * Reads a text resource, splits it into smaller documents and writes those
 * documents to the injected {@link VectorStore} in batches.
 *
 * <p>Also offers {@link #cleanVectors(String)} to drop an Elasticsearch index
 * through the injected low-level {@link RestClient}.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class FileVectorStoreDataLoader {

    /** Batch size handed to {@code BatchUtils.splitListIntoBatches}. */
    private static final int BATCH_SIZE = 10;

    /** Splitter (default settings) used to cut the loaded text into documents. */
    public static final TokenTextSplitter TEXT_SPLITTER = new TokenTextSplitter();

    /** Resource location of the text file that is loaded into the vector store. */
    private static final String FILE_NAME = "/file";

    private final VectorStore vectorStore;
    private final RestClient elasticsearchClient;

    /**
     * Reads {@link #FILE_NAME}, splits it with {@link #TEXT_SPLITTER}, groups the
     * result into batches and submits one {@link FileVectorBuildTask} per batch
     * to {@code BatchUtils.runTasksMultithreaded}.
     *
     * <p>A batch that fails is logged by its task and does not abort the others,
     * so the final "DONE" log line does not guarantee every batch was stored.
     *
     * @throws Exception whatever {@code runTasksMultithreaded} (or reading the
     *                   resource) propagates
     */
    public void loadVectorStoreData() throws Exception {
        log.info("STARTED");

        TextReader reader = new TextReader(FILE_NAME);
        var documents = reader.read();
        var splitDocuments = TEXT_SPLITTER.apply(documents);
        log.info("Split into {} documents", splitDocuments.size());

        var batches = splitListIntoBatches(splitDocuments, BATCH_SIZE);
        log.info("Processed into {} batches", batches.size());

        // One task per batch; the list is sized up front since the count is known.
        List<Callable<Void>> tasks = new ArrayList<>(batches.size());
        for (List<Document> batch : batches) {
            tasks.add(new FileVectorBuildTask(batch));
        }

        runTasksMultithreaded(tasks);
        log.info("DONE in {} batches", batches.size());
    }

    /**
     * Deletes the given Elasticsearch index so the vector store starts with a
     * clean slate. This is best-effort: a failure (for example because the index
     * does not exist yet) is logged and not rethrown.
     *
     * @param indexName name of the index to delete; a {@code null} or blank name
     *                  is ignored
     */
    public void cleanVectors(String indexName) {
        if (indexName == null || indexName.isBlank()) {
            log.warn("No index name given, nothing to delete");
            return;
        }
        try {
            log.info("deleting: {}", indexName);
            // The endpoint is a path only: host and port come from the configured
            // RestClient, so this also works when Elasticsearch is not on localhost:9200.
            elasticsearchClient.performRequest(new Request(HttpMethod.DELETE.name(), "/" + indexName));
        } catch (Exception ex) {
            // Keep the cause: a missing index is expected on a first run, but
            // connection or authorization problems must stay visible in the log.
            log.warn("Could not delete index {} (it may not exist)", indexName, ex);
        }
    }

    /**
     * Task that writes one batch of documents to the enclosing loader's
     * {@link VectorStore}. Failures are logged and swallowed so that one bad
     * batch does not stop the remaining tasks.
     */
    @AllArgsConstructor
    class FileVectorBuildTask implements Callable<Void> {

        /** The batch of documents this task stores. */
        public final List<Document> documents;

        // NOTE(review): instances are created with `new` in loadVectorStoreData(),
        // not obtained from the Spring container, so with proxy-based transaction
        // management this @Transactional presumably has no effect - confirm and remove.
        @Override
        @Transactional
        public Void call() {
            try {
                log.info("Started VectorBuildTask");
                vectorStore.accept(documents);
                log.info("Finished VectorBuildTask id");
            } catch (Exception e) {
                log.error("Failed task", e);
            }
            return null;
        }
    }

    /** Placeholder: currently does nothing. */
    public void generateTicketData() {
    }
}