-
Notifications
You must be signed in to change notification settings - Fork 112
/
utils.ts
145 lines (142 loc) · 5.05 KB
/
utils.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import { OpenAIEmbeddings } from 'langchain/embeddings/openai'
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'
import { OpenAI } from 'langchain/llms/openai'
import { loadQAStuffChain } from 'langchain/chains'
import { Document } from 'langchain/document'
import { timeout } from './config'
/**
 * Answers `question` using context retrieved from a Pinecone index.
 *
 * The question is embedded with `OpenAIEmbeddings`, the ten nearest vectors
 * are requested from the index, the `metadata.pageContent` strings of the
 * returned matches are joined with single spaces into one `Document`, and
 * that document plus the question is passed to a QA "stuff" chain built on
 * `OpenAI`.
 *
 * @param client - Pinecone client; only `client.Index(indexName)` is used here.
 * @param indexName - Name of the index to query.
 * @param question - Question to embed and to pass to the chain.
 * @returns The chain result's `text`, or `undefined` when the query produced
 *   no matches (the LLM is not called in that case).
 */
export const queryPineconeVectorStoreAndQueryLLM = async (
  client,
  indexName,
  question
) => {
  console.log('Querying Pinecone vector store...');
  const index = client.Index(indexName);

  // Embed the question so it can be compared against the stored vectors.
  const queryEmbedding = await new OpenAIEmbeddings().embedQuery(question);

  // NOTE(review): `includeValues` is kept as-is, although the returned vector
  // values are not read anywhere below — confirm whether it can be dropped.
  const queryResponse = await index.query({
    queryRequest: {
      topK: 10,
      vector: queryEmbedding,
      includeMetadata: true,
      includeValues: true,
    },
  });

  // Defensive: do not assume the response always carries a `matches` array.
  const matches = queryResponse?.matches ?? [];
  console.log(`Found ${matches.length} matches...`);
  console.log(`Asking question: ${question}...`);

  if (matches.length === 0) {
    console.log('Since there are no matches, GPT-3 will not be queried.');
    return undefined;
  }

  // Collect the stored text of every match, skipping matches that carry no
  // usable `pageContent` instead of throwing on a missing `metadata`.
  const pageContents: string[] = [];
  for (const match of matches) {
    const content = match?.metadata?.pageContent;
    if (typeof content === 'string' && content.length > 0) {
      pageContents.push(content);
    }
  }
  const concatenatedPageContent = pageContents.join(' ');

  const llm = new OpenAI({});
  const chain = loadQAStuffChain(llm);
  const result = await chain.call({
    input_documents: [new Document({ pageContent: concatenatedPageContent })],
    question: question,
  });

  console.log(`Answer: ${result.text}`);
  return result.text;
};
/**
 * Creates the Pinecone index `indexName` unless it is already listed.
 *
 * When the name is missing from `client.listIndexes()`, an index with the
 * given dimension and the `cosine` metric is requested, after which the
 * function waits `timeout` (imported from `./config`) via `setTimeout` before
 * resolving. When the name is already present nothing is created.
 *
 * @param client - Pinecone client; `listIndexes()` and `createIndex()` are used.
 * @param indexName - Name of the index to look up or create.
 * @param vectorDimension - Dimension passed to the create request.
 */
export const createPineconeIndex = async (
  client,
  indexName,
  vectorDimension
) => {
  console.log(`Checking "${indexName}"...`);
  const knownIndexes = await client.listIndexes();

  // Guard clause: an index with this name is already there, so stop early.
  if (knownIndexes.includes(indexName)) {
    console.log(`"${indexName}" already exists.`);
    return;
  }

  console.log(`Creating "${indexName}"...`);
  const createRequest = {
    name: indexName,
    dimension: vectorDimension,
    metric: 'cosine',
  };
  await client.createIndex({ createRequest });

  console.log(`Creating index.... please wait for it to finish initializing.`);
  // Fixed pause of `timeout` before returning to the caller.
  await new Promise((resolve) => setTimeout(resolve, timeout));
};
/**
 * Splits every document into chunks, embeds the chunks with OpenAI and
 * upserts one vector per chunk into a Pinecone index.
 *
 * For each entry of `docs`, `doc.pageContent` is split by a
 * `RecursiveCharacterTextSplitter` configured with `chunkSize: 1000`. Each
 * chunk is embedded after its newlines are replaced by spaces, and the
 * resulting vector is stored under the id `${doc.metadata.source}_${chunkIndex}`
 * with the chunk's metadata, its `loc` serialised as JSON, the original chunk
 * text (`pageContent`) and the source path (`txtPath`). Vectors are sent in
 * upsert requests of at most 100 vectors each.
 *
 * @param client - Pinecone client; only `client.Index(indexName)` is used here.
 * @param indexName - Name of the index receiving the vectors.
 * @param docs - Documents exposing `pageContent` and `metadata.source`.
 */
export const updatePinecone = async (client, indexName, docs) => {
  console.log('Retrieving Pinecone index...');
  const index = client.Index(indexName);
  console.log(`Pinecone index retrieved: ${indexName}`);

  // Loop-invariant settings, created once instead of once per document.
  const batchSize = 100;
  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize: 1000,
  });

  for (const doc of docs) {
    console.log(`Processing document: ${doc.metadata.source}`);
    const txtPath = doc.metadata.source;
    const text = doc.pageContent;

    console.log('Splitting text into chunks...');
    const chunks = await textSplitter.createDocuments([text]);
    console.log(`Text split into ${chunks.length} chunks`);

    console.log(
      `Calling OpenAI's Embedding endpoint documents with ${chunks.length} text chunks ...`
    );
    // The embeddings client is still created inside the loop, so nothing is
    // constructed when `docs` is empty. Newlines are flattened to spaces for
    // embedding only; the stored `pageContent` keeps the original text.
    const embeddingsArrays = await new OpenAIEmbeddings().embedDocuments(
      chunks.map((chunk) => chunk.pageContent.replace(/\n/g, " "))
    );
    console.log('Finished embedding documents');

    console.log(
      `Creating ${chunks.length} vectors array with id, values, and metadata...`
    );
    // Build all vectors for this document first (amortised O(n) `push`
    // instead of re-copying the batch with a spread on every chunk).
    const vectors: {
      id: string;
      values: number[];
      metadata: Record<string, unknown>;
    }[] = [];
    for (let idx = 0; idx < chunks.length; idx++) {
      const chunk = chunks[idx];
      vectors.push({
        id: `${txtPath}_${idx}`,
        values: embeddingsArrays[idx],
        metadata: {
          ...chunk.metadata,
          // `loc` is stored as a JSON string rather than as a nested object.
          loc: JSON.stringify(chunk.metadata.loc),
          pageContent: chunk.pageContent,
          txtPath: txtPath,
        },
      });
    }

    // Upsert sequentially in slices of `batchSize`; the last slice may be
    // shorter, and no request is sent when there are no vectors.
    for (let start = 0; start < vectors.length; start += batchSize) {
      await index.upsert({
        upsertRequest: {
          vectors: vectors.slice(start, start + batchSize),
        },
      });
    }

    console.log(`Pinecone index updated with ${chunks.length} vectors`);
  }
};