diff --git a/backend.dev.dockerfile b/backend.dev.dockerfile index dd3f706..2dcbf36 100644 --- a/backend.dev.dockerfile +++ b/backend.dev.dockerfile @@ -1,21 +1,27 @@ -FROM node:slim - -ARG SEARXNG_API_URL +FROM node:20-bullseye-slim WORKDIR /home/starknet-agent -# Copy package.json and yarn.lock first to leverage Docker cache +# Install Python and build dependencies +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip \ + build-essential \ + python-is-python3 \ + && rm -rf /var/lib/apt/lists/* + +# Copy package files COPY package.json yarn.lock ./ +# Install dependencies +RUN yarn install -# Copy the rest of the application code COPY . . -RUN sed -i "s|SEARXNG = \".*\"|SEARXNG = \"${SEARXNG_API_URL}\"|g" /home/starknet-agent/config.toml +# Build TypeScript +RUN yarn build -RUN mkdir -p /home/starknet-agent/data +EXPOSE 3001 -# Install dependencies including development ones -RUN yarn install -# Use the existing dev command CMD ["yarn", "run", "dev"] diff --git a/docker-compose.dev-hosted.yml b/docker-compose.dev-hosted.yml index 96c6d67..52c555a 100644 --- a/docker-compose.dev-hosted.yml +++ b/docker-compose.dev-hosted.yml @@ -1,5 +1,4 @@ services: - starknet-agent-backend: build: context: . @@ -43,7 +42,7 @@ services: - WATCHPACK_POLLING=true restart: unless-stopped - # cairobook-ingest: + # docs-ingest: # build: # context: . # dockerfile: ingest.dockerfile diff --git a/src/agents/ragSearchAgent.ts b/src/agents/ragSearchAgent.ts index 4a8b24c..9946cc5 100644 --- a/src/agents/ragSearchAgent.ts +++ b/src/agents/ragSearchAgent.ts @@ -58,31 +58,55 @@ export const handleStream = async ( stream: IterableReadableStream<StreamEvent>, emitter: eventEmitter, ): Promise<void> => { - for await (const event of stream) { - if ( - event.event === 'on_chain_end' && - event.name === 'FinalSourceRetriever' - ) { - emitter.emit( - 'data', - JSON.stringify({ type: 'sources', data: event.data.output }), - ); - } - if ( - event.event === 'on_chain_stream' && - event.name === 'FinalResponseGenerator' - ) { - emitter.emit( - 'data', - JSON.stringify({ type: 'response', data: event.data.chunk }), - ); - } - if ( - event.event === 'on_chain_end' && - event.name === 'FinalResponseGenerator' - ) { - emitter.emit('end'); + logger.info('Starting stream handling'); + try { + for await (const event of stream) { + logger.debug('Stream event received:', { + eventType: event.event, + name: event.name, + }); + + if ( + event.event === 'on_chain_end' && + event.name === 'FinalSourceRetriever' + ) { + logger.info('Sources retrieved:', { + sourceCount: event.data.output.length, + }); + emitter.emit( + 'data', + JSON.stringify({ + type: 'sources', + data: event.data.output, + }), + ); + } + + if ( + event.event === 'on_chain_stream' && + event.name === 'FinalResponseGenerator' + ) { + logger.debug('Response chunk received'); + emitter.emit( + 'data', + JSON.stringify({ + type: 'response', + data: event.data.chunk, + }), + ); + } + + if ( + event.event === 'on_chain_end' && + event.name === 'FinalResponseGenerator' + ) { + logger.info('Stream completed successfully'); + emitter.emit('end'); + } } + } catch (error) { + logger.error('Error in handleStream:', error); + throw error; } }; @@ -96,12 +120,16 @@ export const createBasicSearchRetrieverChain = ( llm, strParser, RunnableLambda.from(async (input: string) => { + logger.debug('Search retriever input:', { input }); if (input === 'not_needed') { return { query: '', docs: [] }; } const documents = await vectorStore.similaritySearch(input, 5); - +
logger.debug('Vector store search results:', { + documentCount: documents.length, + firstDoc: documents[0], + }); return { query: input, docs: documents }; }), ]); @@ -138,29 +166,55 @@ export const rerankDocs = query: string; docs: Document[]; }): Promise<Document[]> => { + logger.debug('Reranking docs input:', { + query, + docsLength: docs.length, + firstDoc: docs[0], + }); + if (docs.length === 0 || query === 'Summarize') { + logger.info('Skipping reranking - empty docs or summarize query'); return docs; } const docsWithContent = docs.filter( (doc) => doc.pageContent && doc.pageContent.length > 0, ); - - const [docEmbeddings, queryEmbedding] = await Promise.all([ - embeddings.embedDocuments(docsWithContent.map((doc) => doc.pageContent)), - embeddings.embedQuery(query), - ]); - - const similarity = docEmbeddings.map((docEmbedding, i) => ({ - index: i, - similarity: computeSimilarity(queryEmbedding, docEmbedding), - })); - - return similarity - .filter((sim) => sim.similarity > 0.5) - .sort((a, b) => b.similarity - a.similarity) - .slice(0, 15) - .map((sim) => docsWithContent[sim.index]); + logger.debug('Filtered documents with content:', { + originalCount: docs.length, + filteredCount: docsWithContent.length, + }); + + try { + const [docEmbeddings, queryEmbedding] = await Promise.all([ + embeddings.embedDocuments( + docsWithContent.map((doc) => doc.pageContent), + ), + embeddings.embedQuery(query), + ]); + logger.debug('Embeddings generated successfully'); + + const similarity = docEmbeddings.map((docEmbedding, i) => ({ + index: i, + similarity: computeSimilarity(queryEmbedding, docEmbedding), + })); + + const rerankedDocs = similarity + .filter((sim) => sim.similarity > 0.5) + .sort((a, b) => b.similarity - a.similarity) + .slice(0, 15) + .map((sim) => docsWithContent[sim.index]); + + logger.info('Reranking completed', { + inputDocs: docsWithContent.length, + filteredDocs: rerankedDocs.length, + }); + + return rerankedDocs; + } catch (error) { + logger.error('Error in rerankDocs:', error); + throw error; + } }; export const createBasicSearchAnsweringChain = ( @@ -234,7 +288,13 @@ export const basicRagSearch = ( ): eventEmitter => { const emitter = new eventEmitter(); + logger.info('Starting RAG search', { + query, + historyLength: history.length, + }); + try { + logger.debug('Initializing search chain'); const basicSearchAnsweringChain = createBasicSearchAnsweringChain( llm, embeddings, @@ -244,6 +304,7 @@ noSourceFoundPrompt, ); + logger.debug('Starting stream'); const stream = basicSearchAnsweringChain.streamEvents( { chat_history: history, @@ -254,13 +315,25 @@ }, ); - handleStream(stream, emitter); + handleStream(stream, emitter).catch((error) => { + logger.error('Stream handling failed:', error); + emitter.emit( + 'error', + JSON.stringify({ + data: 'An error occurred while processing the stream', + }), + ); + }); } catch (err) { + logger.error('Error in basicRagSearch:', { + error: err, + query, + historyLength: history.length, + }); emitter.emit( 'error', JSON.stringify({ data: 'An error has occurred please try again later' }), ); - logger.error(`Error in Search: ${err}`); } return emitter; diff --git a/src/agents/ragSearchAgents/cairoBookSearchAgent.ts b/src/agents/ragSearchAgents/cairoBookSearchAgent.ts index bcb44e7..222d3f1 100644 --- a/src/agents/ragSearchAgents/cairoBookSearchAgent.ts +++ b/src/agents/ragSearchAgents/cairoBookSearchAgent.ts @@ -73,7 +73,7 @@ neutral and educational tone in your responses.
Format your responses using Markdown for readability. Use code blocks for Cairo code examples. Provide medium to long responses that are comprehensive and informative. -You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from. +You have to cite the answer using [number] notation. You must cite the sentences with their relevant context number. You must cite each and every part of the answer so the user can know where the information is coming from. Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2]. However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer. diff --git a/src/agents/ragSearchAgents/starknetDocsSearchAgent.ts b/src/agents/ragSearchAgents/starknetDocsSearchAgent.ts index bbb91ea..4552e9c 100644 --- a/src/agents/ragSearchAgents/starknetDocsSearchAgent.ts +++ b/src/agents/ragSearchAgents/starknetDocsSearchAgent.ts @@ -39,7 +39,7 @@ neutral and educational tone in your responses. Format your responses using Markdown for readability. Use code blocks for Cairo code examples. Provide medium to long responses that are comprehensive and informative. -You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from. +You have to cite the answer using [number] notation. You must cite the sentences with their relevant context number. You must cite each and every part of the answer so the user can know where the information is coming from. Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2]. However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer. diff --git a/src/agents/ragSearchAgents/starknetEcosystemSearchAgent.ts b/src/agents/ragSearchAgents/starknetEcosystemSearchAgent.ts index 4608921..36a5e70 100644 --- a/src/agents/ragSearchAgents/starknetEcosystemSearchAgent.ts +++ b/src/agents/ragSearchAgents/starknetEcosystemSearchAgent.ts @@ -51,7 +51,7 @@ neutral and educational tone in your responses. Format your responses using Markdown for readability. Use code blocks for Cairo code examples. Provide medium to long responses that are comprehensive and informative. -You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from. +You have to cite the answer using [number] notation. You must cite the sentences with their relevant context number. You must cite each and every part of the answer so the user can know where the information is coming from. Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer. @@ -77,7 +77,7 @@ Remember, your knowledge is based solely on the provided Cairo and Starknet docu accuracy and relevance in your responses. Today's date is ${new Date().toISOString()} `; -const handleStarknetEcosystemSearch= ( +const handleStarknetEcosystemSearch = ( message: string, history: BaseMessage[], llm: BaseChatModel, diff --git a/src/agents/ragSearchAgents/starknetFoundrySearchAgent.ts b/src/agents/ragSearchAgents/starknetFoundrySearchAgent.ts new file mode 100644 index 0000000..ab4e7c0 --- /dev/null +++ b/src/agents/ragSearchAgents/starknetFoundrySearchAgent.ts @@ -0,0 +1,116 @@ +/** + * @file starknetFoundrySearchAgent.ts + * @description This file implements a search agent for the Starknet Foundry documentation. + * It uses LangChain to create a chain of operations for processing user queries, + * retrieving relevant information, and generating responses. + * + * Key components: + * - basicSearchRetrieverPrompt: Prompt for rephrasing user queries + * - basicStarknetFoundrySearchResponsePrompt: System prompt for the AI assistant + * - handleStarknetFoundrySearch: Main function that processes queries using basicRagSearch + */ + +import { BaseMessage } from '@langchain/core/messages'; +import type { BaseChatModel } from '@langchain/core/language_models/chat_models'; +import type { Embeddings } from '@langchain/core/embeddings'; +import { VectorStore } from '../../db/vectorStore'; +import eventEmitter from 'events'; +import { basicRagSearch } from '../ragSearchAgent'; +import { getStarknetFoundryVersion, getScarbVersion } from '../../config'; + +const basicSearchRetrieverPrompt = ` +You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the Starknet Foundry documentation for information. +If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response. +If the user asks to summarize the content from some links you need to return \`not_needed\` as the response. + +Example: +1. Follow up question: How do I set up Starknet Foundry? +Rephrased question: \`Setting up Starknet Foundry\` + +2. Follow up question: How do I run tests? +Rephrased question: \`Running Tests in Starknet Foundry\` + +3. Follow up question: How do I deploy contracts? +Rephrased question: \`Deploying Contracts with Starknet Foundry\` + +4. Follow up question: What are cheatcodes? +Rephrased question: \`Cheatcodes in Starknet Foundry\` + +You also need to reword the question to be specific about Starknet Foundry features and tools. +If the user asks about "testing", "deployment", "scripts", "configuration", rephrase the question to include "Starknet Foundry". + +Example: +1. Follow up question: How do I write a test? +Rephrased question: \`Writing Tests in Starknet Foundry\` + +2. Follow up question: How do I configure my project? +Rephrased question: \`Project Configuration in Starknet Foundry\` + +3. Follow up question: What are scripts? 
+Rephrased question: \`Scripts in Starknet Foundry\` + +Conversation: +{chat_history} + +Follow up question: {query} +Rephrased question: +`; + +const basicStarknetFoundrySearchResponsePrompt = ` +You are FoundryGuide, an AI assistant specialized in searching and providing information from the +Starknet Foundry documentation (version ${getStarknetFoundryVersion()}). Your primary role is to assist users with queries related to using +Starknet Foundry for development and testing. The current supported Scarb version is ${getScarbVersion()}. + +Generate informative and relevant responses based on the provided context from the Starknet Foundry documentation. Use a +neutral and educational tone in your responses. Format your responses using Markdown for +readability. Use code blocks for command-line examples and configuration snippets. Provide medium to long responses that are +comprehensive and informative. + +When discussing features or functionality, always mention if they are specific to certain versions of Starknet Foundry or Scarb. + +You have to cite the answer using [number] notation. You must cite the sentences with their relevant context number. You must cite each and every part of the answer so the user can know where the information is coming from. +Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2]. +However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer. + +Anything inside the following \`context\` HTML block provided below is for your knowledge taken from the Starknet Foundry docs and is not shared by the user. You have to answer the question on the basis of it and cite the relevant information from it but you do not have to talk about the context in your response. + +<context> +{context} +</context> + +If the user's query is not related to Starknet Foundry, respond with: "I apologize, but +I'm specifically designed to assist with Starknet Foundry-related queries. This topic +appears to be outside my area of expertise. Is there anything related to Starknet Foundry that I +can help you with instead?" + +Do not tell the user to visit external websites or open links. Provide the information directly in +your response. If asked for specific documentation links, you may provide them if available in the +context. + +If you cannot find relevant information in the provided context, state: "I'm sorry, but I couldn't +find specific information about that in the Starknet Foundry documentation. Could you rephrase your question or ask +about a related topic in Starknet Foundry?" + +Remember, your knowledge is based solely on the provided Starknet Foundry documentation. Always strive for +accuracy and relevance in your responses.
Today's date is ${new Date().toISOString()} +`; + +const handleStarknetFoundrySearch = ( + message: string, + history: BaseMessage[], + llm: BaseChatModel, + embeddings: Embeddings, + additionalParams: { vectorStore: VectorStore }, +): eventEmitter => { + return basicRagSearch( + message, + history, + llm, + embeddings, + additionalParams.vectorStore, + basicSearchRetrieverPrompt, + basicStarknetFoundrySearchResponsePrompt, + ); +}; + +export default handleStarknetFoundrySearch; diff --git a/src/agents/ragSearchAgents/succintCairoBookSearchAgent.ts b/src/agents/ragSearchAgents/succintCairoBookSearchAgent.ts index f9e5f3d..c231fff 100644 --- a/src/agents/ragSearchAgents/succintCairoBookSearchAgent.ts +++ b/src/agents/ragSearchAgents/succintCairoBookSearchAgent.ts @@ -32,7 +32,7 @@ neutral and educational tone in your responses. Format your responses using Markdown for readability. Use code blocks for very short Cairo code examples. Provide as concise and short responses as possible without losing information. Make sure to reply in a way that links to the relevant information using the citation method. -You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from. +You have to cite the answer using [number] notation. You must cite the sentences with their relevant context number. You must cite each and every part of the answer so the user can know where the information is coming from. Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2]. However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
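A note on the emitter protocol that the instrumented `handleStream` above implements: consumers receive `data` events whose payload is a JSON frame of the shape `{ type: 'sources' | 'response', data }`, followed by an `end` event on success or an `error` event carrying a JSON `{ data: message }` payload. Below is a minimal consumer sketch for reference; it is illustrative only and not part of this diff, and the `consume` helper is a hypothetical name:

```typescript
import { EventEmitter } from 'events';

// Hypothetical helper showing how a client of basicRagSearch's emitter can
// reassemble the streamed answer. It mirrors the events emitted in
// handleStream: 'data' frames, then 'end' on success or 'error' on failure.
function consume(emitter: EventEmitter): Promise<string> {
  return new Promise((resolve, reject) => {
    let answer = '';
    emitter.on('data', (raw: string) => {
      const frame = JSON.parse(raw);
      if (frame.type === 'sources') {
        // frame.data is the reranked document list from FinalSourceRetriever
        console.log(`sources received: ${frame.data.length}`);
      } else if (frame.type === 'response') {
        answer += frame.data; // streamed chunks from FinalResponseGenerator
      }
    });
    emitter.on('end', () => resolve(answer));
    emitter.on('error', (raw: string) =>
      reject(new Error(JSON.parse(raw).data)),
    );
  });
}
```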
diff --git a/src/config.ts b/src/config.ts index e80fc6c..945d677 100644 --- a/src/config.ts +++ b/src/config.ts @@ -18,6 +18,7 @@ interface Config { CAIRO_DB: VectorStoreConfig; STARKNET_DB: VectorStoreConfig; ECOSYSTEM_DB: VectorStoreConfig; + STARKNET_FOUNDRY_DB: VectorStoreConfig; API_KEYS: { OPENAI: string; GROQ: string; @@ -33,6 +34,10 @@ interface Config { DEFAULT_EMBEDDING_PROVIDER: string; DEFAULT_EMBEDDING_MODEL: string; }; + VERSIONS: { + STARKNET_FOUNDRY: string; + SCARB: string; + }; } type RecursivePartial<T> = { @@ -65,6 +70,9 @@ export const getOllamaApiEndpoint = () => loadConfig().API_ENDPOINTS.OLLAMA; export const getCairoDbConfig = () => loadConfig().CAIRO_DB; +export const getStarknetFoundryDbConfig = () => + loadConfig().STARKNET_FOUNDRY_DB; + export const getStarknetDbConfig = () => loadConfig().STARKNET_DB; export const getStarknetEcosystemDbConfig = () => loadConfig().ECOSYSTEM_DB; @@ -95,3 +103,6 @@ export const updateConfig = (config: RecursivePartial<Config>) => { toml.stringify(config), ); }; + +export const getStarknetFoundryVersion = () => loadConfig().VERSIONS.STARKNET_FOUNDRY; +export const getScarbVersion = () => loadConfig().VERSIONS.SCARB; \ No newline at end of file diff --git a/src/ingester/cairoBookIngester.ts b/src/ingester/cairoBookIngester.ts index 39ca983..7eab58d 100644 --- a/src/ingester/cairoBookIngester.ts +++ b/src/ingester/cairoBookIngester.ts @@ -29,7 +29,6 @@ const config: BookConfig = { baseUrl: 'https://book.cairo-lang.org', }; - export const ingestCairoBook = async (vectorStore: VectorStore) => { try { const pages = await downloadAndExtractCairoBook(); @@ -95,7 +94,8 @@ export async function createChunks( for (const page of pages) { const sanitizedContent = sanitizeCodeBlocks(page.content); - const sections: ParsedSection[] = splitMarkdownIntoSections(sanitizedContent); + const sections: ParsedSection[] = + splitMarkdownIntoSections(sanitizedContent); sections.forEach((section: ParsedSection, index: number) => { const hash: string = calculateHash(section.content); diff --git a/src/ingester/starknetDocsIngester.ts b/src/ingester/starknetDocsIngester.ts index bbca232..5eb48ba 100644 --- a/src/ingester/starknetDocsIngester.ts +++ b/src/ingester/starknetDocsIngester.ts @@ -66,7 +66,10 @@ export async function downloadAndExtractStarknetDocs(): Promise<BookPageDto[]> { logger.error('Error running Antora:', error); throw error; } finally { - await fs.rm(path.join(__dirname, 'build'), { recursive: true, force: true }); + await fs.rm(path.join(__dirname, 'build'), { + recursive: true, + force: true, + }); } const outputDir = path.join(__dirname, 'antora-output'); @@ -75,9 +78,14 @@ export async function downloadAndExtractStarknetDocs(): Promise<BookPageDto[]> { return await processDocFiles(STARKNET_DOCS_CONFIG, targetDir); } -async function mergeDocsCommonContent(docsCommonContentDir: string, mergeDir: string) { +async function mergeDocsCommonContent( + docsCommonContentDir: string, + mergeDir: string, +) { console.log('Merging Docs Common Content into Starknet Docs'); - const entries = await fs.readdir(docsCommonContentDir, { withFileTypes: true }); + const entries = await fs.readdir(docsCommonContentDir, { + withFileTypes: true, + }); for (const entry of entries) { if (entry.isDirectory()) { const sourcePath = path.join(docsCommonContentDir, entry.name); @@ -244,7 +252,7 @@ export function splitAsciiDocIntoSections(content: string): ParsedSection[] { lastTitle, markdownContent, MAX_SECTION_SIZE, - lastAnchor + lastAnchor, ); } } @@ -260,12 +268,12 @@ export function
splitAsciiDocIntoSections(content: string): ParsedSection[] { if (sectionContent) { const markdownContent = downdoc(sectionContent); if (markdownContent) { - addSectionWithSizeLimit( - sections, - lastTitle, - markdownContent, - MAX_SECTION_SIZE, - lastAnchor + addSectionWithSizeLimit( + sections, + lastTitle, + markdownContent, + MAX_SECTION_SIZE, + lastAnchor, ); } } @@ -277,9 +285,12 @@ export function splitAsciiDocIntoSections(content: string): ParsedSection[] { export function convertCodeBlocks(content: string): string { // Case 1: With language specification const languageCodeBlockRegex = /^\[source,(\w+)\]\s*^----$([\s\S]*?)^----$/gm; - content = content.replace(languageCodeBlockRegex, (match, language, codeContent) => { - return convertCodeBlock(codeContent, language); - }); + content = content.replace( + languageCodeBlockRegex, + (match, language, codeContent) => { + return convertCodeBlock(codeContent, language); + }, + ); // Case 2: No language specification const simpleCodeBlockRegex = /^----$([\s\S]*?)^----$/gm; diff --git a/src/ingester/starknetEcosystemIngester.ts b/src/ingester/starknetEcosystemIngester.ts index 4ca2d3d..344efc8 100644 --- a/src/ingester/starknetEcosystemIngester.ts +++ b/src/ingester/starknetEcosystemIngester.ts @@ -1,24 +1,45 @@ -import { VectorStore } from "../db/vectorStore"; -import { createChunks as createCairoBookChunks, downloadAndExtractCairoBook, cleanupDownloadedFiles as cleanupCairoBookFiles } from "./cairoBookIngester"; -import { updateVectorStore } from "./shared"; -import { cleanupDownloadedFiles as cleanupSNDocsFiles, downloadAndExtractStarknetDocs, createChunks as createSNDocsChunks } from "./starknetDocsIngester"; +import { VectorStore } from '../db/vectorStore'; +import { + createChunks as createCairoBookChunks, + downloadAndExtractCairoBook, + cleanupDownloadedFiles as cleanupCairoBookFiles, +} from './cairoBookIngester'; +import { updateVectorStore } from './shared'; +import { + cleanupDownloadedFiles as cleanupSNDocsFiles, + downloadAndExtractStarknetDocs, + createChunks as createSNDocsChunks, +} from './starknetDocsIngester'; +import { + downloadAndExtractFoundryDocs, + createChunks as createFoundryChunks, + cleanupDownloadedFiles as cleanupFoundryFiles, +} from './starknetFoundryIngester'; // Main ingestion function export const ingestStarknetEcosystem = async (vectorStore: VectorStore) => { - try { - const snDocsPages = await downloadAndExtractStarknetDocs(); - const snDocsChunks = await createSNDocsChunks(snDocsPages); - const cairoBookPages = await downloadAndExtractCairoBook(); - const cairoBookChunks = await createCairoBookChunks(cairoBookPages); - const chunks = [...snDocsChunks, ...cairoBookChunks]; - await updateVectorStore(vectorStore, chunks); - await cleanupSNDocsFiles(); - await cleanupCairoBookFiles(); - } catch (error) { - console.error('Error processing Starknet Ecosystem:', error); - if (error instanceof Error) { - console.error('Stack trace:', error.stack); - } - throw error; + try { + const snDocsPages = await downloadAndExtractStarknetDocs(); + const snDocsChunks = await createSNDocsChunks(snDocsPages); + const cairoBookPages = await downloadAndExtractCairoBook(); + const cairoBookChunks = await createCairoBookChunks(cairoBookPages); + const starknetFoundryPages = await downloadAndExtractFoundryDocs(); + const starknetFoundryChunks = + await createFoundryChunks(starknetFoundryPages); + const chunks = [ + ...snDocsChunks, + ...cairoBookChunks, + ...starknetFoundryChunks, + ]; + await updateVectorStore(vectorStore, 
chunks); + await cleanupSNDocsFiles(); + await cleanupCairoBookFiles(); + await cleanupFoundryFiles(); + } catch (error) { + console.error('Error processing Starknet Ecosystem:', error); + if (error instanceof Error) { + console.error('Stack trace:', error.stack); } - }; + throw error; + } +}; diff --git a/src/ingester/starknetFoundryIngester.ts b/src/ingester/starknetFoundryIngester.ts new file mode 100644 index 0000000..f761173 --- /dev/null +++ b/src/ingester/starknetFoundryIngester.ts @@ -0,0 +1,179 @@ +import * as fs from 'fs/promises'; +import * as path from 'path'; +import axios from 'axios'; +import AdmZip from 'adm-zip'; +import { VectorStore } from '../db/vectorStore'; +import { Document } from 'langchain/document'; +import logger from '../utils/logger'; +import { BookChunk } from '../types/types'; +import { + BookConfig, + BookPageDto, + isInsideCodeBlock, + ParsedSection, + processDocFiles, + calculateHash, + createAnchor, + addSectionWithSizeLimit, + MAX_SECTION_SIZE, + updateVectorStore, +} from './shared'; + +const config: BookConfig = { + repoOwner: 'cairo-book', + repoName: 'starknet-foundry', + fileExtension: '.md', + chunkSize: 4096, + chunkOverlap: 512, + baseUrl: 'https://foundry-rs.github.io/starknet-foundry', +}; + +export const ingestStarknetFoundry = async (vectorStore: VectorStore) => { + try { + const pages = await downloadAndExtractFoundryDocs(); + const chunks = await createChunks(pages); + await updateVectorStore(vectorStore, chunks); + await cleanupDownloadedFiles(); + } catch (error) { + console.error('Error processing Starknet Foundry docs:', error); + if (error instanceof Error) { + console.error('Stack trace:', error.stack); + } + throw error; + } +}; + +export async function cleanupDownloadedFiles() { + const extractDir = path.join(__dirname, 'starknet-foundry'); + await fs.rm(extractDir, { recursive: true, force: true }); + logger.info(`Deleted downloaded markdown files from ${extractDir}`); +} + +export async function downloadAndExtractFoundryDocs(): Promise<BookPageDto[]> { + logger.info('Downloading and extracting Starknet Foundry docs'); + const latestReleaseUrl = `https://api.github.com/repos/${config.repoOwner}/${config.repoName}/releases/latest`; + const response = await axios.get(latestReleaseUrl); + const latestRelease = response.data; + const zipAsset = latestRelease.assets.find( + (asset: any) => asset.name === 'markdown-output.zip', + ); + + if (!zipAsset) { + throw new Error('ZIP asset not found in the latest release.'); + } + + const zipUrl = zipAsset.browser_download_url; + logger.info(`Downloading ZIP file from ${zipUrl}`); + const zipResponse = await axios.get(zipUrl, { responseType: 'arraybuffer' }); + const zipData = zipResponse.data; + + const zipFile = new AdmZip(zipData); + const extractDir = path.join(__dirname, 'starknet-foundry'); + zipFile.extractAllTo(extractDir, true); + + logger.info('ZIP file downloaded and extracted successfully.'); + + const srcDir = path.join(extractDir, 'markdown-output'); + + const pages = await processDocFiles(config, srcDir); + + return pages; +} + +/** + * Creates chunks from book pages based on markdown sections + * @param pages - Array of BookPageDto objects + * @returns Promise<Document<BookChunk>[]> - Array of Document objects representing chunks + */ +export async function createChunks( + pages: BookPageDto[], +): Promise<Document<BookChunk>[]> { + logger.info('Creating chunks from foundry pages based on markdown sections'); + const chunks: Document[] = []; + + for (const page of pages) { + const sanitizedContent = sanitizeCodeBlocks(page.content); +
const sections: ParsedSection[] = + splitMarkdownIntoSections(sanitizedContent); + + sections.forEach((section: ParsedSection, index: number) => { + const hash: string = calculateHash(section.content); + chunks.push( + new Document({ + pageContent: section.content, + metadata: { + name: page.name, + title: section.title, + chunkNumber: index, + contentHash: hash, + uniqueId: `${page.name}-${index}`, + sourceLink: `${config.baseUrl}/${page.name}.html#${createAnchor(section.title)}`, + }, + }), + ); + }); + } + + return chunks as Document<BookChunk>[]; +} + +export function sanitizeCodeBlocks(content: string): string { + const lines = content.split('\n'); + let isInCodeBlock = false; + const sanitizedLines = lines.filter((line) => { + if (line.trim().startsWith('```')) { + isInCodeBlock = !isInCodeBlock; + return true; + } + if (isInCodeBlock) { + return !line.trim().startsWith('# ') && line.trim() !== '#'; + } + return true; + }); + return sanitizedLines.join('\n'); } + +/** + * Splits markdown content into sections based on headers and imposes a maximum section size + * Only Headers 1 & 2 are considered to avoid splitting sections too small. + * The maximum section size is 20000 characters - this is to avoid embedding large sections, which is + * limited by OpenAI. The limit is 8192 tokens, therefore 20000 characters should be safe at 1 token ~= 4 characters. + * @param content - The markdown content to split + * @returns ParsedSection[] - Array of ParsedSection objects + */ +export function splitMarkdownIntoSections(content: string): ParsedSection[] { + const headerRegex = /^(#{1,2})\s+(.+)$/gm; + const sections: ParsedSection[] = []; + let lastIndex = 0; + let lastTitle = ''; + let match; + + while ((match = headerRegex.exec(content)) !== null) { + if (!isInsideCodeBlock(content, match.index)) { + if (lastIndex < match.index) { + const sectionContent = content.slice(lastIndex, match.index).trim(); + addSectionWithSizeLimit( + sections, + lastTitle, + sectionContent, + MAX_SECTION_SIZE, + ); + } + lastTitle = match[2]; + lastIndex = match.index; + } + } + + // Add the last section + if (lastIndex < content.length) { + const sectionContent = content.slice(lastIndex).trim(); + addSectionWithSizeLimit( + sections, + lastTitle, + sectionContent, + MAX_SECTION_SIZE, + ); + } + + return sections; +} diff --git a/src/scripts/generateEmbeddings.ts b/src/scripts/generateEmbeddings.ts index 418392b..29499e7 100644 --- a/src/scripts/generateEmbeddings.ts +++ b/src/scripts/generateEmbeddings.ts @@ -2,6 +2,7 @@ import { getCairoDbConfig, getStarknetDbConfig, getStarknetEcosystemDbConfig, + getStarknetFoundryDbConfig, VectorStoreConfig, } from '../config'; import { loadOpenAIEmbeddingsModels } from '../lib/providers/openai'; @@ -12,6 +13,7 @@ import dotenv from 'dotenv'; import { createInterface } from 'readline'; import logger from '../utils/logger'; import { ingestStarknetEcosystem } from '../ingester/starknetEcosystemIngester'; +import { ingestStarknetFoundry } from '../ingester/starknetFoundryIngester'; dotenv.config(); @@ -71,6 +73,18 @@ async function ingestEcosystemData() { } } +async function ingestFoundryData() { + console.log('Starting Starknet Foundry ingestion process...'); + try { + const store = await setupVectorStore(getStarknetFoundryDbConfig()); + await ingestStarknetFoundry(store); + console.log('Starknet Foundry ingestion completed successfully.'); + } catch (error) { + console.error('Error during Starknet Foundry ingestion:', error); + throw error; + } +} + async function promptForTarget():
Promise<string> { const rl = createInterface({ input: process.stdin, @@ -79,13 +93,14 @@ async function promptForTarget(): Promise<string> { return new Promise((resolve) => { rl.question( - 'Select the ingestion target (1: Cairo Book, 2: Starknet Docs, 3: Ecosystem): ', + 'Select the ingestion target (1: Cairo Book, 2: Starknet Docs, 3: Starknet Foundry, 4: Everything): ', (answer) => { rl.close(); const targets = [ 'Cairo Book', 'Starknet Docs', - 'All Starknet Ecosystem', + 'Starknet Foundry', + 'Everything', ]; resolve(targets[parseInt(answer) - 1] || 'Both'); }, @@ -106,7 +121,11 @@ async function main() { await ingestStarknetDocsData(); } - if (target === 'All Starknet Ecosystem') { + if (target === 'Starknet Foundry') { + await ingestFoundryData(); + } + + if (target === 'Everything') { await ingestEcosystemData(); } diff --git a/src/websocket/messageHandler.ts b/src/websocket/messageHandler.ts index 5c6eb97..e1efc67 100644 --- a/src/websocket/messageHandler.ts +++ b/src/websocket/messageHandler.ts @@ -11,6 +11,7 @@ import { getCairoDbConfig, getStarknetDbConfig, getStarknetEcosystemDbConfig, + getStarknetFoundryDbConfig, VectorStoreConfig, } from '../config'; import { VectorStore } from '../db/vectorStore'; @@ -19,6 +20,7 @@ import { HandlerOptions, SearchHandler } from '../types/types'; import handleStarknetDocsSearch from '../agents/ragSearchAgents/starknetDocsSearchAgent'; import handleStarknetEcosystemSearch from '../agents/ragSearchAgents/starknetEcosystemSearchAgent'; import handleSuccintCairoBookSearch from '../agents/ragSearchAgents/succintCairoBookSearchAgent'; +import handleStarknetFoundrySearch from '../agents/ragSearchAgents/starknetFoundrySearchAgent'; type Message = { messageId: string; @@ -39,6 +41,7 @@ const searchHandlers: Record<string, SearchHandler> = { succintCairoBookSearch: handleSuccintCairoBookSearch, starknetDocsSearch: handleStarknetDocsSearch, starknetEcosystemSearch: handleStarknetEcosystemSearch, + starknetFoundrySearch: handleStarknetFoundrySearch, }; const searchDatabases: Record<string, () => VectorStoreConfig> = { @@ -46,6 +49,7 @@ const searchDatabases: Record<string, () => VectorStoreConfig> = { starknetDocsSearch: getStarknetDbConfig, starknetEcosystemSearch: getStarknetEcosystemDbConfig, succintCairoBookSearch: getCairoDbConfig, + starknetFoundrySearch: getStarknetFoundryDbConfig, }; const handleEmitterEvents = ( diff --git a/ui/components/MessageInputActions/Focus.tsx b/ui/components/MessageInputActions/Focus.tsx index a5787ba..94ab00c 100644 --- a/ui/components/MessageInputActions/Focus.tsx +++ b/ui/components/MessageInputActions/Focus.tsx @@ -9,7 +9,6 @@ import { } from 'lucide-react'; import { cn } from '@/lib/utils'; import { Popover, Transition } from '@headlessui/react'; -import { SiReddit, SiYoutube } from '@icons-pack/react-simple-icons'; import { Fragment } from 'react'; const focusModes = [ @@ -31,6 +30,18 @@ const focusModes = [ description: 'Search in Starknet Docs', icon: , }, + { + key: 'starknetDocsSearch', + title: 'Starknet Docs', + description: 'Search in Starknet Docs', + icon: , + }, + { + key: 'starknetFoundrySearch', + title: 'Starknet Foundry', + description: 'Search in Starknet Foundry', + icon: , + }, ]; const Focus = ({
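Taken together, the `messageHandler.ts` and `Focus.tsx` changes show the full recipe this PR follows for adding a focus mode: one entry in `searchHandlers`, one in `searchDatabases`, and one UI entry, all keyed by the same string the frontend sends. A sketch of the pattern with a hypothetical `myDocsSearch` mode (all `myDocs*` names are illustrative and not part of this diff; in the PR the real pair is `handleStarknetFoundrySearch` / `getStarknetFoundryDbConfig`):

```typescript
import { SearchHandler } from '../types/types';
import { VectorStoreConfig } from '../config';

// Hypothetical agent and config accessor, standing in for a real pair.
declare const handleMyDocsSearch: SearchHandler;
declare const getMyDocsDbConfig: () => VectorStoreConfig;

// 1. Route the focus key to an agent.
const searchHandlers: Record<string, SearchHandler> = {
  myDocsSearch: handleMyDocsSearch,
};

// 2. Point the same key at the vector store it should query.
const searchDatabases: Record<string, () => VectorStoreConfig> = {
  myDocsSearch: getMyDocsDbConfig,
};

// 3. Expose the key in the UI's focusModes list
//    (ui/components/MessageInputActions/Focus.tsx) so users can select it.
```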