diff --git a/persistence-module/src/connector.ts b/persistence-module/src/connector.ts
deleted file mode 100644
index b801dea61..000000000
--- a/persistence-module/src/connector.ts
+++ /dev/null
@@ -1,33 +0,0 @@
-// Service that holds responsibility for initializing and exposing mdb interfaces.
-// Also exports helper functions for common operations (insert, upsert one by _id, etc.)
-// When adding helpers here, ask yourself if the helper will be used by more than one service.
-// If not, the helper should be implemented in that service, not here.
-
-import * as mongodb from 'mongodb';
-import type { Db } from 'mongodb';
-
-// We should only ever have one client active at a time.
-const atlasURL = `mongodb+srv://${process.env.MONGO_ATLAS_USERNAME}:${process.env.MONGO_ATLAS_PASSWORD}@${process.env.MONGO_ATLAS_HOST}/?retryWrites=true&w=majority&maxPoolSize=20`;
-const client = new mongodb.MongoClient(atlasURL);
-
-export const teardown = async () => {
-  await client.close();
-};
-
-const SNOOTY_DB_NAME = 'snooty_dotcomstg';
-
-// Cached db object, so we only run the initial connection process once if uninitialized
-let dbInstance: Db;
-// Handles memoization of the db object, and initial connection logic if it needs to be initialized
-export const db = async () => {
-  if (!dbInstance) {
-    try {
-      await client.connect();
-      dbInstance = client.db(SNOOTY_DB_NAME);
-    } catch (error) {
-      console.error(`Error at db client connection: ${error}`);
-      throw error;
-    }
-  }
-  return dbInstance;
-};
diff --git a/persistence-module/src/db-operations.ts b/persistence-module/src/db-operations.ts
deleted file mode 100644
index 550ca576e..000000000
--- a/persistence-module/src/db-operations.ts
+++ /dev/null
@@ -1,20 +0,0 @@
-import type * as mongodb from 'mongodb';
-import { db } from './connector';
-
-export const bulkWrite = async (
-  operations: mongodb.AnyBulkWriteOperation[],
-  collection: string,
-) => {
-  const dbSession = await db();
-  try {
-    if (!operations || !operations.length) {
-      return;
-    }
-    return dbSession
-      .collection(collection)
-      .bulkWrite(operations, { ordered: false });
-  } catch (error) {
-    console.error(`Error at bulk write time for ${collection}: ${error}`);
-    throw error;
-  }
-};
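For quick reference, here is a minimal sketch of how the two deleted helpers were consumed together. The caller, operation shape, and page_id below are illustrative, not taken from the removed code:

// Hypothetical consumer of the deleted connector/db-operations helpers.
// db() memoizes a single MongoClient connection; bulkWrite() issues an
// unordered bulk write against the snooty_dotcomstg database.
import type { AnyBulkWriteOperation } from 'mongodb';
import { teardown } from './connector';
import { bulkWrite } from './db-operations';

async function upsertExample() {
  const operations: AnyBulkWriteOperation[] = [
    {
      updateOne: {
        filter: { page_id: 'docs/docsworker-xlarge/master/index' },
        update: { $set: { deleted: false } },
        upsert: true,
      },
    },
  ];
  // Empty operation arrays are a no-op inside bulkWrite
  await bulkWrite(operations, 'updated_documents');
  // Close the shared client once all writes are done
  await teardown();
}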
diff --git a/persistence-module/src/index.ts b/persistence-module/src/index.ts
index 181c6e01c..6b1e07561 100644
--- a/persistence-module/src/index.ts
+++ b/persistence-module/src/index.ts
@@ -1,49 +1,18 @@
 // Documentation: https://sdk.netlify.com
 import { NetlifyIntegration } from '@netlify/sdk';
-import { deserialize } from 'bson';
-import { readdir, readFile, existsSync } from 'fs';
-import { promisify } from 'util';
-import { type Page, updatePages } from './update-pages';
-
-const readdirAsync = promisify(readdir);
-const readFileAsync = promisify(readFile);
+import { downloadPersistenceModule } from './persistence';
 
 const integration = new NetlifyIntegration();
 const ZIP_PATH = `${process.cwd()}/bundle/documents`;
 
 integration.addBuildEventHandler(
-  'onSuccess',
-  async ({ utils: { run, git } }) => {
-    /**
-     * Minor note: the persistence module also handles merging of ToCs for embedded products
-     */
-    console.log('=========== Chatbot Data Upload Integration ================');
-
-    const bundleDirExists = existsSync(`${process.cwd()}/bundle`);
-
-    if (!bundleDirExists) await run.command('unzip -o bundle.zip -d bundle');
-
-    const zipContents = await readdirAsync(ZIP_PATH, {
-      recursive: true,
-    });
-
-    const bsonPages = zipContents.filter((fileName) => {
-      const splitFile = fileName.toString().split('.');
-
-      return splitFile[splitFile.length - 1] === 'bson';
-    });
-
-    const pageAstObjects = await Promise.all(
-      bsonPages.map(async (bsonFileName) => {
-        const rawData = await readFileAsync(`${ZIP_PATH}/${bsonFileName}`);
-
-        return deserialize(rawData) as Page;
-      }),
-    );
-
-    await updatePages(pageAstObjects, 'updated_documents');
-    console.log('=========== Chatbot Data Upload Integration ================');
-  },
+  'onPreBuild',
+  async ({ utils: { cache, run } }) => {
+    try {
+      await downloadPersistenceModule(run);
+    } catch (e) {
+      console.error('Unable to run the persistence module', e);
+    }
+  },
 );
-
 export { integration };
diff --git a/persistence-module/src/persistence.ts b/persistence-module/src/persistence.ts
new file mode 100644
index 000000000..f90289476
--- /dev/null
+++ b/persistence-module/src/persistence.ts
@@ -0,0 +1,29 @@
+import type { NetlifyPluginUtils } from '@netlify/build';
+import { existsSync } from 'node:fs';
+
+const WORKER_POOL_PATH = `${process.cwd()}/docs-worker-pool`;
+const PERSISTENCE_PATH = `${WORKER_POOL_PATH}/modules/persistence`;
+
+export async function downloadPersistenceModule(
+  run: NetlifyPluginUtils['run'],
+): Promise<void> {
+  const isModuleDownloaded = existsSync(WORKER_POOL_PATH);
+
+  if (isModuleDownloaded) return;
+
+  await run.command(
+    'git clone --depth 1 --filter=tree:0 https://github.com/mongodb/docs-worker-pool.git --sparse',
+  );
+
+  await run.command('git sparse-checkout set --no-cone modules/persistence', {
+    cwd: WORKER_POOL_PATH,
+  });
+
+  await run.command('npm ci', {
+    cwd: PERSISTENCE_PATH,
+  });
+
+  await run.command('npm run build', {
+    cwd: PERSISTENCE_PATH,
+  });
+}
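One observation: the new onPreBuild handler destructures cache from the build utils but never uses it. A possible follow-up, sketched here under the assumption that Netlify's utils.cache.restore/utils.cache.save helpers behave as documented (nothing below is part of this change), would be to persist the sparse clone across builds:

// Hypothetical variant of the onPreBuild handler with build caching.
// workerPoolPath mirrors WORKER_POOL_PATH from persistence.ts, which is
// not exported there; it is duplicated here only for the sketch.
integration.addBuildEventHandler(
  'onPreBuild',
  async ({ utils: { cache, run } }) => {
    const workerPoolPath = `${process.cwd()}/docs-worker-pool`;
    // restore() resolves to true when a cached copy was found
    const restored = await cache.restore(workerPoolPath);
    try {
      // downloadPersistenceModule() is a no-op if the directory already exists
      await downloadPersistenceModule(run);
      if (!restored) await cache.save(workerPoolPath);
    } catch (e) {
      console.error('Unable to run the persistence module', e);
    }
  },
);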
diff --git a/persistence-module/src/update-pages.ts b/persistence-module/src/update-pages.ts
deleted file mode 100644
index 77b7fa016..000000000
--- a/persistence-module/src/update-pages.ts
+++ /dev/null
@@ -1,286 +0,0 @@
-import type { AnyBulkWriteOperation, FindCursor } from 'mongodb';
-import isEqual from 'fast-deep-equal';
-import { db } from './connector';
-import { bulkWrite } from './db-operations';
-
-interface PreviousPageMapping {
-  [key: string]: {
-    ast: PageAst;
-    static_assets: StaticAsset[];
-  };
-}
-
-export interface StaticAsset {
-  checksum: string;
-  key: string;
-  updated_at?: Date;
-}
-export interface PageAst {
-  [key: string]: unknown;
-  type: string;
-  position: Record<string, unknown>;
-  children: PageAst[];
-  fileid: string;
-  options: Record<string, unknown>;
-}
-export interface Page {
-  page_id: string;
-  filename: string;
-  ast: PageAst;
-  source: string;
-  static_assets: StaticAsset[];
-  github_username: string;
-}
-export interface UpdatedPage extends Page {
-  created_at: Date;
-  updated_at: Date;
-  deleted: boolean;
-}
-export const GITHUB_USER = 'docs-builder-bot';
-
-export const createPageAstMapping = async (docsCursor: FindCursor) => {
-  // Create mapping for page id and its AST
-  const mapping: PreviousPageMapping = {};
-  // Create set of all page ids, to be used for tracking unseen pages in the current build
-  const pageIds = new Set<string>();
-  for await (const doc of docsCursor) {
-    mapping[doc.page_id] = {
-      ast: doc.ast,
-      static_assets: doc.static_assets,
-    };
-    pageIds.add(doc.page_id);
-  }
-  return { mapping, pageIds };
-};
-
-/**
- * Finds the page documents for a given Snooty project + branch + user combination.
- * If this is the first build for the Snooty project + branch + user, no documents
- * will be found.
- *
- * @param pageIdPrefix - Includes the Snooty project name, user (docsworker-xlarge), and branch
- * @param collection - The collection to perform the find query on
- */
-const findPrevPageDocs = async (
-  pageIdPrefix: string,
-  collection: string,
-  githubUser: string,
-) => {
-  const dbSession = await db();
-  const findQuery = {
-    page_id: { $regex: new RegExp(`^${pageIdPrefix}/`) },
-    github_username: githubUser,
-    deleted: false,
-  };
-  const projection = {
-    _id: 0,
-    page_id: 1,
-    ast: 1,
-    static_assets: 1,
-  };
-
-  try {
-    return dbSession
-      .collection(collection)
-      .find(findQuery)
-      .project(projection);
-  } catch (error) {
-    console.error(
-      `Error trying to find previous page documents using prefix ${pageIdPrefix} in ${collection}: ${error}`,
-    );
-    throw error;
-  }
-};
-
-/**
- * Upserts pages in a separate collection. Copies of a page are created by page_id.
- * Updated pages within the same Snooty project name + branch should only update
- * related page documents.
- *
- * @param pages
- * @param collection
- */
-export const updatePages = async (pages: Page[], collection: string) => {
-  if (pages.length === 0) {
-    return;
-  }
-
-  try {
-    const updateTime = new Date();
-    // Find all pages that share the same project name + branch. Expects page IDs
-    // to include these two properties after parse
-    const pageIdPrefix = pages[0].page_id.split('/').slice(0, 3).join('/');
-    const previousPagesCursor = await findPrevPageDocs(
-      pageIdPrefix,
-      collection,
-      GITHUB_USER,
-    );
-    const { mapping: prevPageDocsMapping, pageIds: prevPageIds } =
-      await createPageAstMapping(previousPagesCursor);
-
-    const operations = [
-      ...checkForPageDiffs({
-        prevPageDocsMapping,
-        prevPageIds,
-        currentPages: pages,
-        updateTime,
-      }),
-      ...markUnseenPagesAsDeleted({ prevPageIds, updateTime }),
-    ];
-
-    if (operations.length > 0) {
-      await bulkWrite(operations, collection);
-    }
-  } catch (error) {
-    console.error(`Error when trying to update pages: ${error}`);
-    throw error;
-  }
-};
-
-interface MarkUnseenPagesAsDeletedParams {
-  updateTime: Date;
-  prevPageIds: Set<string>;
-}
-function markUnseenPagesAsDeleted({
-  prevPageIds,
-  updateTime,
-}: MarkUnseenPagesAsDeletedParams) {
-  const operations: AnyBulkWriteOperation[] = [];
-  prevPageIds.forEach((unseenPageId) => {
-    const operation = {
-      updateOne: {
-        filter: { page_id: unseenPageId, github_username: GITHUB_USER },
-        update: {
-          $set: {
-            deleted: true,
-            updated_at: updateTime,
-          },
-        },
-      },
-    };
-    operations.push(operation);
-  });
-  return operations;
-}
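To make the soft-delete pass concrete, here is an illustrative input/output pair for markUnseenPagesAsDeleted. The page_id is invented; its project/user/branch prefix follows the shape described in the doc comments above:

// Hypothetical example: one previously stored page was not seen in this build.
const prevPageIds = new Set<string>(['docs/docsworker-xlarge/master/unseen-page']);
const updateTime = new Date();

// markUnseenPagesAsDeleted({ prevPageIds, updateTime }) then returns:
// [
//   {
//     updateOne: {
//       filter: {
//         page_id: 'docs/docsworker-xlarge/master/unseen-page',
//         github_username: 'docs-builder-bot',
//       },
//       update: { $set: { deleted: true, updated_at: updateTime } },
//     },
//   },
// ]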
-interface CheckForPageDiffsParams {
-  currentPages: Page[];
-  updateTime: Date;
-  prevPageDocsMapping: PreviousPageMapping;
-  prevPageIds: Set<string>;
-}
-/**
- * Compares the ASTs of the current pages with the previous pages. New update
- * operations are added whenever a diff in the page ASTs is found. Page IDs are
- * removed from `prevPageIds` to signal that the previous page has been "seen".
- */
-export function checkForPageDiffs({
-  currentPages,
-  updateTime,
-  prevPageDocsMapping,
-  prevPageIds,
-}: CheckForPageDiffsParams) {
-  const operations: AnyBulkWriteOperation[] = [];
-  currentPages.forEach((page) => {
-    // Filter out rst (non-page) files
-    if (!page.filename.endsWith('.txt')) {
-      return;
-    }
-
-    const currentPageId = page.page_id;
-    prevPageIds.delete(currentPageId);
-    const prevPageData = prevPageDocsMapping[currentPageId];
-
-    // Update the document if the page's current AST differs from the previous build's.
-    // New pages should always count as having a "different" AST
-    if (isEqual(page.ast, prevPageData?.ast)) return;
-    const operation = {
-      updateOne: {
-        filter: {
-          page_id: currentPageId,
-          github_username: page.github_username,
-        },
-        update: {
-          $set: {
-            page_id: currentPageId,
-            filename: page.filename,
-            ast: page.ast,
-            static_assets: findUpdatedAssets(
-              page.static_assets,
-              updateTime,
-              prevPageData?.static_assets,
-            ),
-            updated_at: updateTime,
-            deleted: false,
-            // Track the last build ID to update the content
-          },
-          $setOnInsert: {
-            created_at: updateTime,
-          },
-        },
-        upsert: true,
-      },
-    };
-    operations.push(operation);
-  });
-  return operations;
-}
-
-/**
- * Identifies any changes in assets between the current page and its previous page.
- * A new array of static assets with their last update time is returned.
- *
- * The Snooty Data API takes an asset's `updated_at` field into account when
- * comparing against the timestamps it receives on requests for updated pages. When
- * the API sends an updated page, an asset on that page is only sent if the asset's
- * timestamp is greater than the timestamp sent in the request (denoting a change).
- * Unchanged assets with older timestamps are not sent.
- *
- * Assets that are deleted between builds are not included, since the Snooty Data API
- * does not need to return them for now.
- *
- * @param currentPageAssets
- * @param prevPageAssets
- */
-function findUpdatedAssets(
-  currentPageAssets: StaticAsset[],
-  updateTime: Date,
-  prevPageAssets?: StaticAsset[],
-) {
-  const updatedAssets: StaticAsset[] = [];
-  if (
-    currentPageAssets &&
-    currentPageAssets.length === 0 &&
-    prevPageAssets &&
-    prevPageAssets.length === 0
-  ) {
-    return updatedAssets;
-  }
-
-  const prevAssetMapping: Record<string, { key: string; updated_at: Date }> =
-    {};
-  if (prevPageAssets) {
-    prevPageAssets.forEach((asset) => {
-      prevAssetMapping[asset.checksum] = {
-        key: asset.key,
-        updated_at: asset.updated_at ?? updateTime,
-      };
-    });
-  }
-
-  currentPageAssets.forEach(({ checksum, key }) => {
-    const prevAsset = prevAssetMapping[checksum];
-    // Edge case: a previous asset can exist with the same checksum but a
-    // different key/filename. This can happen if an image is renamed
-    const isSame = prevAsset && prevAsset.key === key;
-    // Most common case: no change in asset; we keep the updated time the same
-    const timeOfUpdate = isSame ? prevAsset.updated_at : updateTime;
-    updatedAssets.push({
-      checksum,
-      key,
-      updated_at: timeOfUpdate,
-    });
-  });
-
-  return updatedAssets;
-}
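Finally, a worked example of the asset-timestamp logic in findUpdatedAssets. Checksums, keys, and dates are invented for illustration:

// Hypothetical inputs: one unchanged asset and one renamed asset.
const updateTime = new Date('2024-06-01');
const prevAssets: StaticAsset[] = [
  { checksum: 'aaa', key: 'images/logo.png', updated_at: new Date('2024-01-01') },
  { checksum: 'bbb', key: 'images/old-name.png', updated_at: new Date('2024-01-01') },
];
const currentAssets: StaticAsset[] = [
  { checksum: 'aaa', key: 'images/logo.png' }, // same checksum, same key
  { checksum: 'bbb', key: 'images/new-name.png' }, // same checksum, new key (rename)
];

// findUpdatedAssets(currentAssets, updateTime, prevAssets) returns:
// [
//   // unchanged asset keeps its previous updated_at (2024-01-01)
//   { checksum: 'aaa', key: 'images/logo.png', updated_at: new Date('2024-01-01') },
//   // renamed asset is re-stamped with the current build time (2024-06-01)
//   { checksum: 'bbb', key: 'images/new-name.png', updated_at: updateTime },
// ]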