From 8b0c45c803a3d2d94cb775f25d66cf5bb546782c Mon Sep 17 00:00:00 2001 From: pgayvallet Date: Fri, 27 Dec 2024 11:30:24 +0100 Subject: [PATCH] Tweak product doc generation for 8.17 --- .../src/build_artifacts.ts | 4 - .../src/tasks/create_index.ts | 8 +- .../src/tasks/index.ts | 1 - .../src/tasks/install_elser.ts | 73 ------------------- .../src/tasks/process_documents.ts | 8 +- 5 files changed, 12 insertions(+), 82 deletions(-) delete mode 100644 x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/install_elser.ts diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/build_artifacts.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/build_artifacts.ts index 551f58bc68308..a929cb62d0bb4 100644 --- a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/build_artifacts.ts +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/build_artifacts.ts @@ -14,7 +14,6 @@ import { createTargetIndex, extractDocumentation, indexDocuments, - installElser, createChunkFiles, createArtifact, cleanupFolders, @@ -68,9 +67,6 @@ export const buildArtifacts = async (config: TaskConfig) => { await cleanupFolders({ folders: [config.buildFolder] }); - log.info('Ensuring ELSER is installed on the embedding cluster'); - await installElser({ client: embeddingClient }); - for (const productName of config.productNames) { await buildArtifact({ productName, diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_index.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_index.ts index d26ffc980f3ab..23915b3a1ab09 100644 --- a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_index.ts +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/create_index.ts @@ -8,13 +8,15 @@ import type { Client } from '@elastic/elasticsearch'; import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types'; +const DEFAULT_ELSER = '.elser-2-elasticsearch'; + const mappings: MappingTypeMapping = { dynamic: 'strict', properties: { content_title: { type: 'text' }, content_body: { type: 'semantic_text', - inference_id: 'kibana-elser2', + inference_id: DEFAULT_ELSER, }, product_name: { type: 'keyword' }, root_type: { type: 'keyword' }, @@ -24,11 +26,11 @@ const mappings: MappingTypeMapping = { ai_subtitle: { type: 'text' }, ai_summary: { type: 'semantic_text', - inference_id: 'kibana-elser2', + inference_id: DEFAULT_ELSER, }, ai_questions_answered: { type: 'semantic_text', - inference_id: 'kibana-elser2', + inference_id: DEFAULT_ELSER, }, ai_tags: { type: 'keyword' }, }, diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/index.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/index.ts index ec94e4c135c17..26497c71faba5 100644 --- a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/index.ts +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/index.ts @@ -8,7 +8,6 @@ export { extractDocumentation } from './extract_documentation'; export { indexDocuments } from './index_documents'; export { createTargetIndex } from './create_index'; -export { installElser } from './install_elser'; export { createChunkFiles } from './create_chunk_files'; export { checkConnectivity } from './check_connectivity'; export { createArtifact } from './create_artifact'; diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/install_elser.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/install_elser.ts deleted file mode 100644 index 09dc85b816191..0000000000000 --- a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/install_elser.ts +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0; you may not use this file except in compliance with the Elastic License - * 2.0. - */ - -import type { Client } from '@elastic/elasticsearch'; - -const inferenceEndpointId = 'kibana-elser2'; - -export const installElser = async ({ client }: { client: Client }) => { - const getInferenceRes = await client.inference.get( - { - task_type: 'sparse_embedding', - inference_id: 'kibana-elser2', - }, - { ignore: [404] } - ); - - const installed = (getInferenceRes.endpoints ?? []).some( - (endpoint) => endpoint.inference_id === inferenceEndpointId - ); - - if (!installed) { - await client.inference.put({ - task_type: 'sparse_embedding', - inference_id: inferenceEndpointId, - inference_config: { - service: 'elser', - service_settings: { - num_allocations: 1, - num_threads: 1, - model_id: '.elser_model_2', - }, - task_settings: {}, - }, - }); - } - - await waitUntilDeployed({ - modelId: '.elser_model_2', - client, - }); -}; - -const waitUntilDeployed = async ({ - modelId, - client, - maxRetries = 20, - delay = 2000, -}: { - modelId: string; - client: Client; - maxRetries?: number; - delay?: number; -}) => { - for (let i = 0; i < maxRetries; i++) { - const statsRes = await client.ml.getTrainedModelsStats({ - model_id: modelId, - }); - const deploymentStats = statsRes.trained_model_stats[0]?.deployment_stats; - if (!deploymentStats || deploymentStats.nodes.length === 0) { - await sleep(delay); - continue; - } - return; - } - - throw new Error(`Timeout waiting for ML model ${modelId} to be deployed`); -}; - -const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); diff --git a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/process_documents.ts b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/process_documents.ts index 69141ca167ab4..557f83fa1b781 100644 --- a/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/process_documents.ts +++ b/x-pack/packages/ai-infra/product-doc-artifact-builder/src/tasks/process_documents.ts @@ -39,7 +39,7 @@ const removeDuplicates = (documents: ExtractedDocument[]): ExtractedDocument[] = const filterEmptyDocs = (documents: ExtractedDocument[]): ExtractedDocument[] => { return documents.filter((doc) => { const tokenCount = encode(doc.content_body).length; - if (tokenCount < 100) { + if (tokenCount < 120) { return false; } return true; @@ -52,8 +52,14 @@ const processDocument = (document: ExtractedDocument) => { .replaceAll(/([a-zA-Z])edit\n/g, (match) => { return `${match[0]}\n`; }) + // remove edit links + .replaceAll(/\[\s*edit\s*\]\(\s*[^)]+\s*\)/g, '') + // remove empty links + .replaceAll('[]()', '') // limit to 2 consecutive carriage return .replaceAll(/\n\n+/g, '\n\n'); + document.content_title = document.content_title.split('|')[0].trim(); + return document; };