From d7591b9812a12abf807110dfb6c16ffa6946be33 Mon Sep 17 00:00:00 2001
From: Philippe Rolet
Date: Thu, 20 Jun 2024 14:55:48 +0200
Subject: [PATCH] [Webcrawler] Handle websites taking long to crawl (#5757)

* [Webcrawler] Handle websites taking long to crawl

Description
---
Some websites may take a long time to crawl. This becomes an issue when crawling exceeds 2 hours, which is the activity's startToCloseTimeout: the crawl is silently aborted and retried an hour later, and this repeats 15 times before we get an 'activity timeout' monitor (so roughly 2 days).

See the example in https://github.com/dust-tt/tasks/issues/883

This PR fixes https://github.com/dust-tt/tasks/issues/883. It:
1. clarifies the situation by raising a panic flag when the issue is clearly that the website takes too long to crawl, so we don't uselessly crawl the same pages for 2 days before seeing an activity timeout (which is, by the way, a less clear signal than "website takes too long to crawl"); a sketch of the check follows this list;
2. moves the timeout to 4 hours, which seems acceptable for slow websites with big pages (a maximum tolerated slowness of about 2 pages per minute for a 512-page crawl);
3. decreases max-requests-per-minute; a short sanity check of the numbers in 2. and 3. follows the diff.
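
In a nutshell: the activity now records its start time and, when Temporal cancels it, checks whether it is within a minute of its time budget; if so, it logs a panic instead of letting the crawl be retried silently. Below is a minimal sketch of that check; the helper name and standalone form are illustrative only, the actual change lives inline in the crawler's requestHandler (see activities.ts in the diff).

    // Illustrative helper mirroring the check added in activities.ts (not part of the diff).
    const MAX_TIME_TO_CRAWL_MINUTES = 240; // same constant the workflow uses for its startToCloseTimeout

    function isTooLongToCrawl(startCrawlingTime: number, now = Date.now()): boolean {
      // One minute of margin: a cancellation this close to the budget almost
      // certainly means the activity hit its startToClose timeout.
      return now - startCrawlingTime > 1000 * 60 * (MAX_TIME_TO_CRAWL_MINUTES - 1);
    }

    // Inside the cancellation handler (pageCount = pages crawled so far):
    // if (isTooLongToCrawl(startCrawlingTime)) {
    //   childLogger.error(
    //     { panic: true },
    //     `Website takes too long to crawl (crawls ${Math.round(
    //       pageCount / MAX_TIME_TO_CRAWL_MINUTES
    //     )} pages per minute)`
    //   );
    // }

The panic: true field in the log payload is the "panic flag" mentioned in point 1.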
Risk
---
na

Deploy
---
- deploy connectors
- Update eng runner runbook

* loglevel off
---
 .../webcrawler/temporal/activities.ts         | 32 ++++++++++++++++---
 .../webcrawler/temporal/workflows.ts          |  4 ++-
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/connectors/src/connectors/webcrawler/temporal/activities.ts b/connectors/src/connectors/webcrawler/temporal/activities.ts
index 03830cf89ec6..0947701179bb 100644
--- a/connectors/src/connectors/webcrawler/temporal/activities.ts
+++ b/connectors/src/connectors/webcrawler/temporal/activities.ts
@@ -3,7 +3,7 @@ import type { ModelId } from "@dust-tt/types";
 import { WEBCRAWLER_MAX_DEPTH, WEBCRAWLER_MAX_PAGES } from "@dust-tt/types";
 import { Context } from "@temporalio/activity";
 import { isCancellation } from "@temporalio/workflow";
-import { CheerioCrawler, Configuration } from "crawlee";
+import { CheerioCrawler, Configuration, LogLevel } from "crawlee";
 import { Op } from "sequelize";
 import turndown from "turndown";
 
@@ -16,7 +16,10 @@ import {
   isTopFolder,
   stableIdForUrl,
 } from "@connectors/connectors/webcrawler/lib/utils";
-import { REQUEST_HANDLING_TIMEOUT } from "@connectors/connectors/webcrawler/temporal/workflows";
+import {
+  MAX_TIME_TO_CRAWL_MINUTES,
+  REQUEST_HANDLING_TIMEOUT,
+} from "@connectors/connectors/webcrawler/temporal/workflows";
 import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
 import {
   deleteFromDataSource,
@@ -57,6 +60,7 @@ export async function markAsCrawled(connectorId: ModelId) {
 }
 
 export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
+  const startCrawlingTime = Date.now();
   const connector = await ConnectorResource.fetchById(connectorId);
   if (!connector) {
     throw new Error(`Connector ${connectorId} not found.`);
@@ -123,7 +127,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
         webCrawlerConfig.maxPageToCrawl || WEBCRAWLER_MAX_PAGES,
 
       maxConcurrency: CONCURRENCY,
-      maxRequestsPerMinute: 60, // 5 requests per second to avoid overloading the target website
+      maxRequestsPerMinute: 20, // 1 request every 3 seconds average, to avoid overloading the target website
       requestHandlerTimeoutSecs: REQUEST_HANDLING_TIMEOUT,
       async requestHandler({ $, request, enqueueLinks }) {
         Context.current().heartbeat({
@@ -131,12 +135,31 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
         });
         const currentRequestDepth = request.userData.depth || 0;
 
-        // try-catch allowing activity cancellation by temporal (timeout, or signal)
+        // try-catch allowing activity cancellation by temporal (various timeouts, or signal)
         try {
           await Context.current().sleep(1);
         } catch (e) {
           if (isCancellation(e)) {
             childLogger.error("The activity was canceled. Aborting crawl.");
+
+            // raise a panic flag if the activity is aborted because it exceeded the maximum time to crawl
+            const isTooLongToCrawl =
+              Date.now() - startCrawlingTime >
+              1000 * 60 * (MAX_TIME_TO_CRAWL_MINUTES - 1);
+
+            if (isTooLongToCrawl) {
+              childLogger.error(
+                {
+                  url,
+                  configId: webCrawlerConfig.id,
+                  panic: true,
+                },
+                `Website takes too long to crawl (crawls ${Math.round(
+                  pageCount / MAX_TIME_TO_CRAWL_MINUTES
+                )} pages per minute)`
+              );
+            }
+
             // abort crawling
             await crawler.autoscaledPool?.abort();
             await crawler.teardown();
@@ -314,6 +337,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
     new Configuration({
       purgeOnStart: true,
       persistStorage: false,
+      logLevel: LogLevel.OFF,
     })
   );
 
diff --git a/connectors/src/connectors/webcrawler/temporal/workflows.ts b/connectors/src/connectors/webcrawler/temporal/workflows.ts
index 862a1223a99c..458d4d0111d7 100644
--- a/connectors/src/connectors/webcrawler/temporal/workflows.ts
+++ b/connectors/src/connectors/webcrawler/temporal/workflows.ts
@@ -14,9 +14,11 @@ import type * as activities from "@connectors/connectors/webcrawler/temporal/act
 // leeway to crawl on slow websites
 export const REQUEST_HANDLING_TIMEOUT = 420;
 
+export const MAX_TIME_TO_CRAWL_MINUTES = 240;
+
 const { crawlWebsiteByConnectorId, webCrawlerGarbageCollector } =
   proxyActivities<typeof activities>({
-    startToCloseTimeout: "120 minutes",
+    startToCloseTimeout: `${MAX_TIME_TO_CRAWL_MINUTES} minutes`,
     // for each page crawl, there are heartbeats, but a page crawl can last at max
     // REQUEST_HANDLING_TIMEOUT seconds
     heartbeatTimeout: `${REQUEST_HANDLING_TIMEOUT + 120} seconds`,
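
Closing note: a quick sanity check of the numbers behind points 2 and 3 of the description. Plain TypeScript arithmetic; the 512-page maximum is the figure quoted in the description (assumed here to be the WEBCRAWLER_MAX_PAGES default).

    const WEBCRAWLER_MAX_PAGES = 512;      // max pages per crawl, per the description above
    const MAX_TIME_TO_CRAWL_MINUTES = 240; // 4-hour budget introduced by this patch
    const MAX_REQUESTS_PER_MINUTE = 20;    // new throttle, ~1 request every 3 seconds

    // Slowest tolerated website: ~2.1 pages per minute before the crawl exceeds its budget.
    console.log(WEBCRAWLER_MAX_PAGES / MAX_TIME_TO_CRAWL_MINUTES);

    // Time a maximal crawl needs at the throttle ceiling: ~26 minutes, comfortably inside 240.
    console.log(WEBCRAWLER_MAX_PAGES / MAX_REQUESTS_PER_MINUTE);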