[Webcrawler] Handle websites taking long to crawl (#5757)
* [Webcrawler] Handle websites taking long to crawl

Description
---
Some websites take a long time to crawl. This becomes a problem when a crawl exceeds 2 hours, the activity limit for our crawl: the crawl is silently aborted and retried an hour later, and this repeats 15 times before we get an 'activity timeout' monitor alert (about 2 days in total; see the sketch below).

See example in issue dust-tt/tasks#883
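
As a rough back-of-the-envelope check of the numbers above (a sketch, not code from this PR; the 15-attempt count and the ~1-hour retry backoff are the figures quoted in this description, not values read from the Temporal retry policy):

```ts
// Worst-case wall clock before an "activity timeout" monitor fires, under the
// old 2-hour activity limit described above.
const OLD_ACTIVITY_LIMIT_HOURS = 2; // old startToCloseTimeout of "120 minutes"
const RETRY_BACKOFF_HOURS = 1; // "retried an hour later"
const MAX_ATTEMPTS = 15; // "this repeats 15 times"

const worstCaseHours = MAX_ATTEMPTS * (OLD_ACTIVITY_LIMIT_HOURS + RETRY_BACKOFF_HOURS);
console.log(`${worstCaseHours}h ≈ ${(worstCaseHours / 24).toFixed(1)} days`); // 45h ≈ 1.9 days
```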

This PR fixes dust-tt/tasks#883. It:
1. clarifies the situation by raising a panic flag when the issue is clearly that the website is slow to crawl, so we don't uselessly crawl the same pages for 2 days before seeing an activity timeout (which is also less explicit than "website takes too long to crawl");
2. moves the timeout to 4 hours, which seems acceptable for slow websites with big pages (a maximum tolerated slowness of about 2 pages per minute for a 512-page crawl; see the sketch after this list);
3. decreases max-requests-per-minute from 60 to 20.
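
A quick sanity check on points 2 and 3 (a sketch; the 512-page figure is the maximum crawl size quoted above and is assumed here to correspond to WEBCRAWLER_MAX_PAGES):

```ts
// New limits introduced by this PR and what they tolerate.
const MAX_TIME_TO_CRAWL_MINUTES = 240; // new 4-hour activity ceiling
const MAX_PAGES = 512; // maximum crawl size quoted in the description
const MAX_REQUESTS_PER_MINUTE = 20; // new crawler throttle (was 60)

// Slowest tolerated crawl before the panic flag fires: ~2.1 pages per minute.
console.log((MAX_PAGES / MAX_TIME_TO_CRAWL_MINUTES).toFixed(1)); // "2.1"

// Floor imposed by the throttle alone: 512 / 20 = 25.6 minutes, far below the
// 240-minute ceiling, so the tighter rate limit cannot by itself time a crawl out.
console.log(MAX_PAGES / MAX_REQUESTS_PER_MINUTE); // 25.6
```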

Risk
---
na

Deploy
---
- deploy connectors
- Update eng runner runbook

* loglevel off
philipperolet authored Jun 20, 2024
1 parent e8581a9 commit d7591b9
Showing 2 changed files with 31 additions and 5 deletions.
32 changes: 28 additions & 4 deletions connectors/src/connectors/webcrawler/temporal/activities.ts
@@ -3,7 +3,7 @@ import type { ModelId } from "@dust-tt/types";
import { WEBCRAWLER_MAX_DEPTH, WEBCRAWLER_MAX_PAGES } from "@dust-tt/types";
import { Context } from "@temporalio/activity";
import { isCancellation } from "@temporalio/workflow";
-import { CheerioCrawler, Configuration } from "crawlee";
+import { CheerioCrawler, Configuration, LogLevel } from "crawlee";
import { Op } from "sequelize";
import turndown from "turndown";

@@ -16,7 +16,10 @@ import {
isTopFolder,
stableIdForUrl,
} from "@connectors/connectors/webcrawler/lib/utils";
-import { REQUEST_HANDLING_TIMEOUT } from "@connectors/connectors/webcrawler/temporal/workflows";
+import {
+MAX_TIME_TO_CRAWL_MINUTES,
+REQUEST_HANDLING_TIMEOUT,
+} from "@connectors/connectors/webcrawler/temporal/workflows";
import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
import {
deleteFromDataSource,
@@ -57,6 +60,7 @@ export async function markAsCrawled(connectorId: ModelId) {
}

export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
+const startCrawlingTime = Date.now();
const connector = await ConnectorResource.fetchById(connectorId);
if (!connector) {
throw new Error(`Connector ${connectorId} not found.`);
@@ -123,20 +127,39 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
webCrawlerConfig.maxPageToCrawl || WEBCRAWLER_MAX_PAGES,

maxConcurrency: CONCURRENCY,
-maxRequestsPerMinute: 60, // 5 requests per second to avoid overloading the target website
+maxRequestsPerMinute: 20, // 1 request every 3 seconds average, to avoid overloading the target website
requestHandlerTimeoutSecs: REQUEST_HANDLING_TIMEOUT,
async requestHandler({ $, request, enqueueLinks }) {
Context.current().heartbeat({
type: "http_request",
});
const currentRequestDepth = request.userData.depth || 0;

-// try-catch allowing activity cancellation by temporal (timeout, or signal)
+// try-catch allowing activity cancellation by temporal (various timeouts, or signal)
try {
await Context.current().sleep(1);
} catch (e) {
if (isCancellation(e)) {
childLogger.error("The activity was canceled. Aborting crawl.");

+// raise a panic flag if the activity is aborted because it exceeded the maximum time to crawl
+const isTooLongToCrawl =
+Date.now() - startCrawlingTime >
+1000 * 60 * (MAX_TIME_TO_CRAWL_MINUTES - 1);

+if (isTooLongToCrawl) {
+childLogger.error(
+{
+url,
+configId: webCrawlerConfig.id,
+panic: true,
+},
+`Website takes too long to crawl (crawls ${Math.round(
+pageCount / MAX_TIME_TO_CRAWL_MINUTES
+)} pages per minute)`
+);
+}

// abort crawling
await crawler.autoscaledPool?.abort();
await crawler.teardown();
@@ -314,6 +337,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
new Configuration({
purgeOnStart: true,
persistStorage: false,
+logLevel: LogLevel.OFF,
})
);

4 changes: 3 additions & 1 deletion connectors/src/connectors/webcrawler/temporal/workflows.ts
@@ -14,9 +14,11 @@ import type * as activities from "@connectors/connectors/webcrawler/temporal/act
// leeway to crawl on slow websites
export const REQUEST_HANDLING_TIMEOUT = 420;

+export const MAX_TIME_TO_CRAWL_MINUTES = 240;

const { crawlWebsiteByConnectorId, webCrawlerGarbageCollector } =
proxyActivities<typeof activities>({
startToCloseTimeout: "120 minutes",
startToCloseTimeout: `${MAX_TIME_TO_CRAWL_MINUTES} minutes`,
// for each page crawl, there are heartbeats, but a page crawl can last at max
// REQUEST_HANDLING_TIMEOUT seconds
heartbeatTimeout: `${REQUEST_HANDLING_TIMEOUT + 120} seconds`,
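
For context, a minimal restatement of how the two exported constants bound the activity (a sketch with values copied from the diff above, not additional code from the PR):

```ts
// The activity heartbeats on every request, so heartbeatTimeout only has to
// outlast the slowest single page crawl (plus slack), while startToCloseTimeout
// bounds the whole crawl at MAX_TIME_TO_CRAWL_MINUTES.
const REQUEST_HANDLING_TIMEOUT = 420; // max seconds for one page crawl
const MAX_TIME_TO_CRAWL_MINUTES = 240; // max minutes for the whole activity

const heartbeatTimeoutSecs = REQUEST_HANDLING_TIMEOUT + 120; // 540 seconds
const startToCloseTimeoutSecs = MAX_TIME_TO_CRAWL_MINUTES * 60; // 14400 seconds
console.log({ heartbeatTimeoutSecs, startToCloseTimeoutSecs });
```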
