From d7591b9812a12abf807110dfb6c16ffa6946be33 Mon Sep 17 00:00:00 2001
From: Philippe Rolet
Date: Thu, 20 Jun 2024 14:55:48 +0200
Subject: [PATCH] [Webcrawler] Handle websites taking long to crawl (#5757)

* [Webcrawler] Handle websites taking long to crawl

Description
---
Some websites may take a long time to crawl. This becomes an issue when crawling exceeds 2 hours, which is the activity's startToCloseTimeout: the crawl is silently aborted and retried an hour later, and this repeats 15 times before we get an 'activity timeout' monitor (so roughly 2 days).

See the example in https://github.com/dust-tt/tasks/issues/883

This PR fixes https://github.com/dust-tt/tasks/issues/883. It:
1. clarifies the situation by raising a panic flag when the issue is clearly that the website takes too long to crawl, so we don't uselessly crawl the same pages for 2 days before seeing an activity timeout (which is, by the way, a less clear signal than "website takes too long to crawl"); a sketch of the check follows this list;
2. moves the timeout to 4 hours, which seems acceptable for slow websites with big pages (a maximum tolerated slowness of about 2 pages per minute for a 512-page crawl);
3. decreases max-requests-per-minute; a short sanity check of the numbers in 2. and 3. follows the diff.
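
In a nutshell: the activity now records its start time and, when Temporal cancels it, checks whether it is within a minute of its time budget; if so, it logs a panic instead of letting the crawl be retried silently. Below is a minimal sketch of that check; the helper name and standalone form are illustrative only, the actual change lives inline in the crawler's requestHandler (see activities.ts in the diff).

    // Illustrative helper mirroring the check added in activities.ts (not part of the diff).
    const MAX_TIME_TO_CRAWL_MINUTES = 240; // same constant the workflow uses for its startToCloseTimeout

    function isTooLongToCrawl(startCrawlingTime: number, now = Date.now()): boolean {
      // One minute of margin: a cancellation this close to the budget almost
      // certainly means the activity hit its startToClose timeout.
      return now - startCrawlingTime > 1000 * 60 * (MAX_TIME_TO_CRAWL_MINUTES - 1);
    }

    // Inside the cancellation handler (pageCount = pages crawled so far):
    // if (isTooLongToCrawl(startCrawlingTime)) {
    //   childLogger.error(
    //     { panic: true },
    //     `Website takes too long to crawl (crawls ${Math.round(
    //       pageCount / MAX_TIME_TO_CRAWL_MINUTES
    //     )} pages per minute)`
    //   );
    // }

The panic: true field in the log payload is the "panic flag" mentioned in point 1.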
Risk
---
na

Deploy
---
- deploy connectors
- Update eng runner runbook

* loglevel off
---
 .../webcrawler/temporal/activities.ts         | 32 ++++++++++++++++---
 .../webcrawler/temporal/workflows.ts          |  4 ++-
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/connectors/src/connectors/webcrawler/temporal/activities.ts b/connectors/src/connectors/webcrawler/temporal/activities.ts
index 03830cf89ec6..0947701179bb 100644
--- a/connectors/src/connectors/webcrawler/temporal/activities.ts
+++ b/connectors/src/connectors/webcrawler/temporal/activities.ts
@@ -3,7 +3,7 @@ import type { ModelId } from "@dust-tt/types";
 import { WEBCRAWLER_MAX_DEPTH, WEBCRAWLER_MAX_PAGES } from "@dust-tt/types";
 import { Context } from "@temporalio/activity";
 import { isCancellation } from "@temporalio/workflow";
-import { CheerioCrawler, Configuration } from "crawlee";
+import { CheerioCrawler, Configuration, LogLevel } from "crawlee";
 import { Op } from "sequelize";
 import turndown from "turndown";
 
@@ -16,7 +16,10 @@ import {
   isTopFolder,
   stableIdForUrl,
 } from "@connectors/connectors/webcrawler/lib/utils";
-import { REQUEST_HANDLING_TIMEOUT } from "@connectors/connectors/webcrawler/temporal/workflows";
+import {
+  MAX_TIME_TO_CRAWL_MINUTES,
+  REQUEST_HANDLING_TIMEOUT,
+} from "@connectors/connectors/webcrawler/temporal/workflows";
 import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
 import {
   deleteFromDataSource,
@@ -57,6 +60,7 @@ export async function markAsCrawled(connectorId: ModelId) {
 }
 
 export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
+  const startCrawlingTime = Date.now();
   const connector = await ConnectorResource.fetchById(connectorId);
   if (!connector) {
     throw new Error(`Connector ${connectorId} not found.`);
@@ -123,7 +127,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
         webCrawlerConfig.maxPageToCrawl || WEBCRAWLER_MAX_PAGES,
 
       maxConcurrency: CONCURRENCY,
-      maxRequestsPerMinute: 60, // 5 requests per second to avoid overloading the target website
+      maxRequestsPerMinute: 20, // 1 request every 3 seconds average, to avoid overloading the target website
       requestHandlerTimeoutSecs: REQUEST_HANDLING_TIMEOUT,
       async requestHandler({ $, request, enqueueLinks }) {
         Context.current().heartbeat({
@@ -131,12 +135,31 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
         });
         const currentRequestDepth = request.userData.depth || 0;
 
-        // try-catch allowing activity cancellation by temporal (timeout, or signal)
+        // try-catch allowing activity cancellation by temporal (various timeouts, or signal)
         try {
           await Context.current().sleep(1);
         } catch (e) {
           if (isCancellation(e)) {
             childLogger.error("The activity was canceled. Aborting crawl.");
+
+            // raise a panic flag if the activity is aborted because it exceeded the maximum time to crawl
+            const isTooLongToCrawl =
+              Date.now() - startCrawlingTime >
+              1000 * 60 * (MAX_TIME_TO_CRAWL_MINUTES - 1);
+
+            if (isTooLongToCrawl) {
+              childLogger.error(
+                {
+                  url,
+                  configId: webCrawlerConfig.id,
+                  panic: true,
+                },
+                `Website takes too long to crawl (crawls ${Math.round(
+                  pageCount / MAX_TIME_TO_CRAWL_MINUTES
+                )} pages per minute)`
+              );
+            }
+
             // abort crawling
             await crawler.autoscaledPool?.abort();
             await crawler.teardown();
@@ -314,6 +337,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
     new Configuration({
       purgeOnStart: true,
       persistStorage: false,
+      logLevel: LogLevel.OFF,
     })
   );
 
diff --git a/connectors/src/connectors/webcrawler/temporal/workflows.ts b/connectors/src/connectors/webcrawler/temporal/workflows.ts
index 862a1223a99c..458d4d0111d7 100644
--- a/connectors/src/connectors/webcrawler/temporal/workflows.ts
+++ b/connectors/src/connectors/webcrawler/temporal/workflows.ts
@@ -14,9 +14,11 @@ import type * as activities from "@connectors/connectors/webcrawler/temporal/act
 // leeway to crawl on slow websites
 export const REQUEST_HANDLING_TIMEOUT = 420;
 
+export const MAX_TIME_TO_CRAWL_MINUTES = 240;
+
 const { crawlWebsiteByConnectorId, webCrawlerGarbageCollector } =
   proxyActivities<typeof activities>({
-    startToCloseTimeout: "120 minutes",
+    startToCloseTimeout: `${MAX_TIME_TO_CRAWL_MINUTES} minutes`,
     // for each page crawl, there are heartbeats, but a page crawl can last at max
     // REQUEST_HANDLING_TIMEOUT seconds
     heartbeatTimeout: `${REQUEST_HANDLING_TIMEOUT + 120} seconds`,
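
Closing note: a quick sanity check of the numbers behind points 2 and 3 of the description. Plain TypeScript arithmetic; the 512-page maximum is the figure quoted in the description (assumed here to be the WEBCRAWLER_MAX_PAGES default).

    const WEBCRAWLER_MAX_PAGES = 512;      // max pages per crawl, per the description above
    const MAX_TIME_TO_CRAWL_MINUTES = 240; // 4-hour budget introduced by this patch
    const MAX_REQUESTS_PER_MINUTE = 20;    // new throttle, ~1 request every 3 seconds

    // Slowest tolerated website: ~2.1 pages per minute before the crawl exceeds its budget.
    console.log(WEBCRAWLER_MAX_PAGES / MAX_TIME_TO_CRAWL_MINUTES);

    // Time a maximal crawl needs at the throttle ceiling: ~26 minutes, comfortably inside 240.
    console.log(WEBCRAWLER_MAX_PAGES / MAX_REQUESTS_PER_MINUTE);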