[connectors] - fix(webcrawler): sanitize webcrawler url (#9138)
* [types] - feature: add utility to validate and standardize URLs

 - Introduce a new function to check if a URL is valid and to standardize it if so
 - Ensure that only URLs with http or https protocols are considered valid

* [front/lib/api] - refactor: use centralized validateUrl function from @dust-tt/types

 - Replace the local validateUrl function with the one imported from @dust-tt/types to ensure consistency across modules
 - Remove the duplicate validateUrl function definition from @app/lib/utils

[front/pages/api] - refactor: update document API to use centralized validateUrl

 - Switch the document API endpoint to the centralized validateUrl function from @dust-tt/types for URL validation

* [connectors/webcrawler/temporal] - fix: ensure URLs are validated and sanitized in activities

 - Implement URL validation using a new utility to ensure input URLs are valid and standardized before processing
 - Sanitize the URL in document formatting: strip query parameters and ensure lengths don't exceed preset maximums

* fix: lint/format

* [front] - refactor: streamline import of validateUrl utility

 - Consolidate the validateUrl import by removing the duplicate import statement
 - Simplify the codebase for better maintainability and readability

* [connectors] - fix: handle invalid URLs during document formatting

 - Extract document content formatting into a separate step so it can return null on invalid URLs
 - Log and skip the datasource upsert when the formatted document content is invalid

[front] - refactor: relocate validateUrl import

 - Move the validateUrl import to a different section of the code for better organization

* [connectors/webcrawler/temporal] - fix: refine error message for invalid URLs during crawl

 - Update the error message to mention both invalid URLs and invalid documents for clearer website-crawl errors
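
Taken together, the changes described above form one flow: validate the URL first, have formatting return null when validation fails, and skip the upsert on null. A minimal TypeScript sketch of that control flow — illustrative only; it borrows the validateUrl import path from the diffs below, and formatDocumentContentSketch is a hypothetical stand-in for the connector's real formatDocumentContent:

import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils";

// Hypothetical stand-in for the connector's formatDocumentContent:
// returns null instead of throwing when the URL does not validate.
function formatDocumentContentSketch(url: string, body: string): string | null {
  const validated = validateUrl(url);
  if (!validated.valid || !validated.standardized) {
    return null;
  }
  const parsed = new URL(validated.standardized);
  // Drop query parameters by rebuilding the URL from origin + pathname.
  return `URL: ${parsed.origin}${parsed.pathname}\n${body}`;
}

// Caller-side pattern from the crawler activity: log and skip on null.
const formatted = formatDocumentContentSketch("ftp://example.com", "…");
if (formatted === null) {
  console.info("Invalid document or URL. Skipping");
}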
JulesBelveze authored Dec 5, 2024
1 parent e4a70b3 commit 418082a
Showing 5 changed files with 50 additions and 31 deletions.
38 changes: 29 additions & 9 deletions connectors/src/connectors/webcrawler/temporal/activities.ts
@@ -2,6 +2,7 @@ import type { CoreAPIDataSourceDocumentSection } from "@dust-tt/types";
 import type { ModelId } from "@dust-tt/types";
 import { WEBCRAWLER_MAX_DEPTH, WEBCRAWLER_MAX_PAGES } from "@dust-tt/types";
 import { stripNullBytes } from "@dust-tt/types";
+import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils";
 import { Context } from "@temporalio/activity";
 import { isCancellation } from "@temporalio/workflow";
 import { CheerioCrawler, Configuration, LogLevel } from "crawlee";
@@ -317,14 +318,26 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
           extracted.length > 0 &&
           extracted.length <= MAX_SMALL_DOCUMENT_TXT_LEN
         ) {
+          const formattedDocumentContent = formatDocumentContent({
+            title: pageTitle,
+            content: extracted,
+            url: request.url,
+          });
+          if (!formattedDocumentContent) {
+            childLogger.info(
+              {
+                documentId,
+                configId: webCrawlerConfig.id,
+                url,
+              },
+              `Invalid document or URL. Skipping`
+            );
+            return;
+          }
           await upsertToDatasource({
             dataSourceConfig,
             documentId: documentId,
-            documentContent: formatDocumentContent({
-              title: pageTitle,
-              content: extracted,
-              url: request.url,
-            }),
+            documentContent: formattedDocumentContent,
             documentUrl: request.url,
             timestampMs: new Date().getTime(),
             tags: [`title:${stripNullBytes(pageTitle)}`],
@@ -466,18 +479,25 @@ function formatDocumentContent({
   title: string;
   content: string;
   url: string;
-}): CoreAPIDataSourceDocumentSection {
+}): CoreAPIDataSourceDocumentSection | null {
   const URL_MAX_LENGTH = 128;
   const TITLE_MAX_LENGTH = 300;
-  const parsedUrl = new URL(url);

+  const validatedUrl = validateUrl(url);
+  if (!validatedUrl.valid || !validatedUrl.standardized) {
+    return null;
+  }
+
+  const parsedUrl = new URL(validatedUrl.standardized);
   const urlWithoutQuery = `${parsedUrl.origin}/${parsedUrl.pathname}`;

   const sanitizedContent = stripNullBytes(content);
   const sanitizedTitle = stripNullBytes(title);
+  const sanitizedUrlWithoutQuery = stripNullBytes(urlWithoutQuery);

   return {
-    prefix: `URL: ${urlWithoutQuery.slice(0, URL_MAX_LENGTH)}${
-      urlWithoutQuery.length > URL_MAX_LENGTH ? "..." : ""
+    prefix: `URL: ${sanitizedUrlWithoutQuery.slice(0, URL_MAX_LENGTH)}${
+      sanitizedUrlWithoutQuery.length > URL_MAX_LENGTH ? "..." : ""
     }\n`,
     content: `TITLE: ${sanitizedTitle.substring(0, TITLE_MAX_LENGTH)}\n${sanitizedContent}`,
     sections: [],
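
As a quick illustration of the query-stripping and truncation introduced above — a sketch; stripNullBytes is assumed here to remove \0 characters, matching its name, since its implementation is not part of this diff:

// Assumed behavior of stripNullBytes; the real helper lives in @dust-tt/types.
const stripNullBytes = (s: string): string => s.replace(/\0/g, "");

const URL_MAX_LENGTH = 128;
const parsedUrl = new URL("https://example.com/docs/page?utm_source=x#top");
const urlWithoutQuery = `${parsedUrl.origin}/${parsedUrl.pathname}`;

const sanitized = stripNullBytes(urlWithoutQuery);
console.log(sanitized);
// "https://example.com//docs/page" — pathname already starts with "/",
// so this template yields a doubled slash; query and fragment are gone.

const prefix = `URL: ${sanitized.slice(0, URL_MAX_LENGTH)}${
  sanitized.length > URL_MAX_LENGTH ? "..." : ""
}\n`;
console.log(prefix); // short URL here, so no "..." suffix is appended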
2 changes: 1 addition & 1 deletion front/lib/api/data_sources.ts
@@ -34,6 +34,7 @@ import {
  Ok,
  sectionFullText,
} from "@dust-tt/types";
+import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils";
import assert from "assert";
import type { Transaction } from "sequelize";

@@ -50,7 +51,6 @@ import type { SpaceResource } from "@app/lib/resources/space_resource";
import { generateRandomModelSId } from "@app/lib/resources/string_ids";
import { ServerSideTracking } from "@app/lib/tracking/server";
import { enqueueUpsertTable } from "@app/lib/upsert_queue";
-import { validateUrl } from "@app/lib/utils";
import logger from "@app/logger/logger";
import { launchScrubDataSourceWorkflow } from "@app/poke/temporal/client";

20 changes: 0 additions & 20 deletions front/lib/utils.ts
@@ -89,26 +89,6 @@ export function formatTimestampToFriendlyDate(
  });
}

-export const validateUrl = (
-  urlString: string
-): {
-  valid: boolean;
-  standardized: string | null;
-} => {
-  let url: URL;
-  try {
-    url = new URL(urlString);
-  } catch (e) {
-    return { valid: false, standardized: null };
-  }
-
-  if (url.protocol !== "http:" && url.protocol !== "https:") {
-    return { valid: false, standardized: null };
-  }
-
-  return { valid: true, standardized: url.href };
-};
-
// from http://emailregex.com/
const EMAIL_REGEX =
  /^(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
2 changes: 1 addition & 1 deletion front/pages/api/… (document API endpoint; full path not captured)
@@ -11,6 +11,7 @@ import {
  rateLimiter,
  sectionFullText,
} from "@dust-tt/types";
+import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils";
import type { NextApiRequest, NextApiResponse } from "next";

import { withPublicAPIAuthentication } from "@app/lib/api/auth_wrappers";
@@ -23,7 +24,6 @@ import {
  enqueueUpsertDocument,
  runPostUpsertHooks,
} from "@app/lib/upsert_queue";
-import { validateUrl } from "@app/lib/utils";
import logger from "@app/logger/logger";
import { apiError, statsDClient } from "@app/logger/withlogging";
import { launchRunPostDeleteHooksWorkflow } from "@app/temporal/documents_post_process_hooks/client";
19 changes: 19 additions & 0 deletions types/src/shared/utils/url_utils.ts
@@ -0,0 +1,19 @@
+export const validateUrl = (
+  urlString: string
+): {
+  valid: boolean;
+  standardized: string | null;
+} => {
+  let url: URL;
+  try {
+    url = new URL(urlString);
+  } catch (e) {
+    return { valid: false, standardized: null };
+  }
+
+  if (url.protocol !== "http:" && url.protocol !== "https:") {
+    return { valid: false, standardized: null };
+  }
+
+  return { valid: true, standardized: url.href };
+};
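
For reference, a few hypothetical inputs and the results the function above would produce; the standardized outputs follow the WHATWG URL normalization that new URL(...).href applies:

console.log(validateUrl("https://dust.tt/path?x=1"));
// { valid: true, standardized: "https://dust.tt/path?x=1" }

console.log(validateUrl("HTTPS://Dust.tt"));
// { valid: true, standardized: "https://dust.tt/" } — scheme/host lowercased, root path added

console.log(validateUrl("ftp://dust.tt"));
// { valid: false, standardized: null } — only http and https pass the protocol check

console.log(validateUrl("not a url"));
// { valid: false, standardized: null } — the URL constructor throws, caught above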
