[connectors] - fix(webcrawler): sanitize webcrawler url (#9138)
* [types] - feature: add utility to validate and standardize URLs

 - Introduce a new function to check if a URL is valid and to standardize it if so
 - Ensure that only URLs with http or https protocols are considered valid

* [front/lib/api] - refactor: use centralized validateUrl function from @dust-tt/types

 - Replace the local validateUrl function with the one imported from @dust-tt/types to ensure consistency across modules
 - Remove the duplicate validateUrl function definition from @app/lib/utils

[front/pages/api] - refactor: update document API to use centralized validateUrl

 - Switch the document API endpoint to the centralized validateUrl function from @dust-tt/types for URL validation

* [connectors/webcrawler/temporal] - fix: ensure URLs are validated and sanitized in activities

 - Implement URL validation using a new utility to ensure input URLs are valid and standardized before processing
 - Sanitize the URL in document formatting: strip query parameters and ensure lengths don't exceed preset maximums

* fix: lint/format

* [front] - refactor: streamline import of validateUrl utility

 - Consolidate the validateUrl import by removing the duplicate import statement
 - Simplify the codebase for better maintainability and readability

* [connectors] - fix: handle invalid URLs during document formatting

 - Extract document content formatting into a separate step so it can return null on invalid URLs
 - Log and skip the datasource upsert when the formatted document content is invalid

[front] - refactor: relocate validateUrl import

 - Move the validateUrl import to a different section of the code for better organization

* [connectors/webcrawler/temporal] - fix: refine error message for invalid URLs during crawl

 - Update the error message to mention both invalid URLs and invalid documents for clearer website-crawl errors
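
Taken together, the changes described above form one flow: validate the URL first, have formatting return null when validation fails, and skip the upsert on null. A minimal TypeScript sketch of that control flow — illustrative only; it borrows the validateUrl import path from the diffs below, and formatDocumentContentSketch is a hypothetical stand-in for the connector's real formatDocumentContent:

import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils";

// Hypothetical stand-in for the connector's formatDocumentContent:
// returns null instead of throwing when the URL does not validate.
function formatDocumentContentSketch(url: string, body: string): string | null {
  const validated = validateUrl(url);
  if (!validated.valid || !validated.standardized) {
    return null;
  }
  const parsed = new URL(validated.standardized);
  // Drop query parameters by rebuilding the URL from origin + pathname.
  return `URL: ${parsed.origin}${parsed.pathname}\n${body}`;
}

// Caller-side pattern from the crawler activity: log and skip on null.
const formatted = formatDocumentContentSketch("ftp://example.com", "…");
if (formatted === null) {
  console.info("Invalid document or URL. Skipping");
}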
JulesBelveze authored Dec 5, 2024
1 parent e4a70b3 commit 418082a
Showing 5 changed files with 50 additions and 31 deletions.
38 changes: 29 additions & 9 deletions connectors/src/connectors/webcrawler/temporal/activities.ts
@@ -2,6 +2,7 @@ import type { CoreAPIDataSourceDocumentSection } from "@dust-tt/types";
 import type { ModelId } from "@dust-tt/types";
 import { WEBCRAWLER_MAX_DEPTH, WEBCRAWLER_MAX_PAGES } from "@dust-tt/types";
 import { stripNullBytes } from "@dust-tt/types";
+import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils";
 import { Context } from "@temporalio/activity";
 import { isCancellation } from "@temporalio/workflow";
 import { CheerioCrawler, Configuration, LogLevel } from "crawlee";
@@ -317,14 +318,26 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
           extracted.length > 0 &&
           extracted.length <= MAX_SMALL_DOCUMENT_TXT_LEN
         ) {
+          const formattedDocumentContent = formatDocumentContent({
+            title: pageTitle,
+            content: extracted,
+            url: request.url,
+          });
+          if (!formattedDocumentContent) {
+            childLogger.info(
+              {
+                documentId,
+                configId: webCrawlerConfig.id,
+                url,
+              },
+              `Invalid document or URL. Skipping`
+            );
+            return;
+          }
           await upsertToDatasource({
             dataSourceConfig,
             documentId: documentId,
-            documentContent: formatDocumentContent({
-              title: pageTitle,
-              content: extracted,
-              url: request.url,
-            }),
+            documentContent: formattedDocumentContent,
             documentUrl: request.url,
             timestampMs: new Date().getTime(),
             tags: [`title:${stripNullBytes(pageTitle)}`],
@@ -466,18 +479,25 @@ function formatDocumentContent({
   title: string;
   content: string;
   url: string;
-}): CoreAPIDataSourceDocumentSection {
+}): CoreAPIDataSourceDocumentSection | null {
   const URL_MAX_LENGTH = 128;
   const TITLE_MAX_LENGTH = 300;
-  const parsedUrl = new URL(url);

+  const validatedUrl = validateUrl(url);
+  if (!validatedUrl.valid || !validatedUrl.standardized) {
+    return null;
+  }
+
+  const parsedUrl = new URL(validatedUrl.standardized);
   const urlWithoutQuery = `${parsedUrl.origin}/${parsedUrl.pathname}`;

   const sanitizedContent = stripNullBytes(content);
   const sanitizedTitle = stripNullBytes(title);
+  const sanitizedUrlWithoutQuery = stripNullBytes(urlWithoutQuery);

   return {
-    prefix: `URL: ${urlWithoutQuery.slice(0, URL_MAX_LENGTH)}${
-      urlWithoutQuery.length > URL_MAX_LENGTH ? "..." : ""
+    prefix: `URL: ${sanitizedUrlWithoutQuery.slice(0, URL_MAX_LENGTH)}${
+      sanitizedUrlWithoutQuery.length > URL_MAX_LENGTH ? "..." : ""
     }\n`,
     content: `TITLE: ${sanitizedTitle.substring(0, TITLE_MAX_LENGTH)}\n${sanitizedContent}`,
     sections: [],
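
As a quick illustration of the query-stripping and truncation introduced above — a sketch; stripNullBytes is assumed here to remove \0 characters, matching its name, since its implementation is not part of this diff:

// Assumed behavior of stripNullBytes; the real helper lives in @dust-tt/types.
const stripNullBytes = (s: string): string => s.replace(/\0/g, "");

const URL_MAX_LENGTH = 128;
const parsedUrl = new URL("https://example.com/docs/page?utm_source=x#top");
const urlWithoutQuery = `${parsedUrl.origin}/${parsedUrl.pathname}`;

const sanitized = stripNullBytes(urlWithoutQuery);
console.log(sanitized);
// "https://example.com//docs/page" — pathname already starts with "/",
// so this template yields a doubled slash; query and fragment are gone.

const prefix = `URL: ${sanitized.slice(0, URL_MAX_LENGTH)}${
  sanitized.length > URL_MAX_LENGTH ? "..." : ""
}\n`;
console.log(prefix); // short URL here, so no "..." suffix is appended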
2 changes: 1 addition & 1 deletion front/lib/api/data_sources.ts
@@ -34,6 +34,7 @@ import {
  Ok,
  sectionFullText,
} from "@dust-tt/types";
+import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils";
import assert from "assert";
import type { Transaction } from "sequelize";

@@ -50,7 +51,6 @@ import type { SpaceResource } from "@app/lib/resources/space_resource";
import { generateRandomModelSId } from "@app/lib/resources/string_ids";
import { ServerSideTracking } from "@app/lib/tracking/server";
import { enqueueUpsertTable } from "@app/lib/upsert_queue";
-import { validateUrl } from "@app/lib/utils";
import logger from "@app/logger/logger";
import { launchScrubDataSourceWorkflow } from "@app/poke/temporal/client";

20 changes: 0 additions & 20 deletions front/lib/utils.ts
@@ -89,26 +89,6 @@ export function formatTimestampToFriendlyDate(
  });
}

-export const validateUrl = (
-  urlString: string
-): {
-  valid: boolean;
-  standardized: string | null;
-} => {
-  let url: URL;
-  try {
-    url = new URL(urlString);
-  } catch (e) {
-    return { valid: false, standardized: null };
-  }
-
-  if (url.protocol !== "http:" && url.protocol !== "https:") {
-    return { valid: false, standardized: null };
-  }
-
-  return { valid: true, standardized: url.href };
-};
-
// from http://emailregex.com/
const EMAIL_REGEX =
  /^(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
2 changes: 1 addition & 1 deletion front/pages/api/… (document API endpoint; full path not captured)
@@ -11,6 +11,7 @@ import {
  rateLimiter,
  sectionFullText,
} from "@dust-tt/types";
+import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils";
import type { NextApiRequest, NextApiResponse } from "next";

import { withPublicAPIAuthentication } from "@app/lib/api/auth_wrappers";
@@ -23,7 +24,6 @@ import {
  enqueueUpsertDocument,
  runPostUpsertHooks,
} from "@app/lib/upsert_queue";
-import { validateUrl } from "@app/lib/utils";
import logger from "@app/logger/logger";
import { apiError, statsDClient } from "@app/logger/withlogging";
import { launchRunPostDeleteHooksWorkflow } from "@app/temporal/documents_post_process_hooks/client";
19 changes: 19 additions & 0 deletions types/src/shared/utils/url_utils.ts
@@ -0,0 +1,19 @@
+export const validateUrl = (
+  urlString: string
+): {
+  valid: boolean;
+  standardized: string | null;
+} => {
+  let url: URL;
+  try {
+    url = new URL(urlString);
+  } catch (e) {
+    return { valid: false, standardized: null };
+  }
+
+  if (url.protocol !== "http:" && url.protocol !== "https:") {
+    return { valid: false, standardized: null };
+  }
+
+  return { valid: true, standardized: url.href };
+};
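
For reference, a few hypothetical inputs and the results the function above would produce; the standardized outputs follow the WHATWG URL normalization that new URL(...).href applies:

console.log(validateUrl("https://dust.tt/path?x=1"));
// { valid: true, standardized: "https://dust.tt/path?x=1" }

console.log(validateUrl("HTTPS://Dust.tt"));
// { valid: true, standardized: "https://dust.tt/" } — scheme/host lowercased, root path added

console.log(validateUrl("ftp://dust.tt"));
// { valid: false, standardized: null } — only http and https pass the protocol check

console.log(validateUrl("not a url"));
// { valid: false, standardized: null } — the URL constructor throws, caught above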
