Skip to content

Commit

Permalink
Report webcrawler synchronization limit error (#9763)
Browse files Browse the repository at this point in the history
* Introduce webcrawling_synchronization_limit_reached

* Fix failure chips + add one for limit reached

* Update connectors/src/connectors/webcrawler/temporal/activities.ts

Co-authored-by: devloai[bot] <168258904+devloai[bot]@users.noreply.github.com>

* Update types/src/front/lib/connectors_api.ts

Co-authored-by: devloai[bot] <168258904+devloai[bot]@users.noreply.github.com>

* Update front/components/data_source/DataSourceSyncChip.tsx

Co-authored-by: devloai[bot] <168258904+devloai[bot]@users.noreply.github.com>

* Update front/components/data_source/DataSourceSyncChip.tsx

Co-authored-by: devloai[bot] <168258904+devloai[bot]@users.noreply.github.com>

* nit

* fix typo

---------

Co-authored-by: devloai[bot] <168258904+devloai[bot]@users.noreply.github.com>
  • Loading branch information
spolu and devloai[bot] authored Jan 6, 2025
1 parent 86bc945 commit 2ab9e85
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 26 deletions.
11 changes: 7 additions & 4 deletions connectors/src/connectors/webcrawler/temporal/activities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
let upsertingError = 0;
const createdFolders = new Set<string>();

const maxRequestsPerCrawl =
webCrawlerConfig.maxPageToCrawl || WEBCRAWLER_MAX_PAGES;

const crawler = new CheerioCrawler(
{
navigationTimeoutSecs: 10,
Expand Down Expand Up @@ -149,9 +152,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
}
},
],
maxRequestsPerCrawl:
webCrawlerConfig.maxPageToCrawl || WEBCRAWLER_MAX_PAGES,

maxRequestsPerCrawl,
maxConcurrency: CONCURRENCY,
maxRequestsPerMinute: 20, // 1 request every 3 seconds average, to avoid overloading the target website
requestHandlerTimeoutSecs: REQUEST_HANDLING_TIMEOUT,
Expand Down Expand Up @@ -451,7 +452,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
"Webcrawler activity started"
);

await crawler.run([url]);
const stats = await crawler.run([url]);

await crawler.teardown();

Expand All @@ -469,6 +470,8 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
await syncFailed(connector.id, "webcrawling_error_empty_content");
} else if (pageCount.valid === 0) {
await syncFailed(connector.id, "webcrawling_error");
} else if (stats.requestsFinished >= maxRequestsPerCrawl) {
await syncFailed(connector.id, "webcrawling_synchronization_limit_reached");
} else {
await syncSucceeded(connector.id);
}
Expand Down
41 changes: 19 additions & 22 deletions front/components/data_source/DataSourceSyncChip.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ export default function ConnectorSyncingChip({
}

if (connector.errorType) {
let label = "";
switch (connector.errorType) {
case "oauth_token_revoked":
return (
Expand All @@ -65,43 +64,41 @@ export default function ConnectorSyncingChip({
/>
);
case "webcrawling_error_content_too_large":
label = "Page too large: this page contains too much data.";
return (
<Tooltip
label={label}
className="max-w-md"
trigger={
<Chip className="w-36" color="warning">
<div className="w-full truncate">{label}</div>
</Chip>
label={
"The synchronization failed because too many excessively large pages were found."
}
className="max-w-md"
trigger={<Chip color="warning">Pages too large</Chip>}
/>
);
case "webcrawling_error_empty_content":
label =
"Unable to read: this site's content loads in a way we can't read.";
return (
<Tooltip
label={label}
label={"The synchronization failed to retrieve any content."}
className="max-w-md"
trigger={
<Chip className="w-36" color="warning">
<div className="w-full truncate">{label}</div>
</Chip>
}
trigger={<Chip color="warning">Empty content</Chip>}
/>
);
case "webcrawling_error_blocked":
label = "Access denied: the site blocks automated visits.";
return (
<Tooltip
label={label}
label={
"The synchronization failed because the websites blocks automated visits."
}
className="max-w-md"
trigger={
<Chip className="w-36" color="warning">
<div className="w-full truncate">{label}</div>
</Chip>
trigger={<Chip color="warning">Access blocked</Chip>}
/>
);
case "webcrawling_synchronization_limit_reached":
return (
<Tooltip
label={
"The website synchronization reached the maximum page limit."
}
className="max-w-md"
trigger={<Chip color="warning">Limit reached</Chip>}
/>
);
case "webcrawling_error":
Expand Down
1 change: 1 addition & 0 deletions types/src/front/lib/connectors_api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ const CONNECTORS_ERROR_TYPES = [
"webcrawling_error_empty_content",
"webcrawling_error_content_too_large",
"webcrawling_error_blocked",
"webcrawling_synchronization_limit_reached",
"remote_database_connection_not_readonly",
"remote_database_network_error",
] as const;
Expand Down

0 comments on commit 2ab9e85

Please sign in to comment.