From 1ea95f74ae1f343d4d81b8afc674d9f3fb609b77 Mon Sep 17 00:00:00 2001 From: Angie Byron <332535+webchick@users.noreply.github.com> Date: Tue, 7 Jan 2025 05:44:19 -0800 Subject: [PATCH] docs: Fix timeouts and retry limit exceeded in interim introduction example (#2791) I'm not sure if this is the correct fix, but it does fix the problem, and it uses similar logic to the final script on this page, which is working properly. (It also works if you simply comment out the `await page.waitForSelector('.collection-block-item');` line, but I assume that's there for a reason.) Old output: ``` INFO PlaywrightCrawler: All requests from the queue have been processed, the crawler will shut down. INFO PlaywrightCrawler: Final request statistics: {"requestsFinished":1,"requestsFailed":31,"retryHistogram":[1,null,null,31],"requestAvgFailedDurationMillis":30359,"requestAvgFinishedDurationMillis":1056,"requestsFinishedPerMinute":0,"requestsFailedPerMinute":6,"requestTotalDurationMillis":942186,"requestsTotal":32,"crawlerRuntimeMillis":291681} INFO PlaywrightCrawler: Error analysis: {"totalErrors":31,"uniqueErrors":1,"mostCommonErrors":["31x: page.waitForSelector: Timeout 30000ms exceeded. (/Users/webchick/TechAround/fun-with-scraping/src/main.js:8:20)"]} INFO PlaywrightCrawler: Finished! Total 32 requests: 1 succeeded, 31 failed. {"terminal":true} ``` New output: ``` INFO PlaywrightCrawler: All requests from the queue have been processed, the crawler will shut down. INFO PlaywrightCrawler: Final request statistics: {"requestsFinished":32,"requestsFailed":0,"retryHistogram":[32],"requestAvgFailedDurationMillis":null,"requestAvgFinishedDurationMillis":302,"requestsFinishedPerMinute":340,"requestsFailedPerMinute":0,"requestTotalDurationMillis":9677,"requestsTotal":32,"crawlerRuntimeMillis":5644} INFO PlaywrightCrawler: Finished! Total 32 requests: 32 succeeded, 0 failed. {"terminal":true} ``` Closes #2790 --- docs/introduction/05-crawling.mdx | 25 +++++++++++-------- .../version-3.12/introduction/05-crawling.mdx | 25 +++++++++++-------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/docs/introduction/05-crawling.mdx b/docs/introduction/05-crawling.mdx index 6621ee10b0ff..a3930c6b6be5 100644 --- a/docs/introduction/05-crawling.mdx +++ b/docs/introduction/05-crawling.mdx @@ -23,16 +23,21 @@ import { PlaywrightCrawler } from 'crawlee'; const crawler = new PlaywrightCrawler({ requestHandler: async ({ page, request, enqueueLinks }) => { console.log(`Processing: ${request.url}`); - // Wait for the category cards to render, - // otherwise enqueueLinks wouldn't enqueue anything. - await page.waitForSelector('.collection-block-item'); - - // Add links to the queue, but only from - // elements matching the provided selector. - await enqueueLinks({ - selector: '.collection-block-item', - label: 'CATEGORY', - }); + + // Only run this logic on the main category listing, not on sub-pages. + if (request.label !== 'CATEGORY') { + + // Wait for the category cards to render, + // otherwise enqueueLinks wouldn't enqueue anything. + await page.waitForSelector('.collection-block-item'); + + // Add links to the queue, but only from + // elements matching the provided selector. + await enqueueLinks({ + selector: '.collection-block-item', + label: 'CATEGORY', + }); + } }, }); diff --git a/website/versioned_docs/version-3.12/introduction/05-crawling.mdx b/website/versioned_docs/version-3.12/introduction/05-crawling.mdx index 6621ee10b0ff..a3930c6b6be5 100644 --- a/website/versioned_docs/version-3.12/introduction/05-crawling.mdx +++ b/website/versioned_docs/version-3.12/introduction/05-crawling.mdx @@ -23,16 +23,21 @@ import { PlaywrightCrawler } from 'crawlee'; const crawler = new PlaywrightCrawler({ requestHandler: async ({ page, request, enqueueLinks }) => { console.log(`Processing: ${request.url}`); - // Wait for the category cards to render, - // otherwise enqueueLinks wouldn't enqueue anything. - await page.waitForSelector('.collection-block-item'); - - // Add links to the queue, but only from - // elements matching the provided selector. - await enqueueLinks({ - selector: '.collection-block-item', - label: 'CATEGORY', - }); + + // Only run this logic on the main category listing, not on sub-pages. + if (request.label !== 'CATEGORY') { + + // Wait for the category cards to render, + // otherwise enqueueLinks wouldn't enqueue anything. + await page.waitForSelector('.collection-block-item'); + + // Add links to the queue, but only from + // elements matching the provided selector. + await enqueueLinks({ + selector: '.collection-block-item', + label: 'CATEGORY', + }); + } }, });