Commit

Merge branch 'master' into renovate/jsdom-26.x
B4nan authored Jan 20, 2025
2 parents d0224ae + 8073792 commit ec8aee3
Showing 44 changed files with 1,407 additions and 274 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/check-pr-title.yml
@@ -7,7 +7,7 @@ on:
jobs:
check_pr_title:
name: 'Check PR title'
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- uses: amannn/[email protected]
env:
2 changes: 1 addition & 1 deletion .github/workflows/docker-images.yml
@@ -26,7 +26,7 @@ env:
jobs:
trigger_ci:
name: Trigger CI
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

if: github.repository == 'apify/crawlee'

2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
@@ -14,7 +14,7 @@ jobs:
contents: write
pages: write
id-token: write
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v4
6 changes: 3 additions & 3 deletions .github/workflows/release.yml
@@ -32,7 +32,7 @@ jobs:
fail-fast: true
matrix:
# We don't test on Windows as the tests are flaky
os: [ ubuntu-latest ]
os: [ ubuntu-22.04 ]
node-version: [ 16, 18, 20, 22 ]

runs-on: ${{ matrix.os }}
@@ -86,7 +86,7 @@ jobs:
name: "Bump Crawlee: ${{ inputs.version }} version (${{ inputs.custom_version || 'n/a' }} custom version)"
if: (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'docs:'))
needs: build_and_test
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

steps:
- name: Checkout repository
@@ -173,7 +173,7 @@ jobs:

version-docs:
needs: release
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
if: (github.event.inputs.version == 'minor' || github.event.inputs.version == 'major')

steps:
10 changes: 5 additions & 5 deletions .github/workflows/test-ci.yml
@@ -21,8 +21,8 @@ jobs:
fail-fast: false
matrix:
# tests on windows are extremely unstable
# os: [ ubuntu-latest, windows-2019 ]
os: [ ubuntu-latest ]
# os: [ ubuntu-22.04, windows-2019 ]
os: [ ubuntu-22.04 ]
node-version: [ 16, 18, 20, 22 ]

steps:
@@ -83,7 +83,7 @@ jobs:
docs:
name: Docs build
if: (!contains(github.event.head_commit.message, '[skip ci]') && github.ref != 'refs/heads/master')
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Checkout Source code
uses: actions/checkout@v4
@@ -125,7 +125,7 @@ jobs:

lint:
name: Lint
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

steps:
- name: Checkout repository
@@ -168,7 +168,7 @@ jobs:
name: Release @next
if: github.event_name == 'push' && contains(github.event.ref, 'master') && (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'docs:'))
needs: build_and_test
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

steps:
- name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/test-e2e.yml
@@ -13,7 +13,7 @@ jobs:
# NPM install is done in a separate job and cached to speed up the following jobs.
build_and_test:
name: Build & Test
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

strategy:
fail-fast: false
2 changes: 1 addition & 1 deletion .github/workflows/update_new_issue.yml
@@ -8,7 +8,7 @@ on:
jobs:
label_issues:
name: Label issues
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
permissions:
issues: write

69 changes: 67 additions & 2 deletions docs/guides/proxy_management.mdx
@@ -61,7 +61,72 @@ Examples of how to use our proxy URLs with crawlers are shown below in [Crawler

All our proxy needs are managed by the <ApiLink to="core/class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> class. We create an instance using the `ProxyConfiguration` <ApiLink to="core/class/ProxyConfiguration#constructor">`constructor`</ApiLink> function based on the provided options. See the <ApiLink to="core/interface/ProxyConfigurationOptions">`ProxyConfigurationOptions`</ApiLink> for all the possible constructor options.

### Crawler integration
### Static proxy list

You can provide a static list of proxy URLs to the `proxyUrls` option. The `ProxyConfiguration` will then rotate through the provided proxies.

```javascript
const proxyConfiguration = new ProxyConfiguration({
proxyUrls: [
'http://proxy-1.com',
'http://proxy-2.com',
null // null means no proxy is used
]
});
```

This is the simplest way to use a list of proxies. Crawlee will rotate through them in a round-robin fashion.
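
As a quick sketch (illustrative only, assuming the `proxyUrls` list from the example above), the rotation can be observed by requesting proxy URLs directly:

```javascript
// Each call without a session ID moves to the next entry in the list.
const first = await proxyConfiguration.newUrl(); // e.g. 'http://proxy-1.com'
const second = await proxyConfiguration.newUrl(); // e.g. 'http://proxy-2.com'
const third = await proxyConfiguration.newUrl(); // e.g. null - this request is made without a proxy
```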

### Custom proxy function

The `ProxyConfiguration` class allows you to provide a custom function to pick a proxy URL. This is useful when you want to implement your own logic for selecting a proxy.

```javascript
const proxyConfiguration = new ProxyConfiguration({
newUrlFunction: (sessionId, { request }) => {
if (request?.url.includes('crawlee.dev')) {
return null; // for crawlee.dev, we don't use a proxy
}

return 'http://proxy-1.com'; // for all other URLs, we use this proxy
}
});
```
The `newUrlFunction` receives two parameters - `sessionId` and `options` - and returns the proxy URL as a string, or `null` when no proxy should be used.

The `sessionId` parameter is always provided and allows us to differentiate between sessions - e.g. when Crawlee recognizes your crawlers are being blocked, it automatically creates a new session with a different ID.

The `options` parameter is an object containing a <ApiLink to="core/class/Request">`Request`</ApiLink>, which is the request that will be made. Note that this object is not always available, for example when the `newUrl` function is used directly. Your custom function should therefore not rely on the `request` object being present and should provide a default behavior when it is not.
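
To make that fallback concrete, here is a small sketch (not part of the original guide; it assumes `request` is a `Request` instance you already have, and that the options object is forwarded to `newUrlFunction` as described above):

```javascript
// Called with a request: `newUrlFunction` can inspect `request.url`,
// so crawlee.dev URLs get no proxy.
const proxyForRequest = await proxyConfiguration.newUrl('my-session', { request });

// Called directly without a request: `options.request` is undefined,
// and the function above falls back to 'http://proxy-1.com'.
const proxyWithoutRequest = await proxyConfiguration.newUrl('my-session');
```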
### Tiered proxies

You can also provide a list of proxy tiers to the `ProxyConfiguration` class. This is useful when you want to switch between different proxies automatically based on the blocking behavior of the website.

:::warning
Note that the `tieredProxyUrls` option requires `ProxyConfiguration` to be used from a crawler instance ([see below](#crawler-integration)).

Using this configuration through the `newUrl` calls will not yield the expected results.
:::

```javascript
const proxyConfiguration = new ProxyConfiguration({
tieredProxyUrls: [
[null], // At first, we try to connect without a proxy
['http://okay-proxy.com'],
['http://slightly-better-proxy.com', 'http://slightly-better-proxy-2.com'],
['http://very-good-and-expensive-proxy.com'],
]
});
```
This configuration will start with no proxy, then switch to `http://okay-proxy.com` if Crawlee recognizes we're getting blocked by the target website. If that proxy is also blocked, we will switch to one of the `slightly-better-proxy` URLs. If those are blocked, we will switch to the `very-good-and-expensive-proxy.com` URL.

Crawlee also periodically probes lower-tier proxies to see if they have been unblocked, and if they have, it switches back to them.
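
Since tiered proxies only take effect when the configuration is used from a crawler, a minimal sketch of such a setup might look like this (`CheerioCrawler` is used here purely as an example):

```javascript
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    proxyConfiguration, // the tiered configuration from above
    async requestHandler({ request, proxyInfo, log }) {
        // proxyInfo describes the proxy chosen for this request,
        // which lets us observe the tier Crawlee is currently using.
        log.info(`${request.url} fetched via ${proxyInfo?.url ?? 'no proxy'}`);
    },
});

await crawler.run(['https://crawlee.dev']);
```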

## Crawler integration

`ProxyConfiguration` integrates seamlessly into <ApiLink to="http-crawler/class/HttpCrawler">`HttpCrawler`</ApiLink>, <ApiLink to="cheerio-crawler/class/CheerioCrawler">`CheerioCrawler`</ApiLink>, <ApiLink to="jsdom-crawler/class/JSDOMCrawler">`JSDOMCrawler`</ApiLink>, <ApiLink to="playwright-crawler/class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and <ApiLink to="puppeteer-crawler/class/PuppeteerCrawler">`PuppeteerCrawler`</ApiLink>.

@@ -95,7 +160,7 @@ All our proxy needs are managed by the <ApiLink to="core/class/ProxyConfiguratio

Our crawlers will now use the selected proxies for all connections.

### IP Rotation and session management
## IP Rotation and session management

&#8203;<ApiLink to="core/class/ProxyConfiguration#newUrl">`proxyConfiguration.newUrl()`</ApiLink> allows us to pass a `sessionId` parameter. It will then be used to create a `sessionId`-`proxyUrl` pair, and subsequent `newUrl()` calls with the same `sessionId` will always return the same `proxyUrl`. This is extremely useful in scraping, because we want to create the impression of a real user. See the [session management guide](../guides/session-management) and <ApiLink to="core/class/SessionPool">`SessionPool`</ApiLink> class for more information on how keeping a real session helps us avoid blocking.
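
A short sketch of that `sessionId`-`proxyUrl` pairing (illustrative only):

```javascript
// Calls with the same session ID keep returning the same proxy URL...
const proxyA1 = await proxyConfiguration.newUrl('session-a');
const proxyA2 = await proxyConfiguration.newUrl('session-a'); // same as proxyA1

// ...while a different session ID may be paired with a different proxy.
const proxyB = await proxyConfiguration.newUrl('session-b');
```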

4 changes: 2 additions & 2 deletions package.json
@@ -92,7 +92,7 @@
"cross-env": "^7.0.3",
"deep-equal": "^2.0.5",
"eslint": "^8.57.1",
"eslint-config-prettier": "^9.1.0",
"eslint-config-prettier": "^10.0.0",
"express": "^4.18.1",
"fs-extra": "^11.0.0",
"gen-esm-wrapper": "^1.1.3",
@@ -106,7 +106,7 @@
"playwright": "1.49.1",
"portastic": "^1.0.1",
"proxy": "^1.0.2",
"puppeteer": "23.11.1",
"puppeteer": "24.1.0",
"rimraf": "^6.0.0",
"tsx": "^4.4.0",
"turbo": "^2.1.0",
102 changes: 61 additions & 41 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -294,7 +294,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
/**
* Allows to keep the crawler alive even if the {@apilink RequestQueue} gets empty.
* By default, the `crawler.run()` will resolve once the queue is empty. With `keepAlive: true` it will keep running,
* waiting for more requests to come. Use `crawler.teardown()` to exit the crawler.
* waiting for more requests to come. Use `crawler.stop()` to exit the crawler gracefully, or `crawler.teardown()` to stop it immediately.
*/
keepAlive?: boolean;

@@ -874,7 +874,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
);
}

const purgeRequestQueue = options?.purgeRequestQueue ?? true;
const { purgeRequestQueue = true, ...addRequestsOptions } = options ?? {};

if (this.hasFinishedBefore) {
// When executing the run method for the second time explicitly,
@@ -896,7 +896,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
await purgeDefaultStorages({ onlyPurgeOnce: true });

if (requests) {
await this.addRequests(requests, options);
await this.addRequests(requests, addRequestsOptions);
}

await this._init();
@@ -918,6 +918,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.events.on(EventType.MIGRATING, boundPauseOnMigration);
this.events.on(EventType.ABORTING, boundPauseOnMigration);

let stats = {} as FinalStatistics;

try {
await this.autoscaledPool!.run();
} finally {
@@ -927,53 +929,71 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
process.off('SIGINT', sigintHandler);
this.events.off(EventType.MIGRATING, boundPauseOnMigration);
this.events.off(EventType.ABORTING, boundPauseOnMigration);
}

const finalStats = this.stats.calculate();
const stats = {
requestsFinished: this.stats.state.requestsFinished,
requestsFailed: this.stats.state.requestsFailed,
retryHistogram: this.stats.requestRetryHistogram,
...finalStats,
};
this.log.info('Final request statistics:', stats);
const finalStats = this.stats.calculate();
stats = {
requestsFinished: this.stats.state.requestsFinished,
requestsFailed: this.stats.state.requestsFailed,
retryHistogram: this.stats.requestRetryHistogram,
...finalStats,
};
this.log.info('Final request statistics:', stats);

if (this.stats.errorTracker.total !== 0) {
const prettify = ([count, info]: [number, string[]]) =>
`${count}x: ${info.at(-1)!.trim()} (${info[0]})`;

this.log.info(`Error analysis:`, {
totalErrors: this.stats.errorTracker.total,
uniqueErrors: this.stats.errorTracker.getUniqueErrorCount(),
mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify),
});
}

if (this.stats.errorTracker.total !== 0) {
const prettify = ([count, info]: [number, string[]]) => `${count}x: ${info.at(-1)!.trim()} (${info[0]})`;
const client = this.config.getStorageClient();

this.log.info(`Error analysis:`, {
totalErrors: this.stats.errorTracker.total,
uniqueErrors: this.stats.errorTracker.getUniqueErrorCount(),
mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify),
});
}

const client = this.config.getStorageClient();
if (client.teardown) {
let finished = false;
setTimeout(() => {
if (!finished) {
this.log.info('Waiting for the storage to write its state to file system.');
}
}, 1000);
await client.teardown();
finished = true;
}

if (client.teardown) {
let finished = false;
setTimeout(() => {
if (!finished) {
this.log.info('Waiting for the storage to write its state to file system.');
}
}, 1000);
await client.teardown();
finished = true;
periodicLogger.stop();
await this.setStatusMessage(
`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${
this.stats.state.requestsFinished
} succeeded, ${this.stats.state.requestsFailed} failed.`,
{ isStatusMessageTerminal: true, level: 'INFO' },
);
this.running = false;
this.hasFinishedBefore = true;
}

periodicLogger.stop();
await this.setStatusMessage(
`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${
this.stats.state.requestsFinished
} succeeded, ${this.stats.state.requestsFailed} failed.`,
{ isStatusMessageTerminal: true, level: 'INFO' },
);
this.running = false;
this.hasFinishedBefore = true;

return stats;
}

/**
* Gracefully stops the current run of the crawler.
*
* All the tasks active at the time of calling this method will be allowed to finish.
*/
stop(message = 'The crawler has been gracefully stopped.'): void {
// Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
this.autoscaledPool
?.pause()
// Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks.
.then(async () => this.autoscaledPool?.abort())
.then(() => this.log.info(message))
.catch((err) => {
this.log.error('An error occurred when stopping the crawler:', err);
});
}
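// Usage sketch (illustration only, not part of this commit): with `keepAlive: true`,
// a request handler can end the run gracefully once a condition is met, e.g.:
//
//   const crawler = new BasicCrawler({
//       keepAlive: true,
//       async requestHandler({ request }) {
//           if (request.url.includes('/stop')) crawler.stop();
//       },
//   });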

async getRequestQueue() {
if (!this.requestQueue && this.requestList) {
this.log.warningOnce(
@@ -16,7 +16,7 @@ const PROCESS_KILL_TIMEOUT_MILLIS = 5000;

export class PuppeteerController extends BrowserController<
typeof Puppeteer,
PuppeteerTypes.PuppeteerLaunchOptions,
PuppeteerTypes.LaunchOptions,
PuppeteerTypes.Browser,
PuppeteerNewPageOptions
> {
