Commit

Merge branch 'master' into renovate/jsdom-26.x
B4nan authored Jan 20, 2025
2 parents d0224ae + 8073792 commit ec8aee3
Showing 44 changed files with 1,407 additions and 274 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/check-pr-title.yml
@@ -7,7 +7,7 @@ on:
jobs:
check_pr_title:
name: 'Check PR title'
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- uses: amannn/[email protected]
env:
2 changes: 1 addition & 1 deletion .github/workflows/docker-images.yml
@@ -26,7 +26,7 @@ env:
jobs:
trigger_ci:
name: Trigger CI
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

if: github.repository == 'apify/crawlee'

2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
@@ -14,7 +14,7 @@ jobs:
contents: write
pages: write
id-token: write
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v4
6 changes: 3 additions & 3 deletions .github/workflows/release.yml
@@ -32,7 +32,7 @@ jobs:
fail-fast: true
matrix:
# We don't test on Windows as the tests are flaky
os: [ ubuntu-latest ]
os: [ ubuntu-22.04 ]
node-version: [ 16, 18, 20, 22 ]

runs-on: ${{ matrix.os }}
@@ -86,7 +86,7 @@ jobs:
name: "Bump Crawlee: ${{ inputs.version }} version (${{ inputs.custom_version || 'n/a' }} custom version)"
if: (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'docs:'))
needs: build_and_test
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

steps:
- name: Checkout repository
@@ -173,7 +173,7 @@ jobs:

version-docs:
needs: release
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
if: (github.event.inputs.version == 'minor' || github.event.inputs.version == 'major')

steps:
10 changes: 5 additions & 5 deletions .github/workflows/test-ci.yml
@@ -21,8 +21,8 @@ jobs:
fail-fast: false
matrix:
# tests on windows are extremely unstable
# os: [ ubuntu-latest, windows-2019 ]
os: [ ubuntu-latest ]
# os: [ ubuntu-22.04, windows-2019 ]
os: [ ubuntu-22.04 ]
node-version: [ 16, 18, 20, 22 ]

steps:
@@ -83,7 +83,7 @@ jobs:
docs:
name: Docs build
if: (!contains(github.event.head_commit.message, '[skip ci]') && github.ref != 'refs/heads/master')
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Checkout Source code
uses: actions/checkout@v4
@@ -125,7 +125,7 @@ jobs:

lint:
name: Lint
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

steps:
- name: Checkout repository
@@ -168,7 +168,7 @@ jobs:
name: Release @next
if: github.event_name == 'push' && contains(github.event.ref, 'master') && (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'docs:'))
needs: build_and_test
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

steps:
- name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/test-e2e.yml
@@ -13,7 +13,7 @@ jobs:
# NPM install is done in a separate job and cached to speed up the following jobs.
build_and_test:
name: Build & Test
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

strategy:
fail-fast: false
2 changes: 1 addition & 1 deletion .github/workflows/update_new_issue.yml
@@ -8,7 +8,7 @@ on:
jobs:
label_issues:
name: Label issues
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
permissions:
issues: write

69 changes: 67 additions & 2 deletions docs/guides/proxy_management.mdx
@@ -61,7 +61,72 @@ Examples of how to use our proxy URLs with crawlers are shown below in [Crawler

All our proxy needs are managed by the <ApiLink to="core/class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> class. We create an instance using the `ProxyConfiguration` <ApiLink to="core/class/ProxyConfiguration#constructor">`constructor`</ApiLink> function based on the provided options. See the <ApiLink to="core/interface/ProxyConfigurationOptions">`ProxyConfigurationOptions`</ApiLink> for all the possible constructor options.

### Crawler integration
### Static proxy list

You can provide a static list of proxy URLs to the `proxyUrls` option. The `ProxyConfiguration` will then rotate through the provided proxies.

```javascript
const proxyConfiguration = new ProxyConfiguration({
proxyUrls: [
'http://proxy-1.com',
'http://proxy-2.com',
null // null means no proxy is used
]
});
```

This is the simplest way to use a list of proxies. Crawlee will rotate through them in a round-robin fashion.
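
As a quick sketch (illustrative only, assuming the `proxyUrls` list from the example above), the rotation can be observed by requesting proxy URLs directly:

```javascript
// Each call without a session ID moves to the next entry in the list.
const first = await proxyConfiguration.newUrl(); // e.g. 'http://proxy-1.com'
const second = await proxyConfiguration.newUrl(); // e.g. 'http://proxy-2.com'
const third = await proxyConfiguration.newUrl(); // e.g. null - this request is made without a proxy
```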

### Custom proxy function

The `ProxyConfiguration` class allows you to provide a custom function to pick a proxy URL. This is useful when you want to implement your own logic for selecting a proxy.

```javascript
const proxyConfiguration = new ProxyConfiguration({
newUrlFunction: (sessionId, { request }) => {
if (request?.url.includes('crawlee.dev')) {
return null; // for crawlee.dev, we don't use a proxy
}

return 'http://proxy-1.com'; // for all other URLs, we use this proxy
}
});
```
The `newUrlFunction` receives two parameters - `sessionId` and `options` - and returns the proxy URL as a string, or `null` when no proxy should be used.

The `sessionId` parameter is always provided and allows us to differentiate between sessions - e.g. when Crawlee recognizes your crawlers are being blocked, it automatically creates a new session with a different ID.

The `options` parameter is an object containing a <ApiLink to="core/class/Request">`Request`</ApiLink>, which is the request that will be made. Note that this object is not always available, for example when the `newUrl` function is used directly. Your custom function should therefore not rely on the `request` object being present and should provide a default behavior when it is not.
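
To make that fallback concrete, here is a small sketch (not part of the original guide; it assumes `request` is a `Request` instance you already have, and that the options object is forwarded to `newUrlFunction` as described above):

```javascript
// Called with a request: `newUrlFunction` can inspect `request.url`,
// so crawlee.dev URLs get no proxy.
const proxyForRequest = await proxyConfiguration.newUrl('my-session', { request });

// Called directly without a request: `options.request` is undefined,
// and the function above falls back to 'http://proxy-1.com'.
const proxyWithoutRequest = await proxyConfiguration.newUrl('my-session');
```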
### Tiered proxies

You can also provide a list of proxy tiers to the `ProxyConfiguration` class. This is useful when you want to switch between different proxies automatically based on the blocking behavior of the website.

:::warning
Note that the `tieredProxyUrls` option requires `ProxyConfiguration` to be used from a crawler instance ([see below](#crawler-integration)).

Using this configuration through the `newUrl` calls will not yield the expected results.
:::

```javascript
const proxyConfiguration = new ProxyConfiguration({
tieredProxyUrls: [
[null], // At first, we try to connect without a proxy
['http://okay-proxy.com'],
['http://slightly-better-proxy.com', 'http://slightly-better-proxy-2.com'],
['http://very-good-and-expensive-proxy.com'],
]
});
```
This configuration will start with no proxy, then switch to `http://okay-proxy.com` if Crawlee recognizes we're getting blocked by the target website. If that proxy is also blocked, we will switch to one of the `slightly-better-proxy` URLs. If those are blocked, we will switch to the `very-good-and-expensive-proxy.com` URL.

Crawlee also periodically probes lower-tier proxies to see if they have been unblocked, and if they have, it switches back to them.
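
Since tiered proxies only take effect when the configuration is used from a crawler, a minimal sketch of such a setup might look like this (`CheerioCrawler` is used here purely as an example):

```javascript
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    proxyConfiguration, // the tiered configuration from above
    async requestHandler({ request, proxyInfo, log }) {
        // proxyInfo describes the proxy chosen for this request,
        // which lets us observe the tier Crawlee is currently using.
        log.info(`${request.url} fetched via ${proxyInfo?.url ?? 'no proxy'}`);
    },
});

await crawler.run(['https://crawlee.dev']);
```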

## Crawler integration

`ProxyConfiguration` integrates seamlessly into <ApiLink to="http-crawler/class/HttpCrawler">`HttpCrawler`</ApiLink>, <ApiLink to="cheerio-crawler/class/CheerioCrawler">`CheerioCrawler`</ApiLink>, <ApiLink to="jsdom-crawler/class/JSDOMCrawler">`JSDOMCrawler`</ApiLink>, <ApiLink to="playwright-crawler/class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and <ApiLink to="puppeteer-crawler/class/PuppeteerCrawler">`PuppeteerCrawler`</ApiLink>.

@@ -95,7 +160,7 @@ All our proxy needs are managed by the <ApiLink to="core/class/ProxyConfiguratio

Our crawlers will now use the selected proxies for all connections.

### IP Rotation and session management
## IP Rotation and session management

&#8203;<ApiLink to="core/class/ProxyConfiguration#newUrl">`proxyConfiguration.newUrl()`</ApiLink> allows us to pass a `sessionId` parameter. It will then be used to create a `sessionId`-`proxyUrl` pair, and subsequent `newUrl()` calls with the same `sessionId` will always return the same `proxyUrl`. This is extremely useful in scraping, because we want to create the impression of a real user. See the [session management guide](../guides/session-management) and <ApiLink to="core/class/SessionPool">`SessionPool`</ApiLink> class for more information on how keeping a real session helps us avoid blocking.
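
A short sketch of that `sessionId`-`proxyUrl` pairing (illustrative only):

```javascript
// Calls with the same session ID keep returning the same proxy URL...
const proxyA1 = await proxyConfiguration.newUrl('session-a');
const proxyA2 = await proxyConfiguration.newUrl('session-a'); // same as proxyA1

// ...while a different session ID may be paired with a different proxy.
const proxyB = await proxyConfiguration.newUrl('session-b');
```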

4 changes: 2 additions & 2 deletions package.json
@@ -92,7 +92,7 @@
"cross-env": "^7.0.3",
"deep-equal": "^2.0.5",
"eslint": "^8.57.1",
"eslint-config-prettier": "^9.1.0",
"eslint-config-prettier": "^10.0.0",
"express": "^4.18.1",
"fs-extra": "^11.0.0",
"gen-esm-wrapper": "^1.1.3",
@@ -106,7 +106,7 @@
"playwright": "1.49.1",
"portastic": "^1.0.1",
"proxy": "^1.0.2",
"puppeteer": "23.11.1",
"puppeteer": "24.1.0",
"rimraf": "^6.0.0",
"tsx": "^4.4.0",
"turbo": "^2.1.0",
102 changes: 61 additions & 41 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -294,7 +294,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
/**
* Allows to keep the crawler alive even if the {@apilink RequestQueue} gets empty.
* By default, the `crawler.run()` will resolve once the queue is empty. With `keepAlive: true` it will keep running,
* waiting for more requests to come. Use `crawler.teardown()` to exit the crawler.
* waiting for more requests to come. Use `crawler.stop()` to exit the crawler gracefully, or `crawler.teardown()` to stop it immediately.
*/
keepAlive?: boolean;

@@ -874,7 +874,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
);
}

const purgeRequestQueue = options?.purgeRequestQueue ?? true;
const { purgeRequestQueue = true, ...addRequestsOptions } = options ?? {};

if (this.hasFinishedBefore) {
// When executing the run method for the second time explicitly,
@@ -896,7 +896,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
await purgeDefaultStorages({ onlyPurgeOnce: true });

if (requests) {
await this.addRequests(requests, options);
await this.addRequests(requests, addRequestsOptions);
}

await this._init();
@@ -918,6 +918,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.events.on(EventType.MIGRATING, boundPauseOnMigration);
this.events.on(EventType.ABORTING, boundPauseOnMigration);

let stats = {} as FinalStatistics;

try {
await this.autoscaledPool!.run();
} finally {
@@ -927,53 +929,71 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
process.off('SIGINT', sigintHandler);
this.events.off(EventType.MIGRATING, boundPauseOnMigration);
this.events.off(EventType.ABORTING, boundPauseOnMigration);
}

const finalStats = this.stats.calculate();
const stats = {
requestsFinished: this.stats.state.requestsFinished,
requestsFailed: this.stats.state.requestsFailed,
retryHistogram: this.stats.requestRetryHistogram,
...finalStats,
};
this.log.info('Final request statistics:', stats);
const finalStats = this.stats.calculate();
stats = {
requestsFinished: this.stats.state.requestsFinished,
requestsFailed: this.stats.state.requestsFailed,
retryHistogram: this.stats.requestRetryHistogram,
...finalStats,
};
this.log.info('Final request statistics:', stats);

if (this.stats.errorTracker.total !== 0) {
const prettify = ([count, info]: [number, string[]]) =>
`${count}x: ${info.at(-1)!.trim()} (${info[0]})`;

this.log.info(`Error analysis:`, {
totalErrors: this.stats.errorTracker.total,
uniqueErrors: this.stats.errorTracker.getUniqueErrorCount(),
mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify),
});
}

if (this.stats.errorTracker.total !== 0) {
const prettify = ([count, info]: [number, string[]]) => `${count}x: ${info.at(-1)!.trim()} (${info[0]})`;
const client = this.config.getStorageClient();

this.log.info(`Error analysis:`, {
totalErrors: this.stats.errorTracker.total,
uniqueErrors: this.stats.errorTracker.getUniqueErrorCount(),
mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify),
});
}

const client = this.config.getStorageClient();
if (client.teardown) {
let finished = false;
setTimeout(() => {
if (!finished) {
this.log.info('Waiting for the storage to write its state to file system.');
}
}, 1000);
await client.teardown();
finished = true;
}

if (client.teardown) {
let finished = false;
setTimeout(() => {
if (!finished) {
this.log.info('Waiting for the storage to write its state to file system.');
}
}, 1000);
await client.teardown();
finished = true;
periodicLogger.stop();
await this.setStatusMessage(
`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${
this.stats.state.requestsFinished
} succeeded, ${this.stats.state.requestsFailed} failed.`,
{ isStatusMessageTerminal: true, level: 'INFO' },
);
this.running = false;
this.hasFinishedBefore = true;
}

periodicLogger.stop();
await this.setStatusMessage(
`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${
this.stats.state.requestsFinished
} succeeded, ${this.stats.state.requestsFailed} failed.`,
{ isStatusMessageTerminal: true, level: 'INFO' },
);
this.running = false;
this.hasFinishedBefore = true;

return stats;
}

/**
* Gracefully stops the current run of the crawler.
*
* All the tasks active at the time of calling this method will be allowed to finish.
*/
stop(message = 'The crawler has been gracefully stopped.'): void {
// Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
this.autoscaledPool
?.pause()
// Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks.
.then(async () => this.autoscaledPool?.abort())
.then(() => this.log.info(message))
.catch((err) => {
this.log.error('An error occurred when stopping the crawler:', err);
});
}
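// Usage sketch (illustration only, not part of this commit): with `keepAlive: true`,
// a request handler can end the run gracefully once a condition is met, e.g.:
//
//   const crawler = new BasicCrawler({
//       keepAlive: true,
//       async requestHandler({ request }) {
//           if (request.url.includes('/stop')) crawler.stop();
//       },
//   });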

async getRequestQueue() {
if (!this.requestQueue && this.requestList) {
this.log.warningOnce(
@@ -16,7 +16,7 @@ const PROCESS_KILL_TIMEOUT_MILLIS = 5000;

export class PuppeteerController extends BrowserController<
typeof Puppeteer,
PuppeteerTypes.PuppeteerLaunchOptions,
PuppeteerTypes.LaunchOptions,
PuppeteerTypes.Browser,
PuppeteerNewPageOptions
> {
