Commit af2966f

barjin and B4nan authored
feat: stopping the crawlers gracefully with BasicCrawler.stop() (#2792)
Allows users to call `crawler.stop()` to stop the crawler gracefully. Closes #2777

Co-authored-by: Martin Adámek <[email protected]>
1 parent 53331e8 commit af2966f
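For illustration, a minimal sketch of the new API in use, stopping the crawler from inside a request handler (the start URL and the ten-request threshold are illustrative, not part of the commit):

```ts
import { CheerioCrawler } from '@crawlee/cheerio';

let handled = 0;

const crawler = new CheerioCrawler({
    async requestHandler({ request, enqueueLinks, log }) {
        log.info(`Processing ${request.url}`);
        await enqueueLinks();
        // Stop gracefully once enough pages have been handled;
        // requests already in flight are allowed to finish.
        if (++handled >= 10) crawler.stop();
    },
});

await crawler.run(['https://crawlee.dev']);
```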

File tree

9 files changed: +159 −1 lines changed

packages/basic-crawler/src/internals/basic-crawler.ts

+18 −1
```diff
@@ -294,7 +294,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
     /**
      * Allows to keep the crawler alive even if the {@apilink RequestQueue} gets empty.
      * By default, the `crawler.run()` will resolve once the queue is empty. With `keepAlive: true` it will keep running,
-     * waiting for more requests to come. Use `crawler.teardown()` to exit the crawler.
+     * waiting for more requests to come. Use `crawler.stop()` to exit the crawler gracefully, or `crawler.teardown()` to stop it immediately.
      */
     keepAlive?: boolean;
 
@@ -977,6 +977,23 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
         return stats;
     }
 
+    /**
+     * Gracefully stops the current run of the crawler.
+     *
+     * All the tasks active at the time of calling this method will be allowed to finish.
+     */
+    stop(message = 'The crawler has been gracefully stopped.'): void {
+        // Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
+        this.autoscaledPool
+            ?.pause()
+            // Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks.
+            .then(async () => this.autoscaledPool?.abort())
+            .then(() => this.log.info(message))
+            .catch((err) => {
+                this.log.error('An error occurred when stopping the crawler:', err);
+            });
+    }
+
     async getRequestQueue() {
         if (!this.requestQueue && this.requestList) {
             this.log.warningOnce(
```
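Since the updated doc comment above pairs `keepAlive` with the new method, here is a minimal sketch of that combination (the start URL and the one-minute timer are illustrative, not part of the commit):

```ts
import { CheerioCrawler } from '@crawlee/cheerio';

const crawler = new CheerioCrawler({
    // With `keepAlive: true`, `crawler.run()` keeps waiting for new
    // requests instead of resolving once the queue drains.
    keepAlive: true,
    async requestHandler({ request, log }) {
        log.info(`Handled ${request.url}`);
    },
});

// Request a graceful shutdown from outside the crawler; active tasks
// finish, the pool is aborted, and `crawler.run()` resolves.
setTimeout(() => crawler.stop('Stopped by timer.'), 60_000);

await crawler.run(['https://crawlee.dev']);
```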
test/e2e/cheerio-stop-resume-ts/actor/.actor/actor.json (new file; path inferred from the test name)

+7

```diff
@@ -0,0 +1,7 @@
+{
+    "actorSpecification": 1,
+    "name": "test-cheerio-stop-resume-ts",
+    "version": "0.0",
+    "buildTag": "latest",
+    "env": null
+}
```
test/e2e/cheerio-stop-resume-ts/actor/.eslintrc.json (new file; path inferred)

+8

```diff
@@ -0,0 +1,8 @@
+{
+    "root": true,
+    "extends": "../../.eslintrc.json",
+    "parserOptions": {
+        "project": "./test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json",
+        "ecmaVersion": 2022
+    }
+}
```
test/e2e/cheerio-stop-resume-ts/actor/.gitignore (new file; path inferred)

+11

```diff
@@ -0,0 +1,11 @@
+.idea
+.DS_Store
+node_modules
+package-lock.json
+apify_storage
+crawlee_storage
+storage
+main.d.ts
+main.d.ts.map
+main.js
+main.js.map
```
test/e2e/cheerio-stop-resume-ts/actor/Dockerfile (new file; path inferred)

+28

```diff
@@ -0,0 +1,28 @@
+# using multistage build, as we need dev deps to build the TS source code
+FROM apify/actor-node:20-beta AS builder
+
+# copy all files, install all dependencies (including dev deps) and build the project
+COPY . ./
+RUN npm install --include=dev \
+    && npm run build
+
+# create final image
+FROM apify/actor-node:20-beta
+# copy only necessary files
+COPY --from=builder /usr/src/app/packages ./packages
+COPY --from=builder /usr/src/app/package.json ./
+COPY --from=builder /usr/src/app/main.js ./
+
+# install only prod deps
+RUN npm --quiet set progress=false \
+    && npm install --only=prod --no-optional --no-audit \
+    && npm update --no-audit \
+    && echo "Installed NPM packages:" \
+    && (npm list --only=prod --no-optional --all || true) \
+    && echo "Node.js version:" \
+    && node --version \
+    && echo "NPM version:" \
+    && npm --version
+
+# run compiled code
+CMD npm run start:prod
```
test/e2e/cheerio-stop-resume-ts/actor/main.ts (new file; path inferred)

+31

```diff
@@ -0,0 +1,31 @@
+import { CheerioCrawler, Dataset } from '@crawlee/cheerio';
+import { Actor } from 'apify';
+
+if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') {
+    // @ts-ignore
+    await Actor.init({ storage: new (await import('@apify/storage-local')).ApifyStorageLocal() });
+} else {
+    await Actor.init();
+}
+
+let requestCount = 0;
+
+const crawler = new CheerioCrawler();
+crawler.router.addDefaultHandler(async ({ $, enqueueLinks, request, log }) => {
+    const { url } = request;
+    await enqueueLinks({
+        globs: ['https://crawlee.dev/docs/**'],
+    });
+
+    const pageTitle = $('title').first().text();
+    log.info(`URL: ${url} TITLE: ${pageTitle}`);
+    await Dataset.pushData({ url, pageTitle });
+
+    if (requestCount++ > 10) crawler.stop();
+});
+
+await crawler.run(['https://crawlee.dev/docs/quick-start']);
+
+requestCount = 0;
+await crawler.run(['https://crawlee.dev/docs/quick-start'], { purgeRequestQueue: false });
+await Actor.exit({ exit: Actor.isAtHome() });
```
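In this e2e actor, the first `crawler.run()` stops itself after roughly a dozen handled requests via `crawler.stop()`; the second run reuses the same request queue (`purgeRequestQueue: false`), so it resumes with the requests the first run left unhandled rather than starting over.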
test/e2e/cheerio-stop-resume-ts/actor/package.json (new file; path inferred)

+35

```diff
@@ -0,0 +1,35 @@
+{
+    "name": "test-cheerio-stop-resume-ts",
+    "version": "0.0.1",
+    "description": "Crawler Stop-Resume Test - TypeScript",
+    "dependencies": {
+        "apify": "next",
+        "@apify/storage-local": "^2.1.3",
+        "@crawlee/basic": "file:./packages/basic-crawler",
+        "@crawlee/browser-pool": "file:./packages/browser-pool",
+        "@crawlee/http": "file:./packages/http-crawler",
+        "@crawlee/cheerio": "file:./packages/cheerio-crawler",
+        "@crawlee/core": "file:./packages/core",
+        "@crawlee/memory-storage": "file:./packages/memory-storage",
+        "@crawlee/types": "file:./packages/types",
+        "@crawlee/utils": "file:./packages/utils"
+    },
+    "overrides": {
+        "apify": {
+            "@crawlee/core": "file:./packages/core",
+            "@crawlee/types": "file:./packages/types",
+            "@crawlee/utils": "file:./packages/utils"
+        }
+    },
+    "devDependencies": {
+        "@apify/tsconfig": "^0.1.0",
+        "typescript": "^5.0.0"
+    },
+    "scripts": {
+        "start": "tsc && node main.js",
+        "start:prod": "node main.js",
+        "build": "tsc"
+    },
+    "type": "module",
+    "license": "ISC"
+}
```
test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json (new file; path inferred)

+9

```diff
@@ -0,0 +1,9 @@
+{
+    "extends": "@apify/tsconfig",
+    "compilerOptions": {
+        "module": "ES2022",
+        "target": "ES2022",
+        "lib": ["DOM"]
+    },
+    "include": ["./**/*.ts"]
+}
```
test/e2e/cheerio-stop-resume-ts/test.mjs (new file; path inferred)

+12

```diff
@@ -0,0 +1,12 @@
+import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs';
+
+const testActorDirname = getActorTestDir(import.meta.url);
+await initialize(testActorDirname);
+
+const { stats, datasetItems } = await runActor(testActorDirname);
+
+/// Some extra requests are expected (at most 10 extra for each run).
+await expect(stats.requestsFinished < 40, 'crawler.stop() works');
+
+const visitedUrls = new Set(datasetItems.map((x) => x.url));
+await expect(visitedUrls.size === datasetItems.length, 'stateful crawler.run({ purgeRQ: false }) works');
```
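These assertions encode the graceful-stop semantics: each run handles about a dozen requests before `stop()` fires, plus a few in-flight extras (per the comment in the test), so the two runs together should finish well under 40 requests; and if every dataset item has a unique URL, the resumed run did not re-process requests the first run had already handled.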
