diff --git a/test/e2e/cheerio-impit-ts/actor/.actor/actor.json b/test/e2e/cheerio-impit-ts/actor/.actor/actor.json
new file mode 100644
index 000000000000..99324939c355
--- /dev/null
+++ b/test/e2e/cheerio-impit-ts/actor/.actor/actor.json
@@ -0,0 +1,7 @@
+{
+    "actorSpecification": 1,
+    "name": "test-cheerio-impit-ts",
+    "version": "0.0",
+    "buildTag": "latest",
+    "env": null
+}
diff --git a/test/e2e/cheerio-impit-ts/actor/.eslintrc.json b/test/e2e/cheerio-impit-ts/actor/.eslintrc.json
new file mode 100644
index 000000000000..629299be5fe5
--- /dev/null
+++ b/test/e2e/cheerio-impit-ts/actor/.eslintrc.json
@@ -0,0 +1,13 @@
+{
+    "root": true,
+    "extends": "../../.eslintrc.json",
+    "parserOptions": {
+        "project": "./test/e2e/cheerio-impit-ts/actor/tsconfig.json",
+        "ecmaVersion": 2022
+    },
+    "rules": {
+        "no-empty-function": "off",
+        "@typescript-eslint/no-explicit-any": "off",
+        "no-constant-condition": "off"
+    }
+}
diff --git a/test/e2e/cheerio-impit-ts/actor/.gitignore b/test/e2e/cheerio-impit-ts/actor/.gitignore
new file mode 100644
index 000000000000..f2fc11c72bcc
--- /dev/null
+++ b/test/e2e/cheerio-impit-ts/actor/.gitignore
@@ -0,0 +1,11 @@
+.idea
+.DS_Store
+node_modules
+package-lock.json
+apify_storage
+crawlee_storage
+storage
+main.d.ts
+main.d.ts.map
+main.js
+main.js.map
diff --git a/test/e2e/cheerio-impit-ts/actor/Dockerfile b/test/e2e/cheerio-impit-ts/actor/Dockerfile
new file mode 100644
index 000000000000..91fadb14630b
--- /dev/null
+++ b/test/e2e/cheerio-impit-ts/actor/Dockerfile
@@ -0,0 +1,30 @@
+FROM node:20 AS builder
+
+COPY /packages ./packages
+COPY /package*.json ./
+COPY /tsconfig.json ./
+COPY /main.ts ./
+RUN npm --quiet set progress=false \
+    && npm install --only=prod --no-optional --no-audit \
+    && npm update \
+    && npm run build
+
+FROM lwthiker/curl-impersonate
+
+COPY --from=builder /usr/local/bin /usr/local/bin
+COPY --from=builder /usr/local/lib/node_modules/npm /usr/local/lib/node_modules/npm
+COPY --from=builder /node_modules ./node_modules
+COPY --from=builder /packages ./packages
+COPY --from=builder /package*.json ./
+COPY --from=builder /main.js ./
+COPY /.actor ./.actor
+
+RUN echo "Installed NPM packages:" \
+    && (npm list --only=prod --no-optional --all || true) \
+    && echo "Node.js version:" \
+    && node --version \
+    && echo "NPM version:" \
+    && npm --version
+
+# run compiled code
+CMD npm run start:prod
diff --git a/test/e2e/cheerio-impit-ts/actor/main.ts b/test/e2e/cheerio-impit-ts/actor/main.ts
new file mode 100644
index 000000000000..68a4a52dd1c6
--- /dev/null
+++ b/test/e2e/cheerio-impit-ts/actor/main.ts
@@ -0,0 +1,44 @@
+import { CheerioCrawler, Dictionary } from '@crawlee/cheerio';
+import { Actor } from 'apify';
+import { ImpitHttpClient, Browser } from '@crawlee/impit-client';
+
+// Initialize the Actor with ApifyStorageLocal when STORAGE_IMPLEMENTATION is 'LOCAL', otherwise with defaults.
+if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') {
+    // @ts-ignore
+    await Actor.init({ storage: new (await import('@apify/storage-local')).ApifyStorageLocal() });
+} else {
+    await Actor.init();
+}
+
+const crawler = new CheerioCrawler({
+    async requestHandler(context) {
+        // Request the UUID endpoint twice: once with the default response type and once parsed as JSON.
+        const { body: text } = await context.sendRequest({
+            url: 'https://httpbin.org/uuid',
+        });
+
+        const { body: json } = await context.sendRequest({
+            url: 'https://httpbin.org/uuid',
+            responseType: 'json',
+        });
+
+        const { body: ua } = await context.sendRequest({
+            url: 'https://httpbin.org/user-agent',
+            responseType: 'json',
+        });
+
+        await context.pushData({
+            body: context.body,
+            title: context.$('title').text(),
+            userAgent: ua['user-agent'],
+            uuidTextResponse: text,
+            uuidJsonResponse: json,
+        });
+    },
+    // Use the Impit HTTP client, configured with Browser.Firefox.
+    httpClient: new ImpitHttpClient({ browser: Browser.Firefox }),
+});
+
+await crawler.run(['https://httpbin.org/']);
+
+await Actor.exit({ exit: Actor.isAtHome() });
diff --git a/test/e2e/cheerio-impit-ts/actor/package.json b/test/e2e/cheerio-impit-ts/actor/package.json
new file mode 100644
index 000000000000..03ccac5e739f
--- /dev/null
+++ b/test/e2e/cheerio-impit-ts/actor/package.json
@@ -0,0 +1,36 @@
+{
+    "name": "test-cheerio-impit-ts",
+    "version": "0.0.1",
+    "description": "Cheerio Crawler Test - Impit HTTP client",
+    "dependencies": {
+        "apify": "next",
+        "@apify/storage-local": "^2.1.3",
+        "@crawlee/basic": "file:./packages/basic-crawler",
+        "@crawlee/browser-pool": "file:./packages/browser-pool",
+        "@crawlee/http": "file:./packages/http-crawler",
+        "@crawlee/cheerio": "file:./packages/cheerio-crawler",
+        "@crawlee/core": "file:./packages/core",
+        "@crawlee/memory-storage": "file:./packages/memory-storage",
+        "@crawlee/types": "file:./packages/types",
+        "@crawlee/utils": "file:./packages/utils",
+        "@crawlee/impit-client": "file:./packages/impit-client"
+    },
+    "overrides": {
+        "apify": {
+            "@crawlee/core": "file:./packages/core",
+            "@crawlee/types": "file:./packages/types",
+            "@crawlee/utils": "file:./packages/utils"
+        }
+    },
+    "devDependencies": {
+        "@apify/tsconfig": "^0.1.0",
+        "typescript": "^5.0.0"
+    },
+    "scripts": {
+        "start": "tsc && node main.js",
+        "start:prod": "node main.js",
+        "build": "tsc"
+    },
+    "type": "module",
+    "license": "ISC"
+}
diff --git a/test/e2e/cheerio-impit-ts/actor/tsconfig.json b/test/e2e/cheerio-impit-ts/actor/tsconfig.json
new file mode 100644
index 000000000000..f6f2e9d778a5
--- /dev/null
+++ b/test/e2e/cheerio-impit-ts/actor/tsconfig.json
@@ -0,0 +1,11 @@
+{
+    "extends": "@apify/tsconfig",
+    "compilerOptions": {
+        "module": "ES2022",
+        "target": "ES2022",
+        "lib": ["DOM"],
+        "skipLibCheck": true,
+        "incremental": false
+    },
+    "include": ["./**/*.ts"]
+}
diff --git a/test/e2e/cheerio-impit-ts/test.mjs b/test/e2e/cheerio-impit-ts/test.mjs
new file mode 100644
index 000000000000..23256793dcc2
--- /dev/null
+++ b/test/e2e/cheerio-impit-ts/test.mjs
@@ -0,0 +1,20 @@
+import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs';
+
+const testActorDirname = getActorTestDir(import.meta.url);
+await initialize(testActorDirname);
+
+const { stats, datasetItems } = await runActor(testActorDirname);
+
+await expect(stats.requestsFinished > 0, 'All requests finished');
+await expect(datasetItems.length === 1, 'A dataset item was pushed');
+
+const result = datasetItems[0];
+
+// Await each assertion, consistent with the checks above.
+await expect(result.body.length > 1000, 'HTML response is not empty');
+await expect(result.title === 'httpbin.org', 'HTML title is correct');
+// Print the reported user agent so a failed match below is easier to diagnose.
+console.log(result.userAgent);
+await expect(/Gecko\/\d{8} Firefox\/\d{2}/.test(result.userAgent), 'Impit correctly spoofs Firefox');
+await expect(result.uuidJsonResponse.uuid !== undefined, 'JSON response contains UUID');
+await expect(JSON.parse(result.uuidTextResponse).uuid !== undefined, 'Text response contains UUID');