From 5751e88487078ea89bf613f33c98f9b709e75d50 Mon Sep 17 00:00:00 2001 From: anish-work Date: Wed, 13 Nov 2024 20:50:25 +0530 Subject: [PATCH] use hpagent package --- package-lock.json | 48 +++++++++---------------------------------- package.json | 2 +- src/proxyConfig.js | 51 ++++++++++------------------------------------ 3 files changed, 22 insertions(+), 79 deletions(-) diff --git a/package-lock.json b/package-lock.json index 5afbc36..aef552c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,11 +9,11 @@ "version": "1.0.0", "license": "ISC", "dependencies": { - "axios": "^1.7.7", "cors": "^2.8.5", "dotenv": "^8.6.0", "express": "^4.19.2", "got": "^11.8.0", + "hpagent": "^1.2.0", "metascraper": "^5.14.18", "metascraper-description": "^5.14.18", "metascraper-image": "^5.14.18", @@ -315,17 +315,6 @@ "node": ">=0.10.0" } }, - "node_modules/axios": { - "version": "1.7.7", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.7.7.tgz", - "integrity": "sha512-S4kL7XrjgBmvdGut0sN3yJxqYzrDOnivkBiN0OFs6hLiUam3UPvswUo0kqGyhqUZGEOytHyumEdXsAkgCOUf3Q==", - "license": "MIT", - "dependencies": { - "follow-redirects": "^1.15.6", - "form-data": "^4.0.0", - "proxy-from-env": "^1.1.0" - } - }, "node_modules/balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", @@ -1047,26 +1036,6 @@ "node": ">= 0.8" } }, - "node_modules/follow-redirects": { - "version": "1.15.9", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.9.tgz", - "integrity": "sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ==", - "funding": [ - { - "type": "individual", - "url": "https://github.com/sponsors/RubenVerborgh" - } - ], - "license": "MIT", - "engines": { - "node": ">=4.0" - }, - "peerDependenciesMeta": { - "debug": { - "optional": true - } - } - }, "node_modules/foreground-child": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.1.1.tgz", @@ -1278,6 +1247,15 @@ "node": ">= 0.4" } }, + "node_modules/hpagent": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/hpagent/-/hpagent-1.2.0.tgz", + "integrity": "sha512-A91dYTeIB6NoXG+PxTQpCCDDnfHsW9kc06Lvpu1TEe9gnd6ZFeiBoRO9JvzEv6xK7EX97/dUE8g/vBMTqTS3CA==", + "license": "MIT", + "engines": { + "node": ">=14" + } + }, "node_modules/html-encoding-sniffer": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz", @@ -2243,12 +2221,6 @@ "node": ">= 0.10" } }, - "node_modules/proxy-from-env": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", - "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", - "license": "MIT" - }, "node_modules/psl": { "version": "1.9.0", "resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz", diff --git a/package.json b/package.json index a37459a..8c94bc7 100644 --- a/package.json +++ b/package.json @@ -9,11 +9,11 @@ "author": "", "license": "ISC", "dependencies": { - "axios": "^1.7.7", "cors": "^2.8.5", "dotenv": "^8.6.0", "express": "^4.19.2", "got": "^11.8.0", + "hpagent": "^1.2.0", "metascraper": "^5.14.18", "metascraper-description": "^5.14.18", "metascraper-image": "^5.14.18", diff --git a/src/proxyConfig.js b/src/proxyConfig.js index 1f58dfb..45d667d 100644 --- a/src/proxyConfig.js +++ b/src/proxyConfig.js @@ -1,8 +1,6 @@ // proxyConfig.js const path = require("path"); -const fs = require("fs"); -const https = require("https"); -const got = require("got"); +const { HttpsProxyAgent, HttpProxyAgent } = require("hpagent"); // Fake user agents array - you can expand this list const FAKE_USER_AGENTS = [ @@ -34,53 +32,26 @@ function getProxyUrl(scheme) { return `${scheme}://${config.SCRAPING_PROXY_USERNAME}:${config.SCRAPING_PROXY_PASSWORD}@${config.SCRAPING_PROXY_HOST}`; } -// Get proxy configuration -const SCRAPING_PROXIES = config.SCRAPING_PROXY_HOST - ? { - http: getProxyUrl("http"), - https: getProxyUrl("https"), - } - : {}; - -// Function to get proxy certificate -async function getScrapingProxyCertPath() { - if (!config.SCRAPING_PROXY_CERT_URL) { - return null; - } - - const certPath = path.join(config.BASE_DIR, "proxy_ca_crt.pem"); - - if (!fs.existsSync(certPath)) { - console.log(`Downloading proxy cert to ${certPath}`); - const response = await got(config.SCRAPING_PROXY_CERT_URL, { - responseType: "arraybuffer", - }); - fs.writeFileSync(certPath, response.data); - } - - return certPath; -} - // Main function to get axios config for scraping async function getScrapingConfig() { - const certPath = await getScrapingProxyCertPath(); - - const httpsAgent = new https.Agent({ - ca: certPath ? fs.readFileSync(certPath) : undefined, - }); - return { headers: { "User-Agent": FAKE_USER_AGENTS[Math.floor(Math.random() * FAKE_USER_AGENTS.length)], }, - proxy: SCRAPING_PROXIES, - httpsAgent, + agent: { + https: new HttpsProxyAgent({ + https: getProxyUrl("https"), + }), + // http: + // SCRAPING_PROXIES.http && + // new HttpProxyAgent({ + // http: getProxyUrl("http"), + // }), + }, }; } module.exports = { getScrapingConfig, - FAKE_USER_AGENTS, - SCRAPING_PROXIES, };