diff --git a/package-lock.json b/package-lock.json index 67f7594..5afbc36 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,8 +9,9 @@ "version": "1.0.0", "license": "ISC", "dependencies": { + "axios": "^1.7.7", "cors": "^2.8.5", - "dotenv": "^8.2.0", + "dotenv": "^8.6.0", "express": "^4.19.2", "got": "^11.8.0", "metascraper": "^5.14.18", @@ -18,6 +19,7 @@ "metascraper-image": "^5.14.18", "metascraper-logo-favicon": "^5.45.9", "metascraper-title": "^5.14.18", + "path": "^0.12.7", "twitter-text": "^3.1.0" } }, @@ -313,6 +315,17 @@ "node": ">=0.10.0" } }, + "node_modules/axios": { + "version": "1.7.7", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.7.7.tgz", + "integrity": "sha512-S4kL7XrjgBmvdGut0sN3yJxqYzrDOnivkBiN0OFs6hLiUam3UPvswUo0kqGyhqUZGEOytHyumEdXsAkgCOUf3Q==", + "license": "MIT", + "dependencies": { + "follow-redirects": "^1.15.6", + "form-data": "^4.0.0", + "proxy-from-env": "^1.1.0" + } + }, "node_modules/balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", @@ -839,6 +852,7 @@ "version": "8.6.0", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-8.6.0.tgz", "integrity": "sha512-IrPdXQsk2BbzvCBGBOTmmSH5SodmqZNt4ERAZDmW4CT+tL8VtvinqywuANaFu4bOMWki16nqf0e4oC0QIaDr/g==", + "license": "BSD-2-Clause", "engines": { "node": ">=10" } @@ -1033,6 +1047,26 @@ "node": ">= 0.8" } }, + "node_modules/follow-redirects": { + "version": "1.15.9", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.9.tgz", + "integrity": "sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ==", + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/RubenVerborgh" + } + ], + "license": "MIT", + "engines": { + "node": ">=4.0" + }, + "peerDependenciesMeta": { + "debug": { + "optional": true + } + } + }, "node_modules/foreground-child": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.1.1.tgz", @@ -1297,6 +1331,7 @@ "version": "7.0.2", "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==", + "license": "MIT", "dependencies": { "agent-base": "^7.1.0", "debug": "^4.3.4" @@ -1339,9 +1374,10 @@ } }, "node_modules/https-proxy-agent": { - "version": "7.0.4", - "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.4.tgz", - "integrity": "sha512-wlwpilI7YdjSkWaQ/7omYBMTliDcmCN8OLihO6I9B86g06lMyAoqgoDpV0XqoaPOKj+0DIdAvnsWfyAAhmimcg==", + "version": "7.0.5", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.5.tgz", + "integrity": "sha512-1e4Wqeblerz+tMKPIq2EMGiiWW1dIjZOksyHWSUm1rmuvw/how9hBHZ38lAGj5ID4Ik6EdkOw7NmWPy6LAwalw==", + "license": "MIT", "dependencies": { "agent-base": "^7.0.2", "debug": "4" @@ -2128,6 +2164,16 @@ "node": ">= 0.8" } }, + "node_modules/path": { + "version": "0.12.7", + "resolved": "https://registry.npmjs.org/path/-/path-0.12.7.tgz", + "integrity": "sha512-aXXC6s+1w7otVF9UletFkFcDsJeO7lSZBPUQhtb5O0xJe8LtYhj/GxldoL09bBj9+ZmE2hNoHqQSFMN5fikh4Q==", + "license": "MIT", + "dependencies": { + "process": "^0.11.1", + "util": "^0.10.3" + } + }, "node_modules/path-key": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", @@ -2164,6 +2210,15 @@ "node": "^14.17.0 || ^16.13.0 || >=18.0.0" } }, + "node_modules/process": { + "version": "0.11.10", + "resolved": "https://registry.npmjs.org/process/-/process-0.11.10.tgz", + "integrity": "sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==", + "license": "MIT", + "engines": { + "node": ">= 0.6.0" + } + }, "node_modules/promise-retry": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/promise-retry/-/promise-retry-2.0.1.tgz", @@ -2188,6 +2243,12 @@ "node": ">= 0.10" } }, + "node_modules/proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", + "license": "MIT" + }, "node_modules/psl": { "version": "1.9.0", "resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz", @@ -2888,6 +2949,21 @@ } } }, + "node_modules/util": { + "version": "0.10.4", + "resolved": "https://registry.npmjs.org/util/-/util-0.10.4.tgz", + "integrity": "sha512-0Pm9hTQ3se5ll1XihRic3FDIku70C+iHUdT/W926rSgHV5QgXsYbKZN8MSC3tJtSkhuROzvsQjAaFENRXr+19A==", + "license": "MIT", + "dependencies": { + "inherits": "2.0.3" + } + }, + "node_modules/util/node_modules/inherits": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", + "integrity": "sha512-x00IRNXNy63jwGkJmzPigoySHbaqpNuzKbBOmzK+g2OdZpQ9w+sxCN+VSB3ja7IAge2OP2qpfxTjeNcyjmW1uw==", + "license": "ISC" + }, "node_modules/utils-merge": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", diff --git a/package.json b/package.json index 91a3741..a37459a 100644 --- a/package.json +++ b/package.json @@ -9,8 +9,9 @@ "author": "", "license": "ISC", "dependencies": { + "axios": "^1.7.7", "cors": "^2.8.5", - "dotenv": "^8.2.0", + "dotenv": "^8.6.0", "express": "^4.19.2", "got": "^11.8.0", "metascraper": "^5.14.18", @@ -18,6 +19,7 @@ "metascraper-image": "^5.14.18", "metascraper-logo-favicon": "^5.45.9", "metascraper-title": "^5.14.18", + "path": "^0.12.7", "twitter-text": "^3.1.0" } } diff --git a/src/index.js b/src/index.js index b8d0e37..429c5fd 100644 --- a/src/index.js +++ b/src/index.js @@ -1,12 +1,13 @@ const metascraper = require("metascraper")([ - require("metascraper-title")(), - require("metascraper-description")(), - require("metascraper-image")(), - require("metascraper-logo-favicon")(), + require("metascraper-title")(), + require("metascraper-description")(), + require("metascraper-image")(), + require("metascraper-logo-favicon")(), ]); const express = require("express"); var cors = require("cors"); +const { getScrapingConfig } = require("./proxyConfig"); const app = express(); const port = process.env.PORT || 8090; @@ -17,81 +18,85 @@ const twitter = require("twitter-text"); require("dotenv").config(); const REQUEST_TIMEOUT_MS = - parseInt(process.env.REQUEST_TIMEOUT_SEC || 40) * 1000; + parseInt(process.env.REQUEST_TIMEOUT_SEC || 40) * 1000; app.listen(port, () => { - console.log(`Server started on port ${port}`); + console.log(`Server started on port ${port}`); }); app.use(cors()); app.get("/fetchUrlMeta", (req, res) => { - const { url } = req.query; - dispatch({ data: [url], cmd: "fetchMetadata" }).then((response) => { - res.json(response); - }); + const { url } = req.query; + dispatch({ data: [url], cmd: "fetchMetadata" }).then((response) => { + res.json(response); + }); }); async function dispatch({ cmd, data }) { - switch (cmd) { - case "extractUrls": - return twitter.extractUrls(data); - case "fetchMetadata": - let url; - for (url of data) { - try { - let metadata = await fetchMetadata(url); - metadata.url = url; - return metadata; - } catch (e) { - console.log("!", url, e); - } - } - break; - } - return {}; + switch (cmd) { + case "extractUrls": + return twitter.extractUrls(data); + case "fetchMetadata": + let url; + for (url of data) { + try { + let metadata = await fetchMetadata(url); + metadata.url = url; + return metadata; + } catch (e) { + console.log("!", url, e); + } + } + break; + } + return {}; } async function fetchMetadata(targetUrl) { - const { - body: html, - url, - headers, - redirectUrls = [], - } = await got(targetUrl, { - timeout: { - request: REQUEST_TIMEOUT_MS, - }, - retry: { - limit: 0, - }, - }); - const contentType = headers?.["content-type"]; - let hostname = new URL( - redirectUrls.length ? [...redirectUrls].pop() : targetUrl - ).hostname; - const hostnameParts = hostname.split("."); + const proxyConfig = await getScrapingConfig(); - if (hostnameParts.length >= 2) { - const mainDomain = hostnameParts.slice(-2, -1)[0]; // gooey, google, facebook etc - const ext = hostnameParts.slice(-1)[0]; // .ai, .com, .org, .net, etc - if (hostname.includes("googleapis")) - // for favicon logo from googleapis include subdomain - hostname = hostnameParts.slice(-3, -1).join("."); // storage.googleapis.com etc - hostname = mainDomain + "." + ext; - } + const { + body: html, + url, + headers, + redirectUrls = [], + } = await got(targetUrl, { + timeout: { + request: REQUEST_TIMEOUT_MS, + }, + retry: { + limit: 0, + }, + ...proxyConfig, // Add proxy configuration here + }); - const preMeta = { - redirect_urls: redirectUrls, - url: targetUrl, - logo: `https://www.google.com/s2/favicons?sz=128&domain=${hostname}`, - content_type: contentType, - }; + const contentType = headers?.["content-type"]; + let hostname = new URL( + redirectUrls.length ? [...redirectUrls].pop() : targetUrl + ).hostname; + const hostnameParts = hostname.split("."); - if (!contentType.includes("text/html")) return preMeta; - const metaData = await metascraper({ html, url }); - return { - ...preMeta, - ...metaData, - }; + if (hostnameParts.length >= 2) { + const mainDomain = hostnameParts.slice(-2, -1)[0]; // gooey, google, facebook etc + const ext = hostnameParts.slice(-1)[0]; // .ai, .com, .org, .net, etc + if (hostname.includes("googleapis")) + // for favicon logo from googleapis include subdomain + hostname = hostnameParts.slice(-3, -1).join("."); // storage.googleapis.com etc + hostname = mainDomain + "." + ext; + } + + const preMeta = { + redirect_urls: redirectUrls, + url: targetUrl, + logo: `https://www.google.com/s2/favicons?sz=128&domain=${hostname}`, + content_type: contentType, + }; + + if (!contentType.includes("text/html")) return preMeta; + const metaData = await metascraper({ html, url }); + return { + ...preMeta, + ...metaData, + }; } diff --git a/src/proxyConfig.js b/src/proxyConfig.js new file mode 100644 index 0000000..12b321c --- /dev/null +++ b/src/proxyConfig.js @@ -0,0 +1,79 @@ +// proxyConfig.js +const path = require("path"); +const fs = require("fs"); +const axios = require("axios"); +const https = require("https"); + +// Fake user agents array - you can expand this list +const FAKE_USER_AGENTS = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0", +]; + +// Environment variables (you'll need to set these in your .env file) +const config = { + SCRAPING_PROXY_HOST: process.env.SCRAPING_PROXY_HOST || "", + SCRAPING_PROXY_USERNAME: process.env.SCRAPING_PROXY_USERNAME || "", + SCRAPING_PROXY_PASSWORD: process.env.SCRAPING_PROXY_PASSWORD || "", + SCRAPING_PROXY_CERT_URL: process.env.SCRAPING_PROXY_CERT_URL || "", + BASE_DIR: process.env.BASE_DIR || path.join(__dirname, ""), +}; + +// Build proxy URL +function getProxyUrl(scheme) { + if (!config.SCRAPING_PROXY_HOST) return ""; + + return `http://${config.SCRAPING_PROXY_USERNAME}:${config.SCRAPING_PROXY_PASSWORD}@${config.SCRAPING_PROXY_HOST}`; +} + +// Get proxy configuration +const SCRAPING_PROXIES = config.SCRAPING_PROXY_HOST + ? { + http: getProxyUrl("http"), + https: getProxyUrl("https"), + } + : {}; + +// Function to get proxy certificate +async function getScrapingProxyCertPath() { + if (!config.SCRAPING_PROXY_CERT_URL) { + return null; + } + + const certPath = path.join(config.BASE_DIR, "proxy_ca_crt.pem"); + + if (!fs.existsSync(certPath)) { + console.log(`Downloading proxy cert to ${certPath}`); + const response = await axios.get(config.SCRAPING_PROXY_CERT_URL, { + responseType: "arraybuffer", + }); + fs.writeFileSync(certPath, response.data); + } + + return certPath; +} + +// Main function to get axios config for scraping +async function getScrapingConfig() { + const certPath = await getScrapingProxyCertPath(); + + const httpsAgent = new https.Agent({ + ca: certPath ? fs.readFileSync(certPath) : undefined, + }); + + return { + headers: { + "User-Agent": + FAKE_USER_AGENTS[Math.floor(Math.random() * FAKE_USER_AGENTS.length)], + }, + proxy: SCRAPING_PROXIES, + httpsAgent, + }; +} + +module.exports = { + getScrapingConfig, + FAKE_USER_AGENTS, + SCRAPING_PROXIES, +};