Skip to content

Commit

Permalink
add proxy
Browse files Browse the repository at this point in the history
  • Loading branch information
anish-work committed Nov 13, 2024
1 parent 989c002 commit d8a8ba1
Show file tree
Hide file tree
Showing 4 changed files with 232 additions and 70 deletions.
84 changes: 80 additions & 4 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@
"author": "",
"license": "ISC",
"dependencies": {
"axios": "^1.7.7",
"cors": "^2.8.5",
"dotenv": "^8.2.0",
"dotenv": "^8.6.0",
"express": "^4.19.2",
"got": "^11.8.0",
"metascraper": "^5.14.18",
"metascraper-description": "^5.14.18",
"metascraper-image": "^5.14.18",
"metascraper-logo-favicon": "^5.45.9",
"metascraper-title": "^5.14.18",
"path": "^0.12.7",
"twitter-text": "^3.1.0"
}
}
135 changes: 70 additions & 65 deletions src/index.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
// Metascraper instance built from the rule bundles this service uses:
// title, description, image and favicon-based logo. Each bundle is
// registered exactly once.
const metascraper = require("metascraper")([
  require("metascraper-title")(),
  require("metascraper-description")(),
  require("metascraper-image")(),
  require("metascraper-logo-favicon")(),
]);

const express = require("express");
var cors = require("cors");
const { getScrapingConfig } = require("./proxyConfig");

const app = express();
const port = process.env.PORT || 8090;
Expand All @@ -17,81 +18,85 @@ const twitter = require("twitter-text");
require("dotenv").config();

// Upper bound, in milliseconds, applied to each outbound page fetch.
// Configured in seconds through the REQUEST_TIMEOUT_SEC environment variable;
// a missing, non-numeric or non-positive value falls back to the default
// instead of turning into a NaN timeout.
const DEFAULT_REQUEST_TIMEOUT_SEC = 40;
const configuredTimeoutSec = Number.parseInt(
  process.env.REQUEST_TIMEOUT_SEC,
  10
);
const REQUEST_TIMEOUT_MS =
  (configuredTimeoutSec > 0
    ? configuredTimeoutSec
    : DEFAULT_REQUEST_TIMEOUT_SEC) * 1000;

// Start the HTTP server and log the port once it is listening.
app.listen(port, () => {
  console.log(`Server started on port ${port}`);
});

// Register the CORS middleware for the routes defined below.
app.use(cors());

// GET /fetchUrlMeta?url=<page url>
// Responds with the metadata object produced by dispatch() for the given URL,
// or with {} when nothing could be fetched.
app.get("/fetchUrlMeta", async (req, res) => {
  const { url } = req.query;
  // The query parser may yield a string, an array (repeated ?url=) or an
  // object (?url[a]=b). Only non-empty strings are forwarded to the fetcher.
  const urls = [url]
    .flat()
    .filter((candidate) => typeof candidate === "string" && candidate !== "");

  try {
    const response = await dispatch({ data: urls, cmd: "fetchMetadata" });
    res.json(response);
  } catch (e) {
    // Express 4 does not catch rejections from async handlers, so answer
    // explicitly rather than leaving the request hanging.
    console.log("!", url, e);
    res.status(500).json({});
  }
});

/**
 * Run one of the supported commands.
 *
 * - "extractUrls": returns whatever twitter.extractUrls(data) returns.
 * - "fetchMetadata": walks `data` in order and returns the metadata of the
 *   first entry that fetchMetadata() resolves for, with its `url` property
 *   set to that entry. Entries that throw are logged and skipped.
 *
 * Any other command, or a "fetchMetadata" run in which no entry succeeds,
 * resolves to an empty object.
 *
 * @param {{ cmd: string, data: * }} request - command name and its payload
 * @returns {Promise<*>} the command result, or {} as described above
 */
async function dispatch({ cmd, data }) {
  if (cmd === "extractUrls") {
    return twitter.extractUrls(data);
  }

  if (cmd === "fetchMetadata") {
    for (const candidate of data) {
      try {
        const meta = await fetchMetadata(candidate);
        meta.url = candidate;
        return meta;
      } catch (err) {
        // Best effort: report the failure and move on to the next entry.
        console.log("!", candidate, err);
      }
    }
  }

  return {};
}

/**
 * Pick the domain handed to the favicon lookup for a given hostname.
 *
 * Keeps the last two labels of the hostname (e.g. "www.example.com" ->
 * "example.com"). Hostnames containing "googleapis" keep three labels so the
 * subdomain is preserved (e.g. "storage.googleapis.com"). Hostnames with
 * fewer than two labels are returned unchanged.
 *
 * Note: this is a plain label count, so multi-part suffixes such as "co.uk"
 * are not recognised.
 *
 * @param {string} hostname
 * @returns {string}
 */
function faviconDomain(hostname) {
  const labels = hostname.split(".");
  if (labels.length < 2) return hostname;
  const keep = hostname.includes("googleapis") ? 3 : 2;
  return labels.slice(-keep).join(".");
}

/**
 * Fetch a page and extract its metadata.
 *
 * The page is requested with got using the configured request timeout, no
 * retries, and whatever options getScrapingConfig() resolves to (spread last,
 * so they can override the defaults above).
 *
 * @param {string} targetUrl - address of the page to inspect
 * @returns {Promise<object>} an object with `redirect_urls`, `url`, `logo`
 *   (a Google favicon service URL) and `content_type`; for "text/html"
 *   responses the fields produced by metascraper are merged on top.
 * @throws whatever got, getScrapingConfig, metascraper or the URL constructor
 *   throw; callers are expected to handle the rejection.
 */
async function fetchMetadata(targetUrl) {
  const proxyConfig = await getScrapingConfig();

  const {
    body: html,
    url,
    headers,
    redirectUrls = [],
  } = await got(targetUrl, {
    timeout: {
      request: REQUEST_TIMEOUT_MS,
    },
    retry: {
      limit: 0,
    },
    ...proxyConfig, // Add proxy configuration here
  });

  const contentType = headers?.["content-type"];

  // Derive the favicon domain from where the request finally landed: the last
  // redirect target when there was one, the requested URL otherwise.
  const finalUrl = redirectUrls.length
    ? redirectUrls[redirectUrls.length - 1]
    : targetUrl;
  const hostname = faviconDomain(new URL(finalUrl).hostname);

  const preMeta = {
    redirect_urls: redirectUrls,
    url: targetUrl,
    logo: `https://www.google.com/s2/favicons?sz=128&domain=${hostname}`,
    content_type: contentType,
  };

  // Only HTML documents are scraped. A response without a content-type header
  // is treated as non-HTML instead of throwing on `undefined.includes`.
  if (!contentType?.includes("text/html")) return preMeta;

  const metaData = await metascraper({ html, url });
  return {
    ...preMeta,
    ...metaData,
  };
}
Loading

0 comments on commit d8a8ba1

Please sign in to comment.