Skip to content

Commit

Permalink
fix(sitemap):timeout to 10 sec
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreiAlexandruParaschiv committed Nov 13, 2024
1 parent 908b311 commit 45bc8f4
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 38 deletions.
42 changes: 9 additions & 33 deletions src/sitemap/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -110,18 +110,13 @@ export async function checkRobotsForSitemap(protocol, domain, log) {
* Checks if the sitemap content is valid.
*
* @param {{ payload: string, type: string }} sitemapContent - The sitemap content to validate.
* @param log
* @returns {boolean} - True if the sitemap content is valid, otherwise false.
*/
export function isSitemapContentValid(sitemapContent, log) {
export function isSitemapContentValid(sitemapContent) {
const validStarts = ['<?xml', '<urlset', '<sitemapindex'];
const isValid = validStarts.some((start) => sitemapContent.payload.trim().startsWith(start))
|| VALID_MIME_TYPES.some((type) => sitemapContent.type.includes(type));

// Log the validation result if `log` is provided
log?.info?.(`Sitemap content validation result: ${isValid}`);

return isValid;
return validStarts.some((start) => sitemapContent.payload.trim()
.startsWith(start))
|| VALID_MIME_TYPES.some((type) => sitemapContent.type.includes(type));
}

/**
Expand All @@ -140,10 +135,8 @@ export function isSitemapContentValid(sitemapContent, log) {
*/
export async function checkSitemap(sitemapUrl, log) {
try {
log.info(`Fetching sitemap from: ${sitemapUrl}`);
const sitemapContent = await fetchContent(sitemapUrl, log);
const isValidFormat = isSitemapContentValid(sitemapContent, log);
log.info(`Sitemap format valid: ${isValidFormat}`);
const isValidFormat = isSitemapContentValid(sitemapContent);
const isSitemapIndex = isValidFormat && sitemapContent.payload.includes('</sitemapindex>');
const isText = isValidFormat && sitemapContent.type === 'text/plain';

Expand All @@ -159,8 +152,6 @@ export async function checkSitemap(sitemapUrl, log) {
details: { sitemapContent, isText, isSitemapIndex },
};
} catch (error) {
log.error(`Error in checkSitemap for ${sitemapUrl}: ${error.message}`);
log.info(`Error stack: ${error.stack}`);
if (error.message.includes('404')) {
return {
existsAndIsValid: false,
Expand All @@ -186,7 +177,7 @@ export async function checkSitemap(sitemapUrl, log) {
async function filterValidUrls(urls, log) {
const OK = 1;
const NOT_OK = 2;
const TIMEOUT = 10000; // 5sec timeout
const TIMEOUT = 10000; // 10sec timeout

const fetchWithTimeout = async (url, timeout) => {
const controller = new AbortController();
Expand All @@ -195,18 +186,15 @@ async function filterValidUrls(urls, log) {

try {
const response = await fetch(url, {
method: 'GET',
method: 'HEAD',
signal,
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
},
redirect: 'follow',
});
clearTimeout(id);
return response;
} catch (error) {
if (error instanceof AbortError) {
log.warn(`Request to ${url} timed out after ${timeout}ms`);
log.info(`Request to ${url} timed out after ${timeout}ms`);
return { status: 408 };
}
} finally {
Expand Down Expand Up @@ -259,16 +247,13 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls, log) {

// Prepare all promises for checking each sitemap URL.
const checkPromises = urls.map(async (url) => {
log.info(`Checking sitemap: ${url}`);
const urlData = await checkSitemap(url, log);
contentsCache[url] = urlData;
log.info(`Sitemap check result for ${url}: ${JSON.stringify(urlData)}`);
return { url, urlData };
});

// Execute all checks concurrently.
const results = await Promise.all(checkPromises);
log.info('[STEP] All sitemap checks completed');
const matchingUrls = [];

// Process each result.
Expand Down Expand Up @@ -298,7 +283,6 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls, log) {
const pages = getBaseUrlPagesFromSitemapContents(
baseUrl,
contentsCache[matchingUrl].details,
log,
);

if (pages.length > 0) {
Expand Down Expand Up @@ -338,7 +322,6 @@ export async function findSitemap(inputUrl, log) {
let sitemapUrls = { ok: [], notOk: [] };
try {
const robotsResult = await checkRobotsForSitemap(protocol, domain, log);
log.info('[STEP] Robots.txt check completed');
if (robotsResult && robotsResult.paths && robotsResult.paths.length) {
sitemapUrls.ok = robotsResult.paths;
}
Expand Down Expand Up @@ -367,9 +350,7 @@ export async function findSitemap(inputUrl, log) {
const filteredSitemapUrls = sitemapUrls.ok.filter(
(path) => path.startsWith(inputUrl) || path.startsWith(inputUrlToggledWww),
);
log.info('[STEP] Getting base URL pages from sitemaps');
const extractedPaths = await getBaseUrlPagesFromSitemaps(inputUrl, filteredSitemapUrls, log);
log.info('[STEP] Got base URL pages from sitemaps');
const notOkPagesFromSitemap = {};

if (extractedPaths && Object.keys(extractedPaths).length > 0) {
Expand Down Expand Up @@ -427,13 +408,8 @@ export async function sitemapAuditRunner(baseURL, context) {
const { log } = context;

try {
log.info(`[START] sitemapAuditRunner for ${baseURL}`);

const startTime = process.hrtime();
log.info(`[STEP] Calling findSitemap for ${baseURL}`);
const auditResult = await findSitemap(baseURL, log);
log.info(`[STEP] findSitemap completed for ${baseURL}`);

const endTime = process.hrtime(startTime);
const elapsedSeconds = endTime[0] + endTime[1] / 1e9;
const formattedElapsed = elapsedSeconds.toFixed(2);
Expand All @@ -448,7 +424,7 @@ export async function sitemapAuditRunner(baseURL, context) {
} catch (error) {
log.error(`[ERROR] in sitemapAuditRunner for ${baseURL}: ${error.message}`);
log.error(`[ERROR] Stack trace: ${error.stack}`);
throw error; // Re-throw to let the caller handle it
throw error;
}
}

Expand Down
6 changes: 1 addition & 5 deletions src/support/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -104,16 +104,14 @@ export function extractUrlsFromSitemap(content, tagName = 'url') {
*
* @param {string} baseUrl - The base URL to match against the URLs in the sitemap.
* @param {Object} sitemapDetails - An object containing details about the sitemap.
* @param log
* @param {boolean} sitemapDetails.isText - A flag indicating if the sitemap content is plain text.
* @param {Object} sitemapDetails.sitemapContent - The sitemap content object.
* @param {string} sitemapDetails.sitemapContent.payload - The actual content of the sitemap.
*
* @returns {string[]} URLs from the sitemap that start with the base URL or its www variant.
*/
export function getBaseUrlPagesFromSitemapContents(baseUrl, sitemapDetails, log) {
export function getBaseUrlPagesFromSitemapContents(baseUrl, sitemapDetails) {
if (!baseUrl || !sitemapDetails) {
log.info('Invalid input: baseUrl or sitemapDetails is undefined');
return [];
}

Expand All @@ -125,15 +123,13 @@ export function getBaseUrlPagesFromSitemapContents(baseUrl, sitemapDetails, log)

if (sitemapDetails.isText) {
if (!sitemapDetails.sitemapContent || !sitemapDetails.sitemapContent.payload) {
log.info('Invalid sitemap content: payload is undefined');
return [];
}
const lines = sitemapDetails.sitemapContent.payload.split('\n').map((line) => line.trim());

return filterPages(lines.filter((line) => line.length > 0));
} else {
if (!sitemapDetails.sitemapContent) {
log.info('Invalid sitemap content: sitemapContent is undefined');
return [];
}
const sitemapPages = extractUrlsFromSitemap(sitemapDetails.sitemapContent);
Expand Down

0 comments on commit 45bc8f4

Please sign in to comment.