From 39e4242dc736f4e8ded7b14afe2a01d53c95ae6f Mon Sep 17 00:00:00 2001
From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com>
Date: Thu, 5 Dec 2024 12:41:58 +0000
Subject: [PATCH] Allow retry of failed S3 sync.

Treat a snapshot as complete only when index.html exists and the httrack
lock file is gone. In main(), skip the httrack run when the snapshot is
already complete and go straight to the clean-up and S3 sync steps.

getHttrackProgress() is async, so main() must await it before reading
`complete`; otherwise `complete` is always undefined and httrack is
never skipped.
---
 conf/node/controllers/httrack.js | 17 +++++++++++------
 conf/node/controllers/main.js    | 30 ++++++++++++++++++------------
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/conf/node/controllers/httrack.js b/conf/node/controllers/httrack.js
index 0a573c1..ef524bb 100644
--- a/conf/node/controllers/httrack.js
+++ b/conf/node/controllers/httrack.js
@@ -141,6 +141,7 @@ export const getHttrackProgress = async (dest) => {
   }
 
   const files = {
+    index: `${dest}/index.html`,
     log: `${dest}/hts-cache/new.txt`,
     lock: `${dest}/hts-in_progress.lock`,
   };
@@ -148,7 +149,7 @@ export const getHttrackProgress = async (dest) => {
   const response = {
     requestCount: 0,
     rate: 0,
-    complete: !fs.existsSync(files.lock),
+    complete: fs.existsSync(files.index) && !fs.existsSync(files.lock),
   };
 
   if (!fs.existsSync(files.log)) {
@@ -167,7 +168,11 @@ export const getHttrackProgress = async (dest) => {
   // The first line is a header, so we subtract 1.
   response.requestCount = lineCount - 1;
 
-  // If the time is 23:59:55 - 00:00:05, await 5 seconds.
+  // If the current time is between 00:00:00 and 00:00:05, wait for 5 seconds.
+  // This prevents the recent requests in `/hts-cache/new.txt` from spanning 2 days.
+  if (new Date().toTimeString().split(" ")[0] < "00:00:05") {
+    await new Promise((resolve) => setTimeout(resolve, 5000));
+  }
 
   // Get the last 20 lines from the file. Or all of them if there are less than 20.
   const lastLines = execSync(
@@ -229,10 +234,10 @@ export const waitForHttrackComplete = async (
     fs.existsSync(`${dest}/hts-in_progress.lock`)
   ) {
     if (iterations < 10 || iterations % logFrequency === 0) {
-      const elapsedTime = new Date(iterations * intervalSeconds * 1000).toISOString().substring(11, 19);
-      console.log(
-        `Waiting for httrack to complete ... ${elapsedTime} elapsed`,
-      );
+      const elapsedTime = new Date(iterations * intervalSeconds * 1000)
+        .toISOString()
+        .substring(11, 19);
+      console.log(`Waiting for httrack to complete ... ${elapsedTime} elapsed`);
     }
     await new Promise((resolve) => setTimeout(resolve, intervalSeconds * 1000));
   }
diff --git a/conf/node/controllers/main.js b/conf/node/controllers/main.js
index bbe1f3d..dded6da 100644
--- a/conf/node/controllers/main.js
+++ b/conf/node/controllers/main.js
@@ -5,6 +5,7 @@ import {
   getSnapshotPaths,
   getHttrackArgs,
   runHttrack,
+  getHttrackProgress,
   waitForHttrackComplete,
 } from "./httrack.js";
 import { sync } from "./s3.js";
@@ -20,21 +21,26 @@ import { sync } from "./s3.js";
 export const main = async ({ url, agency, depth }) => {
   const paths = getSnapshotPaths({ host: url.host, agency });
 
-  const httrackArgs = getHttrackArgs({
-    url,
-    dest: paths.fs,
-    agency,
-    jwt,
-    depth,
-  });
+  const { complete } = await getHttrackProgress(paths.fs);
 
-  runHttrack(httrackArgs);
+  // If the snapshot is already complete, skip httrack
+  if (!complete) {
+    const httrackArgs = getHttrackArgs({
+      url,
+      dest: paths.fs,
+      agency,
+      jwt,
+      depth,
+    });
 
-  const { timedOut } = await waitForHttrackComplete(paths.fs);
+    runHttrack(httrackArgs);
 
-  if (timedOut) {
-    console.error("Httrack timed out", { url: url.href, agency, depth });
-    return;
+    const { timedOut } = await waitForHttrackComplete(paths.fs);
+
+    if (timedOut) {
+      console.error("Httrack timed out", { url: url.href, agency, depth });
+      return;
+    }
   }
 
   // Remove sensitive files - before syncing to S3