Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow retry of failed S3 sync. #28

Merged
merged 1 commit into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions conf/node/controllers/httrack.js
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,15 @@ export const getHttrackProgress = async (dest) => {
}

const files = {
index: `${dest}/index.html`,
log: `${dest}/hts-cache/new.txt`,
lock: `${dest}/hts-in_progress.lock`,
};

const response = {
requestCount: 0,
rate: 0,
complete: !fs.existsSync(files.lock),
complete: fs.existsSync(files.index) && !fs.existsSync(files.lock),
};

if (!fs.existsSync(files.log)) {
Expand All @@ -167,7 +168,11 @@ export const getHttrackProgress = async (dest) => {
// The first line is a header, so we subtract 1.
response.requestCount = lineCount - 1;

// If the time is 23:59:55 - 00:00:05, await 5 seconds.
// If the current time is between 00:00:00 and 00:00:05, wait 5 seconds before reading the log.
// This prevents the recent requests in `/hts-cache/new.txt` from spanning 2 days.
if (new Date().toTimeString().split(" ")[0] < "00:00:05") {
await new Promise((resolve) => setTimeout(resolve, 5000));
}

// Get the last 20 lines from the file. Or all of them if there are less than 20.
const lastLines = execSync(
Expand Down Expand Up @@ -229,10 +234,10 @@ export const waitForHttrackComplete = async (
fs.existsSync(`${dest}/hts-in_progress.lock`)
) {
if (iterations < 10 || iterations % logFrequency === 0) {
const elapsedTime = new Date(iterations * intervalSeconds * 1000).toISOString().substring(11, 19);
console.log(
`Waiting for httrack to complete ... ${elapsedTime} elapsed`,
);
const elapsedTime = new Date(iterations * intervalSeconds * 1000)
.toISOString()
.substring(11, 19);
console.log(`Waiting for httrack to complete ... ${elapsedTime} elapsed`);
}
await new Promise((resolve) => setTimeout(resolve, intervalSeconds * 1000));
}
Expand Down
30 changes: 18 additions & 12 deletions conf/node/controllers/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
getSnapshotPaths,
getHttrackArgs,
runHttrack,
getHttrackProgress,
waitForHttrackComplete,
} from "./httrack.js";
import { sync } from "./s3.js";
Expand All @@ -20,21 +21,26 @@ import { sync } from "./s3.js";
export const main = async ({ url, agency, depth }) => {
const paths = getSnapshotPaths({ host: url.host, agency });

const httrackArgs = getHttrackArgs({
url,
dest: paths.fs,
agency,
jwt,
depth,
});
const { complete } = getHttrackProgress(paths.fs);

runHttrack(httrackArgs);
// If the snapshot is already complete, skip httrack
if (!complete) {
const httrackArgs = getHttrackArgs({
url,
dest: paths.fs,
agency,
jwt,
depth,
});

const { timedOut } = await waitForHttrackComplete(paths.fs);
runHttrack(httrackArgs);

if (timedOut) {
console.error("Httrack timed out", { url: url.href, agency, depth });
return;
const { timedOut } = await waitForHttrackComplete(paths.fs);

if (timedOut) {
console.error("Httrack timed out", { url: url.href, agency, depth });
return;
}
}

// Remove sensitive files - before syncing to S3
Expand Down
Loading