diff --git a/conf/node/constants.js b/conf/node/constants.js index 874dfba..5936230 100644 --- a/conf/node/constants.js +++ b/conf/node/constants.js @@ -5,18 +5,19 @@ export const jwt = process.env.JWT; * S3 */ -export const s3Region = 'eu-west-2'; +export const s3Region = "eu-west-2"; export const s3BucketName = process.env.S3_BUCKET_NAME; -export const s3Credentials = process.env.AWS_ACCESS_KEY_ID && process.env.AWS_SECRET_ACCESS_KEY && { - accessKeyId: process.env.AWS_ACCESS_KEY_ID, - secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY, -} +export const s3Credentials = process.env.AWS_ACCESS_KEY_ID && + process.env.AWS_SECRET_ACCESS_KEY && { + accessKeyId: process.env.AWS_ACCESS_KEY_ID, + secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY, + }; /** * Options */ -export const corsOptions ={ +export const corsOptions = { methods: ["POST"], origin: [ "http://spider.intranet.docker/", @@ -43,4 +44,16 @@ export const allowedTargetHosts = [ "demo.intranet.justice.gov.uk", ]; -export const allowedTargetAgencies = process.env.ALLOWED_AGENCIES?.split(",") ?? []; +export const allowedTargetAgencies = + process.env.ALLOWED_AGENCIES?.split(",") ?? []; + +/** + * Httrack + */ + +export const sensitiveFiles = [ + "cookies.txt", // Contains the JWT and CloudFront cookies. + "hts-log.txt", // Has the httrack command line arguments - this includes the JWT. + "hts-cache/doit.log", // Has the httrack command line arguments - this includes the JWT.
+ "hts-cache/new.zip", +]; diff --git a/conf/node/controllers/httrack.js b/conf/node/controllers/httrack.js index afdc32d..a98406d 100644 --- a/conf/node/controllers/httrack.js +++ b/conf/node/controllers/httrack.js @@ -9,15 +9,21 @@ import { jwt } from "../constants.js"; * @param {props} props * @param {string} props.host * @param {string} props.agency - * @returns {string} + * @returns {Object} object + * @returns {string} object.s3 - the s3 path + * @returns {string} object.fs - the local filesystem path */ -export const getSnapshotDir = ({ host, agency }) => { +export const getSnapshotPaths = ({ host, agency }) => { // Get date in format: 2023-01-17 const dateString = new Date().toISOString().slice(0, 10); + const s3Path = `${host}/${agency}/${dateString}`; + + const fsPath = `/tmp/snapshots/${s3Path}`; + // Return directory for the snapshot - return `/tmp/snapshots/${host}/${agency}/${dateString}`; + return { s3: s3Path, fs: fsPath }; }; /** diff --git a/conf/node/controllers/main.js b/conf/node/controllers/main.js index 8065724..49c7cf9 100644 --- a/conf/node/controllers/main.js +++ b/conf/node/controllers/main.js @@ -1,6 +1,8 @@ -import { jwt, s3BucketName } from "../constants.js"; +import fs from "node:fs/promises"; + +import { jwt, s3BucketName, sensitiveFiles } from "../constants.js"; import { - getSnapshotDir, + getSnapshotPaths, getHttrackArgs, runHttrack, waitForHttrackComplete, @@ -16,11 +18,11 @@ import { sync } from "./s3.js"; */ export const main = async ({ url, agency, depth }) => { - const directory = getSnapshotDir({ host: url.host, agency }); + const paths = getSnapshotPaths({ host: url.host, agency }); const httrackArgs = getHttrackArgs({ url, - dest: directory, + dest: paths.fs, agency, jwt, depth, @@ -28,10 +30,18 @@ export const main = async ({ url, agency, depth }) => { runHttrack(httrackArgs); - await waitForHttrackComplete(directory); + await waitForHttrackComplete(paths.fs); - // Delete any sensitive files before syncing to S3 + //
Remove sensitive files - before syncing to S3 + await Promise.all( + sensitiveFiles.map(file => fs.rm(`${paths.fs}/${file}`, { force: true })) + ); // Sync the snapshot to S3 - await sync(directory, `s3://${s3BucketName}${directory}`); + await sync(paths.fs, `s3://${s3BucketName}/${paths.s3}`); + + // Clean up the snapshot directory + await fs.rm(paths.fs, { recursive: true, force: true }); + + console.log("Snapshot complete", { url, agency, depth }); }; diff --git a/conf/node/controllers/main.test.js b/conf/node/controllers/main.test.js index 4c96a73..ae42c31 100644 --- a/conf/node/controllers/main.test.js +++ b/conf/node/controllers/main.test.js @@ -4,7 +4,7 @@ import { afterAll, beforeEach, expect, it, jest } from "@jest/globals"; import { ListObjectsV2Command, GetObjectCommand } from "@aws-sdk/client-s3"; import { main } from "./main.js"; -import { getSnapshotDir } from "./httrack.js"; +import { getSnapshotPaths } from "./httrack.js"; import { client as s3Client, s3EmptyDir } from "./s3.js"; import { s3BucketName } from "../constants.js"; @@ -14,7 +14,7 @@ const skipLongTests = process.env.npm_lifecycle_event === "test:watch"; describe("main", () => { const url = new URL("https://intranet.justice.gov.uk/"); const agency = "hq"; - const directory = getSnapshotDir({ host: url.host, agency }); + const paths = getSnapshotPaths({ host: url.host, agency }); beforeAll(async () => { // Mock console.log so the tests are quiet. 
@@ -31,39 +31,67 @@ describe("main", () => { beforeEach(async () => { // Clean out the directory - await fs.rm(directory, { recursive: true, force: true }); + await fs.rm(paths.fs, { recursive: true, force: true }); // Clean out the s3 bucket folder - await s3EmptyDir(directory); + await s3EmptyDir(paths.s3); }); it("should get index files on a shallow scrape", async () => { await main({ url, agency, depth: 1 }); - const pathPrefix = directory.replace(/^\//, ""); - // The snapshot should be on s3 const objects = await s3Client.send( new ListObjectsV2Command({ Bucket: s3BucketName, - Prefix: pathPrefix, + Prefix: paths.s3, }), ); // Ensure there's an index.html file const httrackIndexHtml = objects.Contents.find( - (object) => object.Key === `${pathPrefix}/index.html`, + (object) => object.Key === `${paths.s3}/index.html`, ); expect(httrackIndexHtml).toBeDefined(); const intranetIndexHtml = objects.Contents.find( - (object) => object.Key === `${pathPrefix}/${url.host}/index.html`, + (object) => object.Key === `${paths.s3}/${url.host}/index.html`, ); expect(intranetIndexHtml).toBeDefined(); }, 10_000); + it("should delete sensitive files and cleanup local fs", async () => { + await main({ url, agency, depth: 1 }); + + // The snapshot should be on s3 + const objects = await s3Client.send( + new ListObjectsV2Command({ + Bucket: s3BucketName, + Prefix: paths.s3, + }), + ); + + const sensitiveFiles = [ + `${paths.s3}/cookies.txt`, + `${paths.s3}/hts-log.txt`, + `${paths.s3}/hts-cache/doit.log`, + `${paths.s3}/hts-cache/new.zip`, + ] + + const foundSensitiveFiles = objects.Contents.find( + (object) => sensitiveFiles.includes(object.Key), + ); + + expect(foundSensitiveFiles).toBeUndefined(); + + // Ensure the local fs is cleaned up + const pathExists = await fs.stat(paths.fs).catch(() => false); + + expect(pathExists).toBe(false); + }, 10_000); + /** * Long running tests... 
*/ @@ -76,21 +104,18 @@ describe("main", () => { it("should get styles.css from the cdn", async () => { await main({ url, agency, depth: 2 }); - // Remove the leading slash, for S3 path - const pathPrefix = directory.replace(/^\//, ""); - // The snapshot should be on s3 const objects = await s3Client.send( new ListObjectsV2Command({ Bucket: s3BucketName, - Prefix: pathPrefix, + Prefix: paths.s3, }), ); const cdnCss = objects.Contents.find((object) => object.Key.match( new RegExp( - `^${pathPrefix}/cdn.${url.host}/build/[0-9a-f]{8}/app/themes/clarity/dist/css/style.css$`, + `^${paths.s3}/cdn.${url.host}/build/[0-9a-f]{8}/app/themes/clarity/dist/css/style.css$`, ), ), ); diff --git a/conf/node/controllers/s3.js b/conf/node/controllers/s3.js index 730b9bf..221b34e 100644 --- a/conf/node/controllers/s3.js +++ b/conf/node/controllers/s3.js @@ -48,10 +48,10 @@ export { sync }; /** * Empty an S3 folder by using sync and deleting all files * - * @param {string} folder - The folder to empty + * @param {string} path - The path to empty */ -export const s3EmptyDir = async (directory) => { +export const s3EmptyDir = async (path) => { // Make a tmp empty directory const emptyDir = `/tmp/${Date.now()}`; @@ -59,7 +59,7 @@ export const s3EmptyDir = async (directory) => { await fs.mkdir(emptyDir, { recursive: true }); // Sync the empty directory to the folder - await sync(emptyDir, `s3://${s3BucketName}${directory}`, { del: true }); + await sync(emptyDir, `s3://${s3BucketName}/${path}`, { del: true }); // Remove the empty directory await fs.rm(emptyDir, { recursive: true }); diff --git a/conf/node/controllers/s3.test.js b/conf/node/controllers/s3.test.js index 66533b6..8dccec2 100644 --- a/conf/node/controllers/s3.test.js +++ b/conf/node/controllers/s3.test.js @@ -113,7 +113,7 @@ describe("S3EmptyDir", () => { }); it("should empty the directory", async () => { - await s3EmptyDir("/test/s3-test"); + await s3EmptyDir("test/s3-test"); const objects = await client.send( new 
ListObjectsV2Command({