Skip to content

Commit

Permalink
Cleanup fs and handle sensitive files. (#22)
Browse files Browse the repository at this point in the history
  • Loading branch information
EarthlingDavey authored Dec 3, 2024
1 parent 7601d63 commit c340043
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 35 deletions.
27 changes: 20 additions & 7 deletions conf/node/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,19 @@ export const jwt = process.env.JWT;
* S3
*/

export const s3Region = 'eu-west-2';
export const s3Region = "eu-west-2";
export const s3BucketName = process.env.S3_BUCKET_NAME;
export const s3Credentials = process.env.AWS_ACCESS_KEY_ID && process.env.AWS_SECRET_ACCESS_KEY && {
accessKeyId: process.env.AWS_ACCESS_KEY_ID,
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
}
export const s3Credentials = process.env.AWS_ACCESS_KEY_ID &&
process.env.AWS_SECRET_ACCESS_KEY && {
accessKeyId: process.env.AWS_ACCESS_KEY_ID,
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
};

/**
* Options
*/

export const corsOptions ={
export const corsOptions = {
methods: ["POST"],
origin: [
"http://spider.intranet.docker/",
Expand All @@ -43,4 +44,16 @@ export const allowedTargetHosts = [
"demo.intranet.justice.gov.uk",
];

export const allowedTargetAgencies = process.env.ALLOWED_AGENCIES?.split(",") ?? [];
export const allowedTargetAgencies =
process.env.ALLOWED_AGENCIES?.split(",") ?? [];

/**
* Httrack
*/

/**
 * Paths, relative to a snapshot's root directory, that are removed from the
 * local filesystem before the snapshot is synced to S3.
 */
export const sensitiveFiles = [
  // Contains the JWT and CloudFront cookies.
  "cookies.txt",
  // Has the httrack command line arguments - this includes the JWT.
  "hts-log.txt",
  // Has the httrack command line arguments - this includes the JWT.
  "hts-cache/doit.log",
  // NOTE(review): the reason this archive is treated as sensitive is not
  // stated; presumably the httrack cache can hold request data - confirm.
  "hts-cache/new.zip",
];
12 changes: 9 additions & 3 deletions conf/node/controllers/httrack.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,21 @@ import { jwt } from "../constants.js";
* @param {props} props
* @param {string} props.host
* @param {string} props.agency
* @returns {string}
* @returns {{ s3: string, fs: string }} The snapshot paths:
*   `s3` is the S3 key prefix (`host/agency/date`, no leading slash);
*   `fs` is the local filesystem path (`/tmp/snapshots/` followed by the S3 prefix).
*/

export const getSnapshotDir = ({ host, agency }) => {
export const getSnapshotPaths = ({ host, agency }) => {
// Get date in format: 2023-01-17
const dateString = new Date().toISOString().slice(0, 10);

const s3Path = `${host}/${agency}/${dateString}`;

const fsPath = `/tmp/snapshots/${s3Path}`;

// Return the S3 and local filesystem paths for the snapshot
return `/tmp/snapshots/${host}/${agency}/${dateString}`;
return { s3: s3Path, fs: fsPath };
};

/**
Expand Down
24 changes: 17 additions & 7 deletions conf/node/controllers/main.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import { jwt, s3BucketName } from "../constants.js";
import fs from "node:fs/promises";

import { jwt, s3BucketName, sensitiveFiles } from "../constants.js";
import {
getSnapshotDir,
getSnapshotPaths,
getHttrackArgs,
runHttrack,
waitForHttrackComplete,
Expand All @@ -16,22 +18,30 @@ import { sync } from "./s3.js";
*/

export const main = async ({ url, agency, depth }) => {
const directory = getSnapshotDir({ host: url.host, agency });
const paths = getSnapshotPaths({ host: url.host, agency });

const httrackArgs = getHttrackArgs({
url,
dest: directory,
dest: paths.fs,
agency,
jwt,
depth,
});

runHttrack(httrackArgs);

await waitForHttrackComplete(directory);
await waitForHttrackComplete(paths.fs);

// Delete any sensitive files before syncing to S3
// Remove sensitive files - before syncing to S3
await Promise.all(
sensitiveFiles.map(file => fs.rm(`${paths.fs}/${file}`, { force: true }))
);

// Sync the snapshot to S3
await sync(directory, `s3://${s3BucketName}${directory}`);
await sync(paths.fs, `s3://${s3BucketName}/${paths.s3}`);

// Clean up the snapshot directory
await fs.rm(paths.fs, { recursive: true, force: true });

console.log("Snapshot complete", { url, agency, depth });
};
53 changes: 39 additions & 14 deletions conf/node/controllers/main.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { afterAll, beforeEach, expect, it, jest } from "@jest/globals";
import { ListObjectsV2Command, GetObjectCommand } from "@aws-sdk/client-s3";

import { main } from "./main.js";
import { getSnapshotDir } from "./httrack.js";
import { getSnapshotPaths } from "./httrack.js";
import { client as s3Client, s3EmptyDir } from "./s3.js";
import { s3BucketName } from "../constants.js";

Expand All @@ -14,7 +14,7 @@ const skipLongTests = process.env.npm_lifecycle_event === "test:watch";
describe("main", () => {
const url = new URL("https://intranet.justice.gov.uk/");
const agency = "hq";
const directory = getSnapshotDir({ host: url.host, agency });
const paths = getSnapshotPaths({ host: url.host, agency });

beforeAll(async () => {
// Mock console.log so the tests are quiet.
Expand All @@ -31,39 +31,67 @@ describe("main", () => {

beforeEach(async () => {
// Clean out the directory
await fs.rm(directory, { recursive: true, force: true });
await fs.rm(paths.fs, { recursive: true, force: true });

// Clean out the s3 bucket folder
await s3EmptyDir(directory);
await s3EmptyDir(paths.s3);
});

it("should get index files on a shallow scrape", async () => {
await main({ url, agency, depth: 1 });

const pathPrefix = directory.replace(/^\//, "");

// The snapshot should be on s3
const objects = await s3Client.send(
new ListObjectsV2Command({
Bucket: s3BucketName,
Prefix: pathPrefix,
Prefix: paths.s3,
}),
);

// Ensure there's an index.html file
const httrackIndexHtml = objects.Contents.find(
(object) => object.Key === `${pathPrefix}/index.html`,
(object) => object.Key === `${paths.s3}/index.html`,
);

expect(httrackIndexHtml).toBeDefined();

const intranetIndexHtml = objects.Contents.find(
(object) => object.Key === `${pathPrefix}/${url.host}/index.html`,
(object) => object.Key === `${paths.s3}/${url.host}/index.html`,
);

expect(intranetIndexHtml).toBeDefined();
}, 10_000);

// Verifies that a completed run leaves no sensitive httrack files in the
// bucket and removes the local snapshot directory.
it("should delete sensitive files and cleanup local fs", async () => {
  await main({ url, agency, depth: 1 });

  // List everything that was synced to S3 under this snapshot's prefix.
  const objects = await s3Client.send(
    new ListObjectsV2Command({
      Bucket: s3BucketName,
      Prefix: paths.s3,
    }),
  );

  // Keys that must never reach the bucket.
  const sensitiveKeys = [
    `${paths.s3}/cookies.txt`,
    `${paths.s3}/hts-log.txt`,
    `${paths.s3}/hts-cache/doit.log`,
    `${paths.s3}/hts-cache/new.zip`,
  ];

  // `Contents` is optional in the SDK response type and may be absent when no
  // keys match the prefix, so fall back to an empty list instead of throwing.
  // Collecting every match (rather than the first) makes a failure report
  // show all leaked keys.
  const leakedKeys = (objects.Contents ?? [])
    .map((object) => object.Key)
    .filter((key) => sensitiveKeys.includes(key));

  expect(leakedKeys).toEqual([]);

  // Ensure the local fs is cleaned up. Only "no such file or directory"
  // counts as cleaned up; any other stat failure is a genuine error and must
  // fail the test rather than be mistaken for success.
  const pathExists = await fs.stat(paths.fs).then(
    () => true,
    (error) => {
      if (error.code === "ENOENT") {
        return false;
      }
      throw error;
    },
  );

  expect(pathExists).toBe(false);
}, 10_000);

/**
* Long running tests...
*/
Expand All @@ -76,21 +104,18 @@ describe("main", () => {
it("should get styles.css from the cdn", async () => {
await main({ url, agency, depth: 2 });

// Remove the leading slash, for S3 path
const pathPrefix = directory.replace(/^\//, "");

// The snapshot should be on s3
const objects = await s3Client.send(
new ListObjectsV2Command({
Bucket: s3BucketName,
Prefix: pathPrefix,
Prefix: paths.s3,
}),
);

const cdnCss = objects.Contents.find((object) =>
object.Key.match(
new RegExp(
`^${pathPrefix}/cdn.${url.host}/build/[0-9a-f]{8}/app/themes/clarity/dist/css/style.css$`,
`^${paths.s3}/cdn.${url.host}/build/[0-9a-f]{8}/app/themes/clarity/dist/css/style.css$`,
),
),
);
Expand Down
6 changes: 3 additions & 3 deletions conf/node/controllers/s3.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,18 @@ export { sync };
/**
* Empty an S3 folder by using sync and deleting all files
*
* @param {string} folder - The folder to empty
* @param {string} path - The path to empty
*/

export const s3EmptyDir = async (directory) => {
export const s3EmptyDir = async (path) => {
// Make a tmp empty directory
const emptyDir = `/tmp/${Date.now()}`;

// Ensure the directory exists
await fs.mkdir(emptyDir, { recursive: true });

// Sync the empty directory to the folder
await sync(emptyDir, `s3://${s3BucketName}${directory}`, { del: true });
await sync(emptyDir, `s3://${s3BucketName}/${path}`, { del: true });

// Remove the empty directory
await fs.rm(emptyDir, { recursive: true });
Expand Down
2 changes: 1 addition & 1 deletion conf/node/controllers/s3.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ describe("S3EmptyDir", () => {
});

it("should empty the directory", async () => {
await s3EmptyDir("/test/s3-test");
await s3EmptyDir("test/s3-test");

const objects = await client.send(
new ListObjectsV2Command({
Expand Down

0 comments on commit c340043

Please sign in to comment.