From ce53916ca6534b0e50ffec1126d6a5884abafa2f Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Thu, 30 Jan 2025 09:07:27 +0000 Subject: [PATCH 1/9] Documentation WIP --- .env.example | 4 ++ .github/README.md | 79 ++++++++++++++++++++++++++++-- Dockerfile | 2 + bin/launch.sh | 9 ---- conf/node/constants.js | 1 + conf/node/controllers/main.test.js | 61 ++++++++++++++++++++--- conf/node/server.js | 6 +++ docker-compose.yml | 3 ++ 8 files changed, 145 insertions(+), 20 deletions(-) diff --git a/.env.example b/.env.example index e5dd9f5..ed4f766 100644 --- a/.env.example +++ b/.env.example @@ -2,6 +2,10 @@ # Needed environment variables ### +INTRANET_JWT_DEV="" +INTRANET_JWT_STAGING="" +INTRANET_JWT_PRODUCTION="" + S3_BUCKET_NAME=my-bucket-name S3_ACCESS_KEY_ID=my-iam-access-key S3_SECRET_ACCESS_KEY=my-iam-secret-key diff --git a/.github/README.md b/.github/README.md index 2b48b0f..6d06bd8 100644 --- a/.github/README.md +++ b/.github/README.md @@ -91,6 +91,7 @@ Start docker compose: ``` make run ``` + There is a script designed to help you install the [Dory Proxy](https://github.com/FreedomBen/dory), if you'd like to. If you chose to install Dory, you can access the application here: @@ -101,13 +102,81 @@ Otherwise, access the application here: [localhost:2000](http://localhost:2000/) +## Application routes + +### `/status` + +There is a private `/status` route that will return a JSON response with the applications status, +including if it has access to the S3 bucket and intranet URLs. + +``` +# Make a GET request with curl to the /status route +curl http://app.archive.intranet.docker/status +``` + +The response should include `{"fetchStatuses":[{"env":"local","status":200}],"s3Status":true}` + +### `/spider` + +This is a private route that will trigger a snapshot, it should only be used for debugging purposes. 
+ +```bash +# Make a POST request with curl to the /spider route +curl -X POST http://app.archive.intranet.docker/spider -d "agency=hmcts&env=local&depth=1" +``` + +The response should be `{"status":200}` and the container logs should show the snapshot being created. + +### `/access` + +The primary route is `/access`, this is the only public route and it redirects to the CloudFront distribution. +For this to work, you should be running the intranet project locally, on the Intranet Dashboard click on the link to the archive. +Your browser will be sent to `http://app.archive.intranet.docker/access` and you will be redirected to a URL like `http://archive.intranet.docker/local-hmcts/index.html`. + + ## Understanding application logic -Let's begin with servers and their interactions within... +Let's begin with the main controller... + +The main controller, [main.js](./conf/node/controllers/main.js), is a script that runs all necessary functions in order to create a snapshot and then upload it to S3. + +The entrypoint script is [server.js](./conf/node/server.js). This script is responsible for setting up the server and scheduling the main controller to run at specific times. + +As we are running an Express server, we use middleware, located at [middleware.js](./conf/node/middleware.js), in order to parse and validate incoming requests. + +Along side the main controller are various distinct controllers. These controllers are each concerned with one distinct aspect of the snapshot process. +For example, the [cloudfront.js](./conf/node/controllers/cloudfront.js) controller is responsible for various functions related to the CloudFront distribution, including creating signed cookies. + +## Tests and TDD + +In an aim to make the application robust and easy to maintain, we have implemented tests using Jest. + +Middleware, and the controllers have tests, the tests are adjacent to the files they are testing e.g. `middleware.test.js` will be found next to `middleware.js`. 
+ +When making a change to the application, you can run the tests with the following command: + +```bash +# Exec into the container +make bash +# Run the tests +npm run test +# Or, run tests while watching for changes +npm run test:watch +# Or, append a particular test file +npm run test middleware +# Or, append a particular test file and watch for changes +npm run test:watch middleware +``` + +The main test requires access to the live intranet. If you see the following logs: + +> Could not access production. + Add JWT to your .env file to access the intranet. + +... and the main test is failing, you should add a JWT to the `.env` file. + +Visit dev.intranet.justice.gov.uk and copy the JWT from the browser's cookies. -The Archiver has an Nginx server. This is used to display responses from the underlying NodeJS -server where Node processes form requests and decides how to treat them. Essentially, if happy with the request, Node -will instruct HTTrack to perform a website copy operation, and it does this with predefined options, and a custom plugin. ## HTTrack @@ -159,7 +228,7 @@ sed -i 's|href="https://intranet.justice.gov.uk/agency-switcher/"|href="/"|g' $0 ### Testing and making modifications to the application -All processing for HTTrack is managed in the `process.js` file located in the NodeJS application. You will find all the +All processing for HTTrack is managed in the `server.js` file located in the NodeJS application. You will find all the options used to set HTTrack up. To understand the build process further, please look at the Makefile. diff --git a/Dockerfile b/Dockerfile index eef989a..1d70c98 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,6 +26,8 @@ ENV NODE_ENV=development USER node +CMD [] + # Create a production image, from the base image. diff --git a/bin/launch.sh b/bin/launch.sh index c691e26..6d0461a 100755 --- a/bin/launch.sh +++ b/bin/launch.sh @@ -9,12 +9,3 @@ echo -e "${DOTS} ${DOTS} Firing the website up... 
${DOTS}\n" # bring docker online (background) docker compose up -d - -# launch in browser -echo -e "${DOTS} ${DOTS} Launching your default browser... ${DOTS}\n" -sleep 2 - -if command -v python &> /dev/null -then - python -m webbrowser http://spider.intranet.docker -fi diff --git a/conf/node/constants.js b/conf/node/constants.js index 30a8509..1fb744d 100644 --- a/conf/node/constants.js +++ b/conf/node/constants.js @@ -1,5 +1,6 @@ import { parseScheduleString } from "./controllers/schedule.js"; +export const isLocal = process.env.NODE_ENV === "development"; export const ordinalNumber = parseInt(process.env.ORDINAL_NUMBER); export const port = 2000; diff --git a/conf/node/controllers/main.test.js b/conf/node/controllers/main.test.js index b21c554..3c1cbb2 100644 --- a/conf/node/controllers/main.test.js +++ b/conf/node/controllers/main.test.js @@ -1,12 +1,16 @@ import fs from "fs/promises"; import { afterAll, beforeEach, expect, it, jest } from "@jest/globals"; -import { S3Client, ListObjectsV2Command, GetObjectCommand } from "@aws-sdk/client-s3"; +import { + S3Client, + ListObjectsV2Command, + GetObjectCommand, +} from "@aws-sdk/client-s3"; import { main } from "./main.js"; import { getSnapshotPaths } from "./paths.js"; import { s3Options, s3EmptyDir } from "./s3.js"; -import { intranetUrls, s3BucketName } from "../constants.js"; +import { intranetUrls, intranetJwts, s3BucketName } from "../constants.js"; // Skip tests when running on CI, because this environment doesn't have access to the intranet. const skipAllTests = process.env.CI === "true"; @@ -14,7 +18,22 @@ const skipAllTests = process.env.CI === "true"; // Skip long tests when running in watch mode. const skipLongTests = process.env.npm_lifecycle_event === "test:watch"; -const envs = ['dev', 'production']; +const envs = ["dev", "production"]; + +/** + * Can we access the intranet? 
+ * + * @param {string} env + * @returns {Promise} + */ + +const canFetchEnv = async (env) => { + const { status } = await fetch(intranetUrls[env], { + redirect: "manual", + headers: { Cookie: `jwt=${intranetJwts[env]}` }, + }); + return status === 200; +}; describe.each(envs)("main - %s", (env) => { if (skipAllTests) { @@ -31,7 +50,17 @@ describe.each(envs)("main - %s", (env) => { const paths = getSnapshotPaths({ env, agency }); const s3Client = new S3Client(s3Options); + // Can we access the intranet? i.e. is our JWT valid? + let access = false; + beforeAll(async () => { + access = await canFetchEnv(env); + if (!access) { + console.info( + `Could not access ${env}.\nAdd JWT to your .env file to access the intranet.`, + ); + } + // Mock console.log so the tests are quiet. jest.spyOn(console, "log").mockImplementation(() => {}); }); @@ -53,6 +82,10 @@ describe.each(envs)("main - %s", (env) => { }); it("should get index files on a shallow scrape", async () => { + if (!access) { + return expect(access).toBe(true); + } + await main({ env, agency, depth: 1 }); // The snapshot should be on s3 @@ -78,6 +111,10 @@ describe.each(envs)("main - %s", (env) => { }, 10_000); it("should delete sensitive files and cleanup local fs", async () => { + if (!access) { + return expect(access).toBe(true); + } + await main({ env, agency, depth: 1 }); // The snapshot should be on s3 @@ -108,24 +145,32 @@ describe.each(envs)("main - %s", (env) => { }, 10_000); it("should create an auth/heartbeat file", async () => { + if (!access) { + return expect(access).toBe(true); + } + await main({ env, agency, depth: 1 }); // The snapshot should be on s3 const objects = await s3Client.send( new ListObjectsV2Command({ Bucket: s3BucketName, - Prefix: 'auth/heartbeat', + Prefix: "auth/heartbeat", }), ); const heartbeat = objects.Contents.find( - (object) => object.Key === 'auth/heartbeat', + (object) => object.Key === "auth/heartbeat", ); expect(heartbeat).toBeDefined(); }, 10_000); it("should create 
root and agency index files", async () => { + if (!access) { + return expect(access).toBe(true); + } + await main({ env, agency, depth: 1 }); const rootIndexHtml = await s3Client.send( @@ -134,7 +179,7 @@ describe.each(envs)("main - %s", (env) => { Key: "production" === env ? `index.html` : `${env}.html`, }), ); - + expect(rootIndexHtml).toBeDefined(); const agencyIndexHtml = await s3Client.send( @@ -157,6 +202,10 @@ describe.each(envs)("main - %s", (env) => { } it("should get styles.css from the cdn", async () => { + if (!access) { + return expect(access).toBe(true); + } + await main({ env, agency, depth: 2 }); // The snapshot should be on s3 diff --git a/conf/node/server.js b/conf/node/server.js index 9deb080..7205628 100644 --- a/conf/node/server.js +++ b/conf/node/server.js @@ -8,6 +8,7 @@ import express from "express"; // Relative import { + isLocal, ordinalNumber, intranetUrls, intranetJwts, @@ -72,6 +73,10 @@ app.get("/status", async function (_req, res, next) { .filter(([, jwt]) => jwt) .map(([env]) => env); + if(isLocal) { + envs.push("local"); + } + const fetchStatuses = await Promise.all( envs.map(async (env) => { const url = intranetUrls[env]; @@ -92,6 +97,7 @@ app.get("/status", async function (_req, res, next) { res.status(200).send(data); } catch (err) { + console.log(err); // Handling errors like this will send the error to the default Express error handler. // It will log the error to the console, return a 500 error page, // and show the error message on dev environments, but hide it on production. diff --git a/docker-compose.yml b/docker-compose.yml index f080769..0326e59 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,6 +23,9 @@ services: minio-init: # Wait for minio-init to complete before starting. 
condition: service_completed_successfully + # Requests to intranet.docker should go to host machine + extra_hosts: + - "intranet.docker:host-gateway" minio: image: minio/minio From 734e851fd08d0a6fd6fa2ae14bb10c5f31ccc98d Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Mon, 3 Feb 2025 09:50:03 +0000 Subject: [PATCH 2/9] Add config/secrets to readme --- .env.ci | 1 - .env.example | 16 +++-- .github/README.md | 134 +++++++++++++++++++++++++++++++++++----- Makefile | 41 +++++++++--- conf/node/middleware.js | 4 +- conf/node/server.js | 4 +- docker-compose.yml | 1 + 7 files changed, 167 insertions(+), 34 deletions(-) diff --git a/.env.ci b/.env.ci index 4215786..261def6 100644 --- a/.env.ci +++ b/.env.ci @@ -6,7 +6,6 @@ CI="true" AWS_ACCESS_KEY_ID=test-key-id AWS_SECRET_ACCESS_KEY=test-access-key S3_BUCKET_NAME=test-bucket -S3_ENDPOINT=http://minio:9000 ALLOWED_AGENCIES="hq,hmcts" INTRANET_JWT_DEV=test-jwt INTRANET_ARCHIVE_SHARED_SECRET=test-shared-secret diff --git a/.env.example b/.env.example index ed4f766..3a42951 100644 --- a/.env.example +++ b/.env.example @@ -2,12 +2,20 @@ # Needed environment variables ### +ALLOWED_AGENCIES="hq,hmcts" + +SNAPSHOT_SCHEDULE="dev::hmcts::Wed::16:08::3" + INTRANET_JWT_DEV="" INTRANET_JWT_STAGING="" INTRANET_JWT_PRODUCTION="" -S3_BUCKET_NAME=my-bucket-name -S3_ACCESS_KEY_ID=my-iam-access-key -S3_SECRET_ACCESS_KEY=my-iam-secret-key +# This should match the value generated for the intranet. +INTRANET_ARCHIVE_SHARED_SECRET="" + +# Minio/AWS credentials - for local only. +# On Cloud Platform, a service account is used. 
+AWS_ACCESS_KEY_ID=local-key-id +AWS_SECRET_ACCESS_KEY=local-access-key -SNAPSHOT_SCHEDULE="hq:Mon:17:30,hmcts:Tue:17:30" +S3_BUCKET_NAME=local-bucket diff --git a/.github/README.md b/.github/README.md index 6d06bd8..30215a0 100644 --- a/.github/README.md +++ b/.github/README.md @@ -168,31 +168,138 @@ npm run test middleware npm run test:watch middleware ``` -The main test requires access to the live intranet. If you see the following logs: +The main test requires access to dev and live intranet sites. If you see the following logs: > Could not access production. Add JWT to your .env file to access the intranet. ... and the main test is failing, you should add a JWT to the `.env` file. -Visit dev.intranet.justice.gov.uk and copy the JWT from the browser's cookies. +Visit dev.intranet.justice.gov.uk, wait for one heartbeat request (30s), and copy the JWT from the browser's cookies. +Save this to `INTRANET_JWT_DEV` in `.env`. + +Similarly, visit the production intranet and save the JWT to `INTRANET_JWT_PRODUCTION` in `.env`. + +The main test should run successfully. ## HTTrack -At the very heart of the Archiver sits [HTTrack](https://en.wikipedia.org/wiki/HTTrack). This application is configured -by Node to take a snapshot of the MoJ Intranet. Potentially, you can point the Archiver at any website address and, -using the settings for the Intranet, it will attempt to create an isolated copy of it. +At the very heart of the Archiver sits [HTTrack](https://en.wikipedia.org/wiki/HTTrack). This application is configured by Node to take a snapshot of the MoJ Intranet. + +Node's `spawn` and `exec` functions are used to run HTTrack in the background. The functions are located at [httrack.js](./conf/node/httrack.js), and the test suite is at [httrack.test.js](./conf/node/httrack.test.js). + +Observe HTTrack with the following actions: + +- HTTrack functions can be tested with the command `npm run test httrack`. 
+- It is also a dependency of `main`, that can be tested with the command `npm run test main`. +- And, it can be seen in action if the `/spider` route is requested. + ```bash + # Make a POST request with curl to the /spider route + curl -X POST http://app.archive.intranet.docker/spider -d "agency=hmcts&env=local&depth=1" + ``` +- Use the `SNAPSHOT_SCHEDULE` environment variable to schedule a snapshot. + +## Configuration + +The following table lists the environment variables that can be set in the `.env` file. + +When the application is deployed: +- the secrets are stored in the [Github Actions secrets](https://github.com/ministryofjustice/intranet-archive/settings/secrets/actions). +- config values (that are not secret) are stored in each environment's [config.yml](./deploy/dev/config.yml) file. + +| Variable | Description | Format/Example | +| ------------------------------------| ------------------------------------------------------------------- | ----------------------------- | +| Application Config | +| `ALLOWED_AGENCIES` | A comma separated list of agencies that are allowed to be archived. | `hq,hmcts` | +| `SNAPSHOT_SCHEDULE` | A comma separated of formatted schedules for the snapshots. | `dev::hq::Mon::17:30::3` | +| Intranet Secrets | +| `INTRANET_JWT_DEV` | JWT for the dev intranet | Header, payload and sig. 
| +| `INTRANET_JWT_STAGING` | JWT for the staging intranet | 〃 | +| `INTRANET_JWT_PRODUCTION` | JWT for the production intranet | 〃 | +| `INTRANET_ARCHIVE_SHARED_SECRET` | Shared secret for, for signing `/access` requests | 64 bit base64 string | +| AWS Secrets (local only) | +| `AWS_ACCESS_KEY_ID` | AWS access key (for minio) | `local-key-id` | +| `AWS_SECRET_ACCESS_KEY` | AWS secret access key (for minio) | `local-access-key` | +| S3 | +| `S3_BUCKET_NAME` | The S3 bucket on Cloud Platform this is an output of S3 module | `local-bucket` `cloud-platf…` | +| Cloudfront | +| `AWS_CLOUDFRONT_PRIVATE_KEY` | The private key for signing CloudFront cookies | RSA private key | +| `AWS_CLOUDFRONT_PUBLIC_KEY` | The public that CloudFront uses to verify the signed access policy | RSA public key | +| `AWS_CLOUDFRONT_PUBLIC_KEYS_OBJECT` | Active keys from the CF module (used to lookup ID from public key) | [{"id":"*","comment":"hash"}] | + +### JWTs for local development + +Obtaining JWTs when working locally is a manual process. + +Visit dev.intranet.justice.gov.uk, wait for one heartbeat request (30s), and copy the JWT from the browser's cookies. + +Save this to `INTRANET_JWT_DEV` in `.env`. + +Similarly, visit the production intranet and save the JWT to `INTRANET_JWT_PRODUCTION` in `.env`. + +### JWTs for the Cloud Platform + +To obtain JWTs that will be used by the application on Cloud Platform, you will need to run a command on an intranet FPM container. + +There is a helper script for this in the [intranet-tools](https://github.com/ministryofjustice/intranet-tools) repository. + +```bash +# Clone the intranet-tools repository +# Set NSP=intranet-dev, NSP=intranet-staging or NSP=intranet-production in .env +# Run this command from the project root +make gen-jwt role=intranet-archive +``` + +These JWTs should be stored in GitHub repository secrets. + +Note: these JWTs are valid for 3 years, and are only valid for requests originating from Cloud Platform's egress. 
+ +### Shared secret + +The shared secret is used to sign requests to the `/access` route. This is to ensure that only authorised requests are able to access the snapshots. + +For local development: + +- Run `make key-gen` from the intranet project. +- Copy `INTRANET_ARCHIVE_SHARED_SECRET` from the intranet project's `.env` file to the intranet-archive project's `.env` file. + +For Cloud Platform: + +- Run `make key-gen-shared-secret` from the root of this project. +- Paste the output to the GitHub repository secrets for both the intranet and intranet-archive repositories. +- Repeat this step for each environment: so that dev keys are different to staging keys, and staging keys are different to production keys. + +### CloudFront keys + +In this project, CloudFront keys are used to sign cookies. The keys are always generated locally by running `make key-gen-` commands. + +For local development: + +A set of dummy keys, that are not actually valid for a CloudFront distribution, are required so that the application can run and be tested. + +These keys are generated by running the following command: `key-gen-private`. Follow the instructions in the terminal (marked as A) to generate the keys for your .env file. + +For CI/CD: + +Again, a set of dummy keys, that are not actually valid for a CloudFront distribution, are required so that the application can be tested. + +Run: `key-gen-private`. Follow the instructions in the terminal (marked as B) to generate the keys for the `TEST_AWS_CLOUDFRONT_*` GitHub repository secrets. + +For Cloud Platform: + +The keys are generated by running the following command: `key-gen-private`. Follow the instructions in the terminal (marked as C) to generate the keys for the `AWS_CLOUDFRONT_*` GitHub repository secrets. ### Debugging -The output of HTTrack can be noted in Docker Composes' `stdout` in the running terminal window however, a more -detailed and linear output stream is available in the `hts-log.txt` file. 
You can find this in the root of the snapshot. +The output of the controllers and HTTrack can be noted in Docker Composes' `stdout` in the running terminal window. + +Fot HHTrack, a detailed and linear output stream is available in the `hts-log.txt` file. You can find this in the root of the snapshot. e.g. `/tmp/snapshots/hq/2021-09-01/hts-log.txt`. ### Custom commands During the build of the Archiver, we came across many challenges, two of which almost prevented our proof of concept -from succeeding. The first was an inability to display images. The second was an inability to download them. +from succeeding. The first was an inability to display images. The second was changing the Agency Switcher link destination. **1) The HTTrack `srcset` problem** @@ -226,13 +333,6 @@ This link to the root of the cdn domain will show the index page, and allow the sed -i 's|href="https://intranet.justice.gov.uk/agency-switcher/"|href="/"|g' $0 ``` -### Testing and making modifications to the application - -All processing for HTTrack is managed in the `server.js` file located in the NodeJS application. You will find all the -options used to set HTTrack up. - -To understand the build process further, please look at the Makefile. - ## Cloud Platform In an aim to towards good security practices, when this application is deployed to the Cloud Platform, the `/access` is the only route that is open publicly. @@ -240,9 +340,9 @@ The `/access` route allows users to be redirected to the CloudFront distribution Private routes, `/status` and `/spider` are used for developer purposes only. To access these endpoints, port-forward to the service. See the command below. -It may be possible to +It is possible to [interact with running pods with help from this cheatsheet](https://kubernetes.io/docs/reference/kubectl/cheatsheet/#interacting-with-running-pods). 
-Please be aware that with every call to the CP k8s cluster, you will need to provide the namespace, as shown below: +Please be aware that with every call to the Cloud Platform k8s cluster, you will need to provide the namespace, as shown below: ```bash kubectl -n intranet-archive-dev diff --git a/Makefile b/Makefile index 86f8f52..90894e0 100644 --- a/Makefile +++ b/Makefile @@ -40,26 +40,51 @@ build-prod: up-prod: docker compose -f docker-compose.prod.yml up +# Generate the shared secret used by the intranet for signing `/access` requests. +key-gen-shared-secret: + @openssl rand -base64 64 | tr -d '\n' | pbcopy + @echo "Shared secret copied to clipboard - either:" + @echo "A - Paste it into .env" + @echo " Once for the intranet project and once for the intranet-archive project" + @echo "B - Paste it into GitHub secrets" + @echo " Once for the intranet project and once for the intranet-archive project" + @echo " and repeat this command for each environment" + @echo "Use the name INTRANET_ARCHIVE_SHARED_SECRET" + # The following key-gen-* commands are for CloudFront RSA key generation/management. 
key-gen-private: @openssl genrsa -out /tmp/private_key.pem 2048 && pbcopy < /tmp/private_key.pem - @echo "Private key copied to clipboard - paste it into GitHub secrets" - @echo "Use the name AWS_CLOUDFRONT_PRIVATE_KEY_A or AWS_CLOUDFRONT_PRIVATE_KEY_B" + @echo "Private key copied to clipboard - either:" + @echo "A - Paste it into .env" + @echo " Use the name AWS_CLOUDFRONT_PRIVATE_KEY" + @echo "B - Paste it into GitHub secrets" + @echo " Use the name AWS_CLOUDFRONT_PRIVATE_KEY_A or AWS_CLOUDFRONT_PRIVATE_KEY_B" + @echo "C - Paste it into GitHub secrets" + @echo " Use the name TEST_AWS_CLOUDFRONT_PRIVATE_KEY" @echo "Then run 'make key-gen-public'" key-gen-public: @openssl rsa -in /tmp/private_key.pem -pubout -out /tmp/public_key.pem && pbcopy < /tmp/public_key.pem - @echo "Public key copied to clipboard - paste it into GitHub secrets" - @echo "Use the name AWS_CLOUDFRONT_PUBLIC_KEY_A or AWS_CLOUDFRONT_PUBLIC_KEY_B" - @echo "Optionally run 'make key-gen-object' if you are making a test object for GitHub actions" + @echo "Public key copied to clipboard - either:" + @echo "A - Paste it into .env" + @echo " Use the name AWS_CLOUDFRONT_PUBLIC_KEY" + @echo " Next run 'make key-gen-object'" + @echo "B - Paste it into GitHub secrets" + @echo " Use the name AWS_CLOUDFRONT_PUBLIC_KEY_A or AWS_CLOUDFRONT_PUBLIC_KEY_B" + @echo "C - Paste it into GitHub secrets" + @echo " Use the name TEST_AWS_CLOUDFRONT_PUBLIC_KEY" + @echo "Optionally run 'make key-gen-object' if you are populating .env or TEST_ secrets in GitHub actions" @echo "Then run 'make key-gen-clean'" key-gen-object: @echo "[{\"id\":\"GENERATED_BY_AWS\",\"comment\":\"$(shell cat /tmp/public_key.pem | openssl dgst -binary -sha256 | xxd -p -c 32 | cut -c 1-8)\"}]" | pbcopy - @echo "Public keys object copied to clipboard - paste it into GitHub secrets" - @echo "This is only used for testing. 
Use the name TEST_AWS_CLOUDFRONT_PUBLIC_KEYS_OBJECT" + @echo "Public keys object copied to clipboard - either: paste it into GitHub secrets" + @echo "A - Paste it into .env" + @echo " Use the name AWS_CLOUDFRONT_PUBLIC_KEYS_OBJECT" + @echo "C - Paste it into GitHub secrets" + @echo " Use the name TEST_AWS_CLOUDFRONT_PUBLIC_KEYS_OBJECT - This is only used for testing" @echo "Finally run 'make key-gen-clean'" key-gen-clean: @rm /tmp/private_key.pem /tmp/public_key.pem && echo "" | pbcopy - @echo "Keys removed from /tmp" \ No newline at end of file + @echo "Keys removed from /tmp" diff --git a/conf/node/middleware.js b/conf/node/middleware.js index 3fa415e..64a0789 100644 --- a/conf/node/middleware.js +++ b/conf/node/middleware.js @@ -162,6 +162,8 @@ export const checkSignature = (req, _res, next) => { */ export const errorHandler = (err, _req, res, _next) => { + console.log(err); + if (err.status === 400) { res .status(400) @@ -176,6 +178,6 @@ export const errorHandler = (err, _req, res, _next) => { return; } - // For everthing else, return a 500 error + // For everything else, return a 500 error res.status(500).sendFile("static/error-pages/500.html", { root: __dirname }); }; diff --git a/conf/node/server.js b/conf/node/server.js index 7205628..768cf03 100644 --- a/conf/node/server.js +++ b/conf/node/server.js @@ -97,10 +97,8 @@ app.get("/status", async function (_req, res, next) { res.status(200).send(data); } catch (err) { - console.log(err); // Handling errors like this will send the error to the default Express error handler. - // It will log the error to the console, return a 500 error page, - // and show the error message on dev environments, but hide it on production. + // It will log the error to the console, return a 500 error page. 
next(err); } }); diff --git a/docker-compose.yml b/docker-compose.yml index 0326e59..38ec71a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,6 +14,7 @@ services: ORDINAL_NUMBER: 0 VIRTUAL_HOST: app.archive.intranet.docker VIRTUAL_PORT: "2000" + S3_ENDPOINT: "http://minio:9000" volumes: - node_modules:/home/node/app/node_modules - ./conf/node:/home/node/app From c2921c203a31ada80c81c9e0ad68ad78d1578824 Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Mon, 3 Feb 2025 11:02:07 +0000 Subject: [PATCH 3/9] Add diagrams to docs. --- .github/README.md | 76 +++++++++++++++++++++++++++++++++++---------- Makefile | 8 +---- bin/launch.sh | 2 +- conf/node/server.js | 2 +- 4 files changed, 62 insertions(+), 26 deletions(-) diff --git a/.github/README.md b/.github/README.md index 30215a0..48add5e 100644 --- a/.github/README.md +++ b/.github/README.md @@ -20,6 +20,23 @@ Archiving the Intranet, thankfully, is a task made simple using the following te 4. HTTrack Cli 5. NodeJS Server +## Infrastructure overview + +This diagram shows the flow of data from the intranet to the user. + +```mermaid +graph LR + A[Intranet] -->|Content| B[NodeJS & HTTrack] + B -->|Snapshot| C[S3] + C -->|Content| D[CloudFront] + D -->|Content| E[User] +``` + +> [!NOTE] +> The first part, where the content is moved from the Intranet to the S3 bucket, is handled by the Archiver, and this is a scheduled task. +> The second part where the user accesses a snapshot from S3, is handled by the CloudFront distribution. + + ## Viewing the latest snapshot Access is granted to the snapshot if, you: @@ -44,6 +61,20 @@ information. 4. The NodeJS responds by redirecting to the CloudFront distribution. The redirect URL contains cookies, so that the user can access the snapshot. 
+```mermaid +sequenceDiagram + participant User + participant Intranet + participant Archive (NodeJS) + participant CloudFront + User->>Intranet: Login + User->>Intranet: Click archive link + Intranet->>Archive (NodeJS): POST /access + Archive (NodeJS)->>Archive (NodeJS): Validate request + Archive (NodeJS)->>CloudFront: Redirect + CloudFront->>User: Snapshot +``` + ## Scheduling a snapshot Find the config file at `deploy//config.yml`. @@ -71,13 +102,14 @@ See the Cloud Platform and Commands sections below. ## Local development -> It's important to note that creating a snapshot of the intranet from a local machine proved to present resource -> related issues, such as VPN timeouts and rate limiting. - Requires - Docker +Optional + +- Local instance of the Intranet (for testing local scrape & access endpoints) + ### Installation Clone to your machine: @@ -210,20 +242,20 @@ When the application is deployed: | Variable | Description | Format/Example | | ------------------------------------| ------------------------------------------------------------------- | ----------------------------- | -| Application Config | +| **Application Config** | | `ALLOWED_AGENCIES` | A comma separated list of agencies that are allowed to be archived. | `hq,hmcts` | | `SNAPSHOT_SCHEDULE` | A comma separated of formatted schedules for the snapshots. | `dev::hq::Mon::17:30::3` | -| Intranet Secrets | +| **Intranet Secrets** | | `INTRANET_JWT_DEV` | JWT for the dev intranet | Header, payload and sig. 
| | `INTRANET_JWT_STAGING` | JWT for the staging intranet | 〃 | | `INTRANET_JWT_PRODUCTION` | JWT for the production intranet | 〃 | | `INTRANET_ARCHIVE_SHARED_SECRET` | Shared secret for, for signing `/access` requests | 64 bit base64 string | -| AWS Secrets (local only) | +| **AWS Secrets (local only)** | | `AWS_ACCESS_KEY_ID` | AWS access key (for minio) | `local-key-id` | | `AWS_SECRET_ACCESS_KEY` | AWS secret access key (for minio) | `local-access-key` | -| S3 | +| **S3** | | `S3_BUCKET_NAME` | The S3 bucket on Cloud Platform this is an output of S3 module | `local-bucket` `cloud-platf…` | -| Cloudfront | +| **Cloudfront** | | `AWS_CLOUDFRONT_PRIVATE_KEY` | The private key for signing CloudFront cookies | RSA private key | | `AWS_CLOUDFRONT_PUBLIC_KEY` | The public that CloudFront uses to verify the signed access policy | RSA public key | | `AWS_CLOUDFRONT_PUBLIC_KEYS_OBJECT` | Active keys from the CF module (used to lookup ID from public key) | [{"id":"*","comment":"hash"}] | @@ -290,13 +322,13 @@ For Cloud Platform: The keys are generated by running the following command: `key-gen-private`. Follow the instructions in the terminal (marked as C) to generate the keys for the `AWS_CLOUDFRONT_*` GitHub repository secrets. -### Debugging +## Debugging The output of the controllers and HTTrack can be noted in Docker Composes' `stdout` in the running terminal window. Fot HHTrack, a detailed and linear output stream is available in the `hts-log.txt` file. You can find this in the root of the snapshot. e.g. `/tmp/snapshots/hq/2021-09-01/hts-log.txt`. -### Custom commands +## Custom HTTrack commands During the build of the Archiver, we came across many challenges, two of which almost prevented our proof of concept from succeeding. The first was an inability to display images. The second was changing the Agency Switcher link destination. 
@@ -365,10 +397,20 @@ kubectl -n intranet-archive-dev service/intranet-archive-service 2000:80 **Make** -| Command | Description | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -| `make image` | Used by GitHub action, cd.yml, during build step | -| `make launch` | Checks if the intranet docker instance is running; if not, launch dory and docker in the background and open the site in the systems default browser | -| `make run` | Launch the application locally with `docker compose up`, requiring `env` + `dory` | -| `make down` | Alias of `docker compose down`. | -| `make bash` | Open a bash shell on the spider container. The application must already be running (e.g. via `make run`) before this can be used. | +| Command | Description | +| ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------| +| **Local Development** | | +| `make launch` | Checks if the intranet docker instance is running; if not, launch dory and docker in the background | +| `make run` | Launch the application locally with `docker compose up`, requiring `env` + `dory` | +| `make down` | Alias of `docker compose down`. | +| `make bash` | Open a bash shell on the spider container. The application must already be running (e.g. via `make run`) before this can be used. | +| **Verify prod. locally** | | +| `make build-prod` | Build the production image (for verifying that the production image can be built locally). | +| `make up-prod` | Launch the production image locally (for verifying that the production image can be launched locally). | +| **Intranet Secrets** | | +| `make key-gen-shared-secret` | Generate a shared secret for the application, see [Shared Secret](#shared-secret). 
| +| **CloudFront** | | +| `make key-gen-private` | Generate a private key for CloudFront, see [CloudFront keys](#cloudfront-keys). | +| `make key-gen-public` | Generate a public key for CloudFront, see [CloudFront keys](#cloudfront-keys). | +| `make key-gen-object` | Generate an object for CloudFront, see [CloudFront keys](#cloudfront-keys). | +| `make key-gen-clean` | Remove all generated keys. | \ No newline at end of file diff --git a/Makefile b/Makefile index 90894e0..9e2a809 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,5 @@ default: launch -IMAGE := ministryofjustice/intranet-archive - # Start the application run: env dory docker compose up @@ -20,15 +18,11 @@ down: dory: @chmod +x ./bin/dory-start.sh && ./bin/dory-start.sh - launch: @bin/launch.sh - @echo "\n Intranet spider available here: http://spider.intranet.docker/\n" + @echo "\n Intranet archive available here: http://app.archive.intranet.docker/status\n" @docker compose logs -f spider -image: Dockerfile Makefile build - docker build -t $(IMAGE) . - # Get inside the spider container bash: docker compose exec spider /bin/bash diff --git a/bin/launch.sh b/bin/launch.sh index 6d0461a..fcc41e1 100755 --- a/bin/launch.sh +++ b/bin/launch.sh @@ -5,7 +5,7 @@ DOTS="\n \033[0;32m***\033[0m" echo -e "${DOTS} ${DOTS} Checking Dory... ${DOTS}\n" chmod +x ./bin/dory-start.sh && ./bin/dory-start.sh -echo -e "${DOTS} ${DOTS} Firing the website up... ${DOTS}\n" +echo -e "${DOTS} ${DOTS} Firing the application up... 
${DOTS}\n" # bring docker online (background) docker compose up -d diff --git a/conf/node/server.js b/conf/node/server.js index 768cf03..cf395f5 100644 --- a/conf/node/server.js +++ b/conf/node/server.js @@ -164,7 +164,7 @@ app.post("/access", async function (req, res, next) { app.use(function (_req, res) { // Return a 404 page if no route is matched - res.status(404).sendFile("static/404.html", { root: __dirname }); + res.status(404).sendFile("static/error-pages/404.html", { root: __dirname }); }); /** From 8ac2365b599b85c900a767b8bdde266cfd1b481b Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Mon, 3 Feb 2025 11:04:55 +0000 Subject: [PATCH 4/9] Update README.md --- .github/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/README.md b/.github/README.md index 48add5e..dc7ffc4 100644 --- a/.github/README.md +++ b/.github/README.md @@ -81,11 +81,11 @@ Find the config file at `deploy//config.yml`. Update the `SNAPSHOT_SCHEDULE` environment variable with values for the desired agency. -It should be in the following pattern `::::::`. +It should be in the following pattern `::::::(::)`. And, multiple values should be comma separated. -e.g. `dev::hq::Mon::17:30::3,dev::hmcts::Thu::17:30::3` +e.g. `dev::hq::Mon::17:30,dev::hmcts::Thu::17:30::3` ## Manually creating a snapshot From b971c0a4a91824968891f2301aea3caaa72973e1 Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Thu, 6 Feb 2025 16:29:01 +0000 Subject: [PATCH 5/9] Update according to feedback. 
--- .env.ci | 12 ++-- .env.example | 50 +++++++++++---- .github/README.md | 64 +++++++++++++++++-- Makefile | 2 +- conf/local.Caddyfile | 8 +-- .../__snapshots__/httrack.test.js.snap | 7 +- conf/node/controllers/cloudfront.js | 7 ++ conf/node/controllers/cloudfront.test.js | 5 ++ conf/node/controllers/httrack.js | 24 ++++++- conf/node/controllers/main.test.js | 2 +- conf/node/server.js | 33 +++++++--- docker-compose.yml | 28 +++++--- 12 files changed, 196 insertions(+), 46 deletions(-) diff --git a/.env.ci b/.env.ci index 261def6..981cc83 100644 --- a/.env.ci +++ b/.env.ci @@ -1,11 +1,15 @@ -### +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # Environment variables for CI -### +# # # # # # # # # # # # # # # # # # # # # # # # # # # # CI="true" + +ALLOWED_AGENCIES="hq,hmcts" + AWS_ACCESS_KEY_ID=test-key-id AWS_SECRET_ACCESS_KEY=test-access-key S3_BUCKET_NAME=test-bucket -ALLOWED_AGENCIES="hq,hmcts" -INTRANET_JWT_DEV=test-jwt + INTRANET_ARCHIVE_SHARED_SECRET=test-shared-secret + +INTRANET_JWT_DEV=test-jwt diff --git a/.env.example b/.env.example index 3a42951..9f1b472 100644 --- a/.env.example +++ b/.env.example @@ -1,21 +1,49 @@ -### -# Needed environment variables -### +# # # # # # # # # # # # # # # # # # # # # # # # # # # # +# ℹ️ For setup and details see README.md +# # # # # # # # # # # # # # # # # # # # # # # # # # # # +# # # # # # # # # # # # # # # # # # # # # # # # # # # # +# 👌 Env vars. with sane defaults, update accordingly. +# # # # # # # # # # # # # # # # # # # # # # # # # # # # + +# Allowed agencies, comma separated. ALLOWED_AGENCIES="hq,hmcts" +# Schedule for the snapshot, in the format `::::::(::)` SNAPSHOT_SCHEDULE="dev::hmcts::Wed::16:08::3" -INTRANET_JWT_DEV="" -INTRANET_JWT_STAGING="" -INTRANET_JWT_PRODUCTION="" - -# This should match the value generated for the intranet. -INTRANET_ARCHIVE_SHARED_SECRET="" +# # # # # # # # # # # # # # # # # # # # # # # # # # # # +# 🚫 Env vars. that are unlikely to need changes. 
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # -# Minio/AWS credentials - for local only. -# On Cloud Platform, a service account is used. +# Minio/AWS credentials - for local only - On Cloud Platform, a service account is used. AWS_ACCESS_KEY_ID=local-key-id AWS_SECRET_ACCESS_KEY=local-access-key +# S3 bucket name - for local only. S3_BUCKET_NAME=local-bucket + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # +# 📝 Env vars. where user action is required. +# # # # # # # # # # # # # # # # # # # # # # # # # # # # + +# 1️⃣ Run `make key-gen-shared-secret` to generate a new shared secret. +# Paste the value here and in the intranet's .env file. +INTRANET_ARCHIVE_SHARED_SECRET="" + +# 2️⃣ Run `make key-gen-private` to generate a new private key. +# Paste the value here. +AWS_CLOUDFRONT_PRIVATE_KEY="" +# 3️⃣ Run `make key-gen-public` to generate a new public key. +# Paste the value here. +AWS_CLOUDFRONT_PUBLIC_KEY="" +# 4️⃣ Run `make key-gen-object` to generate a new public keys object. +# Paste the value here. +AWS_CLOUDFRONT_PUBLIC_KEYS_OBJECT="" +# 5️⃣ Run `make key-gen-clean` to clean up the generated keys. + +# 6️⃣ For the full test suite to run locally, you need to set the JWTs for the intranet. +# Go to https://dev.intranet.justice.gov.uk wait 30s for a heartbeat request to complete, and copy the JWT cookie value. +INTRANET_JWT_DEV="" +# 7️⃣ Repeat for production. 
+INTRANET_JWT_PRODUCTION="" diff --git a/.github/README.md b/.github/README.md index dc7ffc4..970dcee 100644 --- a/.github/README.md +++ b/.github/README.md @@ -108,19 +108,43 @@ Requires Optional -- Local instance of the Intranet (for testing local scrape & access endpoints) +- Local instance of the Intranet + +### Local Intranet + +For reference, the code related to this project in the [intranet repository](https://github.com/ministryofjustice/intranet/) is at: + +- [app/themes/clarity/inc/agency.php](https://github.com/ministryofjustice/intranet/blob/main/public/app/themes/clarity/inc/agency.php) +- [app/themes/clarity/inc/admin/intranet-archive-link.php](https://github.com/ministryofjustice/intranet/blob/main/public/app/themes/clarity/inc/admin/intranet-archive-link.php) + +If you want to test scraping of the intranet from a local source then the intranet must be running locally at [http://intranet.docker]. + +For the archive link on the intranet dashboard to work correctly: + +- At least one agency should have `'has_archive' => true` set in `agency.php -> getList()`. +- The environment variables: `INTRANET_ARCHIVE_URL` and `INTRANET_ARCHIVE_SHARED_SECRET` must be set. See [Configuration section](#configuration). ### Installation Clone to your machine: -``` +```bash git clone https://github.com/ministryofjustice/intranet-archive.git && cd intranet-archive ``` -Start docker compose: +Prepare the environment: +```bash +make env ``` + +This command will create a `.env` file in the root of the project. + +Open the .env file and set the variables, annotated with the numbers 1 - 7. + +Start docker compose: + +```bash make run ``` @@ -136,6 +160,14 @@ Otherwise, access the application here: ## Application routes +Locally, requests can be made to these routes as part of familiarisation with the application. 
+ +Ensure that previous steps have been followed: + +- populate the .env file of this project +- populate the .env file of the intranet project +- both projects are running and the intranet is accessible at [http://intranet.docker] + ### `/status` There is a private `/status` route that will return a JSON response with the applications status, @@ -144,6 +176,7 @@ including if it has access to the S3 bucket and intranet URLs. ``` # Make a GET request with curl to the /status route curl http://app.archive.intranet.docker/status +curl http://localhost:2000/status ``` The response should include `{"fetchStatuses":[{"env":"local","status":200}],"s3Status":true}` @@ -154,17 +187,32 @@ This is a private route that will trigger a snapshot, it should only be used for ```bash # Make a POST request with curl to the /spider route -curl -X POST http://app.archive.intranet.docker/spider -d "agency=hmcts&env=local&depth=1" +curl -X POST http://app.archive.intranet.docker/spider -d "agency=hmcts&env=local&depth=2" ``` The response should be `{"status":200}` and the container logs should show the snapshot being created. +> [!NOTE] +> The progress logs can be found in the terminal where the application is running. +> The update interval is every second for the first second, then every 5 minutes after that. + +A scrape depth of 2 is sufficient to validate that HTTrack is working correctly. It will take approx. 1 minute to complete. + +For a more thorough scrape, set the depth to 3; that will take approx. 20 minutes. + +For a full scrape, remove the optional depth parameter; that will take approx. 12 hours. + ### `/access` The primary route is `/access`, this is the only public route and it redirects to the CloudFront distribution. For this to work, you should be running the intranet project locally, on the Intranet Dashboard click on the link to the archive. 
Your browser will be sent to `http://app.archive.intranet.docker/access` and you will be redirected to a URL like `http://archive.intranet.docker/local-hmcts/index.html`. +### Additional local endpoints + +It may help with local debugging to browse the S3 bucket. Minio is used as an alternative to AWS S3, and can be accessed at [http://minio.archive.intranet.docker] or [http://localhost:9010]. + +Refer to `AWS_ACCESS_KEY_ID` & `AWS_SECRET_ACCESS_KEY` in the `.env` file - these are the web interface credentials. ## Understanding application logic @@ -209,12 +257,20 @@ The main test requires access to dev and live intranet sites. If you see the fol Visit dev.intranet.justice.gov.uk, wait for one heartbeat request (30s), and copy the JWT from the browser's cookies. +> [!NOTE] +> As dev.intranet.justice.gov.uk uses an Entra App that is on the development tenant, +> you will need to use your `@devl.justice.gov.uk` email address to log in. + Save this to `INTRANET_JWT_DEV` in `.env`. Similarly, visit the production intranet and save the JWT to `INTRANET_JWT_PROD` in `.env`. The main test should run successfully. +> [!NOTE] +> These JWTs are short lived credentials and will expire after 60 minutes. +> It is therefore recommended to run the complete test suite `npm run test` immediately after obtaining the JWTs. + ## HTTrack At the very heart of the Archiver sits [HTTrack](https://en.wikipedia.org/wiki/HTTrack). This application is configured by Node to take a snapshot of the MoJ Intranet. diff --git a/Makefile b/Makefile index f4ffc0d..378651f 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ default: launch # Start the application -run: env dory +run: dory docker compose up up_daemon: env dory diff --git a/conf/local.Caddyfile b/conf/local.Caddyfile index 3c617f4..8a8a5fe 100644 --- a/conf/local.Caddyfile +++ b/conf/local.Caddyfile @@ -1,8 +1,8 @@ # A local mock CDN to proxy requests to the Minio (S3) server. 
# Mimics AWS CloudFront, and removes the bucket path from the URL. -# e.g. Request: http://archive.intranet.docker/intranet.justice.gov.uk/hmcts/2024-12-13/index.html -# proxies to : http://minio:9000/bucket-name/intranet.justice.gov.uk/hmcts/2024-12-13/index.html +# e.g. Request: http://archive.intranet.docker/dev-hmcts/2024-12-13/index.html +# proxies to : http://minio:9010/bucket-name/dev-hmcts/2024-12-13/index.html -:2019 +:2029 rewrite * /{$S3_BUCKET_NAME}{uri} -reverse_proxy minio:9000 +reverse_proxy intranet-archive-minio:9010 diff --git a/conf/node/controllers/__snapshots__/httrack.test.js.snap b/conf/node/controllers/__snapshots__/httrack.test.js.snap index b3a2251..e6e54cf 100644 --- a/conf/node/controllers/__snapshots__/httrack.test.js.snap +++ b/conf/node/controllers/__snapshots__/httrack.test.js.snap @@ -20,7 +20,12 @@ exports[`getHttrackArgs should return an array of arguments 1`] = ` "-*intranet.justice.gov.uk/?page_id=*", "-*intranet.justice.gov.uk/wp-json/*/embed*", "-*intranet.justice.gov.uk/wp/*", - "+*intranet.justice.gov.uk/?*agency=hq", + "-intranet.docker/agency-switcher/", + "-intranet.docker/?*agency=*", + "-intranet.docker/?p=*", + "-intranet.docker/?page_id=*", + "-intranet.docker/wp-json/*/embed*", + "-intranet.docker/wp/*", "-s0", "-V", ""sed -i 's/srcset="[^"]*"//g' $0 && sed -i 's|href="https://intranet.justice.gov.uk/agency-switcher/"|href="/"|g' $0"", diff --git a/conf/node/controllers/cloudfront.js b/conf/node/controllers/cloudfront.js index 5c7137b..b37ecfe 100644 --- a/conf/node/controllers/cloudfront.js +++ b/conf/node/controllers/cloudfront.js @@ -2,6 +2,7 @@ import crypto from "node:crypto"; import { getSignedCookies } from "@aws-sdk/cloudfront-signer"; import { + isLocal, cloudFrontKeysObject as keysObject, cloudFrontPublicKey as publicKey, cloudFrontPrivateKey as privateKey, @@ -19,6 +20,12 @@ let cachedKeyPairId = null; */ export const getCdnUrl = (appUrl) => { + // If the app is running locally without using `.docker` 
hostname. + if (appUrl.host === "localhost:2000") { + // Return the localhost CDN URL. + return new URL("http://localhost:2029"); + } + // Check appHost starts with `app.` if (!appUrl.host.startsWith("app.")) { throw new Error("Invalid host"); diff --git a/conf/node/controllers/cloudfront.test.js b/conf/node/controllers/cloudfront.test.js index ae10d74..857fd98 100644 --- a/conf/node/controllers/cloudfront.test.js +++ b/conf/node/controllers/cloudfront.test.js @@ -13,6 +13,11 @@ describe("getCdnUrl", () => { expect(result.origin).toBe("https://archive.example.com"); }); + it("should return a localhost URL object", () => { + const result = getCdnUrl(new URL("http://localhost:2000")); + expect(result).toStrictEqual(new URL("http://localhost:2029")); + }); + it("should throw an error for invalid host", () => { expect(() => getCdnUrl(new URL("https://archive.example.com"))).toThrow( "Invalid host", diff --git a/conf/node/controllers/httrack.js b/conf/node/controllers/httrack.js index 597373e..8af41f8 100644 --- a/conf/node/controllers/httrack.js +++ b/conf/node/controllers/httrack.js @@ -1,7 +1,7 @@ import { spawn, execSync } from "node:child_process"; import fs from "node:fs"; -import { intranetJwts } from "../constants.js"; +import { isLocal, intranetJwts } from "../constants.js"; /** * Get arguments for httrack cli. 
@@ -34,15 +34,27 @@ export const getHttrackArgs = ({ url, dest, agency, jwt, depth }) => { "+*.woff", "-ad.doubleclick.net/*", "-justiceuk.sharepoint.com/*", + // Exclude the agency switcher and WordPress URLs (on *.intranet.justice.gov.uk) "-*intranet.justice.gov.uk/agency-switcher/", "-*intranet.justice.gov.uk/?*agency=*", "-*intranet.justice.gov.uk/?p=*", "-*intranet.justice.gov.uk/?page_id=*", "-*intranet.justice.gov.uk/wp-json/*/embed*", "-*intranet.justice.gov.uk/wp/*", - "+*intranet.justice.gov.uk/?*agency=" + agency, ]; + if (isLocal) { + // Exclude the agency switcher and WordPress URLs (on intranet.docker) + rules.push( + "-intranet.docker/agency-switcher/", + "-intranet.docker/?*agency=*", + "-intranet.docker/?p=*", + "-intranet.docker/?page_id=*", + "-intranet.docker/wp-json/*/embed*", + "-intranet.docker/wp/*", + ); + } + const commands = { // Remove srcset attributes removeSrcset: `sed -i 's/srcset="[^"]*"//g' $0`, @@ -50,6 +62,12 @@ export const getHttrackArgs = ({ url, dest, agency, jwt, depth }) => { replaceAgencySwitcher: `sed -i 's|href="https://intranet.justice.gov.uk/agency-switcher/"|href="/"|g' $0`, }; + let cookie = `dw_agency=${agency}`; + + if (jwt) { + cookie += `; jwt=${jwt}`; + } + /** @type {string[]} */ const settings = [ "-s0", // never follow robots.txt and meta robots tags: https://www.mankier.com/1/httrack#-sN @@ -59,7 +77,7 @@ export const getHttrackArgs = ({ url, dest, agency, jwt, depth }) => { "-F", "intranet-archive", "-%X", - `Cookie: dw_agency=${agency}; jwt=${jwt}`, + `Cookie: ${cookie}`, ...(depth ? 
[`-r${depth}`] : []), // set the mirror depth "-O", // path for snapshot/logfiles+cache: https://www.mankier.com/1/httrack#-O dest, diff --git a/conf/node/controllers/main.test.js b/conf/node/controllers/main.test.js index 3c1cbb2..f5be4fe 100644 --- a/conf/node/controllers/main.test.js +++ b/conf/node/controllers/main.test.js @@ -30,7 +30,7 @@ const envs = ["dev", "production"]; const canFetchEnv = async (env) => { const { status } = await fetch(intranetUrls[env], { redirect: "manual", - headers: { Cookie: `jwt=${intranetJwts[env]}` }, + headers: { Cookie: `dw_agency=hq; jwt=${intranetJwts[env]}` }, }); return status === 200; }; diff --git a/conf/node/server.js b/conf/node/server.js index cf395f5..f0e9ce5 100644 --- a/conf/node/server.js +++ b/conf/node/server.js @@ -66,25 +66,39 @@ app.get("/health", function (_req, res) { res.status(200).send("OK"); }); -app.get("/status", async function (_req, res, next) { +app.get("/status", async function (req, res, next) { try { // Get envs where a JWT has been set. const envs = Object.entries(intranetJwts) .filter(([, jwt]) => jwt) .map(([env]) => env); - if(isLocal) { + if (isLocal) { envs.push("local"); } + // Set an agency cookie so that we don't get a redirect status code to the agency switcher page. 
+ const defaultCookie = `dw_agency=hq`; + const fetchStatuses = await Promise.all( envs.map(async (env) => { const url = intranetUrls[env]; - const { status } = await fetch(url, { - redirect: "manual", - headers: { Cookie: `jwt=${intranetJwts[env]}` }, - }); - return { env, status }; + let cookie = defaultCookie; + + if (intranetJwts[env]) { + cookie += `; jwt=${intranetJwts[env]}`; + } + + try { + const { status } = await fetch(url, { + redirect: "manual", + headers: { Cookie: cookie }, + }); + return { env, status }; + } catch (err) { + console.error(`Error fetching ${url}`, err); + return { env, status: err.message }; + } }), ); @@ -143,7 +157,7 @@ app.post("/access", async function (req, res, next) { // Set the cookies on the response Object.entries(cookies).forEach(([name, value]) => { res.cookie(name, value, { - domain: cdnUrl.host, + domain: cdnUrl.hostname, secure: cdnUrl.protocol === "https:", sameSite: "lax", httpOnly: true, @@ -155,6 +169,9 @@ app.post("/access", async function (req, res, next) { res.clearCookie(name, { domain }); }); + // Clear the agency cookie from the CDN domain, it can cause a redirect loop. + res.clearCookie('dw_agency', { domain: cdnUrl.hostname }); + // Redirect to the CDN URL. res.redirect(`${cdnUrl.origin}/${getAgencyPath(env, agency)}/index.html`); } catch (err) { diff --git a/docker-compose.yml b/docker-compose.yml index 38ec71a..215b35f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,14 +7,14 @@ services: build: context: . target: dev + container_name: intranet-archive env_file: - .env - container_name: intranet-archive environment: ORDINAL_NUMBER: 0 VIRTUAL_HOST: app.archive.intranet.docker VIRTUAL_PORT: "2000" - S3_ENDPOINT: "http://minio:9000" + S3_ENDPOINT: "http://intranet-archive-minio:9010" volumes: - node_modules:/home/node/app/node_modules - ./conf/node:/home/node/app @@ -24,23 +24,29 @@ services: minio-init: # Wait for minio-init to complete before starting. 
condition: service_completed_successfully - # Requests to intranet.docker should go to host machine + # Requests to intranet.docker and cdn.intranet.docker should go to host machine extra_hosts: - "intranet.docker:host-gateway" + - "cdn.intranet.docker:host-gateway" minio: image: minio/minio + container_name: intranet-archive-minio ports: - - "9010:9000" - - "9011:9001" + - "9010:9010" + - "9011:9011" volumes: - minio_storage:/data environment: MINIO_ROOT_USER: ${AWS_ACCESS_KEY_ID} MINIO_ROOT_PASSWORD: ${AWS_SECRET_ACCESS_KEY} - command: server --console-address ":9001" /data + MINIO_ADDRESS: ':9010' + MINIO_CONSOLE_ADDRESS: ':9011' + VIRTUAL_HOST: minio.archive.intranet.docker + VIRTUAL_PORT: "9011" + command: server --console-address ":9011" /data healthcheck: - test: timeout 5s bash -c ':> /dev/tcp/127.0.0.1/9000' || exit 1 + test: timeout 5s bash -c ':> /dev/tcp/127.0.0.1/9010' || exit 1 start_period: 5s interval: 10s timeout: 5s @@ -48,12 +54,13 @@ services: minio-init: image: minio/mc + container_name: intranet-archive-minio-init depends_on: minio: condition: service_healthy entrypoint: | /bin/sh -c " - mc config host add intranet-archive http://minio:9000 ${AWS_ACCESS_KEY_ID} ${AWS_SECRET_ACCESS_KEY} + mc config host add intranet-archive http://intranet-archive-minio:9010 ${AWS_ACCESS_KEY_ID} ${AWS_SECRET_ACCESS_KEY} mc mb --ignore-existing intranet-archive/${S3_BUCKET_NAME} --region eu-west-2 mc anonymous set download intranet-archive/${S3_BUCKET_NAME}; exit 0 @@ -61,11 +68,14 @@ services: cdn: image: caddy:2-alpine + container_name: intranet-archive-cdn volumes: - ./conf/local.Caddyfile:/etc/caddy/Caddyfile environment: S3_BUCKET_NAME: ${S3_BUCKET_NAME} VIRTUAL_HOST: archive.intranet.docker - VIRTUAL_PORT: 2019 + VIRTUAL_PORT: 2029 depends_on: - minio + ports: + - "2029:2029" From df253e2376e98f496f0bcb28023c26b6326f39eb Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Thu, 6 Feb 2025 16:37:40 +0000 
Subject: [PATCH 6/9] Update .env.ci --- .env.ci | 1 + 1 file changed, 1 insertion(+) diff --git a/.env.ci b/.env.ci index 981cc83..821c55e 100644 --- a/.env.ci +++ b/.env.ci @@ -13,3 +13,4 @@ S3_BUCKET_NAME=test-bucket INTRANET_ARCHIVE_SHARED_SECRET=test-shared-secret INTRANET_JWT_DEV=test-jwt +INTRANET_JWT_PRODUCTION=test-jwt From 4527b77c543927ee24db269da6b2071b761bbdd1 Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Thu, 6 Feb 2025 16:44:30 +0000 Subject: [PATCH 7/9] Update .env.example --- .env.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env.example b/.env.example index 9f1b472..76437c0 100644 --- a/.env.example +++ b/.env.example @@ -45,5 +45,5 @@ AWS_CLOUDFRONT_PUBLIC_KEYS_OBJECT="" # 6️⃣ For the full test suite to run locally, you need to set the JWTs for the intranet. # Go to https://dev.intranet.justice.gov.uk wait 30s for a heartbeat request to complete, and copy the JWT cookie value. INTRANET_JWT_DEV="" -# 7️⃣ Repeat for production. 
+# 7️⃣ Repeat for production at https://intranet.justice.gov.uk INTRANET_JWT_PRODUCTION="" From 84b14617022907cb66fc8440f438bcf7584e6a97 Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Thu, 6 Feb 2025 17:48:52 +0000 Subject: [PATCH 8/9] Update cloudfront.js --- conf/node/controllers/cloudfront.js | 1 - 1 file changed, 1 deletion(-) diff --git a/conf/node/controllers/cloudfront.js b/conf/node/controllers/cloudfront.js index b37ecfe..1305f3d 100644 --- a/conf/node/controllers/cloudfront.js +++ b/conf/node/controllers/cloudfront.js @@ -2,7 +2,6 @@ import crypto from "node:crypto"; import { getSignedCookies } from "@aws-sdk/cloudfront-signer"; import { - isLocal, cloudFrontKeysObject as keysObject, cloudFrontPublicKey as publicKey, cloudFrontPrivateKey as privateKey, From 1650e43aba25d3587a5fd81002da7afe48f74be0 Mon Sep 17 00:00:00 2001 From: EarthlingDavey <15802017+EarthlingDavey@users.noreply.github.com> Date: Fri, 7 Feb 2025 14:37:05 +0000 Subject: [PATCH 9/9] Update middleware.js --- conf/node/middleware.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/node/middleware.js b/conf/node/middleware.js index 64a0789..220ea40 100644 --- a/conf/node/middleware.js +++ b/conf/node/middleware.js @@ -162,7 +162,8 @@ export const checkSignature = (req, _res, next) => { */ export const errorHandler = (err, _req, res, _next) => { - console.log(err); + // Log the error to the console - will be available in Kibana logs. + console.error(err); if (err.status === 400) { res