From 379654769bba578e16ae09b40b38dc80602bfc01 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Sun, 1 Sep 2024 22:56:45 +0800 Subject: [PATCH 1/9] fix: use nodejs 18's typescript definition --- package-lock.json | 45 +++++++++++++++++++++++++++++++++++---------- package.json | 1 + 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/package-lock.json b/package-lock.json index 5095a09d..8491f4a0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -59,6 +59,7 @@ "@babel/preset-env": "^7.16.11", "@babel/preset-typescript": "^7.24.1", "@google-cloud/storage": "^6.11.0", + "@types/node": "^18", "@typescript-eslint/eslint-plugin": "^5.56.0", "@typescript-eslint/parser": "^5.56.0", "apollo-server-testing": "^2.18.2", @@ -119,6 +120,11 @@ "apollo-pbts": "bin/pbts" } }, + "node_modules/@apollo/protobufjs/node_modules/@types/node": { + "version": "10.17.60", + "resolved": "https://registry.npmjs.org/@types/node/-/node-10.17.60.tgz", + "integrity": "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw==" + }, "node_modules/@apollographql/apollo-tools": { "version": "0.4.4", "license": "MIT", @@ -3046,10 +3052,6 @@ "node": ">=6" } }, - "node_modules/@grpc/grpc-js/node_modules/@types/node": { - "version": "17.0.29", - "license": "MIT" - }, "node_modules/@grpc/grpc-js/node_modules/ansi-regex": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", @@ -4699,8 +4701,12 @@ "integrity": "sha512-K0VQKziLUWkVKiRVrx4a40iPaxTUefQmjtkQofBkYRcoaaL/8rhwDWww9qWbrgicNOgnpIsMxyNIUM4+n6dUIA==" }, "node_modules/@types/node": { - "version": "10.12.18", - "license": "MIT" + "version": "18.19.48", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.48.tgz", + "integrity": "sha512-7WevbG4ekUcRQSZzOwxWgi5dZmTak7FaxXDoW7xVxPBmKx1rTzfmRLkeCgJzcbBnOV2dkhAPc8cCeT6agocpjg==", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@types/node-fetch": { "version": "2.5.5", @@ -16643,6 +16649,11 @@ "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.6.tgz", "integrity": "sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==" }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" + }, "node_modules/unicode-canonical-property-names-ecmascript": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/unicode-canonical-property-names-ecmascript/-/unicode-canonical-property-names-ecmascript-2.0.0.tgz", @@ -17257,6 +17268,13 @@ "@types/long": "^4.0.0", "@types/node": "^10.1.0", "long": "^4.0.0" + }, + "dependencies": { + "@types/node": { + "version": "10.17.60", + "resolved": "https://registry.npmjs.org/@types/node/-/node-10.17.60.tgz", + "integrity": "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw==" + } } }, "@apollographql/apollo-tools": { @@ -19342,9 +19360,6 @@ "yargs": "^17.7.2" } }, - "@types/node": { - "version": "17.0.29" - }, "ansi-regex": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", @@ -20658,7 +20673,12 @@ "integrity": "sha512-K0VQKziLUWkVKiRVrx4a40iPaxTUefQmjtkQofBkYRcoaaL/8rhwDWww9qWbrgicNOgnpIsMxyNIUM4+n6dUIA==" }, "@types/node": { - "version": "10.12.18" + "version": "18.19.48", + "resolved": 
"https://registry.npmjs.org/@types/node/-/node-18.19.48.tgz", + "integrity": "sha512-7WevbG4ekUcRQSZzOwxWgi5dZmTak7FaxXDoW7xVxPBmKx1rTzfmRLkeCgJzcbBnOV2dkhAPc8cCeT6agocpjg==", + "requires": { + "undici-types": "~5.26.4" + } }, "@types/node-fetch": { "version": "2.5.5", @@ -29071,6 +29091,11 @@ "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.6.tgz", "integrity": "sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==" }, + "undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" + }, "unicode-canonical-property-names-ecmascript": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/unicode-canonical-property-names-ecmascript/-/unicode-canonical-property-names-ecmascript-2.0.0.tgz", diff --git a/package.json b/package.json index a99bbf21..8fed135c 100644 --- a/package.json +++ b/package.json @@ -75,6 +75,7 @@ "@babel/preset-env": "^7.16.11", "@babel/preset-typescript": "^7.24.1", "@google-cloud/storage": "^6.11.0", + "@types/node": "^18", "@typescript-eslint/eslint-plugin": "^5.56.0", "@typescript-eslint/parser": "^5.56.0", "apollo-server-testing": "^2.18.2", From 325bc6105e4ae485e0e765f81abd15383256b866 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 00:08:52 +0800 Subject: [PATCH 2/9] feat(util): implement archiveUrlsFromText --- .env.sample | 6 ++ src/util/__tests__/archiveUrlsFromText.ts | 92 +++++++++++++++++++++++ src/util/archiveUrlsFromText.ts | 39 ++++++++++ 3 files changed, 137 insertions(+) create mode 100644 src/util/__tests__/archiveUrlsFromText.ts create mode 100644 src/util/archiveUrlsFromText.ts diff --git a/.env.sample b/.env.sample index a033133c..940ae9f8 100644 --- a/.env.sample +++ b/.env.sample @@ -126,3 +126,9 @@ LOG_REQUESTS= # It will create the topic, subscription and schema if not exists. 
 # ADMIN_PUBSUB_TOPIC=
+
+# Internet Archive S3-Like API key and secret from https://archive.org/account/s3.php
+# They are used to call Save Page Now 2 Public API
+#
+INTERNET_ARCHIVE_S3_ACCESS_KEY=
+INTERNET_ARCHIVE_S3_SECRET_KEY=
diff --git a/src/util/__tests__/archiveUrlsFromText.ts b/src/util/__tests__/archiveUrlsFromText.ts
new file mode 100644
index 00000000..fa2558b9
--- /dev/null
+++ b/src/util/__tests__/archiveUrlsFromText.ts
@@ -0,0 +1,92 @@
+import { jest, describe, beforeAll, afterAll, it, expect } from '@jest/globals';
+import archiveUrlsFromText from '../archiveUrlsFromText';
+
+describe('archiveUrlsFromText', () => {
+  let realEnvs: { [key: string]: string | undefined };
+  let mockedFetch: jest.Spied<typeof fetch>;
+  beforeAll(() => {
+    // Spy on and mock the global fetch function
+    mockedFetch = jest.spyOn(global, 'fetch');
+    mockedFetch.mockImplementation(async (url) => {
+      // Make Tyepscript happy
+      if (typeof url !== 'string')
+        throw new Error(
+          'Fetch with non-string URL is not implemented in unit test'
+        );
+
+      // Extract URL to archive from fetched URL
+      const params = new URL(url).searchParams;
+      const urlToArchive = params.get('url');
+
+      return {
+        json: async () => ({ job_id: '123', url: urlToArchive }),
+      } as Response;
+    });
+
+    realEnvs = {
+      INTERNET_ARCHIVE_S3_ACCESS_KEY:
+        process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY,
+      INTERNET_ARCHIVE_S3_SECRET_KEY:
+        process.env.INTERNET_ARCHIVE_S3_SECRET_KEY,
+    };
+
+    process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY = 'test-access-key';
+    process.env.INTERNET_ARCHIVE_S3_SECRET_KEY = 'test-secret';
+  });
+
+  afterAll(() => {
+    jest.restoreAllMocks();
+    process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY =
+      realEnvs.INTERNET_ARCHIVE_S3_ACCESS_KEY;
+    process.env.INTERNET_ARCHIVE_S3_SECRET_KEY =
+      realEnvs.INTERNET_ARCHIVE_S3_SECRET_KEY;
+  });
+
+  it('expect URL in text are archived', async () => {
+    const text =
+      'Please check https://example.com and https://example2.com?foo=bar&fbclid=123';
+    const results = await archiveUrlsFromText(text);
+
+    // Check if job_id is attached and fbclid is removed
+    //
+    expect(results).toMatchInlineSnapshot(`
+      Array [
+        Object {
+          "job_id": "123",
+          "url": "https://example.com/",
+        },
+        Object {
+          "job_id": "123",
+          "url": "https://example2.com/?foo=bar",
+        },
+      ]
+    `);
+
+    // Check if https://web.archive.org/save is called with expected params and headers
+    //
+    expect(mockedFetch.mock.calls).toMatchInlineSnapshot(`
+      Array [
+        Array [
+          "https://web.archive.org/save?url=https%3A%2F%2Fexample.com%2F&capture_screenshot=1&skip_first_archive=1&delay_wb_availability=1",
+          Object {
+            "headers": Object {
+              "Accept": "application/json",
+              "Authorization": "LOW test-access-key:test-secret",
+            },
+            "method": "POST",
+          },
+        ],
+        Array [
+          "https://web.archive.org/save?url=https%3A%2F%2Fexample2.com%2F%3Ffoo%3Dbar&capture_screenshot=1&skip_first_archive=1&delay_wb_availability=1",
+          Object {
+            "headers": Object {
+              "Accept": "application/json",
+              "Authorization": "LOW test-access-key:test-secret",
+            },
+            "method": "POST",
+          },
+        ],
+      ]
+    `);
+  });
+});
diff --git a/src/util/archiveUrlsFromText.ts b/src/util/archiveUrlsFromText.ts
new file mode 100644
index 00000000..b72703ac
--- /dev/null
+++ b/src/util/archiveUrlsFromText.ts
@@ -0,0 +1,39 @@
+/** Extract URLs from text and send to Internet Archive Wayback Machine */
+
+import urlRegex from 'url-regex';
+import { removeFBCLIDIfExist } from './scrapUrls';
+
+export default async function archiveUrlsFromText(text: string) {
+  const originalUrls = text.match(urlRegex()) || [];
+ if (originalUrls.length === 0) return []; + + // Normalize URLs before sending to cache or scrapper to increase cache hit + // + const normalizedUrls = removeFBCLIDIfExist(originalUrls); + + const results = await Promise.all( + normalizedUrls.map(async (url) => { + const params = new URLSearchParams({ + url, + capture_screenshot: '1', + skip_first_archive: '1', + delay_wb_availability: '1', // Help reduce load on IA servers + }); + return ( + await fetch(`https://web.archive.org/save?${params.toString()}`, { + method: 'POST', + headers: { + Accept: 'application/json', + Authorization: `LOW ${process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY}:${process.env.INTERNET_ARCHIVE_S3_SECRET_KEY}`, + }, + }) + ).json(); + }) + ); + + console.info(`[archiveUrlsFromText] Archiving ${results.length} URLs`); + results.forEach((result) => + console.info(`[archiveUrlsFromText] [${result.job_id}]: ${result.url}`) + ); + return results; +} From 84a55aace27f15cf3a7eff6c12ee220dcf2af6a3 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 00:29:40 +0800 Subject: [PATCH 3/9] feat(CreateReply): archive when creating replies --- src/graphql/mutations/CreateReply.js | 8 ++++++++ src/graphql/mutations/__tests__/CreateReply.js | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/graphql/mutations/CreateReply.js b/src/graphql/mutations/CreateReply.js index 79436c19..081ec9cd 100644 --- a/src/graphql/mutations/CreateReply.js +++ b/src/graphql/mutations/CreateReply.js @@ -4,6 +4,7 @@ import { assertUser } from 'util/user'; import client from 'util/client'; import scrapUrls from 'util/scrapUrls'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; import ReplyTypeEnum from 'graphql/models/ReplyTypeEnum'; import MutationResult from 'graphql/models/MutationResult'; @@ -90,6 +91,13 @@ export default { return _id; }); + // Archive both text and reference. + // No need to wait for the result. 
+ // + newReplyPromise.then(() => + Promise.all([archiveUrlsFromText(text), archiveUrlsFromText(reference)]) + ); + const scrapPromise = scrapUrls(`${text} ${reference}`, { cacheLoader: loaders.urlLoader, client, diff --git a/src/graphql/mutations/__tests__/CreateReply.js b/src/graphql/mutations/__tests__/CreateReply.js index 1f80f744..6514bf80 100644 --- a/src/graphql/mutations/__tests__/CreateReply.js +++ b/src/graphql/mutations/__tests__/CreateReply.js @@ -1,4 +1,5 @@ jest.mock('util/grpc'); +jest.mock('util/archiveUrlsFromText', () => jest.fn(() => [])); import gql from 'util/GraphQL'; import { loadFixtures, unloadFixtures, resetFrom } from 'util/fixtures'; @@ -7,9 +8,13 @@ import MockDate from 'mockdate'; import fixtures from '../__fixtures__/CreateReply'; import resolveUrl from 'util/grpc'; import delayForMs from 'util/delayForMs'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; describe('CreateReply', () => { beforeAll(() => loadFixtures(fixtures)); + beforeEach(() => { + archiveUrlsFromText.mockClear(); + }); it('creates replies and associates itself with specified article', async () => { MockDate.set(1485593157011); @@ -66,6 +71,19 @@ describe('CreateReply', () => { }); expect(article._source.articleReplies[0].replyId).toBe(replyId); + // Make sure archiveUrlsFromText is called with text and reference + // + expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(` + Array [ + Array [ + "FOO FOO", + ], + Array [ + "http://shouldscrap.com/", + ], + ] + `); + // Wait until urls are resolved await delayForMs(1000); MockDate.reset(); From 00dd2bde6c1aeeec34c3e113874b931a668f5274 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 00:50:04 +0800 Subject: [PATCH 4/9] test(util): add no URL test case --- src/util/__tests__/archiveUrlsFromText.ts | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/util/__tests__/archiveUrlsFromText.ts b/src/util/__tests__/archiveUrlsFromText.ts index fa2558b9..0f0cf0f2 100644 --- a/src/util/__tests__/archiveUrlsFromText.ts +++ b/src/util/__tests__/archiveUrlsFromText.ts @@ -1,4 +1,12 @@ -import { jest, describe, beforeAll, afterAll, it, expect } from '@jest/globals'; +import { + jest, + describe, + beforeAll, + beforeEach, + afterAll, + it, + expect, +} from '@jest/globals'; import archiveUrlsFromText from '../archiveUrlsFromText'; describe('archiveUrlsFromText', () => { @@ -34,6 +42,10 @@ describe('archiveUrlsFromText', () => { process.env.INTERNET_ARCHIVE_S3_SECRET_KEY = 'test-secret'; }); + beforeEach(() => { + mockedFetch.mockClear(); + }); + afterAll(() => { jest.restoreAllMocks(); process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY = @@ -89,4 +101,11 @@ describe('archiveUrlsFromText', () => { ] `); }); + + it('do nothing if no URL in text', async () => { + const text = 'No URL here'; + const results = await archiveUrlsFromText(text); + expect(results).toEqual([]); + expect(mockedFetch).not.toBeCalled(); + }); }); From 3b15c5e0fac494f7b0895d848ecf315ed9dac234 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 01:04:54 +0800 Subject: [PATCH 5/9] feat(CreateMediaArticle): archive OCR text --- src/graphql/mutations/CreateMediaArticle.js | 5 +++++ .../mutations/__tests__/CreateMediaArticle.js | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/graphql/mutations/CreateMediaArticle.js b/src/graphql/mutations/CreateMediaArticle.js index 426e91ee..013fbdb0 100644 --- a/src/graphql/mutations/CreateMediaArticle.js +++ b/src/graphql/mutations/CreateMediaArticle.js @@ -17,6 +17,7 @@ 
import { ArticleReferenceInput } from 'graphql/models/ArticleReference'; import MutationResult from 'graphql/models/MutationResult'; import { createOrUpdateReplyRequest } from './CreateOrUpdateReplyRequest'; import ArticleTypeEnum from 'graphql/models/ArticleTypeEnum'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; const METADATA = { cacheControl: 'public, max-age=31536000, immutable', @@ -273,6 +274,10 @@ export default { if (!aiResponse) { throw new Error('AI transcript not found'); } + + // Archive URLs in transcript; don't wait for it + archiveUrlsFromText(aiResponse.text); + return writeAITranscript(articleId, aiResponse.text); }) .then(() => { diff --git a/src/graphql/mutations/__tests__/CreateMediaArticle.js b/src/graphql/mutations/__tests__/CreateMediaArticle.js index c8dd2d58..b7f04b1c 100644 --- a/src/graphql/mutations/__tests__/CreateMediaArticle.js +++ b/src/graphql/mutations/__tests__/CreateMediaArticle.js @@ -7,13 +7,16 @@ import client from 'util/client'; import fixtures from '../__fixtures__/CreateMediaArticle'; import { getReplyRequestId } from '../CreateOrUpdateReplyRequest'; import mediaManager from 'util/mediaManager'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; jest.mock('util/mediaManager'); +jest.mock('util/archiveUrlsFromText', () => jest.fn(() => [])); describe('creation', () => { beforeAll(() => loadFixtures(fixtures)); beforeEach(() => { mediaManager.insert.mockClear(); + archiveUrlsFromText.mockClear(); }); afterAll(() => unloadFixtures(fixtures)); @@ -68,6 +71,15 @@ describe('creation', () => { ] `); + // Expect archiveUrlsFromText is called with OCR result + expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(` + Array [ + Array [ + "OCR result of output image", + ], + ] + `); + const { body: { _source: article }, } = await client.get({ From db73355d18039082785d10714aef769fb23d6650 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 01:10:08 +0800 Subject: [PATCH 6/9] feat(CreateArticle): call archive --- src/graphql/mutations/CreateArticle.js | 10 +++++++++- .../mutations/__tests__/CreateArticle.js | 17 ++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/graphql/mutations/CreateArticle.js b/src/graphql/mutations/CreateArticle.js index 14ec76f7..93d31ab2 100644 --- a/src/graphql/mutations/CreateArticle.js +++ b/src/graphql/mutations/CreateArticle.js @@ -8,6 +8,7 @@ import scrapUrls from 'util/scrapUrls'; import { ArticleReferenceInput } from 'graphql/models/ArticleReference'; import MutationResult from 'graphql/models/MutationResult'; import { createOrUpdateReplyRequest } from './CreateOrUpdateReplyRequest'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; /* Instantiate hash function */ const xxhash64 = h64(); @@ -45,7 +46,9 @@ async function createNewArticle({ text, reference: originalReference, user }) { appId: user.appId, }; - await client.update({ + const { + body: { result }, + } = await client.update({ index: 'articles', type: 'doc', id: articleId, @@ -85,6 +88,11 @@ async function createNewArticle({ text, reference: originalReference, user }) { refresh: 'true', // Make sure the data is indexed when we create ReplyRequest }); + if (result === 'created') { + // Archive URLs in article and don't wait for the result + archiveUrlsFromText(text); + } + return articleId; } diff --git a/src/graphql/mutations/__tests__/CreateArticle.js b/src/graphql/mutations/__tests__/CreateArticle.js index 86f8f8ca..3e8728f2 100644 --- a/src/graphql/mutations/__tests__/CreateArticle.js 
+++ b/src/graphql/mutations/__tests__/CreateArticle.js @@ -5,9 +5,15 @@ import MockDate from 'mockdate'; import fixtures, { fixture1Text } from '../__fixtures__/CreateArticle'; import { getReplyRequestId } from '../CreateOrUpdateReplyRequest'; import { getArticleId } from 'graphql/mutations/CreateArticle'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; + +jest.mock('util/archiveUrlsFromText', () => jest.fn(() => [])); describe('creation', () => { - beforeEach(() => loadFixtures(fixtures)); + beforeEach(async () => { + archiveUrlsFromText.mockClear(); + await loadFixtures(fixtures); + }); afterEach(() => unloadFixtures(fixtures)); it('creates articles and a reply request and fills in URLs', async () => { @@ -47,6 +53,15 @@ describe('creation', () => { expect(article.replyRequestCount).toBe(1); expect(article).toMatchSnapshot(); + // Make sure archiveUrlsFromText is called with article text + expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(` + Array [ + Array [ + "FOO FOO http://foo.com/article/1", + ], + ] + `); + const replyRequestId = getReplyRequestId({ articleId: data.CreateArticle.id, userId, From 93fa2c411e80d9fa8ef5f830c60644f2ae3562e1 Mon Sep 17 00:00:00 2001 From: Johnson Liang Date: Mon, 2 Sep 2024 14:02:38 +0800 Subject: [PATCH 7/9] fix: wayback machine API actually takes form data instead of url params --- src/util/archiveUrlsFromText.ts | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/util/archiveUrlsFromText.ts b/src/util/archiveUrlsFromText.ts index b72703ac..83a29441 100644 --- a/src/util/archiveUrlsFromText.ts +++ b/src/util/archiveUrlsFromText.ts @@ -13,19 +13,20 @@ export default async function archiveUrlsFromText(text: string) { const results = await Promise.all( normalizedUrls.map(async (url) => { - const params = new URLSearchParams({ - url, - capture_screenshot: '1', - skip_first_archive: '1', - delay_wb_availability: '1', // Help reduce load on IA servers - }); + const formData = new FormData(); + formData.append('url', url); + formData.append('capture_screenshot', '1'); + formData.append('skip_first_archive', '1'); + formData.append('delay_wb_availability', '1'); // Help reduce load on IA servers + return ( - await fetch(`https://web.archive.org/save?${params.toString()}`, { + await fetch('https://web.archive.org/save', { method: 'POST', headers: { Accept: 'application/json', Authorization: `LOW ${process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY}:${process.env.INTERNET_ARCHIVE_S3_SECRET_KEY}`, }, + body: formData, }) ).json(); }) From fca0b32285fa664d21f5212de16a10337d8102b9 Mon Sep 17 00:00:00 2001 From: Johnson Liang Date: Mon, 2 Sep 2024 14:19:59 +0800 Subject: [PATCH 8/9] fix(util): mock fetch should get url from req body --- src/util/__tests__/archiveUrlsFromText.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/util/__tests__/archiveUrlsFromText.ts b/src/util/__tests__/archiveUrlsFromText.ts index 0f0cf0f2..7378d33e 100644 --- a/src/util/__tests__/archiveUrlsFromText.ts +++ b/src/util/__tests__/archiveUrlsFromText.ts @@ -15,7 +15,7 @@ describe('archiveUrlsFromText', () => { beforeAll(() => { // Spy on and mock the global fetch function mockedFetch = jest.spyOn(global, 'fetch'); - mockedFetch.mockImplementation(async (url) => { + mockedFetch.mockImplementation(async (url, reqInit) => { // Make Tyepscript happy if (typeof url !== 'string') throw new Error( @@ -23,8 +23,7 @@ describe('archiveUrlsFromText', () => { ); // Extract URL to archive from fetched URL - const params = 
new URL(url).searchParams; - const urlToArchive = params.get('url'); + const urlToArchive = (reqInit?.body as FormData).get('url'); return { json: async () => ({ job_id: '123', url: urlToArchive }), From fbd6d80b7d3f753c7883efc3acfb49e879fb0c94 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 14:57:43 +0800 Subject: [PATCH 9/9] fix: IA API actually requires form body instead of query strings. --- src/util/__tests__/archiveUrlsFromText.ts | 44 +++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/src/util/__tests__/archiveUrlsFromText.ts b/src/util/__tests__/archiveUrlsFromText.ts index 7378d33e..9c8b7942 100644 --- a/src/util/__tests__/archiveUrlsFromText.ts +++ b/src/util/__tests__/archiveUrlsFromText.ts @@ -78,8 +78,28 @@ describe('archiveUrlsFromText', () => { expect(mockedFetch.mock.calls).toMatchInlineSnapshot(` Array [ Array [ - "https://web.archive.org/save?url=https%3A%2F%2Fexample.com%2F&capture_screenshot=1&skip_first_archive=1&delay_wb_availability=1", + "https://web.archive.org/save", Object { + "body": FormData { + Symbol(state): Array [ + Object { + "name": "url", + "value": "https://example.com/", + }, + Object { + "name": "capture_screenshot", + "value": "1", + }, + Object { + "name": "skip_first_archive", + "value": "1", + }, + Object { + "name": "delay_wb_availability", + "value": "1", + }, + ], + }, "headers": Object { "Accept": "application/json", "Authorization": "LOW test-access-key:test-secret", @@ -88,8 +108,28 @@ describe('archiveUrlsFromText', () => { }, ], Array [ - "https://web.archive.org/save?url=https%3A%2F%2Fexample2.com%2F%3Ffoo%3Dbar&capture_screenshot=1&skip_first_archive=1&delay_wb_availability=1", + "https://web.archive.org/save", Object { + "body": FormData { + Symbol(state): Array [ + Object { + "name": "url", + "value": "https://example2.com/?foo=bar", + }, + Object { + "name": "capture_screenshot", + "value": "1", + }, + Object { + "name": "skip_first_archive", + "value": "1", + }, + Object { + "name": "delay_wb_availability", + "value": "1", + }, + ], + }, "headers": Object { "Accept": "application/json", "Authorization": "LOW test-access-key:test-secret",