From 379654769bba578e16ae09b40b38dc80602bfc01 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Sun, 1 Sep 2024 22:56:45 +0800 Subject: [PATCH 1/9] fix: use nodejs 18's typescript definition --- package-lock.json | 45 +++++++++++++++++++++++++++++++++++---------- package.json | 1 + 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/package-lock.json b/package-lock.json index 5095a09d..8491f4a0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -59,6 +59,7 @@ "@babel/preset-env": "^7.16.11", "@babel/preset-typescript": "^7.24.1", "@google-cloud/storage": "^6.11.0", + "@types/node": "^18", "@typescript-eslint/eslint-plugin": "^5.56.0", "@typescript-eslint/parser": "^5.56.0", "apollo-server-testing": "^2.18.2", @@ -119,6 +120,11 @@ "apollo-pbts": "bin/pbts" } }, + "node_modules/@apollo/protobufjs/node_modules/@types/node": { + "version": "10.17.60", + "resolved": "https://registry.npmjs.org/@types/node/-/node-10.17.60.tgz", + "integrity": "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw==" + }, "node_modules/@apollographql/apollo-tools": { "version": "0.4.4", "license": "MIT", @@ -3046,10 +3052,6 @@ "node": ">=6" } }, - "node_modules/@grpc/grpc-js/node_modules/@types/node": { - "version": "17.0.29", - "license": "MIT" - }, "node_modules/@grpc/grpc-js/node_modules/ansi-regex": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", @@ -4699,8 +4701,12 @@ "integrity": "sha512-K0VQKziLUWkVKiRVrx4a40iPaxTUefQmjtkQofBkYRcoaaL/8rhwDWww9qWbrgicNOgnpIsMxyNIUM4+n6dUIA==" }, "node_modules/@types/node": { - "version": "10.12.18", - "license": "MIT" + "version": "18.19.48", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.48.tgz", + "integrity": "sha512-7WevbG4ekUcRQSZzOwxWgi5dZmTak7FaxXDoW7xVxPBmKx1rTzfmRLkeCgJzcbBnOV2dkhAPc8cCeT6agocpjg==", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@types/node-fetch": { "version": "2.5.5", @@ -16643,6 +16649,11 @@ "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.6.tgz", "integrity": "sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==" }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" + }, "node_modules/unicode-canonical-property-names-ecmascript": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/unicode-canonical-property-names-ecmascript/-/unicode-canonical-property-names-ecmascript-2.0.0.tgz", @@ -17257,6 +17268,13 @@ "@types/long": "^4.0.0", "@types/node": "^10.1.0", "long": "^4.0.0" + }, + "dependencies": { + "@types/node": { + "version": "10.17.60", + "resolved": "https://registry.npmjs.org/@types/node/-/node-10.17.60.tgz", + "integrity": "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw==" + } } }, "@apollographql/apollo-tools": { @@ -19342,9 +19360,6 @@ "yargs": "^17.7.2" } }, - "@types/node": { - "version": "17.0.29" - }, "ansi-regex": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", @@ -20658,7 +20673,12 @@ "integrity": "sha512-K0VQKziLUWkVKiRVrx4a40iPaxTUefQmjtkQofBkYRcoaaL/8rhwDWww9qWbrgicNOgnpIsMxyNIUM4+n6dUIA==" }, "@types/node": { - "version": "10.12.18" + "version": "18.19.48", + "resolved": 
"https://registry.npmjs.org/@types/node/-/node-18.19.48.tgz", + "integrity": "sha512-7WevbG4ekUcRQSZzOwxWgi5dZmTak7FaxXDoW7xVxPBmKx1rTzfmRLkeCgJzcbBnOV2dkhAPc8cCeT6agocpjg==", + "requires": { + "undici-types": "~5.26.4" + } }, "@types/node-fetch": { "version": "2.5.5", @@ -29071,6 +29091,11 @@ "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.6.tgz", "integrity": "sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==" }, + "undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" + }, "unicode-canonical-property-names-ecmascript": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/unicode-canonical-property-names-ecmascript/-/unicode-canonical-property-names-ecmascript-2.0.0.tgz", diff --git a/package.json b/package.json index a99bbf21..8fed135c 100644 --- a/package.json +++ b/package.json @@ -75,6 +75,7 @@ "@babel/preset-env": "^7.16.11", "@babel/preset-typescript": "^7.24.1", "@google-cloud/storage": "^6.11.0", + "@types/node": "^18", "@typescript-eslint/eslint-plugin": "^5.56.0", "@typescript-eslint/parser": "^5.56.0", "apollo-server-testing": "^2.18.2", From 325bc6105e4ae485e0e765f81abd15383256b866 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 00:08:52 +0800 Subject: [PATCH 2/9] feat(util): implement archiveUrlsFromText --- .env.sample | 6 ++ src/util/__tests__/archiveUrlsFromText.ts | 92 +++++++++++++++++++++++ src/util/archiveUrlsFromText.ts | 39 ++++++++++ 3 files changed, 137 insertions(+) create mode 100644 src/util/__tests__/archiveUrlsFromText.ts create mode 100644 src/util/archiveUrlsFromText.ts diff --git a/.env.sample b/.env.sample index a033133c..940ae9f8 100644 --- a/.env.sample +++ b/.env.sample @@ -126,3 +126,9 @@ LOG_REQUESTS= # It will create the topic, subscription and schema if not exists. 
 # ADMIN_PUBSUB_TOPIC=
+
+# Internet Archive S3-Like API key and secret from https://archive.org/account/s3.php
+# They are used to call Save Page Now 2 Public API
+#
+INTERNET_ARCHIVE_S3_ACCESS_KEY=
+INTERNET_ARCHIVE_S3_SECRET_KEY=
diff --git a/src/util/__tests__/archiveUrlsFromText.ts b/src/util/__tests__/archiveUrlsFromText.ts
new file mode 100644
index 00000000..fa2558b9
--- /dev/null
+++ b/src/util/__tests__/archiveUrlsFromText.ts
@@ -0,0 +1,92 @@
+import { jest, describe, beforeAll, afterAll, it, expect } from '@jest/globals';
+import archiveUrlsFromText from '../archiveUrlsFromText';
+
+describe('archiveUrlsFromText', () => {
+  let realEnvs: { [key: string]: string | undefined };
+  let mockedFetch: jest.Spied<typeof fetch>;
+  beforeAll(() => {
+    // Spy on and mock the global fetch function
+    mockedFetch = jest.spyOn(global, 'fetch');
+    mockedFetch.mockImplementation(async (url) => {
+      // Make Tyepscript happy
+      if (typeof url !== 'string')
+        throw new Error(
+          'Fetch with non-string URL is not implemented in unit test'
+        );
+
+      // Extract URL to archive from fetched URL
+      const params = new URL(url).searchParams;
+      const urlToArchive = params.get('url');
+
+      return {
+        json: async () => ({ job_id: '123', url: urlToArchive }),
+      } as Response;
+    });
+
+    realEnvs = {
+      INTERNET_ARCHIVE_S3_ACCESS_KEY:
+        process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY,
+      INTERNET_ARCHIVE_S3_SECRET_KEY:
+        process.env.INTERNET_ARCHIVE_S3_SECRET_KEY,
+    };
+
+    process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY = 'test-access-key';
+    process.env.INTERNET_ARCHIVE_S3_SECRET_KEY = 'test-secret';
+  });
+
+  afterAll(() => {
+    jest.restoreAllMocks();
+    process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY =
+      realEnvs.INTERNET_ARCHIVE_S3_ACCESS_KEY;
+    process.env.INTERNET_ARCHIVE_S3_SECRET_KEY =
+      realEnvs.INTERNET_ARCHIVE_S3_SECRET_KEY;
+  });
+
+  it('expect URL in text are archived', async () => {
+    const text =
+      'Please check https://example.com and https://example2.com?foo=bar&fbclid=123';
+    const results = await archiveUrlsFromText(text);
+
+    // Check if job_id is attached and fbclid is removed
+    //
+    expect(results).toMatchInlineSnapshot(`
+      Array [
+        Object {
+          "job_id": "123",
+          "url": "https://example.com/",
+        },
+        Object {
+          "job_id": "123",
+          "url": "https://example2.com/?foo=bar",
+        },
+      ]
+    `);
+
+    // Check if https://web.archive.org/save is called with expected params and headers
+    //
+    expect(mockedFetch.mock.calls).toMatchInlineSnapshot(`
+      Array [
+        Array [
+          "https://web.archive.org/save?url=https%3A%2F%2Fexample.com%2F&capture_screenshot=1&skip_first_archive=1&delay_wb_availability=1",
+          Object {
+            "headers": Object {
+              "Accept": "application/json",
+              "Authorization": "LOW test-access-key:test-secret",
+            },
+            "method": "POST",
+          },
+        ],
+        Array [
+          "https://web.archive.org/save?url=https%3A%2F%2Fexample2.com%2F%3Ffoo%3Dbar&capture_screenshot=1&skip_first_archive=1&delay_wb_availability=1",
+          Object {
+            "headers": Object {
+              "Accept": "application/json",
+              "Authorization": "LOW test-access-key:test-secret",
+            },
+            "method": "POST",
+          },
+        ],
+      ]
+    `);
+  });
+});
diff --git a/src/util/archiveUrlsFromText.ts b/src/util/archiveUrlsFromText.ts
new file mode 100644
index 00000000..b72703ac
--- /dev/null
+++ b/src/util/archiveUrlsFromText.ts
@@ -0,0 +1,39 @@
+/** Extract URLs from text and send to Internet Archive Wayback Machine */
+
+import urlRegex from 'url-regex';
+import { removeFBCLIDIfExist } from './scrapUrls';
+
+export default async function archiveUrlsFromText(text: string) {
+  const originalUrls = text.match(urlRegex()) || [];
+ if (originalUrls.length === 0) return []; + + // Normalize URLs before sending to cache or scrapper to increase cache hit + // + const normalizedUrls = removeFBCLIDIfExist(originalUrls); + + const results = await Promise.all( + normalizedUrls.map(async (url) => { + const params = new URLSearchParams({ + url, + capture_screenshot: '1', + skip_first_archive: '1', + delay_wb_availability: '1', // Help reduce load on IA servers + }); + return ( + await fetch(`https://web.archive.org/save?${params.toString()}`, { + method: 'POST', + headers: { + Accept: 'application/json', + Authorization: `LOW ${process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY}:${process.env.INTERNET_ARCHIVE_S3_SECRET_KEY}`, + }, + }) + ).json(); + }) + ); + + console.info(`[archiveUrlsFromText] Archiving ${results.length} URLs`); + results.forEach((result) => + console.info(`[archiveUrlsFromText] [${result.job_id}]: ${result.url}`) + ); + return results; +} From 84a55aace27f15cf3a7eff6c12ee220dcf2af6a3 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 00:29:40 +0800 Subject: [PATCH 3/9] feat(CreateReply): archive when creating replies --- src/graphql/mutations/CreateReply.js | 8 ++++++++ src/graphql/mutations/__tests__/CreateReply.js | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/graphql/mutations/CreateReply.js b/src/graphql/mutations/CreateReply.js index 79436c19..081ec9cd 100644 --- a/src/graphql/mutations/CreateReply.js +++ b/src/graphql/mutations/CreateReply.js @@ -4,6 +4,7 @@ import { assertUser } from 'util/user'; import client from 'util/client'; import scrapUrls from 'util/scrapUrls'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; import ReplyTypeEnum from 'graphql/models/ReplyTypeEnum'; import MutationResult from 'graphql/models/MutationResult'; @@ -90,6 +91,13 @@ export default { return _id; }); + // Archive both text and reference. + // No need to wait for the result. 
+ // + newReplyPromise.then(() => + Promise.all([archiveUrlsFromText(text), archiveUrlsFromText(reference)]) + ); + const scrapPromise = scrapUrls(`${text} ${reference}`, { cacheLoader: loaders.urlLoader, client, diff --git a/src/graphql/mutations/__tests__/CreateReply.js b/src/graphql/mutations/__tests__/CreateReply.js index 1f80f744..6514bf80 100644 --- a/src/graphql/mutations/__tests__/CreateReply.js +++ b/src/graphql/mutations/__tests__/CreateReply.js @@ -1,4 +1,5 @@ jest.mock('util/grpc'); +jest.mock('util/archiveUrlsFromText', () => jest.fn(() => [])); import gql from 'util/GraphQL'; import { loadFixtures, unloadFixtures, resetFrom } from 'util/fixtures'; @@ -7,9 +8,13 @@ import MockDate from 'mockdate'; import fixtures from '../__fixtures__/CreateReply'; import resolveUrl from 'util/grpc'; import delayForMs from 'util/delayForMs'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; describe('CreateReply', () => { beforeAll(() => loadFixtures(fixtures)); + beforeEach(() => { + archiveUrlsFromText.mockClear(); + }); it('creates replies and associates itself with specified article', async () => { MockDate.set(1485593157011); @@ -66,6 +71,19 @@ describe('CreateReply', () => { }); expect(article._source.articleReplies[0].replyId).toBe(replyId); + // Make sure archiveUrlsFromText is called with text and reference + // + expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(` + Array [ + Array [ + "FOO FOO", + ], + Array [ + "http://shouldscrap.com/", + ], + ] + `); + // Wait until urls are resolved await delayForMs(1000); MockDate.reset(); From 00dd2bde6c1aeeec34c3e113874b931a668f5274 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 00:50:04 +0800 Subject: [PATCH 4/9] test(util): add no URL test case --- src/util/__tests__/archiveUrlsFromText.ts | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/util/__tests__/archiveUrlsFromText.ts b/src/util/__tests__/archiveUrlsFromText.ts index fa2558b9..0f0cf0f2 100644 --- a/src/util/__tests__/archiveUrlsFromText.ts +++ b/src/util/__tests__/archiveUrlsFromText.ts @@ -1,4 +1,12 @@ -import { jest, describe, beforeAll, afterAll, it, expect } from '@jest/globals'; +import { + jest, + describe, + beforeAll, + beforeEach, + afterAll, + it, + expect, +} from '@jest/globals'; import archiveUrlsFromText from '../archiveUrlsFromText'; describe('archiveUrlsFromText', () => { @@ -34,6 +42,10 @@ describe('archiveUrlsFromText', () => { process.env.INTERNET_ARCHIVE_S3_SECRET_KEY = 'test-secret'; }); + beforeEach(() => { + mockedFetch.mockClear(); + }); + afterAll(() => { jest.restoreAllMocks(); process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY = @@ -89,4 +101,11 @@ describe('archiveUrlsFromText', () => { ] `); }); + + it('do nothing if no URL in text', async () => { + const text = 'No URL here'; + const results = await archiveUrlsFromText(text); + expect(results).toEqual([]); + expect(mockedFetch).not.toBeCalled(); + }); }); From 3b15c5e0fac494f7b0895d848ecf315ed9dac234 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 01:04:54 +0800 Subject: [PATCH 5/9] feat(CreateMediaArticle): archive OCR text --- src/graphql/mutations/CreateMediaArticle.js | 5 +++++ .../mutations/__tests__/CreateMediaArticle.js | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/graphql/mutations/CreateMediaArticle.js b/src/graphql/mutations/CreateMediaArticle.js index 426e91ee..013fbdb0 100644 --- a/src/graphql/mutations/CreateMediaArticle.js +++ b/src/graphql/mutations/CreateMediaArticle.js @@ -17,6 +17,7 @@ 
import { ArticleReferenceInput } from 'graphql/models/ArticleReference'; import MutationResult from 'graphql/models/MutationResult'; import { createOrUpdateReplyRequest } from './CreateOrUpdateReplyRequest'; import ArticleTypeEnum from 'graphql/models/ArticleTypeEnum'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; const METADATA = { cacheControl: 'public, max-age=31536000, immutable', @@ -273,6 +274,10 @@ export default { if (!aiResponse) { throw new Error('AI transcript not found'); } + + // Archive URLs in transcript; don't wait for it + archiveUrlsFromText(aiResponse.text); + return writeAITranscript(articleId, aiResponse.text); }) .then(() => { diff --git a/src/graphql/mutations/__tests__/CreateMediaArticle.js b/src/graphql/mutations/__tests__/CreateMediaArticle.js index c8dd2d58..b7f04b1c 100644 --- a/src/graphql/mutations/__tests__/CreateMediaArticle.js +++ b/src/graphql/mutations/__tests__/CreateMediaArticle.js @@ -7,13 +7,16 @@ import client from 'util/client'; import fixtures from '../__fixtures__/CreateMediaArticle'; import { getReplyRequestId } from '../CreateOrUpdateReplyRequest'; import mediaManager from 'util/mediaManager'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; jest.mock('util/mediaManager'); +jest.mock('util/archiveUrlsFromText', () => jest.fn(() => [])); describe('creation', () => { beforeAll(() => loadFixtures(fixtures)); beforeEach(() => { mediaManager.insert.mockClear(); + archiveUrlsFromText.mockClear(); }); afterAll(() => unloadFixtures(fixtures)); @@ -68,6 +71,15 @@ describe('creation', () => { ] `); + // Expect archiveUrlsFromText is called with OCR result + expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(` + Array [ + Array [ + "OCR result of output image", + ], + ] + `); + const { body: { _source: article }, } = await client.get({ From db73355d18039082785d10714aef769fb23d6650 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 01:10:08 +0800 Subject: [PATCH 6/9] feat(CreateArticle): call archive --- src/graphql/mutations/CreateArticle.js | 10 +++++++++- .../mutations/__tests__/CreateArticle.js | 17 ++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/graphql/mutations/CreateArticle.js b/src/graphql/mutations/CreateArticle.js index 14ec76f7..93d31ab2 100644 --- a/src/graphql/mutations/CreateArticle.js +++ b/src/graphql/mutations/CreateArticle.js @@ -8,6 +8,7 @@ import scrapUrls from 'util/scrapUrls'; import { ArticleReferenceInput } from 'graphql/models/ArticleReference'; import MutationResult from 'graphql/models/MutationResult'; import { createOrUpdateReplyRequest } from './CreateOrUpdateReplyRequest'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; /* Instantiate hash function */ const xxhash64 = h64(); @@ -45,7 +46,9 @@ async function createNewArticle({ text, reference: originalReference, user }) { appId: user.appId, }; - await client.update({ + const { + body: { result }, + } = await client.update({ index: 'articles', type: 'doc', id: articleId, @@ -85,6 +88,11 @@ async function createNewArticle({ text, reference: originalReference, user }) { refresh: 'true', // Make sure the data is indexed when we create ReplyRequest }); + if (result === 'created') { + // Archive URLs in article and don't wait for the result + archiveUrlsFromText(text); + } + return articleId; } diff --git a/src/graphql/mutations/__tests__/CreateArticle.js b/src/graphql/mutations/__tests__/CreateArticle.js index 86f8f8ca..3e8728f2 100644 --- a/src/graphql/mutations/__tests__/CreateArticle.js 
+++ b/src/graphql/mutations/__tests__/CreateArticle.js @@ -5,9 +5,15 @@ import MockDate from 'mockdate'; import fixtures, { fixture1Text } from '../__fixtures__/CreateArticle'; import { getReplyRequestId } from '../CreateOrUpdateReplyRequest'; import { getArticleId } from 'graphql/mutations/CreateArticle'; +import archiveUrlsFromText from 'util/archiveUrlsFromText'; + +jest.mock('util/archiveUrlsFromText', () => jest.fn(() => [])); describe('creation', () => { - beforeEach(() => loadFixtures(fixtures)); + beforeEach(async () => { + archiveUrlsFromText.mockClear(); + await loadFixtures(fixtures); + }); afterEach(() => unloadFixtures(fixtures)); it('creates articles and a reply request and fills in URLs', async () => { @@ -47,6 +53,15 @@ describe('creation', () => { expect(article.replyRequestCount).toBe(1); expect(article).toMatchSnapshot(); + // Make sure archiveUrlsFromText is called with article text + expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(` + Array [ + Array [ + "FOO FOO http://foo.com/article/1", + ], + ] + `); + const replyRequestId = getReplyRequestId({ articleId: data.CreateArticle.id, userId, From 93fa2c411e80d9fa8ef5f830c60644f2ae3562e1 Mon Sep 17 00:00:00 2001 From: Johnson Liang Date: Mon, 2 Sep 2024 14:02:38 +0800 Subject: [PATCH 7/9] fix: wayback machine API actually takes form data instead of url params --- src/util/archiveUrlsFromText.ts | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/util/archiveUrlsFromText.ts b/src/util/archiveUrlsFromText.ts index b72703ac..83a29441 100644 --- a/src/util/archiveUrlsFromText.ts +++ b/src/util/archiveUrlsFromText.ts @@ -13,19 +13,20 @@ export default async function archiveUrlsFromText(text: string) { const results = await Promise.all( normalizedUrls.map(async (url) => { - const params = new URLSearchParams({ - url, - capture_screenshot: '1', - skip_first_archive: '1', - delay_wb_availability: '1', // Help reduce load on IA servers - }); + const formData = new FormData(); + formData.append('url', url); + formData.append('capture_screenshot', '1'); + formData.append('skip_first_archive', '1'); + formData.append('delay_wb_availability', '1'); // Help reduce load on IA servers + return ( - await fetch(`https://web.archive.org/save?${params.toString()}`, { + await fetch('https://web.archive.org/save', { method: 'POST', headers: { Accept: 'application/json', Authorization: `LOW ${process.env.INTERNET_ARCHIVE_S3_ACCESS_KEY}:${process.env.INTERNET_ARCHIVE_S3_SECRET_KEY}`, }, + body: formData, }) ).json(); }) From fca0b32285fa664d21f5212de16a10337d8102b9 Mon Sep 17 00:00:00 2001 From: Johnson Liang Date: Mon, 2 Sep 2024 14:19:59 +0800 Subject: [PATCH 8/9] fix(util): mock fetch should get url from req body --- src/util/__tests__/archiveUrlsFromText.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/util/__tests__/archiveUrlsFromText.ts b/src/util/__tests__/archiveUrlsFromText.ts index 0f0cf0f2..7378d33e 100644 --- a/src/util/__tests__/archiveUrlsFromText.ts +++ b/src/util/__tests__/archiveUrlsFromText.ts @@ -15,7 +15,7 @@ describe('archiveUrlsFromText', () => { beforeAll(() => { // Spy on and mock the global fetch function mockedFetch = jest.spyOn(global, 'fetch'); - mockedFetch.mockImplementation(async (url) => { + mockedFetch.mockImplementation(async (url, reqInit) => { // Make Tyepscript happy if (typeof url !== 'string') throw new Error( @@ -23,8 +23,7 @@ describe('archiveUrlsFromText', () => { ); // Extract URL to archive from fetched URL - const params = 
new URL(url).searchParams; - const urlToArchive = params.get('url'); + const urlToArchive = (reqInit?.body as FormData).get('url'); return { json: async () => ({ job_id: '123', url: urlToArchive }), From fbd6d80b7d3f753c7883efc3acfb49e879fb0c94 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 2 Sep 2024 14:57:43 +0800 Subject: [PATCH 9/9] fix: IA API actually requires form body instead of query strings. --- src/util/__tests__/archiveUrlsFromText.ts | 44 +++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/src/util/__tests__/archiveUrlsFromText.ts b/src/util/__tests__/archiveUrlsFromText.ts index 7378d33e..9c8b7942 100644 --- a/src/util/__tests__/archiveUrlsFromText.ts +++ b/src/util/__tests__/archiveUrlsFromText.ts @@ -78,8 +78,28 @@ describe('archiveUrlsFromText', () => { expect(mockedFetch.mock.calls).toMatchInlineSnapshot(` Array [ Array [ - "https://web.archive.org/save?url=https%3A%2F%2Fexample.com%2F&capture_screenshot=1&skip_first_archive=1&delay_wb_availability=1", + "https://web.archive.org/save", Object { + "body": FormData { + Symbol(state): Array [ + Object { + "name": "url", + "value": "https://example.com/", + }, + Object { + "name": "capture_screenshot", + "value": "1", + }, + Object { + "name": "skip_first_archive", + "value": "1", + }, + Object { + "name": "delay_wb_availability", + "value": "1", + }, + ], + }, "headers": Object { "Accept": "application/json", "Authorization": "LOW test-access-key:test-secret", @@ -88,8 +108,28 @@ describe('archiveUrlsFromText', () => { }, ], Array [ - "https://web.archive.org/save?url=https%3A%2F%2Fexample2.com%2F%3Ffoo%3Dbar&capture_screenshot=1&skip_first_archive=1&delay_wb_availability=1", + "https://web.archive.org/save", Object { + "body": FormData { + Symbol(state): Array [ + Object { + "name": "url", + "value": "https://example2.com/?foo=bar", + }, + Object { + "name": "capture_screenshot", + "value": "1", + }, + Object { + "name": "skip_first_archive", + "value": "1", + }, + Object { + "name": "delay_wb_availability", + "value": "1", + }, + ], + }, "headers": Object { "Accept": "application/json", "Authorization": "LOW test-access-key:test-secret",