Skip to content

Commit

Permalink
Merge pull request #344 from cofacts/wayback-machine
Browse files Browse the repository at this point in the history
Archive article and reply text to Wayback Machine
  • Loading branch information
MrOrz authored Sep 4, 2024
2 parents 1a74e5d + fbd6d80 commit 4857e9f
Show file tree
Hide file tree
Showing 11 changed files with 300 additions and 12 deletions.
6 changes: 6 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,9 @@ LOG_REQUESTS=
# It will create the topic, subscription and schema if not exists.
#
ADMIN_PUBSUB_TOPIC=

# Internet Archive S3-like API key and secret from https://archive.org/account/s3.php
# They are used to call the Save Page Now 2 Public API
#
INTERNET_ARCHIVE_S3_ACCESS_KEY=
INTERNET_ARCHIVE_S3_SECRET_KEY=
45 changes: 35 additions & 10 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
"@babel/preset-env": "^7.16.11",
"@babel/preset-typescript": "^7.24.1",
"@google-cloud/storage": "^6.11.0",
"@types/node": "^18",
"@typescript-eslint/eslint-plugin": "^5.56.0",
"@typescript-eslint/parser": "^5.56.0",
"apollo-server-testing": "^2.18.2",
Expand Down
10 changes: 9 additions & 1 deletion src/graphql/mutations/CreateArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import scrapUrls from 'util/scrapUrls';
import { ArticleReferenceInput } from 'graphql/models/ArticleReference';
import MutationResult from 'graphql/models/MutationResult';
import { createOrUpdateReplyRequest } from './CreateOrUpdateReplyRequest';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

/* Instantiate hash function */
const xxhash64 = h64();
Expand Down Expand Up @@ -45,7 +46,9 @@ async function createNewArticle({ text, reference: originalReference, user }) {
appId: user.appId,
};

await client.update({
const {
body: { result },
} = await client.update({
index: 'articles',
type: 'doc',
id: articleId,
Expand Down Expand Up @@ -85,6 +88,11 @@ async function createNewArticle({ text, reference: originalReference, user }) {
refresh: 'true', // Make sure the data is indexed when we create ReplyRequest
});

if (result === 'created') {
// Archive URLs in article and don't wait for the result
archiveUrlsFromText(text);
}

return articleId;
}

Expand Down
5 changes: 5 additions & 0 deletions src/graphql/mutations/CreateMediaArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { ArticleReferenceInput } from 'graphql/models/ArticleReference';
import MutationResult from 'graphql/models/MutationResult';
import { createOrUpdateReplyRequest } from './CreateOrUpdateReplyRequest';
import ArticleTypeEnum from 'graphql/models/ArticleTypeEnum';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

const METADATA = {
cacheControl: 'public, max-age=31536000, immutable',
Expand Down Expand Up @@ -273,6 +274,10 @@ export default {
if (!aiResponse) {
throw new Error('AI transcript not found');
}

// Archive URLs in transcript; don't wait for it
archiveUrlsFromText(aiResponse.text);

return writeAITranscript(articleId, aiResponse.text);
})
.then(() => {
Expand Down
8 changes: 8 additions & 0 deletions src/graphql/mutations/CreateReply.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { assertUser } from 'util/user';

import client from 'util/client';
import scrapUrls from 'util/scrapUrls';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

import ReplyTypeEnum from 'graphql/models/ReplyTypeEnum';
import MutationResult from 'graphql/models/MutationResult';
Expand Down Expand Up @@ -90,6 +91,13 @@ export default {
return _id;
});

// Archive both text and reference.
// No need to wait for the result.
//
newReplyPromise.then(() =>
Promise.all([archiveUrlsFromText(text), archiveUrlsFromText(reference)])
);

const scrapPromise = scrapUrls(`${text} ${reference}`, {
cacheLoader: loaders.urlLoader,
client,
Expand Down
17 changes: 16 additions & 1 deletion src/graphql/mutations/__tests__/CreateArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,15 @@ import MockDate from 'mockdate';
import fixtures, { fixture1Text } from '../__fixtures__/CreateArticle';
import { getReplyRequestId } from '../CreateOrUpdateReplyRequest';
import { getArticleId } from 'graphql/mutations/CreateArticle';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

jest.mock('util/archiveUrlsFromText', () => jest.fn(() => []));

describe('creation', () => {
beforeEach(() => loadFixtures(fixtures));
beforeEach(async () => {
archiveUrlsFromText.mockClear();
await loadFixtures(fixtures);
});
afterEach(() => unloadFixtures(fixtures));

it('creates articles and a reply request and fills in URLs', async () => {
Expand Down Expand Up @@ -47,6 +53,15 @@ describe('creation', () => {
expect(article.replyRequestCount).toBe(1);
expect(article).toMatchSnapshot();

// Make sure archiveUrlsFromText is called with article text
expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(`
Array [
Array [
"FOO FOO http://foo.com/article/1",
],
]
`);

const replyRequestId = getReplyRequestId({
articleId: data.CreateArticle.id,
userId,
Expand Down
12 changes: 12 additions & 0 deletions src/graphql/mutations/__tests__/CreateMediaArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,16 @@ import client from 'util/client';
import fixtures from '../__fixtures__/CreateMediaArticle';
import { getReplyRequestId } from '../CreateOrUpdateReplyRequest';
import mediaManager from 'util/mediaManager';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

jest.mock('util/mediaManager');
jest.mock('util/archiveUrlsFromText', () => jest.fn(() => []));

describe('creation', () => {
beforeAll(() => loadFixtures(fixtures));
beforeEach(() => {
mediaManager.insert.mockClear();
archiveUrlsFromText.mockClear();
});
afterAll(() => unloadFixtures(fixtures));

Expand Down Expand Up @@ -68,6 +71,15 @@ describe('creation', () => {
]
`);

// Expect archiveUrlsFromText is called with OCR result
expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(`
Array [
Array [
"OCR result of output image",
],
]
`);

const {
body: { _source: article },
} = await client.get({
Expand Down
18 changes: 18 additions & 0 deletions src/graphql/mutations/__tests__/CreateReply.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
jest.mock('util/grpc');
jest.mock('util/archiveUrlsFromText', () => jest.fn(() => []));

import gql from 'util/GraphQL';
import { loadFixtures, unloadFixtures, resetFrom } from 'util/fixtures';
Expand All @@ -7,9 +8,13 @@ import MockDate from 'mockdate';
import fixtures from '../__fixtures__/CreateReply';
import resolveUrl from 'util/grpc';
import delayForMs from 'util/delayForMs';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

describe('CreateReply', () => {
beforeAll(() => loadFixtures(fixtures));
beforeEach(() => {
archiveUrlsFromText.mockClear();
});

it('creates replies and associates itself with specified article', async () => {
MockDate.set(1485593157011);
Expand Down Expand Up @@ -66,6 +71,19 @@ describe('CreateReply', () => {
});
expect(article._source.articleReplies[0].replyId).toBe(replyId);

// Make sure archiveUrlsFromText is called with text and reference
//
expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(`
Array [
Array [
"FOO FOO",
],
Array [
"http://shouldscrap.com/",
],
]
`);

// Wait until urls are resolved
await delayForMs(1000);
MockDate.reset();
Expand Down
Loading

0 comments on commit 4857e9f

Please sign in to comment.