Skip to content

Commit

Permalink
Merge pull request #344 from cofacts/wayback-machine
Browse files Browse the repository at this point in the history
Archive article and reply text to Wayback Machine
  • Loading branch information
MrOrz authored Sep 4, 2024
2 parents 1a74e5d + fbd6d80 commit 4857e9f
Show file tree
Hide file tree
Showing 11 changed files with 300 additions and 12 deletions.
6 changes: 6 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,9 @@ LOG_REQUESTS=
# It will create the topic, subscription and schema if not exists.
#
ADMIN_PUBSUB_TOPIC=

# Internet Archive S3-like API key and secret from https://archive.org/account/s3.php
# They are used to call the Save Page Now 2 Public API
#
INTERNET_ARCHIVE_S3_ACCESS_KEY=
INTERNET_ARCHIVE_S3_SECRET_KEY=
45 changes: 35 additions & 10 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
"@babel/preset-env": "^7.16.11",
"@babel/preset-typescript": "^7.24.1",
"@google-cloud/storage": "^6.11.0",
"@types/node": "^18",
"@typescript-eslint/eslint-plugin": "^5.56.0",
"@typescript-eslint/parser": "^5.56.0",
"apollo-server-testing": "^2.18.2",
Expand Down
10 changes: 9 additions & 1 deletion src/graphql/mutations/CreateArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import scrapUrls from 'util/scrapUrls';
import { ArticleReferenceInput } from 'graphql/models/ArticleReference';
import MutationResult from 'graphql/models/MutationResult';
import { createOrUpdateReplyRequest } from './CreateOrUpdateReplyRequest';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

/* Instantiate hash function */
const xxhash64 = h64();
Expand Down Expand Up @@ -45,7 +46,9 @@ async function createNewArticle({ text, reference: originalReference, user }) {
appId: user.appId,
};

await client.update({
const {
body: { result },
} = await client.update({
index: 'articles',
type: 'doc',
id: articleId,
Expand Down Expand Up @@ -85,6 +88,11 @@ async function createNewArticle({ text, reference: originalReference, user }) {
refresh: 'true', // Make sure the data is indexed when we create ReplyRequest
});

if (result === 'created') {
// Archive URLs in article and don't wait for the result
archiveUrlsFromText(text);
}

return articleId;
}

Expand Down
5 changes: 5 additions & 0 deletions src/graphql/mutations/CreateMediaArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { ArticleReferenceInput } from 'graphql/models/ArticleReference';
import MutationResult from 'graphql/models/MutationResult';
import { createOrUpdateReplyRequest } from './CreateOrUpdateReplyRequest';
import ArticleTypeEnum from 'graphql/models/ArticleTypeEnum';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

const METADATA = {
cacheControl: 'public, max-age=31536000, immutable',
Expand Down Expand Up @@ -273,6 +274,10 @@ export default {
if (!aiResponse) {
throw new Error('AI transcript not found');
}

// Archive URLs in transcript; don't wait for it
archiveUrlsFromText(aiResponse.text);

return writeAITranscript(articleId, aiResponse.text);
})
.then(() => {
Expand Down
8 changes: 8 additions & 0 deletions src/graphql/mutations/CreateReply.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { assertUser } from 'util/user';

import client from 'util/client';
import scrapUrls from 'util/scrapUrls';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

import ReplyTypeEnum from 'graphql/models/ReplyTypeEnum';
import MutationResult from 'graphql/models/MutationResult';
Expand Down Expand Up @@ -90,6 +91,13 @@ export default {
return _id;
});

// Archive both text and reference.
// No need to wait for the result.
//
newReplyPromise.then(() =>
Promise.all([archiveUrlsFromText(text), archiveUrlsFromText(reference)])
);

const scrapPromise = scrapUrls(`${text} ${reference}`, {
cacheLoader: loaders.urlLoader,
client,
Expand Down
17 changes: 16 additions & 1 deletion src/graphql/mutations/__tests__/CreateArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,15 @@ import MockDate from 'mockdate';
import fixtures, { fixture1Text } from '../__fixtures__/CreateArticle';
import { getReplyRequestId } from '../CreateOrUpdateReplyRequest';
import { getArticleId } from 'graphql/mutations/CreateArticle';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

jest.mock('util/archiveUrlsFromText', () => jest.fn(() => []));

describe('creation', () => {
beforeEach(() => loadFixtures(fixtures));
beforeEach(async () => {
archiveUrlsFromText.mockClear();
await loadFixtures(fixtures);
});
afterEach(() => unloadFixtures(fixtures));

it('creates articles and a reply request and fills in URLs', async () => {
Expand Down Expand Up @@ -47,6 +53,15 @@ describe('creation', () => {
expect(article.replyRequestCount).toBe(1);
expect(article).toMatchSnapshot();

// Make sure archiveUrlsFromText is called with article text
expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(`
Array [
Array [
"FOO FOO http://foo.com/article/1",
],
]
`);

const replyRequestId = getReplyRequestId({
articleId: data.CreateArticle.id,
userId,
Expand Down
12 changes: 12 additions & 0 deletions src/graphql/mutations/__tests__/CreateMediaArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,16 @@ import client from 'util/client';
import fixtures from '../__fixtures__/CreateMediaArticle';
import { getReplyRequestId } from '../CreateOrUpdateReplyRequest';
import mediaManager from 'util/mediaManager';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

jest.mock('util/mediaManager');
jest.mock('util/archiveUrlsFromText', () => jest.fn(() => []));

describe('creation', () => {
beforeAll(() => loadFixtures(fixtures));
beforeEach(() => {
mediaManager.insert.mockClear();
archiveUrlsFromText.mockClear();
});
afterAll(() => unloadFixtures(fixtures));

Expand Down Expand Up @@ -68,6 +71,15 @@ describe('creation', () => {
]
`);

// Expect archiveUrlsFromText is called with OCR result
expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(`
Array [
Array [
"OCR result of output image",
],
]
`);

const {
body: { _source: article },
} = await client.get({
Expand Down
18 changes: 18 additions & 0 deletions src/graphql/mutations/__tests__/CreateReply.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
jest.mock('util/grpc');
jest.mock('util/archiveUrlsFromText', () => jest.fn(() => []));

import gql from 'util/GraphQL';
import { loadFixtures, unloadFixtures, resetFrom } from 'util/fixtures';
Expand All @@ -7,9 +8,13 @@ import MockDate from 'mockdate';
import fixtures from '../__fixtures__/CreateReply';
import resolveUrl from 'util/grpc';
import delayForMs from 'util/delayForMs';
import archiveUrlsFromText from 'util/archiveUrlsFromText';

describe('CreateReply', () => {
beforeAll(() => loadFixtures(fixtures));
beforeEach(() => {
archiveUrlsFromText.mockClear();
});

it('creates replies and associates itself with specified article', async () => {
MockDate.set(1485593157011);
Expand Down Expand Up @@ -66,6 +71,19 @@ describe('CreateReply', () => {
});
expect(article._source.articleReplies[0].replyId).toBe(replyId);

// Make sure archiveUrlsFromText is called with text and reference
//
expect(archiveUrlsFromText.mock.calls).toMatchInlineSnapshot(`
Array [
Array [
"FOO FOO",
],
Array [
"http://shouldscrap.com/",
],
]
`);

// Wait until urls are resolved
await delayForMs(1000);
MockDate.reset();
Expand Down
Loading

0 comments on commit 4857e9f

Please sign in to comment.