From 15e1c463436fe6ae1b04bdbfa3b4e7c8fedb99ea Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:00 +0200 Subject: [PATCH 01/25] :construction: start drafting new table --- db/migration/1692042923850-AddPostsLinks.ts | 26 ++++++++++++ db/model/PostLink.ts | 46 +++++++++++++++++++++ db/syncPostsToGrapher.ts | 39 ++++++++++++++++- 3 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 db/migration/1692042923850-AddPostsLinks.ts create mode 100644 db/model/PostLink.ts diff --git a/db/migration/1692042923850-AddPostsLinks.ts b/db/migration/1692042923850-AddPostsLinks.ts new file mode 100644 index 00000000000..cd78bf3467a --- /dev/null +++ b/db/migration/1692042923850-AddPostsLinks.ts @@ -0,0 +1,26 @@ +import { MigrationInterface, QueryRunner } from "typeorm" + +export class AddPostsLinks1692042923850 implements MigrationInterface { + public async up(queryRunner: QueryRunner): Promise { + queryRunner.query(`-- sql + CREATE TABLE posts_links ( + id int NOT NULL AUTO_INCREMENT, + sourceId int NOT NULL, + target varchar(2047) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs NOT NULL, + linkType enum('url','grapher','explorer') CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs DEFAULT NULL, + componentType varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs NOT NULL, + text varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs NOT NULL, + queryString varchar(2047) COLLATE utf8mb4_0900_as_cs NOT NULL, + hash varchar(2047) COLLATE utf8mb4_0900_as_cs NOT NULL, + PRIMARY KEY (id), + KEY sourceId (sourceId), + CONSTRAINT posts_links_ibfk_1 FOREIGN KEY (sourceId) REFERENCES posts (id) + ) ENGINE=InnoDB;`) + } + + public async down(queryRunner: QueryRunner): Promise { + queryRunner.query(`-- sql + DROP TABLE IF EXISTS posts_links; + `) + } +} diff --git a/db/model/PostLink.ts b/db/model/PostLink.ts new file mode 100644 index 00000000000..565c9995a5c --- /dev/null +++ b/db/model/PostLink.ts @@ -0,0 +1,46 @@ +import { Entity, 
PrimaryGeneratedColumn, Column, BaseEntity } from "typeorm" +import { formatUrls } from "../../site/formatting.js" +import { getLinkType, Url, getUrlTarget } from "@ourworldindata/utils" + +@Entity("posts_links") +export class PostLink extends BaseEntity { + @PrimaryGeneratedColumn() id!: number + // TODO: posts is not a TypeORM but a Knex class so we can't use a TypeORM relationship here yet + + @Column({ type: "int", nullable: false }) sourceId!: number + + @Column() linkType!: "gdoc" | "url" | "grapher" | "explorer" + @Column() target!: string + @Column() queryString!: string + @Column() hash!: string + @Column() componentType!: string + @Column() text!: string + + static createFromUrl({ + url, + sourceId, + text = "", + componentType = "", + }: { + url: string + sourceId: number + text?: string + componentType?: string + }): PostLink { + const formattedUrl = formatUrls(url) + const urlObject = Url.fromURL(formattedUrl) + const linkType = getLinkType(formattedUrl) + const target = getUrlTarget(formattedUrl) + const queryString = urlObject.queryStr + const hash = urlObject.hash + return PostLink.create({ + target, + linkType, + queryString, + hash, + sourceId, + text, + componentType, + }) + } +} diff --git a/db/syncPostsToGrapher.ts b/db/syncPostsToGrapher.ts index a8d6be458bb..8135fe1676f 100644 --- a/db/syncPostsToGrapher.ts +++ b/db/syncPostsToGrapher.ts @@ -3,12 +3,21 @@ import * as wpdb from "./wpdb.js" import * as db from "./db.js" -import { keyBy, PostRow } from "@ourworldindata/utils" +import { + differenceOfSets, + groupBy, + keyBy, + PostRow, +} from "@ourworldindata/utils" import { postsTable, select } from "./model/Post.js" +import { PostLink } from "./model/PostLink.js" const zeroDateString = "0000-00-00 00:00:00" const blockRefRegex = //g +const prominentLinkRegex = /"linkUrl":"(?[^"]+)"/g +const anyHrefRegex = /href="(?[^"]+)"/g +const anySrcRegex = /href="(?[^"]+)"/g interface ReusableBlock { ID: number @@ -197,9 +206,37 @@ const 
syncPostsToGrapher = async (): Promise => { .filter((p) => !doesExistInWordpress[p.id]) .map((p) => p.id) + const postLinks = await PostLink.find() + + const postLinksById = groupBy(postLinks, (link) => link.id) + const toInsert = rows.map((post: any) => { const content = post.post_content as string + // TODO: move this into a separate iteration, add + // code to delete/add rows, extract link fragment and query string + + const existingLinks = new Set( + postLinksById[post.id].map((link) => link.target) + ) + const allHrefs = [...content.matchAll(anyHrefRegex)].map( + (x) => x.groups?.["url"] ?? "" + ) + const allSrcs = [...content.matchAll(anySrcRegex)].map( + (x) => x.groups?.["url"] ?? "" + ) + const allProminentLinks = [...content.matchAll(prominentLinkRegex)].map( + (x) => x.groups?.["url"] ?? "" + ) + const allLinks = new Set([ + ...allHrefs, + ...allSrcs, + ...allProminentLinks, + ]) + + const linksToAdd = differenceOfSets([allLinks, existingLinks]) + const linksToDelete = differenceOfSets([existingLinks, allLinks]) + return { id: post.ID, title: post.post_title, From 365a41d18b3f789412258ba093c8437e6d12ac35 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:00 +0200 Subject: [PATCH 02/25] :construction: tweak full posts link generation, almost complete --- .vscode/launch.json | 9 +- db/migration/1692042923850-AddPostsLinks.ts | 2 +- db/syncPostsToGrapher.ts | 127 +++++++++++++++----- 3 files changed, 107 insertions(+), 31 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 4ef08867736..79c7b77208a 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,6 +13,9 @@ "skipFiles": [ "/**" ], + "skipFiles": [ + "/**" + ], "type": "node" }, { @@ -25,6 +28,10 @@ "${fileBasenameNoExtension}.js", "--watch" ], + "args": [ + "${fileBasenameNoExtension}.js", + "--watch" + ], "console": "integratedTerminal" // "internalConsoleOptions": "neverOpen" }, @@ -115,4 +122,4 @@ "port": 9000 } ] -} +} \ No newline at end of 
file diff --git a/db/migration/1692042923850-AddPostsLinks.ts b/db/migration/1692042923850-AddPostsLinks.ts index cd78bf3467a..ffccf1b01ea 100644 --- a/db/migration/1692042923850-AddPostsLinks.ts +++ b/db/migration/1692042923850-AddPostsLinks.ts @@ -7,7 +7,7 @@ export class AddPostsLinks1692042923850 implements MigrationInterface { id int NOT NULL AUTO_INCREMENT, sourceId int NOT NULL, target varchar(2047) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs NOT NULL, - linkType enum('url','grapher','explorer') CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs DEFAULT NULL, + linkType enum('url','grapher','explorer', 'gdoc') CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs DEFAULT NULL, componentType varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs NOT NULL, text varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs NOT NULL, queryString varchar(2047) COLLATE utf8mb4_0900_as_cs NOT NULL, diff --git a/db/syncPostsToGrapher.ts b/db/syncPostsToGrapher.ts index 8135fe1676f..e96f9f60bab 100644 --- a/db/syncPostsToGrapher.ts +++ b/db/syncPostsToGrapher.ts @@ -4,20 +4,24 @@ import * as wpdb from "./wpdb.js" import * as db from "./db.js" import { + chunk, differenceOfSets, + excludeNullish, groupBy, keyBy, + maxBy, PostRow, } from "@ourworldindata/utils" import { postsTable, select } from "./model/Post.js" import { PostLink } from "./model/PostLink.js" +import { dataSource } from "./dataSource.js" const zeroDateString = "0000-00-00 00:00:00" const blockRefRegex = //g const prominentLinkRegex = /"linkUrl":"(?[^"]+)"/g const anyHrefRegex = /href="(?[^"]+)"/g -const anySrcRegex = /href="(?[^"]+)"/g +const anySrcRegex = /src="(?[^"]+)"/g interface ReusableBlock { ID: number @@ -206,37 +210,9 @@ const syncPostsToGrapher = async (): Promise => { .filter((p) => !doesExistInWordpress[p.id]) .map((p) => p.id) - const postLinks = await PostLink.find() - - const postLinksById = groupBy(postLinks, (link) => link.id) - const toInsert = rows.map((post: any) => { const 
content = post.post_content as string - // TODO: move this into a separate iteration, add - // code to delete/add rows, extract link fragment and query string - - const existingLinks = new Set( - postLinksById[post.id].map((link) => link.target) - ) - const allHrefs = [...content.matchAll(anyHrefRegex)].map( - (x) => x.groups?.["url"] ?? "" - ) - const allSrcs = [...content.matchAll(anySrcRegex)].map( - (x) => x.groups?.["url"] ?? "" - ) - const allProminentLinks = [...content.matchAll(prominentLinkRegex)].map( - (x) => x.groups?.["url"] ?? "" - ) - const allLinks = new Set([ - ...allHrefs, - ...allSrcs, - ...allProminentLinks, - ]) - - const linksToAdd = differenceOfSets([allLinks, existingLinks]) - const linksToDelete = differenceOfSets([existingLinks, allLinks]) - return { id: post.ID, title: post.post_title, @@ -259,6 +235,79 @@ const syncPostsToGrapher = async (): Promise => { featured_image: post.featured_image || "", } }) as PostRow[] + const postLinks = await dataSource.getRepository(PostLink).find() + const postLinksById = groupBy(postLinks, (link) => link.id) + + const linksToAdd: PostLink[] = [] + const linksToDelete: PostLink[] = [] + + const postLinkCompareStringGenerator = (item: PostLink) => + `${item.linkType} - ${item.target} - ${item.hash} - ${item.queryString}` + + for (const post of rows) { + const content = post.post_content as string + + const linksInDb = groupBy( + postLinksById[post.ID], + postLinkCompareStringGenerator + ) + + const allHrefs = excludeNullish( + [...content.matchAll(anyHrefRegex)].map((x) => + x.groups?.["url"] + ? { + url: x.groups?.["url"].substring(0, 2046), + sourceId: post.ID, + componentType: "href", + } + : undefined + ) + ) + const allSrcs = excludeNullish( + [...content.matchAll(anySrcRegex)].map((x) => + x.groups?.["url"] + ? 
{ + url: x.groups?.["url"].substring(0, 2046), + sourceId: post.ID, + componentType: "src", + } + : undefined + ) + ) + const allProminentLinks = excludeNullish( + [...content.matchAll(prominentLinkRegex)].map((x) => + x.groups?.["url"] + ? { + url: x.groups?.["url"].substring(0, 2046), + sourceId: post.ID, + componentType: "prominent-link", + } + : undefined + ) + ) + const linksInDocument = groupBy( + [ + ...allHrefs.map((link) => PostLink.createFromUrl(link)), + ...allSrcs.map((link) => PostLink.createFromUrl(link)), + ...allProminentLinks.map((link) => + PostLink.createFromUrl(link) + ), + ], + postLinkCompareStringGenerator + ) + + // This is doing a set difference, but we want to do the set operation on a subset + // of fields (the ones we stringify into the compare key) while retaining the full + // object so that we can e.g. delete efficiently by id later on + for (const [linkInDocCompareKey, linkInDoc] of Object.entries( + linksInDocument + )) + if (!(linkInDocCompareKey in linksInDb)) + linksToAdd.push(...linkInDoc) + for (const [linkInDbCompareKey, linkInDb] of Object.entries(linksInDb)) + if (!(linkInDbCompareKey in linksInDocument)) + linksToDelete.push(...linkInDb) + } await db.knexInstance().transaction(async (t) => { if (toDelete.length) @@ -270,10 +319,30 @@ const syncPostsToGrapher = async (): Promise => { else await t.insert(row).into(postsTable) } }) + + // const linksToAddAsObjects = linksToAdd.map((item) => { + // const { ...asObject } = item + // return asObject + // }) + console.log("linksToAdd", linksToAdd.length) + await PostLink.createQueryBuilder() + .insert() + .into(PostLink) + .values(linksToAdd) + .execute() + + if (linksToDelete.length) { + console.log("linksToDelete", linksToDelete.length) + await PostLink.createQueryBuilder() + .where("id in (:ids)", { ids: linksToDelete.map((x) => x.id) }) + .delete() + .execute() + } } const main = async (): Promise => { try { + await db.getConnection() await syncPostsToGrapher() } finally { 
await wpdb.singleton.end() From d681df23ddaaa85d8db10f51b067d5be25812dc6 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:00 +0200 Subject: [PATCH 03/25] :sparkles: add updating of PostLink to wp update hook --- baker/postUpdatedHook.ts | 37 ++++++++- db/syncPostsToGrapher.ts | 162 +++++++++++++++++++++------------------ 2 files changed, 125 insertions(+), 74 deletions(-) diff --git a/baker/postUpdatedHook.ts b/baker/postUpdatedHook.ts index 93b5793beb9..9d926cbe5cb 100644 --- a/baker/postUpdatedHook.ts +++ b/baker/postUpdatedHook.ts @@ -7,8 +7,12 @@ import { exit } from "../db/cleanup.js" import { PostRow } from "@ourworldindata/utils" import * as wpdb from "../db/wpdb.js" import * as db from "../db/db.js" -import { buildReusableBlocksResolver } from "../db/syncPostsToGrapher.js" +import { + buildReusableBlocksResolver, + getLinksToAddAndRemoveForPost, +} from "../db/syncPostsToGrapher.js" import { postsTable, select } from "../db/model/Post.js" +import { PostLink } from "../db/model/PostLink.js" const argv = parseArgs(process.argv.slice(2)) const zeroDateString = "0000-00-00 00:00:00" @@ -141,6 +145,37 @@ const syncPostToGrapher = async ( db.knexTable(postsTable).where({ id: postId }) ) )[0] + + if (postRow) { + const existingLinksForPost = await PostLink.findBy({ + sourceId: wpPost.ID, + }) + + const { linksToAdd, linksToDelete } = getLinksToAddAndRemoveForPost( + postRow, + existingLinksForPost, + postRow!.content, + wpPost.ID + ) + + // TODO: unify our DB access and then do everything in one transaction + if (linksToAdd.length) { + console.log("linksToAdd", linksToAdd.length) + await PostLink.createQueryBuilder() + .insert() + .into(PostLink) + .values(linksToAdd) + .execute() + } + + if (linksToDelete.length) { + console.log("linksToDelete", linksToDelete.length) + await PostLink.createQueryBuilder() + .where("id in (:ids)", { ids: linksToDelete.map((x) => x.id) }) + .delete() + .execute() + } + } return newPost ? 
newPost.slug : undefined } diff --git a/db/syncPostsToGrapher.ts b/db/syncPostsToGrapher.ts index e96f9f60bab..c83160e3d81 100644 --- a/db/syncPostsToGrapher.ts +++ b/db/syncPostsToGrapher.ts @@ -112,6 +112,78 @@ export async function buildReusableBlocksResolver(): Promise + `${item.linkType} - ${item.target} - ${item.hash} - ${item.queryString}` + +export function getLinksToAddAndRemoveForPost( + post: PostRow, + existingLinksForPost: PostLink[], + content: string, + postId: number +): { linksToAdd: PostLink[]; linksToDelete: PostLink[] } { + const linksInDb = groupBy( + existingLinksForPost, + postLinkCompareStringGenerator + ) + + const allHrefs = excludeNullish( + [...content.matchAll(anyHrefRegex)].map((x) => + x.groups?.["url"] + ? { + url: x.groups?.["url"].substring(0, 2046), + sourceId: postId, + componentType: "href", + } + : undefined + ) + ) + const allSrcs = excludeNullish( + [...content.matchAll(anySrcRegex)].map((x) => + x.groups?.["url"] + ? { + url: x.groups?.["url"].substring(0, 2046), + sourceId: postId, + componentType: "src", + } + : undefined + ) + ) + const allProminentLinks = excludeNullish( + [...content.matchAll(prominentLinkRegex)].map((x) => + x.groups?.["url"] + ? { + url: x.groups?.["url"].substring(0, 2046), + sourceId: postId, + componentType: "prominent-link", + } + : undefined + ) + ) + const linksInDocument = groupBy( + [ + ...allHrefs.map((link) => PostLink.createFromUrl(link)), + ...allSrcs.map((link) => PostLink.createFromUrl(link)), + ...allProminentLinks.map((link) => PostLink.createFromUrl(link)), + ], + postLinkCompareStringGenerator + ) + + const linksToAdd: PostLink[] = [] + const linksToDelete: PostLink[] = [] + + // This is doing a set difference, but we want to do the set operation on a subset + // of fields (the ones we stringify into the compare key) while retaining the full + // object so that we can e.g. 
delete efficiently by id later on + for (const [linkInDocCompareKey, linkInDoc] of Object.entries( + linksInDocument + )) + if (!(linkInDocCompareKey in linksInDb)) linksToAdd.push(...linkInDoc) + for (const [linkInDbCompareKey, linkInDb] of Object.entries(linksInDb)) + if (!(linkInDbCompareKey in linksInDocument)) + linksToDelete.push(...linkInDb) + return { linksToAdd, linksToDelete } +} + const syncPostsToGrapher = async (): Promise => { const dereferenceReusableBlocksFn = await buildReusableBlocksResolver() @@ -241,72 +313,17 @@ const syncPostsToGrapher = async (): Promise => { const linksToAdd: PostLink[] = [] const linksToDelete: PostLink[] = [] - const postLinkCompareStringGenerator = (item: PostLink) => - `${item.linkType} - ${item.target} - ${item.hash} - ${item.queryString}` - for (const post of rows) { + const existingLinksForPost = postLinksById[post.ID] const content = post.post_content as string - - const linksInDb = groupBy( - postLinksById[post.ID], - postLinkCompareStringGenerator - ) - - const allHrefs = excludeNullish( - [...content.matchAll(anyHrefRegex)].map((x) => - x.groups?.["url"] - ? { - url: x.groups?.["url"].substring(0, 2046), - sourceId: post.ID, - componentType: "href", - } - : undefined - ) - ) - const allSrcs = excludeNullish( - [...content.matchAll(anySrcRegex)].map((x) => - x.groups?.["url"] - ? { - url: x.groups?.["url"].substring(0, 2046), - sourceId: post.ID, - componentType: "src", - } - : undefined - ) + const linksToModify = getLinksToAddAndRemoveForPost( + post, + existingLinksForPost, + content, + post.ID ) - const allProminentLinks = excludeNullish( - [...content.matchAll(prominentLinkRegex)].map((x) => - x.groups?.["url"] - ? 
{ - url: x.groups?.["url"].substring(0, 2046), - sourceId: post.ID, - componentType: "prominent-link", - } - : undefined - ) - ) - const linksInDocument = groupBy( - [ - ...allHrefs.map((link) => PostLink.createFromUrl(link)), - ...allSrcs.map((link) => PostLink.createFromUrl(link)), - ...allProminentLinks.map((link) => - PostLink.createFromUrl(link) - ), - ], - postLinkCompareStringGenerator - ) - - // This is doing a set difference, but we want to do the set operation on a subset - // of fields (the ones we stringify into the compare key) while retaining the full - // object so that we can e.g. delete efficiently by id later on - for (const [linkInDocCompareKey, linkInDoc] of Object.entries( - linksInDocument - )) - if (!(linkInDocCompareKey in linksInDb)) - linksToAdd.push(...linkInDoc) - for (const [linkInDbCompareKey, linkInDb] of Object.entries(linksInDb)) - if (!(linkInDbCompareKey in linksInDocument)) - linksToDelete.push(...linkInDb) + linksToAdd.push(...linksToModify.linksToAdd) + linksToDelete.push(...linksToModify.linksToDelete) } await db.knexInstance().transaction(async (t) => { @@ -320,16 +337,15 @@ const syncPostsToGrapher = async (): Promise => { } }) - // const linksToAddAsObjects = linksToAdd.map((item) => { - // const { ...asObject } = item - // return asObject - // }) - console.log("linksToAdd", linksToAdd.length) - await PostLink.createQueryBuilder() - .insert() - .into(PostLink) - .values(linksToAdd) - .execute() + // TODO: unify our DB access and then do everything in one transaction + if (linksToAdd.length) { + console.log("linksToAdd", linksToAdd.length) + await PostLink.createQueryBuilder() + .insert() + .into(PostLink) + .values(linksToAdd) + .execute() + } if (linksToDelete.length) { console.log("linksToDelete", linksToDelete.length) From a80c490be56a3dda4a93e8317c4bb6af418d7040 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:00 +0200 Subject: [PATCH 04/25] :construction: WIP - query for related research and 
writing --- db/wpdb.ts | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/db/wpdb.ts b/db/wpdb.ts index fe8cc90f812..b875cf54be7 100644 --- a/db/wpdb.ts +++ b/db/wpdb.ts @@ -660,6 +660,80 @@ export const getRelatedChartsForVariable = async ( `) } +export const getRelatedResearchAndWritingForVariable = async ( + variableId: number +): Promise => { + const wp_posts = await db.queryMysql( + `-- sql + select + distinct + pl.target as linkTargetSlug, + c.slug as chartSlug, + p.title as title, + p.slug as postSlug, + coalesce(csr.chart_id, c.id) as chartId, + p.authors as authors, + '' as thumbnail, -- TODO: add thumbnail once we have it + pv.views_365d as pageviews, + 'wordpress' as post_source + from + posts_links pl + join posts p on + pl.sourceId = p.id + left join charts c on + pl.target = c.slug + left join chart_slug_redirects csr on + pl.target = csr.slug + left join chart_dimensions cd on + cd.chartId = c.id + left join pageviews pv on + pv.url = concat('https://ourworldindata.org/', p.slug ) + where + pl.linkType = 'grapher' + and cd.variableId = ? + and p.status = 'publish' + `, + [variableId] + ) + + const gdocs_posts = await db.queryMysql( + `-- sql + select + distinct + pl.target as linkTargetSlug, + c.slug as chartSlug, + p.content ->> '$.title' as title, + p.slug as postSlug, + coalesce(csr.chart_id, c.id) as chartId, + p.content ->> '$.authors' as authors, + p.content ->> '$."featured-image"' as thumbnail, + pv.views_365d as pageviews, + 'gdocs' as post_source + from + posts_gdocs_links pl + join posts_gdocs p on + pl.sourceId = p.id + left join charts c on + pl.target = c.slug + left join chart_slug_redirects csr on + pl.target = csr.slug + join chart_dimensions cd on + cd.chartId = c.id + left join pageviews pv on + pv.url = concat('https://ourworldindata.org/', p.slug ) + where + pl.linkType = 'grapher' + and cd.variableId = ? 
+ and p.published = 1 + order by + pageviews desc`, + [variableId] + ) + + // TODO: combine results, discarding wp posts where gdocs are available + return [...wp_posts, ...gdocs_posts] +} + export const getRelatedArticles = async ( chartId: number ): Promise => { From 8d4b75e6492129350c1cffc63adfd20303abdc3f Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:01 +0200 Subject: [PATCH 05/25] :bug: fix group by --- db/syncPostsToGrapher.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/syncPostsToGrapher.ts b/db/syncPostsToGrapher.ts index c83160e3d81..d349dcf4833 100644 --- a/db/syncPostsToGrapher.ts +++ b/db/syncPostsToGrapher.ts @@ -308,7 +308,7 @@ const syncPostsToGrapher = async (): Promise => { } }) as PostRow[] const postLinks = await dataSource.getRepository(PostLink).find() - const postLinksById = groupBy(postLinks, (link) => link.id) + const postLinksById = groupBy(postLinks, (link) => link.sourceId) const linksToAdd: PostLink[] = [] const linksToDelete: PostLink[] = [] From 63a9737f6f0c2630cdadee14698ce7fbb74df79d Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:01 +0200 Subject: [PATCH 06/25] :tada: start showing related research and writing --- .vscode/launch.json | 3 ++- baker/GrapherBaker.tsx | 5 +++++ db/wpdb.ts | 29 ++++++++++++++++++++++------- site/DataPageV2Content.tsx | 13 ++++++++----- 4 files changed, 37 insertions(+), 13 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 79c7b77208a..10d972a1136 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -96,7 +96,8 @@ "name": "Launch admin server", "program": "${workspaceFolder}/itsJustJavascript/adminSiteServer/app.js", "request": "launch", - "type": "node" + "type": "node", + "runtimeExecutable": "/home/daniel/.local/share/fnm/node-versions/v18.16.0/installation/bin/node" }, { "name": "Attach to node", diff --git a/baker/GrapherBaker.tsx b/baker/GrapherBaker.tsx index 21d154755f7..8179f02b0c5 
100644 --- a/baker/GrapherBaker.tsx +++ b/baker/GrapherBaker.tsx @@ -28,6 +28,7 @@ import { getRelatedArticles, getRelatedCharts, getRelatedChartsForVariable, + getRelatedResearchAndWritingForVariable, isWordpressAPIEnabled, isWordpressDBEnabled, } from "../db/wpdb.js" @@ -272,6 +273,10 @@ export async function renderDataPageV2({ variableId, grapher && "id" in grapher ? [grapher.id as number] : [] ) + + datapageData.relatedResearch = + await getRelatedResearchAndWritingForVariable(variableId) + return renderToHtmlPage( => { +): Promise => { const wp_posts = await db.queryMysql( `-- sql select @@ -688,10 +690,14 @@ export const getRelatedResearchAndWritingForVariable = async ( cd.chartId = c.id left join pageviews pv on pv.url = concat('https://ourworldindata.org/', p.slug ) + left join posts_gdocs pg on + pg.id = p.gdocSuccessorId where pl.linkType = 'grapher' and cd.variableId = ? - and p.status = 'publish' + and cd.property in ('x', 'y') -- ignore cases where the indicator is size, color etc + and p.status = 'publish' -- only use published wp charts + and coalesce(pg.published, 0) = 0 -- if the wp post has a published gdoc successor then ignore it `, [variableId] ) @@ -724,14 +730,23 @@ export const getRelatedResearchAndWritingForVariable = async ( where pl.linkType = 'grapher' and cd.variableId = ? 
- and p.published = 1 - order by - pageviews desc`, + and cd.property in ('x', 'y') -- ignore cases where the indicator is size, color etc + and p.published = 1`, [variableId] ) - // TODO: combine results, discarding wp posts where gdocs are available - return [...wp_posts, ...gdocs_posts] + const combined = [...wp_posts, ...gdocs_posts] + // we could do the sorting in the SQL query if we'd union the two queries + // but it seemed easier to understand if we do the sort here + const sorted = sortBy(combined, (post) => -post.pageviews) + + return sorted.map((post) => ({ + title: post.title, + url: `/${post.postSlug}`, + variantName: "", + authors: JSON.parse(post.authors), + imageUrl: post.thumbnail, + })) } export const getRelatedArticles = async ( diff --git a/site/DataPageV2Content.tsx b/site/DataPageV2Content.tsx index fcf53211b0f..e663e6886cb 100644 --- a/site/DataPageV2Content.tsx +++ b/site/DataPageV2Content.tsx @@ -282,7 +282,7 @@ export const DataPageV2Content = ({ From b8f28fde9d4f07d94b57bf6c1c93d6da3308d3ba Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:01 +0200 Subject: [PATCH 07/25] :hammer: add temporary thumbnail rendering --- site/DataPageV2Content.tsx | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/site/DataPageV2Content.tsx b/site/DataPageV2Content.tsx index e663e6886cb..1461fc246d8 100644 --- a/site/DataPageV2Content.tsx +++ b/site/DataPageV2Content.tsx @@ -24,7 +24,12 @@ import { AttachmentsContext, DocumentContext } from "./gdocs/OwidGdoc.js" import StickyNav from "./blocks/StickyNav.js" import cx from "classnames" import { DebugProvider } from "./gdocs/DebugContext.js" +import Image from "./gdocs/Image.js" import dayjs from "dayjs" +import { + IMAGE_HOSTING_BUCKET_SUBFOLDER_PATH, + IMAGE_HOSTING_CDN_URL, +} from "../settings/clientSettings.js" declare global { interface Window { _OWID_DATAPAGEV2_PROPS: DataPageV2ContentFields @@ -288,8 +293,24 @@ export const 
DataPageV2Content = ({ key={research.url} className="related-research__item grid grid-cols-4 grid-lg-cols-6 grid-sm-cols-12 span-cols-4 span-lg-cols-6 span-sm-cols-12" > + {/*
+ +
*/} + {/* // TODO: switch this to use the Image component and put the required information for the thumbnails into hte attachment context or similar */} From 5204cf5ca758a499742a19b4021db7fc55c14498 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:01 +0200 Subject: [PATCH 08/25] :bug: fix wordpress authors display --- db/wpdb.ts | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/db/wpdb.ts b/db/wpdb.ts index 640e960ad9e..c8522f321f6 100644 --- a/db/wpdb.ts +++ b/db/wpdb.ts @@ -38,6 +38,7 @@ import { uniqBy, sortBy, DataPageRelatedResearch, + isString, OwidGdocType, Tag, } from "@ourworldindata/utils" @@ -662,10 +663,21 @@ export const getRelatedChartsForVariable = async ( `) } +interface RelatedResearchQueryResult { + linkTargetSlug: string + chartSlug: string + title: string + postSlug: string + chartId: number + authors: string + thumbnail: string + pageviews: number + post_source: string +} export const getRelatedResearchAndWritingForVariable = async ( variableId: number ): Promise => { - const wp_posts = await db.queryMysql( + const wp_posts: RelatedResearchQueryResult[] = await db.queryMysql( `-- sql select distinct @@ -702,7 +714,7 @@ export const getRelatedResearchAndWritingForVariable = async ( [variableId] ) - const gdocs_posts = await db.queryMysql( + const gdocs_posts: RelatedResearchQueryResult[] = await db.queryMysql( `-- sql select distinct @@ -736,17 +748,27 @@ export const getRelatedResearchAndWritingForVariable = async ( ) const combined = [...wp_posts, ...gdocs_posts] + // we could do the sorting in the SQL query if we'd union the two queries // but it seemed easier to understand if we do the sort here const sorted = sortBy(combined, (post) => -post.pageviews) - return sorted.map((post) => ({ - title: post.title, - url: `/${post.postSlug}`, - variantName: "", - authors: JSON.parse(post.authors), - imageUrl: post.thumbnail, - })) + return sorted.map((post) => { + 
const parsedAuthors = JSON.parse(post.authors) + // The authors in the gdocs table are just a list of strings, but in the wp_posts table + // they are a list of objects with an "author" key and an "order" key. We want to normalize this so that + // we can just use the same code to display the authors in both cases. + const authors = parsedAuthors.map((author: any) => + !isString(author) ? author.author : author + ) + return { + title: post.title, + url: `/${post.postSlug}`, + variantName: "", + authors, + imageUrl: post.thumbnail, + } + }) } export const getRelatedArticles = async ( From 6480d6f0af01f6c99064d538c7ffb9cbe14b9aef Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:01 +0200 Subject: [PATCH 09/25] :hammer: tweak related research query This deduplicates by url, sorts authors and only uses full chart embeds and ignores plain links to charts --- db/wpdb.ts | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/db/wpdb.ts b/db/wpdb.ts index c8522f321f6..606c2b78843 100644 --- a/db/wpdb.ts +++ b/db/wpdb.ts @@ -665,6 +665,7 @@ export const getRelatedChartsForVariable = async ( interface RelatedResearchQueryResult { linkTargetSlug: string + componentType: string chartSlug: string title: string postSlug: string @@ -682,6 +683,7 @@ export const getRelatedResearchAndWritingForVariable = async ( select distinct pl.target as linkTargetSlug, + pl.componentType as componentType, c.slug as chartSlug, p.title as title, p.slug as postSlug, @@ -706,6 +708,7 @@ export const getRelatedResearchAndWritingForVariable = async ( pg.id = p.gdocSuccessorId where pl.linkType = 'grapher' + and componentType = 'src' -- this filters out links in tags and keeps only embedded charts and cd.variableId = ? 
and cd.property in ('x', 'y') -- ignore cases where the indicator is size, color etc and p.status = 'publish' -- only use published wp charts @@ -719,6 +722,7 @@ export const getRelatedResearchAndWritingForVariable = async ( select distinct pl.target as linkTargetSlug, + pl.componentType as componentType, c.slug as chartSlug, p.content ->> '$.title' as title, p.slug as postSlug, @@ -741,6 +745,7 @@ export const getRelatedResearchAndWritingForVariable = async ( pv.url = concat('https://ourworldindata.org/', p.slug ) where pl.linkType = 'grapher' + and componentType = 'chart' -- this filters out links in tags and keeps only embedded charts and cd.variableId = ? and cd.property in ('x', 'y') -- ignore cases where the indicator is size, color etc and p.published = 1`, @@ -753,14 +758,17 @@ export const getRelatedResearchAndWritingForVariable = async ( // but it seemed easier to understand if we do the sort here const sorted = sortBy(combined, (post) => -post.pageviews) - return sorted.map((post) => { + const allSortedRelatedResearch = sorted.map((post) => { const parsedAuthors = JSON.parse(post.authors) // The authors in the gdocs table are just a list of strings, but in the wp_posts table // they are a list of objects with an "author" key and an "order" key. We want to normalize this so that // we can just use the same code to display the authors in both cases. - const authors = parsedAuthors.map((author: any) => - !isString(author) ? 
author.author : author - ) + let authors + if (parsedAuthors.length > 0 && !isString(parsedAuthors[0])) { + authors = sortBy(parsedAuthors, (author) => author.order).map( + (author: any) => author.author + ) + } else authors = parsedAuthors return { title: post.title, url: `/${post.postSlug}`, @@ -769,6 +777,10 @@ export const getRelatedResearchAndWritingForVariable = async ( imageUrl: post.thumbnail, } }) + // the queries above use distinct but because of the information we pull in if the same piece of research + // uses different charts that all use a single indicator we would get duplicates for the post to link to so + // here we deduplicate by url. The first item is retained by uniqBy, latter ones are discarded. + return uniqBy(allSortedRelatedResearch, "url") } export const getRelatedArticles = async ( From 6cd3a2d3c374f06f517f4b8cdad33276a0e0f0d2 Mon Sep 17 00:00:00 2001 From: danyx23 Date: Wed, 25 Oct 2023 12:46:01 +0200 Subject: [PATCH 10/25] =?UTF-8?q?=F0=9F=A4=96=20style:=20prettify=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 10d972a1136..79dec72f133 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,9 +13,6 @@ "skipFiles": [ "/**" ], - "skipFiles": [ - "/**" - ], "type": "node" }, { @@ -28,10 +25,6 @@ "${fileBasenameNoExtension}.js", "--watch" ], - "args": [ - "${fileBasenameNoExtension}.js", - "--watch" - ], "console": "integratedTerminal" // "internalConsoleOptions": "neverOpen" }, From e9738a9607551ce43e0f191e0b2f64f7a387ba97 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:01 +0200 Subject: [PATCH 11/25] :honeybee: fix lint issues --- db/model/PostLink.ts | 3 ++- db/syncPostsToGrapher.ts | 12 ++---------- site/DataPageV2Content.tsx | 6 +----- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/db/model/PostLink.ts 
b/db/model/PostLink.ts index 565c9995a5c..35db9cb5d3c 100644 --- a/db/model/PostLink.ts +++ b/db/model/PostLink.ts @@ -1,6 +1,7 @@ import { Entity, PrimaryGeneratedColumn, Column, BaseEntity } from "typeorm" import { formatUrls } from "../../site/formatting.js" -import { getLinkType, Url, getUrlTarget } from "@ourworldindata/utils" +import { Url } from "@ourworldindata/utils" +import { getLinkType, getUrlTarget } from "@ourworldindata/components" @Entity("posts_links") export class PostLink extends BaseEntity { diff --git a/db/syncPostsToGrapher.ts b/db/syncPostsToGrapher.ts index d349dcf4833..f531fa97a2e 100644 --- a/db/syncPostsToGrapher.ts +++ b/db/syncPostsToGrapher.ts @@ -3,15 +3,7 @@ import * as wpdb from "./wpdb.js" import * as db from "./db.js" -import { - chunk, - differenceOfSets, - excludeNullish, - groupBy, - keyBy, - maxBy, - PostRow, -} from "@ourworldindata/utils" +import { excludeNullish, groupBy, keyBy, PostRow } from "@ourworldindata/utils" import { postsTable, select } from "./model/Post.js" import { PostLink } from "./model/PostLink.js" import { dataSource } from "./dataSource.js" @@ -112,7 +104,7 @@ export async function buildReusableBlocksResolver(): Promise +export const postLinkCompareStringGenerator = (item: PostLink): string => `${item.linkType} - ${item.target} - ${item.hash} - ${item.queryString}` export function getLinksToAddAndRemoveForPost( diff --git a/site/DataPageV2Content.tsx b/site/DataPageV2Content.tsx index 1461fc246d8..fba577c4d46 100644 --- a/site/DataPageV2Content.tsx +++ b/site/DataPageV2Content.tsx @@ -24,12 +24,8 @@ import { AttachmentsContext, DocumentContext } from "./gdocs/OwidGdoc.js" import StickyNav from "./blocks/StickyNav.js" import cx from "classnames" import { DebugProvider } from "./gdocs/DebugContext.js" -import Image from "./gdocs/Image.js" import dayjs from "dayjs" -import { - IMAGE_HOSTING_BUCKET_SUBFOLDER_PATH, - IMAGE_HOSTING_CDN_URL, -} from "../settings/clientSettings.js" +import { IMAGE_HOSTING_CDN_URL 
} from "../settings/clientSettings.js" declare global { interface Window { _OWID_DATAPAGEV2_PROPS: DataPageV2ContentFields From 50c0f71de5cbaf3f5f90085d63f1922325ffd14d Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:01 +0200 Subject: [PATCH 12/25] :hammer: add tooling to get pageview data into local mysql --- Makefile | 31 +++++++++++-------- db/refreshPageviewsFromDatasette.ts | 47 +++++++++++++++++++++++++++++ package.json | 1 + 3 files changed, 66 insertions(+), 13 deletions(-) create mode 100644 db/refreshPageviewsFromDatasette.ts diff --git a/Makefile b/Makefile index 315e584f7d9..badc7844aa5 100644 --- a/Makefile +++ b/Makefile @@ -24,23 +24,24 @@ help: @echo 'Available commands:' @echo @echo ' GRAPHER ONLY' - @echo ' make up start dev environment via docker-compose and tmux' - @echo ' make down stop any services still running' - @echo ' make refresh (while up) download a new grapher snapshot and update MySQL' - @echo ' make migrate (while up) run any outstanding db migrations' - @echo ' make test run full suite (except db tests) of CI checks including unit tests' - @echo ' make dbtest run db test suite that needs a running mysql db' - @echo ' make svgtest compare current rendering against reference SVGs' + @echo ' make up start dev environment via docker-compose and tmux' + @echo ' make down stop any services still running' + @echo ' make refresh (while up) download a new grapher snapshot and update MySQL' + @echo ' make refresh.pageviews (while up) download and load pageviews from the private datasette instance' + @echo ' make migrate (while up) run any outstanding db migrations' + @echo ' make test run full suite (except db tests) of CI checks including unit tests' + @echo ' make dbtest run db test suite that needs a running mysql db' + @echo ' make svgtest compare current rendering against reference SVGs' @echo @echo ' GRAPHER + WORDPRESS (staff-only)' - @echo ' make up.full start dev environment via docker-compose and tmux' - 
@echo ' make down.full stop any services still running' - @echo ' make refresh.wp download a new wordpress snapshot and update MySQL' - @echo ' make refresh.full do a full MySQL update of both wordpress and grapher' + @echo ' make up.full start dev environment via docker-compose and tmux' + @echo ' make down.full stop any services still running' + @echo ' make refresh.wp download a new wordpress snapshot and update MySQL' + @echo ' make refresh.full do a full MySQL update of both wordpress and grapher' @echo @echo ' OPS (staff-only)' - @echo ' make deploy Deploy your local site to production' - @echo ' make stage Deploy your local site to staging' + @echo ' make deploy Deploy your local site to production' + @echo ' make stage Deploy your local site to staging' @echo up: export DEBUG = 'knex:query' @@ -136,6 +137,10 @@ refresh: @echo '==> Updating grapher database' @. ./.env && DATA_FOLDER=tmp-downloads ./devTools/docker/refresh-grapher-data.sh +refresh.pageviews: + @echo '==> Refreshing pageviews' + yarn && yarn buildTsc && yarn refreshPageviews + refresh.wp: @echo '==> Downloading wordpress data' ./devTools/docker/download-wordpress-mysql.sh diff --git a/db/refreshPageviewsFromDatasette.ts b/db/refreshPageviewsFromDatasette.ts new file mode 100644 index 00000000000..80bd6f55e26 --- /dev/null +++ b/db/refreshPageviewsFromDatasette.ts @@ -0,0 +1,47 @@ +// index.ts +import fetch from "node-fetch" +import Papa from "papaparse" +import * as db from "./db.js" + +async function downloadAndInsertCSV(): Promise { + const csvUrl = "http://datasette-private/owid/pageviews.csv?_size=max" + const response = await fetch(csvUrl) + + if (!response.ok) { + throw new Error(`Failed to fetch CSV: ${response.statusText}`) + } + + const csvText = await response.text() + const parsedData = Papa.parse(csvText, { + header: true, + }) + + if (parsedData.errors.length > 1) { + console.error("Errors while parsing CSV:", parsedData.errors) + return + } + + const onlyValidRows = 
[...parsedData.data].filter( + (row) => Object.keys(row as any).length === 5 + ) as any[] + + console.log("Parsed CSV data:", onlyValidRows.length, "rows") + console.log("Columns:", parsedData.meta.fields) + + await db.knexRaw("TRUNCATE TABLE pageviews") + + await db.knexInstance().batchInsert("pageviews", onlyValidRows) + console.log("CSV data inserted successfully!") +} + +const main = async (): Promise => { + try { + await downloadAndInsertCSV() + } catch (e) { + console.error(e) + } finally { + await db.closeTypeOrmAndKnexConnections() + } +} + +main() diff --git a/package.json b/package.json index 2c1bdf9e2b8..07e49461fdf 100644 --- a/package.json +++ b/package.json @@ -26,6 +26,7 @@ "fixPrettierAll": "yarn prettier --write \"**/*.{tsx,ts,jsx,js,json,md,html,css,scss,yml}\"", "runRegionsUpdater": "node --enable-source-maps ./itsJustJavascript/devTools/regionsUpdater/update.js", "runDbMigrations": "yarn typeorm migration:run -d itsJustJavascript/db/dataSource.js", + "refreshPageviews": "node --enable-source-maps ./itsJustJavascript/db/refreshPageviewsFromDatasette.js", "revertLastDbMigration": "yarn typeorm migration:revert -d itsJustJavascript/db/dataSource.js", "runPostUpdateHook": "node --enable-source-maps ./itsJustJavascript/baker/postUpdatedHook.js", "startAdminServer": "node --enable-source-maps ./itsJustJavascript/adminSiteServer/app.js", From f7a695c1acd9638535312d739550ae90cab7a101 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 25 Oct 2023 12:46:01 +0200 Subject: [PATCH 13/25] :hammer: make sure pageviews as 0 and not null --- db/wpdb.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/wpdb.ts b/db/wpdb.ts index 606c2b78843..b55fc06e74a 100644 --- a/db/wpdb.ts +++ b/db/wpdb.ts @@ -690,7 +690,7 @@ export const getRelatedResearchAndWritingForVariable = async ( coalesce(csr.chart_id, c.id) as chartId, p.authors as authors, '' as thumbnail, -- TODO: add thumbnail once we have it - pv.views_365d as pageviews, + 
coalesce(pv.views_365d, 0) as pageviews, 'wordpress' as post_source from posts_links pl @@ -729,7 +729,7 @@ export const getRelatedResearchAndWritingForVariable = async ( coalesce(csr.chart_id, c.id) as chartId, p.content ->> '$.authors' as authors, p.content ->> '$."featured-image"' as thumbnail, - pv.views_365d as pageviews, + coalesce(pv.views_365d, 0) as pageviews, 'gdocs' as post_source from posts_gdocs_links pl From 7ab6aafc9eb2bb8ac70245cc9943b6e1beb0c1cc Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Tue, 31 Oct 2023 16:56:11 +0100 Subject: [PATCH 14/25] :sparkles: use thumbnails for wp posts --- db/wpdb.ts | 2 +- site/DataPageV2Content.tsx | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/db/wpdb.ts b/db/wpdb.ts index b55fc06e74a..c6629a6a0d5 100644 --- a/db/wpdb.ts +++ b/db/wpdb.ts @@ -689,7 +689,7 @@ export const getRelatedResearchAndWritingForVariable = async ( p.slug as postSlug, coalesce(csr.chart_id, c.id) as chartId, p.authors as authors, - '' as thumbnail, -- TODO: add thumbnail once we have it + p.featured_image as thumbnail, coalesce(pv.views_365d, 0) as pageviews, 'wordpress' as post_source from diff --git a/site/DataPageV2Content.tsx b/site/DataPageV2Content.tsx index fba577c4d46..714a1046bd6 100644 --- a/site/DataPageV2Content.tsx +++ b/site/DataPageV2Content.tsx @@ -304,9 +304,16 @@ export const DataPageV2Content = ({ */} {/* // TODO: switch this to use the Image component and put the required information for the thumbnails into hte attachment context or similar */} From b726d7fcb4d24f8c578b8bfaf0ebb584ad5e79da Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 1 Nov 2023 12:36:45 +0100 Subject: [PATCH 15/25] :hammer: add tags to content that is retrieved --- db/wpdb.ts | 25 +++- .../@ourworldindata/utils/src/owidTypes.ts | 1 + site/DataPageV2Content.tsx | 114 ++++++++++-------- 3 files changed, 85 insertions(+), 55 deletions(-) diff --git a/db/wpdb.ts b/db/wpdb.ts index c6629a6a0d5..9e94091ff0a 
100644 --- a/db/wpdb.ts +++ b/db/wpdb.ts @@ -674,6 +674,7 @@ interface RelatedResearchQueryResult { thumbnail: string pageviews: number post_source: string + tags: string } export const getRelatedResearchAndWritingForVariable = async ( variableId: number @@ -691,7 +692,12 @@ export const getRelatedResearchAndWritingForVariable = async ( p.authors as authors, p.featured_image as thumbnail, coalesce(pv.views_365d, 0) as pageviews, - 'wordpress' as post_source + 'wordpress' as post_source, + (select JSON_ARRAYAGG(t.name) + from post_tags pt + join tags t on pt.tag_id = t.id + where pt.post_id = p.id + ) as tags from posts_links pl join posts p on @@ -706,6 +712,8 @@ export const getRelatedResearchAndWritingForVariable = async ( pv.url = concat('https://ourworldindata.org/', p.slug ) left join posts_gdocs pg on pg.id = p.gdocSuccessorId + left join post_tags pt on + pt.post_id = p.id where pl.linkType = 'grapher' and componentType = 'src' -- this filters out links in tags and keeps only embedded charts @@ -730,7 +738,12 @@ export const getRelatedResearchAndWritingForVariable = async ( p.content ->> '$.authors' as authors, p.content ->> '$."featured-image"' as thumbnail, coalesce(pv.views_365d, 0) as pageviews, - 'gdocs' as post_source + 'gdocs' as post_source, + (select JSON_ARRAYAGG(t.name) + from posts_gdocs_x_tags pt + join tags t on pt.tagId = t.id + where pt.gdocId = p.id + ) as tags from posts_gdocs_links pl join posts_gdocs p on @@ -743,12 +756,15 @@ export const getRelatedResearchAndWritingForVariable = async ( cd.chartId = c.id left join pageviews pv on pv.url = concat('https://ourworldindata.org/', p.slug ) + left join posts_gdocs_x_tags pt on + pt.gdocId = p.id where pl.linkType = 'grapher' and componentType = 'chart' -- this filters out links in tags and keeps only embedded charts and cd.variableId = ? 
and cd.property in ('x', 'y') -- ignore cases where the indicator is size, color etc - and p.published = 1`, + and p.published = 1 + and p.content ->> '$.type' != 'fragment'`, [variableId] ) @@ -769,12 +785,15 @@ export const getRelatedResearchAndWritingForVariable = async ( (author: any) => author.author ) } else authors = parsedAuthors + const parsedTags = post.tags !== "" ? JSON.parse(post.tags) : [] + return { title: post.title, url: `/${post.postSlug}`, variantName: "", authors, imageUrl: post.thumbnail, + tags: parsedTags, } }) // the queries above use distinct but because of the information we pull in if the same piece of research diff --git a/packages/@ourworldindata/utils/src/owidTypes.ts b/packages/@ourworldindata/utils/src/owidTypes.ts index f0c9488dc49..99a6205a4c4 100644 --- a/packages/@ourworldindata/utils/src/owidTypes.ts +++ b/packages/@ourworldindata/utils/src/owidTypes.ts @@ -1606,6 +1606,7 @@ export interface DataPageRelatedResearch { url: string authors: string[] imageUrl: string + tags: string[] } export interface DataPageRelatedData { diff --git a/site/DataPageV2Content.tsx b/site/DataPageV2Content.tsx index 714a1046bd6..7111936d080 100644 --- a/site/DataPageV2Content.tsx +++ b/site/DataPageV2Content.tsx @@ -191,6 +191,19 @@ export const DataPageV2Content = ({ ? `“Data Page: ${datapageData.title}”, part of the following publication: ${datapageData.primaryTopic.citation}. Data adapted from ${producers}. Retrieved from ${canonicalUrl} [online resource]` : `“Data Page: ${datapageData.title}”. Our World in Data (${currentYear}). Data adapted from ${producers}. Retrieved from ${canonicalUrl} [online resource]` + const relatedResearchCandidates = datapageData.relatedResearch + const relatedResearch = + relatedResearchCandidates.length > 10 //&& + ? //datapageData.primaryTopic?.topicTag + relatedResearchCandidates.filter((research) => + research.tags.includes( + datapageData.primaryTopic?.topicTag ?? 
"Economic Growth" + ) + ) + : relatedResearchCandidates + // TODO: if there are more than 10 pages and the data page has topic tags, only show then ones that have overlap + // TODO: mark topic pages + return (
- {datapageData.relatedResearch && - datapageData.relatedResearch.length > 0 && ( -
-

- Related research and writing -

-
- {datapageData.relatedResearch.map( - (research) => ( - - {/*
+ {relatedResearch && relatedResearch.length > 0 && ( +
+

+ Related research and writing +

+
+ {/* // TODO: switch this to use the Image component and put the required information for the thumbnails into hte attachment context or similar */} + +
+

+ {research.title} +

+
+ {research.authors && + research.authors + .length && + formatAuthors({ + authors: + research.authors, + })} +
+
+ + ))}
- )} +
+ )} {!!datapageData.relatedData?.length && (

Date: Thu, 2 Nov 2023 12:18:31 +0100 Subject: [PATCH 16/25] : hammer: incorporate tags when matching related research --- db/wpdb.ts | 4 ++-- site/DataPageV2Content.tsx | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/db/wpdb.ts b/db/wpdb.ts index 9e94091ff0a..3b9c7c96da0 100644 --- a/db/wpdb.ts +++ b/db/wpdb.ts @@ -693,7 +693,7 @@ export const getRelatedResearchAndWritingForVariable = async ( p.featured_image as thumbnail, coalesce(pv.views_365d, 0) as pageviews, 'wordpress' as post_source, - (select JSON_ARRAYAGG(t.name) + (select coalesce(JSON_ARRAYAGG(t.name), JSON_ARRAY()) from post_tags pt join tags t on pt.tag_id = t.id where pt.post_id = p.id @@ -739,7 +739,7 @@ export const getRelatedResearchAndWritingForVariable = async ( p.content ->> '$."featured-image"' as thumbnail, coalesce(pv.views_365d, 0) as pageviews, 'gdocs' as post_source, - (select JSON_ARRAYAGG(t.name) + (select coalesce(JSON_ARRAYAGG(t.name), JSON_ARRAY()) from posts_gdocs_x_tags pt join tags t on pt.tagId = t.id where pt.gdocId = p.id diff --git a/site/DataPageV2Content.tsx b/site/DataPageV2Content.tsx index 7111936d080..3c3c5f1daf9 100644 --- a/site/DataPageV2Content.tsx +++ b/site/DataPageV2Content.tsx @@ -19,6 +19,7 @@ import { uniq, pick, formatAuthors, + intersection, } from "@ourworldindata/utils" import { AttachmentsContext, DocumentContext } from "./gdocs/OwidGdoc.js" import StickyNav from "./blocks/StickyNav.js" @@ -193,15 +194,12 @@ export const DataPageV2Content = ({ const relatedResearchCandidates = datapageData.relatedResearch const relatedResearch = - relatedResearchCandidates.length > 10 //&& - ? //datapageData.primaryTopic?.topicTag - relatedResearchCandidates.filter((research) => - research.tags.includes( - datapageData.primaryTopic?.topicTag ?? "Economic Growth" - ) + relatedResearchCandidates.length > 10 && + datapageData.topicTagsLinks?.length + ? 
relatedResearchCandidates.filter((research) => + intersection([research.tags, datapageData.topicTagsLinks]) ) : relatedResearchCandidates - // TODO: if there are more than 10 pages and the data page has topic tags, only show then ones that have overlap // TODO: mark topic pages return ( From a8e8f74505cf27733e23944698a9d0fd9cc8dc49 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Thu, 2 Nov 2023 13:02:34 +0100 Subject: [PATCH 17/25] :honeybee: fix accidental commits in launch.json --- .eslintignore | 1 + .vscode/launch.json | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.eslintignore b/.eslintignore index 9e84750c1e7..4667d8bf136 100644 --- a/.eslintignore +++ b/.eslintignore @@ -15,3 +15,4 @@ wordpress/web/wp/wp-content/** wordpress/vendor/** packages/@ourworldindata/*/dist/ dist/ +.vscode/ diff --git a/.vscode/launch.json b/.vscode/launch.json index 79dec72f133..25ccd948758 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,6 +13,9 @@ "skipFiles": [ "/**" ], + "skipFiles": [ + "/**" + ], "type": "node" }, { @@ -25,6 +28,10 @@ "${fileBasenameNoExtension}.js", "--watch" ], + "args": [ + "${fileBasenameNoExtension}.js", + "--watch" + ], "console": "integratedTerminal" // "internalConsoleOptions": "neverOpen" }, @@ -70,7 +77,7 @@ "skipFiles": [ "/**" ], - "type": "node" + "type": "node", }, { "name": "Run SVGTester", @@ -79,18 +86,24 @@ "skipFiles": [ "/**" ], + "skipFiles": [ + "/**" + ], "type": "node", "args": [ "-g", "367" ] + "args": [ + "-g", + "367" + ] }, { "name": "Launch admin server", "program": "${workspaceFolder}/itsJustJavascript/adminSiteServer/app.js", "request": "launch", "type": "node", - "runtimeExecutable": "/home/daniel/.local/share/fnm/node-versions/v18.16.0/installation/bin/node" }, { "name": "Attach to node", From 796f8c41e59a62c001feac7e375fd83da7a9e89d Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Thu, 2 Nov 2023 17:09:24 +0100 Subject: [PATCH 18/25] :hammer: fix filter query --- 
site/DataPageV2Content.tsx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/site/DataPageV2Content.tsx b/site/DataPageV2Content.tsx index 3c3c5f1daf9..f2a4b317254 100644 --- a/site/DataPageV2Content.tsx +++ b/site/DataPageV2Content.tsx @@ -196,9 +196,13 @@ export const DataPageV2Content = ({ const relatedResearch = relatedResearchCandidates.length > 10 && datapageData.topicTagsLinks?.length - ? relatedResearchCandidates.filter((research) => - intersection([research.tags, datapageData.topicTagsLinks]) - ) + ? relatedResearchCandidates.filter((research) => { + const shared = intersection( + research.tags, + datapageData.topicTagsLinks ?? [] + ) + return shared.length > 0 + }) : relatedResearchCandidates // TODO: mark topic pages From f640483160c29b3c6a922ac8bbcf2df0bc31b506 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Thu, 2 Nov 2023 17:23:27 +0100 Subject: [PATCH 19/25] :hammer: fix page title fallback to chart tile --- baker/GrapherBaker.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baker/GrapherBaker.tsx b/baker/GrapherBaker.tsx index 8179f02b0c5..10d9c6c6411 100644 --- a/baker/GrapherBaker.tsx +++ b/baker/GrapherBaker.tsx @@ -228,7 +228,7 @@ export async function renderDataPageV2({ } const datapageData = await getDatapageDataV2( variableMetadata, - grapherConfigForVariable ?? {} + grapher ?? 
{} ) const firstTopicTag = datapageData.topicTagsLinks?.[0] From 99dc5baf3467b9150745f8674acc4bdb54399978 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Mon, 6 Nov 2023 18:35:08 +0100 Subject: [PATCH 20/25] :bug: fix url not showing up in citation --- site/DataPageV2.tsx | 1 + 1 file changed, 1 insertion(+) diff --git a/site/DataPageV2.tsx b/site/DataPageV2.tsx index 2c2d3a16afa..3cb35a6db4a 100644 --- a/site/DataPageV2.tsx +++ b/site/DataPageV2.tsx @@ -135,6 +135,7 @@ export const DataPageV2 = (props: { { datapageData, faqEntries, + canonicalUrl, } )}`, }} From be2a07f4eb3ebed7e71be9db45015711141288db Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Mon, 6 Nov 2023 18:48:26 +0100 Subject: [PATCH 21/25] :hammer: hide charts thumbnails in all charts block for single charts --- site/blocks/RelatedCharts.tsx | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/site/blocks/RelatedCharts.tsx b/site/blocks/RelatedCharts.tsx index 62faa6618de..2c49649789a 100644 --- a/site/blocks/RelatedCharts.tsx +++ b/site/blocks/RelatedCharts.tsx @@ -42,7 +42,24 @@ export const RelatedCharts = ({ useEmbedChart(activeChartIdx, refChartContainer) - return ( + const singleChartView = ( +
+
+
+
+
+
+
+ ) + + const multipleChartsView = (
@@ -91,6 +108,8 @@ export const RelatedCharts = ({
) + + return charts.length === 1 ? singleChartView : multipleChartsView } export const runRelatedCharts = (charts: RelatedChart[]) => { From a07aa340a8b6f2a1dc203835928c54b20cae93fb Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Tue, 7 Nov 2023 11:32:02 +0100 Subject: [PATCH 22/25] :hammer: hard code link redirects from country templates to selector --- site/DataPageV2Content.tsx | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/site/DataPageV2Content.tsx b/site/DataPageV2Content.tsx index f2a4b317254..bd8171d6523 100644 --- a/site/DataPageV2Content.tsx +++ b/site/DataPageV2Content.tsx @@ -204,6 +204,17 @@ export const DataPageV2Content = ({ return shared.length > 0 }) : relatedResearchCandidates + for (const item of relatedResearch) { + // TODO: these are workarounds to not link to the (not really existing) template pages for energy or co2 + // country profiles but instead to the topic page at the country selector. + if (item.url === "/co2-country-profile") + item.url = + "/co2-and-greenhouse-gas-emissions#co2-and-greenhouse-gas-emissions-country-profiles" + else if (item.url === "/energy-country-profile") + item.url = "/energy#country-profiles" + else if (item.url === "/coronavirus-country-profile") + item.url = "/coronavirus#coronavirus-country-profiles" + } // TODO: mark topic pages return ( From 78c20b5225db3c3602957e8124e935829b4d4b56 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Thu, 16 Nov 2023 15:15:59 +0100 Subject: [PATCH 23/25] Simplify find postlink Co-authored-by: Marcel Gerber --- db/syncPostsToGrapher.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/syncPostsToGrapher.ts b/db/syncPostsToGrapher.ts index f531fa97a2e..80bc2c96089 100644 --- a/db/syncPostsToGrapher.ts +++ b/db/syncPostsToGrapher.ts @@ -299,7 +299,7 @@ const syncPostsToGrapher = async (): Promise => { featured_image: post.featured_image || "", } }) as PostRow[] - const postLinks = await dataSource.getRepository(PostLink).find() + const 
postLinks = await PostLink.find() const postLinksById = groupBy(postLinks, (link) => link.sourceId) const linksToAdd: PostLink[] = [] From b617889a7d417545a518be4240dc8bca21331153 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Thu, 16 Nov 2023 15:47:18 +0000 Subject: [PATCH 24/25] =?UTF-8?q?=F0=9F=94=A8incorporate=20feedback?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- db/refreshPageviewsFromDatasette.ts | 4 +++- db/syncPostsToGrapher.ts | 6 +++--- db/wpdb.ts | 29 ++++++++++++++++++++++++----- site/DataPageV2Content.tsx | 2 +- 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/db/refreshPageviewsFromDatasette.ts b/db/refreshPageviewsFromDatasette.ts index 80bd6f55e26..e46145ac650 100644 --- a/db/refreshPageviewsFromDatasette.ts +++ b/db/refreshPageviewsFromDatasette.ts @@ -8,7 +8,9 @@ async function downloadAndInsertCSV(): Promise { const response = await fetch(csvUrl) if (!response.ok) { - throw new Error(`Failed to fetch CSV: ${response.statusText}`) + throw new Error( + `Failed to fetch CSV: ${response.statusText} from ${csvUrl}` + ) } const csvText = await response.text() diff --git a/db/syncPostsToGrapher.ts b/db/syncPostsToGrapher.ts index 80bc2c96089..521ece3fc1c 100644 --- a/db/syncPostsToGrapher.ts +++ b/db/syncPostsToGrapher.ts @@ -151,7 +151,7 @@ export function getLinksToAddAndRemoveForPost( : undefined ) ) - const linksInDocument = groupBy( + const linksInDocument = keyBy( [ ...allHrefs.map((link) => PostLink.createFromUrl(link)), ...allSrcs.map((link) => PostLink.createFromUrl(link)), @@ -165,11 +165,11 @@ export function getLinksToAddAndRemoveForPost( // This is doing a set difference, but we want to do the set operation on a subset // of fields (the ones we stringify into the compare key) while retaining the full - // object so that we can e.g. delete efficiently by id later on + // object so that we can e.g. delete efficiently by id later on. 
for (const [linkInDocCompareKey, linkInDoc] of Object.entries( linksInDocument )) - if (!(linkInDocCompareKey in linksInDb)) linksToAdd.push(...linkInDoc) + if (!(linkInDocCompareKey in linksInDb)) linksToAdd.push(linkInDoc) for (const [linkInDbCompareKey, linkInDb] of Object.entries(linksInDb)) if (!(linkInDbCompareKey in linksInDocument)) linksToDelete.push(...linkInDb) diff --git a/db/wpdb.ts b/db/wpdb.ts index 3b9c7c96da0..d57b0a7171e 100644 --- a/db/wpdb.ts +++ b/db/wpdb.ts @@ -681,11 +681,16 @@ export const getRelatedResearchAndWritingForVariable = async ( ): Promise => { const wp_posts: RelatedResearchQueryResult[] = await db.queryMysql( `-- sql + -- What we want here is to get from the variable to the charts + -- to the posts and collect different pieces of information along the way + -- One important complication is that the slugs that are used in posts to + -- embed charts can either be the current slugs or old slugs that are redirected + -- now. select distinct pl.target as linkTargetSlug, pl.componentType as componentType, - c.slug as chartSlug, + coalesce(charts_via_redirects.slug, c.slug) as chartSlug, p.title as title, p.slug as postSlug, coalesce(csr.chart_id, c.id) as chartId, @@ -706,21 +711,35 @@ export const getRelatedResearchAndWritingForVariable = async ( pl.target = c.slug left join chart_slug_redirects csr on pl.target = csr.slug + left join charts charts_via_redirects on + charts_via_redirects.id = csr.chart_id left join chart_dimensions cd on - cd.chartId = c.id + cd.chartId = coalesce(csr.chart_id, c.id) left join pageviews pv on pv.url = concat('https://ourworldindata.org/', p.slug ) left join posts_gdocs pg on pg.id = p.gdocSuccessorId + left join posts_gdocs pgs on + pgs.slug = p.slug left join post_tags pt on pt.post_id = p.id where + -- we want only urls that point to grapher charts pl.linkType = 'grapher' - and componentType = 'src' -- this filters out links in tags and keeps only embedded charts + -- componentType src is for those 
links that matched the anySrcregex (not anyHrefRegex or prominentLinkRegex) + -- this means that only the links that are of the iframe kind will be kept - normal a href style links will + -- be disregarded + and componentType = 'src' and cd.variableId = ? and cd.property in ('x', 'y') -- ignore cases where the indicator is size, color etc - and p.status = 'publish' -- only use published wp charts - and coalesce(pg.published, 0) = 0 -- if the wp post has a published gdoc successor then ignore it + and p.status = 'publish' -- only use published wp posts + and coalesce(pg.published, 0) = 0 -- ignore posts if the wp post has a published gdoc successor. The + -- coalesce makes sure that if there is no gdoc successor then + -- the filter keeps the post + and coalesce(pgs.published, 0) = 0 -- ignore posts if there is a gdoc post with the same slug that is published + -- this case happens for example for topic pages that are newly created (successorId is null) + -- but that replace an old wordpress page + `, [variableId] ) diff --git a/site/DataPageV2Content.tsx b/site/DataPageV2Content.tsx index bd8171d6523..82bd3971649 100644 --- a/site/DataPageV2Content.tsx +++ b/site/DataPageV2Content.tsx @@ -194,7 +194,7 @@ export const DataPageV2Content = ({ const relatedResearchCandidates = datapageData.relatedResearch const relatedResearch = - relatedResearchCandidates.length > 10 && + relatedResearchCandidates.length > 3 && datapageData.topicTagsLinks?.length ? 
relatedResearchCandidates.filter((research) => { const shared = intersection( From 2943fbe67d6167d4b2f61f5a18359fed1c11b1d1 Mon Sep 17 00:00:00 2001 From: sophiamersmann Date: Fri, 24 Nov 2023 09:06:54 +0000 Subject: [PATCH 25/25] :lipstick: (lint) remove unused variable --- db/syncPostsToGrapher.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/db/syncPostsToGrapher.ts b/db/syncPostsToGrapher.ts index 521ece3fc1c..7c8ad2c4b7c 100644 --- a/db/syncPostsToGrapher.ts +++ b/db/syncPostsToGrapher.ts @@ -6,7 +6,6 @@ import * as db from "./db.js" import { excludeNullish, groupBy, keyBy, PostRow } from "@ourworldindata/utils" import { postsTable, select } from "./model/Post.js" import { PostLink } from "./model/PostLink.js" -import { dataSource } from "./dataSource.js" const zeroDateString = "0000-00-00 00:00:00"