-
-
Notifications
You must be signed in to change notification settings - Fork 229
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
🎉 Add related research and writing via content graph to data pages (#…
…2739) This PR implements #2379. It adds the missing link in our db from wordpress posts to charts that are used there. It then uses this new posts_links table together with the existing posts_gdocs_links table to find the related writing for a data page by going from indicator id -> charts using this indicator -> articles using these charts. The posts_links table was modelled on the posts_gdocs_links table as I thought that uniformity is more important than the optimal layout here. Extracting the links is a bit crudely done ATM in that it just uses regexes on the raw html instead of parsing the html and querying for `a` tags. The latter would give us the text content of the element that establishes the link, which is probably often useful, but it would complicate and slow down the script. I'd like to hear your opinions on whether this should switch to proper parsing and filling richer information into the DB. The thumbnail rendering is also a bit ad-hoc. We have an Image component but that one is built for use in gdocs and we need to show thumbnails for both WP posts and Gdocs articles. To rank related research and writing we use the pageviews table. This is empty by default in dev environments and so this PR adds a make command to refresh pageviews (fetched from datasette-private) - [ ] ❗ after merging this to production, run the db/syncPostsToGrapher.js script to fill the new relationship table!
- Loading branch information
Showing
15 changed files
with
586 additions
and
58 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,3 +15,4 @@ wordpress/web/wp/wp-content/** | |
wordpress/vendor/** | ||
packages/@ourworldindata/*/dist/ | ||
dist/ | ||
.vscode/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import { MigrationInterface, QueryRunner } from "typeorm" | ||
|
||
/**
 * Migration that creates (and on revert drops) the `posts_links` table.
 *
 * Each row describes one link found in a post: the post it came from
 * (`sourceId`, a foreign key to `posts.id`), the link `target`, its
 * `linkType`, plus the `componentType`, `text`, `queryString` and `hash`
 * recorded for the link.
 */
export class AddPostsLinks1692042923850 implements MigrationInterface {
    /**
     * Creates the `posts_links` table.
     *
     * The query is awaited so that the migration only resolves once the
     * statement has finished, and so that a failing statement rejects this
     * method instead of surfacing as an unhandled promise rejection.
     */
    public async up(queryRunner: QueryRunner): Promise<void> {
        await queryRunner.query(`-- sql
        CREATE TABLE posts_links (
            id int NOT NULL AUTO_INCREMENT,
            sourceId int NOT NULL,
            target varchar(2047) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs NOT NULL,
            linkType enum('url','grapher','explorer', 'gdoc') CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs DEFAULT NULL,
            componentType varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs NOT NULL,
            text varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_as_cs NOT NULL,
            queryString varchar(2047) COLLATE utf8mb4_0900_as_cs NOT NULL,
            hash varchar(2047) COLLATE utf8mb4_0900_as_cs NOT NULL,
            PRIMARY KEY (id),
            KEY sourceId (sourceId),
            CONSTRAINT posts_links_ibfk_1 FOREIGN KEY (sourceId) REFERENCES posts (id)
        ) ENGINE=InnoDB;`)
    }

    /**
     * Reverts the migration by dropping `posts_links` if it exists.
     * Awaited for the same reason as in `up`.
     */
    public async down(queryRunner: QueryRunner): Promise<void> {
        await queryRunner.query(`-- sql
        DROP TABLE IF EXISTS posts_links;
        `)
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
import { Entity, PrimaryGeneratedColumn, Column, BaseEntity } from "typeorm" | ||
import { formatUrls } from "../../site/formatting.js" | ||
import { Url } from "@ourworldindata/utils" | ||
import { getLinkType, getUrlTarget } from "@ourworldindata/components" | ||
|
||
/**
 * TypeORM entity mapped to the `posts_links` table: one row per link that
 * was found in a post.
 */
@Entity("posts_links")
export class PostLink extends BaseEntity {
    @PrimaryGeneratedColumn()
    id!: number

    // TODO: posts is not a TypeORM but a Knex class so we can't use a TypeORM relationship here yet
    @Column({ type: "int", nullable: false })
    sourceId!: number

    @Column()
    linkType!: "gdoc" | "url" | "grapher" | "explorer"

    @Column()
    target!: string

    @Column()
    queryString!: string

    @Column()
    hash!: string

    @Column()
    componentType!: string

    @Column()
    text!: string

    /**
     * Builds a `PostLink` for a URL found in the post with id `sourceId`.
     *
     * The URL is first passed through `formatUrls`; the link type, target,
     * query string and hash are all derived from that formatted URL. This
     * method only constructs the entity via `PostLink.create` and does not
     * call `save` itself.
     *
     * @param params.url - the raw URL as found in the post
     * @param params.sourceId - id of the post the link was found in
     * @param params.text - text associated with the link; defaults to ""
     * @param params.componentType - component the link came from; defaults to ""
     * @returns the newly created `PostLink` instance
     */
    static createFromUrl(params: {
        url: string
        sourceId: number
        text?: string
        componentType?: string
    }): PostLink {
        const { url, sourceId, text = "", componentType = "" } = params

        const normalizedUrl = formatUrls(url)
        const parsedUrl = Url.fromURL(normalizedUrl)
        const detectedLinkType = getLinkType(normalizedUrl)
        const urlTarget = getUrlTarget(normalizedUrl)

        return PostLink.create({
            target: urlTarget,
            linkType: detectedLinkType,
            queryString: parsedUrl.queryStr,
            hash: parsedUrl.hash,
            sourceId,
            text,
            componentType,
        })
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
// index.ts | ||
import fetch from "node-fetch" | ||
import Papa from "papaparse" | ||
import * as db from "./db.js" | ||
|
||
/**
 * Downloads the pageviews CSV from datasette-private and replaces the
 * contents of the `pageviews` table with it.
 *
 * Steps: fetch the CSV, parse it with a header row, keep only rows that have
 * exactly 5 fields, truncate `pageviews`, then batch-insert the kept rows.
 *
 * @throws Error if the HTTP request fails, if the CSV cannot be parsed
 *   cleanly, or if no valid rows were found. In the latter two cases the
 *   existing table contents are left untouched.
 */
async function downloadAndInsertCSV(): Promise<void> {
    const csvUrl = "http://datasette-private/owid/pageviews.csv?_size=max"
    const response = await fetch(csvUrl)

    if (!response.ok) {
        throw new Error(
            `Failed to fetch CSV: ${response.statusText} from ${csvUrl}`
        )
    }

    const csvText = await response.text()
    const parsedData = Papa.parse<Record<string, string>>(csvText, {
        header: true,
        // A trailing blank line would otherwise be reported as a malformed
        // row; skipping empty lines lets us treat every reported error as real.
        skipEmptyLines: "greedy",
    })

    if (parsedData.errors.length > 0) {
        console.error("Errors while parsing CSV:", parsedData.errors)
        // Throw instead of returning so the caller can tell the refresh failed.
        throw new Error(
            `Failed to parse CSV from ${csvUrl}: ${parsedData.errors.length} error(s)`
        )
    }

    // Keep only rows with exactly 5 fields; anything else is dropped.
    const onlyValidRows = parsedData.data.filter(
        (row) => Object.keys(row).length === 5
    )

    console.log("Parsed CSV data:", onlyValidRows.length, "rows")
    console.log("Columns:", parsedData.meta.fields)

    // Never wipe the existing data when there is nothing to replace it with.
    if (onlyValidRows.length === 0) {
        throw new Error(
            `No valid rows found in CSV from ${csvUrl}; pageviews table left unchanged`
        )
    }

    await db.knexRaw("TRUNCATE TABLE pageviews")

    await db.knexInstance().batchInsert("pageviews", onlyValidRows)
    console.log("CSV data inserted successfully!")
}
|
||
/**
 * Script entry point: refreshes the pageviews table and always closes the
 * database connections afterwards, whether or not the refresh succeeded.
 *
 * Any error from the refresh is logged and turned into a non-zero process
 * exit code, so that whatever invoked the script can detect the failure.
 */
const main = async (): Promise<void> => {
    try {
        await downloadAndInsertCSV()
    } catch (e) {
        console.error(e)
        // Use exitCode rather than process.exit() so the finally block below
        // still gets to close the connections before the process ends.
        process.exitCode = 1
    } finally {
        await db.closeTypeOrmAndKnexConnections()
    }
}

// `void` marks the promise as intentionally not awaited at the top level.
void main()
Oops, something went wrong.