From 8cd125aed94a07ece432e640e1a466485b8992fe Mon Sep 17 00:00:00 2001 From: Carlos Silva Date: Sun, 13 Oct 2024 14:24:03 -0300 Subject: [PATCH] refactor thisweekinreact extractor --- scraper/commands/update.ts | 4 +- scraper/commands/updateUrl.ts | 7 +- scraper/newsletter/swiftnews.ts | 37 +++++++--- scraper/newsletter/thisweekinreact.ts | 100 ++++++++++++++++++-------- 4 files changed, 105 insertions(+), 43 deletions(-) diff --git a/scraper/commands/update.ts b/scraper/commands/update.ts index c205a93..eb9882b 100644 --- a/scraper/commands/update.ts +++ b/scraper/commands/update.ts @@ -1,6 +1,5 @@ import type { Browser } from "happy-dom"; import type { Storage } from "../database.ts"; -import { extractThisWeekInReact } from "../newsletter/thisweekinreact.ts"; import { extractJavascriptlyWeekly } from "../newsletter/javascripweekly/index.ts"; import { extractFrontendFocus, @@ -10,6 +9,7 @@ import { extractReactStatus, extractRubyWeekly, } from "../newsletter/common.ts"; +import { ThisWeekInReact } from "../newsletter/thisweekinreact.ts"; import { PyCoders } from "../newsletter/pycoders.ts"; import { SwiftNews } from "../newsletter/swiftnews.ts"; @@ -23,7 +23,7 @@ export async function update(browser: Browser, storage: Storage) { extractPostgresWeekly(browser, storage), extractReactStatus(browser, storage), extractRubyWeekly(browser, storage), - extractThisWeekInReact(browser, storage), + new ThisWeekInReact(browser, storage).update(), new PyCoders(browser, storage).update(), new SwiftNews(browser, storage).update(), ]); diff --git a/scraper/commands/updateUrl.ts b/scraper/commands/updateUrl.ts index a9550ed..930073b 100644 --- a/scraper/commands/updateUrl.ts +++ b/scraper/commands/updateUrl.ts @@ -5,11 +5,12 @@ import { PyCoders } from "../newsletter/pycoders.ts"; import type { Storage } from "../database.ts"; import { defined } from "../utils.ts"; import { SwiftNews } from "../newsletter/swiftnews.ts"; +import { ThisWeekInReact } from "../newsletter/thisweekinreact.ts"; export async function updateUrl( url: string, browser: Browser, - storage: Storage, + storage: Storage ) { let extractor: InfoExtractor | null = null; @@ -21,6 +22,10 @@ export async function updateUrl( extractor = new SwiftNews(browser, storage); } + if (ThisWeekInReact.canHandle(url)) { + extractor = new ThisWeekInReact(browser, storage); + } + if (!defined(extractor)) { throw new Error(`Extractor not found for url: ${url}`); } diff --git a/scraper/newsletter/swiftnews.ts b/scraper/newsletter/swiftnews.ts index 4fbf494..1d0c357 100644 --- a/scraper/newsletter/swiftnews.ts +++ b/scraper/newsletter/swiftnews.ts @@ -5,12 +5,12 @@ import type { HTMLTimeElement, } from "happy-dom"; import { isValid, parse } from "date-fns"; +import { enUS } from "date-fns/locale"; import type { InfoExtractor } from "./interface.ts"; import type { Storage } from "../database.ts"; import type { InfoContent } from "./javascripweekly/helper.ts"; import { defined, resolveUrl } from "../utils.ts"; -import { enUS } from "date-fns/locale"; export class SwiftNews implements InfoExtractor { browser: Browser; @@ -26,7 +26,23 @@ export class SwiftNews implements InfoExtractor { return url.startsWith(SwiftNews.baseUrl); } - async updateUrl(url: string) {} + async updateUrl(url: string) { + if (!this.db.isSaved(url)) { + throw new Error(`URL does not exist on database: ${url}`); + } + + const page = this.browser.newPage(); + await page.goto(url); + + console.log(`parsing: ${url}`); + const [, content] = await extractIssueContent(page); + + if (content.length) { + this.db.updateUrl(url, content); + } + + page.close(); + } async update() { const page = this.browser.newPage(); @@ -72,8 +88,8 @@ async function* extractIssues(page: BrowserPage) { } async function extractIssueContent( - page: BrowserPage, -): Promise<[string, (InfoContent | null)[]]> { + page: BrowserPage +): Promise<[string, InfoContent[]]> { const timeEl = page.mainFrame.document.querySelector("header time"); const dateStr = (timeEl as HTMLTimeElement).dateTime; const date = parse(dateStr, "yyyy-MM-dd", new Date(), { locale: enUS }); @@ -81,12 +97,12 @@ async function extractIssueContent( const sections = Array.from( page.mainFrame.document.querySelectorAll( - 'section.category:not([class*="sponsor"])', - ), + 'section.category:not([class*="sponsor"])' + ) ); const items = sections.flatMap((it) => - Array.from(it.querySelectorAll(".item.item--issue")), + Array.from(it.querySelectorAll(".item.item--issue")) ); const promisesInfo: Promise[] = []; @@ -105,12 +121,13 @@ async function extractIssueContent( description = `${titleEl?.textContent} - ${description}`; return { link, description }; - })(), + })() ); } - let infoList = await Promise.all(promisesInfo); - infoList = infoList.filter((it) => defined(it)); + const infoList = (await Promise.all(promisesInfo)).filter((it) => + defined(it) + ); console.log(`extracted ${infoList.length} items`); return [dateStr, infoList]; diff --git a/scraper/newsletter/thisweekinreact.ts b/scraper/newsletter/thisweekinreact.ts index 2215315..4e2bcb2 100644 --- a/scraper/newsletter/thisweekinreact.ts +++ b/scraper/newsletter/thisweekinreact.ts @@ -5,48 +5,88 @@ import type { Element, } from "happy-dom"; import type { Storage } from "../database"; -import { extractContentDate } from "./javascripweekly/helper.ts"; +import { + extractContentDate, + type InfoContent, +} from "./javascripweekly/helper.ts"; +import type { InfoExtractor } from "./interface.ts"; + +export class ThisWeekInReact implements InfoExtractor { + browser: Browser; + db: Storage; + static issueUrlPattern = /newsletter\/\d+$/i; + static baseURL = "https://thisweekinreact.com"; + + static canHandle(url: string) { + return ( + url.startsWith(ThisWeekInReact.baseURL) && + ThisWeekInReact.issueUrlPattern.test(url) + ); + } + + constructor(browser: Browser, db: Storage) { + this.browser = browser; + this.db = db; + } + + async updateUrl(url: string) { + if (!ThisWeekInReact.canHandle(url)) { + throw new Error(`Invalid URL: ${url}`); + } + if (!this.db.isSaved(url)) { + throw new Error(`URL does not exist on database: ${url}`); + } -const baseUrl = "https://thisweekinreact.com"; + const page = this.browser.newPage(); + await page.goto(url); -const issueUrlPattern = /newsletter\/\d+$/i; + console.log(`parsing: ${url}`); + const [, content] = extractContent(page); -export async function extractThisWeekInReact(browser: Browser, db: Storage) { - const page = browser.newPage(); + if (content.length) { + this.db.updateUrl(url, content); + } + + page.close(); + } - await page.goto(`${baseUrl}/newsletter`); + async update() { + const page = this.browser.newPage(); - const issues = page.mainFrame.document.querySelectorAll( - `nav[class^="sidebar"] ul[class^="sidebarItemList"] li`, - ); + await page.goto(`${ThisWeekInReact.baseURL}/newsletter`); - for (const issue of issues) { - const url = issue?.querySelector("a")?.href; + const issues = page.mainFrame.document.querySelectorAll( + `nav[class^="sidebar"] ul[class^="sidebarItemList"] li`, + ); - if (!url) throw new Error("failed to extract url"); - if (!issueUrlPattern.test(url)) continue; - if (db.isSaved(url)) continue; - if (url.endsWith("previous")) continue; + for (const issue of issues) { + const url = issue?.querySelector("a")?.href; - try { - await page.goto(url); - console.log(`parsing: ${url}`); + if (!url) throw new Error("failed to extract url"); + if (!ThisWeekInReact.issueUrlPattern.test(url)) continue; + if (this.db.isSaved(url)) continue; + if (url.endsWith("previous")) continue; - const { date, content } = extractContent(page); - db.saveContent("thisweekinreact", { - url, - date, - content, - }); - } catch (err) { - console.log(err); + try { + await page.goto(url); + console.log(`parsing: ${url}`); + + const [date, content] = extractContent(page); + this.db.saveContent("thisweekinreact", { + url, + date, + content, + }); + } catch (err) { + console.log(err); + } } - } - await page.close(); + await page.close(); + } } -function extractContent(page: BrowserPage) { +function extractContent(page: BrowserPage): [string, InfoContent[]] { const dateStr = page.mainFrame.document.querySelector("main header time")?.textContent; @@ -72,5 +112,5 @@ function extractContent(page: BrowserPage) { console.log(`extracted ${infoList.length} items`); - return { date, content: infoList.length === 0 ? null : infoList }; + return [date, infoList]; }