Skip to content

Commit

Permalink
refactor thisweekinreact extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
carlosqsilva committed Oct 13, 2024
1 parent 70ffc0e commit 8cd125a
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 43 deletions.
4 changes: 2 additions & 2 deletions scraper/commands/update.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import type { Browser } from "happy-dom";
import type { Storage } from "../database.ts";
import { extractThisWeekInReact } from "../newsletter/thisweekinreact.ts";
import { extractJavascriptlyWeekly } from "../newsletter/javascripweekly/index.ts";
import {
extractFrontendFocus,
Expand All @@ -10,6 +9,7 @@ import {
extractReactStatus,
extractRubyWeekly,
} from "../newsletter/common.ts";
import { ThisWeekInReact } from "../newsletter/thisweekinreact.ts";
import { PyCoders } from "../newsletter/pycoders.ts";
import { SwiftNews } from "../newsletter/swiftnews.ts";

Expand All @@ -23,7 +23,7 @@ export async function update(browser: Browser, storage: Storage) {
extractPostgresWeekly(browser, storage),
extractReactStatus(browser, storage),
extractRubyWeekly(browser, storage),
extractThisWeekInReact(browser, storage),
new ThisWeekInReact(browser, storage).update(),
new PyCoders(browser, storage).update(),
new SwiftNews(browser, storage).update(),
]);
Expand Down
7 changes: 6 additions & 1 deletion scraper/commands/updateUrl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@ import { PyCoders } from "../newsletter/pycoders.ts";
import type { Storage } from "../database.ts";
import { defined } from "../utils.ts";
import { SwiftNews } from "../newsletter/swiftnews.ts";
import { ThisWeekInReact } from "../newsletter/thisweekinreact.ts";

export async function updateUrl(
url: string,
browser: Browser,
storage: Storage,
storage: Storage
) {
let extractor: InfoExtractor | null = null;

Expand All @@ -21,6 +22,10 @@ export async function updateUrl(
extractor = new SwiftNews(browser, storage);
}

if (ThisWeekInReact.canHandle(url)) {
extractor = new ThisWeekInReact(browser, storage);
}

if (!defined(extractor)) {
throw new Error(`Extractor not found for url: ${url}`);
}
Expand Down
37 changes: 27 additions & 10 deletions scraper/newsletter/swiftnews.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ import type {
HTMLTimeElement,
} from "happy-dom";
import { isValid, parse } from "date-fns";
import { enUS } from "date-fns/locale";

import type { InfoExtractor } from "./interface.ts";
import type { Storage } from "../database.ts";
import type { InfoContent } from "./javascripweekly/helper.ts";
import { defined, resolveUrl } from "../utils.ts";
import { enUS } from "date-fns/locale";

export class SwiftNews implements InfoExtractor {
browser: Browser;
Expand All @@ -26,7 +26,23 @@ export class SwiftNews implements InfoExtractor {
return url.startsWith(SwiftNews.baseUrl);
}

async updateUrl(url: string) {}
// Re-scrapes a single, already-saved issue URL and refreshes its stored content.
// Throws if the URL is not present in the database (guards against inserting
// rows through the update path).
async updateUrl(url: string) {
if (!this.db.isSaved(url)) {
throw new Error(`URL does not exist on database: ${url}`);
}

const page = this.browser.newPage();
await page.goto(url);

console.log(`parsing: ${url}`);
// First tuple element (the issue date string) is unused here; only the
// extracted item list is needed for an in-place content refresh.
const [, content] = await extractIssueContent(page);

// Skip the write when extraction yielded nothing, preserving the
// previously stored content instead of overwriting it with an empty list.
if (content.length) {
this.db.updateUrl(url, content);
}

// NOTE(review): close() is not awaited — confirm happy-dom's page.close()
// is safe to fire-and-forget here.
page.close();
}

async update() {
const page = this.browser.newPage();
Expand Down Expand Up @@ -72,21 +88,21 @@ async function* extractIssues(page: BrowserPage) {
}

async function extractIssueContent(
page: BrowserPage,
): Promise<[string, (InfoContent | null)[]]> {
page: BrowserPage
): Promise<[string, InfoContent[]]> {
const timeEl = page.mainFrame.document.querySelector("header time");
const dateStr = (timeEl as HTMLTimeElement).dateTime;
const date = parse(dateStr, "yyyy-MM-dd", new Date(), { locale: enUS });
if (!isValid(date)) throw new Error("failed to extract date");

const sections = Array.from(
page.mainFrame.document.querySelectorAll(
'section.category:not([class*="sponsor"])',
),
'section.category:not([class*="sponsor"])'
)
);

const items = sections.flatMap((it) =>
Array.from(it.querySelectorAll(".item.item--issue")),
Array.from(it.querySelectorAll(".item.item--issue"))
);

const promisesInfo: Promise<InfoContent | null>[] = [];
Expand All @@ -105,12 +121,13 @@ async function extractIssueContent(
description = `${titleEl?.textContent} - ${description}`;

return { link, description };
})(),
})()
);
}

let infoList = await Promise.all(promisesInfo);
infoList = infoList.filter((it) => defined(it));
const infoList = (await Promise.all(promisesInfo)).filter((it) =>
defined(it)
);

console.log(`extracted ${infoList.length} items`);
return [dateStr, infoList];
Expand Down
100 changes: 70 additions & 30 deletions scraper/newsletter/thisweekinreact.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,48 +5,88 @@ import type {
Element,
} from "happy-dom";
import type { Storage } from "../database";
import { extractContentDate } from "./javascripweekly/helper.ts";
import {
extractContentDate,
type InfoContent,
} from "./javascripweekly/helper.ts";
import type { InfoExtractor } from "./interface.ts";

export class ThisWeekInReact implements InfoExtractor {
browser: Browser;
db: Storage;
static issueUrlPattern = /newsletter\/\d+$/i;
static baseURL = "https://thisweekinreact.com";

// True when `url` is a single-issue page of this newsletter, i.e. it is on
// the thisweekinreact.com host AND ends in "newsletter/<digits>" (per
// issueUrlPattern). Index/"previous" pages do not match.
static canHandle(url: string) {
return (
url.startsWith(ThisWeekInReact.baseURL) &&
ThisWeekInReact.issueUrlPattern.test(url)
);
}

// Injects the shared happy-dom browser and storage handle; the extractor
// itself owns no resources beyond the pages it opens per call.
constructor(browser: Browser, db: Storage) {
this.browser = browser;
this.db = db;
}

async updateUrl(url: string) {
if (!ThisWeekInReact.canHandle(url)) {
throw new Error(`Invalid URL: ${url}`);
}
if (!this.db.isSaved(url)) {
throw new Error(`URL does not exist on database: ${url}`);
}

const baseUrl = "https://thisweekinreact.com";
const page = this.browser.newPage();
await page.goto(url);

const issueUrlPattern = /newsletter\/\d+$/i;
console.log(`parsing: ${url}`);
const [, content] = extractContent(page);

export async function extractThisWeekInReact(browser: Browser, db: Storage) {
const page = browser.newPage();
if (content.length) {
this.db.updateUrl(url, content);
}

page.close();
}

await page.goto(`${baseUrl}/newsletter`);
async update() {
const page = this.browser.newPage();

const issues = page.mainFrame.document.querySelectorAll(
`nav[class^="sidebar"] ul[class^="sidebarItemList"] li`,
);
await page.goto(`${ThisWeekInReact.baseURL}/newsletter`);

for (const issue of issues) {
const url = issue?.querySelector("a")?.href;
const issues = page.mainFrame.document.querySelectorAll(
`nav[class^="sidebar"] ul[class^="sidebarItemList"] li`,
);

if (!url) throw new Error("failed to extract url");
if (!issueUrlPattern.test(url)) continue;
if (db.isSaved(url)) continue;
if (url.endsWith("previous")) continue;
for (const issue of issues) {
const url = issue?.querySelector("a")?.href;

try {
await page.goto(url);
console.log(`parsing: ${url}`);
if (!url) throw new Error("failed to extract url");
if (!ThisWeekInReact.issueUrlPattern.test(url)) continue;
if (this.db.isSaved(url)) continue;
if (url.endsWith("previous")) continue;

const { date, content } = extractContent(page);
db.saveContent("thisweekinreact", {
url,
date,
content,
});
} catch (err) {
console.log(err);
try {
await page.goto(url);
console.log(`parsing: ${url}`);

const [date, content] = extractContent(page);
this.db.saveContent("thisweekinreact", {
url,
date,
content,
});
} catch (err) {
console.log(err);
}
}
}

await page.close();
await page.close();
}
}

function extractContent(page: BrowserPage) {
function extractContent(page: BrowserPage): [string, InfoContent[]] {
const dateStr =
page.mainFrame.document.querySelector("main header time")?.textContent;

Expand All @@ -72,5 +112,5 @@ function extractContent(page: BrowserPage) {

console.log(`extracted ${infoList.length} items`);

return { date, content: infoList.length === 0 ? null : infoList };
return [date, infoList];
}

0 comments on commit 8cd125a

Please sign in to comment.