From a553f2611dd8d1e97ba63b98b1d24de5ff0a7223 Mon Sep 17 00:00:00 2001
From: Eddy Chen <89349085+ecxyzzy@users.noreply.github.com>
Date: Tue, 14 Jan 2025 11:51:08 -0800
Subject: [PATCH 1/3] feat(websoc-scraper): implement chunk-wise scraping

---
 apps/data-pipeline/websoc-scraper/src/lib.ts | 88 ++++++++++++++++----
 1 file changed, 72 insertions(+), 16 deletions(-)

diff --git a/apps/data-pipeline/websoc-scraper/src/lib.ts b/apps/data-pipeline/websoc-scraper/src/lib.ts
index 9c25212..c56b50f 100644
--- a/apps/data-pipeline/websoc-scraper/src/lib.ts
+++ b/apps/data-pipeline/websoc-scraper/src/lib.ts
@@ -10,8 +10,8 @@ import type {
 } from "@icssc/libwebsoc-next";
 import { request } from "@icssc/libwebsoc-next";
 import type { database } from "@packages/db";
-import { and, asc, eq, gte, inArray, lte } from "@packages/db/drizzle";
-import { type WebsocSectionFinalExam, courseView, instructorView } from "@packages/db/schema";
+import { and, asc, eq, gte, inArray, lte, sql } from "@packages/db/drizzle";
+import type { WebsocSectionFinalExam } from "@packages/db/schema";
 import {
   calendarTerm,
   course,
@@ -32,6 +32,18 @@ import { baseTenIntOrNull, intersectAll, notNull, sleep } from "@packages/stdlib
 import { parseMeetingDays, parseStartAndEndTimes } from "@packages/stdlib";
 import { load } from "cheerio";
 
+/**
+ * WebSoc allows us to scrape up to 900 sections per chunk.
+ * This provides a 1% margin of error in case sections magically appear within ranges.
+ */
+const SECTIONS_PER_CHUNK = 891;
+
+/**
+ * Section codes 98000-99999 are reserved for Study Abroad and Registrar testing.
+ * These are not associated with any department that is searchable directly through WebSoc.
+ */
+const LAST_SECTION_CODE = "97999";
+
 export async function getDepts(db: ReturnType<typeof database>) {
   const response = await fetch("https://www.reg.uci.edu/perl/WebSoc").then((x) => x.text());
 
@@ -350,11 +362,11 @@ function meetingMapper(
   };
 }
 
-const doDepartmentUpsert = async (
+const doChunkUpsert = async (
   db: ReturnType<typeof database>,
   term: Term,
   resp: WebsocResponse,
-  department: string,
+  department: string | null,
 ) =>
   await db.transaction(async (tx) => {
     const updatedAt = new Date();
@@ -658,6 +670,15 @@ function normalizeCourse(courses: WebsocCourse[]): WebsocCourse[] {
 
 function normalizeResponse(json: WebsocResponse): WebsocResponse {
   for (const school of json.schools) {
+    const deptMapping = new Map();
+    for (const dept of school.departments) {
+      if (!deptMapping.has(dept.deptCode)) {
+        deptMapping.set(dept.deptCode, dept);
+      } else {
+        deptMapping.get(dept.deptCode)?.courses.push(...dept.courses);
+      }
+    }
+    school.departments = deptMapping.values().toArray();
     for (const dept of school.departments) {
       dept.deptName = dept.deptName.replace(/&amp;/g, "&");
       const courseMapping = new Map();
@@ -763,20 +784,57 @@ export async function scrapeTerm(
 ) {
   const name = termToName(term);
   console.log(`Scraping term ${name}`);
-  for (const department of departments) {
-    console.log(`Scraping department ${department}`);
-    const resp = await request(term, { department, cancelledCourses: "Include" }).then(
-      normalizeResponse,
-    );
-    if (resp.schools.length) await doDepartmentUpsert(db, term, resp, department);
-    await sleep(1000);
+  const sectionCodeBounds = await db
+    .execute(sql<Array<{ section_code: string }>>`
+      SELECT section_code FROM (
+        SELECT LPAD(section_code::TEXT, 5, '0') AS section_code,
+               (ROW_NUMBER() OVER (ORDER BY section_code)) AS rownum
+        FROM ${websocSection} WHERE ${websocSection.year} = ${term.year} AND ${websocSection.quarter} = ${term.quarter}
+      )
+      WHERE MOD(rownum, ${SECTIONS_PER_CHUNK}) = 0 OR MOD(rownum, ${SECTIONS_PER_CHUNK}) = 1;
+    `)
+    .then((xs) => xs.map((x) => x.section_code));
+  if (departments.length) {
+    console.log(`Resuming scraping run at department ${departments[0]}.`);
+    for (const department of departments) {
+      console.log(`Scraping department ${department}`);
+      const resp = await request(term, {
+        department,
+        cancelledCourses: "Include",
+      }).then(normalizeResponse);
+      if (resp.schools.length) await doChunkUpsert(db, term, resp, department);
+      await sleep(1000);
+    }
+  } else if (!sectionCodeBounds.length) {
+    console.log("This term has never been scraped before. Falling back to department-wise scrape.");
+    for (const department of await getDepts(db)) {
+      console.log(`Scraping department ${department}`);
+      const resp = await request(term, {
+        department,
+        cancelledCourses: "Include",
+      }).then(normalizeResponse);
+      if (resp.schools.length) await doChunkUpsert(db, term, resp, department);
+      await sleep(1000);
+    }
+  } else {
+    console.log("Performing chunk-wise scrape.");
+    for (let i = 0; i < sectionCodeBounds.length; i += 2) {
+      const lower = sectionCodeBounds[i];
+      const upper = sectionCodeBounds[i + 1] ?? LAST_SECTION_CODE;
+      const sectionCodes = `${lower}-${upper}`;
+      console.log(`Scraping chunk ${sectionCodes}`);
+      const resp = await request(term, {
+        sectionCodes,
+        cancelledCourses: "Include",
+      }).then(normalizeResponse);
+      if (resp.schools.length) await doChunkUpsert(db, term, resp, null);
+      await sleep(1000);
+    }
   }
   await scrapeGEsForTerm(db, term);
   const lastScraped = new Date();
   const values = { name, lastScraped, lastDeptScraped: null };
   await db.transaction(async (tx) => {
-    await tx.refreshMaterializedView(courseView);
-    await tx.refreshMaterializedView(instructorView);
     await tx
       .insert(websocMeta)
       .values(values)
@@ -804,9 +862,7 @@ export async function doScrape(db: ReturnType<typeof database>) {
       await scrapeTerm(
         db,
         nameToTerm(term.name),
-        term?.lastDeptScraped
-          ? departments.slice(departments.indexOf(term.lastDeptScraped))
-          : departments,
+        term?.lastDeptScraped ? departments.slice(departments.indexOf(term.lastDeptScraped)) : [],
       );
     } catch (e) {
       console.error(e);

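Reviewer note (illustration only, not part of the patch): the chunk-wise branch above pairs up the section codes returned by the SQL query (the rows where rownum mod 891 is 1 or 0) into inclusive `sectionCodes` range strings for WebSoc, with the final chunk capped at 97999. A minimal sketch of that pairing, reusing the patch's LAST_SECTION_CODE constant; the sample bounds below are made up for illustration:

    const LAST_SECTION_CODE = "97999";

    function toChunkRanges(sectionCodeBounds: string[]): string[] {
      const ranges: string[] = [];
      for (let i = 0; i < sectionCodeBounds.length; i += 2) {
        const lower = sectionCodeBounds[i];
        // The last chunk may have no stored upper bound yet, so it runs to the
        // highest section code WebSoc can return.
        const upper = sectionCodeBounds[i + 1] ?? LAST_SECTION_CODE;
        ranges.push(`${lower}-${upper}`);
      }
      return ranges;
    }

    // Hypothetical bounds (rownum 1, 891, 892, 1782, 1783 for a term with ~2000 sections):
    // toChunkRanges(["00100", "09350", "09351", "18020", "18021"])
    //   => ["00100-09350", "09351-18020", "18021-97999"]
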
"anteater-api_mview-refresher" +compatibility_date = "2024-12-30" +compatibility_flags = ["nodejs_compat"] +workers_dev = false +main = "src/handler.ts" +minify = true + +[triggers] +crons = ["0 0 * * *"] + +[observability] +enabled = true + +[[hyperdrive]] +binding = "DB" +id = "c498e76d4f68446ea55d0d92865257b5" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 62886de..dddfc10 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -242,6 +242,16 @@ importers: specifier: 3.100.0 version: 3.100.0(@cloudflare/workers-types@4.20241230.0) + apps/data-pipeline/mview-refresher: + dependencies: + '@packages/db': + specifier: workspace:* + version: link:../../../packages/db + devDependencies: + wrangler: + specifier: 3.100.0 + version: 3.100.0(@cloudflare/workers-types@4.20241230.0) + apps/data-pipeline/study-location-scraper: dependencies: '@packages/db': From 27db9229e088c363fe26a09204036fdf62910731 Mon Sep 17 00:00:00 2001 From: Eddy Chen <89349085+ecxyzzy@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:27:30 -0800 Subject: [PATCH 3/3] chore(mview-refresher): remove start script --- apps/data-pipeline/mview-refresher/package.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/data-pipeline/mview-refresher/package.json b/apps/data-pipeline/mview-refresher/package.json index 2ae004d..a5a96f5 100644 --- a/apps/data-pipeline/mview-refresher/package.json +++ b/apps/data-pipeline/mview-refresher/package.json @@ -7,8 +7,7 @@ "scripts": { "check:types": "tsc -p ./tsconfig.json -noEmit", "deploy": "wrangler deploy", - "postinstall": "wrangler types --x-include-runtime", - "start": "dotenv -e ../../../.env -- tsx src/index.ts" + "postinstall": "wrangler types --x-include-runtime" }, "dependencies": { "@packages/db": "workspace:*"