From 4689cd37c4403892d62d10c462bdb17a1ff41d05 Mon Sep 17 00:00:00 2001 From: Eddy Chen <89349085+ecxyzzy@users.noreply.github.com> Date: Mon, 20 May 2024 15:24:08 -0700 Subject: [PATCH] =?UTF-8?q?feat:=20=E2=9C=A8=20implement=20larc=20caching?= =?UTF-8?q?=20(#148)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/api/src/routes/v1/rest/larc/+endpoint.ts | 60 +++------------- libs/db/prisma/schema.prisma | 9 +++ pnpm-lock.yaml | 25 +++++++ services/larc-scraper/build.mjs | 72 +++++++++++++++++++ services/larc-scraper/package.json | 24 +++++++ services/larc-scraper/src/index.ts | 69 ++++++++++++++++++ .../larc => services/larc-scraper/src}/lib.ts | 0 tools/cdk/src/constructs/LarcScraper.ts | 42 +++++++++++ tools/cdk/src/stacks/services.ts | 3 + 9 files changed, 252 insertions(+), 52 deletions(-) create mode 100644 services/larc-scraper/build.mjs create mode 100644 services/larc-scraper/package.json create mode 100644 services/larc-scraper/src/index.ts rename {apps/api/src/routes/v1/rest/larc => services/larc-scraper/src}/lib.ts (100%) create mode 100644 tools/cdk/src/constructs/LarcScraper.ts diff --git a/apps/api/src/routes/v1/rest/larc/+endpoint.ts b/apps/api/src/routes/v1/rest/larc/+endpoint.ts index 7b624927..6337c34c 100644 --- a/apps/api/src/routes/v1/rest/larc/+endpoint.ts +++ b/apps/api/src/routes/v1/rest/larc/+endpoint.ts @@ -1,11 +1,11 @@ +import { PrismaClient } from "@libs/db"; import { createHandler } from "@libs/lambda"; -import { load } from "cheerio"; -import { fetch } from "cross-fetch"; import { ZodError } from "zod"; -import { fmtBldg, fmtDays, fmtTime, quarterToLarcSuffix } from "./lib"; import { QuerySchema } from "./schema"; +const prisma = new PrismaClient(); + export const GET = createHandler(async (event, context, res) => { const headers = event.headers; const requestId = context.awsRequestId; @@ -14,55 +14,11 @@ export const GET = createHandler(async (event, context, res) => { try { const { year, quarter } = QuerySchema.parse(query); - // SS10wk does not have LARC sessions apparently - if (quarter === "Summer10wk") return res.createOKResult([], headers, requestId); - - // TODO: move this code to its own scraper, and rewrite this route to fetch - // data from the DB. - - const html = await fetch( - `https://enroll.larc.uci.edu/${year}${quarterToLarcSuffix(quarter)}`, - ).then((response) => response.text()); - - const $ = load(html); - - const larcSections = $(".tutorial-group") - .toArray() - .map((card) => { - const match = $(card) - .find(".card-header") - .text() - .trim() - .match( - /(?[^()]*)( \(same as (?.*)\))? - (.*) \((?.*)\)/, - ); - - const sections = $(card) - .find(".list-group") - .toArray() - .map((group) => { - const rows = $(group).find(".col-lg-4"); - - const [days, time] = $(rows[0]) - .find(".col") - .map((_, col) => $(col).text().trim()); - - const [instructor, building] = $(rows[1]) - .find(".col") - .map((_, col) => $(col).text().trim()); - - return { - days: fmtDays(days), - time: fmtTime(time), - instructor, - bldg: fmtBldg(building), - }; - }); - - return { courseInfo: { ...match?.groups }, sections }; - }); - - return res.createOKResult(larcSections, headers, requestId); + return res.createOKResult( + (await prisma.larcTerm.findFirst({ where: { year, quarter } }))?.courses ?? [], + headers, + requestId, + ); } catch (e) { if (e instanceof ZodError) { const messages = e.issues.map((issue) => issue.message); diff --git a/libs/db/prisma/schema.prisma b/libs/db/prisma/schema.prisma index a7ee3295..5afd2fdc 100644 --- a/libs/db/prisma/schema.prisma +++ b/libs/db/prisma/schema.prisma @@ -181,6 +181,15 @@ model Instructor { courses Json @default("[]") } +model LarcTerm { + year String + quarter Quarter + courses Json + + @@id([year, quarter]) + @@unique([year, quarter], name: "idx") +} + model Major { id String @id degreeId String diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b4ddf087..5e7bc262 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -319,6 +319,31 @@ importers: specifier: 0.20.1 version: 0.20.1 + services/larc-scraper: + dependencies: + '@libs/db': + specifier: workspace:^ + version: link:../../libs/db + '@libs/uc-irvine-lib': + specifier: workspace:^ + version: link:../../libs/uc-irvine-lib + '@libs/utils': + specifier: workspace:^ + version: link:../../libs/utils + cheerio: + specifier: 1.0.0-rc.12 + version: 1.0.0-rc.12 + cross-fetch: + specifier: 4.0.0 + version: 4.0.0 + devDependencies: + '@peterportal-api/types': + specifier: workspace:^ + version: link:../../packages/types + esbuild: + specifier: 0.20.1 + version: 0.20.1 + services/websoc-proxy: dependencies: '@libs/lambda': diff --git a/services/larc-scraper/build.mjs b/services/larc-scraper/build.mjs new file mode 100644 index 00000000..9594b598 --- /dev/null +++ b/services/larc-scraper/build.mjs @@ -0,0 +1,72 @@ +import { chmod, copyFile, mkdir, rm } from "node:fs/promises"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +import { build } from "esbuild"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +/** + * @see https://github.com/evanw/esbuild/issues/1921#issuecomment-1623640043 + */ +// language=JavaScript +const js = ` + import topLevelModule from "node:module"; + import topLevelUrl from "node:url"; + import topLevelPath from "node:path"; + + const require = topLevelModule.createRequire(import.meta.url); + const __filename = topLevelUrl.fileURLToPath(import.meta.url); + const __dirname = topLevelPath.dirname(__filename); +`; + +async function buildApp() { + const options = { + entryPoints: { index: "src/index.ts" }, + outdir: "dist", + outExtension: { ".js": ".mjs" }, + bundle: true, + minify: true, + format: "esm", + platform: "node", + target: "node20", + logLevel: "info", + banner: { js }, + plugins: [ + { + name: "clean", + setup(build) { + build.onStart(async () => { + await rm(join(__dirname, "dist/"), { recursive: true, force: true }); + await mkdir(join(__dirname, "dist/")); + }); + }, + }, + { + name: "copy", + setup(build) { + build.onEnd(async () => { + await copyFile( + join( + __dirname, + "../../libs/db/node_modules/prisma/libquery_engine-linux-arm64-openssl-3.0.x.so.node", + ), + join(__dirname, "dist/libquery_engine-linux-arm64-openssl-3.0.x.so.node"), + ); + await copyFile( + join(__dirname, "../../libs/db/prisma/schema.prisma"), + join(__dirname, "dist/schema.prisma"), + ); + await chmod( + join(__dirname, "dist/libquery_engine-linux-arm64-openssl-3.0.x.so.node"), + 0o755, + ); + }); + }, + }, + ], + }; + await build(options); +} + +buildApp().then(); diff --git a/services/larc-scraper/package.json b/services/larc-scraper/package.json new file mode 100644 index 00000000..af6a5409 --- /dev/null +++ b/services/larc-scraper/package.json @@ -0,0 +1,24 @@ +{ + "name": "@services/larc-scraper", + "version": "0.0.0", + "private": true, + "description": "Automated scraper for LARC sections", + "license": "MIT", + "type": "module", + "main": "src/index.ts", + "types": "src/index.ts", + "scripts": { + "build": "node build.mjs" + }, + "dependencies": { + "@libs/db": "workspace:^", + "@libs/uc-irvine-lib": "workspace:^", + "@libs/utils": "workspace:^", + "cheerio": "1.0.0-rc.12", + "cross-fetch": "4.0.0" + }, + "devDependencies": { + "@peterportal-api/types": "workspace:^", + "esbuild": "0.20.1" + } +} diff --git a/services/larc-scraper/src/index.ts b/services/larc-scraper/src/index.ts new file mode 100644 index 00000000..f50d5175 --- /dev/null +++ b/services/larc-scraper/src/index.ts @@ -0,0 +1,69 @@ +import { PrismaClient } from "@libs/db"; +import { LarcResponse, Quarter } from "@peterportal-api/types"; +import { load } from "cheerio"; +import { fetch } from "cross-fetch"; + +import { fmtBldg, fmtDays, fmtTime, quarterToLarcSuffix } from "./lib"; + +const EARLIEST_YEAR = 2019; + +const prisma = new PrismaClient(); + +export const sleep = async (duration: number) => + new Promise((resolve) => setTimeout(resolve, duration)); + +export const handler = async () => { + const data: Array<{ year: string; quarter: Quarter; courses: LarcResponse }> = []; + const quarters = ["Fall", "Winter", "Spring", "Summer1", "Summer2"] as const; + for (let year = EARLIEST_YEAR; year < new Date().getFullYear() + 2; ++year) { + for (const quarter of quarters) { + console.log(`Scraping ${year} ${quarter}`); + const html = await fetch( + `https://enroll.larc.uci.edu/${year}${quarterToLarcSuffix(quarter)}`, + ).then((response) => response.text()); + + const $ = load(html); + + const courses = $(".tutorial-group") + .toArray() + .map((card) => { + const match = $(card) + .find(".card-header") + .text() + .trim() + .match( + /(?[^()]*)( \(same as (?.*)\))? - (.*) \((?.*)\)/, + ); + + const sections = $(card) + .find(".list-group") + .toArray() + .map((group) => { + const rows = $(group).find(".col-lg-4"); + + const [days, time] = $(rows[0]) + .find(".col") + .map((_, col) => $(col).text().trim()); + + const [instructor, building] = $(rows[1]) + .find(".col") + .map((_, col) => $(col).text().trim()); + + return { + days: fmtDays(days), + time: fmtTime(time), + instructor, + bldg: fmtBldg(building), + }; + }); + + return { courseInfo: { ...match?.groups }, sections }; + }); + data.push({ year: year.toString(), quarter, courses: (courses as LarcResponse) ?? [] }); + await sleep(1000); + } + } + await prisma.$transaction([prisma.larcTerm.deleteMany({}), prisma.larcTerm.createMany({ data })]); +}; + +handler().then(); diff --git a/apps/api/src/routes/v1/rest/larc/lib.ts b/services/larc-scraper/src/lib.ts similarity index 100% rename from apps/api/src/routes/v1/rest/larc/lib.ts rename to services/larc-scraper/src/lib.ts diff --git a/tools/cdk/src/constructs/LarcScraper.ts b/tools/cdk/src/constructs/LarcScraper.ts new file mode 100644 index 00000000..14202920 --- /dev/null +++ b/tools/cdk/src/constructs/LarcScraper.ts @@ -0,0 +1,42 @@ +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +import { Duration } from "aws-cdk-lib"; +import { Rule, RuleTargetInput, Schedule } from "aws-cdk-lib/aws-events"; +import { LambdaFunction } from "aws-cdk-lib/aws-events-targets"; +import { Architecture, Code, Function, Runtime } from "aws-cdk-lib/aws-lambda"; +import { Construct } from "constructs"; + +export class LarcScraper extends Construct { + constructor(scope: Construct, id: string) { + super(scope, id); + + const ruleName = `${id}-rule`; + + const rule = new Rule(this, ruleName, { + ruleName, + schedule: Schedule.rate(Duration.days(1)), + }); + + const functionName = `${id}-function`; + + rule.addTarget( + new LambdaFunction( + new Function(this, functionName, { + architecture: Architecture.ARM_64, + code: Code.fromAsset( + join(dirname(fileURLToPath(import.meta.url)), "../../../../services/larc-scraper/dist"), + ), + functionName, + handler: "index.handler", + timeout: Duration.seconds(15), + runtime: Runtime.NODEJS_20_X, + memorySize: 512, + }), + { + event: RuleTargetInput.fromObject({ body: "{}" }), + }, + ), + ); + } +} diff --git a/tools/cdk/src/stacks/services.ts b/tools/cdk/src/stacks/services.ts index ece3efd1..ec483d3f 100644 --- a/tools/cdk/src/stacks/services.ts +++ b/tools/cdk/src/stacks/services.ts @@ -4,6 +4,7 @@ import { SubnetType, Vpc } from "aws-cdk-lib/aws-ec2"; import type { Construct } from "constructs"; import { CalendarScraper } from "../constructs/CalendarScraper"; +import { LarcScraper } from "../constructs/LarcScraper"; import { WebsocProxy } from "../constructs/WebsocProxy"; import { WebsocScraperV2 } from "../constructs/WebsocScraperV2"; @@ -30,6 +31,8 @@ export class ServicesStack extends Stack { new CalendarScraper(this, `${id}-calendar-scraper`); + new LarcScraper(this, `${id}-larc-scraper`); + new WebsocProxy(this, `${id}-websoc-proxy`); new WebsocScraperV2(this, `${id}-websoc-scraper-v2`, vpc);