From 676c37a3792238912b055e43528b3d597db1e97e Mon Sep 17 00:00:00 2001 From: Christopher Kwong Date: Wed, 28 Feb 2024 18:45:34 -0800 Subject: [PATCH] =?UTF-8?q?feat(registrar-scraper):=20=E2=9C=A8=20implemen?= =?UTF-8?q?t=20course=20history=20scraper?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix #113 --- .../src/course-scraper/index.ts | 33 ++++++++++--------- tools/registrar-scraper/src/lib.ts | 16 +++++---- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/tools/registrar-scraper/src/course-scraper/index.ts b/tools/registrar-scraper/src/course-scraper/index.ts index aa34c8de..bb546c67 100644 --- a/tools/registrar-scraper/src/course-scraper/index.ts +++ b/tools/registrar-scraper/src/course-scraper/index.ts @@ -5,7 +5,7 @@ import fetch from "cross-fetch"; const CATALOGUE_BASE_URL = "https://catalogue.uci.edu"; const URL_TO_ALL_COURSES = `${CATALOGUE_BASE_URL}/allcourses/`; const URL_TO_ALL_SCHOOLS = `${CATALOGUE_BASE_URL}/schoolsandprograms/`; -const ENROLL_HIST_URL = 'https://www.reg.uci.edu/perl/EnrollHist.pl' +const ENROLL_HIST_URL = "https://www.reg.uci.edu/perl/EnrollHist.pl"; const YEAR_THRESHOLD = 9; // Number of years to look back when grabbing course history @@ -246,6 +246,7 @@ export async function getCourses() { courses.forEach((v, k) => allCourses.set(k, { ...v, school: schoolMapping.get(v.department) ?? "" }), ); + await sleep(1000); } if (deptsWithoutSchools.size > 0) { throw new Error( @@ -259,29 +260,27 @@ export async function getCourses() { export async function getCourseHistory( department: string, - year_threshold: number + year_threshold: number, ): Promise<{ [key: string]: Set }> { const courseTerms: { [key: string]: Set } = {}; let page: string; let continueParsing: boolean; - var ptr = -6; + let ptr = -6; const params = { dept_name: department, action: "Submit", ptr: "", - } + }; try { do { - page = await ( - await fetch(ENROLL_HIST_URL + "?" + new URLSearchParams(params)) - ).text(); + page = await (await fetch(ENROLL_HIST_URL + "?" + new URLSearchParams(params))).text(); const $ = load(page); const warning = $("tr td.lcRegWeb_red_message"); if (warning.length && warning.text().startsWith("No results found")) { - return courseTerms; - } + return courseTerms; + } continueParsing = await parseCourseHistoryPage(page, year_threshold, courseTerms); - ptr += 6 + ptr += 6; params["action"] = "Prev"; params["ptr"] = ptr.toString(); } while (continueParsing); @@ -294,24 +293,26 @@ export async function getCourseHistory( async function parseCourseHistoryPage( courseHistoryPage: string, year_threshold: number, - courseTerms: { [key: string]: Set } + courseTerms: { [key: string]: Set }, ): Promise { const fieldLabels = { term: 0, courseNo: 4, - } + }; const currentYear = new Date().getFullYear() % 100; let entryFound = false; try { const $ = load(courseHistoryPage); + let term = ""; $("table tbody tr").each(function (this) { const entry = $(this).find("td"); if ($(entry).length == 15) { - const term = $(entry[fieldLabels.term]).text().trim(); - if (term === "Term") { + const termValue = $(entry[fieldLabels.term]).text().trim(); + if (termValue === "Term") { return true; } - if (term.length === 3) { + if (termValue.length === 3) { + term = termValue; entryFound = true; const termYear = parseInt(term.replace(/\D/g, "")); if (currentYear - termYear > year_threshold) { @@ -319,7 +320,7 @@ async function parseCourseHistoryPage( return false; } } - if (term.length) { + if (term && termValue.length === 0) { const courseNo = $(entry[fieldLabels.courseNo]).text().trim(); if (!courseTerms[courseNo]) { courseTerms[courseNo] = new Set(); diff --git a/tools/registrar-scraper/src/lib.ts b/tools/registrar-scraper/src/lib.ts index c0ea5eb5..2d461392 100644 --- a/tools/registrar-scraper/src/lib.ts +++ b/tools/registrar-scraper/src/lib.ts @@ -140,6 +140,7 @@ export const createCourses = corequisite, ge_list, ge_text, + terms, }, ]: [string, ScrapedCourse]): Prisma.CourseCreateManyInput => { const courseId = `${department} ${number}`; @@ -195,12 +196,13 @@ export const createCourses = } }), geText: ge_text, - // terms: Array.from( - // new Set( - // Object.values(instructorInfo) - // .filter((x) => Object.keys(x.courseHistory ?? {}).includes(courseId)) - // .flatMap((x) => x.courseHistory[courseId]), - // ), - // ).sort(sortTerms), + terms: Array.from( + new Set([ + ...terms.map(transformTerm).filter((x) => x.length), + ...Object.values(instructorInfo) + .filter((x) => Object.keys(x.courseHistory ?? {}).includes(courseId)) + .flatMap((x) => x.courseHistory[courseId]), + ]), + ).sort(sortTerms), }; };