From d25925a0fa7bb386bd2d42ef8c79a036207e7f10 Mon Sep 17 00:00:00 2001 From: Christopher Kwong Date: Wed, 28 Feb 2024 16:14:44 -0800 Subject: [PATCH 1/2] feat(registrar-scraper): implement course term scraper --- .../src/course-scraper/index.ts | 89 ++++++++++++++++++- tools/registrar-scraper/src/lib.ts | 14 +-- 2 files changed, 95 insertions(+), 8 deletions(-) diff --git a/tools/registrar-scraper/src/course-scraper/index.ts b/tools/registrar-scraper/src/course-scraper/index.ts index 06ca7714..aa34c8de 100644 --- a/tools/registrar-scraper/src/course-scraper/index.ts +++ b/tools/registrar-scraper/src/course-scraper/index.ts @@ -5,6 +5,9 @@ import fetch from "cross-fetch"; const CATALOGUE_BASE_URL = "https://catalogue.uci.edu"; const URL_TO_ALL_COURSES = `${CATALOGUE_BASE_URL}/allcourses/`; const URL_TO_ALL_SCHOOLS = `${CATALOGUE_BASE_URL}/schoolsandprograms/`; +const ENROLL_HIST_URL = 'https://www.reg.uci.edu/perl/EnrollHist.pl' + +const YEAR_THRESHOLD = 9; // Number of years to look back when grabbing course history const Ia = "GE Ia: Lower Division Writing"; const Ib = "GE Ib: Upper Division Writing"; @@ -167,6 +170,8 @@ async function getCoursesOfDepartment(deptURL: string) { const $ = load(await res.text()); const courses: [string, Course][] = []; const deptName = normalized($(".page-title").text()).split("(")[0].trim(); + const deptCode = normalized($(".page-title").text()).split("(")[1].slice(0, -1).trim(); + const courseTerms = await getCourseHistory(deptCode, YEAR_THRESHOLD); $("#courseinventorycontainer > .courses > .courseblock").each((_, courseBlock) => { const header: string[] = normalized($(courseBlock).find(".courseblocktitle").text()) .split(" ") @@ -219,7 +224,7 @@ async function getCoursesOfDepartment(deptURL: string) { .map((x) => x.filter((y) => y)[1]) .map((x) => GE_DICTIONARY[x]), ge_text: courseBody.filter((x) => x.match(/^\({1,2}[IV]/))[0] ?? "", - terms: [], + terms: [...(courseTerms[courseNumber] ?? [])], }, ]); }); @@ -251,3 +256,85 @@ export async function getCourses() { } return Object.fromEntries(allCourses.entries()); } + +export async function getCourseHistory( + department: string, + year_threshold: number +): Promise<{ [key: string]: Set }> { + const courseTerms: { [key: string]: Set } = {}; + let page: string; + let continueParsing: boolean; + var ptr = -6; + const params = { + dept_name: department, + action: "Submit", + ptr: "", + } + try { + do { + page = await ( + await fetch(ENROLL_HIST_URL + "?" + new URLSearchParams(params)) + ).text(); + const $ = load(page); + const warning = $("tr td.lcRegWeb_red_message"); + if (warning.length && warning.text().startsWith("No results found")) { + return courseTerms; + } + continueParsing = await parseCourseHistoryPage(page, year_threshold, courseTerms); + ptr += 6 + params["action"] = "Prev"; + params["ptr"] = ptr.toString(); + } while (continueParsing); + } catch (error) { + console.log(error); + } + return courseTerms; +} + +async function parseCourseHistoryPage( + courseHistoryPage: string, + year_threshold: number, + courseTerms: { [key: string]: Set } +): Promise { + const fieldLabels = { + term: 0, + courseNo: 4, + } + const currentYear = new Date().getFullYear() % 100; + let entryFound = false; + try { + const $ = load(courseHistoryPage); + $("table tbody tr").each(function (this) { + const entry = $(this).find("td"); + if ($(entry).length == 15) { + const term = $(entry[fieldLabels.term]).text().trim(); + if (term === "Term") { + return true; + } + if (term.length === 3) { + entryFound = true; + const termYear = parseInt(term.replace(/\D/g, "")); + if (currentYear - termYear > year_threshold) { + entryFound = false; + return false; + } + } + if (term.length) { + const courseNo = $(entry[fieldLabels.courseNo]).text().trim(); + if (!courseTerms[courseNo]) { + courseTerms[courseNo] = new Set(); + } + courseTerms[courseNo].add(term); + } + } + return true; + }); + if ($('a:contains("Prev")').length === 0) { + entryFound = false; + return false; + } + } catch (error) { + console.log(error); + } + return entryFound; +} diff --git a/tools/registrar-scraper/src/lib.ts b/tools/registrar-scraper/src/lib.ts index 40a97225..c0ea5eb5 100644 --- a/tools/registrar-scraper/src/lib.ts +++ b/tools/registrar-scraper/src/lib.ts @@ -195,12 +195,12 @@ export const createCourses = } }), geText: ge_text, - terms: Array.from( - new Set( - Object.values(instructorInfo) - .filter((x) => Object.keys(x.courseHistory ?? {}).includes(courseId)) - .flatMap((x) => x.courseHistory[courseId]), - ), - ).sort(sortTerms), + // terms: Array.from( + // new Set( + // Object.values(instructorInfo) + // .filter((x) => Object.keys(x.courseHistory ?? {}).includes(courseId)) + // .flatMap((x) => x.courseHistory[courseId]), + // ), + // ).sort(sortTerms), }; }; From 676c37a3792238912b055e43528b3d597db1e97e Mon Sep 17 00:00:00 2001 From: Christopher Kwong Date: Wed, 28 Feb 2024 18:45:34 -0800 Subject: [PATCH 2/2] =?UTF-8?q?feat(registrar-scraper):=20=E2=9C=A8=20impl?= =?UTF-8?q?ement=20course=20history=20scraper?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix #113 --- .../src/course-scraper/index.ts | 33 ++++++++++--------- tools/registrar-scraper/src/lib.ts | 16 +++++---- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/tools/registrar-scraper/src/course-scraper/index.ts b/tools/registrar-scraper/src/course-scraper/index.ts index aa34c8de..bb546c67 100644 --- a/tools/registrar-scraper/src/course-scraper/index.ts +++ b/tools/registrar-scraper/src/course-scraper/index.ts @@ -5,7 +5,7 @@ import fetch from "cross-fetch"; const CATALOGUE_BASE_URL = "https://catalogue.uci.edu"; const URL_TO_ALL_COURSES = `${CATALOGUE_BASE_URL}/allcourses/`; const URL_TO_ALL_SCHOOLS = `${CATALOGUE_BASE_URL}/schoolsandprograms/`; -const ENROLL_HIST_URL = 'https://www.reg.uci.edu/perl/EnrollHist.pl' +const ENROLL_HIST_URL = "https://www.reg.uci.edu/perl/EnrollHist.pl"; const YEAR_THRESHOLD = 9; // Number of years to look back when grabbing course history @@ -246,6 +246,7 @@ export async function getCourses() { courses.forEach((v, k) => allCourses.set(k, { ...v, school: schoolMapping.get(v.department) ?? "" }), ); + await sleep(1000); } if (deptsWithoutSchools.size > 0) { throw new Error( @@ -259,29 +260,27 @@ export async function getCourses() { export async function getCourseHistory( department: string, - year_threshold: number + year_threshold: number, ): Promise<{ [key: string]: Set }> { const courseTerms: { [key: string]: Set } = {}; let page: string; let continueParsing: boolean; - var ptr = -6; + let ptr = -6; const params = { dept_name: department, action: "Submit", ptr: "", - } + }; try { do { - page = await ( - await fetch(ENROLL_HIST_URL + "?" + new URLSearchParams(params)) - ).text(); + page = await (await fetch(ENROLL_HIST_URL + "?" + new URLSearchParams(params))).text(); const $ = load(page); const warning = $("tr td.lcRegWeb_red_message"); if (warning.length && warning.text().startsWith("No results found")) { - return courseTerms; - } + return courseTerms; + } continueParsing = await parseCourseHistoryPage(page, year_threshold, courseTerms); - ptr += 6 + ptr += 6; params["action"] = "Prev"; params["ptr"] = ptr.toString(); } while (continueParsing); @@ -294,24 +293,26 @@ export async function getCourseHistory( async function parseCourseHistoryPage( courseHistoryPage: string, year_threshold: number, - courseTerms: { [key: string]: Set } + courseTerms: { [key: string]: Set }, ): Promise { const fieldLabels = { term: 0, courseNo: 4, - } + }; const currentYear = new Date().getFullYear() % 100; let entryFound = false; try { const $ = load(courseHistoryPage); + let term = ""; $("table tbody tr").each(function (this) { const entry = $(this).find("td"); if ($(entry).length == 15) { - const term = $(entry[fieldLabels.term]).text().trim(); - if (term === "Term") { + const termValue = $(entry[fieldLabels.term]).text().trim(); + if (termValue === "Term") { return true; } - if (term.length === 3) { + if (termValue.length === 3) { + term = termValue; entryFound = true; const termYear = parseInt(term.replace(/\D/g, "")); if (currentYear - termYear > year_threshold) { @@ -319,7 +320,7 @@ async function parseCourseHistoryPage( return false; } } - if (term.length) { + if (term && termValue.length === 0) { const courseNo = $(entry[fieldLabels.courseNo]).text().trim(); if (!courseTerms[courseNo]) { courseTerms[courseNo] = new Set(); diff --git a/tools/registrar-scraper/src/lib.ts b/tools/registrar-scraper/src/lib.ts index c0ea5eb5..2d461392 100644 --- a/tools/registrar-scraper/src/lib.ts +++ b/tools/registrar-scraper/src/lib.ts @@ -140,6 +140,7 @@ export const createCourses = corequisite, ge_list, ge_text, + terms, }, ]: [string, ScrapedCourse]): Prisma.CourseCreateManyInput => { const courseId = `${department} ${number}`; @@ -195,12 +196,13 @@ export const createCourses = } }), geText: ge_text, - // terms: Array.from( - // new Set( - // Object.values(instructorInfo) - // .filter((x) => Object.keys(x.courseHistory ?? {}).includes(courseId)) - // .flatMap((x) => x.courseHistory[courseId]), - // ), - // ).sort(sortTerms), + terms: Array.from( + new Set([ + ...terms.map(transformTerm).filter((x) => x.length), + ...Object.values(instructorInfo) + .filter((x) => Object.keys(x.courseHistory ?? {}).includes(courseId)) + .flatMap((x) => x.courseHistory[courseId]), + ]), + ).sort(sortTerms), }; };