index.ts
import asyncPool from "tiny-async-pool";
import {
download,
list,
parse,
downloadCourseDetails,
attachDescriptions,
attachPrereqs,
write,
parseCourseDescription,
parseCoursePrereqs,
writeIndex,
} from "./steps";
import { Prerequisites } from "./types";
import {
setLogFormat,
isLogFormat,
log,
error,
span,
warn,
getLogFormat,
} from "./log";
import { getIntConfig } from "./utils";

// Current scraped JSON version
const CURRENT_VERSION = 3;
// Number of terms to scrape (scrapes most recent `NUM_TERMS`)
const NUM_TERMS = getIntConfig("NUM_TERMS") ?? 2;
// Whether to always scrape the current term, even if it's not in the
// most recent `NUM_TERMS` terms.
const ALWAYS_SCRAPE_CURRENT_TERM: boolean =
getIntConfig("ALWAYS_SCRAPE_CURRENT_TERM") === 1;
// IO concurrency to use when downloading files.
// This is a completely arbitrary number.
const DETAILS_CONCURRENCY = getIntConfig("DETAILS_CONCURRENCY") ?? 128;
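
// All of the above are read from environment variables; a hypothetical
// invocation (the exact command depends on the repo's tooling) might look like:
//   NUM_TERMS=3 ALWAYS_SCRAPE_CURRENT_TERM=1 LOG_FORMAT=text npx ts-node index.ts

// Entry point: configures the log format from the environment, then runs the
// full crawl inside a top-level span, exiting 0 on success or 1 on a fatal
// error.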
async function main(): Promise<void> {
const rawLogFormat = process.env.LOG_FORMAT;
if (rawLogFormat != null) {
if (isLogFormat(rawLogFormat)) {
setLogFormat(rawLogFormat);
} else {
warn(`invalid log format provided`, { logFormat: rawLogFormat });
process.exit(1);
}
} else {
setLogFormat("text");
}
log(`starting crawler`, {
currentVersion: CURRENT_VERSION,
numTerms: NUM_TERMS,
detailsConcurrency: DETAILS_CONCURRENCY,
logFormat: getLogFormat(),
});
try {
// Create a new top-level span for the entire crawler operation.
// This simply logs before and after the operation
// so we know how long it took.
await span(`crawling Oscar`, {}, async () => crawl());
process.exit(0);
} catch (err) {
error(`a fatal error occurred while running the crawler`, err);
process.exit(1);
}
}
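
// Lists every available term on Oscar, decides which ones to scrape,
// scrapes them all in parallel, and finally writes an index of the results.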
async function crawl(): Promise<void> {
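  // Note: `span` logs before and after the given callback (with timing);
  // the callback's `setFinishFields` parameter attaches extra fields to the
  // finish log entry.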
const termsToScrape = await span(
`listing all terms`,
{},
async (setFinishFields) => {
const terms = await list();
const recentTerms = terms.slice(0, NUM_TERMS);
let toScrape = recentTerms;
if (ALWAYS_SCRAPE_CURRENT_TERM) {
// Make sure that, in addition to the most-recent terms,
// the 'current' term is also scraped. This is done by
// computing a rough estimate of the current term based on
// the current date.
//
// Motivation: at the beginning of 2023, Oscar had all 3 terms for the
// year (Spring, Summer, Fall) listed (but no courses were in Summer/
// Fall). In the past (to my knowledge), this wasn't the case; terms
// would only appear once the course schedule was released (in the
// middle of the prior semester). The crawler is configured to scrape
// the most recent 2 terms, so to make sure it continues to scrape the
// Spring schedule during the Spring semester, this was added as a
// workaround.
type TermLabel = "spring" | "summer" | "fall";
const getTermEstimate = (date: Date): TermLabel => {
const month = date.getMonth();
if (month <= 3 /* Until end of April */) {
return "spring";
}
if (month <= 6 /* Until end of July */) {
return "summer";
}
return "fall";
};
        /**
         * Reverse of getSemesterName from
         * https://github.com/gt-scheduler/website/blob/main/src/utils/semesters.ts:
         * maps a term label and year to the Oscar term codes that could
         * represent it (e.g. spring 2023 could be `202302` or `202303`).
         */
const termLabelToPossibleTermCodes = (
termString: TermLabel,
year: number
): string[] => {
switch (termString) {
case "spring":
return [`${year}02`, `${year}03`];
case "summer":
return [`${year}05`, `${year}06`];
case "fall":
return [`${year}08`, `${year}09`];
default:
throw new Error(`invalid term string: ${termString}`);
}
};
const now = new Date();
const currentTermEstimate = getTermEstimate(now);
const possibleTermCodes = termLabelToPossibleTermCodes(
currentTermEstimate,
now.getFullYear()
);
const matchingTerms = terms.filter((term) =>
possibleTermCodes.includes(term)
);
if (matchingTerms.length === 0) {
warn(`no terms match the current term estimate`, {
currentTermEstimate,
possibleTermCodesFromEstimate: possibleTermCodes,
actualTermCodes: terms,
});
} else {
const [matchingTerm] = matchingTerms;
const alreadyInRecentTerms = recentTerms.includes(matchingTerm);
if (!alreadyInRecentTerms) {
toScrape = [matchingTerm, ...recentTerms];
}
setFinishFields({
addedCurrentTerm: !alreadyInRecentTerms,
currentTerm: matchingTerm,
});
}
}
setFinishFields({
terms,
termsToScrape: toScrape,
recentTerms,
desiredNumTerms: NUM_TERMS,
});
return toScrape;
}
);
// Scrape each term in parallel
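  // (Promise.all is fail-fast: if any single term's crawl rejects, the whole
  // crawl aborts via the top-level catch in main.)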
await Promise.all(
termsToScrape.map(async (term) => {
// Set the base fields that are added to every span
const termSpanFields: Record<string, unknown> = {
term,
version: CURRENT_VERSION,
};
await span(`crawling term`, termSpanFields, () =>
crawlTerm(term, termSpanFields)
);
})
);
// Output a JSON file containing all of the scraped term files
await writeIndex();
}
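
// Scrapes a single term: downloads and parses the term-wide HTML page, then
// fetches per-course details concurrently before attaching them to the term
// data and writing the result to disk.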
async function crawlTerm(
term: string,
baseSpanFields: Record<string, unknown>
): Promise<void> {
// Alias the parameter so we can modify it
let spanFields = baseSpanFields;
// Download the term HTML page containing every course.
const html = await span(`downloading term`, spanFields, () => download(term));
const termData = await span(`parsing term data to JSON`, spanFields, () =>
parse(html, CURRENT_VERSION)
);
const allCourseIds = Object.keys(termData.courses);
const courseIdCount = allCourseIds.length;
spanFields = { ...spanFields, courseIdCount };
log(`collected all course ids`, { allCourseIds, ...spanFields });
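  // Per-course results, keyed by course ID; filled in concurrently below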
const allPrereqs: Record<string, Prerequisites | []> = {};
const allDescriptions: Record<string, string | null> = {};
await span(
`downloading & parsing prerequisite info & course descriptions`,
{ ...spanFields, concurrency: DETAILS_CONCURRENCY },
async () =>
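      // tiny-async-pool caps the number of in-flight course detail downloads
      // at DETAILS_CONCURRENCY. (This assumes the v1.x API, where asyncPool
      // returns a promise that resolves once every item has been processed;
      // v2+ returns an async iterator that would need `for await` instead.)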
asyncPool(DETAILS_CONCURRENCY, allCourseIds, async (courseId) => {
const [coursePrereqs, courseDescription] = await span(
`crawling individual course`,
{
...spanFields,
courseId,
},
          async (setFinishFields) => {
const [htmlLength, prereqs, description] = await crawlCourseDetails(
term,
courseId
);
            setFinishFields({
htmlLength,
hasDescription: description != null,
});
return [prereqs, description];
}
);
allPrereqs[courseId] = coursePrereqs;
allDescriptions[courseId] = courseDescription;
})
);
await span(`attaching prereq information`, spanFields, () =>
attachPrereqs(termData, allPrereqs)
);
await span(`attaching course descriptions`, spanFields, () =>
attachDescriptions(termData, allDescriptions)
);
await span(`writing scraped data to disk`, spanFields, () =>
write(term, termData)
);
}
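
// Downloads the details page for a single course and parses out its
// prerequisites and description. The raw HTML's length is also returned so
// the caller can log it.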
async function crawlCourseDetails(
term: string,
courseId: string
): Promise<
  [htmlLength: number, prereqs: Prerequisites | [], description: string | null]
> {
const detailsHtml = await downloadCourseDetails(term, courseId);
const prereqs = await parseCoursePrereqs(detailsHtml, courseId);
const description = parseCourseDescription(detailsHtml, courseId);
return [detailsHtml.length, prereqs, description];
}
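
// Start the crawler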
main();