Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: sync data from dbh #35

Merged
merged 9 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion clients/client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from bs4 import BeautifulSoup
import requests

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

class Client:
session = None
Expand All @@ -11,3 +12,19 @@ def __init__(self):

def init_soup(self, page_text: str):
return BeautifulSoup(page_text, "html5lib")

def requests_retry_session(
    self,
    session=None,
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
):
    """Return a session whose HTTP and HTTPS adapters retry failed requests.

    This is invoked as a bound method with the session as the first
    positional argument, so ``self`` must be declared and ``session`` must
    come first; otherwise the instance is bound to ``retries`` and the
    session to ``backoff_factor``.

    Args:
        session: Existing ``requests.Session`` to configure. A new session
            is created when ``None``.
        retries: Retry budget, used for the total, read and connect limits.
        backoff_factor: Backoff factor handed to ``urllib3``'s ``Retry``.
        status_forcelist: HTTP status codes that trigger a retry.

    Returns:
        The session that was passed in (or the newly created one) with a
        retrying adapter mounted for both URL schemes.
    """
    if session is None:
        session = requests.Session()

    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    # One adapter instance serves both schemes.
    for prefix in ("http://", "https://"):
        session.mount(prefix, adapter)
    return session
271 changes: 212 additions & 59 deletions clients/course_pages.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,87 @@
import re

from grades.models import Course
from typing import Optional

import bs4
from .client import Client


class CoursePagesClient(Client):
# Root URL of the Norwegian course pages (ntnu.no).
base_url = "https://www.ntnu.no/studier/emner"
# Root URL of the English course pages (ntnu.edu).
base_url_eng = "https://www.ntnu.edu/studies/courses"
# Lower-case phrases that use_english_version() looks for as substrings of a
# lower-cased text to decide that the English text should be used instead.
# Entries must stay lower-case because the text is lower-cased before matching.
USE_ENGLISH_VERSION_FILTERS = [
"se engelsk versjon",
"see english version",
"se engelsk beskrivelse",
"se engelsk tekst",
"se engelsk emnebeskrivelse",
"se engelsk utgave",
"see engelsk version",
"see english text",
]

def extract_div_content(self, soup, div_id):
    """Flatten the direct children of the div with id ``div_id`` into text.

    Paragraphs are prefixed with a blank line, every ``li`` found inside a
    ``ul``/``ol`` child becomes a ``- `` bullet line, and bare text nodes are
    added stripped. Other child tags are ignored.

    Args:
        soup: Parsed document to search.
        div_id: Value of the ``id`` attribute of the wanted ``div``.

    Returns:
        The joined text with surrounding whitespace removed, or ``""`` when
        no such div exists.
    """
    container = soup.find("div", {"id": div_id})
    if not container:
        return ""

    paragraph_prefix = "\n\n"
    bullet_prefix = "\n- "
    pieces = []

    for node in container.children:
        tag = node.name
        if not tag:
            # No tag name: a text node sitting directly inside the div.
            if node.string:
                pieces.append(node.strip())
            continue

        if tag == "p":
            pieces.append(paragraph_prefix + node.get_text(strip=True))
        elif tag in ("ul", "ol"):
            for item in node.find_all("li"):
                pieces.append(bullet_prefix + item.get_text(strip=True))

    return "".join(pieces).strip()

def use_english_version(self, text):
    """Tell whether ``text`` should be replaced by the English version.

    Returns ``True`` when the text is blank, or when its lower-cased form
    contains any phrase from ``USE_ENGLISH_VERSION_FILTERS``; ``False``
    otherwise.
    """
    lowered = text.lower()
    if not lowered.strip():
        return True

    for marker in self.USE_ENGLISH_VERSION_FILTERS:
        if marker in lowered:
            return True
    return False

def get_course_url(self, course_code: str):
return f"{self.base_url}/{course_code}"
def extract_course_name(self, soup):
name_raw = soup.title.get_text().split("-")

def parse_card_content_by_title(self, soup, title: str):
card_title = soup.find("div", {"class": "card-header"}, string=title)
facts_card = card_title.parent
card_body = facts_card.find("div", {"class": "card-body"})
if len(name_raw) <= 4:
return name_raw[1][1 : len(name_raw[1]) - 1]

content_text = card_body.get_text()
search_pattern = re.compile(r"(?P<name>\S+):(\s+)(?P<value>(\S| )+)")
results = search_pattern.finditer(self.normalize(content_text))
name = name_raw[1][1 : len(name_raw[1])]
for i in range(2, len(name_raw) - 4):
name += "-"
name += name_raw[i]
name += "-"
name += name_raw[len(name_raw) - 3][0 : len(name_raw[len(name_raw) - 3]) - 1]

content = {}
for match in results:
groupdict = match.groupdict()
key = groupdict.get("name")
value = groupdict.get("value")
content[key] = value
return content
return name

def extract_has_digital_exam(self, soup):
    """Tell whether the exam section lists an exam that uses INSPERA.

    Looks up the element with id ``omEksamen``, then its first ``dl``, and
    walks that list's ``dt`` entries. An entry counts only when it has both
    an ``exam-term`` element and an ``exam-system`` element whose stripped
    text is exactly ``"INSPERA"``.

    Args:
        soup: Parsed course page.

    Returns:
        ``True`` on the first matching entry; ``False`` when the section,
        the list, or a matching entry is missing.
    """
    om_eksamen: Optional[bs4.element.Tag] = soup.find(attrs={"id": "omEksamen"})
    if om_eksamen is None:
        return False

    dl: Optional[bs4.element.Tag] = om_eksamen.find("dl")
    if dl is None:
        return False

    for dt in dl.find_all("dt"):
        term: Optional[bs4.element.Tag] = dt.find(class_="exam-term")
        if term is None:
            continue

        system: Optional[bs4.element.Tag] = dt.find(class_="exam-system")
        # An entry can carry a term without naming an exam system; skip it
        # instead of dereferencing None.
        if system is None:
            continue

        if system.text.strip() == "INSPERA":
            return True

    return False

@staticmethod
def get_study_level_from_description(description: str):
Expand Down Expand Up @@ -62,62 +116,161 @@ def get_study_level_from_description(description: str):
def normalize(string: str):
    """Replace non-breaking-space residue in ``string`` with plain spaces.

    The two-character sequence ``"\\xa0\\xc2"`` is collapsed to a single
    space first; any remaining ``"\\xc2"`` or ``"\\xa0"`` character is then
    replaced by a space of its own.
    """
    cleaned = string
    # Order matters: the pair must be handled before its single characters.
    for residue in ("\xa0\xc2", "\xc2", "\xa0"):
        cleaned = cleaned.replace(residue, " ")
    return cleaned

def request_course_page(self, course_code: str):
course_url = self.get_course_url(course_code)
page_response = self.session.get(course_url)
page_text = self.normalize(page_response.text)
page_soup = self.init_soup(page_text)
def get_course_data(self, code, year: int = None):
year_segment = ""
if year:
year_segment += f"/{str(year)}"

no_content_title = "Ingen info for gitt studieår"
page_title = (
page_soup.find("div", {"id": "course-details"}).h1.get_text().strip()
)
base_url_no = f"https://www.ntnu.no/studier/emner/{code}{year_segment}"
data_no = self.requests_retry_session(self.session).get(url=base_url_no)
text_no = self.normalize(data_no.text)
soup_no = self.init_soup(text_no)

if page_title == no_content_title:
no_content_title = "Det finnes ingen informasjon for dette studieåret"
course_detail_h1 = no_content_title
try:
course_detail_h1 = (
soup_no.find_all("div", {"id": "course-details"})[0]
.h1.get_text()
.strip()
)
except IndexError:
print("Something very wrong for course: " + code)

if course_detail_h1 == no_content_title:
print(
f"No info found for course {code}"
+ (f" for year {year}" if year else "")
)
return None

facts_title_no = "Fakta om emnet"
facts = self.parse_card_content_by_title(page_soup, facts_title_no)
no_longer_taught_text = "Det tilbys ikke lenger undervisning i emnet."
try:
strong_text = soup_no.find("div", {"class": "content"}).p.strong.get_text(
strip=True
)
if strong_text == no_longer_taught_text:
year_info = f" in year {year}" if year else ""
print(f"Course {code} not taught{year_info}")
return None
except Exception:
pass

has_digital_exam = self.extract_has_digital_exam(soup_no)

base_url_eng = f"https://www.ntnu.edu/studies/courses/{code}{year_segment}"
data_eng = self.requests_retry_session(self.session).get(url=base_url_eng)
text_eng = self.normalize(data_eng.text)
soup_eng = self.init_soup(text_eng)

facts_about_course = ""
try:
facts_about_course = (
soup_no.findAll("div", {"class": "card-body"})[1]
.p.get_text()
.split(":")
)
except IndexError:
print("Cannot find facts about course at all, code " + code)

course_version = facts.get("Versjon")
credits_string = facts.get("Studiepoeng")
course_level_description = facts.get("Studienivå")
study_level = self.get_study_level_from_description(course_level_description)
credit = -1
try:
credit = float(facts_about_course[2].split("\n")[2][20:24])
except:
print("Not valid number")

norwegian_name = self.extract_course_name(soup_no)
english_name = self.extract_course_name(soup_eng)

if len(facts_about_course) > 3:
course_level_text = facts_about_course[3].split("\n")[0][
1 : len(facts_about_course[3].split("\n")[0])
]
study_level = self.get_study_level_from_description(course_level_text)
else:
study_level = 0

last_year_taught = 0
taught_from = 2008
taught_in_autumn = False
taught_in_spring = False
taught_in_english = False

place = ""

education = self.parse_card_content_by_title(page_soup, "Undervisning")
language = education.get("Undervisningsspråk")
taught_in_english = language == "Engelsk"
try:
undervisning = soup_no.find_all("div", {"class": "card-body"})[2]
classes = undervisning.get_text().split("Undervises")
try:
place = undervisning.get_text().split("Sted:")[1].strip()
except IndexError:
print("Cannot get place")
for elements in classes:
if "HØST" in elements:
taught_in_autumn = True
if "VÅR" in elements:
taught_in_spring = True
if "Engelsk" in elements:
taught_in_english = True
except IndexError:
print("Cannot get undervisning")

term: str = education.get("Undervises")
taught_in_spring = True if "VÅR" in term else None
taught_in_autumn = True if "HØST" in term else None
exam_type = ""
grade_type = ""
try:
exam_type_raw = (
soup_no.find_all("div", {"class": "content-assessment"})[0]
.p.contents[0]
.strip()
.split(":")[1]
)
exam_type = exam_type_raw[1 : len(exam_type_raw)]
except IndexError:
print("Cannot get exam type")

place = education.get("Sted")
try:
grade_type_raw = (
soup_no.find_all("div", {"class": "content-assessment"})[0]
.p.contents[2]
.strip()
.split(":")[1]
)
grade_type = grade_type_raw[1 : len(grade_type_raw)]
except IndexError:
print("Cannot get exam type")

content = page_soup.find("div", {"id": "course-content-toggler"}).p.get_text()
learning_form = page_soup.find(
"div", {"id": "learning-method-toggler"}
).p.get_text()
learning_goal = page_soup.find(
"div", {"id": "learning-goal-toggler"}
).p.get_text()
content = self.extract_div_content(soup_no, "course-content-toggler")
if self.use_english_version(content):
content = self.extract_div_content(soup_eng, "course-content-toggler")

data = {
learning_form = self.extract_div_content(soup_no, "learning-method-toggler")
if self.use_english_version(learning_form):
learning_form = self.extract_div_content(
soup_eng, "learning-method-toggler"
)

learning_goal = self.extract_div_content(soup_no, "learning-goal-toggler")
if self.use_english_version(learning_goal):
learning_goal = self.extract_div_content(soup_eng, "learning-goal-toggler")

course = {
"norwegian_name": norwegian_name,
"english_name": english_name,
"code": code,
"credit": credit,
"study_level": study_level,
"taught_in_english": taught_in_english,
"last_year_taught": last_year_taught,
"taught_from": taught_from,
"taught_in_autumn": taught_in_autumn,
"taught_in_spring": taught_in_spring,
"place": place,
"taught_in_english": taught_in_english,
"content": content,
"learning_form": learning_form,
"learning_goal": learning_goal,
"exam_type": exam_type,
"grade_type": grade_type,
"place": place,
"has_had_digital_exam": has_digital_exam,
}

return {k: v for k, v in data.items() if v is not None}

def update_course(self, course_code: str):
course_data = self.request_course_page(course_code)
Course.objects.filter(code=course_code).update(**course_data)
return course
Loading
Loading