Skip to content

Commit

Permalink
New HTML scraping common functions
Browse files Browse the repository at this point in the history
  • Loading branch information
JackGilmore committed Oct 29, 2023
1 parent aafcfb9 commit 4da1363
Showing 1 changed file with 61 additions and 3 deletions.
64 changes: 61 additions & 3 deletions processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@


class Processor:
USER_AGENT = (
"Open Data Scotland Scraper - https://github.com/OpenDataScotland/the_od_bods"
)

# Type should be one of the following: 'dcat', 'arcgis', 'usmart'
def __init__(self, type):
self.type = type
Expand Down Expand Up @@ -66,6 +70,58 @@ def get_json(self, url):

return "NULL"

def get_html(self, url):
    """Fetch ``url`` with an HTTP GET request and return the body as text.

    The request carries the scraper's ``USER_AGENT`` header. The body is
    decoded with the default codec of ``bytes.decode()`` (UTF-8).

    Args:
        url: Address of the portal page to download.

    Returns:
        The decoded response body, or ``None`` when the request fails with
        an ``HTTPError`` or ``URLError``. Those failures are printed, not
        raised.
    """
    headers = {"User-Agent": self.USER_AGENT}
    req = request.Request(url, headers=headers)
    try:
        # Close the response deterministically instead of leaving the
        # underlying connection open until garbage collection.
        with request.urlopen(req) as response:
            return response.read().decode()
    except HTTPError as err1:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print(url, "cannot be accessed. The URL returned:", err1.code, err1.reason)
    except URLError as err2:
        print(type(err2))
        print(url, "cannot be accessed. The URL returned:", err2.reason)
    # Both failure paths fall through to an explicit "no content" result.
    return None

def get_html_head(self, url):
    """Send an HTTP HEAD request to ``url`` and return the response headers.

    The request carries the scraper's ``USER_AGENT`` header.

    Args:
        url: Address to query.

    Returns:
        The header object from ``response.info()``, or ``None`` when the
        request fails with an ``HTTPError`` or ``URLError``. Those failures
        are printed, not raised.
    """
    headers = {"User-Agent": self.USER_AGENT}
    # Request issues a GET unless told otherwise; ask for HEAD explicitly so
    # only the headers are transferred, not the whole body.
    req = request.Request(url, headers=headers, method="HEAD")
    try:
        # The header object stays usable after the response is closed.
        with request.urlopen(req) as response:
            return response.info()
    except HTTPError as err1:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print(url, "cannot be accessed. The URL returned:", err1.code, err1.reason)
    except URLError as err2:
        print(type(err2))
        print(url, "cannot be accessed. The URL returned:", err2.reason)
    # Both failure paths fall through to an explicit "no headers" result.
    return None

def get_http_content_length(self, url):
    """Look up the ``Content-Length`` of ``url`` via ``get_html_head``.

    This is a best-effort lookup: any ordinary failure yields ``None``
    instead of an exception.

    Args:
        url: Address of the file whose size is wanted.

    Returns:
        The ``Content-Length`` header value as returned by the header
        object, or ``None`` if it could not be determined.
    """
    try:
        response_headers = self.get_html_head(url)
        # Subscripting a ``None`` result (failed request) raises TypeError,
        # which the handler below turns into ``None``.
        return response_headers["Content-Length"]
    except Exception:
        # Deliberately broad so the lookup stays best-effort, but unlike a
        # bare ``except`` it lets KeyboardInterrupt and SystemExit through.
        return None

def get_license(self, dataset):
try:
# Known Licenses info
Expand Down Expand Up @@ -107,16 +163,18 @@ def write_csv(self, fname, prepped):
r[-1] = r[-1].replace("\n", " ")
w.writerow(r)

def write_json(self, fname, prepped):
    """Serialise ``prepped`` as JSON into the file ``fname``.

    The file is opened for writing with UTF-8 encoding, replacing any
    existing content, and the output is indented by four spaces.

    Args:
        fname: Path of the file to write.
        prepped: JSON-serialisable data to dump.
    """
    with open(fname, "w", encoding="utf8") as json_file:
        json.dump(prepped, json_file, indent=4)

def get_datasets(self, owner, url, fname):
    """Placeholder hook that only prints a reminder to override it.

    None of the arguments are used by this implementation.
    """
    reminder = "Override this method"
    print(reminder)

def process(self, file_type="csv"):
    """Run the harvest for every entry in ``self.urls``.

    Calls ``self.get_urls()`` first (presumably this fills ``self.urls``;
    confirm against its definition). Then, for each ``name``/``url`` pair,
    prints the name and calls ``self.get_datasets`` once with the output
    path ``data/<self.type>/<name>.<file_type>``.

    Args:
        file_type: Extension used for each output file name.
    """
    self.get_urls()

    for name, url in self.urls.items():
        print(name)
        output_path = os.path.join("data", self.type, f"{name}.{file_type}")
        # Exactly one harvest call per entry.
        self.get_datasets(name, url, output_path)

0 comments on commit 4da1363

Please sign in to comment.