"""
Author: Blake McBride ([email protected])
Created: 12/02/2023
Overview: This file defines the `WebScraper` class which will be used to extract the text data some webpages at given URLS from the client
"""

# import standard library modules
import re

# import third-party modules
import requests
from bs4 import BeautifulSoup
from alive_progress import alive_bar

# import src modules
from src.webScraping.document import Document
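
# Note: the imported `Document` class is assumed to be a simple container
# exposing `title`, `text`, and `url` attributes; a hypothetical sketch of
# that interface (not necessarily the actual implementation in src/webScraping):
#
#     from dataclasses import dataclass
#
#     @dataclass
#     class Document:
#         title: str
#         text: str
#         url: str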


class WebScraper:
    """
    A class for extracting text data from webpages
    """

    def __init__(self) -> None:
        """
        Initializes a `WebScraper` object with an empty corpus
        """
        self.corpus = []

    def getWebpageText(self, response: requests.Response) -> str:
        """
        Extracts and preprocesses the text data from a webpage, given a `requests.Response` object

        Args:
            response (requests.Response): the response returned by `requests.get` for the webpage

        Returns:
            str: A clean string of the text data from the webpage
        """
        soup = BeautifulSoup(response.text, 'html.parser')

        # drop non-visible content before extracting the text
        for script_or_style in soup(['script', 'style']):
            script_or_style.extract()
        text = soup.get_text()

        # collapse runs of whitespace into single spaces
        cleaned_text = re.sub(r'\s+', ' ', text)
        return cleaned_text
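
    # Example of the whitespace cleanup above (illustrative only):
    #     re.sub(r'\s+', ' ', "Hello\n\n\tWorld  ")  ->  "Hello World "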

    def scrapeWebpages(self, urls: list) -> list:
        """
        Extracts text data from the webpage at each given url and appends a `Document` for each page to the `WebScraper.corpus` list

        Args:
            urls (list): one or more urls for the webpages you want to scrape

        Returns:
            list: the updated corpus of `Document` objects
        """
        # deduplicate the urls before scraping
        urls = list(set(urls))
        with alive_bar(len(urls)) as bar:
            for url in urls:
                try:
                    # timeout avoids hanging indefinitely on unresponsive hosts
                    response = requests.get(url, timeout=10)

                    # derive a filesystem-safe title from the page's <title> tag,
                    # falling back to the url itself
                    soup = BeautifulSoup(response.content, 'html.parser')
                    title_tag = soup.find('title')
                    title = title_tag.text if title_tag else url
                    title = re.sub(r'[:|/]', '-', title)
                    title = re.sub(r'[?"*<>]', '', title)

                    text = self.getWebpageText(response)
                    doc = Document(title, text, url)
                    self.corpus.append(doc)
                except Exception as e:
                    print(e)
                    print(f"Error: Failed to scrape webpage at {url}\n\nPlease verify that this is a valid url.")
                bar()
        return self.corpus

    def saveCorpus(self, filepath='sample_data') -> None:
        """
        Saves the corpus of scraped webpages to .txt files

        Args:
            filepath (str): The directory to save the files to (Default is `sample_data`)
        """
        # `self.corpus` is a list of `Document` objects, so iterate over it
        # directly rather than calling `.items()` (which only exists on dicts);
        # assumes `Document` exposes `title` and `text` attributes
        for doc in self.corpus:
            with open(f'{filepath}/{doc.title}.txt', 'w', encoding='utf-8') as file:
                file.write(doc.text)
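

# Minimal usage sketch (the URLs below are hypothetical placeholders, and the
# `filepath` directory passed to `saveCorpus` is assumed to already exist):
if __name__ == "__main__":
    scraper = WebScraper()
    scraper.scrapeWebpages([
        "https://example.com",
        "https://example.org",
    ])
    scraper.saveCorpus("sample_data")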