-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathglassdoor_scraper.py
173 lines (142 loc) · 5.8 KB
/
glassdoor_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import progressbar
import requests
from bs4 import BeautifulSoup
def get_position_links(url, timeout=10):
    '''
    Collects the job application links found on a single Glassdoor page.

    The page is requested with a browser-like User-Agent header, parsed with
    BeautifulSoup, and every anchor with class 'jobLink' that carries a
    non-empty href is turned into an absolute URL.

    Args:
        url (str): The URL of the page.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list: A list containing links for job applications on the page
            (empty when no matching anchors are present).

    Raises:
        requests.RequestException: If the request fails or times out.
    '''
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, headers=header, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links with class 'jobLink'; anchors without an href are
    # skipped because there is nothing to build a URL from.
    hrefs = (anchor.get('href') for anchor in soup.find_all('a', class_='jobLink'))
    # urljoin handles relative hrefs as well as ones that are already absolute.
    return [urljoin('https://www.glassdoor.com', href) for href in hrefs if href]
def get_all_links(num_pages, base_url):
    '''
    Collects all job application links from multiple pages.

    Pages are requested in order as '<base_url><page>.htm', starting at
    page 1, with a short pause between requests. The crawl stops at the
    first page that cannot be fetched, and whatever was collected up to
    that point is returned.

    Args:
        num_pages (int): Number of pages to crawl.
        base_url (str): The base URL of a single page.

    Returns:
        list: A list of lists containing job application links, one inner
            list per successfully crawled page.
    '''
    all_links = []
    page = 1
    print('Collecting links....')
    while page <= num_pages:
        # Construct the URL for the current page
        url_main = f'{base_url}{page}.htm'
        try:
            page_links = get_position_links(url_main)
        except Exception as exc:
            # Stop instead of retrying the same page forever, and say why.
            print(f'No more pages found (stopped at page {page}: {exc!r}).')
            break
        all_links.append(page_links)
        page += 1
        # Small pause between requests to avoid hammering the site.
        time.sleep(0.5)
    return all_links
def _text_or_nan(parent, tag, css_class):
    '''Returns the stripped text of the first matching element, or NaN if absent.'''
    element = parent.find(tag, class_=css_class) if parent is not None else None
    if element is None:
        return np.nan
    return element.text.strip()


def scrape_job_page(url, timeout=10):
    '''
    Collects data from a single job application page and stores it in a dictionary.

    Every text field that cannot be located on the page is stored as NaN.
    'date_posted' falls back to today's date when the posting age is missing
    or cannot be parsed, and 'job_description' is an empty list when the
    page has no 'jobDesc' block.

    Args:
        url (str): The URL of a job application page.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: A dictionary containing collected data under the keys
            'job_title', 'company_name', 'location', 'salary_estimated',
            'salary_min', 'salary_max', 'date_posted' and 'job_description'.

    Raises:
        requests.RequestException: If the request fails or times out.
    '''
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get(url, headers=header, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.find('body')

    data_dict = {
        'job_title': _text_or_nan(body, 'h2', 'noMargTop margBotXs strong'),
        'company_name': _text_or_nan(body, 'span', 'strong ib'),
    }

    location = _text_or_nan(body, 'span', 'subtle ib')
    if isinstance(location, str):
        # Drop the dash + non-breaking space that precedes the location text.
        location = location.replace('–\xa0', '')
    data_dict['location'] = location

    data_dict['salary_estimated'] = _text_or_nan(body, 'h2', 'salEst')
    data_dict['salary_min'] = _text_or_nan(body, 'div', 'minor cell alignLt')
    data_dict['salary_max'] = _text_or_nan(body, 'div', 'minor cell alignRt')

    today = datetime.today().date()
    posted = _text_or_nan(body, 'span', 'minor nowrap')
    if isinstance(posted, str):
        try:
            # Parse and convert the date to a standardized format
            data_dict['date_posted'] = parse_date(posted)
        except (ValueError, IndexError):
            # Unparseable posting age: fall back to today.
            data_dict['date_posted'] = today
    else:
        data_dict['date_posted'] = today

    # A page without a description block yields an empty list rather than an
    # exception, so the other collected fields are not lost.
    job_des = body.find('div', class_='jobDesc') if body is not None else None
    if job_des is None:
        data_dict['job_description'] = []
    else:
        data_dict['job_description'] = [li.text.strip() for li in job_des.find_all('li')]
    return data_dict
def parse_date(date_str):
    '''
    Converts a relative posting age (e.g. '3 days ago') to a calendar date.

    Matching is case-insensitive and ignores punctuation around words.
    Ages expressed in seconds, minutes or hours map to today. Weeks count
    as 7 days and months as 30 days. The count is the first word that is an
    integer (a trailing '+' as in '30+' is ignored) or the article 'a'/'an',
    which counts as 1. When no known unit or no count is found, today's
    date is returned.

    Args:
        date_str (str): The date string to parse.

    Returns:
        datetime.date: The estimated posting date.
    '''
    today = datetime.today().date()
    text = date_str.lower()
    words = [word.strip('.,;:!()+') for word in text.split()]

    sub_day_units = ('second', 'seconds', 'minute', 'minutes', 'hour', 'hours')
    if any(unit in words for unit in sub_day_units):
        return today

    if 'week' in words or 'weeks' in words:
        days_per_unit = 7
    elif 'day' in words or 'days' in words:
        days_per_unit = 1
    elif 'month' in text:
        # Substring match so that glued forms of 'month(s)' are recognised.
        days_per_unit = 30
    else:
        return today

    count = None
    for word in words:
        if word in ('a', 'an'):
            count = 1
            break
        try:
            count = int(word)
        except ValueError:
            continue
        break

    if count is None:
        # A unit without a usable count: treat as posted today.
        return today
    return today - timedelta(days=count * days_per_unit)
if __name__ == '__main__':
    # Script entry point: crawl the search-result pages, scrape every unique
    # job posting, and write the collected rows to 'data_glassdoor.xlsx'.

    # Specify the base URL for data science jobs on Glassdoor
    base_url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14_IP'
    # Number of pages to crawl
    num_pages = 30

    # Collect all job application links
    links = get_all_links(num_pages, base_url)

    # Flatten the list of links and remove duplicates. dict.fromkeys keeps
    # the first-seen order, so the crawl order is the same on every run.
    unique_links = list(dict.fromkeys(item for sublist in links for item in sublist))

    list_results = []
    failures = []
    if unique_links:
        # UI progress bar (only built when there is something to crawl)
        bar = progressbar.ProgressBar(
            maxval=len(unique_links),
            widgets=['Crawling the site: ', progressbar.Bar('=', '[', ']'), ' ',
                     progressbar.Percentage()])
        # Start the bar explicitly before the first update() and finish it
        # afterwards so that it reaches 100%.
        bar.start()
        for done, page in enumerate(unique_links, start=1):
            try:
                list_results.append(scrape_job_page(page))
            except Exception as exc:
                # Keep crawling, but remember the failure instead of hiding it.
                failures.append((page, exc))
            bar.update(done)
            # Small pause between requests to avoid hammering the site.
            time.sleep(0.5)
        bar.finish()

    for page, exc in failures:
        print(f'Skipped {page}: {exc!r}')

    # Save the data in a DataFrame
    df_glass = pd.DataFrame(list_results)
    # Save the DataFrame to an Excel file
    with pd.ExcelWriter('data_glassdoor.xlsx', engine='openpyxl') as writer:
        df_glass.to_excel(writer, index=False)