-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathproject3.Rmd
67 lines (47 loc) · 2.89 KB
/
project3.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
---
title: "project2"
author: "Alvaro Bueno"
date: "10/16/2017"
output: html_document
---
```{r setup, include=FALSE}
library(rvest)
#library(tidyverse)
library(stringr)
```
```{r cars}
# Empty accumulator for the scraped offers; one row per job posting.
jobOffers <- data.frame(
  job_title = character(),
  company = character(),
  location = character(),
  description = character(),
  age = character(),
  stringsAsFactors = FALSE
)

# Glassdoor "data scientist" search URLs, relative to base_url.
# Only main_url is crawled below; the Boston / NY / SF variants are kept so the
# crawl can be pointed at another city by swapping the variable used.
base_url <- "https://www.glassdoor.com"
main_url_boston <- "/Job/jobs.htm?suggestCount=0&suggestChosen=true&clickSource=searchBtn&typedKeyword=data+sc&sc.keyword=data+scientist&locT=C&locId=1154532&jobType="
main_url_ny <- "/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=data+scientist&sc.keyword=data+scientist&locT=C&locId=1132348&jobType="
main_url_sf <- "/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=data+scientist&sc.keyword=data+scientist&locT=C&locId=1147401&jobType="
main_url <- "/Job/jobs.htm?suggestCount=0&suggestChosen=true&clickSource=searchBtn&typedKeyword=data+sci&sc.keyword=data+scientist&locT=C&locId=1128808&jobType="

# XPath selectors shared by the first page and every follow-up page.
job_link_xpath <- '//a[contains(@class, "jobLink")]'
next_link_xpath <- '//li[contains(@class, "next")]//a'

# Download the first results page once and read both the job links and the
# pagination link from the same parsed document.
# html_attr() is rvest's own accessor, so this does not depend on xml2 being
# attached alongside rvest.
results_page <- read_html(paste0(base_url, main_url))
links <- results_page %>%
  html_nodes(xpath = job_link_xpath) %>%
  html_attr("href")
next_link <- results_page %>%
  html_nodes(xpath = next_link_xpath) %>%
  html_attr("href")

count <- 0
# Follow the "next" link for at most 20 further pages. Stop early when the
# page has no usable next link, instead of re-requesting the bare base URL.
while (count < 20 && length(next_link) > 0 && !is.na(next_link[1])) {
  results_page <- read_html(paste0(base_url, next_link[1]))
  newLinks <- results_page %>%
    html_nodes(xpath = job_link_xpath) %>%
    html_attr("href")
  links <- c(links, newLinks)
  next_link <- results_page %>%
    html_nodes(xpath = next_link_xpath) %>%
    html_attr("href")
  count <- count + 1
}
```
```{r}
# Scrape the detail page of at most the first 400 collected job links and
# append one data-frame row set per successfully parsed posting to jobOffers.
# head() (unlike links[1:400]) does not pad with NA when fewer than 400 links
# were collected; links without an href are dropped for the same reason.
job_links <- head(links[!is.na(links)], 400)

# Collect per-job data frames in a preallocated list and bind them once at the
# end, rather than growing jobOffers with rbind() on every iteration.
scraped_rows <- vector("list", length(job_links))

for (i in seq_along(job_links)) {
  tryCatch({
    the_html <- read_html(paste0(base_url, job_links[[i]]))

    job_title <- the_html %>%
      html_nodes(xpath = '//div[contains(@class, "header")]//h2') %>%
      html_text()
    company <- the_html %>%
      html_nodes(xpath = '//div[contains(@class, "header")]//span[contains(@class, "ib padRtSm")]') %>%
      html_text()
    location <- the_html %>%
      html_nodes(xpath = '//div[contains(@class, "header")]//span[contains(@class, "subtle ib")]') %>%
      html_text()
    description <- the_html %>%
      html_nodes(xpath = '//div[contains(@class, "jobDescriptionContent")]') %>%
      html_text()
    age <- the_html %>%
      html_nodes(xpath = '//div[contains(@class, "cell alignRt showDesk")]//span[contains(@class, "minor nowrap")]') %>%
      html_text()

    # data.frame() raises an error when a field matched no node (length 0 vs
    # length 1); that error is caught below and the posting is skipped.
    scraped_rows[[i]] <- data.frame(
      job_title = job_title,
      company = company,
      location = location,
      description = description,
      age = age,
      stringsAsFactors = FALSE
    )
    print("job added.")
  }, error = function(e) {
    # Best effort: report the failure and carry on with the next link.
    cat("ERROR :", conditionMessage(e), "\n")
  })
}

# Slots of failed postings are still NULL; drop them before binding.
scraped_rows <- Filter(Negate(is.null), scraped_rows)
jobOffers <- do.call(rbind, c(list(jobOffers), scraped_rows))

write.csv(jobOffers, file = "jobOffers_CHI.csv")
```