-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathproject3_wednesday.rmd
130 lines (112 loc) · 6.08 KB
/
project3_wednesday.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
---
title: "DATA_607_Project3- Data Science Skills"
author: "Jill Anderson, Alvaro Bueno Castillo, Nathan Cooper, Silverio Vasquez, Sarah Wigodsky"
date: "October 16, 2017"
output: html_document
---
## What are the most valued data science skills?
To answer this question, we mined the job board glassdoor.com to identify which skills were requested the most frequently.
#### Importing Libraries
```{r libraries, eval=TRUE}
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(RCurl)))
suppressWarnings(suppressMessages(library(tidyr)))
```
#### Importing National Average Data Science Salary By Company
```{r salarydata, eval=TRUE}
# Download the per-company salary table from the group's GitHub repository.
# Text columns stay as character vectors rather than factors.
salary <- read.csv(
  file = "https://raw.githubusercontent.com/sjv1030/group_project/master/company_salary.csv",
  stringsAsFactors = FALSE
)
```
#### Tidying the Salary Data - Removing dollar signs and commas and storing the salary as a number
```{r tidysalary, eval=TRUE}
# Drop the first column of the raw import.
# NOTE(review): presumably a row index written when the CSV was saved - confirm.
salarytidy <- salary[, -1]

# Split the raw salary text at "per " into the amount and the pay period.
salarytidy <- salarytidy %>%
  separate(salary, c("salary", "pay_period"), sep = "per ")

# Remove thousands separators so each amount is a single run of digits.
salarytidy$salary <- gsub(",", "", salarytidy$salary)

# Pull the first number (with an optional decimal part) out of each amount.
# str_extract() returns exactly one value per row (NA when nothing matches),
# so the column keeps its length even if a row has no number or several
# numbers; unlist(str_extract_all()) would change the length in those cases
# and make the assignment fail. The decimal point is escaped so that it only
# matches a literal ".".
salarytidy$salary <- str_extract(
  salarytidy$salary,
  "[[:digit:]]+(\\.[[:digit:]]+)?"
)
salarytidy$salary <- as.numeric(salarytidy$salary)

head(salarytidy)
```
#### Converting Daily, Weekly, Monthly, and Hourly Salaries into a yearly salary, assuming a 40-hour, 5-day work week for 52 weeks (12 months) a year
```{r yearly-salary, eval=TRUE}
# pay_period is deliberately left as "month", "week", "hour" or "day" after the
# conversion so that it stays clear which salaries were scaled up to yearly.

# Number of pay periods in a year: 12 months, 52 weeks,
# 8 hours * 5 days * 52 weeks, and 5 days * 52 weeks.
periods_per_year <- c(month = 12, week = 52, hour = 8 * 5 * 52, day = 5 * 52)

# Look up the multiplier for every row at once. Pay periods that are not in
# the table, and missing pay periods, come back as NA and are given a
# multiplier of 1 so those salaries are left unchanged instead of raising an
# error in an if() test.
multiplier <- unname(periods_per_year[salarytidy$pay_period])
multiplier[is.na(multiplier)] <- 1

salarytidy$salary <- salarytidy$salary * multiplier
```
#### Importing the data from Glassdoor for Boston
```{r importingdataMA, eval=TRUE}
# Boston job postings. The file is declared as Latin-1 encoded and text
# columns are kept as character vectors.
joblistma <- read.csv(
  file = "https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_final_with_dupes_reduced.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "latin1"
)
```
#### Importing data from Glassdoor for NYC
```{r importingdataNY, eval=TRUE}
# New York job postings. The file is declared as Latin-1 encoded and text
# columns are kept as character vectors.
joblistny <- read.csv(
  file = "https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_ny.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "latin1"
)
```
#### Importing data from Glassdoor for SF
```{r importingdataSF, eval=TRUE}
# San Francisco job postings, read as Latin-1 with text kept as character.
# The first column is dropped straight after reading.
# NOTE(review): presumably an extra index column the other city files lack - confirm.
joblistsf <- read.csv(
  file = "https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_SF.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "latin1"
)[, -1]
```
#### Importing data from Glassdoor for CHI
```{r importingdataCHI, eval=TRUE}
# Chicago job postings, read as Latin-1 with text kept as character.
# The first column is dropped straight after reading.
# NOTE(review): presumably an extra index column the other city files lack - confirm.
joblistchi <- read.csv(
  file = "https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_CHI.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "latin1"
)[, -1]
```
#### Combining data tables from Boston, NYC, SF and Chicago
```{r combining-data, eval=TRUE}
# Stack the Boston, New York, San Francisco and Chicago postings into a single
# data frame, in that order.
city_tables <- list(joblistma, joblistny, joblistsf, joblistchi)
joblisttidy <- do.call(rbind, city_tables)
```
#### Eliminating Duplicate Entries
Since some companies post the same job on different days, the posted date column is eliminated prior to testing for duplicate entries.
```{r eliminate_duplicates, eval=TRUE}
# Remove the fifth column (the posted date) so that re-posts of the same job
# on different days compare as identical rows.
joblisttidy <- joblisttidy[, -5]

# Keep only the first occurrence of each remaining row.
joblisttidy <- joblisttidy[!duplicated(joblisttidy), , drop = FALSE]
```
#### Eliminating symbols in front of location and non-ASCII characters in descriptions
```{r location, eval = TRUE}
# Keep only the "City, ST" part of each location, e.g. "Boston, MA" or a
# two-word city such as "New York, NY"; anything in front of it is discarded.
# str_extract() returns exactly one value per row (NA when the pattern is not
# found), so the column keeps its length; unlist(str_extract_all()) would
# shrink or grow the vector for rows with zero or several matches and make the
# assignment fail.
joblisttidy$location <- str_extract(
  joblisttidy$location,
  "[[:upper:]]{1}[[:lower:]]{2,}, [[:alpha:]]{2}|[[:upper:]]{1}[[:lower:]]{2,} [[:alpha:]]{2,}, [[:alpha:]]{2}"
)

# Strip every non-ASCII character from the job descriptions.
joblisttidy$description <- gsub(
  "[^[:ascii:]]", "", joblisttidy$description,
  perl = TRUE
)
```
#### Identifying Job Descriptions that look for specific Computer Skills
```{r computerskills, eval=TRUE}
# Flag, for every posting, which computing skills its description mentions.
# Each new column is TRUE when the pattern is found in `description`.
compskills <- joblisttidy %>%
  mutate(
    python = grepl("python", description, ignore.case = TRUE),
    # Word boundaries stop "perl" from matching inside words like "properly".
    perl = grepl("\\bperl\\b", description, ignore.case = TRUE),
    # fixed = TRUE so the "+" characters are matched literally.
    Cplusplus = grepl("C++", description, fixed = TRUE),
    # Case-sensitive on purpose: matches "SQL" anywhere, e.g. inside "MySQL".
    SQL = grepl("SQL", description),
    # The trailing boundary keeps "java" from also counting "javascript".
    java = grepl("java\\b", description, ignore.case = TRUE),
    javascript = grepl("javascript", description, ignore.case = TRUE),
    # NOTE(review): only counts a standalone "R" immediately followed by a
    # comma, so mentions such as "R or Python" are not counted - confirm this
    # strictness is intended.
    R = grepl("\\bR\\b,", description),
    hadoop = grepl("hadoop", description, ignore.case = TRUE),
    spark = grepl("spark", description, ignore.case = TRUE),
    # Word boundaries stop "scala" from matching "scalable" / "scalability".
    scala = grepl("\\bscala\\b", description, ignore.case = TRUE)
  ) %>%
  select(
    job_title, company, python, perl, Cplusplus, SQL, java, javascript, R,
    hadoop, spark, scala
  )

# Count of TRUE / FALSE per skill.
summary(compskills)
```
#### Identifying analytical skills
```{r analytical-skills, eval=TRUE}
# Flag, for every posting, which analytical skills its description mentions.
# Each new column is TRUE when the pattern is found in `description`.
skills <- joblisttidy %>%
  mutate(
    machinelearning = grepl("machine learning", description, ignore.case = TRUE),
    # "statistical model" also covers "statistical models" / "modeling".
    statisticalmodeling = grepl("statistical model", description, ignore.case = TRUE),
    techwriting = grepl("technical writing", description, ignore.case = TRUE),
    # NOTE(review): "plateau" is presumably a misspelling of the Tableau
    # visualization tool. The column is kept unchanged for compatibility and a
    # separate `tableau` column is added after the existing ones.
    plateau = grepl("plateau", description, ignore.case = TRUE),
    # Case-sensitive so that only the capitalised "D3" is counted.
    d3 = grepl("D3", description),
    tableau = grepl("tableau", description, ignore.case = TRUE)
  ) %>%
  select(
    job_title, company, machinelearning, statisticalmodeling, techwriting,
    plateau, d3, tableau
  )

# Count of TRUE / FALSE per skill.
summary(skills)
```
#### Identifying soft skills
```{r soft-skills, eval=TRUE}
# Flag, for every posting, which soft skills its description mentions.
# Each new column is TRUE when the pattern is found in `description`,
# ignoring case. Stems such as "collaborat" cover several word forms
# ("collaborate", "collaboration", "collaborative").
softskills <- joblisttidy %>%
  mutate(
    collaborative = grepl("collaborat", description, ignore.case = TRUE),
    organized = grepl("organized", description, ignore.case = TRUE),
    # Accepts "self starter", "self-starter" and "selfstarter"; the plain
    # "self starter" pattern missed the hyphenated spelling.
    selfstarter = grepl("self[- ]?starter", description, ignore.case = TRUE),
    attndetail = grepl("attention to detail", description, ignore.case = TRUE),
    communication = grepl("communicat", description, ignore.case = TRUE),
    creative = grepl("creativ", description, ignore.case = TRUE),
    visualization = grepl("visualization", description, ignore.case = TRUE)
  ) %>%
  select(
    job_title, company, collaborative, organized, selfstarter, attndetail,
    communication, creative, visualization
  )

# Count of TRUE / FALSE per skill.
summary(softskills)
```