-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathChapter_2.Rmd
346 lines (300 loc) · 12.1 KB
/
Chapter_2.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
---
title: "tidy Baseball Chapter 2"
author: "Chris Hamm"
date: "`r format(Sys.Date())`"
output:
html_document:
keep_md: TRUE
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.align = "center")
```
## Chapter 2 - *Introduction to R*
#### Starting here because Chapter 1 doesn't have much for us.
### Load packages, set seed, and note which package versions we will use.
```{r preliminaries}
library("Lahman")
library("tidyverse"); options(dplyr.width = Inf)
set.seed(8761825)
devtools::session_info()
```
### Section 2.3 - **Vectors and Warren Spahn**
```{r Ch2.S3}
W <- c(8, 21, 15, 21, 21, 22, 14)
L <- c(5, 10, 12, 14, 17, 14, 19)
Win.Pct <- 100 * W / (W + L)
Win.Pct
Year <- seq(1946, 1952)
Year <- 1946 : 1952
Age <- Year - 1921
# We need to make a data frame to work with the ggplot2 package, so I will do that here.
WaSp <- as.data.frame(cbind(Age, Year, Win.Pct))
WaSp
```
#### Figure 2.4
```{r Fig_2.4}
ggplot(WaSp, aes(y = Win.Pct, x = Age)) +
theme_bw() +
geom_point(size = 3) +
ylab("Win %") +
xlab("Age") +
ylim(20, 75)
```
### Section 2.4 - **Objects and containers in R**s
```{r Sec_2.4}
NL <- c("FLA", "STL", "HOU", "STL", "COL", "PHI", "PHI", "SFG", "STL", "SFG")
AL <- c("NYY", "BOS", "CHW", "DET", "BOS", "TBR", "NYY", "TEX", "TEX", "DET")
Winner <- c("NL", "AL", "AL", "NL", "NL", "NL", "AL", "NL", "NL", "NL")
N.Games <- c(6, 4, 4, 5, 4, 5, 6, 5, 7, 4)
Year <- 2003 : 2012
results <- matrix(c(NL, AL), 10, 2)
results
dimnames(results)[[1]] <- Year
dimnames(results)[[2]] <- c("NL Team", "AL Team")
results
Winner <- as.data.frame(Winner) # Note that ggplot2 only works with data frames, so we need to convert the object.
table(Winner)
```
#### Figure 2.5
```{r Fig_2.5}
ggplot(Winner, aes(x = Winner)) +
theme_bw() +
geom_bar() +
ylab("WS wins")
```
```{r Spahn}
Spahn <- read_csv("https://raw.githubusercontent.com/maxtoki/baseball_R/master/data/spahn.csv", col_names = TRUE) # Note the "raw." filepath to get the unformatted .csv
# create the FIP stat
Spahn <- Spahn %>%
mutate(FIP = ((13 * HR) + (3 * BB) -2 * SO) / IP)
head(Spahn)
pos <- order(Spahn$FIP)
head(Spahn[pos, c("Year", "Age", "W", "L", "ERA", "FIP")])
Spahn1 <- Spahn %>% filter(Tm == "BSN" | Tm == "MLN") %>% mutate(Tm = factor(Tm, levels = c("BSN", "MLN")))
head(Spahn1)
by(Spahn1[, c("W-L", "ERA", "WHIP", "FIP")], Spahn1$Tm, summary)
```
```{r Batting}
# Note, by loading the "Lahman" package we don't need to import the raw data. I have included the code to download the external file in case you want it
# Batting <- read_csv("https://raw.githubusercontent.com/maxtoki/baseball_R/master/data/Batting.csv", col_names = TRUE)
head(Batting)
dim(Batting)
Batting.60 <- Batting %>%
filter(yearID >= 1960 & yearID <= 1969)
head(Batting.60)
dim(Batting.60)
max(Batting.60$yearID)
min(Batting.60$yearID)
compute.hr <- function(pid){
d <- Batting.60 %>%
filter(playerID == pid)
sum(d$HR)
}
# The book way with multiple steps and a function:
players <- unique(Batting.60$playerID)
system.time(S <- sapply(players, compute.hr))
head(S)
length(S) #1786 players had home runs in the 1960's
# The tidy way, one line of code, really fast:
system.time(S1 <- Batting.60 %>%
group_by(playerID) %>%
summarize(sum(HR)))
head(S1)
dim(S1)
# create a data frame
dataframe.AB <- Batting %>%
select(playerID, AB, HR, SO) %>%
group_by(playerID) %>%
filter(!is.na(AB)) %>%
summarize(AB = sum(AB), HR = sum(HR), SO = sum(SO))
head(dataframe.AB)
dim(dataframe.AB)
# Note here that merging as described in the book makes no sense. Why add repeared rows of summarized data back to the original data frame "Batting"
Batting2 <- dplyr::full_join(Batting, dataframe.AB, by = "playerID")
head(Batting2)
dim(Batting2)
Batting.5000 <- dataframe.AB %>% filter(AB >= 5000)
dim(Batting.5000)
head(Batting.5000)
# This function "compute.hr" is now unnecessary, as we have already calculated the relevent statistics. See how, with one line of dplyr code, we can replace a function and other awkward subsetting.
Batting.5000 %>% filter(playerID == "aaronha01") %>%
summarize(AB = sum(AB, na.rm = TRUE),
HR = sum(HR, na.rm = TRUE),
SO = sum(AB, na.rm = TRUE))
```
#### Figure 2.8 - where we want to plot the SO/AB (Y) against the HR/AB (X)
```{r Fig_2.8}
ggplot(Batting.5000, aes(x = HR / AB, y = SO / AB)) +
theme_bw() +
xlim(0, 0.08) +
ylim(0, 0.4) +
geom_point(cex = 2) +
stat_smooth(method = "loess", col = "red") +
ylab("SO / AB") +
xlab("HR / AB")
```
### Chapter 2 exercises
1. Question 1
+ In R, place the stolen base, caught stealing, and game counts in the vectors SB, CS, and G.
```{r Ch2.Q1a}
# Import the Hall of Fame data set from the repo:
hof <- read_csv("https://raw.githubusercontent.com/maxtoki/baseball_R/master/data/hofbatting.csv", col_names = TRUE)
dim(hof)
head(hof)
hof$X2 <- gsub(" HOF", "", hof$X2)
head(hof$X2)
SB <- hof %>% select(SB) %>% arrange(desc(SB))
CS <- hof %>% select(CS) %>% arrange(desc(CS))
G <- hof %>% select(G) %>% arrange(desc(G))
```
+ For all players, compute the number of stolen base attempts SB + CS and store in the vector SB.Attempt.
```{r Ch2.Q1b}
SB.Attempt <- hof %>% select(SB, CS) %>% transmute(SB.Attempt = SB + CS) %>% arrange(desc(SB.Attempt))
head(SB.Attempt)
```
+ For all players, compute the success rate Success.Rate = SB / SB.Attempt.
```{r Ch2.Q1c}
Success.Rate <- hof %>% select(SB, CS) %>% transmute(Success.Rate = SB / (SB + CS)) %>% arrange(desc(Success.Rate))
head(Success.Rate)
```
+ Compute the number of stolen bases per game SB.Game = SB / Game.
```{Ch2.Q1d}
SB.Game <- hof %>% select(SB, G) %>% transmute(SB.Game = SB / G) %>% arrange(desc(SB.Game))
head(SB.Game)
```
+ Construct a scatterplot of the stolen bases per game against the success rates. Are there particular players with unusually high or low stolen base success rates? Which player had the greatest number of stolen bases per game?
```{r Ch2.Q1e}
ggplot(hof, aes(x = (SB / (SB + CS)), y = (SB / G)), label = X2) + theme_bw() + geom_point(size = 1.5) + xlab("Stolen Base \nSuccess Rate") + ylab("Stolen Bases \nper Game") + geom_text(size= 2, aes(label = X2), nudge_y = 0.0125)
```
2. Question 2 - Suppose one records the outcomes of a batter in ten plate appearances:
*Single*, *Out*, *Out*, *Single*, *Out*, *Double*, *Out*, *Walk*, *Out*, *Single*
+ Use the c function to collect these outcomes in a character vector "outcomes."
```{r Ch2.Q2a}
outcomes <- c("Single", "Out", "Out", "Single", "Out", "Double", "Out", "Walk", "Out", "Single")
```
+ Use the table function to construct a frequency table of "outcomes."
```{r Ch2.Q2b}
table(outcomes)
```
+ In tabulating these results, suppose one prefers the results to be ordered from least-successful to most-successful. Use the following
code to convert the character vector outcomes to a factor variable "f.outcomes."
```{r Ch2.Q2c1}
f.outcomes <- factor(outcomes, levels=c("Out", "Walk", "Single", "Double"))
```
+ Use the table function to tabulate the values in f.outcomes. How does the output differ from what you saw in part (b)?
```{r Ch2.Q2c2}
table(f.outcomes)
# The output in the first call was alphabetical.
```
+ Suppose you want to focus only on the walks in the plate appearances. Describe what is done in each of the following statements.
```{r Ch2.Q2c3}
outcomes == "Walk" # This calls all instances of Walk
sum(outcomes == "Walk") # This sums all instances of Walk
```
3. Question 3
+ In R, place the wins and losses in the vectors W and L, respectively. Also, create a character vector Name containing the last names of these pitchers.
```{r Ch2.Q3a}
# The Pitching database is contained in the Lahman package, but just in case you want to download the raw data from GitHub.
# Pitching <- read_csv("https://raw.githubusercontent.com/maxtoki/baseball_R/master/data/pitching.csv", col_names = TRUE)
dim(Pitching)
head(Pitching)
pitching.350 <- Pitching %>%
group_by(playerID) %>%
summarize(W = sum(W), L = sum(L), SO = sum(SO), BB = sum(BB)) %>%
filter(W >= 350) %>%
rename(Name = playerID)
dim(pitching.350)
head(pitching.350)
```
+ Compute the winning percentage for all pitchers defined by 100 × W/(W+L) and put these winning percentages in the vector Win.PCT.
```{r Ch2.Q3b}
Win.PCT <- pitching.350 %>%
mutate(Win.PCT = ((100 * W) / (W + L))) %>%
select(Name, Win.PCT)
Win.PCT
```
+ By use of the command Wins.350 <- data.frame(Name, W, L, Win.PCT) create a data frame Wins.350 containing the names, wins, losses, and winning percentages.
```{r Ch2.Q3c}
Wins.350 <- pitching.350 %>%
select(Name, W, L) %>%
inner_join(Win.PCT, by = "Name")
Wins.350
```
+ By use of the "order" function, sort the data frame Wins.350 by winning percentage. Among these pitchers, who had the largest and smallest winning percentages?
```{r Ch2.Q3d}
Wins.350 %>% arrange(desc(Win.PCT))
```
4. Question 4
+ In R, place the strikeout and walk totals from the 350 win pitchers in the vectors SO and BB, respectively. Also, create a character vector "Name" containing the last names of these pitchers.
```{r Ch2.Q4a}
# Already done!
pitching.350
```
+ Compute the strikeout-walk ratio by SO/BB and put these ratios in the vector SO.BB.Ratio.
```{r Ch2.Q4b}
SO.BB.Ratio <- pitching.350 %>%
mutate(SO.BB.Ratio = (SO / BB)) %>%
select(Name, SO.BB.Ratio)
SO.BB.Ratio
```
+ by use of the command "SO.BB <- data.frame(Name, SO, BB, SO.BB.Ratio)" create a data frame "SO.BB" containing the names, strikeouts, walks, and strikeout-walk ratios.
```{r Ch2.Q4c}
SO.BB <- pitching.350 %>%
mutate(SO.BB.Ratio = (SO / BB)) %>%
select(Name, SO, BB, SO.BB.Ratio)
SO.BB
```
+ By use of the subset function, find the pitchers who had a strikeout-walk ratio exceeding 2.8.
```{r Ch2.Q4d}
SO.BB %>% filter(SO.BB.Ratio > 2.8)
```
+ By use of the order function, sort the data frame by the number of walks. Did the pitcher with the largest number of walks have a high or low strikeout-walk ratio?
```{r Ch2.Q4e}
SO.BB %>% arrange(desc(BB))
# The pitcher with the most walks (Roger Clemens) had a high SO/BB ratio.
```
5. Question 5
+ Read the Lahman “pitching.csv” data file into R into a data frame Pitching.
```{r Ch2.Q5a}
# Already done! (see above code)
```
+ The following function computes the cumulative strikeouts, cumulative walks, mid career year, and the total innings pitched (measured in terms of outs) for a pitcher whose season statistics are stored in the data frame d.
```{r Ch2.Q5b1}
stats <- function(d){
c.SO <- sum(d$SO, na.rm=TRUE)
c.BB <- sum(d$BB, na.rm=TRUE)
c.IPouts <- sum(d$IPouts, na.rm=TRUE)
c.midYear <- median(d$yearID, na.rm=TRUE)
data.frame(SO = c.SO, BB = c.BB, IPouts = c.IPouts,
midYear=c.midYear)
}
```
+ Using the function "ddply"" (plyr package) together with the function stats, find the career statistics for all pitchers in the pitching dataset. Call this new data frame career.pitching.
```{r Ch2.Q5b2}
# We don't need to run the function, just use "summarize" in dplyr.
career.pitching <- Pitching %>% group_by(playerID) %>% summarize(SO = sum(SO, na.rm = TRUE), BB = sum(BB, na.rm = TRUE), IPouts = sum(IPouts, na.rm = TRUE), midyear = median(yearID, na.rm = TRUE))
head(career.pitching)
```
+ Use the merge function to merge the Pitching and career.pitching data frames.
```{r Ch2.Q5c}
# I confess this makes no sense to me because the dimensions of the data.frames are different.
dim(career.pitching)
dim(Pitching)
new.pitching <- right_join(Pitching, career.pitching, by = "playerID")
```
+ Use the subset function to construct a new data frame career.10000 consisting of data for only those pitchers with at least 10,000 career IPouts.
```{r Ch2.Q5d}
career.1000 <- career.pitching %>%
filter(IPouts >= 10000)
dim(career.1000)
head(career.1000)
```
+ For the pitchers with at least 10,000 career IPouts, construct a scatterplot of mid career year and ratio of strikeouts to walks. Comment on the general pattern in this scatterplot.
```{r Ch2.Q5e}
ggplot(career.1000, aes(x = midyear, y = (SO / BB))) +
theme_bw() +
geom_point(size = 1.5) +
ylab("Strikeout : Walk ratio") +
xlab("Midcareer Year") +
stat_smooth(method = lm, color = "black") # I see only a slightly positive trend between the SO / BB ratio and midcareer year.
```