-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathChapter_4.Rmd
302 lines (258 loc) · 10.4 KB
/
Chapter_4.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
---
title: "tidy Baseball Chapter 4"
author: "Chris Hamm"
date: "`r format(Sys.Date())`"
output:
html_document:
keep_md: TRUE
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.align = "center")
```
## Chapter 4 - *The relation between runs and wins*
```{r preliminaries}
library("Lahman")
library("tidyverse"); options(dplyr.width = Inf)
set.seed(8761825)
devtools::session_info()
```
### Section 4.2 - Teams in Lahman's Database
```{r Sec_4.2}
# The Teams table is includedin the Lahman package
str(Teams)
tail(Teams)
dim(Teams)
# run differential is the difference between runs scored and runs allowed. Rather than use the with statements in section 4.2 we will use a dplyr pipeline
myteams <- Teams %>%
filter(yearID > 2000) %>%
select(teamID, yearID, lgID, G, W, L, R, RA) %>%
mutate(RD = R - RA, Wpct = W / (W + L))
tail(myteams)
```
#### Figure 4.1
```{r Fig_4.1}
ggplot(myteams, aes(x = RD, y = Wpct)) +
geom_point(cex = 2, alpha = 0.6) +
theme_bw() +
stat_smooth(method = lm, colour = "black") +
xlab("Run differential") +
ylab("Winning percentage")
```
```{r Sec_4.2b}
linfit <- lm(Wpct ~ RD, data = myteams)
summary(linfit)
# Wpct = 0.50 + 0.000628 * RD
myteams <- myteams %>%
mutate(linWpct = predict(linfit), linResiduals = residuals(linfit))
tail(myteams)
min(myteams$linResiduals)
max(myteams$linResiduals)
```
#### Figure 4.2
```{r Fig_4.2}
ggplot(myteams, aes(x = RD, y = linResiduals)) +
geom_point(size = 2) +
theme_bw() +
ylab("Residuals") +
xlab("Run differential") +
scale_y_continuous(limits = c(-0.09, 0.09)) +
scale_x_continuous(limits = c(-400, 400))
```
```{r Sec_4.4c}
mean(myteams$linResiduals)
linRMSE <- sqrt(mean(myteams$linResiduals^2))
linRMSE
nrow(subset(myteams, abs(linResiduals) < linRMSE)) / nrow(myteams)
nrow(subset(myteams, abs(linResiduals) < 2 * linRMSE)) / nrow(myteams)
```
### Section 4.4 - Now let's explore the Pythagorean expectation
```{r Sec_4.4}
myteams <- myteams %>%
mutate(pytWpct = R^2 / (R^2 + RA^2), pytResiduals = Wpct - pytWpct)
tail(myteams)
sqrt(mean(myteams$pytResiduals^2))
ggplot(myteams, aes(y = pytWpct, x = Wpct)) +
theme_bw() +
geom_point(cex = 2, alpha = 0.6) +
ylab("Pythagorean Win %") +
xlab("Win %") +
stat_smooth(method = "lm", color = "black")
lm2 <- lm(pytWpct ~ Wpct, data = myteams)
summary(lm2) # 0.034 + 0.93*Wpct
# I hate repeating myself in code so here is a function to compare the Bill James Pythagorean model with the linear model. RunsSc = Runs Scored, RunsAll = Runs Allowed, N = exponant
Pythag.line <- function(RunsSc, RunsAll, N){
linear <- 0.50 + 0.000628 * (RunsSc - RunsAll)
Pythag <- RunsSc^N / (RunsSc^N + RunsAll^N)
return(list = c("linear" = linear, "Pythagorean" = Pythag))
}
Pythag.line(1620, 810, N = 2)
Pythag.line(186, 0, N = 2)
Pythag.line(100, 100, N = 2)
```
### Section 4.5
```{r Sec_4.5}
# Section 4.5 - the exponent of the Pythagorean formula
myteams <- myteams %>%
mutate(logWratio = log(W / L),
logRratio = log(R / RA))
tail(myteams)
pytFit <- lm(logWratio ~ logRratio, data = myteams)
summary(pytFit) # suggests a Pythagorean exponent of 1.88 rather than 2.
```
### Section 4.6 - Good and Bad predictions based on the Pythagorean Formula
```{r Sec_4.6}
games1 <- Pythag.line(875, 737, N = 2)
games1 * 162
games1a <- Pythag.line(875, 757, N = 1.88)
games1a * 162
gl2011 <- read_delim("Data/gl2011.txt", delim = ",", col_names = FALSE)
head(gl2011) # needs headers
glheaders <- read_csv("Data/game_log_header.csv")
glheaders
names(gl2011) <- names(glheaders)
head(gl2011)
BOS2011 <- gl2011 %>% filter(HomeTeam == "BOS" | VisitingTeam == "BOS") %>%
select(VisitingTeam, HomeTeam, VisitorRunsScored, HomeRunsScore) %>%
mutate(ScoreDiff = ifelse(HomeTeam == "BOS", yes = HomeRunsScore - VisitorRunsScored, no = VisitorRunsScored - HomeRunsScore),
W = ScoreDiff > 0)
head(BOS2011)
# aggregate(abs(BOS2011$ScoreDiff), list(W = BOS2011$W), summary)
results <- gl2011 %>%
select(VisitingTeam, HomeTeam, VisitorRunsScored, HomeRunsScore) %>%
mutate(winner = ifelse(HomeRunsScore > VisitorRunsScored, yes = as.character(HomeTeam), no = as.character(VisitingTeam)),
diff = abs(VisitorRunsScored - HomeRunsScore))
head(results)
onerungames <- results %>% filter(diff == 1)
dim(onerungames)
head(onerungames)
onerunwins <- onerungames %>%
group_by(winner) %>%
tally()
names(onerunwins) <- c("teamID", "onerunW")
onerunwins
teams2011 <- myteams %>% filter(yearID == 2011)
teams2011[teams2011$teamID == "LAA", "teamID"] <- "ANA"
teams2011 <- merge(teams2011, onerunwins)
head(teams2011)
```
#### Figure 4.3
```{r Fig_4.3}
ggplot(teams2011, aes(x = onerunW, y = pytResiduals)) +
theme_bw() +
geom_point(cex = 2) +
xlab("One run wins") +
ylab("Pythagorean residuals")
```
```{r Code}
head(Pitching)
top_closers <- Pitching %>%
filter(GF > 50, ERA < 2.5) %>%
select(playerID, yearID, teamID)
head(top_closers)
teams_top_closers <- merge(myteams, top_closers)
summary(teams_top_closers$pytResiduals)
mean(teams_top_closers$pytResiduals) * 162 # teams with a top closer will outperform their Pythagorean expectation by 0.8 games
```
### Section 4.7 - How many runs for a win?
```{r Sec_4.7}
IR <- function(RS, RA){
round((RS^2 + RA^2)^2 / (2 * RS * RA^2), 1)
}
IRtable <- expand.grid(RS = seq(3, 6, 0.5), RA = seq(3, 6, 0.5))
dim(IRtable)
rbind(head(IRtable), tail(IRtable))
IRtable <- IRtable %>% mutate(IRW = IR(RS, RA))
head(IRtable)
xtabs(IRW ~ RS + RA, data = IRtable)
```
### Chapter 4 exercises
1. Section 4.3 used a simple linear model to predict a team’s winning percentage based on its run differential. This model was fit using team data since the 2001 season.
+ Refit this linear model using data from the seasons 1961-1970, the seasons 1971-1980, the seasons 1981-1990, and the seasons 1991-2000.
```{r Ch4.Q1a}
# I am sure there is a "tidier" way to do this, but this is my first crack:
yr61_70 <- Teams %>%
filter(yearID >= 1961 & yearID <= 1970) %>%
select(teamID, yearID, lgID, G, W, L, R, RA) %>%
mutate(RD = R - RA, Wpct = W / (W + L))
lm61_70 <- lm(Wpct ~ RD, data = yr61_70)
yr71_80 <- Teams %>%
filter(yearID >= 1971 & yearID <= 1980) %>%
select(teamID, yearID, lgID, G, W, L, R, RA) %>%
mutate(RD = R - RA, Wpct = W / (W + L))
lm71_80 <- lm(Wpct ~ RD, data = yr71_80)
yr81_90 <- Teams %>%
filter(yearID >= 1981 & yearID <= 1990) %>%
select(teamID, yearID, lgID, G, W, L, R, RA) %>%
mutate(RD = R - RA, Wpct = W / (W + L))
lm81_90 <- lm(Wpct ~ RD, data = yr81_90)
yr91_2k <- Teams %>%
filter(yearID >= 1991 & yearID <= 2000) %>%
select(teamID, yearID, lgID, G, W, L, R, RA) %>%
mutate(RD = R - RA, Wpct = W / (W + L))
lm91_2k <- lm(Wpct ~ RD, data = yr91_2k)
```
+ Compare across the five decades the predicted winning percentage for a team with a run differential of 10 runs.
```{r Ch4.Q1b}
Q1teams <- Teams %>%
filter(yearID >= 1951 & yearID <= 2000) %>%
select(teamID, yearID, lgID, G, W, L, R, RA) %>%
mutate(RD = R - RA, Wpct = W / (W + L))
Q1teams %>% filter(RD == 10) # It looks like only two teams has an RD of 10.
```
2. *Pythagorean Residuals for Poor and Great Teams in the 19th Century.* As baseball was evolving into its ultimate form, nineteenth century leagues often featured abysmal teams that did not even succeed in finishing their season, as well as some dominant clubs.
+ Fit a Pythagorean formula model to the run-differential, win-loss data for teams who played in the 19th century.
```{r Ch4.Q2a}
C19teams <- Teams %>%
filter(yearID >= 1800 & yearID <= 1900) %>%
mutate(RD = R - RA, Wpct = W / (W + L), pytWpct = R^2 / (R^2 + RA^2), pytResiduals = Wpct - pytWpct)
head(C19teams)
```
+ By inspecting the residual plot of your fitted model from (a), did the great and poor teams in the 19th century do better or worse than one would expect on the basis of their run differentials?
```{r Ch4.Q2b}
ggplot(C19teams, aes(y = pytWpct, x = Wpct)) +
theme_bw() +
geom_abline(slope = 1, color = "red") +
geom_point(size = 1.5) +
ylab("Pythagorean residuals") +
xlab("Winning percentage") +
xlim(0, 1) +
ylim(0, 1) +
coord_fixed(ratio = 1)
# It would appear that the Pythagorean expectations hold for 19th Century teams as well, but at the lower extremes there is a bit of spread.
```
3. *Exploring the manager effect in baseball*. Retrosheet game logs report, for every game played, the managers of both teams.
+ Select a period of your choice (encompassing at least ten years) and fit the Pythagorean formula model to the run-differential, win-loss data.
```{r Ch4.Q3a}
# I was at game one of the 1988 World Series and love the Dodgers, so I'll select the 1980's.
Teams80s <- Teams %>%
filter(yearID >= 1980 & yearID <= 1989) %>%
mutate(RD = R - RA, Wpct = W / (W + L), pytWpct = R^2 / (R^2 + RA^2), pytResiduals = Wpct - pytWpct) %>% arrange(pytResiduals)
head(Teams80s)
# For kicks
Teams80s %>% filter(franchID == "LAD" & yearID == 1988)
ggplot(Teams80s, aes(y = pytWpct, x = Wpct)) +
theme_bw() +
geom_abline(slope = 1, color = "red") +
geom_point(size = 1.5) +
ylab("Pythagorean residuals") +
xlab("Winning percentage") +
xlim(0.3, 0.7) +
ylim(0.3, 0.7) +
coord_fixed(ratio = 1)
```
+ On the basis of your fit in part (a) and the list of managers, compile a list of the managers who most overperformed their Pythagorean winning percentage and the managers who most underperformed it.
```{r Ch4.Q3b}
# To find the most over- and under-performing managers, I'll use the residuals of the five years that had the largest positive and negative differences between actual winning percentange and Pythagorean WP.
Bot <- Teams80s %>% slice(1:5)
Bot$name
# It gives me a bit of pleasure to see that The The Angels Angels of Anaheim are on this list.
Top <- Teams80s %>% slice(256:260)
Top$name
head(Managers)
# The most underperforming Pythagorean teams from the 1980s are:
left_join(Bot, Managers, by = c("teamID", "yearID", "W"))
# This in interesting. Notet that there is a blank for playerID fot the STL 1980 (they had FOUR managers that year), CAL 1981 (2 managers - note this was a strike year), and MIL 1980 (2 managers)spots. This is beacuse there were muliple managers for these teams. "leylaji99" is Jim Leyland, and "tannech01" was Chuck Tanner.
# The most overperforming Pythagorean teams from the 1980s are:
left_join(Top, Managers, by = c("teamID", "yearID", "W"))
# "rodgebu01" is Buck Rogers, "howear01" is Art Howe, "weaveea99" was Earl Weaver, "johnsda02" is Davey Johnson, and "mcnamjo99" is John McNamara.
```