This repository has been archived by the owner on May 5, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdalex-test.R
65 lines (53 loc) · 1.71 KB
/
dalex-test.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Investigating DALEX and modelDown with baseball pitch data
# Matt Dray
# Jan 2019
# Prepare workspace -------------------------------------------------------
library("readr")
library("dplyr")
library("DALEX")
library("modelDown")
# Data --------------------------------------------------------------------
# Load the three source tables. Identifier columns are forced to character
# ("c" is readr's compact spelling of col_character()) so they are treated as
# keys rather than numbers; all remaining column types are guessed by readr.

# One row per pitch, keyed to its at-bat by ab_id
pitches <- read_csv("data/pitches.csv", col_types = cols(ab_id = "c"))

# At-bat records, carrying the game, pitcher and batter identifiers
atbats <- read_csv(
  "data/atbats.csv",
  col_types = cols(ab_id = "c", g_id = "c", pitcher_id = "c", batter_id = "c")
)

# Game records, keyed by g_id
games <- read_csv("data/games.csv", col_types = cols(g_id = "c"))
# Join datasets
# To get dates for pitches we need to join 'games' to 'pitches' via 'atbats'
pitch <- pitches %>%
  # bring in the at-bat record for each pitch
  left_join(atbats, by = "ab_id") %>%
  # then bring in the game record, matched on g_id
  left_join(games, by = "g_id") %>%
  # keep only the columns of interest, in this order
  select(
    # Identifiers: game, at-bat, pitcher and batter
    g_id,
    ab_id,
    pitcher_id,
    batter_id,
    # Outcome ('type' is the simplified result: S = strike, B = ball,
    # X = in play)
    type,
    code,
    event,
    # Game information
    home_team,
    away_team,
    # Play state
    inning,       # which inning
    top,          # whether it is the top or bottom of the inning
    p_score,      # score for pitcher's team at time of pitch
    outs,         # number of outs before pitch is thrown
    on_1b,        # runners on base
    on_2b,
    on_3b,
    # Pitch characteristics
    pitch_type,   # type of pitch
    pitch_num,    # pitch number of at-bat
    p_throws,     # which hand the pitcher throws with
    px,           # location at plate: x = 0 is down the middle
    pz,           # location at plate: z = 0 is the ground
    spin_dir,     # spin direction
    spin_rate,    # spin speed
    start_speed,  # speed of pitch
    end_speed
  ) %>%
  # drop pitches with no recorded plate location (data is missing for ~14k)
  filter(!is.na(px))