From 0c9c86ea2c67221f4597e7b0603e19964f5a9fa3 Mon Sep 17 00:00:00 2001 From: Achim Zeileis Date: Fri, 18 Feb 2022 17:04:58 +0100 Subject: [PATCH] Count data: From basic probability theory to regression models (#73) * added FIFA2018 goals data to illustrate basic Poisson distribution and regression * use prop.table() instead of proportions() for now to be compatible with older R versions * more specific comment regarding 'expected probabilities' from Poisson * re-ran devtools::document() --- R/FIFA2018.R | 105 +++++++++++++++++++++++++++++++++++++++++ data/FIFA2018.rda | Bin 0 -> 1887 bytes man/FIFA2018.Rd | 116 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 221 insertions(+) create mode 100644 R/FIFA2018.R create mode 100644 data/FIFA2018.rda create mode 100644 man/FIFA2018.Rd diff --git a/R/FIFA2018.R b/R/FIFA2018.R new file mode 100644 index 00000000..fb24b420 --- /dev/null +++ b/R/FIFA2018.R @@ -0,0 +1,105 @@ +#' Goals scored in all 2018 FIFA World Cup matches +#' +#' Data from all 64 matches in the 2018 FIFA World Cup along with predicted +#' ability differences based on bookmakers' odds. +#' +#' To investigate the number of goals scored per match in the 2018 FIFA World Cup, +#' \code{FIFA2018} provides two rows, one for each team, for each of the matches +#' during the tournament. In addition to some basic meta-information for the matches +#' (an ID, team name abbreviations, type of match, group vs. knockout stage), +#' information on the estimated log-ability for each team is provided. These +#' have been estimated by Zeileis et al. (2018) prior to the start of the +#' tournament (2018-05-20) based on quoted odds from 26 online bookmakers using +#' the bookmaker consensus model of Leitner et al. (2010). The difference in +#' log-ability between a team and its opponent is a useful predictor for the +#' number of goals scored. +#' +#' To model the data a basic Poisson regression model provides a good fit. 
+#' This treats the number of goals by the two teams as independent given the +#' ability difference which is a reasonable assumption in this data set. +#' +#' @usage data("FIFA2018", package = "distributions3") +#' +#' @format A data frame with 128 rows and 7 columns. +#' \describe{ +#' \item{goals}{integer. Number of goals scored in normal time (90 minutes), +#' i.e., excluding potential extra time or penalties in knockout matches.} +#' \item{team}{character. 3-letter FIFA code for the team.} +#' \item{match}{integer. Match ID ranging from 1 (opening match) to 64 (final).} +#' \item{type}{factor. Type of match for groups A to H, round of 16 (R16), quarter final, +#' semi-final, match for 3rd place, and final.} +#' \item{stage}{factor. Group vs. knockout tournament stage.} +#' \item{logability}{numeric. Estimated log-ability for each team based on +#' bookmaker consensus model.} +#' \item{difference}{numeric. Difference in estimated log-abilities between +#' a team and its opponent in each match.} +#' } +#' +#' @source The goals for each match have been obtained from Wikipedia +#' (\url{https://en.wikipedia.org/wiki/2018_FIFA_World_Cup}) and the log-abilities +#' from Zeileis et al. (2018) based on quoted odds from Oddschecker.com and Bwin.com. +#' +#' @references Leitner C, Zeileis A, Hornik K (2010). +#' Forecasting Sports Tournaments by Ratings of (Prob)abilities: A Comparison for the EURO 2008. +#' \emph{International Journal of Forecasting}, \bold{26}(3), 471-481. +#' \doi{10.1016/j.ijforecast.2009.10.001} +#' +#' Zeileis A, Leitner C, Hornik K (2018). +#' Probabilistic Forecasts for the 2018 FIFA World Cup Based on the Bookmaker Consensus Model. +#' Working Paper 2018-09, Working Papers in Economics and Statistics, +#' Research Platform Empirical and Experimental Economics, University of Innsbruck. 
+#' \url{https://EconPapers.RePEc.org/RePEc:inn:wpaper:2018-09} +#' +#' @examples +#' ## load data +#' data("FIFA2018", package = "distributions3") +#' +#' ## observed relative frequencies of goals in all matches +#' obsrvd <- prop.table(table(FIFA2018$goals)) +#' +#' ## expected probabilities assuming a simple Poisson model, +#' ## using the average number of goals across all teams/matches +#' ## as the point estimate for the mean (lambda) of the distribution +#' p_const <- Poisson(lambda = mean(FIFA2018$goals)) +#' p_const +#' expctd <- pdf(p_const, 0:6) +#' +#' ## comparison: observed vs. expected frequencies +#' ## frequencies for 3 and 4 goals are slightly overfitted +#' ## while 5 and 6 goals are slightly underfitted +#' cbind("observed" = obsrvd, "expected" = expctd) +#' +#' ## instead of fitting the same average Poisson model to all +#' ## teams/matches, take ability differences into account +#' m <- glm(goals ~ difference, data = FIFA2018, family = poisson) +#' summary(m) +#' ## when the ratio of abilities increases by 1 percent, the +#' ## expected number of goals increases by around 0.4 percent +#' +#' ## this yields a different predicted Poisson distribution for +#' ## each team/match +#' p_reg <- Poisson(lambda = fitted(m)) +#' head(p_reg) +#' +#' ## as an illustration, the following goal distributions +#' ## were expected for the final (that France won 4-2 against Croatia) +#' p_final <- tail(p_reg, 2) +#' p_final +#' pdf(p_final, 0:6) +#' ## clearly France was expected to score more goals than Croatia +#' ## but both teams scored more goals than expected, albeit not implausibly many +#' +#' ## assuming independence of the number of goals scored, obtain +#' ## table of possible match results (after normal time), along with +#' ## overall probabilities of win/draw/lose +#' res <- outer(pdf(p_final[1], 0:6), pdf(p_final[2], 0:6)) +#' sum(res[lower.tri(res)]) ## France wins +#' sum(diag(res)) ## draw +#' sum(res[upper.tri(res)]) ## France loses +#' +#' ## 
update expected frequencies table based on regression model +#' expctd <- pdf(p_reg, 0:6) +#' head(expctd) +#' expctd <- colMeans(expctd) +#' cbind("observed" = obsrvd, "expected" = expctd) +"FIFA2018" diff --git a/data/FIFA2018.rda b/data/FIFA2018.rda new file mode 100644 index 0000000000000000000000000000000000000000..3caf9868d3abda792a6dda07b2462435b193f0b3 GIT binary patch literal 1887 zcmV-l2cY;LiwFP!000001MQZ1P*YbF#$OU5n+xu{qG;U_jl19_K!T`XOh83DwkZZg z1EC>W*BU!AZn#vzWvbP2>EPgktt+iw>%LUY(7Iq;f>qFQL)@@cX)Ncw+&-Q|Gxg7A z=9lljd%t_`dH0>SXd_1V^cn5Lah#O%kVrYH%-QsiI{ze`2j}fnFLjh!*)ym|KaP{t zcOH8>spZ7#M9#T>5+^cH3G~T8rA#0CWK18&%#(m$Dl(5n_>&6e<4?vMOBf&9C!EK& z*<4sZ>l4Prb!7sd9JGeWc}Yc-U6+l8>cO0!aDOC%xng_CaaD5{#uDbp!Mg9-e%!k{ zsH_v2Hcp4lp*kftRgv-7jMK)6c*PA&1o3I&OGGQR*o@M~U{f=k?^WqEBE90%7_WFR zW*XyFYx&HSu6#8r-W#Ec5ot1BCW831YL`%IBl*;I!>}2y9gfYINTrBZ+`vQ-pC-OU zxHcS{I&By>!(6HB;-W-)#iudeYJ)KmB3)Y&ts2dHN2&NSa$QZP$#|Iv;?t^KB22~a zjz-B}!q_-hoyA0oc*PA&1o3I&ON0;S4MQ|cDf7KLS8*{0rkC-$6O8e)MeC{^=I$21 zO__V1DK6$_V2Xgb0L+%KC*-;;Y{9saWHyz_j7gnYjah<;%R~@QT`d}cF^FGSrdQ40 zEyFq2`&jt?#CMPfAO~syo`4tN4fp`QKuw?)P#dTN_yKi+dO&@k0niX=1T+Sk08N2r zKy#o4&=P0`v%LrOBb6z1{Jl&h0d^+rLC@&uzk9aH2JPt+$^OlD{6?SClk|M{@E`>6Q9yek`N z1y@-qQY@b&vuSb~9C%GiH6=_kO}12;WoQ;HtH0l3R&sPziD{kOMvD)9)4V|9PtTpc#PxK&BJlL?)-Jc zdKT6x)Hi?Kux@bQasTir&o;R+Cc|bfq4(>Dh1$prEz|Z#!_LV!B3Y(dJxE zx$yx(P1)h!D{wzYQH!1|+qQRYSAu$PR7LKd2*@Tv{18SwTKihAUbq9UughM-=`F5Fzt+mC{hGLK~ebt^I`>f`to>Otjw z2q-<1GJEjS7dC?WxbGkKH5Y;@C~@T0ER4E3m4cG%+H3N_CwD2T(aJ^64dMtWeH=U8 zUZgV;Q0iEcStuPjn1IrPr1rLu^)`Z9mXgxS4FPO-atU< zvqoFzuiLkZpceZ2ZVXWMqoCx;zK+Y^T+F4Qq#)x&#-tHm6!psMS@jg(h7#1Lv6uRE zzx#@!zI+jRdEvo<1ognxtBN+g92As1i=7#_d8CG-mij*oiVeC-P|IG&-+H)fCIux= zrq$Wxb8!bnJz+Ow{UY}xpmfhy9lA`64kf5X3X(DLMg#>VzmDBCta-E56g9tbo}yEZ zgMiYXy^kh1Olbs^78b1=eP^zXfKt0-`CDJDhM*q5)-3&T=W>d=d*|BZ3Ed78)RKP5 zr-xq8q@d)IIV5^ivKIvADC!%W+TYZ+idzG5VL&u Zg)Fr?|0*gftnhjB_Af7ULD}FJ002ootd;-( literal 0 HcmV?d00001 diff --git a/man/FIFA2018.Rd b/man/FIFA2018.Rd new file mode 100644 index 
00000000..76cba8a8 --- /dev/null +++ b/man/FIFA2018.Rd @@ -0,0 +1,116 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FIFA2018.R +\docType{data} +\name{FIFA2018} +\alias{FIFA2018} +\title{Goals scored in all 2018 FIFA World Cup matches} +\format{ +A data frame with 128 rows and 7 columns. +\describe{ +\item{goals}{integer. Number of goals scored in normal time (90 minutes), +i.e., excluding potential extra time or penalties in knockout matches.} +\item{team}{character. 3-letter FIFA code for the team.} +\item{match}{integer. Match ID ranging from 1 (opening match) to 64 (final).} +\item{type}{factor. Type of match for groups A to H, round of 16 (R16), quarter final, +semi-final, match for 3rd place, and final.} +\item{stage}{factor. Group vs. knockout tournament stage.} +\item{logability}{numeric. Estimated log-ability for each team based on +bookmaker consensus model.} +\item{difference}{numeric. Difference in estimated log-abilities between +a team and its opponent in each match.} +} +} +\source{ +The goals for each match have been obtained from Wikipedia +(\url{https://en.wikipedia.org/wiki/2018_FIFA_World_Cup}) and the log-abilities +from Zeileis et al. (2018) based on quoted odds from Oddschecker.com and Bwin.com. +} +\usage{ +data("FIFA2018", package = "distributions3") +} +\description{ +Data from all 64 matches in the 2018 FIFA World Cup along with predicted +ability differences based on bookmakers' odds. +} +\details{ +To investigate the number of goals scored per match in the 2018 FIFA World Cup, +\code{FIFA2018} provides two rows, one for each team, for each of the matches +during the tournament. In addition to some basic meta-information for the matches +(an ID, team name abbreviations, type of match, group vs. knockout stage), +information on the estimated log-ability for each team is provided. These +have been estimated by Zeileis et al. 
(2018) prior to the start of the +tournament (2018-05-20) based on quoted odds from 26 online bookmakers using +the bookmaker consensus model of Leitner et al. (2010). The difference in +log-ability between a team and its opponent is a useful predictor for the +number of goals scored. + +To model the data a basic Poisson regression model provides a good fit. +This treats the number of goals by the two teams as independent given the +ability difference which is a reasonable assumption in this data set. +} +\examples{ +## load data +data("FIFA2018", package = "distributions3") + +## observed relative frequencies of goals in all matches +obsrvd <- prop.table(table(FIFA2018$goals)) + +## expected probabilities assuming a simple Poisson model, +## using the average number of goals across all teams/matches +## as the point estimate for the mean (lambda) of the distribution +p_const <- Poisson(lambda = mean(FIFA2018$goals)) +p_const +expctd <- pdf(p_const, 0:6) + +## comparison: observed vs. expected frequencies +## frequencies for 3 and 4 goals are slightly overfitted +## while 5 and 6 goals are slightly underfitted +cbind("observed" = obsrvd, "expected" = expctd) + +## instead of fitting the same average Poisson model to all +## teams/matches, take ability differences into account +m <- glm(goals ~ difference, data = FIFA2018, family = poisson) +summary(m) +## when the ratio of abilities increases by 1 percent, the +## expected number of goals increases by around 0.4 percent + +## this yields a different predicted Poisson distribution for +## each team/match +p_reg <- Poisson(lambda = fitted(m)) +head(p_reg) + +## as an illustration, the following goal distributions +## were expected for the final (that France won 4-2 against Croatia) +p_final <- tail(p_reg, 2) +p_final +pdf(p_final, 0:6) +## clearly France was expected to score more goals than Croatia +## but both teams scored more goals than expected, albeit not implausibly many + +## assuming independence of the number 
of goals scored, obtain +## table of possible match results (after normal time), along with +## overall probabilities of win/draw/lose +res <- outer(pdf(p_final[1], 0:6), pdf(p_final[2], 0:6)) +sum(res[lower.tri(res)]) ## France wins +sum(diag(res)) ## draw +sum(res[upper.tri(res)]) ## France loses + +## update expected frequencies table based on regression model +expctd <- pdf(p_reg, 0:6) +head(expctd) +expctd <- colMeans(expctd) +cbind("observed" = obsrvd, "expected" = expctd) +} +\references{ +Leitner C, Zeileis A, Hornik K (2010). +Forecasting Sports Tournaments by Ratings of (Prob)abilities: A Comparison for the EURO 2008. +\emph{International Journal of Forecasting}, \bold{26}(3), 471-481. +\doi{10.1016/j.ijforecast.2009.10.001} + +Zeileis A, Leitner C, Hornik K (2018). +Probabilistic Forecasts for the 2018 FIFA World Cup Based on the Bookmaker Consensus Model. +Working Paper 2018-09, Working Papers in Economics and Statistics, +Research Platform Empirical and Experimental Economics, University of Innsbruck. +\url{https://EconPapers.RePEc.org/RePEc:inn:wpaper:2018-09} +} +\keyword{datasets}