From 0c9c86ea2c67221f4597e7b0603e19964f5a9fa3 Mon Sep 17 00:00:00 2001
From: Achim Zeileis <Achim.Zeileis@R-project.org>
Date: Fri, 18 Feb 2022 17:04:58 +0100
Subject: [PATCH] Count data: From basic probability theory to regression
 models (#73)

* added FIFA2018 goals data to illustrate basic Poisson distribution and regression

* use prop.table() instead of proportions() for now to be compatible with older R versions

* more specific comment regarding 'expected probabilities' from Poisson

* re-ran devtools::document()
---
 R/FIFA2018.R      | 105 +++++++++++++++++++++++++++++++++++++++++
 data/FIFA2018.rda | Bin 0 -> 1887 bytes
 man/FIFA2018.Rd   | 116 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 221 insertions(+)
 create mode 100644 R/FIFA2018.R
 create mode 100644 data/FIFA2018.rda
 create mode 100644 man/FIFA2018.Rd

diff --git a/R/FIFA2018.R b/R/FIFA2018.R
new file mode 100644
index 00000000..fb24b420
--- /dev/null
+++ b/R/FIFA2018.R
@@ -0,0 +1,105 @@
+#' Goals scored in all 2018 FIFA World Cup matches
+#'
+#' Data from all 64 matches in the 2018 FIFA World Cup along with predicted
+#' ability differences based on bookmakers' odds.
+#'
+#' To investigate the number of goals scored per match in the 2018 FIFA World Cup,
+#' \code{FIFA2018} provides two rows, one for each team, for each of the matches
+#' during the tournament. In addition to some basic meta-information for the matches
+#' (an ID, team name abbreviations, type of match, group vs. knockout stage),
+#' information on the estimated log-ability for each team is provided. These
+#' have been estimated by Zeileis et al. (2018) prior to the start of the
+#' tournament (2018-05-20) based on quoted odds from 26 online bookmakers using
+#' the bookmaker consensus model of Leitner et al. (2010). The difference in
+#' log-ability between a team and its opponent is a useful predictor for the
+#' number of goals scored.
+#' 
+#' To model the data a basic Poisson regression model provides a good fit.
+#' This treats the number of goals by the two teams as independent given the
+#' ability difference which is a reasonable assumption in this data set.
+#'
+#' @usage data("FIFA2018", package = "distributions3")
+#'
+#' @format A data frame with 128 rows and 7 columns.
+#' \describe{
+#'   \item{goals}{integer. Number of goals scored in normal time (90 minutes),
+#'     i.e., excluding potential extra time or penalties in knockout matches.}
+#'   \item{team}{character. 3-letter FIFA code for the team.}
+#'   \item{match}{integer. Match ID ranging from 1 (opening match) to 64 (final).}
+#'   \item{type}{factor. Type of match for groups A to H, round of 16 (R16), quarter final,
+#'     semi-final, match for 3rd place, and final.}
+#'   \item{stage}{factor. Group vs. knockout tournament stage.}
+#'   \item{logability}{numeric. Estimated log-ability for each team based on
+#'     bookmaker consensus model.}
+#'   \item{difference}{numeric. Difference in estimated log-abilities between
+#'     a team and its opponent in each match.}
+#'   }
+#'
+#' @source The goals for each match have been obtained from Wikipedia
+#'   (\url{https://en.wikipedia.org/wiki/2018_FIFA_World_Cup}) and the log-abilities
+#'   from Zeileis et al. (2018) based on quoted odds from Oddschecker.com and Bwin.com.
+#'
+#' @references Leitner C, Zeileis A, Hornik K (2010).
+#'   Forecasting Sports Tournaments by Ratings of (Prob)abilities: A Comparison for the EURO 2008.
+#'   \emph{International Journal of Forecasting}, \bold{26}(3), 471-481.
+#'   \doi{10.1016/j.ijforecast.2009.10.001}
+#'
+#' Zeileis A, Leitner C, Hornik K (2018).
+#'   Probabilistic Forecasts for the 2018 FIFA World Cup Based on the Bookmaker Consensus Model.
+#'   Working Paper 2018-09, Working Papers in Economics and Statistics,
+#'   Research Platform Empirical and Experimental Economics, University of Innsbruck. 
+#'   \url{https://EconPapers.RePEc.org/RePEc:inn:wpaper:2018-09}
+#'
+#' @examples
+#' ## load data
+#' data("FIFA2018", package = "distributions3")
+#'
+#' ## observed relative frequencies of goals in all matches
+#' obsrvd <- prop.table(table(FIFA2018$goals))
+#'
+#' ## expected probabilities assuming a simple Poisson model,
+#' ## using the average number of goals across all teams/matches
+#' ## as the point estimate for the mean (lambda) of the distribution
+#' p_const <- Poisson(lambda = mean(FIFA2018$goals))
+#' p_const
+#' expctd <- pdf(p_const, 0:6)
+#' 
+#' ## comparison: observed vs. expected frequencies
+#' ## frequencies for 3 and 4 goals are slightly overfitted
+#' ## while 5 and 6 goals are slightly underfitted
+#' cbind("observed" = obsrvd, "expected" = expctd)
+#'
+#' ## instead of fitting the same average Poisson model to all
+#' ## teams/matches, take ability differences into account
+#' m <- glm(goals ~ difference, data = FIFA2018, family = poisson)
+#' summary(m)
+#' ## when the ratio of abilities increases by 1 percent, the
+#' ## expected number of goals increases by around 0.4 percent
+#'
+#' ## this yields a different predicted Poisson distribution for
+#' ## each team/match
+#' p_reg <- Poisson(lambda = fitted(m))
+#' head(p_reg)
+#'
+#' ## as an illustration, the following goal distributions
+#' ## were expected for the final (that France won 4-2 against Croatia)
+#' p_final <- tail(p_reg, 2)
+#' p_final
+#' pdf(p_final, 0:6)
+#' ## clearly France was expected to score more goals than Croatia
+#' ## but both teams scored more goals than expected, albeit not implausibly many
+#'
+#' ## assuming independence of the number of goals scored, obtain
+#' ## table of possible match results (after normal time), along with
+#' ## overall probabilities of win/draw/lose
+#' res <- outer(pdf(p_final[1], 0:6), pdf(p_final[2], 0:6))
+#' sum(res[lower.tri(res)]) ## France wins
+#' sum(diag(res))           ## draw
+#' sum(res[upper.tri(res)]) ## France loses
+#'
+#' ## update expected frequencies table based on regression model
+#' expctd <- pdf(p_reg, 0:6)
+#' head(expctd)
+#' expctd <- colMeans(expctd)
+#' cbind("observed" = obsrvd, "expected" = expctd)
+"FIFA2018"
diff --git a/data/FIFA2018.rda b/data/FIFA2018.rda
new file mode 100644
index 0000000000000000000000000000000000000000..3caf9868d3abda792a6dda07b2462435b193f0b3
GIT binary patch
literal 1887
zcmV-l2cY;LiwFP!000001MQZ1P*YbF#$OU5n+xu{qG;U_jl19_K!T`XOh83DwkZZg
z1EC>W*BU!AZn#vzWvbP2>EPgktt+iw>%LUY(7Iq;f>qFQL)@@cX)Ncw+&-Q|Gxg7A
z=9lljd%t_`dH0>SXd_1V^cn5Lah#O%kVrYH%-QsiI{ze`2j}fnFLjh!*)ym|KaP{t
zcOH8>spZ7#M9#T>5+^cH3G~T8rA#0CWK18&%#(m$Dl(5n_>&6e<4?vMOBf&9C!EK&
z*<4sZ>l4Prb!7sd9JGeWc}Yc-U6+l8>cO0!aDOC%xng_CaaD5{#uDbp!Mg9-e%!k{
zsH_v2Hcp4lp*kftRgv-7jMK)6c*PA&1o3I&OGGQR*o@M~U{f=k?^WqEBE90%7_WFR
zW*XyFYx&HSu6#8r-W#Ec5ot1BCW831YL`%IBl*;I!>}2y9gfYINTrBZ+`vQ-pC-OU
zxHcS{I&By>!(6HB;-W-)#iudeYJ)KmB3)Y&ts2dHN2&NSa$QZP$#|Iv;?t^KB22~a
zjz-B}!q_-hoyA0oc*PA&1o3I&ON0;S4MQ|cDf7KLS8*{0rkC-$6O8e)MeC{^=I$21
zO__V1DK6$_V2Xgb0L+%KC*-;;Y{9saWHyz_j7gnYjah<;%R~@QT`d}cF^FGSrdQ40
zEyFq2`&jt?#CMPfAO~syo`4tN4fp`QKuw?)P#dTN_yKi+dO&@k0niX=1T+Sk08N2r
zKy#o4&=P0`v<CcvHb7e-0B8ra2R;Hi03Cr&Kxd!}&=u$g1OnZGAfN}(6HoxXfZjkK
zpfAu5=no741_HrA2rvjBKxid0VIBNiq;4W}6Aw3$yGac<@pKa}H}U?D_*8?h0KxtF
zkMH*yM#B_?F`cWtLu)~=1fL)he3?q{r74NP20qgyYHZ*mNHUmz#`Wj}P0taN^=6B~
z42K>%LrOBb6z1{Jl&h0d^+rLC@&uzk9aH2JPt+$^OlD{6?SClk|M{@E`>6Q9yek`N
z1y@-qQY@b&vuSb~9C%GiH6=_kO}12;WoQ;HtH0l3R&sPziD{kOMvD)9)4V|9PtT<H
zKAWhqk@I6`Z4Y^t#*Yj7(O<<Uh(m9Tj5o~M<wtLPy=O~tZ7caLeoN^qc@BBdDq)MF
zHHXJld@wGYhdzG(gK_yU&3#wWYl{{ta~Epp`K{RlPPVkseV2|*NtAQE!Z`02_}&+X
z6qoECGsSp?+BPhfs*jx_wwakb50;#ww!+2Ii|0DxxUe7nLSL0Qj?CP)!REpA-1txL
zPFY|jr4w6DT2h=u54XsU$Q_i&9}9dq{=RwWC#5x;PMq-heVU(FwqRs1N6)B=#|_+{
zMsMaE*u9`zD6cTi`vt!D#Ua1<zv44{h8MkK^51Z}$7*`(kp0K8C8fkJe=}tD)A{_k
zpdbBJe8M;<4n<|%9kYyH_8b$cUm8ljD`;}8^)@TD%}G_PYVU>pc#PxK&BJlL?)-Jc
zdKT6x)Hi?Kux@bQasTi<FfZIkyq>r&o;R+Cc|bfq4(>Dh1$prEz|Z#!_LV<IKlu3P
z!2QixFAs!eQPgWm@jq$XZXl?a<xAGh$<<I$@+fZlyf$A3Q&6&JUe_7Q>!B3Y(dJxE
zx$yx(P1)h!D{wzYQH!1|+qQRYSAu$PR7LK<u^&^^=Rd3uu3z7ufYN)Dx1TjFY(PNi
z^UbHal>d2*@Tv{18SwTKihAUbq9UughM-=`F5Fzt+mC{hGLK~ebt^I`>f`to>Otjw
z2q-<1GJEjS7dC?WxbGkKH5Y;@C~@T0ER4E3m4cG%+H3N_CwD2T(aJ^64dMtWeH=U8
zUZgV;Q0iEcStuPjn1IrPr1rLu^)`Z9mXg<?T}%iCCFObM*V|h!qNw>xS4FPO-atU<
zvqoFzuiLkZpceZ2ZVXWMqoCx;zK+Y^T+F4Qq#)x&#-tHm6!psMS@jg(h7#1Lv6uRE
zzx#@!zI+jRdEvo<1ognxtBN+g92As1i=7#_d8CG-mij*oiVeC-P|IG&-+H)fCIux=
zrq$Wxb8!bnJz+Ow{UY}xpmfhy9lA`64kf5X3X(DLMg#>VzmDBCta-E56g9tbo}yEZ
zgMiYXy^kh1Olbs^78b1=eP^zXfKt0-`CDJDhM*q5)-3&T=W>d=d*|BZ3Ed78)RKP5
zr-xq8q@d)IIV5^ivKIv<WevM#wHwlbf|5rEGdkym<xte!+KmeTr&a<=FGTO+qB5-n
zl-@pR&up7NouJx#9bXf?DU+h!ZZI|2p5!2)H2+A#&4Vkf1hpXQc;TICEhy^odPTkW
zHntK}+*jP^+y0jIE5g60pu(@xRDFs8ze+v%Zw-^)`85sSxE6yx1qV{}mV^oXz_c_2
z4y0T3NqFF6G$rXjOExB3reV+c<itdS*^ruGc+VG$&v?B>ADC!%W+TYZ+idzG5VL&u
Zg)Fr?|0*gftnhjB_Af7ULD}FJ002ootd;-(

literal 0
HcmV?d00001

diff --git a/man/FIFA2018.Rd b/man/FIFA2018.Rd
new file mode 100644
index 00000000..76cba8a8
--- /dev/null
+++ b/man/FIFA2018.Rd
@@ -0,0 +1,116 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FIFA2018.R
+\docType{data}
+\name{FIFA2018}
+\alias{FIFA2018}
+\title{Goals scored in all 2018 FIFA World Cup matches}
+\format{
+A data frame with 128 rows and 7 columns.
+\describe{
+\item{goals}{integer. Number of goals scored in normal time (90 minutes),
+i.e., excluding potential extra time or penalties in knockout matches.}
+\item{team}{character. 3-letter FIFA code for the team.}
+\item{match}{integer. Match ID ranging from 1 (opening match) to 64 (final).}
+\item{type}{factor. Type of match for groups A to H, round of 16 (R16), quarter final,
+semi-final, match for 3rd place, and final.}
+\item{stage}{factor. Group vs. knockout tournament stage.}
+\item{logability}{numeric. Estimated log-ability for each team based on
+bookmaker consensus model.}
+\item{difference}{numeric. Difference in estimated log-abilities between
+a team and its opponent in each match.}
+}
+}
+\source{
+The goals for each match have been obtained from Wikipedia
+(\url{https://en.wikipedia.org/wiki/2018_FIFA_World_Cup}) and the log-abilities
+from Zeileis et al. (2018) based on quoted odds from Oddschecker.com and Bwin.com.
+}
+\usage{
+data("FIFA2018", package = "distributions3")
+}
+\description{
+Data from all 64 matches in the 2018 FIFA World Cup along with predicted
+ability differences based on bookmakers' odds.
+}
+\details{
+To investigate the number of goals scored per match in the 2018 FIFA World Cup,
+\code{FIFA2018} provides two rows, one for each team, for each of the matches
+during the tournament. In addition to some basic meta-information for the matches
+(an ID, team name abbreviations, type of match, group vs. knockout stage),
+information on the estimated log-ability for each team is provided. These
+have been estimated by Zeileis et al. (2018) prior to the start of the
+tournament (2018-05-20) based on quoted odds from 26 online bookmakers using
+the bookmaker consensus model of Leitner et al. (2010). The difference in
+log-ability between a team and its opponent is a useful predictor for the
+number of goals scored.
+
+To model the data a basic Poisson regression model provides a good fit.
+This treats the number of goals by the two teams as independent given the
+ability difference which is a reasonable assumption in this data set.
+}
+\examples{
+## load data
+data("FIFA2018", package = "distributions3")
+
+## observed relative frequencies of goals in all matches
+obsrvd <- prop.table(table(FIFA2018$goals))
+
+## expected probabilities assuming a simple Poisson model,
+## using the average number of goals across all teams/matches
+## as the point estimate for the mean (lambda) of the distribution
+p_const <- Poisson(lambda = mean(FIFA2018$goals))
+p_const
+expctd <- pdf(p_const, 0:6)
+
+## comparison: observed vs. expected frequencies
+## frequencies for 3 and 4 goals are slightly overfitted
+## while 5 and 6 goals are slightly underfitted
+cbind("observed" = obsrvd, "expected" = expctd)
+
+## instead of fitting the same average Poisson model to all
+## teams/matches, take ability differences into account
+m <- glm(goals ~ difference, data = FIFA2018, family = poisson)
+summary(m)
+## when the ratio of abilities increases by 1 percent, the
+## expected number of goals increases by around 0.4 percent
+
+## this yields a different predicted Poisson distribution for
+## each team/match
+p_reg <- Poisson(lambda = fitted(m))
+head(p_reg)
+
+## as an illustration, the following goal distributions
+## were expected for the final (that France won 4-2 against Croatia)
+p_final <- tail(p_reg, 2)
+p_final
+pdf(p_final, 0:6)
+## clearly France was expected to score more goals than Croatia
+## but both teams scored more goals than expected, albeit not implausibly many
+
+## assuming independence of the number of goals scored, obtain
+## table of possible match results (after normal time), along with
+## overall probabilities of win/draw/lose
+res <- outer(pdf(p_final[1], 0:6), pdf(p_final[2], 0:6))
+sum(res[lower.tri(res)]) ## France wins
+sum(diag(res))           ## draw
+sum(res[upper.tri(res)]) ## France loses
+
+## update expected frequencies table based on regression model
+expctd <- pdf(p_reg, 0:6)
+head(expctd)
+expctd <- colMeans(expctd)
+cbind("observed" = obsrvd, "expected" = expctd)
+}
+\references{
+Leitner C, Zeileis A, Hornik K (2010).
+Forecasting Sports Tournaments by Ratings of (Prob)abilities: A Comparison for the EURO 2008.
+\emph{International Journal of Forecasting}, \bold{26}(3), 471-481.
+\doi{10.1016/j.ijforecast.2009.10.001}
+
+Zeileis A, Leitner C, Hornik K (2018).
+Probabilistic Forecasts for the 2018 FIFA World Cup Based on the Bookmaker Consensus Model.
+Working Paper 2018-09, Working Papers in Economics and Statistics,
+Research Platform Empirical and Experimental Economics, University of Innsbruck.
+\url{https://EconPapers.RePEc.org/RePEc:inn:wpaper:2018-09}
+}
+\keyword{datasets}