diff --git a/section1_introduction.jl b/section1_introduction.jl
index 6449bf9..609245f 100644
--- a/section1_introduction.jl
+++ b/section1_introduction.jl
@@ -17,6 +17,12 @@ end;
 # ╔═╡ b2ad130c-5d40-4a4b-adbe-5880984a9460
 TableOfContents()
 
+# ╔═╡ 3388e67f-a5bb-409a-843d-0958243c3b8f
+md"
+
+# An introduction to Bayesian inference
+"
+
 # ╔═╡ 97d325ca-2c41-42b4-82b8-f74659e61dc3
 md"""
 
@@ -2077,6 +2083,7 @@ version = "0.9.1+5"
 # ╔═╡ Cell order:
 # ╟─078e8979-6753-411f-9c34-ba0164ea9cb2
 # ╟─b2ad130c-5d40-4a4b-adbe-5880984a9460
+# ╟─3388e67f-a5bb-409a-843d-0958243c3b8f
 # ╟─97d325ca-2c41-42b4-82b8-f74659e61dc3
 # ╟─300929c0-3279-47ef-b165-0e936b757679
 # ╟─41e15a4f-8ddc-47f9-8286-bf583b7d748a
diff --git a/section2_modelling.jl b/section2_modelling.jl
index 25869c1..4073642 100644
--- a/section2_modelling.jl
+++ b/section2_modelling.jl
@@ -18,6 +18,13 @@ end;
 # ╔═╡ 904c5f8d-e5a9-4f18-a7d7-b4561ad37655
 TableOfContents()
 
+# ╔═╡ 613a65ce-9384-46ed-ab0d-abfda374f73c
+md"""
+
+# More on Bayesian modelling
+
+"""
+
 # ╔═╡ 5b4c3b01-bd19-4085-8a83-781086c85825
 md"""
 
@@ -63,10 +70,12 @@ md"""
 # ╔═╡ e574a570-a76b-4ab2-a395-ca40dc383e5e
 let
-	Plots.plot(0:0.1:10, Exponential(1),lw=2, label=L"\texttt{Exponential}(1.0)", legend=:best, xlabel=L"\theta", ylabel=L"p(\theta)", title="Priors for "*L"σ^2\in (0,∞)")
-	Plots.plot!(0:0.1:10, Exponential(2),lw=2, label=L"\texttt{Exponential}(2.0)")
-	Plots.plot!(0:0.1:10, Exponential(5), lw=2, label=L"\texttt{Exponential}(5.0)")
-	Plots.plot!(0:0.1:10, truncated(Cauchy(0, 1), lower= 0), lw=2, label=L"\texttt{HalfCauchy}(0, 1)")
+	xpltlim1= -1
+	xpltlim2 = 6
+	Plots.plot(xpltlim1:0.01:xpltlim2, Exponential(1),lw=1.5, label=L"\texttt{Exponential}(1.0)", legend=:best, xlabel=L"\theta", ylabel=L"p(\theta)", title="Priors for "*L"σ^2\in (0,∞)")
+	Plots.plot!(xpltlim1:0.01:xpltlim2, Exponential(2),lw=1.5, label=L"\texttt{Exponential}(2.0)")
+	Plots.plot!(xpltlim1:0.01:xpltlim2, Exponential(5), lw=1.5, label=L"\texttt{Exponential}(5.0)")
+	Plots.plot!(xpltlim1:0.01:xpltlim2, truncated(Cauchy(0, 1), lower= 0), lw=1.5, label=L"\texttt{HalfCauchy}(0, 1)")
 	# Plots.plot!(0:0.1:10, truncated(Cauchy(0, 2), lower= 0), label="HalfCauchy(0, 2)")
 	# Plots.plot!(0:0.1:10, truncated(Cauchy(0, 5), lower= 0), label="HalfCauchy(0, 5)")
 end
@@ -246,12 +255,12 @@ md"""
 
 > Assume we have made ``N`` independent samples from a Gaussian distribution with known ``\mu`` but unknown variance ``\sigma^2>0``. What is the observation variance ``\sigma^2`` ?
 
-It is more convenient to model the precision ``\lambda\triangleq 1/\sigma^2``. A conjugate prior for the precision parameter is Gamma distribution. Gamma distributions have support on the positive real line which matches the precision's value range.
+It is more convenient to model the precision ``\phi\triangleq 1/\sigma^2``. A conjugate prior for the precision parameter is the Gamma distribution. Gamma distributions have support on the positive real line which matches the precision's value range.
 
 A Gamma distribution, parameterised with a shape ``a_0>0``, and a rate parameter ``b_0>0``, has a probability density function:
 
 ```math
-p(\lambda; a_0, b_0) = \texttt{Gamma}(\lambda; a_0, b_0)=\frac{b_0^{a_0}}{\Gamma(b_0)} \lambda^{a_0-1} e^{-b_0\lambda}.
+p(\phi; a_0, b_0) = \texttt{Gamma}(\phi; a_0, b_0)=\frac{b_0^{a_0}}{\Gamma(a_0)} \phi^{a_0-1} e^{-b_0\phi}.
 ```
 """
 
@@ -259,7 +268,7 @@ p(\lambda; a_0, b_0) = \texttt{Gamma}(\lambda; a_0, b_0)=\frac{b_0^{a_0}}{\Gamma
 let
 	as_ = [1,2,3,5,9,7.5,0.5]
 	bs_ = [1/2, 1/2, 1/2, 1, 1/0.5, 1, 1]
-	plt= plot(xlim =[0,20], ylim = [0, 0.8], xlabel=L"λ", ylabel="density")
+	plt= plot(xlim =[0,20], ylim = [0, 0.8], xlabel=L"\phi", ylabel="density")
 	for i in 1:length(as_)
 		plot!(Gamma(as_[i], 1/bs_[i]), fill=(0, .1), lw=1.5, label=L"\texttt{Gamma}"*"($(as_[i]),$(bs_[i]))")
 	end
@@ -271,20 +280,20 @@ md"""
 **Conjugacy.** The data follow a Gaussian distribution. That is for ``n = 1,2,\ldots, N``:
 
 ```math
-d_n \sim \mathcal N(\mu, 1/\lambda);
+d_n \sim \mathcal N(\mu, 1/\phi);
 ```
 
-The likelihood therefore is a product of Gaussian likelihoods:
+The likelihood, therefore, is a product of Gaussian likelihoods:
 
 ```math
-p(\mathcal D|\mu, \lambda) = \prod_{n=1}^N p(d_n|\mu, \lambda) = \prod_{n=1}^N \mathcal N(d_n; \mu, 1/\lambda) .
+p(\mathcal D|\mu, \phi) = \prod_{n=1}^N p(d_n|\mu, \phi) = \prod_{n=1}^N \mathcal N(d_n; \mu, 1/\phi) .
 ```
 
 It can be shown that the posterior formed according to Baye's rule
 
 ```math
-p(\lambda|\mathcal D, \mu) \propto p(\lambda) p(\mathcal D|\mu, \lambda)
+p(\phi|\mathcal D, \mu) \propto p(\phi) p(\mathcal D|\mu, \phi)
 ```
 
@@ -293,7 +302,7 @@ p(\lambda|\mathcal D, \mu) \propto p(\lambda) p(\mathcal D|\mu, \lambda)
 is still of a Gamma form (therefore conjugacy is established):
 
 ```math
-p(\lambda|\mathcal D, \mu) = \texttt{Gamma}(a_N, b_N),
+p(\phi|\mathcal D, \mu) = \texttt{Gamma}(a_N, b_N),
 ```
 
 where
@@ -302,11 +311,11 @@ $$a_n= a_0 +\frac{N}{2}, \;\;b_N = b_0 + \frac{\sum_{n} (d_n- \mu)^2}{2}.$$
 
 The Bayesian computation reduces to hyperparameter update again. Note the posterior mean (standard result of a Gamma distribution) is
 
-$$\mathbb{E}[\lambda|\mathcal D] = \frac{a_N}{b_N} = \frac{a_0 + N/2}{b_0 + {\sum_n (d_n -\mu)^2}/{2}}.$$
+$$\mathbb{E}[\phi|\mathcal D] = \frac{a_N}{b_N} = \frac{a_0 + N/2}{b_0 + {\sum_n (d_n -\mu)^2}/{2}}.$$
 
 If we assume ``a_0= b_0=0`` (a flat prior), we recover the regular maximum likelihood estimator for ``{\sigma^2}``:
 
-$$\hat \sigma^2 =1/\hat{\lambda} = \frac{\sum_n (d_n-\mu)^2}{N}.$$
+$$\hat \sigma^2 =1/\hat{\phi} = \frac{\sum_n (d_n-\mu)^2}{N}.$$
 
 Based on the above result, we can intuitively interpret the hyperparameters as
 * ``a_N`` : the total count of observations contained in the posterior (both pseudo ``a_0`` and real observation ``N/2``) ;
@@ -317,14 +326,14 @@ Based on the above result, we can intuitively interpret the hyperparameters as
 
 # ╔═╡ edbf5fa3-a064-47a4-9ff4-a93dbd7b9112
 md"""
-**Demonstration:** We first simulate ``N=100`` Gaussian observations with unknown ``\mu=0`` and ``\lambda= 1/\sigma^2 = 2``.
+**Demonstration:** We first simulate ``N=100`` Gaussian observations with unknown ``\mu=0`` and ``\phi= 1/\sigma^2 = 2``.
 """
 
 # ╔═╡ 958d6028-e38f-4a56-a606-47b6f8ee86f1
 begin
 	σ² = 0.5
-	true_λ = 1/σ²
+	true_ϕ = 1/σ²
 	N = 100
 	Random.seed!(100)
 	# Gaussian's density in Distributions.jl is implemented with standard deviation σ rather than σ²
@@ -343,8 +352,8 @@ begin
 	a₀ = 0.5
 	b₀ = 0.5
 	# Gamma in Distributions.jl is implemented with shape and scale parameters where the second parameter is 1/b
-	prior_λ = Gamma(a₀, 1/b₀)
-	posterior_λ = Gamma(a₀+ N/2, 1/(b₀+ sum(gaussian_data.^2)/2))
+	prior_ϕ = Gamma(a₀, 1/b₀)
+	posterior_ϕ = Gamma(a₀+ N/2, 1/(b₀+ sum(gaussian_data.^2)/2))
 end;
 
 # ╔═╡ 03a650cb-7610-4d20-8ae7-2c6e308770f6
 md"""
 We can plot the prior and posterior distributions to visually check the Bayesian
 
 # ╔═╡ cb37f8a8-83aa-44f6-89a5-43fe0e4a5fa8
 let
-	plot(prior_λ, xlim = [0, 5], ylim=[0, 1.5], lw=2, fill=(0, 0.2), label="Prior "* L"p(\lambda)", xlabel=L"\lambda", ylabel="density", title="Conjugate inference of a Gaussian's precision")
+	plot(prior_ϕ, xlim = [0, 5], ylim=[0, 1.5], lw=2, fill=(0, 0.2), label="Prior "* L"p(\phi)", xlabel=L"\phi", ylabel="density", title="Conjugate inference of a Gaussian's precision")
 
-	plot!(posterior_λ, lw=2,fill=(0, 0.2), label="Posterior "* L"p(\lambda|\mathcal{D})")
-	vline!([true_λ], label="true "*L"λ", lw=4)
-	vline!([mean(prior_λ)], label="prior mean", lw=2, lc=1, ls=:dash)
-	vline!([mean(posterior_λ)], label="posterior mean", lw=2, lc=2, ls=:dash)
+	plot!(posterior_ϕ, lw=2,fill=(0, 0.2), label="Posterior "* L"p(\phi|\mathcal{D})")
+	vline!([true_ϕ], label="true "*L"\phi", lw=4)
+	vline!([mean(prior_ϕ)], label="prior mean", lw=2, lc=1, ls=:dash)
+	vline!([mean(posterior_ϕ)], label="posterior mean", lw=2, lc=2, ls=:dash)
 	# vline!([1/σ²], label="true "*L"λ", lw=2)
 	# end
 end
 
 # ╔═╡ 2d7792cd-5b32-4228-bfef-e1ab250724f3
 md"""
-**Sequential update.** The conjugacy also provides us a computational cheap way to sequentially update the posterior. Due to the independence assumption, we do not need to calculate the posterior in one go.
+**Sequential update.** The conjugacy also provides us with a computationally cheap way to sequentially update the posterior. Due to the independence assumption, we do not need to calculate the posterior in one go.
 
 ```math
 \begin{align}
-p(\lambda|\{d_1, d_2, \ldots, d_N\})&\propto p(\lambda) \prod_{n=1}^N p(d_n|\lambda) p(d_N|\lambda)\\
-&= \underbrace{p(\lambda) \prod_{n=1}^{N-1} p(d_n|\lambda)}_{p(\lambda|\mathcal D_{N-1})}\cdot p(d_N|\lambda)\\
-&= \underbrace{p(\lambda|\mathcal D_{N-1})}_{\text{new prior}} p(d_N|\lambda)
+p(\phi|\{d_1, d_2, \ldots, d_N\})&\propto p(\phi) \prod_{n=1}^N p(d_n|\phi)\\
+&= \underbrace{p(\phi) \prod_{n=1}^{N-1} p(d_n|\phi)}_{p(\phi|\mathcal D_{N-1})}\cdot p(d_N|\phi)\\
+&= \underbrace{p(\phi|\mathcal D_{N-1})}_{\text{new prior}} p(d_N|\phi)
 \end{align}
 ```
 
-With observations up to ``N-1``, we obtain a posterior ``p(\lambda|\mathcal D_{N-1})``.
-The posterior now serves as the new prior to update the next observation ``d_N``. It can be shown that the final posterior sequentially updated this way is the same as the off-line posterior.
+With observations up to ``N-1``, we obtain a posterior ``p(\phi|\mathcal D_{N-1})``.
+The posterior now serves as the new prior waiting to be updated with the next observation ``d_N``. It can be shown that the final posterior sequentially updated this way is the same as the off-line posterior.
 To be more specific, the sequential update algorithm for the precision example is:
 
-Initialise with a prior ``p(\lambda|\emptyset)``;
+Initialise with a prior ``p(\phi|\varnothing)``;
 
 for ``n = 1,2,\ldots, N``
 * update ``a_n = a_{n-1} + 0.5 ``, ``b_n = b_{n-1} + 0.5 \cdot (d_n-\mu)^2``
 * report the posterior at ``n`` if needed
 
-To demonstrate the idea, check the following animatin. The posterior update starts from a vague prior. As more data observed and absorbed to the posterior, the posterior distribution is more and more informative and finally recover the ground truth.
+To demonstrate the idea, check the following animation. The posterior update starts from a vague prior. As more data is observed and absorbed into the posterior, the posterior distribution is more and more informative and finally recovers the ground truth.
 """
 
 # ╔═╡ 7ccd4caf-ef8f-4b78-801f-b000f5aad430
 let
-	plot(prior_λ, xlim = [0, 5], ylim=[0, 1.5], lw=2, fill=(0, 0.1), label="Prior "* L"p(\lambda)", xlabel=L"\lambda", ylabel="density", title="Sequential update N=0")
-	plot!(posterior_λ, lw=2, fill=(0, 0.1), label="Posterior "* L"p(\lambda|\mathcal{D})")
-	vline!([true_λ], label="true "*L"λ", lw=2, lc=3)
+	plot(prior_ϕ, xlim = [0, 5], ylim=[0, 1.5], lw=2, fill=(0, 0.1), label="Prior "* L"p(\phi)", xlabel=L"\phi", ylabel="density", title="Sequential update N=0")
+	plot!(posterior_ϕ, lw=2, fill=(0, 0.1), label="Posterior "* L"p(\phi|\mathcal{D})")
+	vline!([true_ϕ], label="true "*L"\phi", lw=2, lc=3)
 	anim=@animate for n in [(1:9)..., (10:10:N)...]
 		an = a₀ + n/2
 		bn = b₀ + sum(gaussian_data[1:n].^2)/2
@@ -455,7 +464,7 @@ The flexibility of introducing priors that reflect a modeller's local expert kno
 
 # ╔═╡ 3ca3ee53-c744-4474-a8da-6209ec5e6904
 md"""
-## More on Bayesian modelling
+## Likelihood model variants
 
 """
 
@@ -537,9 +546,9 @@ $$\mu \sim \mathcal N(m_0, v_0),$$
 
 To show our ignorance, we can set ``m_0=0`` (or the sample average) and ``v_0`` to a very large positive number, say 10,000. The prior then becomes a very flat vague distribution.
 
-It is more convenient to model the observation precision ``\lambda \triangleq 1/\sigma^2`` instead of variance ``\sigma^2``. Here we assume a Gamma prior for the precision parameter:
+It is more convenient to model the observation precision ``\phi \triangleq 1/\sigma^2`` instead of variance ``\sigma^2``. Here we assume a Gamma prior for the precision parameter:
 
-$$\lambda \sim \texttt{Gamma}(a_0, b_0)$$
+$$\phi \sim \texttt{Gamma}(a_0, b_0)$$
 
 Again, to show our ignorance, we can set ``a_0, b_0`` such that the distribution is as flat and vague as possible. A possible parameterisation is ``a_0=b_0=0.5.`` Note that Gamma is a distribution on the positive real line which has matching support for the precision parameter.
 
 To put them together, the full Bayesian model is:
 ```math
 \begin{align}
 \text{prior}: \mu &\sim \mathcal N(m_0=0, v_0=10000)\\
-\lambda &\sim \texttt{Gamma}(a_0=0.5, b_0=0.5)\\
-\text{likelihood}: d_n &\overset{\mathrm{i.i.d}}{\sim} \mathcal N(\mu, 1/\lambda) \;\; \text{for } n = 1,2,\ldots, 7.
+\phi &\sim \texttt{Gamma}(a_0=0.5, b_0=0.5)\\
+\text{likelihood}: d_n &\overset{\mathrm{i.i.d}}{\sim} \mathcal N(\mu, 1/\phi) \;\; \text{for } n = 1,2,\ldots, 7.
 \end{align}
 ```
 
 **Computation:**
 After specifying the model, we need to apply Baye's rule to compute the posterio
 
 ```math
 \begin{align}
-p(\mu, \lambda|\mathcal D) &\propto p(\mu, \lambda) p(\mathcal D|\mu, \lambda) \\
-&= p(\mu)p(\lambda) p(\mathcal D|\mu, \lambda) \\
-&= p(\mu)p(\lambda) \prod_{n=1}^N p(d_n|\mu, \lambda);
+p(\mu, \phi|\mathcal D) &\propto p(\mu, \phi) p(\mathcal D|\mu, \phi) \\
+&= p(\mu)p(\phi) p(\mathcal D|\mu, \phi) \\
+&= p(\mu)p(\phi) \prod_{n=1}^N p(d_n|\mu, \phi);
 \end{align}
 ```
 
-where we have assumed the prior for ``\mu`` and ``\lambda`` are independent. Sub-in the definition of the prior and likelihood, we can plot the posterior.
+where we have assumed the priors for ``\mu`` and ``\phi`` are independent. Substituting in the definitions of the prior and likelihood, we can plot the posterior.
 """
 
 # ╔═╡ d6056188-9a46-4b5f-801b-e028b6eb0b7f
 let
 	m₀, v₀ = 0, 10000
 	a₀, b₀ = 0.5, 0.5
-	function ℓπ(μ, λ; data)
-		σ² = 1/λ
-		logprior = logpdf(Normal(m₀, v₀), μ) + logpdf(Gamma(a₀, 1/b₀), λ)
+	function ℓπ(μ, ϕ; data)
+		σ² = 1/ϕ
+		logprior = logpdf(Normal(m₀, v₀), μ) + logpdf(Gamma(a₀, 1/b₀), ϕ)
 		logLik = sum(logpdf.(Normal(μ, sqrt(σ²)), data))
 		return logprior + logLik
 	end
-	plot(-13:0.05:20, 0:0.001:0.019, (x, y) -> exp(ℓπ(x, y; data=scientist_data)), st=:contour,fill=true, ylim=[0.001, 0.015], xlabel=L"μ", ylabel=L"λ", title="Contour plot of "*L"p(\mu, \lambda|\mathcal{D})")
+	plot(-13:0.05:20, 0:0.001:0.019, (x, y) -> exp(ℓπ(x, y; data=scientist_data)), st=:contour,fill=true, ylim=[0.001, 0.015], xlabel=L"μ", ylabel=L"\phi", title="Contour plot of "*L"p(\mu, \phi|\mathcal{D})")
 end
 
 # ╔═╡ bbbb73c8-0254-463a-8e81-5acdd08583ac
 md"""
 **Marginal posterior ``p(\mu|\mathcal D)``:**
 
-The posterior distribution shows the posterior peaks around ``\mu = 3.5``, which is roughly the same as the sample average. However, to better answer the question, we should treat ``\lambda`` as a *nuisance* parameter, and integrate it out to find the marginal posterior for ``\mu`` only. After some algebra, we find the unnormalised marignal posterior is of the form:
+The posterior distribution shows the posterior peaks around ``\mu = 3.5``, which is roughly the same as the sample average. However, to better answer the question, we should treat ``\phi`` as a *nuisance* parameter, and integrate it out to find the marginal posterior for ``\mu`` only. After some algebra, we find the unnormalised marginal posterior is of the form:
 
 ```math
 p(\mu|\mathcal D) \propto p(\mu)\cdot \Gamma\left (a_0+ \frac{N}{2}\right )\left (b_0+\frac{\sum_n (d_n-\mu)^2}{2}\right )^{- (a_0+ \frac{N}{2})},
@@ -637,16 +646,16 @@ md"""
 md"""
 **Modelling:**
 
-The i.i.d assumption does not reflect the sutblties of the data generation process. A better model should assume each observation follows an independent but not identical Gaussian distribution. In particular, for each scientist, we should introduce their own observation precision, ``\lambda_n \triangleq \sigma^2_n`` to reflect their "different levels of experimental skills".
+The i.i.d assumption does not reflect the subtleties of the data generation process. A better model should assume each observation follows an independent but not identical Gaussian distribution. In particular, for each scientist, we should introduce their own observation precision, ``\phi_n \triangleq 1/\sigma^2_n`` to reflect their "different levels of experimental skills".
-The improved model now includes 7+1 unknown parameters: the unknown signal ``\mu`` and seven observation precisions ``\lambda_n`` for the seven scientists:
+The improved model now includes 7+1 unknown parameters: the unknown signal ``\mu`` and seven observation precisions ``\phi_n`` for the seven scientists:
 
 ```math
 \begin{align}
 \mu &\sim \mathcal N(m_0, v_0)\\
-\lambda_n &\sim \texttt{Gamma}(a_0, b_0)\;\; \text{for }n = 1,2,\ldots,7\\
-d_n &\sim \mathcal N(\mu, 1/\lambda_n)\;\; \text{for }n = 1,2,\ldots,7\\
+\phi_n &\sim \texttt{Gamma}(a_0, b_0)\;\; \text{for }n = 1,2,\ldots,7\\
+d_n &\sim \mathcal N(\mu, 1/\phi_n)\;\; \text{for }n = 1,2,\ldots,7\\
 \end{align}
 ```
 """
 
 md"""
 **Computation:**
 
-Similarly, we want to find out the marginal posterior distribution. After integrating out the nuisance parameters ``\{\lambda_n\}`` from the posterior, we can find the marginal posterior is of the following form:
+Similarly, we want to find out the marginal posterior distribution. After integrating out the nuisance parameters ``\{\phi_n\}`` from the posterior, we can find that the marginal posterior is of the following form:
 
 ```math
 p(\mu|\mathcal D) \propto p(\mu)\prod_{n=1}^N p(d_n|\mu) \propto p(\mu)\prod_{n=1}^N \frac{\Gamma{(a_n)}}{b_n^{a_n}},
@@ -702,17 +711,19 @@ end
 
 # ╔═╡ 0a8a5aae-3567-41a3-9f15-d07235d07da2
 md"""
-## Predictive checks
+## Check your models: Predictive checks
+
+Bayesian inference provides a great amount of modelling freedom to its user. The modeller, for example, can incorporate his or her expert knowledge in the prior distribution and also choose a suitable likelihood that best matches the data generation process. However, greater flexibility comes with a price. The modeller also needs to take full responsibility for the modelling decisions. For example, for a Bayesian model, one needs to check whether
+* the prior makes sense;
+* the generative model as a whole (i.e. prior plus likelihood) matches the observed data.
+In other words, the user needs to **validate the model** before making any final inference decisions. **Predictive checks** are a great way to empirically validate a model.
 """
 
 # ╔═╡ 8fe3ed2f-e826-4faa-ace0-2286b944991f
 md"""
 ### Posterior predictive check
-Bayesian inference provides a great amount of modelling freedom to its user. The modeller, for example, can incorporate his or her expert knowledge in the prior distribution and also choose a suitable likelihood that best matches the data generation process. However, greater flexibility comes with a price. The modeller also needs to take full responsibility for the modelling decisions. For example, for a Bayesian model, one needs to check whether
-* the prior makes sense?
-* the generative model as a whole (i.e. prior plus likelihood) match the observed data?
-In other words, the user needs to **validate the model** before making any final inference decisions. **Predictive checks** are a great way to empirically validate a model.
+
 
 The idea of predictive checks is to generate future *pseudo observations* based on the assumed model's (posterior) **prediction distribution**:
 
@@ -947,7 +958,7 @@ md"""
 
 [^1]: [MacKay, D. J. C. Information Theory, Inference, and Learning Algorithms. (Cambridge University Press, 2003).](https://www.inference.org.uk/itprnn/book.pdf)
 
-[^2]: Simulating predictive data for this case is bit more involved. The difficulty lies in simulating posterior samples from the posterior distribution: ``p(\mu, \{\lambda_n\}|\mathcal D)``. The joint distribution is not in any simple form. We need algorithms such as MCMC to draw samples from such a complicated posterior. MCMC will be introduced in the next chapter.
+[^2]: Simulating predictive data for this case is a bit more involved. The difficulty lies in simulating posterior samples from the posterior distribution: ``p(\mu, \{\phi_n\}|\mathcal D)``. The joint distribution is not in any simple form. We need algorithms such as MCMC to draw samples from such a complicated posterior. MCMC will be introduced in the next chapter.
 """
 
 # ╔═╡ 83bf58df-e2d0-4552-98b7-29ce0ce5fb5e
@@ -998,16 +1009,16 @@ $$p(\theta|\mathcal D) =\text{Beta}(a_N, b_N).$$
 Foldable("Derivation details on the Gamma-Gaussian conjugacy.", md"
 ```math
 \begin{align}
- p(\lambda|\mathcal D)&\propto p(\lambda) p(\mathcal D|\mu, \lambda)\\
-&= \underbrace{\frac{b_0^{a_0}}{\Gamma(a_0)} \lambda^{a_0-1} e^{-b_0 \lambda}}_{p(\lambda)} \underbrace{\frac{1}{ (\sqrt{2\pi})^N}\lambda^{\frac{N}{2}}e^{\frac{-\lambda\cdot \sum_{n} (d_n-\mu)^2}{2}}}_{p(\mathcal D|\lambda, \mu)}\\
-&\propto \lambda^{a_0+\frac{N}{2}-1} \exp\left \{-\left (b_0 + \frac{\sum_{n} (d_n- \mu)^2}{2}\right )\lambda\right \} \\
-&= \lambda^{a_N-1} e^{- b_N \lambda},
+ p(\phi|\mathcal D)&\propto p(\phi) p(\mathcal D|\mu, \phi)\\
+&= \underbrace{\frac{b_0^{a_0}}{\Gamma(a_0)} \phi^{a_0-1} e^{-b_0 \phi}}_{p(\phi)} \underbrace{\frac{1}{ (\sqrt{2\pi})^N}\phi^{\frac{N}{2}}e^{\frac{-\phi\cdot \sum_{n} (d_n-\mu)^2}{2}}}_{p(\mathcal D|\phi, \mu)}\\
+&\propto \phi^{a_0+\frac{N}{2}-1} \exp\left \{-\left (b_0 + \frac{\sum_{n} (d_n- \mu)^2}{2}\right )\phi\right \} \\
+&= \phi^{a_N-1} e^{- b_N \phi},
 \end{align}
 ```
 where ``a_n= a_0 +\frac{N}{2}, \;\;b_N = b_0 + \frac{\sum_{n} (d_n- \mu)^2}{2}.`` Note this is a unnormalised Gamma distribution (whose normalising constant can be read off directly from a Gamma distribution), therefore
 
-$$p(\lambda|\mathcal D)= \text{Gamma}(a_N, b_N).$$
+$$p(\phi|\mathcal D)= \text{Gamma}(a_N, b_N).$$
 
 ")
 
@@ -1015,22 +1026,22 @@ $$p(\lambda|\mathcal D)= \text{Gamma}(a_N, b_N).$$
 Foldable("Details on the posterior marginal distribution of μ.", md"
 ```math
 \begin{align}
-p(\mu|\mathcal D) &= \int_0^\infty p(\mu, \lambda|\mathcal D) \mathrm{d}\lambda \\
-&= \int_0^\infty p(\mu, \lambda|\mathcal D) \mathrm{d}\lambda\\
-&= \frac{1}{p(\mathcal D)} \int_0^\infty p(\mu)p(\lambda) p(\mathcal D|\mu, \lambda)\mathrm{d}\lambda\\
-&\propto p(\mu)\int_0^\infty p(\lambda) p(\mathcal D|\mu, \lambda)\mathrm{d}\lambda\\
-&= p(\mu)\int_0^\infty \frac{b_0^{a_0}}{\Gamma(a_0)} \lambda^{a_0-1} e^{-b_0 \lambda} \frac{1}{ (\sqrt{2\pi})^N}\lambda^{\frac{N}{2}}e^{\frac{-\lambda\cdot \sum_{n} (d_n-\mu)^2}{2}}\mathrm{d}\lambda\\
-&\propto p(\mu)\int_0^\infty\lambda^{a_0+\frac{N}{2}-1} \exp\left \{-\left (b_0 + \frac{\sum_{n} (d_n- \mu)^2}{2}\right )\lambda\right \}\mathrm{d}\lambda \\
-&= p(\mu)\int_0^\infty\lambda^{a_N-1} e^{- b_N \lambda}\mathrm{d}\lambda \\
+p(\mu|\mathcal D) &= \int_0^\infty p(\mu, \phi|\mathcal D) \mathrm{d}\phi \\
+&= \int_0^\infty p(\mu, \phi|\mathcal D) \mathrm{d}\phi\\
+&= \frac{1}{p(\mathcal D)} \int_0^\infty p(\mu)p(\phi) p(\mathcal D|\mu, \phi)\mathrm{d}\phi\\
+&\propto p(\mu)\int_0^\infty p(\phi) p(\mathcal D|\mu, \phi)\mathrm{d}\phi\\
+&= p(\mu)\int_0^\infty \frac{b_0^{a_0}}{\Gamma(a_0)} \phi^{a_0-1} e^{-b_0 \phi} \frac{1}{ (\sqrt{2\pi})^N}\phi^{\frac{N}{2}}e^{\frac{-\phi\cdot \sum_{n} (d_n-\mu)^2}{2}}\mathrm{d}\phi\\
+&\propto p(\mu)\int_0^\infty\phi^{a_0+\frac{N}{2}-1} \exp\left \{-\left (b_0 + \frac{\sum_{n} (d_n- \mu)^2}{2}\right )\phi\right \}\mathrm{d}\phi \\
+&= p(\mu)\int_0^\infty\phi^{a_N-1} e^{- b_N \phi}\mathrm{d}\phi \\
 &= p(\mu)\frac{\Gamma(a_N)}{b_N^{a_N}},
 \end{align}
 ```
 
-where ``a_n= a_0 +\frac{N}{2}`` and ``b_N = b_0 + \frac{\sum_{n} (d_n- \mu)^2}{2}``. Note that we have used the normalising constant trick of Gamma distribution in the second last step, where we recognise ``\lambda^{a_N-1} e^{- b_N \lambda}`` is the unnormalised part of a Gamma distribution with ``a_N, b_N`` as parameters; Then the corresponding Gamma density must integrates to one:
+where ``a_N= a_0 +\frac{N}{2}`` and ``b_N = b_0 + \frac{\sum_{n} (d_n- \mu)^2}{2}``. Note that we have used the normalising constant trick of the Gamma distribution in the second last step, where we recognise ``\phi^{a_N-1} e^{- b_N \phi}`` is the unnormalised part of a Gamma distribution with ``a_N, b_N`` as parameters; then the corresponding Gamma density must integrate to one:
 
 ```math
-\int_{0}^\infty \frac{b_N^{a_N}}{\Gamma(a_N)}\lambda^{a_N-1} e^{- b_N \lambda}\mathrm{d}\lambda =1,
+\int_{0}^\infty \frac{b_N^{a_N}}{\Gamma(a_N)}\phi^{a_N-1} e^{- b_N \phi}\mathrm{d}\phi =1,
 ```
 
-which leads to ``\int_{0}^\infty \lambda^{a_N-1} e^{- b_N \lambda}\mathrm{d}\lambda= \frac{\Gamma(a_N)}{b_N^{a_N}}.``
+which leads to ``\int_{0}^\infty \phi^{a_N-1} e^{- b_N \phi}\mathrm{d}\phi= \frac{\Gamma(a_N)}{b_N^{a_N}}.``
 
 ")
 
 # ╔═╡ 15b09064-2c48-4013-8a08-c32e32d1f4df
 Foldable("Derivation details on the marginal distribution.", md"
 To find the marginal posterior for ``\mu``, we need to find the following marginal likelihood for observation ``d_n``:
 
-$p(d_n|\mu) = \int p(\lambda_n) p(d_n|\mu, \lambda_n)\mathrm{d}\lambda_n,$
+$p(d_n|\mu) = \int p(\phi_n) p(d_n|\mu, \phi_n)\mathrm{d}\phi_n,$
 
-where we have assumed ``\mu, \lambda_n`` are independent, i.e. ``p(\lambda_n|\mu) = p(\lambda_n)``.
+where we have assumed ``\mu, \phi_n`` are independent, i.e. ``p(\phi_n|\mu) = p(\phi_n)``.
-The marginal likelihood is the normalising constant of the marginal posterior ``p(\lambda_n|d_n, \mu)``, since
+The marginal likelihood is the normalising constant of the marginal posterior ``p(\phi_n|d_n, \mu)``, since
 
-$$p(\lambda_n|d_n, \mu) = \frac{p(\lambda_n) p(d_n|\mu, \lambda_n)}{p(d_n|\mu)}.$$
+$$p(\phi_n|d_n, \mu) = \frac{p(\phi_n) p(d_n|\mu, \phi_n)}{p(d_n|\mu)}.$$
 
-Due to conjugacy, it can be shown that the conditional posterior of ``\lambda_n`` is of Gamma form
+Due to conjugacy, it can be shown that the conditional posterior of ``\phi_n`` is of Gamma form
 
-$p(\lambda_n|\mu, x_n) = \text{Gamma}(a_n, b_n),$ where
+$p(\phi_n|\mu, d_n) = \text{Gamma}(a_n, b_n),$ where
 
 $$a_n = a_0 + \frac{1}{2}, b_n = b_0+ \frac{1}{2} (d_n-\mu)^2.$$
 
 The normalising constant of the unnormalised posterior is therefore the corresponding Gamma distribution's normalising constant:
@@ -2415,8 +2426,9 @@ version = "1.4.1+0"
 """
 
 # ╔═╡ Cell order:
-# ╠═69b19a54-1c79-11ed-2a22-ebe65ccbfcdb
+# ╟─69b19a54-1c79-11ed-2a22-ebe65ccbfcdb
 # ╟─904c5f8d-e5a9-4f18-a7d7-b4561ad37655
+# ╟─613a65ce-9384-46ed-ab0d-abfda374f73c
 # ╟─5b4c3b01-bd19-4085-8a83-781086c85825
 # ╟─0c123e4e-e0ff-4c47-89d6-fdf514f9e9d0
 # ╟─f479a1f3-a008-4622-82f0-ab154a431a33
@@ -2433,7 +2445,7 @@ version = "1.4.1+0"
 # ╟─754c5fe2-3899-4467-8fd4-828fb0ec5040
 # ╠═f5dfcd1f-9d10-49d1-b68b-eafdf10baaec
 # ╟─2ba73fd3-ae2b-4347-863f-28e6c21e7a91
-# ╠═56e80a69-6f84-498d-b04e-5be59c1488eb
+# ╟─56e80a69-6f84-498d-b04e-5be59c1488eb
 # ╟─65f0dc31-c2bd-464f-983a-9f4ca2c04a35
 # ╟─d2c3bd6d-fb8c-41df-8112-c7bfbc180b0a
 # ╟─76c641dd-c790-4f51-abc0-e157c00e3ba7