-
Notifications
You must be signed in to change notification settings - Fork 0
/
section1_introduction.jl
2444 lines (1863 loc) · 95.5 KB
/
section1_introduction.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
### A Pluto.jl notebook ###
# v0.19.46
using Markdown
using InteractiveUtils
# ╔═╡ 078e8979-6753-411f-9c34-ba0164ea9cb2
begin
using PlutoUI
using Distributions
using StatsPlots
using PlutoTeachingTools
using Plots; default(fontfamily="Computer Modern", framestyle=:box) # LaTex-style
using LogExpFunctions, Random
using LaTeXStrings
using Logging; Logging.disable_logging(Logging.Info);
end;
# ╔═╡ b2ad130c-5d40-4a4b-adbe-5880984a9460
# Render a floating, clickable table of contents for the notebook (from PlutoUI).
TableOfContents()
# ╔═╡ 1e40f26a-6381-4bb7-a70f-b35a51f33dfd
md"[**↩ Home**](https://lf28.github.io/BayesianModelling/)
[**↪ Next Chapter**](./section2_modelling.html)"
# ╔═╡ 3388e67f-a5bb-409a-843d-0958243c3b8f
md"
# An introduction to Bayesian inference
"
# ╔═╡ 97d325ca-2c41-42b4-82b8-f74659e61dc3
md"""
## Introduction
**Statistical inference**, or learning in the machine learning community, is the process of inferring properties about the population distribution from observed data. The observed data is usually assumed to be drawn from the population:
$$d_1, d_2, \ldots, d_N \sim \mathcal P,$$
where "``\sim``" means the data are *distributed according to*, or equivalently, drawn from the distribution. And we want to *infer* some properties of the population distribution ``\mathcal P`` based on the sample ``\mathcal D = \{d_n\}``.
The most common inference case is parametric inference: that is when the population ``\mathcal P`` is some probability distribution with some finite number of parameters. For example, to test whether a coin is fair, we toss the coin ``N`` times. In this case, the population distribution is a Bernoulli distribution, *i.e.* a random variable with binary outcomes. The population distribution is fully determined by one parameter ``\theta \in [0,1]``, *i.e.* the bias of the coin:
$$\mathcal P \triangleq \text{Bernoulli}(d; \theta)= \begin{cases} \theta, & d=\texttt{head} \\ 1-\theta, & d= \texttt{tail}\end{cases}$$
and ``d_i`` are random independent samples from the distribution: for ``i \in 1, \ldots, N``
$$d_i \sim \text{Bernoulli}(\theta).$$
Then a typical inference query is:
> Based on the observed data ``\mathcal D=\{d_1, d_2, \ldots, d_N\},`` what is the population distribution's bias ``\theta``?
There are two camps of statistical inferences: the **frequentist** and the **Bayesian approach**. Historically, the frequentist's approach has been more widely taught and used. However, the Bayesian approach has gained more momentum in the 21st century, after being successfully applied in a wide range of applications.
Compared with the frequentist approach, Bayesian inference is procedurally standard. All Bayesian inferences start with a generative stochastic model that tells a story of how the data is generated and end with invoking Bayes' rule to find the updated distribution of the unknowns. In other words, as long as you can tell a story about your data, you can be a proper Bayesian statistician.
However, the same thing cannot be said for the Frequentist's method. Speaking for myself, I cannot recall the ANOVA's (or just choose any from the following: MANOVA, ``\chi^2``, Paired/Independent T-test) test procedure. And I certainly have forgotten the derivation details behind the steps of those tests.
Apart from flexibility, if properly specified, Bayesian inference offers more benefits: such as numerical stability, better-generalised performance, and natural interpretation. However, the benefits come with a price: Bayesian methods cannot be used like a black box (that said, neither should Frequentist methods or any scientific method). You do need to understand your data well to be able to come up with a good story (or a proper Bayesian model) first. And after the modelling, one also should be able to diagnose whether the inference algorithms are behaving well. However, modern computational tools, such as probabilistic programming language, make the hurdle easier to overcome.
In this first section, we are going to have an overview of Bayesian inference. Firstly, we are going to review Bayes' theorem, which is the cornerstone of Bayesian inference. Next, the coin-tossing problem is used as an example to demonstrate how Bayesian inference is used in practice. To better understand the differences between the two inference camps, a frequentist's take is also shown.
"""
# ╔═╡ 300929c0-3279-47ef-b165-0e936b757679
md"""
## Bayes' theorem
We first review Bayes' theorem, which forms the core idea of Bayesian inference.
!!! infor "Bayes' theorem"
Bayes' rule provides us with a mechanism to infer an unknown quantity $\theta$ based on observed data $\mathcal D$:
$$\text{posterior}=\frac{\text{prior} \times \text{likelihood}}{\text{evidence}}\;\;\text{or}\;\;p(\theta|\mathcal D) =\frac{\overbrace{p(\theta)}^{\text{prior}} \overbrace{p(\mathcal D|\theta)}^{\text{likelihood}}}{\underbrace{p(\mathcal D)}_{\text{evidence}}},$$
which is often simplified to:
$$p(\theta|\mathcal D) \propto {p(\theta)} p(\mathcal D|\theta),$$
("``\propto``" denotes "proportional to") as the evidence term
$$p(\mathcal D) = \int p(\theta) p(\mathcal D|\theta)\mathrm{d}\theta$$ is constant with respect to ``\theta``.
* ``p(\theta)`` **prior** distribution: prior belief of ``\theta`` before we observe the data
* ``p(\mathcal D|\theta)`` **likelihood**: conditional probability of observing the data given a particular $\theta$
* ``p(\mathcal D)`` **evidence** (also known as *marginal likelihood*). It scales the product of prior and likelihood, ``p(\theta) \times p(\mathcal D|\theta)``, such that the posterior ``p(\theta|\mathcal D)`` is a valid probability distribution. That is, the total probability mass sums to one.
"""
# ╔═╡ 41e15a4f-8ddc-47f9-8286-bf583b7d748a
md"""
### Why Bayes' theorem is useful?
Human brains are not very good at doing reverse logic reasoning directly. We are however good at thinking in the forward direction. Bayes' rule provides us with a tool to do the reverse reasoning routinely based on the forward logic.
When it comes to statistical inference, it is way more straightforward to specify the *forward probability*:
$$\theta \xRightarrow[\text{probability}]{\text{forward}} \mathcal{D}.$$
That is assuming the unknown parameter ``\theta``, how likely one observes the data, which is known as **likelihood** function and denoted as a conditional probability:
$$p(\mathcal D|\theta):\;\; \text{likelihood function}.$$
In practice, we are more interested in the opposite direction:
$$\mathcal D \xRightarrow[\text{probability}]{\text{inverse}} \theta.$$
That is given the observed data ``\mathcal D``, what ``\theta \in \Theta`` is more likely to have been used to generate data, which is exactly the posterior distribution:
$$p(\theta|\mathcal D):\;\; \text{Posterior distribution}.$$
Bayes' rule provides the exact tool to do this reverse engineering. And the recipe is simple and mechanical: multiply a prior with the likelihood (forward probability):
$$p(\theta|\mathcal D) \propto p(\theta) p(\mathcal D|\theta).$$
The following example demonstrates how Bayes' rule is used to answer a non-trivial inverse probability problem.
"""
# ╔═╡ c91937d3-1a46-4dc2-9d0d-f6fda482ea36
md"""
**Example (Switch point detection)** Your friend has two coins: one fair (*i.e.* with the probability of a head turning up $p=0.5$) and one bent (with a probability of head turning up $p= 0.2$). He always uses the fair coin at the beginning and then switches to the bent coin at some unknown time.
You have observed the following tossing results: ``\mathcal{D}= [0,1, 0,0,0,0]``. We use 1 to represent the head and 0 for the tail. When did he switch the coin?
"""
# ╔═╡ 41bd090a-ed31-4ebc-ad57-0785323378d4
# Likelihood of every candidate switch point for the two-coin problem.
#
# Tosses D[1:t] are attributed to the first coin (head probability p₁) and
# D[t+1:end] to the second coin (head probability p₂), for each candidate
# switch point t ∈ {1, …, length(D)-1}.
#
# Arguments: D — vector of tosses (1 = head, 0 = tail); p₁, p₂ — head
# probabilities of the coin before and after the switch.
# Returns the tuple (likes, evidence) where likes[t] = p(D | S = t) and
# evidence = Σₜ p(D | S = t), the normalising constant under a flat prior.
function ℓ_switch(D, p₁=0.5, p₂=0.2)
	# Fail fast with a clear message instead of the cryptic zeros(-1) error.
	isempty(D) && throw(ArgumentError("D must contain at least one toss"))
	# Bernoulli pmf written out explicitly: p for a head (1), 1 - p for a
	# tail (0); identical to pdf(Bernoulli(p), d) but with no allocation.
	bern(d, p) = d == 1 ? p : 1 - p
	likes = zeros(length(D) - 1)
	for t in eachindex(likes)
		# Generators over views avoid the temporary arrays that
		# broadcasting pdf.(...) over slices would allocate.
		likes[t] = prod(bern(d, p₁) for d in view(D, 1:t)) *
		           prod(bern(d, p₂) for d in view(D, t+1:length(D)))
	end
	return likes, sum(likes)
end;
# ╔═╡ 6f57947d-0ecb-4e9c-b8c7-6ca8de2630d8
md"""
**Solution**
We identify the unknown $\theta \triangleq S$: *i.e.* the unknown switching point ``S``:
* ``S\in \{1,2,3,\ldots, 5\}``: after $S$-th toss, he switches the coin;
First, we determine the **likelihood**, or **forward probability**: the conditional probability of observing the data ``\mathcal D`` assuming (*i.e.* conditioning on) we know the switching point S. It is worth noting that this forward probability is straightforward to specify. Knowing the switching point, the whole data becomes two segments of independent coin tosses:
$$p(\mathcal D|S) = \underbrace{\prod_{i=1}^S p(d_i|p =0.5)}_{\text{coin 1}} \underbrace{\prod_{j=S+1}^N p(d_j|p = 0.2)}_{\text{coin 2}};$$
For example, if ``S=2``, the likelihood is ``p(\mathcal D|S=2) = \underbrace{0.5\cdot 0.5}_{\text{before switch}} \times \underbrace{(1-0.2)^4}_{\text{after switch}}``.
$(begin
Plots.plot(1:5, ℓ_switch([0,1,0,0,0,0])[1], xlabel=L"S", ylabel=L"p(\mathcal{D}|S)", title="Likelihood function", st=:sticks, marker=:circle, label="")
end)
To apply Bayes' rule, we need to impose a *prior* distribution over the unknown. To reflect our ignorance, a natural choice is a uniform distribution: for ``S \in \{1,2,\ldots, 5\}``: ``p(S) = 1/5.``
Lastly, we combine the prior and likelihood according to Bayes' rule:
$$p(S|\mathcal D) \propto p(S) p(\mathcal D|S).$$
$(begin
Plots.plot(1:5, 1/5 * ones(5), st=:sticks, color=1,marker=:circle, label="")
Plots.plot!(1:5, 1/5 * ones(5), st=:path, fill=true, color=1, alpha= 0.3, label="Prior")
Plots.plot!(1:5, ℓ_switch([0,1,0,0,0,0])[1]./ℓ_switch([0,1,0,0,0,0])[2], xlabel=L"S", ylabel=L"p(S|\mathcal{D})", title="Posterior distribution of the switching point", st=:sticks, marker=:circle, color=2, label="")
Plots.plot!(1:5, ℓ_switch([0,1,0,0,0,0])[1]./ℓ_switch([0,1,0,0,0,0])[2], st=:path, fill=true, alpha=0.3, color=2, label="Posterior")
end)
Bayes' theorem **updates** our prior belief to the posterior by combining the likelihood. And according to the posterior, it is *most likely* that the switch point is after the second toss.
However, there is a great amount of uncertainty about this *single point estimator*: an alternative hypothesis ``S=3`` is also likely. A true Bayesian way to answer the question is to ship the posterior distribution as an answer rather than just report the most likely point estimator.
"""
# ╔═╡ dc551c95-cae5-4459-bdcd-d98aecb8b5e5
md"""
## Bayesian inference procedure
The switching point example demonstrates how Bayes' theorem is used to answer an inference question. The whole process can be summarised in two stages: **modelling**
and **computation**.
**Bayesian *modelling* stage**
At the modelling stage, we are *telling a story* of the forward probability: *i.e.* how the data is generated hypothetically.
Different from the frequentist approach, Bayesian inference assumes both the **unknown parameter** ``\theta`` and the **observed data** ``\mathcal D`` to be random.
Therefore, Bayesian's storyline includes the data generation process for ``\theta``, *i.e.* **prior** and the observed ``\mathcal D``, *i.e.* **likelihood**.
In other words, both methods require the likelihood but specifying priors for the unknowns is unique to the Bayesian approach.
**Bayesian *computation* stage**
At the **computation** stage, we routinely apply Bayes' rule to answer the inverse inference question or the posterior distribution and finally summarise the posterior to answer specific downstream inference questions.
However, the computation step is only *conceptually* straightforward. There is no technical difficulty when the parameter space $\Theta$ is discrete and finite: *e.g.* the unknown switching point can take one of the five discrete choices. In practice, this exact enumeration method becomes unattainable when the parameter space is continuous and higher-dimensional. And we need more advanced algorithms, *e.g.* Markov Chain Monte Carlo (MCMC) or variational method, to make the computation scalable. We will discuss the advanced computation algorithm in the next chapter.
"""
# ╔═╡ 07e16486-c07c-450c-8de3-8c088c80816a
md"""
It is also worth noting that all Bayesian inference problems follow the same two procedures. Compared with the frequentist methods which are formed by a variety of techniques (to name a few: *Z-test, T-test, Χ²-test, bootstrap*), the Bayesian procedure is uniform and standard.
"""
# ╔═╡ fcb0fab4-d2f7-411f-85f6-0db536f8ee5a
md"""
In summary, Bayesian inference is formed by two blocks of steps: modelling, and computation.
!!! note "Modelling"
1. Specify **Prior** distribution for the unknown: ``p(\theta)``. As the name suggests, a prior represents our prior belief of the unknown variable *before* we see the data. It represents a subjective belief.
2. Determine **Likelihood** function for the observed: ``p(\mathcal D|\theta)``. A likelihood function is a conditional probability of observing the data $\mathcal D$ given a particular $\theta \in \Theta$. The likelihood is used both in classic Frequentist and Bayesian inference.
!!! note "Computation"
3. Compute **Posterior** distribution: ``p(\theta|\mathcal D)``. The third step is straightforward, at least conceptually: *i.e.* mechanically apply Bayes' rule to find the posterior.
4. (optional) Report findings. Summarise the posterior to answer the inference question at hand. This step is optional as Bayesians report the posterior distribution from step three as the answer.
"""
# ╔═╡ 481044fe-07ea-4118-83d8-e9f287406d10
md"""
To consolidate the idea of Bayesian inference's procedure, we will see a few inference examples. When possible, both the frequentist and Bayesian methods will be applied and compared.
"""
# ╔═╡ 198e2787-4901-4478-a092-a84410ad2dd5
md"""
## An example: coin-flipping
To better understand the Bayesian inference procedure and also draw comparisons with the Frequentist's approach, we revisit the inference problem mentioned earlier: inferring the bias of a coin.
> A coin 🪙 is tossed 10 times. And the tossing results are recorded:
> $$\mathcal D=\{1, 1, 1, 0, 1, 0, 1, 1, 1, 0\}$$;
> *i.e.* seven out of the ten tosses are heads (ones). Is the coin **fair**?
We will first see how the frequentist approach solves the problem then the Bayesian method.
"""
# ╔═╡ 32c1a567-0b61-4927-81fc-f66773ed3e05
md"""
### Method 1. Frequentist approach*
The frequentist approach believes ``\theta`` is not a random variable but some fixed constant. Therefore, they will not use Bayes' rule, which treats ``\theta`` as a random variable (with prior and posterior distributions). Instead, the frequentists will form some estimator for ``\theta`` and try to find some long-term frequency property of the estimator. A natural estimator for the bias ``\theta`` is the observed frequency:
$$\hat\theta = \frac{1}{N}\sum_{n=1}^N d_n = \frac{N_h}{N} =0.7,$$
where ``N_h=\sum_n d_n`` denotes the total count of heads, and ``N`` is the number of tosses. However, the true bias of the coin will not exactly be ``\hat\theta=0.7``. If we toss the coin another 10 times, the observed frequency will likely be different from ``0.7``. So how serious shall we treat this ``0.7``? The frequentist wants to know the long-term frequency pattern of the estimator ``\hat\theta`` (but **not** ``\theta``).
One of the frequentist's methods to achieve this is to form a **confidence interval**: an interval ``\theta \in (l, u)`` that traps the true unknown parameter with some good confidence or high probability.
"""
# ╔═╡ ff41d114-0bd4-41ea-9c22-b6af52b3fa21
md"""
A Gaussian-based confidence interval can be formed for the coin's unknown bias:
$$(\hat{\theta} - z_{\alpha/2} \hat{\texttt{se}}, \hat{\theta} + z_{\alpha/2} \hat{\texttt{se}}),$$
where
$$\hat{\texttt{se}}= \sqrt{\frac{\hat{\theta}(1-\hat{\theta})}{N}}.$$
"""
# ╔═╡ ccd3bb12-02a9-4593-b53c-d7081c4e743f
Foldable("Details about the Gaussian based confidence interval.", md"
Based on the central limit theorem, we know the sample average, regardless of the population distribution, asymptotically converges to a Gaussian: i.e.
$$\hat\theta \sim \mathcal N\left (\theta, \frac{\theta(1-\theta)}{N}\right ).$$
*Note that ``\hat\theta`` (the estimator rather than the parameter) is assumed a random variable here as it is a function of the data ``d_i``, which are assumed to be drawn from a Bernoulli distribution. Therefore, we can find ``\hat\theta``'s mean and variance.*
And we apply the so called *plug-in principle*: replace all ``\theta`` in the sampling distribution with its estimator ``\hat\theta``.
Note that better methods exist for the model. Ideally, the Gaussian-based confidence interval should only be used when ``N`` is large. We are only illustrating the idea of the frequentist's confidence interval here.
")
# ╔═╡ 278d302f-6b81-4b00-bdd2-55057417809e
md"""
For our case,
``\hat\theta = \frac{\sum_{n=1}^N d_n}{N}=0.7`` and the standard error about ``\hat \theta`` is: ``\hat{\texttt{se}} = \sqrt{\frac{\hat{\theta} (1-\hat{\theta})}{N}}\approx 0.145.``
Picking a significance level at ``\alpha = 10\%``, the corresponding $z_{\alpha/2}=1.645$. A confidence interval of ``90\%`` therefore is
$$(0.46, 0.94).$$
Since the confidence interval encloses ``\theta=0.5``, the frequentists conclude that we do not have overwhelming evidence to rule out the coin being fair.
But what does the confidence interval mean here? Frequentist's methods assume ``\theta`` is a fixed unknown quantity (and only data $\mathcal D$ is random). The confidence interval surely is **not** a probability statement about ``\theta``. But rather it is a probability statement about the interval itself (which depends on the data therefore random). The idea is probably better explained in the following way.
!!! danger ""
An ``\alpha=10\%`` **confidence interval** (*i.e.* a ``90\%`` CI): if you
*repeat the following two steps, say 10,000 times:*
1. *toss the coin another 10 times*
2. *calculate another confidence interval*: ``(\hat{\theta} - 1.645 \hat{\texttt{se}}, \hat{\theta} + 1.645 \hat{\texttt{se}})`` *as above*
Over the 10,000 realised random intervals, approximately 90% of them trap the true parameter.
The animation below illustrates the idea of a confidence interval. Conditional on the hypothesis that the coin is fair, *i.e.* ``\theta =0.5`` (it works for any ``\theta\in [0,1]``), the above two steps were repeated 100 times, *i.e.* tossing the coin ten times and then forming a confidence interval. The red vertical intervals (there are roughly 10 of them) are the roughly ``10\%`` of CIs that **do not** trap the true bias.
"""
# ╔═╡ 1c64b953-5a3d-4a80-b523-8b8f213f6849
md"""
The final plot of the 100 confidence intervals is also listed below for your reference.
"""
# ╔═╡ 65b17042-1e16-4097-86c9-83c516de803d
md"""
In practice, we make an inference decision based on one of the realised CI (0.46, 0.94), *i.e.* one experiment: it either traps or misses the true value. Nevertheless, it is likely (90% of chance) that the CI at hand is one of the good CIs. And based on this fact, we reach a conclusion. However, there is a small chance that the CI is wrong, *i.e.* it misses the true value. And the error is known as the **Type 1** error.
"""
# ╔═╡ b42dc5e4-143e-4469-be7f-7b1f73ca9064
md"""
If you think the Frequentist's confidence interval is confusing, I agree with you. The thought experiment of repeated experiments does not even make sense in cases like time series analysis: how can one travel back in time to collect another sample?
If you want a more direct answer to the inference question, we need to resort to Bayesian inference. It provides us with a probability statement about the unknown quantity directly:
> "*In light of the observed data, what ``\theta`` is more credible, i.e. what is ``p(\theta|\mathcal D)``?*''
"""
# ╔═╡ c8c99b5a-1bd2-4762-beb2-518e0e8c9aa3
md"""
### Method 2. Bayesian approach
The Bayesian approach assumes ``\theta`` is a random variable with some prior subjective belief. And apply Bayes' rule to update the prior belief to answer the question.
Bayesian inference starts with a model: or "a story on how the data is observed". Note that, from Bayesian's perspective, the data here include **all** data: both known (or observed) and the unknown. On the other hand, the frequentists' storyline only cares about the observed.
For the coin flipping problem, the *story* is straightforward: the unknown ``\theta`` is first drawn from the prior belief's distribution, and then ``N`` realisations ``d_n`` are subsequently drawn from the coin.
**Step 1. set a prior for the unknown: ``p(\theta)``**
The bias, denoted as ``\theta``, is the unknown parameter. To make the posterior computation easier, let's assume $\theta$ can only take 11 discrete values: 0, 0.1, 0.2 up to 1.0:
$\theta \in [0.0, 0.1, 0.2, \ldots, 1.0],$
To show our ignorance, we can further assume a uniform prior over the 11 discrete choices, *i.e.*
$$p(\theta) = \begin{cases} 1/11, & \theta \in \{0, 0.1, \ldots, 1.0\} \\
0, & \text{otherwise}; \end{cases}$$
The prior distribution is shown below:
$(begin
prior_plt = Plots.plot(0:0.1:1.0, 1/11 * ones(11), ylim= [0,1], seriestype=[:path, :sticks], color= 1, fill=true, alpha=0.3, markershape=:circle, xlabel=L"θ", ylabel=L"p(θ)", label="", title="Prior")
end)
"""
# ╔═╡ 65ef62da-f095-4bb2-aa0f-120827bed6e0
md"""
After spelling out the model (*i.e.* prior and likelihood), we apply Bayes' rule to find the posterior.
**Step 3. Apply Bayes' theorem**
Mechanically apply Bayes' rule: *i.e.* multiplying the prior and likelihood then normalise.
Note the prior is a constant, and the posterior is proportional to the likelihood:
$$p(\theta|\mathcal D) \propto p(\theta)\cdot p(\mathcal D|\theta) = \frac{1}{11} \cdot p(\mathcal D|\theta)\propto p(\mathcal D|\theta).$$
The normalising constant $p(\mathcal D)$ is
$$p(\mathcal D) = \sum_{\theta\in \{0.0, 0.1, \ldots, 1.0\}} p(\theta)\cdot p(\mathcal D|\theta).$$
"""
# ╔═╡ db7f7652-c37d-4f3c-a7c6-0fc57112c3ba
Foldable("Why it is called normalising constant?*", md"It *normalises* the unnormalised posterior (the product of prior and likelihood) such that the posterior probability mass add to one:
$$\sum_\theta p(\theta|\mathcal D) = \sum_\theta \frac{p(\theta) \cdot p(\mathcal D|\theta)}{p(\mathcal D)} = \sum_\theta \frac{p(\theta) \cdot p(\mathcal D|\theta)}{\sum_\theta p(\theta)\cdot p(\mathcal D|\theta)} = 1.$$")
# ╔═╡ 785bdafd-4bfc-441b-bd79-f1bbced4efb9
md"""
The posterior therefore can be calculated as follows: for ``\theta \in \{0.0, 0.1, \ldots, 1.0\}``:
$$p(\theta|\mathcal D) = \frac{p(\mathcal D|\theta)}{\sum_{\theta} p(\mathcal D|\theta)}$$
The posterior update procedure is illustrated below. Note that the posterior is of the same shape as the likelihood (since a flat prior has been used). After observing the data, the posterior now centres around 0.7. But it seems 0.8 and 0.6 are also likely.
"""
# ╔═╡ d31477e0-35b0-4968-bfbf-aefa727f3a41
md"""
**Step 4. (optional) Report findings**
Technically, the Bayesian approach has finished once the posterior is finalised. One can ship the full posterior as an answer. However, to answer the particular question of the coin's fairness, we can proceed to *summarise the posterior*.
One way to summarise a posterior distribution is to find the most likely region of the posterior, or **highest probability density interval (HPDI)** such that the enclosed probability within the interval is some number close to 100 %, *e.g.* 90%:
$$p(l \leq \theta \leq u|\mathcal D) = 90 \%.$$
Based on the posterior, we find the corresponding interval is between 0.5 and 0.9 inclusive: *i.e.*
$$p(0.5 \leq \theta \leq 0.9|\mathcal D) \approx 0.90.$$
"""
# ╔═╡ dcc5758e-b1da-48c6-a9cf-5f906fcf76a9
md"""
The fair coin hypothesis, *i.e.* $\theta=0.5$ is within the 90% HPDI. Therefore, we should *not* reject the hypothesis that the coin is fair, which agrees with the frequentist's judgement.
However, compared with Frequentist's confidence interval, **the credible interval is a probability statement about the unknown bias**: the coin's unknown bias is likely to lie within the range 0.5 to 0.9 with a probability of 0.9.
"""
# ╔═╡ cf054a9c-8617-4d22-a398-f73b2d57139f
md"""
## An extension: multiple parameter model
"""
# ╔═╡ 0ff3bf61-3bd3-449d-94d6-cd6abff3d41b
md"""
As mentioned earlier, the Bayesian approach is very flexible in dealing with customised problems. To demonstrate this, we consider an extension of the previous simple inference problem.
Suppose now we have two coins ``A`` and ``B`` with unknown biases: ``0\leq \theta_A \leq 1, 0\leq\theta_B \leq 1``. The two coins are tossed ``N_A=10`` and ``N_B=100`` times, and the numbers of heads, ``N_{A,h}=9, N_{B,h}=89,`` are recorded. Now the inference question is
> Which coin has a larger bias?
*Remarks. Judging from the empirical frequencies only, one may conclude that coin A has a higher probability of success since ``\frac{9}{10} > \frac{89}{100}``. However, this naive answer clearly does not take all possible uncertainties into account. Coin two's estimator clearly is more reliable and certain than coin one. To answer the question, the frequentist needs to start over to find a suitable test.*
"""
# ╔═╡ 657a73e1-88c7-4c4d-be22-410aeeb6ef40
Foldable("More application of the two coin problem.", md"The above problem might look bizarre. But it offers a solution to a wide range of practical problems. One variant is an Amazon sellers' review problem.
!!! note \"Amazon review problem\"
There are two second-hand booksellers on Amazon. Seller A has sold 10 books in total and 7 out of the 10 are left with positive ratings; whereas seller B has 69 out of 100 positive reviews. Which seller is likely to be better?
It is not hard to see the connection between the two problems. Both problems can be modelled with the same stochastic model. And a Bayesian model can be specified as follows.")
# ╔═╡ 397e354d-dd17-4f66-a578-1b6374392953
md"""
**Step 4. Answer questions.**
Bayesian inference can be used to answer questions directly. Rather than framing a null/alternative hypothesis and doing tests like what the frequentists do. We can instead frame the straight-to-the-point question as a posterior probability statement:
*In light of the data, how likely coin ``A`` has a higher bias than coin ``B``?*, *i.e.*
```math
p(\theta_A > \theta_B|\mathcal{D}).
```
The probability can be calculated based on our posterior distribution by summing up the corresponding entries of the joint distribution table:
```math
p(\theta_A > \theta_B|\mathcal{D}) = \sum_{\theta_A > \theta_B:\;\theta_A,\theta_B\in \{0.0, \ldots 1.0\}^2} p(\theta_A, \theta_B|\mathcal{D})
```
"""
# ╔═╡ e100c593-172c-4919-9e5b-b6dd612653f5
md"For our problem, the posterior probability is: "
# ╔═╡ c98177ca-e773-4f00-b27f-79ccb143a5c3
md"""
Figuratively, we are summing up the probability below the line ``\theta_A=\theta_B``, which corresponds to our query's condition. If one happens to want to know the chance that coin ``A``'s bias is twice (or ``k`` times) coin ``B``'s (imagine we are comparing two vaccine's effectiveness rather than two coins), the Bayesian approach can answer that immediately, while the frequentist would need to restart the inference all over again!
```math
p(\theta_A > 2\cdot \theta_B|\mathcal{D}) = \sum_{\theta_A > 2\cdot\theta_B} p(\theta_A, \theta_B|\mathcal{D})
```
"""
# ╔═╡ bc4d9710-7e6c-4bc9-905b-7b3c0c8e9abe
md"""
## What's next?
In the following sections, we are going to discuss topics revolving around the two core building blocks of Bayesian inference: **modelling** and **computation**.
**Modelling**
We will introduce more Bayesian modelling concepts in the next chapter, *e.g.* generative model and prior choices, and model checking. However, the modelling step is more an art than a science. Ideally, each problem should have its own bespoke model. Therefore, more model-specific details will be introduced later when we introduce the individual models.
**Computation**
In chapter 3, we will introduce Markov Chain Monte Carlo (MCMC), a core algorithm that makes Bayesian inference feasible. We will focus on the intuitions behind some popular MCMC algorithms and also practical applied issues on using MCMC, such as chain diagnostics.
**Probabilistic programming `Turing.jl`**
Implementing each Bayesian model and its inference algorithm from scratch is not practical for applied users. Instead, we can specify a Bayesian model effortlessly with the help of probabilistic programming languages, such as `Turing.jl`[^1] and `Stan`[^2]. The downstream Bayesian computation tasks can also be done automatically by the packages. We will see how to use `Turing.jl` to do applied Bayesian inference in chapter 4.
**Bayesian models**
After the general concepts are introduced in the first four chapters, the second half of the course will cover a range of commonly used Bayesian models.
* parametric density estimation
* linear regression
* generalised linear regressions
* multi-level models *a.k.a.* hierarchical Bayesian models
* and so on
"""
# ╔═╡ 66159a91-6a82-4a52-b3fb-9749bb66d4e2
md"""
## Notes
[^1]: [Turing.jl: Bayesian inference with probabilistic programming.](https://turing.ml/stable/)
[^2]: [Stan Development Team. 2022. Stan Modeling Language Users Guide and Reference Manual](https://mc-stan.org/users/documentation/)
"""
# ╔═╡ f0a54db9-2cb1-48ad-953f-35d4e0be7b7c
md"
[**↩ Home**](https://lf28.github.io/BayesianModelling/)
[**↪ Next Chapter**](./section2_modelling.html)"
# ╔═╡ 6a49bd7b-1211-4480-83a3-ca87e26f9b97
md"""
## Appendix
Code used for this chapter (Folded).
"""
# ╔═╡ 14710249-9dc1-4a75-b0b3-25ac565452a5
begin
	# Encode the observed tosses: 1 = head, 0 = tail.
	head, tail = 1, 0
	𝒟 = [head, head, head, tail, head, tail, head, head, head, tail]
	h_total = sum(𝒟)    # total number of heads observed
	N = length(𝒟)       # total number of tosses

	# Bernoulli likelihood of Nₕ heads out of N tosses at bias θ:
	#   ℓ(θ) = θ^Nₕ (1-θ)^(N-Nₕ),
	# computed in log space with xlogy so that θ ∈ {0, 1} is handled
	# gracefully. Pass logLik=true to get the log-likelihood instead.
	function ℓ(θ, N, Nₕ; logLik=false)
		log_like = xlogy(Nₕ, θ) + xlogy(N - Nₕ, 1 - θ)
		if logLik
			return log_like
		else
			return exp(log_like)
		end
	end

	# Evaluate the likelihood on the 11-point grid θ ∈ {0.0, 0.1, …, 1.0}.
	θs = 0:0.1:1.0
	likes = map(θ -> ℓ(θ, N, h_total), θs)
end;
# ╔═╡ 94ae916e-49fb-4426-b261-57b39599a4e7
md"""
**Step 2. Determine the likelihood function: ``p(\mathcal D|\theta)``**
Next, we need to determine the likelihood function: how the observed data, ``\mathcal D``, *is generated* conditional on ``\theta``. As shown earlier, a coin toss follows a Bernoulli distribution:
$$p(d|\theta) = \begin{cases} \theta, & d = \texttt{h} \\ 1-\theta, & d= \texttt{t},\end{cases}$$
Due to the independence assumption, the likelihood for ``\mathcal D`` is just the product:
$$p(\mathcal D|\theta) = p(d_1|\theta)p(d_2|\theta)\ldots p(d_{10}|\theta)= \prod_{n=1}^{N} p(d_n|\theta) = \theta^{\sum_{n}d_n} (1-\theta)^{N-\sum_{n}d_n},$$
For our case, we have observed ``N_h=\sum_n d_n = 7`` heads out of ``N=10`` total tosses, we can therefore evaluate the likelihood function at the pre-selected 11 values of $\theta$.
$(begin
like_plt =
Plots.plot(0:0.1:1.0, θ -> ℓ(θ, N, h_total), seriestype=:sticks, color=1, markershape=:circle, xlabel=L"θ", ylabel=L"p(𝒟|θ)", label="", title="Likelihood")
end)
"""
# ╔═╡ 009a824f-c26d-43e9-bb6d-fd538a19863b
begin
# Normalise the likelihood over the grid to obtain the discrete posterior:
# with a uniform prior, posterior ∝ likelihood.
l = @layout [a b; c]
posterior_dis = likes/ sum(likes)
post_plt = Plots.plot(0:0.1:1.0, posterior_dis, seriestype=:sticks, markershape=:circle, label="", color=2, title="Posterior", xlabel=L"θ", ylabel=L"p(θ|𝒟)", legend=:outerleft)
Plots.plot!(post_plt, 0:0.1:1.0, posterior_dis, color=2, label ="Posterior", fill=true, alpha=0.5)
# Overlay the flat prior (1/11 on each of the 11 grid points) for comparison.
Plots.plot!(post_plt, 0:0.1:1.0, 1/11 * ones(11), seriestype=:sticks, markershape=:circle, color =1, label="")
Plots.plot!(post_plt, 0:0.1:1.0, 1/11 * ones(11), color=1, label ="Prior", fill=true, alpha=0.5)
# Plots.plot!(post_plt, 0.5:0.1:0.9, posterior_dis[6:10], seriestype=:sticks, markershape=:circle, label="95% credible interval", legend=:topleft)
# `prior_plt` and `like_plt` are defined in other cells of this notebook.
Plots.plot(prior_plt, like_plt, post_plt, layout=l)
end
# ╔═╡ 633ad986-dc19-4994-b04e-4ec34432dbf2
begin
θs_refined = 0:0.01:1
posterior_dis_refined = ℓ.(θs_refined, N, h_total)
posterior_dis_refined ./= sum(posterior_dis_refined)
end;
# ╔═╡ be2a2acd-5633-45cc-ab8c-8a064324b287
begin
# Normal-approximation (Wald) confidence interval for a binomial proportion,
# truncated to [0, 1]. `n` trials, `k` successes; `z = 1.96` ⇒ 95% interval.
function cint_normal(n, k; θ=k/n, z=1.96)
p̂ = k / n
half_width = z * sqrt(θ * (1 - θ) / n)
(max(p̂ - half_width, 0), min(p̂ + half_width, 1.0))
end
# True iff `θ` lies strictly inside the open interval `intv = (lo, hi)`.
within_int(intv, θ=0.5) = intv[1] < θ < intv[2]
end;
# ╔═╡ bd85c6ef-8623-4e06-8f7e-1e30927b25b7
begin
# Frequentist coverage simulation: repeat the 10-toss experiment 100 times
# with true θ = 0.5 and compute a confidence interval for each replication.
Random.seed!(100)
θ_null = 0.5
n_exp = 100
trials = 10
# Number of heads in each simulated experiment: Binomial(10, 0.5) draws.
outcomes = rand(Binomial(trials, θ_null), n_exp)
# z = 1.645 gives a 90% normal-approximation interval per experiment.
intvs = cint_normal.(trials, outcomes; z= 1.645)
true_intvs = within_int.(intvs)  # does each interval contain the true θ = 0.5?
# Asymmetric error-bar lengths: distance from the point estimate k/n down to
# the lower end (ϵ⁻) and up to the upper end (ϵ⁺) of each interval.
ϵ⁻ = outcomes/trials .- [intvs[i][1] for i in 1:length(intvs)]
ϵ⁺ = [intvs[i][2] for i in 1:length(intvs)] .- outcomes/trials
end;
# ╔═╡ b113a7ce-7e00-4df3-b1a4-4cf7af005aaf
begin
# Animation: one frame per simulated experiment, plotting its point estimate
# with the 90% CI as an error bar; colour marks whether the CI covers θ = 0.5.
p = hline([0.5], label=L"\mathrm{true}\;θ=0.5", color= 3, linestyle=:dash, linewidth=2, xlabel="Experiments", ylim =[0,1])
@gif for i in 1:n_exp
k_ = outcomes[i]
# intv = cint_normal(trials, k_; z= 1.645)
# intv = intvs[i]
in_out = true_intvs[i]
col = in_out ? 1 : 2  # colour 1: interval covers the true θ; colour 2: it misses
θ̂ = k_/trials
Plots.scatter!([i], [θ̂], label="", yerror= ([ϵ⁻[i]], [ϵ⁺[i]]), markerstrokecolor=col, color=col)
end
end
# ╔═╡ 8332304c-2264-46df-baf1-0a1070927152
begin
# Static version of the coverage plot: intervals that cover θ = 0.5 vs those
# that miss it, in different colours.
# NOTE(review): despite its name, `first_20_intvs` slices 1:100, i.e. ALL
# experiments, not the first 20 — consider renaming (it may be used elsewhere).
first_20_intvs = true_intvs[1:100]
Plots.scatter(findall(first_20_intvs), outcomes[findall(first_20_intvs)]/trials, ylim= [0,1], yerror =(ϵ⁻[findall(first_20_intvs)], ϵ⁺[findall(first_20_intvs)]), label="true", markerstrokecolor=:auto, legend=:outerbottom,legendtitle = "true "*L"\theta"* " within CI ?")
Plots.scatter!(findall(.!first_20_intvs), outcomes[findall(.!first_20_intvs)]/trials, ylim= [0,1], yerror =(ϵ⁻[findall(.!first_20_intvs)],ϵ⁺[findall(.!first_20_intvs)]), label="false", markerstrokecolor=:auto)
hline!([0.5], label=L"\mathrm{true}\;θ=0.5", linewidth =2, linestyle=:dash, xlabel="Experiments")
end
# ╔═╡ 5db12a02-7c7c-47f3-8bea-24e4b5d67cae
# Highest-posterior-density interval for a discretised distribution `ps`
# (only works for uni-modal distributions). Starting at the mode, greedily
# grows the interval towards whichever neighbour carries more mass, until the
# accumulated probability exceeds `α`.
# Returns (lower index, upper index, accumulated probability).
function find_hpdi(ps, α = 0.95)
	cum_p, idx = findmax(ps)  # begin with the mode's mass
	l = idx - 1               # next candidate index on the left
	u = idx + 1               # next candidate index on the right
	while cum_p <= α
		left_ok = l >= 1
		right_ok = u <= length(ps)
		# Whole support consumed but the target not reached (e.g. α ≥ 1):
		# stop rather than loop forever.
		(left_ok || right_ok) || break
		# Expand left when the left neighbour is at least as probable.
		# Ties go left — the original strict `>`/`<` pair fired neither
		# branch when ps[l] == ps[u], which looped forever.
		if left_ok && (!right_ok || ps[l] >= ps[u])
			cum_p += ps[l]
			l -= 1  # unclamped so `l + 1` below stays correct at the edge
		else
			cum_p += ps[u]
			# Unclamped: the original `u = min(u+1, length(ps))` pinned u at
			# the last index, double-counting ps[end] and shifting the
			# returned upper bound when the interval hit the right edge.
			u += 1
		end
	end
	return l+1, u-1, cum_p
end;
# ╔═╡ ccc239c6-327f-4905-9040-4a7b4a51e6e1
begin
# Posterior on the coarse (0.1-step) grid with its HPD interval highlighted.
post_plt2 = Plots.plot(θs, posterior_dis, seriestype=:sticks, markershape=:circle, label="", title=L"\mathrm{Posterior}\, \, p(\theta|\mathcal{D})", xlabel=L"θ", ylabel=L"p(θ|𝒟)")
# NOTE(review): α = 0.87 is requested while the labels claim 90% — presumably
# the coarse grid's accumulated mass overshoots to ≈ 90%; confirm intent.
l1, u1, _ = find_hpdi(posterior_dis, 0.87)
Plots.plot!(post_plt2, θs[l1:u1], posterior_dis[l1:u1], seriestype=:sticks, markershape=:circle, label="90% credible interval", legend=:topleft)
Plots.plot!(post_plt2, θs[l1:u1], posterior_dis[l1:u1], seriestype=:path, color=2, fill=true, alpha=0.3, label="")
Plots.annotate!((0.7, maximum(posterior_dis)/2, ("90 % HDI", 14, :red, :center, "courier")))
end
# ╔═╡ 954ea5d8-ba25-44b9-ac7b-7d3e683cc8e0
begin
# Posterior on the refined (0.01-step) grid with its 90% HPD interval
# highlighted; `l2`/`u2` are referenced by the markdown cell that reports
# the interval's endpoints.
post_plt3 = Plots.plot(θs_refined, posterior_dis_refined, seriestype=:sticks, markershape=:circle, markersize=2, label="", title=L"\mathrm{Posterior}\, \, p(\theta|\mathcal{D}) \mathrm{with\; refined\; space}", xlabel=L"θ", ylabel=L"p(θ|𝒟)")
l2, u2, _ = find_hpdi(posterior_dis_refined, 0.9)
Plots.plot!(post_plt3, θs_refined[l2:u2], posterior_dis_refined[l2:u2], seriestype=:sticks, markershape=:circle, markersize=2, label="", legend=:topleft)
Plots.plot!(post_plt3, θs_refined[l2:u2], posterior_dis_refined[l2:u2], seriestype=:path, fill=true, alpha=0.3, color=2, label="90% credible interval", legend=:topleft)
Plots.annotate!((0.7, maximum(posterior_dis_refined)/2, ("90 % HDI", 15, :red, :center, "courier")))
end
# ╔═╡ 8f46fde2-c906-4d2f-804d-0ae83fb4c87f
md"""
**A more refined approximation:**
We can have a more refined discretisation of the parameter space. For example, we can choose $$\theta \in [0, 0.01, 0.02, \ldots, 1.0],$$ a step size of ``0.01`` rather than ``0.1``. Check the next chapter for a continuous posterior approach. The corresponding 90% HPDI is
``p(``$(θs_refined[l2]) ``\leq \theta \leq`` $(θs_refined[u2])``|\mathcal D)= 90\%.`` We still reach the same conclusion.
"""
# ╔═╡ e93a7045-e652-433e-9444-2213b93b57d0
begin
# Two-coin model: coin A observed N₁ = 10 tosses with Nₕ₁ = 9 heads;
# coin B observed N₂ = 100 tosses with Nₕ₂ = 89 heads.
N₁, N₂ = 10, 100
Nₕ₁, Nₕ₂ = 9, 89
# 51 grid points per parameter (step 0.02).
# NOTE(review): the accompanying prose describes a 101-point (0.01-step) grid
# — the text and this `dis_size` disagree; confirm which is intended.
dis_size = 51
θ₁s, θ₂s = range(0, 1 , dis_size), range(0, 1 , dis_size)
# Joint log-likelihood: the two coins' tosses are independent, so it is the
# sum of the two single-coin log-likelihoods (ℓ defined in an earlier cell).
# NOTE(review): the keyword defaults Nh₁=7, Nh₂=69 do not match the data
# above; they are overridden at every call site here, so they are inert.
function ℓ_two_coins(θ₁, θ₂; N₁=10, N₂=100, Nh₁=7, Nh₂=69, logLik=false)
logℓ = ℓ(θ₁, N₁, Nh₁; logLik=true) + ℓ(θ₂, N₂, Nh₂; logLik=true)
logLik ? logℓ : exp(logℓ)
end
# Log-likelihood over the full (θ_A, θ_B) grid; rows index θ_A, columns θ_B.
ℓ_twos = [ℓ_two_coins(xi, yi; N₁=N₁, N₂=N₂, Nh₁=Nₕ₁, Nh₂=Nₕ₂, logLik=true) for xi in θ₁s, yi in θ₂s]
p𝒟 = exp(logsumexp(ℓ_twos))  # marginal likelihood (evidence) over the grid
# Normalised joint posterior (uniform prior ⇒ posterior ∝ likelihood),
# computed stably via log-sum-exp.
ps = exp.(ℓ_twos .- logsumexp(ℓ_twos))
# P(θ_A > θ_B | 𝒟): total posterior mass strictly below the diagonal.
post_AmoreB = sum([ps[i,j] for j in 1:size(ps)[2] for i in (j+1):size(ps)[1]])
end;
# ╔═╡ 4db4b59e-d59b-4197-befd-0dfa9f703f64
md"""
### A two-parameter Bayesian model
According to the problem statement, we now have a two-parameter model. The two unknowns are the two biases ``\theta_A,\theta_B``. The observed data are simply the counts of tosses and the counts of heads for each coin: ``\mathcal{D} = \{N_A, N_B, N_{A,h}, N_{B,h}\}``.
**Step 1. Prior for the unknown ``p(\theta_A, \theta_B)``.**
Similar to the single coin model, we discretise the two parameters. Now the two-parameter pair can take any value in a grid of values: ``(\theta_A, \theta_B) \in \{0, 0.01, 0.02, \ldots, 1\}\times \{0, 0.01, 0.02, \ldots, 1\}.`` That means the tuple can take any of the ``101^2`` possible combinations of the discretised value pairs, such as ``(0.0, 0.0), (0.0, 0.01), (0.0, 0.02)`` and so on.
To show our ignorance, we can further assume a uniform prior over the ``101^2`` choices, *i.e.*
$$p(\theta_A, \theta_B) = \begin{cases} 1/101^2, & \theta_A,\theta_B \in \{0, 0.01, \ldots, 1.0\}^2 \\
0, & \text{otherwise}; \end{cases}$$
The prior distribution is shown below:
$(begin
p1 = Plots.plot(θ₁s, θ₂s, (x,y) -> 1/(length(θ₁s)^2), st=:surface, xlabel=L"\theta_A", ylabel=L"\theta_B", ratio=1, xlim=[0,1], ylim=[0,1], zlim =[-0.003, maximum(ps)], colorbar=false, c=:plasma, alpha =0.2, zlabel="density", title="Prior")
end)
**Step 2. Determine the likelihood for ``p(\mathcal{D}|\theta_A, \theta_B)``**
According to the problem statement, the two coins' tosses are all independent. The joint likelihood is therefore simply the product of the two individual single-coin likelihoods.
```math
\begin{align}
p(\mathcal{D}|\theta_A, \theta_B) &= p(\mathcal{D}_A|\theta_A) p(\mathcal{D}_B|\theta_B)\\
&=\theta_A^{N_{A,h}}(1-\theta_A)^{N_A- N_{A,h}}\times\theta_B^{N_{B,h}}(1-\theta_B)^{N_B- N_{B,h}}
\end{align}
```
We can evaluate the likelihood at the pre-selected ``101^2`` values of ``(\theta_A, \theta_B)`` pair.
"""
# ╔═╡ 92d267d5-e808-46a2-aa82-744e66cf5c76
md"""
After finishing specifying the model, what is left is to routinely apply Bayes' rule to find the posterior.
**Step 3. Apply Bayes' rule to find the posterior**
Note the prior is a constant, and the posterior is proportional to the likelihood:
$$p(\theta_A, \theta_B|\mathcal D) \propto p(\theta_A, \theta_B)\cdot p(\mathcal D|\theta_A, \theta_B) = \frac{1}{101^2} \cdot p(\mathcal D|\theta_A, \theta_B).$$
And the normalising constant $p(\mathcal D)$ is
$$p(\mathcal D) = \sum_{\theta_A\in \{0.0, \ldots,1.0\}}\sum_{\theta_B\in \{0.0,\ldots, 1.0\}} p(\theta_A, \theta_B)\cdot p(\mathcal D|\theta_A, \theta_B).$$
The updated posterior is plotted below. After observing the data, the posterior now centres around ``(0.9, 0.89)``; however, ``\theta_A``'s marginal distribution has a much heavier tail than the other, which makes perfect sense: coin ``A`` was tossed far fewer times, so its bias is less certain.
$(begin
p2 = Plots.plot(θ₁s, θ₂s, ps', st=:surface, xlabel=L"\theta_A", ylabel=L"\theta_B", ratio=1, xlim=[0,1], ylim=[0,1], zlim =[-0.003, maximum(ps)], colorbar=false, c=:plasma, zlabel="density", alpha=0.7, title="Posterior")
end)
"""
# ╔═╡ a2fc25e6-e9e7-4ce7-9532-0449e6423545
md"""
The heatmap of the posterior density is also plotted for reference.
$(begin
	Plots.plot(θ₁s, θ₂s, ps', st=:heatmap, xlabel=L"\theta_A", ylabel=L"\theta_B", ratio=1, xlim=[0,1], ylim=[0,1], zlim =[-0.003, maximum(ps)], colorbar=false, c=:plasma, zlabel="density", alpha=0.7, title="Posterior's density heatmap")
end)
"""
# ╔═╡ 78ad6108-8058-4c6e-b254-1b508dee2b6f
L"p(\theta_A > \theta_B|\mathcal{D}) \approx %$(round(post_AmoreB, digits=2))"
# ╔═╡ 86b7124e-5042-4f9f-91d7-31b8daad4f98
begin
# Contour plot of the joint posterior with shaded triangular regions marking
# the events θ_A > θ_B and θ_A > k·θ_B (k = 2).
post_p =Plots.plot(θ₁s, θ₂s, ps', st=:contour, xlabel=L"\theta_A", ylabel=L"\theta_B", ratio=1, xlim=[0,1], ylim=[0,1], colorbar=false, c=:thermal, framestyle=:origin)
plot!((x) -> x, lw=1, lc=:gray, label="")  # the diagonal θ_A = θ_B
equalline = Shape([(0., 0.0), (1,1), (1, 0)])  # triangle below the diagonal: θ_A > θ_B
plot!(equalline, fillcolor = plot_color(:gray, 0.2), label=L"\theta_A>\theta_B", legend=:bottomright)
k=2
kline = Shape([(0., 0.0), (1,1/k), (1, 0)])  # steeper triangle: θ_A > k·θ_B
plot!(kline, fillcolor = plot_color(:gray, 0.5), label=L"\theta_A>%$(k)\cdot \theta_B", legend=:bottomright)
end
# ╔═╡ 00000000-0000-0000-0000-000000000001
PLUTO_PROJECT_TOML_CONTENTS = """
[deps]
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
PlutoTeachingTools = "661c6b06-c737-4d37-b85c-46df65de6f69"
PlutoUI = "7f904dfe-b85e-4ff6-b463-dae2292396a8"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
[compat]
Distributions = "~0.25.103"
LaTeXStrings = "~1.4.0"
LogExpFunctions = "~0.3.26"
Plots = "~1.40.8"
PlutoTeachingTools = "~0.2.15"
PlutoUI = "~0.7.54"
StatsPlots = "~0.15.6"
"""
# ╔═╡ 00000000-0000-0000-0000-000000000002
PLUTO_MANIFEST_TOML_CONTENTS = """
# This file is machine-generated - editing it directly is not advised
julia_version = "1.11.0"
manifest_format = "2.0"
project_hash = "c1cd568323f5109a533d045e5ec5fc9cf4b44bd8"
[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.5.0"
weakdeps = ["ChainRulesCore", "Test"]
[deps.AbstractFFTs.extensions]
AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
AbstractFFTsTestExt = "Test"
[[deps.AbstractPlutoDingetjes]]
deps = ["Pkg"]
git-tree-sha1 = "6e1d2a35f2f90a4bc7c2ed98079b2ba09c35b83a"
uuid = "6e696c72-6542-2067-7265-42206c756150"
version = "1.3.2"
[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "6a55b747d1812e699320963ffde36f1ebdda4099"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "4.0.4"
weakdeps = ["StaticArrays"]
[deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"
[[deps.AliasTables]]
deps = ["PtrArrays", "Random"]
git-tree-sha1 = "9876e1e164b144ca45e9e3198d0b689cadfed9ff"
uuid = "66dad0bd-aa9a-41b7-9441-69ab47430ed8"
version = "1.1.3"
[[deps.ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
version = "1.1.2"
[[deps.Arpack]]
deps = ["Arpack_jll", "Libdl", "LinearAlgebra", "Logging"]
git-tree-sha1 = "9b9b347613394885fd1c8c7729bfc60528faa436"
uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
version = "0.5.4"
[[deps.Arpack_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "OpenBLAS_jll", "Pkg"]
git-tree-sha1 = "5ba6c757e8feccf03a1554dfaf3e26b3cfc7fd5e"
uuid = "68821587-b530-5797-8361-c406ea357684"
version = "3.5.1+1"
[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
version = "1.11.0"
[[deps.AxisAlgorithms]]
deps = ["LinearAlgebra", "Random", "SparseArrays", "WoodburyMatrices"]
git-tree-sha1 = "01b8ccb13d68535d73d2b0c23e39bd23155fb712"
uuid = "13072b0f-2c55-5437-9ae7-d433b7a33950"
version = "1.1.0"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
version = "1.11.0"
[[deps.BitFlags]]
git-tree-sha1 = "0691e34b3bb8be9307330f88d1a3c3f25466c24d"
uuid = "d1d4a3ce-64b1-5f1a-9ba4-7e7e69966f35"
version = "0.1.9"
[[deps.Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "9e2a6b69137e6969bab0152632dcb3bc108c8bdd"
uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
version = "1.0.8+1"
[[deps.Cairo_jll]]
deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"]
git-tree-sha1 = "009060c9a6168704143100f36ab08f06c2af4642"
uuid = "83423d85-b0ee-5818-9007-b63ccbeb887a"
version = "1.18.2+1"
[[deps.ChainRulesCore]]
deps = ["Compat", "LinearAlgebra"]
git-tree-sha1 = "3e4b134270b372f2ed4d4d0e936aabaefc1802bc"
uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
version = "1.25.0"
weakdeps = ["SparseArrays"]
[deps.ChainRulesCore.extensions]
ChainRulesCoreSparseArraysExt = "SparseArrays"
[[deps.Clustering]]
deps = ["Distances", "LinearAlgebra", "NearestNeighbors", "Printf", "Random", "SparseArrays", "Statistics", "StatsBase"]
git-tree-sha1 = "9ebb045901e9bbf58767a9f34ff89831ed711aae"
uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
version = "0.15.7"
[[deps.CodeTracking]]
deps = ["InteractiveUtils", "UUIDs"]
git-tree-sha1 = "7eee164f122511d3e4e1ebadb7956939ea7e1c77"
uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
version = "1.3.6"
[[deps.CodecZlib]]
deps = ["TranscodingStreams", "Zlib_jll"]
git-tree-sha1 = "bce6804e5e6044c6daab27bb533d1295e4a2e759"
uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
version = "0.7.6"
[[deps.ColorSchemes]]
deps = ["ColorTypes", "ColorVectorSpace", "Colors", "FixedPointNumbers", "PrecompileTools", "Random"]
git-tree-sha1 = "b5278586822443594ff615963b0c09755771b3e0"
uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
version = "3.26.0"
[[deps.ColorTypes]]
deps = ["FixedPointNumbers", "Random"]
git-tree-sha1 = "b10d0b65641d57b8b4d5e234446582de5047050d"
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
version = "0.11.5"
[[deps.ColorVectorSpace]]
deps = ["ColorTypes", "FixedPointNumbers", "LinearAlgebra", "Requires", "Statistics", "TensorCore"]
git-tree-sha1 = "a1f44953f2382ebb937d60dafbe2deea4bd23249"
uuid = "c3611d14-8923-5661-9e6a-0046d554d3a4"
version = "0.10.0"
weakdeps = ["SpecialFunctions"]
[deps.ColorVectorSpace.extensions]
SpecialFunctionsExt = "SpecialFunctions"
[[deps.Colors]]
deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
git-tree-sha1 = "362a287c3aa50601b0bc359053d5c2468f0e7ce0"
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
version = "0.12.11"
[[deps.Compat]]
deps = ["TOML", "UUIDs"]
git-tree-sha1 = "8ae8d32e09f0dcf42a36b90d4e17f5dd2e4c4215"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "4.16.0"
weakdeps = ["Dates", "LinearAlgebra"]
[deps.Compat.extensions]
CompatLinearAlgebraExt = "LinearAlgebra"