### A Pluto.jl notebook ###
# v0.19.46
using Markdown
using InteractiveUtils
# ╔═╡ 00581558-cd4e-4a98-9fd7-82bcd9de6561
using Turing
# ╔═╡ f49dc24c-2220-11ed-0ff2-2df7e9f3a0cf
begin
using PlutoUI
using StatsPlots
using Plots; default(fontfamily="Computer Modern", framestyle=:box) # LaTex-style
using PlutoTeachingTools
using CSV, DataFrames
using LinearAlgebra
using RDatasets
using StatsBase
using StatsFuns:logistic
# using Turing
using GLM
using Random
using LaTeXStrings
using Logging;
Logging.disable_logging(Logging.Warn);
end;
# ╔═╡ ff6fcfa7-dcd1-4529-ac83-46f9f1e17bc7
using Splines2
# ╔═╡ 709e2b13-67d2-4e5b-b148-aba16431b0ae
TableOfContents()
# ╔═╡ 46ea7275-eb7e-4f65-bab9-445a02887064
md"[**↩ Home**](https://lf28.github.io/BayesianModelling/)
"
# ╔═╡ 81a47ad3-18de-4437-a5b2-c803e7938842
md"""
In this chapter, we are going to see how non-linear analysis can be achieved via a technique called fixed basis expansion. It turns out that the Bayesian approach works well with non-linear basis expansion: compared with the frequentist estimation, Bayesian inference yields sparser estimates and generalises better to unseen data.
"""
# ╔═╡ 7f290bd5-06be-4627-b34a-4748521c48e8
md"""
## Non-linear models
The simplest approach to obtaining a non-linear model is to apply **non-linear** functions to the original input predictors first and then estimate a model with the expanded features. This technique is called *basis expansion*. Many more advanced techniques, such as neural networks, support vector machines, and splines, can be viewed as special cases of it.
More formally, given an input vector ``\mathbf{x}_n \in \mathbb{R}^D`` and its target ``y_n``, instead of fitting a linear regression model with ``\mathbf{x}_n``, *i.e.*
```math
y_n = \beta_0 +\mathbf{x}_n^\top \boldsymbol{\beta}_1 + \varepsilon_n
```
we fit the model
```math
\begin{align}
y_n &= \beta_0 +\beta_1 \phi_1(\mathbf{x}_n) + \beta_2\phi_2(\mathbf{x}_n) + \ldots + \beta_K \phi_K(\mathbf{x}_n) + \varepsilon_n
\end{align}
```
where ``\varepsilon_n`` are still independent Gaussian noise terms; the functions ``\{\phi_k\}_{k=1}^K`` are called **basis functions**, and each ``\phi_k`` is
* a ``\mathbb{R}^D\rightarrow \mathbb{R}`` function that transforms a ``D`` dimensional input observation ``\mathbf{x}_n`` to a scalar;
* and the function has to be non-linear.
"""
# ╔═╡ 89a89522-52d4-4591-a22f-5f0737176cf8
md"""
*Remark. The idea can also be applied to generalised linear models (GLMs), such as logistic regression, to obtain generalised non-linear models. The model assumption becomes*
```math
g(\mathbf{E}[y_n|\mathbf{x}_n]) = \beta_0 +\beta_1 \phi_1(\mathbf{x}_n) + \beta_2\phi_2(\mathbf{x}_n) + \ldots + \beta_K \phi_K(\mathbf{x}_n),
```
*where ``g`` is the appropriate link function. For example, ``g^{-1}`` is the logistic function for logistic regression.*
"""
# ╔═╡ 9c53adc3-8ecc-4c45-9c12-addfb30edf8d
md"""
By using matrix notation, the regression model with basis expansion can be compactly written as:
```math
p(\mathbf{y}|\mathbf{X}, \boldsymbol{\beta}, \sigma^2) = \mathcal{N}_N(\mathbf{y}; \mathbf{\Phi} \boldsymbol{\beta}, \sigma^2\mathbf{I}_N),
```
where ``\mathbf{y} = [y_1, y_2,\ldots,y_N]^\top``, ``\mathbf{I}_{N}`` is an ``N\times N`` identity matrix, and ``\mathbf{X}`` is the original ``N\times D`` design matrix, where each row corresponds to one observation and ``D`` is the number of input features:
```math
\mathbf{X} = \begin{bmatrix}\rule[.5ex]{3.5 ex}{0.5pt} & \mathbf{x}_1^\top & \rule[.5ex]{3.5ex}{0.5pt} \\
\rule[.5ex]{3.5ex}{0.5pt} & \mathbf{x}_2^\top & \rule[.5ex]{3.5ex}{0.5pt} \\
&\vdots& \\
\rule[.5ex]{3.5ex}{0.5pt} & \mathbf{x}_N^\top & \rule[.5ex]{3.5ex}{0.5pt}
\end{bmatrix},
```
whereas ``\mathbf\Phi`` is the expanded ``N\times K`` matrix:
```math
\mathbf{\Phi} = \begin{bmatrix}
\phi_1(\mathbf{x}_1) & \phi_2(\mathbf{x}_1) & \ldots & \phi_K(\mathbf{x}_1) \\
\phi_1(\mathbf{x}_2) & \phi_2(\mathbf{x}_2) & \ldots & \phi_K(\mathbf{x}_2) \\
\vdots & \vdots & \ddots & \vdots \\
\phi_1(\mathbf{x}_N) & \phi_2(\mathbf{x}_N) & \ldots & \phi_K(\mathbf{x}_N) \\
\end{bmatrix} = \begin{bmatrix}\rule[.5ex]{3.5 ex}{0.5pt} & \boldsymbol{\phi}_1^\top & \rule[.5ex]{3.5ex}{0.5pt} \\
\rule[.5ex]{3.5ex}{0.5pt} & \boldsymbol{\phi}_2^\top & \rule[.5ex]{3.5ex}{0.5pt} \\
&\vdots& \\
\rule[.5ex]{3.5ex}{0.5pt} & \boldsymbol{\phi}_N^\top & \rule[.5ex]{3.5ex}{0.5pt}
\end{bmatrix},
```
where the ``n``-th row is the ``n``-th observation's ``K``-dimensional expanded feature ``\boldsymbol{\phi}_n \in \mathbb{R}^K``.
It can be immediately observed that the new regression model is still linear with respect to ``\mathbf\Phi``, *i.e.*
```math
\mathbf{y} = \mathbf{\Phi}\boldsymbol{\beta} + \boldsymbol{\varepsilon}.
```
Therefore, the ordinary linear model's inference algorithms can be applied directly. One only needs to replace ``\mathbf{X}`` with the new design matrix ``\mathbf{\Phi}``.
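
As a small illustration of how ``\mathbf{\Phi}`` can be assembled in `Julia`, the sketch below builds the expanded design matrix column by column from a vector of basis functions (the helper `expand` and the example bases are hypothetical and not used elsewhere in this notebook):

```julia
# build Φ (N × K) by applying each basis function ϕₖ to every row of the N × D matrix X
expand(X, ϕs) = hcat([map(ϕ, eachrow(X)) for ϕ in ϕs]...)

# e.g. three RBF bases centred at -1, 0 and 1 (scale s = 1) for a 1-D input
# ϕs = [x -> exp(-0.5 * (x[1] - μ)^2) for μ in (-1.0, 0.0, 1.0)]
# Φ  = expand(X, ϕs)
```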
"""
# ╔═╡ 4d0c7d09-4eac-40ab-9824-5b8739d4b146
md"""
**Example (Polynomial regression)** For simplicity, consider a simple regression problem with one predictor, *i.e.*
```math
y_n = \beta_0 +\beta_1 x_n + \varepsilon_n.
```
A K-th degree polynomial regression can be achieved by assuming
```math
\phi_k(x) = x^k,\;\; \text{for }k = 0,\ldots, K.
```
Note that we have introduced a dummy basis function ``\phi_0(x) = 1`` to serve as the intercept. Substituting in the basis functions, we obtain the non-linear regression
```math
y_n = \beta_0 + \beta_1 x_n + \beta_2 x_n^2 +\ldots+ \beta_K x_n^K + \varepsilon_n.
```
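
In `Julia`, this expansion amounts to stacking powers of ``x`` into a design matrix, which can then be fitted with `GLM.jl`'s `lm`; a minimal sketch (the same construction is used on the `Wage` data in the Appendix):

```julia
# K-th degree polynomial expansion: an N × (K+1) design matrix [1 x x² … xᴷ]
poly_expand(x, K) = hcat([x .^ k for k in 0:K]...)

# e.g. fit a 4-th degree polynomial regression by ordinary least squares
# poly_fit = lm(poly_expand(x, 4), y)   # x, y: predictor and response vectors
```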
"""
# ╔═╡ eb9e22f8-2867-4320-9c81-5d63262089cc
md"As a concrete example, we fit a ``K=4``-th degree Polynomial regression to the Wage data."
# ╔═╡ 8b3660d5-e3a6-4f25-9c5b-553ca36a6b28
md" And the fitted polynomial model is plotted below."
# ╔═╡ b7981b85-d045-442b-89d0-d20cca6381f3
md"""
### Popular basis functions
"""
# ╔═╡ 988fb829-5ddb-43af-8836-f4fc95a945af
md"""
Technically speaking, any non-linear function can be used as a basis function. However, a few choices have proven particularly popular and successful. For example, the following are commonly used in the machine learning community:
* radial basis function (RBF):
```math
\phi_k(x) = \exp\left \{-\frac{1}{2}\left (\frac{x-\mu_k}{s}\right )^2\right \}
```
* sigmoid function (or logistic function)
```math
\phi_k(x) = \frac{1}{1 + \exp(-\frac{x-\mu_k}{s})}
```
* tanh function
```math
\phi_k(x) = \tanh\left (\frac{x-\mu_k}{s}\right )
```
* `ReLU` function
```math
\phi_k(x) = \max\left(0, \frac{x-\mu_k}{s}\right)
```
"""
# ╔═╡ 9e54b0ce-baaa-4116-a08e-2cc93e12026c
md"""
Some of the basis functions are plotted below for reference. It is worth noting that all four functions are *local* functions, each parameterised with
* a location parameter ``\mu_k`` and
* a scale parameter ``s``.
Compared with polynomial basis functions, which are *global* functions, local basis functions are more flexible. To overcome the global-function limitation of polynomials, one possible solution is to fit multiple polynomial pieces over truncated regions (separated by predefined knots); the corresponding basis functions are commonly known as **spline functions**.
"""
# ╔═╡ df342965-f2eb-456f-b4d4-1fc76615e52a
begin
plt_poly_basis = plot(title="Polynomial basis functions")
for k in 1:10
plot!(-1:0.05:1, (x) -> x^k, label="")
end
plt_rbf_basis = plot(title="RBF basis functions")
s = 0.25
for μ in -1:0.2:1
plot!(-1:0.01:1, (x) -> exp(-(x-μ)^2/(2*s^2)), label="")
end
plt_sigmoid_basis = plot(title="Sigmoid basis functions")
s_sig = 0.1
for μ in -1:0.2:1
plot!(-1:0.01:1, (x) -> logistic((x-μ)/s_sig), label="")
end
plt_relu_basis = plot(title="ReLU basis functions")
for μ in -1:0.2:1
plot!(-1:0.01:1, (x) -> max(0,x-μ), label="")
end
plot(plt_poly_basis, plt_rbf_basis, plt_sigmoid_basis, plt_relu_basis)
end
# ╔═╡ d6c1074b-530b-453a-a1e4-cb41b09f4fdf
md"""
#### Specification of basis function parameters
For **fixed basis** expansion models, the user needs to decide on the expansion locations ``\{\mu_k\}_{k=1}^K`` and the scale ``s`` (which is usually shared among the ``K`` functions). One option is to choose the ``k/(K+1)`` percentiles of the input data as the expansion locations. For example, for ``K=3``, the 25%, 50% and 75% percentiles of the input data are used.
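
For instance, the percentile-based locations can be computed with the `quantile` function (a small sketch; `x` stands for a 1-D input vector and `K` for the number of bases, both placeholders here):

```julia
# expansion locations at the k/(K+1) percentiles of the inputs, k = 1, …, K
K  = 3
μs = quantile(x, (1:K) ./ (K + 1))   # for K = 3: the 25%, 50% and 75% percentiles
s  = std(x)                          # one simple heuristic for a shared scale (an assumption)
```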
A **non-parametric option** is to choose all the data points as expansion locations, *i.e.* ``\mu_n= \mathbf{x}_n`` for ``n=1,\ldots, N``. The new design matrix ``\mathbf{\Phi}`` is then an ``N\times (N+1)`` matrix (including the intercept column). The expanded regression has ``N+1`` parameters, a number that grows with the sample size. As will be demonstrated soon, this over-parameterised model proves too flexible for the ordinary frequentist method (*i.e.* maximum likelihood estimation) to handle.
Lastly, the hyperparameters can also be learnt from the data. The model then becomes an **adaptive basis function** model, more commonly known as a **neural network**.
"""
# ╔═╡ 47d9675b-4427-45af-a02a-950961692d5d
md"""
### Frequentist MLE estimation
To demonstrate the idea, we consider a one-dimensional toy dataset first. The data is generated with a non-linear true function:
```math
f(x) = -5\tanh(0.5x) \cdot (1- (\tanh(0.5x))^2).
```
and 18 noisy observations are simulated at a set of selected input locations. The code to simulate the data is listed below.
"""
# ╔═╡ 2aebebfd-7009-4def-8362-1617d18b5c64
begin
function true_f(x)
-5*tanh(0.5*x) * (1- tanh(0.5*x)^2)
end
Random.seed!(100)
x_input = [range(-4, -0.4, 10); range(0.8, 4, 8)]
y_output = true_f.(x_input) .+ sqrt(0.05) * randn(length(x_input))
end;
# ╔═╡ 0edef66b-e323-475b-bd83-972afe153c23
md"The simulated data is plotted below."
# ╔═╡ e9dfde47-306d-4f04-b468-2178c00cce74
begin
scatter(x_input, y_output, label="Observations")
plot!(true_f, lw=2, xlim=[-10, 10], framestyle=:default, lc=:gray, ls=:dash,label="true function", xlabel=L"x", ylabel=L"y")
end
# ╔═╡ 61398b43-d307-4e03-9097-1aae3414a8e7
md"""
We choose the `RBF` as the basis function and pick the expansion locations non-parametrically; that is, we place an `RBF` basis function at every input location. Specifically, ``\mu_n=x_n`` and ``s^2=1.0`` are used to generate the expanded design matrix ``\mathbf \Phi``.
This expansion can be accomplished easily in `Julia`. The code below expands the given input vector ``\mathbf{X}``, an ``N``-element vector, into an ``N\times (N+1)`` design matrix ``\mathbf\Phi`` (the extra column is the intercept).
"""
# ╔═╡ e21db2b0-2c2f-4bfd-9e5a-05b06db4d0fe
begin
# Radial basis function
	# also known as the Gaussian kernel
function rbf_basis(x, μ, s)
exp.(-1 ./(2 .* s^2) .* (x .- μ).^2)
end
# vectorised version of basis expansion by RBF
function basis_expansion(xtest, xtrain, s=1, intercept = false)
# number of rows of xtrain, which is the size of basis
n_basis = size(xtrain)[1]
n_obs = size(xtest)[1]
X = rbf_basis(xtest, xtrain', s)
intercept ? [ones(n_obs) X] : X
end
intercept = true
sϕ = 1.0
Φ = basis_expansion(x_input, x_input, sϕ, intercept);
end;
# ╔═╡ 2a0c544f-0442-41d7-84b7-b716b2a909ec
md"""
Next, we apply the frequentist maximum likelihood estimation (MLE) method (a.k.a. ordinary least squares for linear regression) to fit the model with the new design matrix ``\mathbf\Phi`` and the output ``\mathbf y``.
"""
# ╔═╡ bf7857fd-1b69-4f54-86b7-4f6082081693
freq_ols_model = lm(Φ, y_output);
# ╔═╡ 2097b04a-c6a2-433a-8393-a5c4794db6c1
md"""
The model is then used to predict on some testing data ``\mathbf{X}_{\text{test}}``. Note that to use the fitted model, one needs to apply the same basis expansion to the testing data before making the prediction. The estimated model is plotted below together with the true signal function and the observations.
"""
# ╔═╡ 53c7c995-c9c3-409c-9a77-1bea31bd9fa9
begin
# apply the same expansion on the testing dataset
x_test = -10:0.1:10
Φₜₑₛₜ = basis_expansion(x_test, x_input, sϕ, intercept)
tₜₑₛₜ = true_f.(x_test)
# predict on the test dataset
βₘₗ = coef(freq_ols_model)
pred_y_ols = Φₜₑₛₜ*βₘₗ # or equivalently, one can use predict(freq_ols_model, Φₜₑₛₜ) from the GLM.jl package
end;
# ╔═╡ cd3c7fa8-dc4c-4101-b9d5-47e92c8df6f3
let
plot(x_test, tₜₑₛₜ, linecolor=:black, ylim= [-3, 3], lw=2, linestyle=:dash, lc=:gray,framestyle=:default, label="true signal")
scatter!(x_input, y_output, markershape=:xcross, markersize=3, markerstrokewidth=3, markercolor=:gray, markeralpha=2, label="Obvs")
plot!(x_test, pred_y_ols, linestyle=:solid, lw=2, xlabel=L"x", ylabel=L"y", legend=:topright, label="Frequentist Est.")
end
# ╔═╡ c2ac1425-cff9-4cd9-8b6e-6465a9299274
md"""
**Frequentist MLE overfits the data.** It can be observed that the frequentist estimate fits the data too perfectly: the prediction goes through every single observation, yet it is very bumpy and erratic at many locations. This behaviour is called **overfitting**. An overfitted model generalises poorly to unseen data.
"""
# ╔═╡ 8975b5af-3e5b-45ba-8954-6336addaa246
md"""
## Bayesian inference
"""
# ╔═╡ 10dac645-8078-45d7-ad66-64160f99a9a4
md"""
Since the model is still linear after fixed basis expansion, the Bayesian model specified before can be reused immediately. Taking regression as an example, a Bayesian model with Gaussian priors on the regression parameters can be specified as follows:
!!! infor "Bayesian linear regression"
```math
\begin{align}
\text{Priors: }\;\;\;\;\;\;\beta_0 &\sim \mathcal{N}(0, v_0^{\beta_0})\\
\boldsymbol{\beta}_1 &\sim \mathcal{N}(0, \lambda^2\mathbf{I})\\
\sigma^2 &\sim \texttt{HalfCauchy}(s_0) \\
\text{Likelihood: }\;\;\text{for } n &= 1,2,\ldots, N:\\
\mu_n &=\beta_0 + \boldsymbol{\beta}_1^\top \boldsymbol{\phi}_n \\
y_n &\sim \mathcal{N}(\mu_n, \sigma^2).
\end{align}
```
Compared with the Bayesian model used earlier, the only difference is that we have used the generated feature ``\boldsymbol{\phi}_n`` instead of ``\mathbf{x}_n`` in the likelihood part.
"""
# ╔═╡ 100e5718-52df-4451-a42a-58c54ca5dfe5
md"""
### Hierarchical Bayesian modelling
One drawback of the above model is that the user needs to specify the parameters of the priors, which are called hyper-parameters. For example, a zero mean Gaussian prior is imposed for the regression parameter ``\boldsymbol{\beta}_1``:
```math
\boldsymbol{\beta}_1 \sim \mathcal{N}(0, \lambda^2\mathbf{I}),
```
where ``\lambda^2 >0``, the prior variance, is a hyper-parameter that needs to be specified. In previous chapters, we have set all hyperparameters manually such that the priors are weakly-informative.
However, the Bayesian approach offers a more principled alternative: introduce an additional layer of priors, called **hyper-priors**, on the hyperparameters. We usually set non-informative or weakly informative hyper-priors.
For our regression problem, the prior scale parameter ``\lambda`` controls the complexity of the regression, so it makes sense to let the data determine its value. Since ``\lambda`` is a positive real value, a suitable prior is ``\texttt{HalfCauchy}(1)``.
!!! infor "Hierarchical Bayesian linear regression"
```math
\begin{align}
\text{Hyper-prior: }\;\;\;\;\;\;\;\lambda &\sim \texttt{HalfCauchy}(1.0) \\
\text{Priors: }\;\;\;\;\;\;\beta_0 &\sim \mathcal{N}(0, v_0^{\beta_0})\\
\boldsymbol{\beta}_1 &\sim \mathcal{N}(\mathbf{0}, \lambda^2\mathbf{I})\\
\sigma^2 &\sim \texttt{HalfCauchy}(s_0) \\
\text{Likelihood: }\;\;\text{for } n &= 1,2,\ldots, N:\\
\mu_n &=\beta_0 + \boldsymbol{\beta}_1^\top \boldsymbol{\phi}_n \\
y_n &\sim \mathcal{N}(\mu_n, \sigma^2).
\end{align}
```
The hierarchical Bayesian model is translated into `Turing` as follows.
"""
# ╔═╡ f969414b-6439-489d-af7a-fbb80cc9fff3
# Bayesian linear regression.
@model function hier_linear_regression(X, ys; v₀ = 10^2, s₀ = 5)
# Set hyperprior for the λ
λ ~ truncated(Cauchy(0.0,1.0), lower=0.0)
# Set variance prior.
σ² ~ truncated(Cauchy(0, s₀), 0, Inf)
intercept ~ Normal(0, sqrt(v₀))
# Set the priors on our coefficients.
nfeatures = size(X, 2)
coefficients ~ MvNormal(nfeatures, λ)
# Calculate all the mu terms.
μs = intercept .+ X * coefficients
for i in eachindex(ys)
ys[i] ~ Normal(μs[i], sqrt(σ²))
end
# ys .~ Normal.(μs, sqrt(σ²))
# ys ~ arraydist(Normal.(μs, sqrt(σ²)))
# ys ~ MvNormal(μs, sqrt(σ²))
return (; μs, ys)
end
# ╔═╡ 7c84879e-7bfe-41da-81f8-eab4f1360511
md"""
Next, we apply the `Turing` model to fit our simulated dataset.
"""
# ╔═╡ ef0da715-15b3-4dee-8a8a-dccd2bcc36f6
chain_hier = let
Random.seed!(100)
# note that we exclude the first column i.e. the intercept since it has already been included in the Turing model
sample(hier_linear_regression(Φ[:, 2:end], y_output; v₀ = 5, s₀= 2),
# NUTS{Turing.TrackerAD}(),
NUTS(),
1000)
end;
# ╔═╡ 33803b7b-a60d-4c1b-a5ac-92b5e8aae8cd
md"""
Next, we compare the Bayesian prediction with the frequentist one. To make predictions on new testing inputs with `Turing`, we can use `generated_quantities()`. For example, the following code returns samples of the posterior predictive ``\mathbf{y}_{\texttt{test}}\sim p(\mathbf{y}_{\texttt{test}}|\mathcal{D}, \mathbf{X}_{\texttt{test}})`` as well as the prediction means ``\boldsymbol{\mu}_{\texttt{test}}\sim p(\boldsymbol{\mu}_{\texttt{test}}|\mathcal{D}, \mathbf{X}_{\texttt{test}})`` based on the posterior samples.
```julia
pred_model = hier_linear_regression(Φₜₑₛₜ[:, 2:end],
Vector{Union{Missing, Float64}}(undef, size(Φₜₑₛₜ)[1]);
v₀ = 5, s₀= 2)
generated_quantities(pred_model, chain_hier)
```
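
The call returns an array of named tuples, one ``(\boldsymbol{\mu}_{\texttt{test}}, \mathbf{y}_{\texttt{test}})`` draw per posterior sample; a Monte Carlo estimate of the prediction mean can then be obtained by averaging the draws (a minimal sketch continuing the code above):

```julia
gq = generated_quantities(pred_model, chain_hier)   # array of (; μs, ys) named tuples
μ_test_mean = mean(g -> g.μs, vec(gq))              # element-wise average ≈ E[μ_test | data]
```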
"""
# ╔═╡ 3f66d862-46f7-43b4-a93f-e4b5d861b5b5
Foldable("Find the posterior samples manually.", md"""
Since the problem is relatively simple, we can manually calculate the posterior samples for the prediction mean ``\mu`` based on the following deterministic relationship:
```math
\boldsymbol{\mu}^{(r)} = \mathbf{\Phi}_{\texttt{test}}\boldsymbol{\beta}^{(r)}
```
which can be implemented as
```julia
# extract the posterior samples as an R × (N+1) matrix; columns 3 to the end are the samples of β (intercept included)
βs_samples = Array(chain_hier[:,:,1])[:, 3:end]
# need to apply a transpose on the β samples such that the matrix multiplication makes sense
μ_preds = Φₜₑₛₜ * βs_samples'
```
""")
# ╔═╡ 1e7628d4-e698-4469-b433-a0b9c3ffaa4a
md"""
Lastly, we plot the Bayesian and frequentist predictions together for comparison. It can be observed that the Bayesian approach automatically avoids **overfitting**. Notably, the Bayesian prediction is more reasonable at the two ends, where data is scarce.
"""
# ╔═╡ 0dd33f5f-7d6c-4ebc-aff4-4a545f41e9bc
begin
	# extract the posterior samples as an R × (N+1) matrix; columns 3 to the end are the samples of β (intercept included)
βs_samples = Array(chain_hier[:,:,1])[:, 3:end]
μ_preds = Φₜₑₛₜ * βs_samples'
plot(x_test, tₜₑₛₜ, linecolor=:black, ylim= [-3, 3], linestyle=:dash, lw=2, lc=:gray, framestyle=:default, label="true signal")
scatter!(x_input, y_output, markershape=:xcross, markersize=3, markerstrokewidth=3, markercolor=:gray, markeralpha=2, label="Obvs")
plot!(x_test, pred_y_ols, linestyle=:solid, lw=2, label="Frequentist")
plot!(x_test, mean(μ_preds, dims=2), linestyle=:solid, lw=2, label="Bayesian's mean")
end
# ╔═╡ 0165ce73-cad2-4e21-ab1c-1539e853bdc2
md"""
Shown below are the 95% credible intervals for the regression function ``\mu``. It can be observed that the uncertainty grows at the two ends and shrinks where there is enough data.
"""
# ╔═╡ 81040385-7216-4b79-af50-14098754a27f
begin
stats_μ_pred = describe(Chains(μ_preds'))[2]
lower = stats_μ_pred[:, Symbol("2.5%")]
upper = stats_μ_pred[:, Symbol("97.5%")]
middle = stats_μ_pred[:, Symbol("50.0%")]
plot(x_test, tₜₑₛₜ, linecolor=:black, ylim= [-3, 3], linestyle=:dash, lw=2, lc=:gray, framestyle=:default, label="true signal")
scatter!(x_input, y_output, markershape=:xcross, markersize=3, markerstrokewidth=3, markercolor=:gray, markeralpha=2, label="Obvs")
βs_hier = Array(chain_hier[:,:,1])[:, 3:end]
bayes_preds2 = Φₜₑₛₜ * βs_hier'
plot!(x_test, middle, linestyle=:solid, lw=2, lc=4, ribbon = (middle-lower, upper-middle), label="Bayesian's mean ± 95% CI")
end
# ╔═╡ 60f86e5e-98c1-475a-badd-6eeab9ebaaf7
md"""
### Why does the Bayesian approach work?
The Bayesian approach makes better-generalised predictions almost out of the box. Compared with the frequentist approach, it has two features that make it stand out. The first is the inclusion of priors: as discussed before, zero-mean prior distributions play a key role in *regularising* extreme values of the regression parameters. The second is the Bayesian *ensemble estimation* approach, as opposed to the frequentist plug-in principle.
"""
# ╔═╡ 38fade9c-fb40-40cd-8c43-d70d896c1f6f
βml_mean = sum(abs.(βₘₗ)); βml_bayes = mean(sum(abs.(βs_samples), dims=2));  # frequentist L₁ norm vs. average of the posterior samples' L₁ norms
# ╔═╡ 7ffdb827-59aa-4742-95cc-d0ff9b9f1fed
md"""
**Regularisation effect**
As discussed in one of the previous chapters, the zero-mean Gaussian prior regularises the estimation such that extreme values of ``\boldsymbol{\beta}`` are discouraged. As a result, the posterior mean of the regression parameters is shrunk towards ``\mathbf{0}``, the prior mean.
To draw a more direct comparison, we compare the ``L_1`` norms,
```math
\Vert \boldsymbol{\beta}\Vert_1 = \sum_{d=1}^D |\beta_d|
```
of the two estimates. For the Bayesian method, we report the average of the posterior samples' ``L_1`` norms.
| | Frequentist | Bayesian|
|---|---|---|
|``{\Vert\beta \Vert_1}``|$(round(βml_mean, digits=2))|$(round(βml_bayes, digits=2))|
As expected, the Bayesian estimate is much smaller in magnitude than the unregularised OLS estimate. The posterior samples of the regression parameters are also plotted below in a `boxplot`.
"""
# ╔═╡ 7e8d71db-6cdd-4b09-b93a-9d58e74c08e3
boxplot("β" .* string.([18:-1:1]'...), Array(chain_hier[:, end:-1:4, :]), leg=false, permute=(:x, :y), outliers=false, ylim=[-10,10], title="Box plot on Bayesian β MCMC samples")
# ╔═╡ 100d76d5-61ed-4e78-8103-a450b3834647
md"""
It can be observed that the majority of the parameters are close to zero, except for a handful such as ``\beta_8, \beta_9, \beta_{11}, \beta_{12}``, which correspond to the 8-th, 9-th, 11-th and 12-th observations' basis functions (check the plot below). In other words, the corresponding bases (or input features) are considered more important in predicting the target, and this *importance* is determined automatically by the Bayesian procedure.
The posterior means of the 18 parameters are plotted below together with the observed data and their corresponding basis functions. The important bases, *i.e.* those whose ``\beta_d`` are estimated with larger magnitudes, are highlighted.
"""
# ╔═╡ fa34843a-dafc-49f4-b3c3-b5426843176b
let
mean_β = mean(βs_samples, dims=1)[2:end]
plt = plot(x_input, mean_β, st=:sticks, xticks=true, ylabel=L"\beta", xlim=[-7,7], xlabel=L"x", marker=:circle, label=L"\mathbb{E}[\beta|\mathcal{D}]")
for i in 1: 50
lbl = i == 1 ? L"\beta^{(r)} \sim p(\beta|\mathcal{D})" : ""
plot!(x_input, βs_samples[i, 2:end], lw=0.2, alpha=0.5, st=:line, ls=:dash, label=lbl, lc=:gray)
end
colors= cgrad(:tab20, length(x_input),scale = :linear, categorical = true)
important_basis = [8,9, 11,12]
for (i, μ) in enumerate(x_input)
scatter!([μ], [y_output[i]], markershape=:xcross, markersize=3, markerstrokewidth=3, markercolor=colors[i], markeralpha=2, label="")
plot!(-7:0.1:7, (x) -> exp(-(x-μ)^2/(2*sϕ^2)), lw=0.8, lc=colors[i], label="")
if i ∈ important_basis
plot!([μ], [mean_β[i]], st=:sticks, marker=:circle, c=:red, label="")
end
end
plt
end
# ╔═╡ 4de2a978-d13b-4968-b45c-645c2da210ff
md"""
**Ensemble learning**
As discussed before, when it comes to prediction at a new testing input location ``\mathbf{x}_\ast``, the Bayesian approach predicts by integration rather than by plug-in estimation:
```math
\begin{align}
p(\mu_\ast |\mathbf{x}_\ast, \mathcal{D}) &= \int p(\mu_\ast|\boldsymbol{\beta}, \mathbf{x}_\ast)p(\boldsymbol{\beta}|\mathcal{D}) \mathrm{d}\boldsymbol{\beta}\\
&= \int {\boldsymbol{\phi}_\ast}^\top \boldsymbol{\beta}\cdot p(\boldsymbol{\beta}|\mathcal{D}) \mathrm{d}\boldsymbol{\beta}\\
&\approx \frac{1}{R} \sum_{r=1}^R \boldsymbol{\phi}_\ast^\top \boldsymbol{\beta}^{(r)},
\end{align}
```
where ``\{\boldsymbol{\beta}^{(r)}\}_r`` are the MCMC posterior samples. In other words, there are ``R`` models, each indexed by one MCMC sample, that contribute to the prediction, and the Bayesian approach simply takes their average. The ensemble prediction idea is illustrated below: the posterior mean of the regression function ``\mathbb E[\mu(x_\ast)|\mathcal{D}]`` (the thick solid line) is plotted together with a few individual models' predictions (lighter gray lines).
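
In code, this Monte Carlo average is just the product of the expanded test design matrix with the posterior samples, followed by a row-wise mean; a compact sketch mirroring the plotting cell below:

```julia
βs     = Array(chain_hier[:, :, 1])[:, 3:end]   # R × (N+1) posterior samples of β (intercept included)
μ_star = vec(mean(Φₜₑₛₜ * βs', dims=2))          # (1/R) Σᵣ ϕ⋆ᵀ β⁽ʳ⁾ at each test input
```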
"""
# ╔═╡ 2c2779a7-9983-4dec-9f99-c110c2d15fef
let
plt =plot(x_test, tₜₑₛₜ, linecolor=:black, ylim= [-3, 3], linestyle=:dash, lw=2, lc=:gray, framestyle=:default, label="true signal")
scatter!(x_input, y_output, markershape=:xcross, markersize=3, markerstrokewidth=3, markercolor=:gray, markeralpha=2, label="Obvs")
# plot!(xs, pred_y_ols, linestyle=:solid, lw=2, label="Frequentist")
βs_hier = Array(chain_hier[:,:,1])[:, 3:end]
bayes_preds2 = Φₜₑₛₜ * βs_hier'
plot!(x_test, mean(bayes_preds2, dims=2), linestyle=:solid, lw=2, lc=4, label="Bayesian's mean")
for i in 1:35
plot!(x_test, bayes_preds2[:, i], linestyle=:dash, lw=0.3, lc=4, label="")
end
plt
end
# ╔═╡ 2770759d-1da9-4194-a3fc-0e6de9241bc5
md"""
## Bayesian non-linear classification
The basis expansion idea can also be applied to solve non-linear classification problems. As a concrete example, consider the following simulated dataset. The two classes clearly cannot be separated by a linear model, since the class 2 data are scattered around two separate centres.
"""
# ╔═╡ 850d643d-f34f-44dd-9a19-e214280d9f21
md"""
### Basis expansion with RBF
"""
# ╔═╡ 5f053027-33ce-4915-b4ba-3cafb99001a6
md"""
As a reference, we fit a (frequentist) logistic regression with the expanded design matrix. The estimated model is plotted below. The frequentist method returns an irregular decision boundary that follows the shape of the class 2 data, and its predictions are very clean-cut (or overconfident), *i.e.* either 0% or 100% classification probabilities are predicted at all input locations. However, one may argue that input locations further away from the data centres should receive less confident predictions.
"""
# ╔═╡ 9837843d-c1a2-4782-8872-3547de23dc8f
md"""
### Bayesian inference with `Turing`
"""
# ╔═╡ dc4715c7-83f5-48bb-9990-1aacdd3050d5
md"""
Next, we apply the Bayesian approach to the same problem. A hierarchical Bayesian logistic regression model is specified below in `Turing`. The prior structure is almost the same as in the regression model; the only difference is the Bernoulli likelihood. The model is then inferred with a `NUTS()` sampler. As a quick demonstration, only 1000 samples are drawn.
"""
# ╔═╡ d46e2d45-92ea-47df-99d6-9e4b11fedfba
begin
@model function hier_logistic_reg(X, y; v₀=10^2)
# Set hyperprior for the λ
λ ~ truncated(Cauchy(0.0,1.0), lower=0.0)
# priors
β₀ ~ Normal(0, sqrt(v₀))
nfeatures = size(X)[2]
β ~ MvNormal(zeros(nfeatures), λ)
# Likelihood
μs = β₀ .+ X * β
# logistic transformations
σs = logistic.(μs)
for i in eachindex(y)
y[i] ~ Bernoulli(σs[i])
end
# y ~ arraydist(Bernoulli.(σs))
return (; σs)
end
end;
# ╔═╡ 3e7742e7-9faf-4a2c-bd05-1c5eb386492d
md"""
### Comparison
"""
# ╔═╡ fbfa39b0-d3f9-455f-bcf5-30c877a44e21
md"""
Next, we compare the two methods' prediction performance. Note that, to make a prediction at a new testing location, we first need to apply the same basis expansion and then, for the Bayesian approach, use Monte Carlo estimation.
One can make a prediction at ``\mathbf{x}_\ast`` by using `predict()`. As an example, the following code makes predictions on randomly generated testing data ``\mathbf{X}_\text{test}``.
```julia
Nₜₑₛₜ = 10
Xₜₑₛₜ = rand(Nₜₑₛₜ,2)
ϕₜₑₛₜ = apply_rbs_expansion(Xₜₑₛₜ, D[xknots,:], σ²_rbf)
pred_model = hier_logistic_reg(ϕₜₑₛₜ, Vector{Union{Missing, Bool}}(undef, Nₜₑₛₜ))
predict(pred_model, chain_bayes_class)
```
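
The draws returned by `predict` are Bernoulli samples of ``\mathbf{y}_{\text{test}}``; averaging them gives a Monte Carlo estimate of the predictive probability ``p(y_\ast=1|\mathbf{x}_\ast, \mathcal{D})``, e.g. (a minimal sketch continuing the code above):

```julia
y_pred_chain = predict(pred_model, chain_bayes_class)   # Chains of 0/1 posterior predictive draws
p_hat = vec(mean(Array(y_pred_chain), dims=1))          # per-test-point predictive probability
```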
"""
# ╔═╡ eaaaaf70-78f5-4da7-a3c4-b10757702991
md"The Bayesian's expected predictions are plotted below together with the Frequentists prediction. It can be observed that Bayesian prediction is more reasonable.
* a circular decision boundary is formed (check the .5 contour line) rather than an irregular one
* the prediction is no longer black and white;
* at input locations further away from the observed data, the predictions are around 0.7 rather than 1.0 "
# ╔═╡ e1f91394-8da7-490a-acb9-d39b95ebdf33
md"""
## Appendix
"""
# ╔═╡ 3c06eb27-6147-4092-9ac5-312d7332cebd
md"The wage dataset"
# ╔═╡ a0b03301-805e-4d05-98ce-a320efff9667
begin
wage_df = dataset("ISLR", "Wage");
end;
# ╔═╡ 4cc79cbf-68cd-4376-8fd8-22f44a0fe3f8
begin
K_degree = 4
N_wage = length(wage_df.Age)
x_wage = wage_df.Age
# create a new design matrix Φ
Φ_poly = Float64.(hcat([x_wage.^k for k in 0:K_degree]...))
lm(Φ_poly, wage_df.Wage) # fit with GLM.jl
end;
# ╔═╡ e0d46820-da53-47e0-83eb-6f6503b3b3fb
let
dt = fit(ZScoreTransform, Φ_poly[:, 2:end], dims=1)
Φ_poly_t= [ones(N_wage) StatsBase.transform(dt, Φ_poly[:, 2:end])]
poly_reg = lm(Φ_poly_t, wage_df.Wage)
@df wage_df scatter(:Age, :Wage, ms=3, malpha=0.1, mc=1,markershape=:circle, label="", xlabel="Age", ylabel="Wage", yguidefontsize=8, xguidefontsize=8, title="$(K_degree)-th degree polynomial regression")
order=sortperm(wage_df.Age)
plot!(wage_df.Age[order], predict(poly_reg)[order], lw=3, lc=2, label="")
end
# ╔═╡ c9c15c9e-7400-4047-b8c1-2bd0bf7f4dfb
begin
wage = Float64.(wage_df.Age)
B = bs(wage, df=7, intercept=false)
end;
# ╔═╡ 2ab8821e-393f-4687-ac38-5f55b6263944
lbm=lm([ones(size(B)[1]) B], wage_df.Wage);
# ╔═╡ 5303c9ea-95c9-49fb-a9ff-0b085c2afae0
logrbm=glm([ones(size(B)[1]) B], wage_df.Wage .> 250, Binomial(), LogitLink());
# ╔═╡ b1d7ad9f-5019-47ea-84cc-7fc53153033b
plt_wage_reg = let
@df wage_df scatter(:Age, :Wage, ms=3, malpha=0.1, mc=1,markershape=:circle, label="", xlabel="Age", ylabel="Wage", yguidefontsize=8, xguidefontsize=8)
order=sortperm(wage_df.Age)
plot!(wage_df.Age[order], predict(lbm)[order], lw=3, lc=2, label="")
end;
# ╔═╡ 41c95a38-d477-4047-bb29-c9d001fc3593
plt_wage_class=let
order=sortperm(wage_df.Age)
plot(wage_df.Age[order], predict(logrbm)[order], lw=3, lc=2, label="", ylim=[-0.01, 0.22], xlabel="Age", ylabel=L"p(\texttt{Wage} >250|\texttt{Age})", yguidefontsize=8, xguidefontsize=8)
id_w250 = wage_df.Wage .> 250
scatter!(wage_df.Age[id_w250] + 0.2*randn(sum(id_w250)), 0.2 .* ones(sum(id_w250)), markershape=:vline, markersize=3, mc=:gray, label="")
scatter!(wage_df.Age[.!id_w250] + 0.2*randn(sum(.!id_w250)), zeros(sum(.!id_w250)), markershape=:vline, markersize=3, mc=:gray, label="")
end;
# ╔═╡ 2c50b998-12e7-4bba-a29c-5b892aa1612a
md"""
# Bayesian non-linear models
So far we have focused on linear models, *i.e.* models in which the relationship between the predictors and the target (or a transformation of it) is linear. The linear assumption makes sense in some simple cases. However, real-world data usually exhibit more complicated structure, and the relationships are often **non-linear**.
As an example, consider the `Wage` dataset discussed in the book [Introduction to Statistical Learning](https://hastie.su.domains/ISLR2/). The dataset records a number of factors that are considered to relate to wages for a group of men from the Atlantic region of the United States. `Age` and `Wage` are plotted below on the left: wage appears to increase with age initially but declines again after approximately age 60. We have also plotted `Age` against the binary variable `Wage > 250` on the right; the chance of earning over 250k is also non-linear with respect to age.
$(begin
plot(plt_wage_reg, plt_wage_class, size=(650,330))
end)
"""
# ╔═╡ 09ed0b3a-1a92-441d-80da-d4ac2b7f80b3
md"Non-linear classification dataset"
# ╔═╡ 50007061-33bc-408f-8d8a-21b05e668cc3
begin
Random.seed!(123)
n_= 30
D1 = [0 0] .+ randn(n_,2)
# D1 = [D1; [-5 -5] .+ randn(n_,2)]
D2 = [5 5] .+ randn(n_,2)
D2 = [D2; [-5 -5] .+ randn(n_,2)]
D = [D1; D2]
targets_ = [repeat(["class 1"], n_); repeat(["class 2"], n_*2)]
targets = [zeros(n_); ones(n_*2)]
df_class = DataFrame(x₁ = D[:, 1], x₂ = D[:, 2], y=targets_)
end;
# ╔═╡ 31647188-cb5b-4e92-8ae3-91247c15d976
begin
Random.seed!(100)
nknots = 25
xknots = randperm(size(D)[1])[1:nknots]
rbf(xs, μ, σ²) = exp.(- 0.5 .* sum((xs .- μ).^2, dims=2) ./ σ²)[:]
function apply_rbs_expansion(D, μs, σ²)
hcat(map((x)-> rbf(D, x', σ²) , eachrow(μs))...)
end
σ²_rbf = 1.0
Φ_class = Array{Float64}(undef, size(D)[1], length(xknots))
for j in 1:length(xknots)
Φ_class[:, j] = rbf(D, D[xknots[j], :]', σ²_rbf)
end
end
# ╔═╡ 009ff641-1cf6-475b-8e63-2594bb40878f
md"""
To solve the problem, we apply (multi-dimensional) RBF basis expansions with randomly selected $(nknots) data points as centres and a fixed scale ``s^2=1.0``:
```math
\text{rbf}(\mathbf{x}, \boldsymbol{\mu}, s^2) = \exp\left( -\frac{1}{2s^2} (\mathbf{x}- \boldsymbol{\mu})^\top (\mathbf{x}- \boldsymbol{\mu})\right)
```
The randomly picked expansion locations and the contour plots of the corresponding RBF functions are plotted below. Each circle represents a new feature in the expanded design matrix ``\mathbf{\Phi}_{\text{class}}``.
"""
# ╔═╡ 267c8824-ad28-4e50-b331-7b6174778562
let
plt=@df df_class scatter(:x₁, :x₂, group=:y, legend=:right, xlabel=L"x_1", alpha=0.2, ylabel=L"x_2", size=(400,350) )
scatter!(D[xknots, 1], D[xknots, 2], markershape=:xcross,alpha=1, color=:gray,markerstrokewidth=2, ms=5, label=L"{\mu}")
for i in 1:length(xknots)
contour!(-7:0.1:7, -7:0.1:7, (x,y)->rbf([x y], D[xknots[i],:]', σ²_rbf)[1], levels=1, alpha=0.5)
end
plt
# b = glm([ones(size(D)[1]) D], targets, Bernoulli(), LogitLink()) |> coef
# contourf!(-10:0.1:10, -10:0.1:10, (x, y) -> logistic(b[1] + x*b[2] + y*b[3]), fill=false, alpha=0.1)
end
# ╔═╡ e0067b9a-dc9e-4dcb-95b9-71f040dd3d5c
rbf_freq_fit=glm([ones(length(targets)) Φ_class], targets, Binomial(), LogitLink());
# ╔═╡ 30547131-c50b-4ab0-a3ce-c55385f070cd
pred_class_feq, pred_class_freq_surf=let
xs = -10:0.2:10
ys = -10:0.2:10
function pred(x, y)
predict(rbf_freq_fit, [1 apply_rbs_expansion([x y], D[xknots,:], σ²_rbf)])[1]
end
plt = contour(xs, ys, pred, levels=10, fill=true, c=:jet1, alpha=0.5, title="Frequentist prediction: "*L"p(y=1|x)")
plt2 = surface(xs, ys, pred, levels=10, fill=true, c=:jet1, alpha=0.5, title="Frequentist prediction: "*L"p(y=1|x)")
@df df_class scatter!(plt, :x₁, :x₂, group=:y, legend=:right, xlabel=L"x_1", alpha=0.2, ylabel=L"x_2", size=(400,350), framestyle=:box )
plt, plt2
end;
# ╔═╡ 73718d33-7a81-41fd-bb80-66f805e08c50
plot(pred_class_feq, pred_class_freq_surf, size=(990,400))
# ╔═╡ 1d3b5326-be29-4c8d-b32a-853a7b46fd2b
chain_bayes_class=let
Random.seed!(123)
sample(hier_logistic_reg(Φ_class, targets), NUTS(), 1000; discard_init=500)
end;
# ╔═╡ fd46999c-ad42-4ad3-a94c-271263a5d7ad
pred_class_bayes, pred_class_bayes_surf =let
pred_class_mod = hier_logistic_reg(Φ_class, Vector{Union{Missing, Bool}}(undef, length(targets)))
β_samples = Array(chain_bayes_class[:,2:end,:])
xs = -10:0.2:10
ys = -10:0.2:10
function pred(x, y, βs)
ϕₓ = [1 apply_rbs_expansion([x y], D[xknots,:], σ²_rbf)]
mean(logistic.(βs * ϕₓ'))
end
plt = contour(xs, ys, (x,y)-> pred(x, y, β_samples), levels= 10,fill=true, alpha=0.5, c=:jet, lw=1, colorbar=true, xlim=[-10,10], framestyle=:box, title="Bayesian prediction: "*L"p(y=1|x)")
plt2 = surface(xs, ys, (x,y)-> pred(x, y, β_samples), levels= 10,fill=true, alpha=0.5, c=:jet, lw=1, colorbar=true, xlim=[-10,10], framestyle=:box, title="Bayesian prediction: "*L"p(y=1|x)")
@df df_class scatter!(plt, :x₁, :x₂, group=:y, legend=:right, xlabel=L"x_1", alpha=0.2, ylabel=L"x_2", size=(400,350) )
plt, plt2
end;
# ╔═╡ fa87351a-80dc-4486-85e6-e852ec244497
plot(pred_class_feq, pred_class_bayes, size=(990,400))
# ╔═╡ 84af3bc4-e82f-4049-96a6-2d5ee018bd97
plot(pred_class_freq_surf, pred_class_bayes_surf, size=(990,400))
# ╔═╡ 2951083d-79c3-4850-9d00-8a6fe030edd0
p_nl_cls = let
@df df_class scatter(:x₁, :x₂, group=:y, legend=:right, xlabel=L"x_1", ylabel=L"x_2")
# b = glm([ones(size(D)[1]) D], targets, Bernoulli(), LogitLink()) |> coef
# contourf!(-10:0.1:10, -10:0.1:10, (x, y) -> logistic(b[1] + x*b[2] + y*b[3]), fill=false, alpha=0.1)
end;
# ╔═╡ db1c1f87-c460-4e32-a965-13bff50fcd40
md"""
$(begin
plot(p_nl_cls, size=(400,350), title="A non-linear classification data")
end)
"""
# ╔═╡ 00000000-0000-0000-0000-000000000001
PLUTO_PROJECT_TOML_CONTENTS = """
[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
PlutoTeachingTools = "661c6b06-c737-4d37-b85c-46df65de6f69"
PlutoUI = "7f904dfe-b85e-4ff6-b463-dae2292396a8"
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Splines2 = "5a560754-308a-11ea-3701-ef72685e98d6"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
[compat]
CSV = "~0.10.4"
DataFrames = "~1.3.4"
GLM = "~1.8.0"
LaTeXStrings = "~1.3.0"
Plots = "~1.40.8"
PlutoTeachingTools = "~0.2.15"
PlutoUI = "~0.7.39"
RDatasets = "~0.7.7"
Splines2 = "~0.2.1"
StatsBase = "~0.34.3"
StatsFuns = "~1.3.0"
StatsPlots = "~0.15.1"
Turing = "~0.34.1"
"""
# ╔═╡ 00000000-0000-0000-0000-000000000002
PLUTO_MANIFEST_TOML_CONTENTS = """
# This file is machine-generated - editing it directly is not advised
julia_version = "1.11.0"
manifest_format = "2.0"
project_hash = "645b92b6a7a766684befbda3746e87ed977182b2"
[[deps.ADTypes]]
git-tree-sha1 = "eea5d80188827b35333801ef97a40c2ed653b081"
uuid = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
version = "1.9.0"
[deps.ADTypes.extensions]
ADTypesChainRulesCoreExt = "ChainRulesCore"
ADTypesEnzymeCoreExt = "EnzymeCore"
[deps.ADTypes.weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
[[deps.AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
version = "1.5.0"
weakdeps = ["ChainRulesCore", "Test"]
[deps.AbstractFFTs.extensions]
AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
AbstractFFTsTestExt = "Test"
[[deps.AbstractMCMC]]
deps = ["BangBang", "ConsoleProgressMonitor", "Distributed", "FillArrays", "LogDensityProblems", "Logging", "LoggingExtras", "ProgressLogging", "Random", "StatsBase", "TerminalLoggers", "Transducers"]
git-tree-sha1 = "c732dd9f356d26cc48d3b484f3fd9886c0ba8ba3"
uuid = "80f14c24-f653-4e6a-9b94-39d6b0f70001"
version = "5.5.0"
[[deps.AbstractPPL]]
deps = ["AbstractMCMC", "Accessors", "DensityInterface", "Random"]
git-tree-sha1 = "6380a9a03a4207bac53ac310dd3a283bb4df54ef"
uuid = "7a57a42e-76ec-4ea3-a279-07e840d6d9cf"
version = "0.8.4"
[[deps.AbstractPlutoDingetjes]]
deps = ["Pkg"]
git-tree-sha1 = "6e1d2a35f2f90a4bc7c2ed98079b2ba09c35b83a"
uuid = "6e696c72-6542-2067-7265-42206c756150"
version = "1.3.2"
[[deps.AbstractTrees]]
git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
version = "0.4.5"
[[deps.Accessors]]
deps = ["CompositionsBase", "ConstructionBase", "InverseFunctions", "LinearAlgebra", "MacroTools", "Markdown"]
git-tree-sha1 = "b392ede862e506d451fc1616e79aa6f4c673dab8"
uuid = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
version = "0.1.38"
[deps.Accessors.extensions]
AccessorsAxisKeysExt = "AxisKeys"
AccessorsDatesExt = "Dates"
AccessorsIntervalSetsExt = "IntervalSets"
AccessorsStaticArraysExt = "StaticArrays"
AccessorsStructArraysExt = "StructArrays"