%\documentclass[mathserif]{beamer}
\documentclass[handout]{beamer}
%\usetheme{Goettingen}
\usetheme{Warsaw}
%\usetheme{Singapore}
%\usetheme{Frankfurt}
%\usetheme{Copenhagen}
%\usetheme{Szeged}
%\usetheme{Montpellier}
%\usetheme{CambridgeUS}
%\usecolortheme{}
%\setbeamercovered{transparent}
\usepackage[english, activeacute]{babel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath, amssymb}
\usepackage{dsfont}
\usepackage{graphics}
\usepackage{cases}
\usepackage{graphicx}
\usepackage{pgf}
\usepackage{epsfig}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{amstext}
\usepackage[ruled,vlined,lined]{algorithm2e}
\usepackage{amsmath}
\usepackage{epic}
\usepackage{fontenc}
\usepackage{framed,color}
\usepackage{palatino, url, multicol}
\usepackage{listings}
%\algsetup{indent=2em}
\vspace{-0.5cm}
\title{Introduction to Statistical Inference}
\vspace{-0.5cm}
\author[Felipe Bravo Márquez]{\footnotesize
%\author{\footnotesize
\textcolor[rgb]{0.00,0.00,1.00}{Felipe José Bravo Márquez}}
\date{ \today }
\begin{document}
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Useful references: http://www.buders.com/UNIVERSITE/Universite_Dersleri/istatistik/sampling_distributions_and_point_estimation_of_parameters.pdf
% http://homepage.divms.uiowa.edu/~rdecook/stat2020/notes/ch7_pt1.pdf
\begin{frame}{Populations and Samples}
\scriptsize{
\begin{itemize}
\item The main goal of statistical inference is to investigate properties of a target \textbf{population}.
\item A \textbf{population} is the entire group of individuals that we are interested in studying.
\item This could be anything from all humans to a specific type of cell.
\item The individual elements of the population are sometimes called \textbf{units}.
\item Example: What is the average height of all people in Chile? Here the population is all the inhabitants of Chile.
\item In order to draw conclusions about a \textbf{population}, it is generally not feasible to gather all the data about it.
\item The special case where you collect data on the entire population is a \textbf{census}.
\end{itemize}
}
\end{frame}
\begin{frame}{Populations and Samples}
\scriptsize{
\begin{itemize}
\item In statistical inference we try to make reasonable conclusions about a population based on the evidence provided by \textbf{sample data}.
\item A \textbf{sample statistic} or simply \textbf{statistic} is a quantitative measure calculated from a sample. Examples: the mean, the standard deviation, the minimum, the maximum.
\end{itemize}
\begin{block}{Example}
\begin{itemize}
\item Taking a sample \textbf{survey} can help you determine the percentage of people in a population who have a particular characteristic.
\item Nielsen Media Research takes a \textbf{survey} so they can get an estimate of the proportion of all U.S. households that are tuned to a particular television program \cite{watkins2010statistics}.
\item The true proportion that Nielsen would get from a survey of \textbf{every} household is called a \textbf{population parameter}.
\item Nielsen uses the proportion in the \textbf{sample} as an
\textbf{estimate} of this parameter.
\item Such an estimate from a sample is called a \textbf{statistic}.
\end{itemize}
\end{block}
}
\end{frame}
\begin{frame}{Sampling Methods}
\scriptsize{
\begin{itemize}
\item Our goal in sampling is to determine the value of a statistic for an entire population of interest, using just a small subset of the population.
\item We do this primarily to save time and effort.
\item A good sample is \textbf{representative}: it looks like a small version of the population.
\item In practice, we can't tell if a sample is representative, since we can't get all the population data.
\item But, we can tell whether a \textbf{sampling method} is good or not.
\item A sampling method is \textbf{biased} if it produces samples such that the estimate from the sample is larger or smaller, on average, than the population parameter being estimated.
\item The problems or \textbf{biases} of a sampling method can come from two sources:
\begin{enumerate}
\scriptsize{
\item \textbf{Sampling selection bias}: the way in which the sample is built.
\item \textbf{Response bias}: the method for getting a response.
}
\end{enumerate}
\end{itemize}
}
\end{frame}
\begin{frame}{Sample selection bias}
\scriptsize{
Sample selection bias happens when the sampling method tends to produce estimates of population parameters that are systematically too high or too low \cite{watkins2010statistics}. \\ It can occur in the following ways:
\begin{itemize}
\item \textbf{Size bias}: using a method that gives larger units a bigger chance of being in the sample.
\begin{itemize}
\scriptsize{
\item Example: patients who spent more days in a hospital are more likely to be selected for the sample.}
\end{itemize}
\item \textbf{Voluntary response bias}: letting people volunteer to be in the sample.
\begin{itemize}
\scriptsize{
\item Example: When a radio program asks people to call in and take sides on some issue, those who care about the issue will be over-represented, and those who don't care as much might not be represented at all.}
\end{itemize}
\item \textbf{Convenience sample bias}: units are chosen because of convenience.
\begin{itemize}
\scriptsize{
\item Example: What percentage of the students in your graduating class plan to go to work immediately after graduation?
\item We could use our friends as a quicker and more convenient sample, but such a sample is almost certainly biased because friends are unlikely to be representative of the entire target population.}
\end{itemize}
\end{itemize}
}
\end{frame}
\begin{frame}{Sample selection bias}
\scriptsize{
\begin{itemize}
\item \textbf{Judgment sampling bias}: selecting the sampling units based on ``expert'' judgment. Problem: experts might overlook important features of a population.
\begin{itemize}
\scriptsize{
\item Example: In the early days of US election polling, local ``experts'' were hired to sample voters in their locale by filling certain quotas (so many men, so many women, so many voters over the age of 40, so many employed workers, and so on).
\item The poll takers used their own judgment as to whom they selected for the poll.
\item It took a very close election (the 1948 presidential election, in which polls were wrong in their prediction) for the polling organizations to realize that quota sampling was a biased method.}
\end{itemize}
\end{itemize}
}
\end{frame}
\begin{frame}{Sample selection bias}
\scriptsize{
\begin{itemize}
\item \textbf{Sampling frame bias}: a sampling frame is the ``list'' of all population units from which you select the sample. Constructing an inadequate sampling frame is a cause of bias.
\begin{itemize}
\scriptsize{
\item The problem is that for many populations it is extremely hard to make that ``list''.
\item How would you list all the people using the Internet worldwide or all the ants in a park?
\item For all practical purposes, you can't.
\item There will often be a difference between the population and the sampling frame.
\item A sample might represent the units in the frame quite well, but how well your sample represents the population depends on how well you've chosen your frame.
\item If you start from a bad frame, even the best sampling methods can't save you: bad frame, bad sample \cite{watkins2010statistics}.
}
\end{itemize}
\end{itemize}
}
\end{frame}
\begin{frame}{Response bias}
\scriptsize{
These types of bias derive from the method of obtaining the response.
\begin{itemize}
\item \textbf{Nonresponse bias}: people often refuse to respond to a survey. These people may be different from those who agree to participate.
\item \textbf{Incorrect response or measurement bias}: the bias might be the result of intentional lying, or come from inaccurate measuring devices, including inaccurate memories of people being interviewed in self-reported data.
\begin{itemize}
\begin{scriptsize}
\item Example 1: many people don't want to admit that they watch a certain television program.
\item Example 2: patients in medical studies are prone to overstate how well they have followed the physician's orders.
\item Example 3: many people tend to underestimate the amount of time they actually spend with their cell phones.
\end{scriptsize}
\end{itemize}
\end{itemize}
}
\end{frame}
\begin{frame}{Response bias}
\scriptsize{
\begin{itemize}
\item \textbf{Questionnaire bias}: people's opinions may vary depending on the interviewer's tone of voice, the order in which the questions are asked, the wording of the questions, etc.
\begin{itemize}
\scriptsize{
\item Example: Reader's Digest asked the same 1031 people to respond to these two statements:
\begin{enumerate}
\scriptsize{
\item I would be disappointed if Congress cut its funding for public television.
\item Cuts in funding for public television are justified as part of an overall effort to reduce federal spending.
}
\end{enumerate}
\item Note that agreeing with the first statement is pretty much the same as disagreeing with the second. However:
\item First statement: 54\% agreed, 40\% disagreed, and 6\% didn't know.
\item Second statement: 52\% agreed, 37\% disagreed, and 10\% didn’t know. \cite{barnes1995can}
}
\end{itemize}
\end{itemize}
}
\end{frame}
\begin{frame}{Random Samples}
\scriptsize{
\begin{itemize}
\item The key idea for building a good sample is to \textbf{randomize}, that is, let chance choose the sampling units.
\item Selecting your sample by chance is the only method guaranteed to be unbiased.
\end{itemize}
\begin{block}{Simple Random Sampling}
\begin{itemize}
\item All possible samples of a given fixed size are equally likely.
\item All individuals in the population are indexed and randomly drawn with equal probability until the sample size is reached.
\end{itemize}
\end{block}
\begin{block}{Stratified Random Sampling}
\begin{itemize}
\item We divide the population into subgroups based on shared characteristics (e.g., country, city) that do not overlap and that cover the entire sampling frame.
\item These subgroups are called strata.
\item Take a simple random sample from each stratum, proportional to its size.
\item This ensures that every stratum is properly represented in the sample.
\end{itemize}
\end{block}
}
\end{frame}
\begin{frame}{Random Samples}
\scriptsize{
\begin{figure}[h!]
\centering
\includegraphics[scale=0.4]{pics/sample_strat.png}
\end{figure}
\footnote{Figure source: \url{https://www.scribbr.com/methodology/sampling-methods/}}
}
\end{frame}
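\begin{frame}[fragile]{Random Samples in R}
\scriptsize{
\begin{itemize}
\item As a minimal illustrative sketch (not part of the original example), suppose we have a hypothetical data frame \texttt{pop\_df} with one row per unit and a \texttt{city} column defining the strata.
\item A simple random sample and a proportional stratified sample could then be drawn as follows:
\end{itemize}
\begin{verbatim}
library(dplyr)

# hypothetical population: 1000 units spread over three cities
pop_df <- data.frame(id = 1:1000,
                     city = sample(c("Santiago", "Valparaiso",
                                     "Concepcion"), 1000, replace = TRUE))

# simple random sample of size 100 (each unit equally likely)
srs <- pop_df %>% slice_sample(n = 100)

# stratified random sample: 10 percent of each stratum
strat <- pop_df %>% group_by(city) %>% slice_sample(prop = 0.1)
\end{verbatim}
\begin{itemize}
\item With \texttt{group\_by} followed by \texttt{slice\_sample(prop = 0.1)}, each stratum contributes in proportion to its size, as stratified sampling requires.
\end{itemize}
}
\end{frame}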
\begin{frame}{A Formal Definition of Statistical Inference }
\scriptsize{
\begin{itemize}
\item The process of drawing conclusions about a population from sample data is known as \textbf{statistical inference}.
\item From a general point of view, the goal of inference is to \textbf{infer} the distribution that generates the observed data.
\item Example: Given a sample $X_1, \dots, X_n \sim F$, how do we infer $F$?
\item However, in most cases we are only interested in inferring some property of $F$ (e.g., its \textbf{mean} value).
\item Statistical models that assume that the distribution can be modeled with a finite set of parameters $\theta= (\theta_{1},\theta_{2},\dots,\theta_{k})$ are called \textbf{parametric models}.
\item Example: if we assume that the data comes from a normal distribution $N(\mu,\sigma^2)$, $\mu$ and $\sigma$ would be the parameters of the model.
\end{itemize}
}
\end{frame}
\begin{frame}{Frequentist Approach}
\scriptsize{
The statistical methods to be presented in this class are known as \textbf{frequentist (or classical)} methods. They are based on the following postulates \cite{wasserman2013all}:
\begin{itemize}
\item Probability refers to limiting relative frequencies. Probabilities are objective properties of the real world.
\item Parameters are fixed, unknown constants. Because they are not fluctuating, no useful probability statements can be made about parameters.
\item Statistical procedures should be designed to have well-defined long run frequency properties. For example, a 95 percent confidence interval should trap the true value of the parameter with limiting frequency at least 95 percent.
\end{itemize}
There is another approach to inference called \textbf{Bayesian inference}, which is based on different postulates, to be discussed later in the course.
}
\end{frame}
\begin{frame}{Point Estimation}
\scriptsize{
\begin{itemize}
\item Point estimation is the process of finding the \textbf{best guess} for some quantity of interest from a \textbf{statistical sample}.
\item In a general sense, this quantity of interest could be a parameter in a parametric model, a CDF $F$, a probability density function $f$, a regression function $r$, or a prediction for a future value $Y$ of some random variable.
\item In this class we will consider this quantity of interest as a \textbf{population parameter} $\theta$.
\item By convention, we denote a point estimate of $\theta$ by $\hat{\theta}$ or $\hat{\theta}_n$.
\item It is important to remark that while $\theta$ is an unknown fixed value, $\hat{\theta}$ depends on the sample data and is therefore a random variable.
\item We need to bear in mind that the process of sampling is by definition a \textbf{random experiment}.
\end{itemize}
}
\end{frame}
\begin{frame}{Point Estimation}
%http://homepage.divms.uiowa.edu/~rdecook/stat2020/notes/ch7_pt1.pdf
\scriptsize{
\begin{block}{Formal Definition}
\begin{itemize}
\item Let $X_1, \dots, X_n$ be $n$ IID data points from some distribution $F$.
\item A point estimator $\hat{\theta}_n$ of a parameter $\theta$ is some function of $X_1, \dots, X_n$:
\begin{displaymath}
\hat{\theta}_n=g(X_1, \dots, X_n)
\end{displaymath}
\end{itemize}
\end{block}
\begin{itemize}
\item The \textbf{bias} of an estimator is defined as:
\begin{displaymath}
\text{bias}(\hat{\theta}_n)=\mathbb{E}(\hat{\theta}_n)-\theta
\end{displaymath}
\item An estimator is unbiased if $\mathbb{E}(\hat{\theta}_n)=\theta$ or $\text{bias}(\hat{\theta}_n)=0 $
\end{itemize}
}
\end{frame}
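\begin{frame}[fragile]{Point Estimation: a Simulation Sketch}
\scriptsize{
\begin{itemize}
\item The following sketch (an illustration with arbitrary parameter values, not part of the original material) approximates $\mathbb{E}(\hat{\theta}_n)$ by simulation when the true parameter is known, so the bias can be checked empirically.
\item Here $\hat{\theta}_n$ is the sample mean and the data are drawn from $N(\mu = 10, \sigma = 2)$.
\end{itemize}
\begin{verbatim}
set.seed(42)
theta <- 10                      # true parameter (population mean)
n     <- 30                      # sample size

# draw many samples and compute the estimator on each one
estimates <- replicate(10000, mean(rnorm(n, mean = theta, sd = 2)))

# empirical bias: E(theta_hat) - theta, should be close to 0
mean(estimates) - theta
\end{verbatim}
\begin{itemize}
\item The empirical bias is very close to zero, consistent with the sample mean being an unbiased estimator.
\end{itemize}
}
\end{frame}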
\begin{frame}{Sampling Distribution}
\scriptsize{
% https://en.wikipedia.org/wiki/Sampling_distribution
\begin{itemize}
\item If we take multiple samples, the value of our statistical estimate $\hat{\theta}_n$ will also vary from sample to sample.
\item We refer to this distribution of our estimator across samples as the \textbf{sampling distribution} \cite{poldrack2019statistical}.
\item The sampling distribution may be considered as the distribution of $\hat{\theta}_n$ over all possible samples of size $n$ drawn from the same population.
\item The sampling distribution describes the variability of the point estimate around the true population parameter from sample to sample.
\item We need to bear in mind this is an imaginary concept, since in real situations we can't obtain all possible samples.
\item Actually, in most cases we will only work with a single sample.
\end{itemize}
}
\end{frame}
\begin{frame}{Standard Error}
\scriptsize{
\begin{itemize}
\item The standard deviation of $\hat{\theta}_n$ is called the \textbf{standard error} $se$:
\begin{displaymath}
se(\hat{\theta}_n)=\sqrt{\mathbb{V}(\hat{\theta}_n)}
\end{displaymath}
\item The standard error tells us about the variability of the estimator between all possible samples of the same size.
\item It can be considered as the standard deviation of the sampling distribution.
\item It is a measure of the uncertainty of the point estimate.
\end{itemize}
}
\end{frame}
\begin{frame}{The Sample Mean}
\scriptsize{
\begin{itemize}
\item Let $X_1,X_2,\dots,X_n$ be a random sample of a population of mean $\mu$ and variance $\sigma^2$.
\item Let's suppose that we are interested in estimating the \textbf{population mean} $\mu$ (e.g., the mean height of Chilean people).
\item A sample statistic we can derive from the data is the \textbf{sample mean} $\overline{X_{n}}$:
\begin{displaymath}
\overline{X_{n}}=\frac{1}{n}\sum_{i=1}^{n} X_i
\end{displaymath}
\item The sample mean is a \textbf{point estimator} of the mean $\overline{X_{n}} = \hat{\mu}$.
\item We can show that the sample mean is an unbiased estimator of $\mu$:
\begin{displaymath}
\mathbb{E}(\overline{X_{n}}) = \mathbb{E}(\frac 1n \sum_{i=1}^{n} X_i) = \frac 1n \times \mathbb{E}(\sum_{i=1}^{n} X_i) = \frac 1n (n \times \mu) = \mu
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{The Standard Error of the Sample Mean}
\scriptsize{
\begin{itemize}
\item The standard error of the sample mean $se(\overline{X_{n}}) = \sqrt{\mathbb{V}(\overline{X_{n}})}$ can be calculated as:
\begin{displaymath}
\mathbb{V}(\overline{X_{n}})=\mathbb{V}(\frac 1n \sum_{i=1}^{n} X_i) = \frac{1}{n^2} \mathbb{V}(\sum_{i=1}^{n} X_i) = \frac{n}{n^2} \mathbb{V}(X_i)=\frac{\sigma^2}{n}
\end{displaymath}
\item Then,
\begin{displaymath}
se(\overline{X_{n}}) = \frac{\sigma}{\sqrt{n}}
\end{displaymath}
\item The formula for the standard error of the mean implies that the quality of our measurement involves two quantities: the population variability $\sigma$, and the size of our sample $n$.
\end{itemize}
}
\end{frame}
\begin{frame}{The Standard Error of the Sample Mean}
\scriptsize{
\begin{itemize}
\item We have no control over the population variability, but we do have control over the sample size.
\item Thus, if we wish to improve our sample statistics (by reducing their sampling variability) then we should use larger samples.
\item However, the formula also tells us something very fundamental about statistical sampling.
\item Namely, that the utility of larger samples diminishes with the square root of the sample size.
\item This means that doubling the sample size will not double the quality of the statistics; rather, it will improve it by a factor of $\sqrt{2}$. \cite{poldrack2019statistical}
\end{itemize}
}
\end{frame}
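\begin{frame}[fragile]{The Standard Error of the Sample Mean in R}
\scriptsize{
\begin{itemize}
\item A small simulation sketch (illustrative, with arbitrary parameter choices) to check that the standard deviation of the sample means is close to $\sigma/\sqrt{n}$, and that doubling $n$ improves it only by a factor of $\sqrt{2}$:
\end{itemize}
\begin{verbatim}
set.seed(1)
sigma <- 5

# standard deviation of sample means over many simulated samples
se_sim <- function(n) sd(replicate(10000, mean(rnorm(n, 0, sigma))))

se_sim(25)                # close to 5 / sqrt(25) = 1
se_sim(50)                # close to 5 / sqrt(50) = 0.707
se_sim(25) / se_sim(50)   # close to sqrt(2)
\end{verbatim}
}
\end{frame}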
\begin{frame}{Sample Variance}
\scriptsize{
\begin{itemize}
\item A common problem when calculating $ se(\overline{X_{n}})$ is that, in general, we do not know $\sigma$ of the population.
\item In those cases we can estimate $\sigma^2$ using the \textbf{sample variance} $s^2$:
\begin{displaymath}
s^{2}= \frac{1}{n-1} \sum_{i=1}^{n}(X_{i}-\overline{X_{n}})^2
\end{displaymath}
\item This is an unbiased estimator of the variance.
\item The standard error of the sample mean when the population variance is unknown can be estimated as follows:
\begin{displaymath}
\hat{se}(\overline{X_{n}}) = \frac{s}{\sqrt{n}} \end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{Population Variance}
\scriptsize{
\begin{itemize}
\item There is also the population variance, defined as follows:
\begin{displaymath}
\sigma^{2}= \frac{1}{N} \sum_{i=1}^{N}(X_{i}-\overline{X_{N}})^2
\end{displaymath}
\item The population variance should only be calculated from population data (all the individuals).
\item Note that we are using $N$ instead of $n$ to denote the entire population rather than a sample.
\item If it is calculated from a sample, it is a \textbf{biased} estimator of the population variance.
\end{itemize}
}
\end{frame}
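\begin{frame}[fragile]{Sample vs. Population Variance in R}
\scriptsize{
\begin{itemize}
\item A quick simulation sketch (with illustrative values) comparing the $\frac{1}{n-1}$ estimator (R's \texttt{var}) with the $\frac{1}{n}$ formula when both are computed on samples:
\end{itemize}
\begin{verbatim}
set.seed(7)
sigma2 <- 4      # true population variance
n      <- 10

var_n   <- function(x) mean((x - mean(x))^2)     # 1/n denominator
samples <- replicate(20000, rnorm(n, mean = 0, sd = sqrt(sigma2)))

mean(apply(samples, 2, var))     # ~ 4.0 (unbiased, 1/(n-1))
mean(apply(samples, 2, var_n))   # ~ 3.6 (biased downwards, 1/n)
\end{verbatim}
\begin{itemize}
\item The $\frac{1}{n}$ version systematically underestimates $\sigma^2$, which is why the sample variance uses $n-1$ in the denominator.
\end{itemize}
}
\end{frame}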
\begin{frame}[fragile]{The Sampling Distribution of the Sample Mean}
\scriptsize{
\begin{itemize}
\item We discussed earlier that the sampling distribution is an imaginary concept.
\item Let's imagine the sampling distribution of the sample mean.
\item Imagine drawing (with replacement) all possible samples of size $n$ from a population.
\item Then for each sample, calculate the sample statistic, which in this case is the sample mean.
\item The frequency distribution of those sample means would be the sampling distribution of the mean (for samples of size $n$ drawn from that particular population).
\item In the next example we will calculate the sampling distribution for a toy example in which the population is known.
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{The Sampling Distribution of the Sample Mean}
\scriptsize{
\begin{itemize}
\item Suppose our entire population is a family of 5 siblings and our property of interest is age measured in years.
\item Our population consists of the following 5 values: 2, 3, 4, 5, and 6.
\item Let's calculate the population mean $\mu$ and the population standard deviation $\sigma$.
\end{itemize}
\begin{verbatim}
> pop <-c(2,3,4,5,6)
> mean(pop)
[1] 4
> sd.p=function(x){sd(x)*sqrt((length(x)-1)/length(x))}
> sd.p(pop)
[1] 1.414214
\end{verbatim}
$\mu$=4 and $\sigma=1.414214$
}
\end{frame}
\begin{frame}[fragile]{The Sampling Distribution of the Sample Mean}
\scriptsize{
\begin{itemize}
\item Now, we will use the R library ``gtools'' to draw all 25 possible samples (with replacement) of size $2$.
\end{itemize}
\begin{verbatim}
> library(gtools)
> library(tidyverse)
> samp_size <- 2
> samples<-as_tibble(permutations(length(pop), samp_size,
+ pop, repeats.allowed=TRUE))
> samples
# A tibble: 25 x 2
V1 V2
<dbl> <dbl>
1 2 2
2 2 3
3 2 4
4 2 5
5 2 6
6 3 2
7 3 3
8 3 4
9 3 5
10 3 6
# … with 15 more rows
\end{verbatim}
}
\end{frame}
\begin{frame}[fragile]{The Sampling Distribution of the Sample Mean}
\scriptsize{
\begin{itemize}
\item We can calculate the sample mean of each sample using the command ``mutate'':
\end{itemize}
\begin{verbatim}
> samples <- samples %>% rowwise() %>%
+ mutate(sample_mean=mean(c(V1,V2)))
> samples
# A tibble: 25 x 3
# Rowwise:
V1 V2 sample_mean
<dbl> <dbl> <dbl>
1 2 2 2
2 2 3 2.5
3 2 4 3
4 2 5 3.5
5 2 6 4
6 3 2 2.5
7 3 3 3
8 3 4 3.5
9 3 5 4
10 3 6 4.5
# … with 15 more rows
\end{verbatim}
}
\end{frame}
\begin{frame}[fragile]{The Sampling Distribution of the Sample Mean}
\scriptsize{
\begin{itemize}
\item The distribution of these sample means is the \textbf{sampling distribution}.
\item We can visualize its shape by plotting a histogram:
\begin{verbatim}
ggplot(samples, aes(x=sample_mean)) +
geom_histogram(bins = 10, color="black", fill="white")
\end{verbatim}
\end{itemize}
\begin{figure}[h!]
\centering
\includegraphics[scale=0.35]{pics/hist_sampdist.pdf}
\end{figure}
}
\end{frame}
\begin{frame}[fragile]{The Sampling Distribution of the Sample Mean}
\scriptsize{
\begin{itemize}
\item You may have noticed that the histogram is peaked in the middle and symmetrical.
\item This is a consequence of the Central Limit Theorem!
\item We can see that the population distribution is very different from the sampling distribution:
\begin{verbatim}
ggplot(data.frame(pop), aes(x=pop)) +
geom_histogram(bins = 5, color="black", fill="white")
\end{verbatim}
\end{itemize}
\begin{figure}[h!]
\centering
\includegraphics[scale=0.3]{pics/pop_dist.pdf}
\end{figure}
}
\end{frame}
\begin{frame}[fragile]{The Sampling Distribution of the Sample Mean}
\scriptsize{
\begin{itemize}
\item Let's calculate the mean and the standard deviation of the sample means:
\begin{verbatim}
> mean(samples$sample_mean)
[1] 4
> sd.p(samples$sample_mean)
[1] 1
\end{verbatim}
\item We can see that the mean of the sampling distribution of the mean $\mu_{\overline{X}}$ equals the population mean $\mu$.
\item We can also calculate the theoretical standard error $se=\sigma/\sqrt{n}$
\begin{verbatim}
> sd.p(pop)/sqrt(samp_size)
[1] 1
\end{verbatim}
which is the same as the standard deviation of the sampling distribution of the sample mean.
\item We have validated empirically that the sample mean is a good estimator of the population mean and that its standard error can be calculated from the population standard deviation and the sample size.
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{The Sampling Distribution of the Sample Mean}
\scriptsize{
\begin{itemize}
\item The central limit theorem tells us the conditions under which the sampling distribution of the mean is normally distributed or at least approximately normal.
\item If the population from which you sample is itself normally distributed, then the sampling distribution of the mean will be normal, regardless of sample size.
\item If the population from which you sample is non-normal, the sampling distribution of the mean will still be approximately normal given a large enough sample size.
\item What size is sufficient? Some authors say 30 or 40. But if the population distribution is extremely non-normal (i.e. very skewed) you will need more.
\end{itemize}
}
\end{frame}
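\begin{frame}[fragile]{The Central Limit Theorem in Action}
\scriptsize{
\begin{itemize}
\item A small sketch (illustrative, not part of the original example) drawing samples from a skewed exponential population and looking at the distribution of their means:
\end{itemize}
\begin{verbatim}
library(ggplot2)
set.seed(123)

# 10,000 sample means from a (non-normal) exponential population
means_n5  <- replicate(10000, mean(rexp(5,  rate = 1)))
means_n40 <- replicate(10000, mean(rexp(40, rate = 1)))

# with n = 40 the histogram already looks approximately normal
ggplot(data.frame(x = means_n40), aes(x = x)) +
  geom_histogram(bins = 30, color = "black", fill = "white")
\end{verbatim}
\begin{itemize}
\item Repeating the plot with \texttt{means\_n5} shows a visibly skewed histogram, while for $n=40$ the sampling distribution is already close to normal, as the theorem predicts.
\end{itemize}
}
\end{frame}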
\begin{frame}{Point Estimation of a Proportion}
\scriptsize{
\begin{itemize}
\item Suppose we want to estimate the fraction of people who will vote for a certain candidate.
\item Our population parameter $p$ corresponds to the true fraction of voters for this candidate.
\item We can model a sample of independent voters $X_1, \dots, X_n$, as Bernoulli distributed random variables with parameter $p$.
\item We interpret $X_i=0$ as a negative vote and $X_i=1$ as a positive vote.
\item The sample proportion $\hat{p}_{n}=\frac 1n \sum_{i}X_{i}$ is our estimator of $p$.
\end{itemize}
}
\end{frame}
\begin{frame}{Point Estimation of a Proportion}
\scriptsize{
\begin{itemize}
\item Then $\mathbb{E}(\hat{p}_{n})= \frac 1n \sum_i \mathbb{E}(X_i)=p$, and $\hat{p}_n$ is unbiased.
\item The standard error $se$ would be
\begin{displaymath}
se = \sqrt{\mathbb{V}(\hat{p}_n)}= \sqrt{p(1-p)/n}
\end{displaymath}
\item The estimated standard error $\hat{se}$:
\begin{displaymath}
\hat{se} =\sqrt{\hat{p}(1-\hat p)/n}
\end{displaymath}
\item By the Central Limit Theorem the sampling distribution of the sample proportion converges to a Normal distribution: $\hat{p}_{n} \approx N(p, \hat{se}^2)$.
\item This is because the sample proportion is actually the sample mean of a binary population.
\end{itemize}
}
\end{frame}
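\begin{frame}[fragile]{Point Estimation of a Proportion in R}
\scriptsize{
\begin{itemize}
\item A short simulation sketch (with an arbitrary true proportion $p = 0.3$) to check the formulas for $\hat{p}_n$ and its standard error:
\end{itemize}
\begin{verbatim}
set.seed(10)
p <- 0.3
n <- 500

# one sample of n Bernoulli(p) voters and its sample proportion
x     <- rbinom(n, size = 1, prob = p)
p_hat <- mean(x)
p_hat

# estimated standard error vs. the theoretical one
sqrt(p_hat * (1 - p_hat) / n)
sqrt(p * (1 - p) / n)
\end{verbatim}
}
\end{frame}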
\begin{frame}{Consistency}
\scriptsize{
\begin{itemize}
\item A good estimator is expected to be unbiased and of minimum standard error.
\item Unbiasedness used to receive much attention, but these days it is considered less important.
\item Many of the estimators we will use are biased.
\item A reasonable requirement for an estimator is that it should converge to the true parameter value as we collect more and more data.
\item A point estimator $\hat{\theta}_n$ of a parameter $\theta$ is \textbf{consistent} if it converges to the true value when the number of data in the sample tends to infinity.
\end{itemize}
}
\end{frame}
\begin{frame}{Consistency}
\scriptsize{
\begin{itemize}
\item Theorem: if, for an estimator $\hat{\theta}_n$, its $bias \rightarrow 0$ and its $se \rightarrow 0$ when $n\rightarrow \infty$, then $\hat{\theta}_n$ is a consistent estimator of $\theta$.
\item For example, for the sample mean $\mathbb{E}(\overline{X_{n}})=\mu$, which implies that the $bias =0$.
\item Moreover, $se(\overline{X_{n}}) = \frac{\sigma}{\sqrt{n}}$ converges to zero when $n\rightarrow \infty$.
\item $\overline{X_{n}}$ is a consistent estimator of the mean.
\item For the case of the Bernoulli experiment one has that $\mathbb{E}(\hat{p})=p \Rightarrow bias=0$ and $se = \sqrt{p(1-p)/n} \rightarrow 0$ when $n\rightarrow \infty$.
\item Then $\hat{p}$ is a consistent estimator of $p$.
\end{itemize}
}
\end{frame}
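\begin{frame}[fragile]{Consistency: a Simulation Sketch}
\scriptsize{
\begin{itemize}
\item An illustrative sketch (arbitrary distribution and sample sizes) showing the sample mean getting closer to the true mean $\mu = 2$ as $n$ grows:
\end{itemize}
\begin{verbatim}
set.seed(99)
mu <- 2

# sample means for increasing sample sizes from the same population
sapply(c(10, 100, 1000, 100000),
       function(n) mean(rnorm(n, mean = mu, sd = 3)))
\end{verbatim}
\begin{itemize}
\item The estimates fluctuate around $\mu$ for small $n$ and stabilize near it for large $n$, which is what consistency means in practice.
\end{itemize}
}
\end{frame}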
\begin{frame}{Maximum Likelihood Estimation}
\scriptsize{
\begin{itemize}
\item The estimators we have presented so far (e.g., the sample mean, the sample proportion) are intuitive, easy to compute, and consistent.
\item Maximum Likelihood Estimation (MLE) is a more general framework for estimating the \textbf{parameters} of any \textbf{parametric model}.
\item In MLE, we assume that the sample data is generated by a given probability distribution (continuous or discrete) parameterized by $\theta$ and try to find the value of $\theta$ that maximizes the joint probability of the data under that distribution.
\item Idea: find the parameter values of the assumed statistical model that make the observed data most probable.
\item For example, we can assume that each data point is generated by $N(\mu,\sigma^2)$, then we compute the joint PDF (or PMF) of our data and find the parameter values for $\mu$ and $\sigma$ that maximize that joint density (or mass).
\end{itemize}
}
\end{frame}
\begin{frame}{Maximum Likelihood Estimation}
\scriptsize{
\begin{itemize}
\item Students learning statistics often ask: how would we ever know that the distribution that generated the data is in some parametric model? \cite{wasserman2013all}
\item There are cases where background knowledge suggests that a parametric model provides a reasonable approximation.
\item Example 1: independent binary experiments (e.g., flipping a coin or voting for a candidate) can be adequately represented with Bernoulli or Binomial distributions.
\item Example 2: counts of traffic accidents are known from prior experience to follow approximately a Poisson model.
\item In many other cases non-parametric methods are preferable, but they are beyond the scope of this course.
\end{itemize}
}
\end{frame}
\begin{frame}{Maximum Likelihood Estimation}
\scriptsize{
\begin{itemize}
\item Let $X_1,\dots,X_n$ be IID with PDF (or PMF) $f(x;\theta)$.
\item Since we are assuming that our data samples are independent random variables, the joint density (or mass) would be the product of each PDF (or PMF):
\begin{displaymath}
\mathcal{L}_{n}(\theta)=\prod_{i=1}^nf(X_i;\theta)
\end{displaymath}
\item We refer to this joint density (or mass) as the \textbf{likelihood function}.
\item The likelihood function is just the joint density (or mass) of the data, except that we treat it as a function of the parameter $\theta$.
\item In MLE, we turn the estimation task into an optimization problem:
\begin{center}
\begin{equation}
\begin{split}
\max_{\theta} & \quad \mathcal{L}_{n}(\theta)
\end{split}
\end{equation}
\end{center}
\item The maximum likelihood estimator (MLE), denoted by $\hat{\theta}_n$, is the value of $\theta$ that maximizes $\mathcal{L}_{n}(\theta)$.
\end{itemize}
}
\end{frame}
\begin{frame}{Maximum Likelihood Estimation}
\scriptsize{
\begin{itemize}
\item In many cases the log-likelihood $l_n(\theta)$ is easier to optimize:
\begin{displaymath}
l_n(\theta) = \log(\mathcal{L}_{n}(\theta))=\log(\prod_{i=1}^nf(X_i;\theta))= \sum_{i=1}^{n}\log(f(X_i;\theta))
\end{displaymath}
\item Since the logarithm is a monotonic function, the maximum of $l_n(\theta)$ occurs at the same value of $\theta$ as the maximum of $\mathcal{L}_{n}(\theta)$.
\item If $l_n$ is differentiable we can find $\hat{\theta}_n$ by setting the derivative to zero:
\begin{displaymath}
\frac{\partial l_n}{\partial \theta} = 0
\end{displaymath}
\item The MLE has many good mathematical properties that go beyond the scope of this course to discuss.
\item Some properties worth knowing are that the MLE is \textbf{consistent} and \textbf{asymptotically Normally distributed} (i.e., its sampling distribution converges to a Gaussian).
\end{itemize}
}
\end{frame}
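\begin{frame}[fragile]{Maximum Likelihood Estimation in R}
\scriptsize{
\begin{itemize}
\item A minimal numerical sketch (with simulated, illustrative data): maximizing the Normal log-likelihood with \texttt{optim} and comparing the result with the closed-form MLEs, the sample mean and the standard deviation with denominator $n$:
\end{itemize}
\begin{verbatim}
set.seed(5)
x <- rnorm(200, mean = 3, sd = 2)     # simulated data

# negative log-likelihood of N(mu, sigma^2); par = c(mu, sigma)
negloglik <- function(par) {
  if (par[2] <= 0) return(Inf)        # keep sigma positive
  -sum(dnorm(x, mean = par[1], sd = par[2], log = TRUE))
}

fit <- optim(par = c(0, 1), fn = negloglik)
fit$par                               # numerical MLE of (mu, sigma)

c(mean(x), sqrt(mean((x - mean(x))^2)))  # closed-form MLEs
\end{verbatim}
\begin{itemize}
\item The numerical optimum matches the closed-form values, illustrating that MLE can be carried out by direct optimization when no analytical solution is available.
\end{itemize}
}
\end{frame}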
\begin{frame}{Maximum Likelihood Estimation}
\scriptsize{
\begin{itemize}
\item Example 1: Suppose that $X_1,\dots, X_n \sim$ Bernoulli$(p)$.
\item The probability mass function is $f(x;p)= p^x(1- p)^{1-x}$ for $x = 0,1$. The unknown parameter is $p$.
\item Then,
\begin{displaymath}
\mathcal{L}_{n}(p)=\prod_{i=1}^nf(X_i;p) = \prod_{i=1}^np^{X_i}(1-p)^{1-X_i}=p^S(1-p)^{n-S}
\end{displaymath}
where $S=\sum_{i}X_i$. Hence,
\begin{displaymath}
l_n(p) = S\log p + (n-S)\log(1-p).
\end{displaymath}