diff --git a/10-R-trick.Rmd b/10-R-trick.Rmd index f089318..7a35314 100644 --- a/10-R-trick.Rmd +++ b/10-R-trick.Rmd @@ -94,3 +94,37 @@ for (i in 1:100) { total = iter, clear = FALSE) ``` + + +-[Benchmark](https://www.alexejgossmann.com/benchmarking_r/) + +其中的试例代码: +``` +benchmark("lm" = { + X <- matrix(rnorm(1000), 100, 10) + y <- X %*% sample(1:10, 10) + rnorm(100) + b <- lm(y ~ X + 0)$coef + }, + "pseudoinverse" = { + X <- matrix(rnorm(1000), 100, 10) + y <- X %*% sample(1:10, 10) + rnorm(100) + b <- solve(t(X) %*% X) %*% t(X) %*% y + }, + "linear system" = { + X <- matrix(rnorm(1000), 100, 10) + y <- X %*% sample(1:10, 10) + rnorm(100) + b <- solve(t(X) %*% X, t(X) %*% y) + }, + replications = 1000, + columns = c("test", "replications", "elapsed", + "relative", "user.self", "sys.self")) + +``` + +可以很简单的改成我的代码: +``` +benchmark("Rvec2tril"=BayesJMCM::vec2tril(phiiVec) + "Rcppvec2tril"=vec2trilcpp(phiiVec) + ) +``` + diff --git a/22-cpp-for-computing.Rmd b/22-cpp-for-computing.Rmd new file mode 100644 index 0000000..e8b5200 --- /dev/null +++ b/22-cpp-for-computing.Rmd @@ -0,0 +1,428 @@ +# Guide to Scientific Computing in C++ + + +## Basics + +- first compile in command line: +`g++ -o HelloWorld HelloWorld.cpp` +也可以直接编译HelloWorld.cpp,默认的输出名字是a.out +``` +g++ HelloWorld.cpp +``` + +- compile flag: +让编译器保留debug信息,然后不优化代码,这样能进行debug,用“-g” flag +``` +g++ -g -o HelloWorld HelloWorld.cpp +``` + +让编译器链接library +``` +g++ -lm -o HelloWorld HelloWorld.cpp +``` + +## Basics in C++ + +Assert语句进行简单的error抛出: + +``` +#include #include #include +int main(int argc, char * argv[]) { + double a; std::cout << "Enter a non-negative number\n"; + std::cin >> a; assert(a >= 0.0); + std::cout << "The square root of "<< a; + std::cout << " is " << sqrt(a) << "\n"; + return 0; +} +``` + + +## Redirect Console Output to File + +- Require additional header file `fstream`: include +- Declare an output stream variable `write_output` as `std::ofstream` +- Specify filename `Output.dat` +- Check the file is opened successfully: use assert +- Use `write_output` instead of `std::cout` : +- Close the file handle. + +In order to finish these job, the code is + +``` +#include +#include +#include +int main(int argc, char * argv[]) { + double x[3] = {0.0, 1.0, 0.0}; + double y[3] = {0.0, 0.0, 1.0}; + std::ofstream write_output("Output.dat"); + assert(write_output.is_open()); + for (int i=0; i<3; i++) { + write_output << x[i] << " " << y[i] << "\n"; + } + write_output.close(); + return 0; +} +``` + +In scientific computing, the precision is crucial, they can be set as: + +``` +#include +#include + +int main(int argc, char * argv[]) +{ + double x = 1.8364238; std::ofstream write_output("Output.dat"); + write_output.precision(3); // 3 sig figs + write_output << x << "\n"; + + write_output.precision(5); // 5 sig figs + write_output << x << "\n"; + + write_output.precision(10); // 10 sig figs + write_output << x << "\n"; + + write_output.close(); + return 0; +} +``` + +It's similar for read file in stream with cout, just change `cout` to `std::ifstream read_file`. + +The example code in the book is: +``` +#include +#include #include + +int main(int argc, char * argv[]) +{ + double x[6], y[6]; + std::ifstream read_file("Output.dat"); + assert(read_file.is_open()); + for (int i=0; i<6; i++) + { + read_file >> x[i] >> y[i]; + } + read_file.close(); + return 0; +} + +``` + +This is the simplest code that just loop the given length as 6, we want read until it is end. Then we need write until get `read_file.oef()`, so we use while instead of for. +``` +while(!read_file.oef()) +``` + + +### Reading from the Command Line + +感觉这是命令行管道的东西的感觉。 + +User want to set some parameters when executing the code. + +Remind of the definition of main function +``` +int main(int argc, char * argv[]) +``` +The argc and argv function will be used to finish this purpose. + +- Addition header `filecstdlib` and function `atoi` and `atof` will be sued. + +``` +#include +#include +int main(int argc, char *argv[]) +{ + std::cout << "Number of command line arguments = " << argc << "\n"; + for (int i=0; i>>>>如果没定义的话,就定义 +#include + +class ComplexNumber { +private: + double mRealPart; + //real part + double mImaginaryPart; + //imaginary part大概是虚数部分? +public: + ComplexNumber(); //初始化函数,让实部和虚部为0 + ComplexNumber(double x, double y); + double CalculateModulus() const; // 返回一个双精度浮点数包含了这个复数的模 + double CalculateArgument() const;// + ComplexNumber CalculatePower(double n) const; + ComplexNumber& operator=(const ComplexNumber& z); //这个是&的要注意,做了运算符重载? + ComplexNumber operator-() const; + ComplexNumber operator+(const ComplexNumber& z) const; + ComplexNumber operator-(const ComplexNumber& z) const; + friend std::ostream& operator<<(std::ostream& output, + const ComplexNumber& z); + +}; + +#endif +``` + + + +一个新的东西: + +``` +ComplexNumber& ComplexNumber:: + operator=(const ComplexNumber& z) +{ + mRealPart = z.mRealPart; + mImaginaryPart = z.mImaginaryPart; + return * this; +} +``` +这里面出现了两个&,需要看一眼是啥意思: +这里是运算符重载,重载了"="运算符 +用了 const 保证这个赋值时是拷贝而不是alter重名 +所以返回的是这个类自己的地址。 +然后因为是赋值,所以指向了给定值,然后给了一个const,所以不会被改变。 +下面的"-"的重载就没用取地址和指针,这个“-”是取符号不是“减”。 +``` +ComplexNumber ComplexNumber::operator-() const +{ + ComplexNumber w; + w.mRealPart = -mRealPart; + w.mImaginaryPart = -mImaginaryPart; + return w; +} +``` +然后 +是加运算符 +``` +// Overloading the binary + operator +ComplexNumber ComplexNumber::operator+(const ComplexNumber& z) const +{ + ComplexNumber w; + w.mRealPart = mRealPart + z.mRealPart; + w.mImaginaryPart = mImaginaryPart + z.mImaginaryPart; + return w; +} +``` +这里的参数又用了const ComplexNumber& z +Point &pt2=pt1; 定义了pt2为pt1的引用。通过这样的定义,pt1和pt2表示同一对象。 +所以大部分都是引用的感觉。。。 + +所以上面是引用了z,所以是z的正常调用(而且不能更改z),然后把值丢到一个新的w里面。 +这叫常引用,可以不开辟新的内存空间,而又不会影响原值。 + +## 类的继承 + +``` +#ifndef EBOOKHEADERDEF +#define EBOOKHEADERDEF + +#include +#include "Book.hpp" +class Ebook: public Book +{ +public: + Ebook(); + std::string hiddenUrl; +}; +#endif +``` + +这是继承的基础语法 +新的东西是覆盖了初始构造函数的新构造函数Ebook(),和一个std::string的类成员hiddenUrl. + +[三种继承的解释](https://www.cnblogs.com/qlwy/archive/2011/08/25/2153584.html) + + +### 继承类的实时多态 Run-Time Polymorphism + + + +``` +#ifndef GUESTDEF +#define GUESTDEF + +#include + +class Guest +{ +public: + std::string name, roomType, arrivalDate; + int numberOfNights; + double telephoneBill; + virtual double CalculateBill(); + }; + +#endif + +``` + +关注点在virtual关键字,是告诉编译器这个方法可能被继承的类覆盖了。 +这就是虚函数了。。 + +## 模板 + + +[函数返回引用和返回值的区别](https://www.cnblogs.com/JMLiu/p/7928425.html),关键就是有没有多搞一个零时变量。 + + +一个简单的模板的例子: + +``` +#include + +template T GetMaximum(T number1, T number2); + +int main(int argc, char * argv[]) +{ + std::cout << GetMaximum(10, -2) << "\n"; + std::cout << GetMaximum(-4.6, 3.5) << "\n"; return 0; +} + +template T GetMaximum(T number1, T number2) +{ + T result; + if (number1 > number2) + { + result = number1; + } + else + { + //number1 <= number2 + result = number2; + } + return result; +} +``` +相当于T是占位符,然后使用的时候用尖括号<>把类型告诉程序。 + + + +### Brief Survey of the Standard Template Library + +#### Vector + +## Class for linear algebra + +每个new 需要匹配一个 delete. + + + + + + + + + diff --git "a/23-Rcpp-\345\256\236\351\252\214.Rmd" "b/23-Rcpp-\345\256\236\351\252\214.Rmd" new file mode 100644 index 0000000..93e4d65 --- /dev/null +++ "b/23-Rcpp-\345\256\236\351\252\214.Rmd" @@ -0,0 +1,86 @@ +# Rcpp + +根据实验,见jmcmBoost项目里面的试验,代码是: +``` +// [[Rcpp::export]] +void ReceiveAlist(Rcpp::List theta0) +{ + double beta0=Rcpp::as(theta0["beta"]); + double gamma0=Rcpp::as(theta0["gamma"]); + double xi0=Rcpp::as(theta0["xi"]); + cout<的数据类型必须是一个scala,不能给一个Numeric Vector +所以问题是如何把这个转换成一个c++的变量 + +``` +Sys.setenv("PKG_CXXFLAGS"="-std=c++11") +``` +要用auto打印std::vector的话得这样,是c++11标准 + +理论上应该是用一个c++的STL的vector容器来搞 +为啥那本书里面没有,好奇怪 +所以进过谷歌,stackexchange的[问题](https://stackoverflow.com/questions/37328978/convert-rvector-to-stdvector) 所以解决方案是:: +``` + std::vector b = as >(a) +``` +于是代码就改成如下: +``` +void ReceiveAlist(Rcpp::List theta0) +{ + std::vector beta= Rcpp::as >(theta0["beta"]); + //double beta0=Rcpp::as(theta0["beta"]); + std::vector gamma= Rcpp::as >(theta0["gamma"]); + //double gamma0=Rcpp::as(theta0["gamma"]); + std::vector xi= Rcpp::as >(theta0["xi"]); + //double xi0=Rcpp::as(theta0["xi"]); + //cout<(a0); + return Rcpp::List::create(Rcpp::Named("beta") = Rcpp::wrap(cc), + Rcpp::Named("gamma") = Rcpp::wrap(a2), + Rcpp::Named("lambda") = x + ); +} +``` + +这就能返回一个R的数据结构list,接口的问题就基本上解决了。 + + + + + + diff --git a/docs/advanced-r.html b/docs/advanced-r.html index 8f0871a..abb6087 100644 --- a/docs/advanced-r.html +++ b/docs/advanced-r.html @@ -25,7 +25,7 @@ - + @@ -136,6 +136,8 @@
  • 2 Mathematical statistic Trick
  • 3 Statistician Tool Box
    • 3.1 Matrix algebra
        @@ -263,6 +265,29 @@
      • 14 Imputation
      • References
      • 15 R web scrape
      • +
      • 16 Guide to Scientific Computing in C++
      • +
      • 17 Rcpp
      • Published with bookdown
      • @@ -346,20 +371,20 @@

        12.1.1 Ordering (integer subsetti df2=df[sample(nrow(df)),3:1] df2[order(df2$x),]
        ##   z y x
        -## 1 a 6 1
         ## 2 b 5 1
        +## 1 a 6 1
         ## 4 d 3 2
         ## 3 c 4 2
        -## 5 e 2 3
        -## 6 f 1 3
        +## 6 f 1 3 +## 5 e 2 3
        df2[,order(names(df2))]
        ##   x y z
         ## 4 2 3 d
        -## 5 3 2 e
         ## 3 2 4 c
        -## 1 1 6 a
         ## 6 3 1 f
        -## 2 1 5 b
        +## 2 1 5 b +## 1 1 6 a +## 5 3 2 e
        df=data.frame(x=1:3,y=3:1,z=letters[1:3])
        diff --git a/docs/basicmcmc.html b/docs/basicmcmc.html index d401b76..f260d2a 100644 --- a/docs/basicmcmc.html +++ b/docs/basicmcmc.html @@ -25,7 +25,7 @@ - + @@ -136,6 +136,8 @@
      • 2 Mathematical statistic Trick
      • 3 Statistician Tool Box
      • diff --git a/docs/r-web-scrape.html b/docs/r-web-scrape.html index 7024a87..e5b6553 100644 --- a/docs/r-web-scrape.html +++ b/docs/r-web-scrape.html @@ -25,7 +25,7 @@ - + @@ -33,7 +33,7 @@ - + @@ -136,6 +136,8 @@
      • 2 Mathematical statistic Trick
      • 3 Statistician Tool Box
      • 12 Advanced R
          -
        • 12.1 Fundmental
        • -
        • 12.2 Data.frame
        • 13 Numeric Derivatives
        • 14 Imputation
        • References
        • 15 R web scrape
        • +
        • 16 Guide to Scientific Computing in C++
        • +
        • 17 Rcpp
        • Published with bookdown
        • @@ -288,8 +311,10 @@

          Chapter 15 R web scrape

          Web scrape用

          -
          library(rvest)
          -lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")
          +
          library(rvest)
          +
          ## Warning: package 'rvest' was built under R version 3.5.2
          +
          ## Loading required package: xml2
          +
          lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")
           
           rating <- lego_movie %>% 
             html_nodes("strong span") %>% # 这个"strong span"是指什么意思?
          @@ -358,8 +383,8 @@ 

          Chapter 15 R web scrape

          - - + + diff --git a/docs/rcpp.html b/docs/rcpp.html new file mode 100644 index 0000000..85808d4 --- /dev/null +++ b/docs/rcpp.html @@ -0,0 +1,450 @@ + + + + + + + + Chapter 17 Rcpp | Notes on Statistics + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
          + +
          + +
          + +
          +
          + + +
          +
          + +
          +
          +

          Chapter 17 Rcpp

          +

          根据实验,见jmcmBoost项目里面的试验,代码是:

          +
          // [[Rcpp::export]]
          +void ReceiveAlist(Rcpp::List theta0)
          +{
          +  double beta0=Rcpp::as<double>(theta0["beta"]);
          +  double gamma0=Rcpp::as<double>(theta0["gamma"]);
          +  double xi0=Rcpp::as<double>(theta0["xi"]);
          +  cout<<beta0<<endl;
          +    cout<<gamma0<<endl;
          +      cout<<xi0<<endl;
          +}
          +
          +
          +/*** R
          +ReceiveAlist(list(beta=c(0,1),gamma=c(1,1),xi=c(1,2)))
          +ReceiveAlist(list(beta=1,gamma=1,xi=1))
          +*/
          +

          第一个报错,第二个可以正确输出。 +因为as的数据类型必须是一个scala,不能给一个Numeric Vector +所以问题是如何把这个转换成一个c++的变量

          +
          Sys.setenv("PKG_CXXFLAGS"="-std=c++11")
          +

          要用auto打印std::vector的话得这样,是c++11标准

          +

          理论上应该是用一个c++的STL的vector容器来搞 +为啥那本书里面没有,好奇怪 +所以进过谷歌,stackexchange的问题 所以解决方案是::

          +
           std::vector<double> b = as<std::vector<double> >(a)
          +

          于是代码就改成如下:

          +
          void ReceiveAlist(Rcpp::List theta0)
          +{
          +  std::vector<double> beta= Rcpp::as<std::vector<double> >(theta0["beta"]);
          +  //double beta0=Rcpp::as<double>(theta0["beta"]);
          +  std::vector<double> gamma= Rcpp::as<std::vector<double> >(theta0["gamma"]);
          +  //double gamma0=Rcpp::as<double>(theta0["gamma"]);
          +  std::vector<double> xi= Rcpp::as<std::vector<double> >(theta0["xi"]);
          +  //double xi0=Rcpp::as<double>(theta0["xi"]);
          +  //cout<<beta0<<endl;
          +    //cout<<gamma0<<endl;
          +    //  cout<<xi0<<endl;
          +      for (auto i: beta)
          +        std::cout << i << ' ';
          +      std::cout<<endl;
          +      for (auto i: gamma)
          +        std::cout << i << ' ';
          +      std::cout<<endl;
          +      for (auto i: xi)
          +        std::cout << i << ' ';
          +      std::cout<<endl;
          +}
          +

          然后结合之前的

          +
          Rcpp::List Jmcmoutput2(NumericVector x){
          +    Rcpp::List a=Rcpp::List::create( // with static names
          +    Rcpp::CharacterVector::create("cc", "dd"),
          +    Rcpp::CharacterVector::create("ee", "ff")
          +  );
          +  
          +  double cc=0;
          +  Rcpp::NumericVector a0=x[1];
          +  double a2= Rcpp::as<double>(a0);
          +  return Rcpp::List::create(Rcpp::Named("beta")   = Rcpp::wrap(cc),
          +                            Rcpp::Named("gamma")   = Rcpp::wrap(a2),
          +                            Rcpp::Named("lambda")   = x
          +                            );
          +}
          +

          这就能返回一个R的数据结构list,接口的问题就基本上解决了。

          + +
          +
          +

          Andrieu, Christophe, and Johannes Thoms. 2008. “A tutorial on adaptive MCMC.” Statistics and Computing 18 (4): 343–73.

          +
          +
          +
          +
          + +
          +
          +
          + + +
          +
          + + + + + + + + + + + + + + diff --git a/docs/references.html b/docs/references.html index fd6b383..1714bf5 100644 --- a/docs/references.html +++ b/docs/references.html @@ -25,7 +25,7 @@ - + @@ -136,6 +136,8 @@
        • 2 Mathematical statistic Trick
        • 3 Statistician Tool Box
          • 3.1 Matrix algebra
              @@ -263,6 +265,29 @@
            • 14 Imputation
            • References
            • 15 R web scrape
            • +
            • 16 Guide to Scientific Computing in C++
            • +
            • 17 Rcpp
            • Published with bookdown
            • diff --git a/docs/reversible-jump-mcmc.html b/docs/reversible-jump-mcmc.html index fe5f2e0..3e3322a 100644 --- a/docs/reversible-jump-mcmc.html +++ b/docs/reversible-jump-mcmc.html @@ -25,7 +25,7 @@ - + @@ -136,6 +136,8 @@
            • 2 Mathematical statistic Trick
            • 3 Statistician Tool Box
              • 3.1 Matrix algebra
                  @@ -263,6 +265,29 @@
                • 14 Imputation
                • References
                • 15 R web scrape
                • +
                • 16 Guide to Scientific Computing in C++
                • +
                • 17 Rcpp
                • Published with bookdown
                • diff --git a/docs/search_index.json b/docs/search_index.json index 58ec785..17aa242 100644 --- a/docs/search_index.json +++ b/docs/search_index.json @@ -1,3 +1,22 @@ [ -["mathematical-statistic-trick.html", "Chapter 2 Mathematical statistic Trick 2.1 Normal distribution as exponential family 2.2 密度变换公式: 2.3 Probability mass function:", " Chapter 2 Mathematical statistic Trick 2.1 Normal distribution as exponential family Theorem: If the density function with the form: \\[ f(x) \\propto \\exp \\left\\{-\\frac{1}{2} x^{T} A x+B x\\right\\} \\] Then, we have \\(X\\sim N\\left(\\left(B A^{-1}\\right)^{T}, A^{-1}\\right)\\) 可以配合3.2 食用更佳 2.2 密度变换公式: 设随机变量 \\(X\\) 有概率密度函数 \\(f(x)\\), \\(x\\in(a, b)\\) (\\(a, b\\) 可以为 \\(\\infty\\)), 而 \\(y = g(x)\\) 在 \\(x \\in (a, b)\\) 上是严格单调的连 续函数,存在唯一的反函数 \\(x = h(y)\\), \\(y \\in (\\alpha, \\beta)\\) 并且 \\(h′(y)\\) 存在且 连续,那么 \\(Y = g(X)\\) 也是连续型随机变量且有概率密度函数 \\[ p(y)=f(h(y))\\left|h^{\\prime}(y)\\right|, \\quad y \\in(\\alpha, \\beta) \\] 2.3 Probability mass function: 就是离散的的分布嘛,记差了还行 "] +["index.html", "Notes on Statistics preliminary", " Notes on Statistics Jiaming Shen 2019-07-31 preliminary "], +["e69d82e4b883e69d82e585ab.html", "杂七杂八", " 杂七杂八 This is a document describe the build procedure. The other branch “gh-pg”(or something like this) 是用于用alternative approach去在github publish 这本书,但是做到一半没做下去,卡在这里,中,Create a token之后,没做到encrypt it.然后改用docs发布发现一次成功就不想试了。 "], +["intro.html", "Chapter 1 Introduction", " Chapter 1 Introduction This is a notes about the book reading and useful tricks while reading books and codes. "], +["mathematical-statistic-trick.html", "Chapter 2 Mathematical statistic Trick 2.1 Normal distribution as exponential family 2.2 密度变换公式: 2.3 Probability mass function: 2.4 Vector to diagonal matrix:", " Chapter 2 Mathematical statistic Trick 2.1 Normal distribution as exponential family Theorem: If the density function with the form: \\[ f(x) \\propto \\exp \\left\\{-\\frac{1}{2} x^{T} A x+B x\\right\\} \\] Then, we have \\(X\\sim N\\left(\\left(B A^{-1}\\right)^{T}, A^{-1}\\right)\\) 可以配合3.2 食用更佳 2.2 密度变换公式: 设随机变量 \\(X\\) 有概率密度函数 \\(f(x)\\), \\(x\\in(a, b)\\) (\\(a, b\\) 可以为 \\(\\infty\\)), 而 \\(y = g(x)\\) 在 \\(x \\in (a, b)\\) 上是严格单调的连 续函数,存在唯一的反函数 \\(x = h(y)\\), \\(y \\in (\\alpha, \\beta)\\) 并且 \\(h′(y)\\) 存在且 连续,那么 \\(Y = g(X)\\) 也是连续型随机变量且有概率密度函数 \\[ p(y)=f(h(y))\\left|h^{\\prime}(y)\\right|, \\quad y \\in(\\alpha, \\beta) \\] 2.3 Probability mass function: 就是离散的的分布嘛,记差了还行 2.4 Vector to diagonal matrix: (source)[https://mathoverflow.net/questions/55820/vector-to-diagonal-matrix] Let \\(E_i\\) be the \\(n\\times n\\) matrix with a 1 on position (i,i) and zeros else where. Similarly let \\(e_i\\) be a row vector with 1 on position (1,i) and zero else where, then: \\[ D=\\sum_{i=1}^{n} E_{i} v e_{i} \\] D is the diagonal matrix with its entry is vector \\(v\\). "], +["statistician-tool-box.html", "Chapter 3 Statistician Tool Box 3.1 Matrix algebra 3.2 两个二次型相加", " Chapter 3 Statistician Tool Box 3.1 Matrix algebra Definition 3.1 定义:矩阵\\(A(t)\\)的导数 如果矩阵\\(A(t)=\\left(a_{i j}(t)\\right)_{m \\times n}\\) 的每一个元素\\(a_{ij}(t)\\)是变量t的可微函数,则称A(t)可微,其导数(微商)定义为 \\[ A^{\\prime}(t)=\\frac{\\mathrm{d}}{\\mathrm{d} t} A(t)=\\left(\\frac{\\mathrm{d}}{\\mathrm{d} t} a_{i j}(t)\\right)_{m \\times n} \\] Definition 3.2 定义:矩阵对矩阵的导数 设 \\(\\boldsymbol{X}=\\left(\\xi_{i j}\\right)_{m \\times n}\\), \\(mn\\) 元函数 \\(f(\\boldsymbol{X})=f\\left(\\xi_{11}, \\xi_{12}\\cdots, \\xi_{1 n}, \\xi_{21}, \\cdots, \\xi_{m n} )\\right.\\),定义\\(f(X)\\) 对矩阵\\(X\\)的导数为 \\[\\begin{equation} \\frac{\\mathrm{d} f}{\\mathrm{d} \\boldsymbol{X}}=\\left(\\frac{\\partial f}{\\partial \\xi_{i j}}\\right)_{m \\times n}= \\left[ \\begin{array}{ccc} {\\frac{\\partial f}{\\partial \\xi_{11}}} & {\\dots} & {\\frac{\\partial f}{\\partial \\xi_{1 n}}} \\\\ {\\vdots} & { } & {\\vdots} \\\\ {\\frac{\\partial f}{\\partial \\xi_{m 1}}} & {\\cdots} & {\\frac{\\partial f}{\\partial \\xi_{m n}}} \\end{array}\\right] \\end{equation}\\] 根据这个定义,能够直接证明以下两个最常见的结论 \\[ \\frac{\\partial x'a}{\\partial x}=a=\\frac{\\partial a'x}{\\partial x} \\] \\[ \\frac{\\partial x'Ax}{\\partial x}=(A+A')x \\] 证明: 将矩阵乘法\\(x^TAx\\)打开得到 \\[ \\begin{aligned} f(\\boldsymbol{x})=& \\sum_{i=1}^{n} \\sum_{j=1}^{n} a_{i j} \\xi_{i} \\xi_{j}=\\\\ & \\xi_{1} \\sum_{j=1}^{n} a_{1 j} \\xi_{j}+\\cdots+\\xi_{k} \\sum_{j=1}^{n} a_{k j} \\xi_{j}+\\cdots+\\xi_{n} \\sum_{j=1}^{n} a_{n j} \\xi_{j} \\end{aligned} \\] 且有 \\[ \\begin{align} \\frac{\\partial f}{\\partial \\xi_{k}}=\\xi_{1} a_{1 k}+\\cdots+\\xi_{k-1} a_{k-1, k}+\\left(\\sum_{j=1}^{n} a_{k j} \\xi_{j}+\\xi_{k} a_{k k}\\right)+\\\\ \\xi_{k+1} a_{k+1, k}+\\cdots+\\xi_{n} a_{\\# k}=\\sum_{j=1}^{n} a_{i j} \\xi_{j}+\\sum_{i=1}^{n} a_{k} \\xi_{i} \\end{align} \\] 所以有 \\[ \\frac{d f}{d x}=\\left[ \\begin{array}{c}{\\frac{\\partial f}{\\partial \\xi_{1}}} \\\\ {\\vdots} \\\\ {\\frac{\\partial f}{\\partial \\xi_{n}}}\\end{array}\\right]=\\left[ \\begin{array}{c}{\\sum_{j=1}^{n} a_{1 j} \\xi_{j}} \\\\ {\\vdots} \\\\ {\\sum_{j=1}^{n} a_{n j} \\xi_{j}}\\end{array}\\right]+\\left[ \\begin{array}{c}{\\sum_{i=1}^{n} a_{i 1} \\xi_{i}} \\\\ {\\vdots} \\\\ {\\sum_{i=1}^{n} a_{i n} \\xi_{i}}\\end{array}\\right]=\\\\ \\boldsymbol{A x}+\\boldsymbol{A}^{\\mathrm{T}} \\boldsymbol{x}=\\left(\\boldsymbol{A}+\\boldsymbol{A}^{\\mathrm{T}}\\right) \\boldsymbol{x} \\] 3.1.1 Block diagonal matrices \\[ \\mathbf{A}=\\left[ \\begin{array}{cccc}{\\mathbf{A}_{1}} & {0} & {\\cdots} & {0} \\\\ {0} & {\\mathbf{A}_{2}} & {\\cdots} & {0} \\\\ {\\vdots} & {\\vdots} & {\\ddots} & {\\vdots} \\\\ {0} & {0} & {\\cdots} & {\\mathbf{A}_{n}}\\end{array}\\right] \\] property: \\[ \\begin{aligned} \\operatorname{det} \\mathbf{A} &=\\operatorname{det} \\mathbf{A}_{1} \\times \\cdots \\times \\operatorname{det} \\mathbf{A}_{n} \\\\ \\operatorname{tr} \\mathbf{A} &=\\operatorname{tr} \\mathbf{A}_{1}+\\cdots+\\operatorname{tr} \\mathbf{A}_{n} \\end{aligned} \\] It’s inverse: \\[ \\left( \\begin{array}{cccc}{\\mathbf{A}_{1}} & {0} & {\\cdots} & {0} \\\\ {0} & {\\mathbf{A}_{2}} & {\\cdots} & {0} \\\\ {\\vdots} & {\\vdots} & {\\ddots} & {\\vdots} \\\\ {0} & {0} & {\\cdots} & {\\mathbf{A}_{n}}\\end{array}\\right)=\\left( \\begin{array}{cccc}{\\mathbf{A}_{1}^{-1}} & {0} & {\\cdots} & {0} \\\\ {0} & {\\mathbf{A}_{2}^{-1}} & {\\cdots} & {0} \\\\ {\\vdots} & {\\vdots} & {\\ddots} & {\\vdots} \\\\ {0} & {0} & {\\cdots} & {\\mathbf{A}_{n}^{-1}}\\end{array}\\right) \\] 3.2 两个二次型相加 In vector formulation(Assuming \\(\\Sigma_1\\),\\(\\Sigma_2\\) are symmetric) 有: \\[ \\begin{array}{c}{-\\frac{1}{2}\\left(\\mathbf{x}-\\mathbf{m}_{1}\\right)^{T} \\mathbf{\\Sigma}_{1}^{-1}\\left(\\mathbf{x}-\\mathbf{m}_{1}\\right)} \\\\ {-\\frac{1}{2}\\left(\\mathbf{x}-\\mathbf{m}_{2}\\right)^{T} \\mathbf{\\Sigma}_{2}^{-1}\\left(\\mathbf{x}-\\mathbf{m}_{2}\\right)} \\\\ {=-\\frac{1}{2}\\left(\\mathbf{x}-\\mathbf{m}_{c}\\right)^{T} \\mathbf{\\Sigma}_{c}^{-1}\\left(\\mathbf{x}-\\mathbf{m}_{c}\\right)+C}\\end{array} \\] 其中: \\[ \\begin{aligned} \\mathbf{\\Sigma}_{c}^{-1}=& \\mathbf{\\Sigma}_{1}^{-1}+\\mathbf{\\Sigma}_{2}^{-1} \\\\ \\mathbf{m}_{c} &=\\left(\\mathbf{\\Sigma}_{1}^{-1}+\\mathbf{\\Sigma}_{2}^{-1}\\right)^{-1}\\left(\\mathbf{\\Sigma}_{1}^{-1} \\mathbf{m}_{1}+\\mathbf{\\Sigma}_{2}^{-1} \\mathbf{m}_{2}\\right) \\\\ C &=\\frac{1}{2}\\left(\\mathbf{m}_{1}^{T} \\mathbf{\\Sigma}_{1}^{-1}+\\mathbf{m}_{2}^{T} \\mathbf{\\Sigma}_{2}^{-1}\\right)\\left(\\mathbf{\\Sigma}_{1}^{-1}+\\mathbf{\\Sigma}_{2}^{-1}\\right)^{-1}\\left(\\mathbf{\\Sigma}_{1}^{-1} \\mathbf{m}_{1}+\\mathbf{\\Sigma}_{2}^{-1} \\mathbf{m}_{2}\\right) \\\\ &-\\frac{1}{2}\\left(\\mathbf{m}_{1}^{T} \\mathbf{\\Sigma}_{1}^{-1} \\mathbf{m}_{1}+\\mathbf{m}_{2}^{T} \\mathbf{\\Sigma}_{2}^{-1} \\mathbf{m}_{2}\\right) \\end{aligned} \\] 而对于我需要的情况,简化\\(m_1=0\\),\\(m_2=0\\) 那么就有,只需要\\(\\mathbf{\\Sigma}_{c}^{-1}=\\mathbf{\\Sigma}_{1}^{-1}+\\mathbf{\\Sigma}_{2}^{-1}\\),同时\\(m_c=0\\). "], +["longitudinal-data-analysis.html", "Chapter 4 Longitudinal data analysis 4.1 Linear mixed model 4.2 Generalised linear mixed models 4.3 The Bayesian analysis approach for covariance modelling", " Chapter 4 Longitudinal data analysis 4.1 Linear mixed model The model is \\(Y_i=X_i\\beta+Z_ib_i+\\epsilon_i\\), with \\(b_i\\sim N(0,G)\\) Under MVN distribution assumption in LMM, we have the density function and log-likelihood function as \\[\\begin{equation} f\\left(\\mathbf{Y}_{i}\\right)=(2 \\pi)^{-\\frac{n_{i}}{2}}\\left|\\mathbf{\\Sigma}_{i}\\right|^{-\\frac{1}{2}} \\exp \\left\\{-\\frac{1}{2}\\left(\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\beta\\right)^{\\prime} \\mathbf{\\Sigma}_{i}^{-1}\\left(\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\beta\\right)\\right\\} \\tag{4.1} \\end{equation}\\] with \\(\\Sigma_{i}=Z_{i} G Z_{i}^{\\prime}+R_{i}=\\Sigma_{i}(\\theta)\\) \\[\\begin{equation} \\ell(\\beta, \\theta)=-\\frac{1}{2}\\left[\\sum_{i=1}^{m} \\log \\left|\\mathbf{\\Sigma}_{i}\\right|+\\sum_{i=1}^{m}\\left(\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\beta\\right)^{\\prime} \\mathbf{\\Sigma}_{i}^{-1}\\left(\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\beta\\right)\\right] \\tag{4.2} \\end{equation}\\] 对\\(\\beta\\) 求一阶导,如3.1中所述,对含有\\(\\beta\\)的项求导,令导数为0,得到 \\[ \\tilde{\\beta}(\\theta)=\\left(\\sum_{i=1}^{m} \\mathbf{x}_{i}^{\\prime} \\mathbf{\\Sigma}_{i}^{-1} \\mathbf{X}_{i}\\right)^{-1}\\left(\\sum_{i=1}^{m} \\mathbf{X}_{i}^{\\prime} \\mathbf{\\Sigma}_{i}^{-1} \\mathbf{Y}_{i}\\right) \\] 将其带入(4.2) 得到关于variance components \\(\\theta\\) in \\(\\Sigma\\)的profile loglikelihood \\[ \\ell_p(\\theta)=-\\frac{1}{2}\\left[\\sum_{i=1}^{m} \\log \\left|\\mathbf{\\Sigma}_{i}\\right|+\\sum_{i=1}^{m}\\left\\{\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\tilde{\\beta}(\\theta)\\right\\}^{\\prime} \\mathbf{\\Sigma}_{i}^{-1}\\left\\{\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\tilde{\\beta}(\\theta)\\right\\}\\right]. \\] 有 \\[ R S S(\\theta)=\\sum_{i=1}^{m}\\left\\{\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\tilde{\\beta}(\\theta)\\right\\}^{\\prime} \\mathbf{\\Sigma}_{i}^{-1}\\left\\{\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\tilde{\\beta}(\\theta)\\right\\}\\\\ =\\mathbf{Y}^{\\prime}\\left[\\mathbf{\\Sigma}^{-1}-\\mathbf{\\Sigma}^{-1} \\mathbf{X}\\left(\\mathbf{X}^{\\prime} \\mathbf{\\Sigma}^{-1} \\mathbf{X}\\right)^{-1} \\mathbf{X}^{\\prime} \\mathbf{\\Sigma}^{-1}\\right] \\mathbf{Y} \\] 所以profile/reduced loglikelihood 就是 \\[ \\ell_{p}(\\theta)=\\ell_{p}(\\tilde{\\beta}(\\theta), \\theta)=-\\frac{1}{2}\\{\\log |\\Sigma(\\theta)|+R S S(\\theta)\\} \\] MLE of \\(\\hat\\theta\\) 一般没有解析形式。但是对于简单的情况,比如Random Intercept Model,可以用EM算法进行求解,详见J.Pan,LDA notes, p160(376). 4.1.1 Condition Mean vs Marginal mean Condition mean of \\(Y_i\\) is the mean under the condition that given the subject-specific random effects \\(b_i\\). \\[ E\\left(\\mathbf{Y}_{i} | \\mathbf{b}_{i}\\right)=\\mathbf{X}_{i} \\beta+\\mathbf{Z}_{i} \\mathbf{b}_{i} \\] Marginal mean is integral out the random effects, the same as the marginal general linear model \\[ \\begin{aligned} E\\left(\\mathbf{Y}_{i}\\right) &=E_{b_{i}}\\left(E_{Y_{i}}\\left(\\mathbf{Y}_{i} | \\mathbf{b}_{i}\\right)\\right) \\\\ &=E_{b_{i}}\\left(\\mathbf{X}_{i} \\beta+\\mathbf{Z}_{i} \\mathbf{b}_{i}\\right) \\\\ &=\\mathbf{X}_{i} \\beta \\end{aligned} \\] The conditional variance of \\(Y_i\\) is \\(R_i\\) The marginal variance of \\(Y_i\\) is \\(Z_iGZ_i'+R_i\\) 4.1.2 Restricted maximum likelihood estimation MLEs of variance components \\(\\theta\\) may be biased since the estimation proceure does not account for the loss of data information used in estimating the regression coefficients \\(\\beta\\), consider restricted maximum likelihood estimation procedure to obtain less biased estimators of variance components \\(\\theta\\). Use the REML function instead of original log-likelihood function, (Derive can be found in the 没mixed的部分有详细推导) \\[ \\begin{aligned} \\ell_{R}(\\theta)=&-\\frac{1}{2} \\sum_{i=1}^{m} \\log \\left|\\boldsymbol{\\Sigma}_{i}\\right|-\\frac{1}{2} \\sum_{i=1}^{m} \\log \\left|\\mathbf{X}_{i}^{\\prime} \\boldsymbol{\\Sigma}_{i}^{-1} \\mathbf{X}_{i}\\right| \\\\ &-\\frac{1}{2} \\sum_{i=1}^{m}\\left(\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\tilde{\\beta}(\\theta)\\right)^{\\prime} \\boldsymbol{\\Sigma}_{i}^{-1}\\left(\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\tilde{\\beta}(\\theta)\\right) \\end{aligned} \\] Once we get the estimation of parameters, the random effects \\(b_i\\) can be predicted by \\[ \\hat{\\mathbf{b}}_{i}=\\mathrm{E}\\left[\\mathbf{b}_{i} | \\mathbf{Y}_{i}, \\hat{\\beta}, \\hat{\\theta}\\right]=\\hat{\\mathbf{G}} \\mathbf{Z}_{i} \\hat{\\mathbf{\\Sigma}}_{i}^{-1}\\left(\\mathbf{Y}_{i}-\\mathbf{X}_{i} \\hat{\\beta}\\right) \\] 4.1.3 Prediction Posterior distribution of \\(b_i\\) given the data \\(Y_i\\) is \\[ f\\left(b_{i} | Y_{i}\\right)=\\frac{f\\left(Y_{i} | b_{i}\\right) f\\left(b_{i}\\right)}{\\int f\\left(Y_{i} | b_{i}\\right) f\\left(b_{i}\\right) d b_{i}} \\] Thus, the posterior mean of \\(b_i\\) can be obtained by \\[ \\begin{aligned} \\hat{b}_{i} &=E\\left(b_{i} | Y_{i}\\right) \\\\ &=\\int b_{i} f\\left(b_{i} | Y_{i}\\right) d b_{i} \\\\ &=G Z_{i}^{\\prime} \\Sigma_{i}^{-1}\\left(Y_{i}-X_{i} \\beta\\right) \\end{aligned} \\] This is called empirical Bayes estimator or the empirical best linear unbiased predictors. 4.2 Generalised linear mixed models Before only consider the data follows Normal distribution, need extend to binomial, binary, Poisson counts, etc. 推广线性模型的normal到其他分部是用GLM方法 结合mixed model和GLM,得到GLMMs 4.2.1 Exponential distribution family 涉及到Generalised linear model就会把正态推广到指数函数族,所以这里简单的提一下 \\[ f(y ; \\theta, \\phi)=\\exp \\left\\{\\frac{(y \\theta-b(\\theta))}{a(\\phi)}+c(y, \\phi)\\right\\} \\] \\(\\theta\\) is natural parameter and \\(\\phi\\) is called additional scale or nuisance/dispersion parameter. 对于指数分布组,我们有 \\[ \\begin{aligned} \\mathrm{E}(Y) &=b^{\\prime}(\\theta) \\\\ \\operatorname{Var}(Y) &=\\phi b^{\\prime \\prime}(\\theta) \\end{aligned} \\] 若要求MLE,有其score function为 \\[ \\begin{aligned} U &=\\frac{\\partial}{\\partial \\theta} \\ell(\\theta, \\phi | y) \\\\ &=\\frac{y-b^{\\prime}(\\theta)}{\\phi} \\end{aligned} \\] \\[ g(\\mu_i)=x_i'\\beta \\] Y的不同分布通过不同的连接函数g进行刻画。 \\[ Normal: \\mu_i=\\eta_i=x_i'\\beta\\\\ Possion/log: \\log \\lambda_{i}=\\eta_{i}=\\boldsymbol{x}_{i}^{\\prime} \\boldsymbol{\\beta}\\\\ \\textit{Bionomial/logistic:}\\log \\frac{\\pi_{i}}{1-\\pi_{i}}=\\eta_{i}=x_{i}^{\\prime} \\beta \\] 4.2.2 Iteratively reweighted Least square algorithm (IWLS) \\[ \\boldsymbol{\\beta}^{(\\boldsymbol{m})}=\\left(\\boldsymbol{X}^{\\prime} \\boldsymbol{W}^{(\\boldsymbol{m}-\\mathbf{1})} \\boldsymbol{X}\\right)^{-\\mathbf{1}} \\boldsymbol{X}^{\\prime} \\boldsymbol{W}^{(\\boldsymbol{m}-\\mathbf{1})} \\boldsymbol{Z}^{(\\boldsymbol{m}-\\mathbf{1})} \\] with \\[ \\begin{aligned} W &=\\operatorname{diag}\\left(W_{1}, W_{2}, \\ldots, W_{n}\\right) \\text { with } W_{i}=\\frac{1}{\\left[g^{\\prime}\\left(\\mu_{i}\\right)\\right]^{2} V_{i}} \\\\ Z &=X \\beta+D(Y-\\mu) \\\\ D &=\\operatorname{diag}\\left(D_{1}, D_{2}, \\ldots, D_{n}\\right) \\text { with } D_{i}=g^{\\prime}\\left(\\mu_{i}\\right) \\end{aligned} \\] More detail about the IWRLS can be found in /MasterCourse/GLM/IWLRS 4.2.3 GLMMs Random Components: Response: \\[ \\mathbf{Y}_{i j}\\left|\\mathbf{b}_{i} \\sim \\exp \\left\\{\\frac{\\left(y_{i j} \\theta_{i j}-b\\left(\\theta_{i j}\\right)\\right)}{a_{i j}(\\phi)}+c\\left(y_{i j}, \\phi\\right)\\right\\}\\right. \\] Random effect: \\[ \\mathbf{b}_{i} \\sim N_{q}(\\mathbf{0}, \\mathbf{G}) \\] Systematic components: \\[ \\eta_{i j}=\\mathbf{x}_{i j}^{\\prime} \\beta+\\mathbf{z}_{i j}^{\\prime} \\mathbf{b}_{i} \\] 比如说,response服从Binomial-Normal,那么有 \\[ Y_{i j}\\left|b_{i} \\sim \\mathcal{B}\\left(K_{i j}, \\pi_{i j}\\right), i=1,2, \\cdots, m ; j=1,2, \\dots, n_{i}\\right. \\] 然后就有模型如下 \\[ \\log \\frac{\\pi_{i j}}{1-\\pi_{i j}}=\\eta_{i j}=\\boldsymbol{x}_{i j}^{\\prime} \\boldsymbol{\\beta}+\\boldsymbol{z}_{i j}^{\\prime} \\boldsymbol{b}_{i} \\] Binomial-Normal意味着response的条件分布式Binomial,而random-effect 的分布是Normal. 4.2.3.1 Likelihood function 用\\(\\theta\\)表示随机效应\\(b_i\\)方差阵\\(G\\)中的参数,则可以很自然的写出似然函数: \\[ L(\\beta, \\theta)=\\prod_{i=1}^{m} \\int \\prod_{j=1}^{n_{i}} f_{y|b}\\left(Y_{i j} | \\mathbf{b}_{i}\\right) f_{b}\\left(\\mathbf{b}_{i}\\right) \\mathrm{d} \\mathbf{b}_{i} \\] 似然函数中,\\(f_{y|b}\\left(Y_{i j} | \\mathbf{b}_{i}\\right)\\) condition on random effect \\(b_i\\)的generalised的分布,\\(f_b(b_i)\\)是随机效应的分布\\(N(0,G)\\). 这块似然函数需要把未知的随机效应进行积分,把他积掉,但是一般而言,这个积分十分困难,所以要基于这个似然函数计算MLE不是很容易。 在GLMM中,如果已知随机效应\\(b_i\\),则有 \\[ \\mathrm{E}\\left[Y_{i j} | \\mathbf{b}_{i}\\right]=\\mu_{i j} \\quad \\quad \\operatorname{Var}\\left[Y_{i j} | \\mathbf{b}_{i}\\right]=\\phi a_{i j} v\\left(\\mu_{i j}\\right) \\] 注:\\(\\phi\\) 和 \\(v(\\mu_{ij})\\) 可以从exponential family那块找到定义。 为了处理似然函数中的积分,可以构造 integrated quasi-likelihood 进行处理。 \\[ \\begin{equation} \\begin{aligned} & \\exp \\left\\{q \\ell_{i}(\\beta, \\theta)\\right\\} \\\\ \\propto &|\\mathbf{G}|^{-\\frac{1}{2}} \\int \\exp \\left[-\\frac{1}{2 \\phi} \\sum_{j=1}^{n_{i}} d_{i j}\\left(y_{i j} ; \\mu_{i j}\\right)-\\frac{1}{2} \\mathbf{b}_{i}^{\\prime} \\mathbf{G}^{-1} \\mathbf{b}_{i}\\right] \\mathrm{d} \\mathbf{b}_{i} \\end{aligned} \\tag{4.3} \\end{equation} \\] with \\[ d_{i j}(y, \\mu)=-2 \\int_{y}^{\\mu} \\frac{y-u}{a_{i j} v(u)} \\mathrm{d} u \\] 这里有一个问题,为啥quasi-likelihood长这样,根据(Wedderburn,1974),quasi-likelihood的定义是 Definition 4.1 quasi-likelihood function \\(K(z_i,\\mu_i)\\) \\[ \\frac{\\partial K\\left(z_{i}, \\mu_{i}\\right)}{\\partial \\mu_{i}}=\\frac{z_{i}-\\mu_{i}}{V\\left(\\mu_{i}\\right)} \\] or equivelantly \\[ K\\left(z_{i}, \\mu_{i}\\right)=\\int^{\\mu_{*}} \\frac{z_{i}-\\mu_{i}^{\\prime}}{V\\left(\\mu_{i}^{\\prime}\\right)} d \\mu_{i}^{\\prime}+\\text { function of } z_{i} \\] 根据其原因,我觉得大概是这样,回到 4.2.1 中,我们可以看到,对Exponential family distribution求导得到Score function,其形式就是 \\[ U=\\frac{y-E(y)}{\\phi} \\] 于是乎quasi-likelihood就反过来,用只要满足求导为这个式子的函数就是一个“likelihood”。 而同时,quasi-likelihood有一些类似于MLE,比较好的计算性质可以进行利用。 回到(Breslow and Clayton,1993)的定义式 (4.3),其中第一部分关于fixed-effect的,直接是用以上quasi-likelihood的定义4.1 带进去,然后加上第二部分关于random effect中和\\(G(\\theta)\\)有关的部分。 因为quasi-likelihood是从exponential family destribution的Score function来的,所以对于\\(Y\\sim exponential family\\),quasi-likelihood等价于真正的likelihood,然后\\(d_{ij}\\)变成了在积分上下限的loglikelihood 的差值:\\(2 \\phi\\{\\ell(y ; y, \\phi)-\\ell(y ; \\mu, \\phi)\\}\\). Integrated quasi-likelihood(4.3) 对于个体i,能写成如下形式 \\[ c|\\mathbf{G}|^{-\\frac{1}{2}} \\int e^{-\\kappa\\left(\\mathbf{b}_{i}\\right)} \\mathrm{d} \\mathbf{b}_{i} \\] 这只是把exponential那里一大堆二次型全部塞到\\(-\\kappa(b_i)\\)里面去。 这个积分无法解析的写出来,于是考虑Laplace逼近。 \\[ \\int e^{-\\kappa\\left(\\mathbf{b}_{i}\\right)} \\mathrm{d} \\mathbf{b}_{i}=(2 \\pi)^{q / 2}\\left|\\kappa^{\\prime \\prime}\\left(\\tilde{\\mathbf{b}}_{i}\\right)\\right|^{-1 / 2} \\exp \\left\\{-\\kappa\\left(\\tilde{\\mathbf{b}}_{i}\\right)\\right\\} \\] \\(q=\\operatorname{dim}(b_i)\\).\\(\\tilde b_i\\) 是\\(\\kappa'(b_i)=0\\)的解。 于是有 \\[ \\begin{equation} q \\ell_{i}(\\beta, \\theta) \\simeq-\\frac{1}{2} \\log |\\mathbf{G}|-\\frac{1}{2} \\log \\left|\\kappa^{\\prime \\prime}\\left(\\tilde{\\mathbf{b}}_{i}\\right)\\right|-\\kappa\\left(\\tilde{\\mathbf{b}}_{i}\\right) \\tag{4.4} \\end{equation} \\] 以及 \\[ \\kappa^{\\prime}\\left(\\mathbf{b}_{i}\\right)=-\\sum_{j=1}^{n_{i}} \\frac{\\left(Y_{i j}-\\mu_{i j}\\right) \\mathbf{z}_{i j}}{\\phi a_{i j} v\\left(\\mu_{i j}\\right) g^{\\prime}\\left(\\mu_{i j}\\right)}+\\mathbf{G}^{-1} \\mathbf{b}_{i}=0 \\] 和 \\[ \\begin{aligned} \\kappa^{\\prime \\prime}\\left(\\mathbf{b}_{i}\\right) &=\\sum_{j=1}^{n_{i}} \\frac{\\mathbf{z}_{i j} \\mathbf{z}_{i j}^{\\prime}}{\\phi a_{i j} v\\left(\\mu_{i j}\\right)\\left[g^{\\prime}\\left(\\mu_{i j}\\right)\\right]^{2}}+\\mathbf{G}^{-1}+\\mathbf{R}_{i} \\\\ & \\simeq \\mathbf{Z}_{i}^{\\prime} \\mathbf{W}_{i} \\mathbf{Z}_{i}+\\mathbf{G}^{-1} \\end{aligned} \\] 注意这个\\(W_i\\)有点像IRWLS中的reweighted matrix。 其中\\(\\boldsymbol R_i\\)是 \\[ \\mathbf{R}_{i}=-\\sum_{j=1}^{n_{i}}\\left(Y_{i j}-\\mu_{i j}\\right) \\mathbf{z}_{i j} \\frac{\\partial}{\\partial \\mathbf{b}_{i}}\\left[\\frac{1}{\\phi a_{i j} v\\left(\\mu_{i j}\\right) g^{\\prime}\\left(\\mu_{i j}\\right)}\\right] \\] \\(E(\\boldsymbol R_i)=0\\), 当使用canonical link function时\\(\\boldsymbol R_i=0\\),所以可以忽略掉。把\\(\\kappa''(b_i)\\)带回 (4.4),则有 \\[ q \\ell_{i}(\\beta, \\theta) \\simeq-\\frac{1}{2} \\log |\\mathbf{G}|-\\frac{1}{2} \\log \\left|\\mathbf{Z}_{i}^{\\prime} \\mathbf{W}_{i} \\mathbf{Z}_{i}+\\mathbf{G}^{-1}\\right|-\\kappa\\left(\\tilde{\\mathbf{b}}_{i}\\right) \\] 把\\(\\kappa(\\cdot)\\)打开可以得到 \\[ \\begin{aligned} q \\ell(\\beta, \\theta)=& \\sum_{i=1}^{m} q \\ell_{i}(\\beta, \\theta) \\\\=&-\\frac{1}{2} \\sum_{i=1}^{m} \\log \\left|\\mathbf{I}_{n_{i}}+\\mathbf{Z}_{i}^{\\prime} \\mathbf{W}_{i} \\mathbf{Z}_{i} \\mathbf{G}\\right| \\\\ &-\\frac{1}{2 \\phi} \\sum_{i=1}^{m} \\sum_{j=1}^{n_{i}} d_{i j}\\left(y_{i j} ; \\tilde{\\mu}_{i j}\\right)-\\frac{1}{2} \\sum_{i=1}^{m} \\tilde{\\mathbf{b}}_{i}^{\\prime} \\mathbf{G}^{-1} \\tilde{\\mathbf{b}}_{i} \\end{aligned} \\] 极大化这个式子得到的估计值称为 penalised quasi-likelihood(PQL)。把后面那项\\(\\frac{1}{2} \\sum_{i=1}^{m} \\tilde{\\mathbf{b}}_{i}^{\\prime} \\mathbf{G}^{-1} \\tilde{\\mathbf{b}}_{i}\\) 视为penalty。 基于某种复杂的和penalized likelihood estimation相同的机制,可以把\\(\\tilde b_i\\) 看成一个罚函数的参数单独进行估计,所以有 \\[ q \\ell(\\beta, \\mathbf{b}) \\simeq-\\frac{1}{2 \\phi} \\sum_{i=1}^{m} \\sum_{j=1}^{n_{i}} d_{i j}\\left(y_{i j} ; \\mu_{i j}\\right)-\\frac{1}{2} \\sum_{i=1}^{m} \\mathbf{b}_{i}^{\\prime} \\mathbf{G}^{-1} \\mathbf{b}_{i} \\] 通过分别找�使得上式极大化的\\(\\beta\\)和\\(\\boldsymbol b\\) 得到需要的估计。对上式进行微分,得到 \\[ \\begin{array}{c}{\\sum_{i=1}^{m} \\sum_{j=1}^{n_{i}} \\frac{\\left(y_{i j}-\\mu_{i j}\\right) \\mathbf{x}_{i j}}{\\phi a_{i j} v\\left(\\mu_{i j}\\right) g^{\\prime}\\left(\\mu_{i j}\\right)}=0} \\\\ {\\sum_{j=1}^{n_{i}} \\frac{\\left(y_{i j}-\\mu_{i j}\\right) \\mathbf{z}_{i j}}{\\phi a_{i j} v\\left(\\mu_{i j}\\right) g^{\\prime}\\left(\\mu_{i j}\\right)}=\\mathbf{G}^{-1} \\mathbf{b}_{i}}\\end{array} \\] . 可以使用Fisher-Scoring algorithm进行求解。 。。。 求解过程很长很麻烦可以去看课件。。 。。。 我们得到了对\\(b\\) 和 \\(\\beta\\)的估计以后,下一步就是对Random effect 的 covariance matrix \\(G(\\theta)\\) 进行估计。 这里没太懂。 Ignoring throughout the dependence of \\(W\\) on \\(\\theta\\) and replacing the deviance \\[ \\sum_{i=1}^{m} \\sum_{j=1}^{n_{i}} d_{i j}\\left(y_{i j}, \\mu_{i j}\\right) \\] by the Pearson Chi-squared statistic \\[ \\sum_{i=1}^{m} \\sum_{j=1}^{n_{i}} \\frac{\\left(y_{i j}-\\mu_{i j}\\right)^{2}}{a_{i j} v\\left(\\mu_{i j}\\right)} \\] 带回之前的PQL,可以得到 \\[ \\begin{aligned} q \\ell(\\hat{\\beta}(\\theta), \\hat{\\mathbf{b}}(\\theta)) \\simeq &-\\frac{1}{2} \\sum_{i=1}^{m} \\log \\left|\\mathbf{V}_{i}(\\theta)\\right| \\\\ &-\\frac{1}{2} \\sum_{i=1}^{m}\\left(\\mathbf{Y}_{i}^{*}-\\mathbf{X}_{i} \\hat{\\beta}(\\theta)\\right)^{\\prime} \\mathbf{V}_{i}^{-1}(\\theta)\\left(\\mathbf{Y}_{i}^{*}-\\mathbf{X}_{i} \\hat{\\beta}(\\theta)\\right) \\end{aligned} \\] 同样的,这个也可以有一个REML的版本避免\\(\\beta\\)的估计对\\(\\theta\\)的影响. \\[ \\begin{aligned} q \\ell^{*}(\\hat{\\beta}(\\theta), \\hat{\\mathbf{b}}(\\theta)) \\simeq &-\\frac{1}{2} \\sum_{i=1}^{m} \\log \\left|\\mathbf{V}_{i}(\\theta)\\right|-\\frac{1}{2} \\sum_{i=1}^{m} \\log \\left|\\mathbf{X}_{i}^{\\prime} \\mathbf{V}_{i}^{-1}(\\theta) \\mathbf{X}_{i}\\right| \\\\ &-\\frac{1}{2} \\sum_{i=1}^{m}\\left(\\mathbf{Y}_{i}^{*}-\\mathbf{X}_{i} \\hat{\\beta}(\\theta)\\right)^{\\prime} \\mathbf{V}_{i}^{-1}(\\theta)\\left(\\mathbf{Y}_{i}^{*}-\\mathbf{X}_{i} \\hat{\\beta}(\\theta)\\right) \\end{aligned} \\] 同样的求导也可以导出估计方程,然后用fisher-scoring algorithm求解估计方程。 \\[ -\\frac{1}{2} \\sum_{i=1}^{m}\\left[\\left(\\mathbf{Y}_{i}^{*}-\\mathbf{X}_{i} \\beta\\right)^{\\prime} \\mathbf{V}_{i}^{-1} \\frac{\\partial \\mathbf{V}_{i}}{\\partial \\theta_{j}} \\mathbf{V}_{i}^{-1}\\left(\\mathbf{Y}_{i}^{*}-\\mathbf{X}_{i} \\beta\\right)-\\operatorname{trace}\\left(\\mathbf{P}_{i} \\frac{\\partial \\mathbf{V}_{i}}{\\partial \\theta_{j}}\\right)\\right]=0 \\] with \\[ \\mathbf{P}_{i}=\\mathbf{V}_{i}^{-1}-\\mathbf{V}_{i}^{-1} \\mathbf{X}_{i}\\left(\\mathbf{X}_{i}^{\\prime} \\mathbf{V}_{i}^{-1} \\mathbf{X}_{i}\\right)^{-1} \\mathbf{X}_{i}^{\\prime} \\mathbf{V}_{i}^{-1} \\] 4.3 The Bayesian analysis approach for covariance modelling It is obvious that the mean part \\(X\\beta\\) is conditional normal distribution under the condition that we know the covariance matrix part \\(\\Sigma=LDL^T\\) or \\(\\Sigma=\\Lambda\\Gamma\\Gamma^T\\Lambda\\) "], +["section-5.html", "Chapter 5 统计图形笔记", " Chapter 5 统计图形笔记 使用 palette() 进行批量获得色彩 比如说 x=seq(0,1,0.01) plot(x,x,type='l',ylim=c(0,10)) for (i in 2:10) { lines(x,i*x,type = 'l',col=palette()[i]) } 可以使用adjustcolor(palette()[i], alpha.f = 0.5) 给上述色彩增加透明度 例如: plot(x,x,type='l',ylim=c(0,10)) for (i in 2:10) { lines(x,i*x,type = 'l',col=adjustcolor(palette()[i], alpha.f = 0.3)) } "], +["basicmcmc.html", "Chapter 6 BasicMCMC 6.1 Metropolis-Hastings Update 6.2 The Gibbs Update 6.3 Combining Updates 6.4 A Metropolis Example", " Chapter 6 BasicMCMC 6.1 Metropolis-Hastings Update Specified distribution has unnormalized density \\(h\\),that is, \\(h\\) is a positive constant ties a probability density. The MH update does: When current state is \\(x\\), propose a move to \\(y\\), having conditional probability density given \\(x\\) denoted \\(q(x,\\cdot)\\). Calculate the Hastings ratio \\[ r(x, y)=\\frac{h(y) q(y, x)}{h(x) q(x, y)} \\] Accept the proposed move \\(y\\) with probability \\[ a(x, y)=\\min (1, r(x, y)) \\] That is, the state after the update is \\(y\\) with probability \\(a(x,y)\\), and the state after the update is \\(x\\) with probability \\(1-a(x,y)\\). The different between the Metropolis rejection with the “reject sampling” in MC method is that MC rejection samping is done repeatedly until some proposal is accepted. MH rejection is always new state but remains the same. Propose a move, how? Care \\(h\\) is the stationary distribution(what we want), the \\(q(\\cdot|\\cdot)\\) the proposal distribution. Update session: if (runif(1)<MH_ratio){ x=y } or in log-scale if (log_MH_raio>=0||runif(1)<exp(log_MH_ratio)){ x=y } The Metropolis-Hastings update is reversible with respect to \\(h\\), Assume \\(X_n\\) is the current state and \\(Y_n\\) is the proposal, we have \\(X_n=X_{n+1}\\) whenever the proposal is rejected. The distribution of \\((X_n,X_{n+1})\\) given rejection is exchangeable. 相当于恒等映射的可逆的,当然没问题。 Hence, it only remains to be shown that \\((X_n,Y_n)\\) is exchangeable given acceptance. We need to show that \\[ \\mathrm{E}\\left\\{f\\left(X_{n}, Y_{n}\\right) a\\left(X_{n}, Y_{n}\\right)\\right\\}=\\mathrm{E}\\left\\{f\\left(Y_{n}, X_{n}\\right) a\\left(X_{n}, Y_{n}\\right)\\right\\} \\] for any function \\(f\\) that has expectation(assuming \\(X_n\\) has desired stationary distribution). That is, we must show we can interchange arguments of \\(f\\) in \\[ \\iint f(x, y) h(x) a(x, y) q(x, y) d x d y \\] integrals can be replaced by sums if the state is discrete. This follows if we can interchange \\(x\\) and \\(y\\) in \\[ h(x) a(x, y) q(x, y) \\] because we can exchange \\(x\\) and \\(y\\) in the integration. The set of \\(x\\) and \\(y\\) such that \\(h(x)>0,q(x,y)>0\\) and \\(a(x,y)>0\\) contributes to the integral or (in the discrete case) sum. These inequalities further imply that \\(h(y)>0\\) and \\(q(y,x)>0\\).Thus we may assume these inequalities, in which case we have \\[ r(y, x)=\\frac{1}{r(x, y)} \\] > 这个式子来源于MH-ratio 中\\(r\\)的定义 Suppose that \\(r(x,y)\\leq 1\\), so \\(r(x,y)=a(x,y)\\) and \\(a(y,x)=1\\) > 来源于\\(a(x,y)\\) 的定义 Then \\[ \\begin{aligned} h(x) a(x, y) q(x, y) &=h(x) r(x, y) q(x, y) \\\\ &=h(y) q(y, x) \\\\ &=h(y) q(y, x) a(y, x) \\end{aligned} \\] Conversely, suppose that \\(r(x,y)>1\\), so \\(a(x,y)=1\\) and \\(a(y,x)=r(y,x)\\). Then \\[ \\begin{aligned} h(x) a(x, y) q(x, y) &=h(x) q(x, y) \\\\ &=h(y) r(y, x) q(y, x) \\\\ &=h(y) a(y, x) q(y, x) \\end{aligned} \\] In either case, we cn exchange \\(x\\) and \\(y\\), the proof is done. 因为 reversible 好像是个更强的。原文:Reversibility implies stationarity,but not vice versa. 6.1.1 Metropolis Update The special case of the Metropolis-Hastings update when \\(q(x,y)=q(y,x)\\) for all \\(x\\) and \\(y\\). Then the MH-ratio has a simpler form: \\[ r(x,y)=\\frac{h(y)}{h(x)} \\] One obvious way is make proposal as the form \\(y=x+\\epsilon\\) where e is stochastically independent of \\(x\\) and symmetrically distributed about zero. Then let \\(q(x,y)=f(y-x)\\), where \\(f\\) is the density of \\(e\\). Widely used proposal is \\(N(0,\\sigma)\\) or uniformly distributed on a ball or a hypercube centered at zero. 6.2 The Gibbs Update The proposal is from a conditional distribution of the desired equilibrium distribution. Always accept version of MH algorithm. The proof of reversible is : Suppose \\(X_n\\) has the desired stationary distribution. Suppose that the conditional distribution of \\(X_{n+1}\\) given \\(f(X_n)\\) is same as the conditional distribution of \\(X_n\\) given \\(f(X_n)\\). Then the pair \\(X_n,X_{n+1}\\) is conditionally exchangeable given \\(f(X_n)\\), hence unconditionally exchangeable. A Gibbs update uses the conditional distribution of one component of the state vector given the rest of the components, that is, the special case of the update described above where \\(f(X_n)\\) is \\(X_n\\) with one component omitted. Conditional distribution of this form are called “full conditionals.” There is no reason other than tradition why such conditional distributions should be preferred. Gibbs updates have one curious property not shared by other MH updates: Gibbs are idempotent, meaning is multiple updates is the same as the effect of just one. Because update never changes \\(f(X_n)\\) . 6.2.1 Variable-at-a-Time Metropolis-Hastings Gibbs updates alter only part of the state vector; when using “full conditionals” the part is a single component. Metropolis-Hastings updates can be modified todo the same. 所以看来,从理论上来说,最初始的MH是全部待估参数一起来的,但是可以做到类似Gibbs的conditional distribution然后一个一个更新。 Divide the state vector into two parts, \\(x=(u,v)\\). Let the proposal alter \\(u\\) but no \\(v\\). Hence, the proposal density has the form \\(q(x,u)\\) instead of \\(q(x,y)\\) we had in Section before. Again let \\(h(x)=h(u,v)\\) be the unnormalized density of the desired equilibrium distribution. The variable-at-a-time MH update does the following: When the current state is \\(x=(u,v)\\), propose a move to \\(y=(u^*,v)\\), where \\(u^*\\) has conditional probability density given \\(x\\) denoted \\(q(x,\\cdot)=q(u,v,\\cdot)\\). Calculate the Hastings ratio \\[ r(x, y)=\\frac{h\\left(u^{*}, v\\right) q\\left(u^{*}, v, u\\right)}{h(u, v) q\\left(u, v, u^{*}\\right)} \\] Accept the proposed move \\(y\\) with probability we defined before, that is, the state after the update is y with porbability \\(a(x,y)\\), and the state after the update is \\(x\\) with probability \\(1-a(x,y)\\). Actually the sampler run in Metropolis et.al(1953) was a “variable-at-a-time” sampler. So name it as “variable-at-a-time MH” is sometimes of a misnomer. 很显然,因为那么高维一起抽接受率堪忧。。 6.2.2 The Gibbs is a special case of Metropolis-Hastings: \\(x=(u,v)\\), v is the part of the state on which the Gibbs update conditions. Doing block Gibbs updating \\(u\\) from its conditional distribution given v. Factor the unnormalized density \\(h(u,v)=g(v)q(v,u)\\), where \\(g(v)\\) is an unnormalized marginal of \\(v\\) and q(v,u) is the conditional of \\(u\\) given \\(v\\). Now do a MH update with \\(q\\) as the proposal distribution. >也就是说,把h,目标密度,分为g和q,g是initi部分,q是condition部分。则用q的部分作为MH的proposal。则在这种情况下可以得到MH ratio. \\[ r(x, y)=\\frac{h\\left(u^{*}, v\\right) q(u, v)}{h(u, v) q\\left(v, u^{*}\\right)}=\\frac{g(v) q\\left(v, u^{*}\\right) q(u, v)}{g(v) q(v, u) q\\left(v, u^{*}\\right)}=1 \\] 6.2.3 Gibbs Full conditional distibution 为了抽Gibbs,所以我们必须得搞到Full conditional distribution. 通常,我们都是把posterior写出来,然后用和每个系数相关的部分挑出来,看看能不能满足什么分布。而一个很自然的问题就是,为什么要那么做?这么做的理论依据是什么。这篇课件 给了一个相对详细的说明,简单来说就是,道理还是用条件概率公式\\(\\frac{f(\\theta_1,\\theta_2,y)}{f(\\theta_1,y)}=f(\\theta_1|\\theta_2,y)\\).但是呢,因为对normalizing constant不感兴趣,所以直接捡分母那块就搞定。 例子如下: \\[ \\begin{aligned} Y_{1}, \\ldots, Y_{n} & \\stackrel{\\mathrm{iid}}{\\sim} N\\left(\\mu, \\tau^{-1}\\right) \\\\ \\mu & \\sim N(0,1) \\\\ \\tau & \\sim \\mathrm{Gamma}(2,1) \\end{aligned} \\] 一个标准的hierachical model。 Likelihood如下: \\[ L(\\mu, \\tau)=\\prod_{i=1}^{n} f\\left(Y_{i} | \\mu, \\tau\\right)=\\prod_{i=1}^{n} \\frac{\\sqrt{\\tau}}{\\sqrt{2 \\pi}} \\exp \\left\\{-\\tau\\left(Y_{i}-\\mu\\right)^{2} / 2\\right\\}=(2 \\pi)^{-n / 2} \\tau^{n / 2} \\exp \\left\\{-\\tau \\sum_{i=1}^{n}\\left(Y_{i}-\\mu\\right)^{2} / 2\\right\\} \\] 注意这里Likelihood的形式是\\(f(Y_i|\\mu,\\tau)\\),是因为这样再乘上prior就是joint distribution了。所以乘上prior得到: \\[ p(\\mathbf{Y}, \\mu, \\tau)=(\\text { constant }) \\times \\tau^{n / 2} \\exp \\left\\{-\\tau \\sum_{i=1}^{n}\\left(Y_{i}-\\mu\\right)^{2} / 2\\right\\} \\exp \\left\\{-\\mu^{2} / 2\\right\\} \\tau \\exp ^{-\\tau} \\] 如果\\(\\mathbf{Y}\\)是观测值,那要得到conditional density of \\((\\mu,\\tau)\\) given \\(\\mathbf Y\\).则通过条件概率公式,需要: \\[ p(\\mu, \\tau | \\mathbf{Y})=\\frac{p(\\mathbf{Y}, \\mu, \\tau)}{p(\\mathbf{Y})}=\\frac{p(\\mathbf{Y}, \\mu, \\tau)}{\\iint p\\left(\\mathbf{Y}, \\mu^{*}, \\tau^{*}\\right) d \\mu^{*} d \\tau^{*}} \\] 但是呢,分母对于我们的目标来讲并不重要,因为并不涉及我们关心的变量\\((\\mu,\\tau)\\),而\\(\\mathbf{Y}\\) 又是观测(数),所以分母就是一个normalising constant,不重要,所以就能写成: \\[ p(\\mu, \\tau | \\mathbf{Y}) \\propto p(\\mathbf{Y}, \\mu, \\tau). \\] 现在我们去求我们关心的变量各自的full conditionals. \\[ p(\\mu | \\tau, \\mathbf{Y})=\\frac{p(\\mu, \\tau | \\mathbf{Y})}{p(\\tau | \\mathbf{Y})} \\] ,同样的,分母不关心,因为不涉及\\(\\mu\\),所以作为一个关于\\(\\mu\\)的函数 \\[ p(\\mu | \\tau, \\mathbf{Y}) \\propto p(\\mathbf{Y}, \\mu, \\tau) \\propto \\exp \\left\\{-\\tau \\sum_{i=1}^{n}\\left(Y_{i}-\\mu\\right)^{2} / 2\\right\\} \\exp \\left\\{-\\mu^{2} / 2\\right\\} \\] 同样的,作为\\(\\tau\\)的函数,full conditional for \\(\\tau\\)是: \\[ p(\\tau | \\mu, \\mathbf{Y}) \\propto p(\\mathbf{Y}, \\mu, \\tau)=\\tau^{n / 2} \\exp \\left\\{-\\tau \\sum_{i=1}^{n}\\left(Y_{i}-\\mu\\right)^{2} / 2\\right\\} \\tau \\exp ^{-\\tau} \\] . 所以可以简单的说,full condition就是joint posterior只关心所求系数的形式。但是难点在于这个形式是什么分布,见 2.1 ,这个定理就能断定一类函数形式是Normal distribution。 6.3 Combining Updates 6.3.1 Composition Let \\(P_1,...,P_k\\) be update mechanism(computer code) and let \\(P_1P_2...P_k\\) denote the composite update that consists of these updates done in that order with \\(P_1\\) first and \\(P_k\\) last. If each \\(P_i\\) preserves a distribution, then obviously so does \\(P_1P_2...P_k\\). If \\(P_1,...,P_k\\) are the Gibbs updates foro the “full conditionals” ofo the desired equilibrium distribution, then the composition update is often called a fixed scan Gibbs sampler. 6.3.2 Palindromic Composition Note that \\(P_1P_2...P_k\\) is not reversible with respect to the distribution it preserves unless the transition probabilities associated with \\(P_1P_2...P_k\\) and \\(P_kP_{k-1}...P_1\\) are the same. The most obvious way to arrange reversibility is to make \\(P_i=P_{k-i}\\) for \\(i=1,...,k\\). Then we call this composite update palindromic. 6.3.3 State-Independent Mixing Let \\(P_y\\) be update mechanism(computer code), and let \\(E(P_y)\\) denote the update that consists of doing a random one of these updates: generate \\(Y\\) from some distribution and do \\(P_y\\). 相当于是 比如给出3个更新机制,随机选一个。 如果Y独立于current state,同时\\(P_y\\) 有同样的分布,则\\(E(P_y)\\) 也一样。如果\\(X_n\\) 有同样的稳定分布,则\\(X_n\\)关于Y的条件分布也是同一分布,\\(X_{n+1}\\)也有同样的条件分布。 也就是说,Markov chain update with \\(E(P_Y)\\) 是可逆的如果\\(P_y\\)是可逆的。 这个mixing一般用于mixture model。 6.3.4 Subsampling Let P is an update mechanism, write \\(P^k\\) to denote the k-fold composition of P. This process is take every kth element of a Markov chain \\(X_1,X_2,..\\) forming a new Markov chain \\(X_1,X_{k+1},X_{2k+1},...\\) is called subsampling the original Markov chain. Subsampling cannot improve the accuracy of MCMC approximation; it must make things worse. The original motivation for subsampling appears to have been to reduce autocorrelation in the subsampled chain to a negligible level. This is only done by before Markov chain CLT was well understood. Now the Markov chain CLT is well understood, this cannot be a justification for subsamplig. 所以唯一有用的部分是just to reduce the amount of output of a Markov chain sampler to manageable levels. Billions of iterations may be needed for convergence, but billions of iterations of output may be too much to handle, especially in R. 然而batches 方法也可以做到,所以唯一能用subsampling的时候就是不能使用batching的时候。 这种情况很少,所以基本不用sub-sampling… 6.4 A Metropolis Example Frequency logistic regression. library(mcmc) ## Warning: package 'mcmc' was built under R version 3.5.2 data("logit") out=glm(y~ x1+x2+x3+x4,data=logit,family=binomial(),x=TRUE) summary(out) ## ## Call: ## glm(formula = y ~ x1 + x2 + x3 + x4, family = binomial(), data = logit, ## x = TRUE) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.7461 -0.6907 0.1540 0.7041 2.1943 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 0.6328 0.3007 2.104 0.03536 * ## x1 0.7390 0.3616 2.043 0.04100 * ## x2 1.1137 0.3627 3.071 0.00213 ** ## x3 0.4781 0.3538 1.351 0.17663 ## x4 0.6944 0.3989 1.741 0.08172 . ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 137.628 on 99 degrees of freedom ## Residual deviance: 87.668 on 95 degrees of freedom ## AIC: 97.668 ## ## Number of Fisher Scoring iterations: 6 Do this in bayesian analysis where the prior for coefficients are \\(N(0,2)\\). x <- out$x y <- out$y lupost <- function(beta, x, y, ...) { eta <- as.numeric(x %*% beta) logp <- ifelse(eta < 0, eta - log1p(exp(eta)),- log1p (exp(- eta))) logq <- ifelse(eta < 0, - log1p(exp(eta)), - eta - log1p (exp(- eta))) logl <- sum(logp[y == 1]) + sum(logq[y == 0]) return(logl - sum(beta^2) / 8) } beta.init <- as.numeric(coefficients(out)) out <- metrop(lupost, beta.init, 1e3, x = x, y = y) out2=metrop(out,x=x,y=y) The functions in the mcmc package are designed so that if given the output of a preceding run as their first argument, they continue the run of the Markov chain where the other run left off. It is generally accepted that an acceptance rate of about 20% is right. Although this magic number can fail, but this is good for most cases, and easy to understand and implement for beginners. "], +["r-trick.html", "Chapter 7 R trick", " Chapter 7 R trick 科学计数法: 取消科学计数法 options(scipen = 200) scipen 表示在200个数字以内都不使用科学计数法 显示位数: options(digits=3) Rmarkdown 不运行 eval=FALSE 清除环境变量 rm(list=ls()) 拼接字符串 paste() Try catch for (i in 1:10) { tryCatch({log(4-i)},warning=function(a){print("warning!");browser();stop()},finally = {print(i);}) } Try后面expr中的式子,如果一切无事,则run finally里面的expr,如果有warning,就run warning里面的function,如果有error,就run error 里面的function。 Call browser when error options(error=browser) options(warn=2) options(error=recover) R 取余:%% 2300%%100 ## [1] 0 230%%100 ## [1] 30 一个进度条的包,这个是外跳一个进度条窗口 library(tcltk2) u <- 1:2000 plot.new() pb <- tkProgressBar("进度","已完成 %", 0, 100) for(i in u) { x<-rnorm(u) points(x,x^2,col=i) info <- sprintf("已完成 %d%%", round(i*100/length(u))) setTkProgressBar(pb, i*100/length(u), sprintf("进度 (%s)", info), info) } 进度条 我觉得这个更好用 library(progress) pb <- progress_bar$new(total = 100) for (i in 1:100) { pb$tick() Sys.sleep(1 / 100) } 一个比较完整应用: pb <- progress::progress_bar$new( format = "MCMC running [:bar] :percent in :elapsed eta: :eta", total = iter, clear = FALSE) -Benchmark 其中的试例代码: benchmark("lm" = { X <- matrix(rnorm(1000), 100, 10) y <- X %*% sample(1:10, 10) + rnorm(100) b <- lm(y ~ X + 0)$coef }, "pseudoinverse" = { X <- matrix(rnorm(1000), 100, 10) y <- X %*% sample(1:10, 10) + rnorm(100) b <- solve(t(X) %*% X) %*% t(X) %*% y }, "linear system" = { X <- matrix(rnorm(1000), 100, 10) y <- X %*% sample(1:10, 10) + rnorm(100) b <- solve(t(X) %*% X, t(X) %*% y) }, replications = 1000, columns = c("test", "replications", "elapsed", "relative", "user.self", "sys.self")) 可以很简单的改成我的代码: benchmark("Rvec2tril"=BayesJMCM::vec2tril(phiiVec) "Rcppvec2tril"=vec2trilcpp(phiiVec) ) "], +["reversible-jump-mcmc.html", "Chapter 8 Reversible Jump MCMC 8.1 Introduction 8.2 Implementation 8.3 Mapping Functions and Proposal Distribution", " Chapter 8 Reversible Jump MCMC 8.1 Introduction Provides a general framework for Markov chain Monte Carlo simulation in which the dimension of the parameter space can vary between iterates of the Markov chain. Extension of the Metropolis-Hastings algorithm onto more general state spaces. Suppose that for observed data \\(x\\) we have a countable collection of candidate models \\(\\mathcal{M}=\\left\\{\\mathcal{M}_{1}, \\mathcal{M}_{2}, \\ldots\\right\\}\\) indexed by a parameter \\(k \\in \\mathcal{K}\\). The index k can be considered as an auxiliary model indicator variable, such that \\(\\mathcal{M}_{k^{\\prime}}\\) denotes the model where \\(k=k'\\). Each model \\(\\mathcal{M}_k\\) has an \\(n_k\\)-dimensional vector of unknown parameters,\\(\\boldsymbol{\\theta}_{k} \\in \\mathbb{R}^{n_{k}}\\), where \\(n_k\\) can take different values for different models \\(k\\in\\mathcal K\\). The joint posterior distribution of \\((k,\\boldsymbol{\\theta}_k)\\) given observed data,x, is obtained as the product of the likelihood, \\(L\\left(\\mathbf{x} | k, \\boldsymbol{\\theta}_{k}\\right)\\), and the joint prior, \\(p\\left(k, \\boldsymbol{\\theta}_{k}\\right)=p\\left(\\boldsymbol{\\theta}_{k} | k\\right) p(k)\\), constructed from the prior distribution of \\(\\boldsymbol \\theta_k\\) under model \\(\\mathcal M_k\\), and the prior for the model indicator k(i.e. the prior for model \\(\\mathcal M_k\\)). Hence, the joint posterior is \\[ \\pi\\left(k, \\boldsymbol{\\theta}_{k} | \\mathbf{x}\\right)=\\frac{L\\left(\\mathbf{x} | k, \\boldsymbol{\\theta}_{k}\\right) p\\left(\\boldsymbol{\\theta}_{k} | k\\right) p(k)}{\\sum_{k^{\\prime} \\in \\mathcal{K}} \\int_{\\mathbb{R}^{n} k^{\\prime}} L\\left(\\mathbf{x} | k^{\\prime}, \\boldsymbol{\\theta}_{k^{\\prime}}^{\\prime}\\right) p\\left(\\boldsymbol{\\theta}_{k^{\\prime}}^{\\prime} | k^{\\prime}\\right) p\\left(k^{\\prime}\\right) d \\boldsymbol{\\theta}_{k^{\\prime}}^{\\prime}} \\] > 也就是说,这个posterior就是选一个模型(自带\\(n_k\\)个参数),然后再这个模型下可以得到一个posterior,而选定了模型以后,对应\\(n_k\\)个参数也有一个prior,在已知k的时候的prior,这样就构造了对于k个模型,已知数据x的posterior 因为reversible jump就按前言中所说的,是用来解决参数维度变化的。而对于coefficient维度变化则对应了不同的模型,也就是说在不同模型之间跳来跳去的, The target of an MCMC sampler over the state space \\(\\boldsymbol{\\Theta}=\\bigcup_{k \\in \\mathcal{K}}\\left(\\{k\\} \\times \\mathbb{R}^{n_{k}}\\right)\\), where the states of the Markov chain are of the form \\((k,\\boldsymbol \\theta_k)\\), the dimension of which can vary over the state space. Accordingly, from the output of a single Markov chain sampler, the user is able to obtain a full probabilistic description of the posterior probabilities of each model having observed the data, x, in addition of the posterior porbabilities of each model having observed the data,x, in addition to the posterior distributions of the individual models. 输出是一个单独的Markov chain sampler。可以的到期描述每个模型的全后验概率,关于观测数据。 8.1.1 From Metropolis-Hastings to Reversible Jump Stand form of MH relies on the construction of a time-reversible Markov chain via the detailed balance condition. That is, the moves from state \\(\\theta\\) to \\(\\theta'\\) are made as often as moves from \\(\\theta'\\) to \\(\\theta\\) with respect to the ratget density. Then extension MH to the setting that the dimension of the parameter vector varies. The previous part construction of a Markov chain on a general state space \\(\\Theta\\) with invariant or stationary distribution \\(\\pi\\), the detailed balance condition can be written as \\[ \\int_{\\left(\\theta, \\theta^{\\prime}\\right) \\in \\mathcal{A} \\times \\mathcal{B}} \\pi(d \\theta) P\\left(\\theta, d \\theta^{\\prime}\\right)=\\int_{\\left(\\theta, \\theta^{\\prime}\\right) \\in \\mathcal{A} \\times \\mathcal{B}} \\pi\\left(d \\theta^{\\prime}\\right) P\\left(\\theta^{\\prime}, d \\theta\\right) \\] 所以只是换了state space,其他都没变? Compare to standard MH algorithm, Markov chain transitions from a current state \\(\\boldsymbol{\\theta}=\\left(k, \\boldsymbol{\\theta}_{k}^{\\prime}\\right) \\in \\mathcal{A}\\), in model \\(\\mathcal M_k\\) are realized by first proposing a new state \\(\\boldsymbol{\\theta}^{\\prime}=\\left(k^{\\prime}, \\boldsymbol{\\theta}_{k^{\\prime}}\\right) \\in \\mathcal{B}\\),(\\(A\\)是\\(k\\)的域,比如\\(\\mathbb N\\)? \\(B\\)是\\(\\mathbb R^{n_k}\\),也就是正常的\\(\\theta\\)的域。) by proposal distribution \\(q(\\boldsymbol\\theta,\\boldsymbol \\theta')\\). The detailed balance condition is enforced through the acceptance probability, where the move to the candidate state \\(\\boldsymbol \\theta'\\) is accepted with probability \\(\\alpha\\left(\\theta, \\theta^{\\prime}\\right)\\). If rejected, the chain remains at the current state \\(\\boldsymbol \\theta\\) in model \\(\\mathcal M_k\\). Under this mechanism, detail balance condition equation becomes \\[ \\int_{\\left(\\theta, \\theta^{\\prime}\\right) \\in \\mathcal{A} \\times \\mathcal{B}} \\pi(\\boldsymbol{\\theta} | \\mathbf{x}) q\\left(\\boldsymbol{\\theta}, \\boldsymbol{\\theta}^{\\prime}\\right) \\alpha\\left(\\boldsymbol{\\theta}, \\boldsymbol{\\theta}^{\\prime}\\right) d \\boldsymbol{\\theta} d \\boldsymbol{\\theta}^{\\prime}=\\int_{\\left(\\theta, \\boldsymbol{\\theta}^{\\prime}\\right) \\in \\mathcal{A} \\times \\mathcal{B}} \\pi\\left(\\boldsymbol{\\theta}^{\\prime} | \\mathbf{x}\\right) q\\left(\\boldsymbol{\\theta}^{\\prime}, \\boldsymbol{\\theta}\\right) \\alpha\\left(\\boldsymbol{\\theta}^{\\prime}, \\boldsymbol{\\theta}\\right) d \\boldsymbol{\\theta} d \\boldsymbol{\\theta}^{\\prime} \\] where \\(\\pi(\\theta|x)\\) are the posterior we want sample. One way to enforce this equation is by setting the acceptance probability as \\[ \\alpha\\left(\\boldsymbol{\\theta}, \\boldsymbol{\\theta}^{\\prime}\\right)=\\min \\left\\{1, \\frac{\\pi(\\boldsymbol{\\theta} | \\mathbf{x}) q\\left(\\boldsymbol{\\theta}, \\boldsymbol{\\theta}^{\\prime}\\right)}{\\pi\\left(\\boldsymbol{\\theta}^{\\prime} | \\mathbf{x}\\right) q\\left(\\boldsymbol{\\theta}^{\\prime}, \\boldsymbol{\\theta}\\right)}\\right\\} \\] This resembles the usual metropolis-hastings acceptance ratio. Then a reversible jump sampler with N iterations is commonly constructed as follows: Initialize k and \\(\\boldsymbol \\theta_k\\) at iteration \\(t=1\\) For iteration \\(t\\geq 1\\) perform Within-model move: with a fixed model k, update the parameters \\(\\boldsymbol \\theta_k\\) acoording to any MCMC updating scheme Between-models move: simultaneously update model indicator k and the parameter \\(\\boldsymbol \\theta_k\\) according to the general reversible proposal/acceptance mechanism Increment iteration t=t+1. If \\(t<N\\), go to step 2. 问题:Within-model move是一个传统的MH move,Between-model move是延拓的MH move,关键是,必须要在本model做一个move,然后再抽model吗?还是说其实只做两个一起抽也是可以的? 8.1.2 Application Area Change-point models One of the original application of the reversible jump sampler was in Bayesian change-point problems. Green(2005) analyzed mining disaster count data using a Poisson process with the rate parameter described as a step function with an unknown number and location of steps. Fan and Brooks(2000) applies the reversible jump sampler to model the shape of prehistoric tombs, where the curvature of the dome changes an unknown number of times Finite mixture models. Mixture models are widely used where each data observation is generated according to some underlying categorical mechanism, which the mechanism is typically unobserved, so there is uncertainty regarding which component of the resulting mixture distribution each data observation was generated from, and uncertainty ober the nuber of mixture components. \\[ f(x|\\theta_k)=\\sum_{j=1}^k w_jf_j(x|\\phi_j) \\] Variable selection The multi-model setting emerges when attempting to identify the most relevant subsets of predictors, making it a natural candidate for the reversible jump sampler. Nonparametrics Within Bayesian nonparametrics, many authors have successfully explored the use of the reversible jump sampler as a method to automate the knot selection process when using a pth-order spline model for curve fitting. \\[ f(x)=\\alpha_0+\\sum_{j=1}^{P}\\alpha_j x^j+\\sum_{i=1}^k \\eta_j(x-\\kappa_i)^P_+ \\] with \\(x\\in[a,b]\\) Time series models In the modelling of temporally dependent data, \\(x_1,...,x_T\\), multiple models naturally arise under uncertainty over the degree of dependence. For example, under a kth-order autoregressive process \\[ X_t=\\sum^k_{\\tau=1}a_\\tau X_{t-\\tau}+\\epsilon_t \\] with \\(t=k+1,...,T\\) 8.2 Implementation In practice, the construction of proposal moves between different models is achieved via the concept of “dimension matching.” Most simply, under a general Bayesian model determination setting, suppose that we are currently in state \\((k,\\theta_k)\\) in model \\(\\mathcal M_k\\), and we wish to propose a move to a state \\((k',\\theta_k')\\) in model \\(\\mathcal M_{k'}\\), which is of a higher dimension, so that \\(n_{k^{\\prime}}>n_{k}\\). In order to “math dimensions” between the two model states, a random vector \\(\\boldsymbol u\\) of length \\(d_{k \\rightarrow k^{\\prime}}=n_{k^{\\prime}}-n_{k}\\) is generated from a known density \\(q_{d_{k \\rightarrow k^{\\prime}}}(\\mathbf{u})\\). The current state \\(\\boldsymbol \\theta_k\\) and the random vector \\(\\boldsymbol u\\) are then mapped to the new state \\(\\boldsymbol{\\theta}_{k^{\\prime}}^{\\prime}=g_{k \\rightarrow k^{\\prime}}\\left(\\boldsymbol{\\theta}_{k}, \\mathbf{u}\\right)\\) through a one-to-one mapping function \\(g_{k \\rightarrow k^{\\prime}} : \\mathbb{R}^{n_{k}} \\times \\mathbb{R}^{d_{k}} \\rightarrow \\mathbb{R}^{n_{k^{\\prime}}}\\). The acceptance probability of this proposal, combined with the joint posterior expression becomes \\[ \\alpha\\left[\\left(k, \\theta_{k}\\right),\\left(k^{\\prime}, \\theta_{k^{\\prime}}^{\\prime}\\right)\\right]=\\min \\left\\{1, \\frac{\\pi\\left(k^{\\prime}, \\theta_{k^{\\prime}}^{\\prime} | \\mathbf{x}\\right) q\\left(k^{\\prime} \\rightarrow k\\right)}{\\pi\\left(k, \\theta_{k} | \\mathbf{x}\\right) q\\left(k \\rightarrow k^{\\prime}\\right) q_{d_{k \\rightarrow k^{\\prime}}}(\\mathbf{u})}\\left|\\frac{\\partial g_{k \\rightarrow k^{\\prime}}\\left(\\boldsymbol{\\theta}_{k}, \\mathbf{u}\\right)}{\\partial\\left(\\theta_{k}, \\mathbf{u}\\right)}\\right|\\right\\} \\] 为了填补缺少的维度,提出一个随机向量\\(u\\),维度是缺少的维度,来源于一个已知的密度函数\\(q_{d_{k\\rightarrow k'}}(u)\\).(这个密度怎么取?)。然后目前的状态\\(\\theta_k\\)和\\(u\\)可以映射到新的状态\\(\\boldsymbol{\\theta}_{k^{\\prime}}^{\\prime}=g_{k \\rightarrow k^{\\prime}}\\left(\\boldsymbol{\\theta}_{k}, \\mathbf{u}\\right)\\),通过g这个一一映射,\\(g_{k \\rightarrow k^{\\prime}} : \\mathbb{R}^{n_{k}} \\times \\mathbb{R}^{d_{k}} \\rightarrow \\mathbb{R}^{n_{k^{\\prime}}}\\). 然后有接受概率如上。解释,分子就是从新到老的概率,下面是从老到新的概率,因为多了u,所以多一块u的概率,而反过来则没有。 后面那块则是一一映射g的jacobian矩阵。这一块是来自于因为对\\(\\theta\\)做了变换。在detail-balance condition equation那块做积分的时候会用到。 反过来reverse move的话(从高维move到低维),则有接受概率 \\[ \\alpha\\left[\\left(k^{\\prime}, \\boldsymbol{\\theta}_{k^{\\prime}}^{\\prime}\\right),\\left(k, \\boldsymbol{\\theta}_{k}\\right)\\right]=\\alpha\\left[\\left(k, \\boldsymbol{\\theta}_{k}\\right),\\left(k^{\\prime}, \\boldsymbol{\\theta}_{k^{\\prime}}^{\\prime}\\right)\\right]^{-1} \\] 。 更一般的,我们可以放宽u长度的条件,也就是让\\(d_{k \\rightarrow k^{\\prime}} \\geq n_{k^{\\prime}}-n_{k}\\). 则在这种情况下,不确定的反向移动可以通过一个\\(d_{k^{\\prime} \\rightarrow k}\\)-维的向量,\\(\\mathbf{u}^{\\prime} \\sim q_{d_{k^{\\prime} \\rightarrow k}}\\left(\\mathbf{u}^{\\prime}\\right)\\),使得对应“dimension-match”条件能成立:在同一维度:\\(n_{k}+d_{k \\rightarrow k^{\\prime}}=n_{k^{\\prime}}+d_{k^{\\prime} \\rightarrow k}\\). 则反过来的映射可以写成\\(\\boldsymbol{\\theta}_{k}=g_{k^{\\prime} \\rightarrow k}\\left(\\boldsymbol{\\theta}_{k^{\\prime}}^{\\prime} \\mathbf{u}^{\\prime}\\right)\\) ,使得有\\(\\boldsymbol{\\theta}_{k}=g_{k^{\\prime} \\rightarrow k}\\left(g_{k \\rightarrow k^{\\prime}}\\left(\\boldsymbol{\\theta}_{k}, \\mathbf{u}\\right), \\mathbf{u}^{\\prime}\\right)\\),以及\\(\\boldsymbol{\\theta}_{k^{\\prime}}^{\\prime}=g_{k \\rightarrow k^{\\prime}}\\left(g_{k^{\\prime} \\rightarrow k}\\left(\\boldsymbol{\\theta}_{k^{\\prime}}^{\\prime} \\mathbf{u}^{\\prime}\\right), \\mathbf{u}\\right)\\).接受概率则可以写成: \\[ \\alpha\\left[\\left(k, \\theta_{k}\\right),\\left(k^{\\prime}, \\theta_{k^{\\prime}}^{\\prime}\\right)\\right]=\\min \\left\\{1, \\frac{\\pi\\left(k^{\\prime}, \\theta_{k^{\\prime}} | \\mathbf{x}\\right) q\\left(k^{\\prime} \\rightarrow k\\right) q_{d_{k^{\\prime} \\rightarrow k}}\\left(\\mathbf{u}^{\\prime}\\right)}{\\pi\\left(k, \\theta_{k} | \\mathbf{x}\\right) q\\left(k \\rightarrow k^{\\prime}\\right) q_{d_{k \\rightarrow k^{\\prime}}}(\\mathbf{u})}\\left|\\frac{\\partial g_{k \\rightarrow k^{\\prime}}\\left(\\boldsymbol{\\theta}_{k}, \\mathbf{u}\\right)}{\\partial\\left(\\theta_{k}, \\mathbf{u}\\right)}\\right|\\right\\} \\] 相当于,可以找一个更高维的中间变量,然后一个随机向量u,补齐各状态与中间变量之间的差异。这样就可以避免高维到低维,低维到高维的不对等关系,使得算法表达形式能统一。 8.2.1 Example Dimension Matching Example in Green(1995) and Brooks(1998). Suppose that model \\(\\mathcal M_1\\) has states (\\(k=1,\\theta_1\\in \\mathbb R^1\\)) and model \\(\\mathcal M_2\\) with states (\\(k=2,\\theta_2\\in \\mathbb R^2\\)). So under this setting, \\((1,\\theta^*)\\) denote the current state in \\(\\mathcal M_1\\) and \\((2,(\\theta^{(1)},\\theta^{(2)}))\\) denote the proposed state in \\(\\mathcal M_2\\). Under dimension matching, we might generate a random scalar u, and let \\(\\theta^{(1)}=\\theta^{*}+u \\text { and } \\theta^{(2)}=\\theta^{*}-u\\), with the reverse move given deterministically by \\(\\theta^{*}=\\frac{1}{2}\\left(\\theta^{(1)}+\\theta^{(2)}\\right)\\). 所以就把1维和2维的统一在了一起,并且给定了转换表达式。 8.2.2 Example: Moment Matching in a Finite Mixture of Univariate Normals 如果模型是有限个单变量正态分布的混合,观测数据,\\(x\\),则服从混合正态分布,每块是\\(N(\\mu_j,\\sigma_j)\\). 模型之间的移动,Richardson and Green(1997) 实现了一个分割和合并的策略。 当两个正态\\(j_1\\)和\\(j_2\\) 合并成一个,\\(j^*\\),有如下确定的映射,同时保持了0,1,2阶的矩。 \\[ \\begin{aligned} w_{j^{*}} &=w_{j_{1}}+w_{j_{2}} \\\\ w_{j^{*}} \\mu_{j^{*}} &=w_{j_{1}} \\mu_{j_{1}}+w_{j_{2}} \\mu_{j_{2}} \\\\ w_{j^{*}}\\left(\\mu_{j^{*}}^{2}+\\sigma_{j^{*}}^{2}\\right) &=w_{j_{1}}\\left(\\mu_{j 1}^{2}+\\sigma_{j_{1}}^{2}\\right)+w_{j_{2}}\\left(\\mu_{j_{2}}^{2}+\\sigma_{j_{2}}^{2}\\right) \\end{aligned} \\] 分割则是 \\[ \\begin{aligned} w_{j_{1}} &=w_{j^{*}} * u_{1}, \\quad w_{j_{2}}=w_{j^{*}} *\\left(1-u_{1}\\right) \\\\ \\mu_{j_{1}} &=\\mu_{j^{*}}-u_{2} \\sigma_{j^{*}} \\sqrt{\\frac{w_{j_{2}}}{w_{j_{1}}}} \\\\ \\mu_{j_{2}} &=\\mu_{j^{*}}+u_{2} \\sigma_{j^{*}} \\sqrt{\\frac{w_{j_{1}}}{w_{j_{2}}}} \\\\ \\sigma_{j_{1}}^{2} &=u_{3}\\left(1-u_{2}^{2}\\right) \\sigma_{j^{*}}^{2} \\frac{w_{j^{*}}}{w_{j_{2}}} \\\\ \\sigma_{j_{2}}^{2} &=\\left(1-u_{3}\\right)\\left(1-u_{2}^{2}\\right) \\sigma_{j^{*}}^{2} \\frac{w_{j^{*}}}{w_{j_{2}}} \\end{aligned} \\] 同时随机量\\(u_{1}, u_{2} \\sim \\operatorname{Beta}(2,2)\\),以及\\(u_{3} \\sim \\operatorname{Beta}(1,1)\\)。这样满足了维度相同,然后能算分割步的接受概率和合并的接受概率。 8.3 Mapping Functions and Proposal Distribution 虽然维度匹配的想法很简单,但是实现却很复杂。因为存在一个任意的映射函数\\(g_{k \\rightarrow k^{\\prime}}\\) 和proposal distribution \\(q_{d_{k \\rightarrow k^{\\prime}}}(\\mathbf{u})\\)。好的映射很自然的能提高采样的效率,体现在模型之间跳的接受率和chain本身的收敛。难点在于即是是很简单的nested model,也很难定义一个好的关系。模型之间的自由度只由\\(q_{d_{k \\rightarrow k^{\\prime}}}(\\mathbf{u})\\)决定。然而没有一个很显然的准则选择好的q。对比模型内的选择,随机游走Mh算法可能对应任意的接受概率,一个小的游走对应高的接受概率,大步游走则接受概率会比较低。可以借鉴这个“local”的想法到模型空间\\(k\\in\\mathcal K\\)中。proposal来自于\\(\\theta_k\\) 在模型\\(\\mathcal M_k\\)比起另外一个\\(\\theta_{k'}\\)来自于\\(\\mathcal M_{k'}\\)会有更高的接受率,如果数据对应的likelihood差别不大的话。比如说Richardson 和Green(1997)提出的“birth/death” and “split.merge”映射,在切换模型的时候尽量切换likelihood差不多的模型。但是这并不容易实现。 尽管如此,RjMCMC的采样效果还是很差。 所以要用的时候还是从最简单的birth-dead proposal和merge-split proposal开始看起吧。 8.3.1 Marginalization and augmentation: Reduced- or fixed- dimensional sampler may be substituted because reversible jump MCMC would be heaby-handed. In lower dimensions, the reversible jump sampler is often easier to implement, as the problems associated with mapping function specification are conceptually simpler to resolve. Example:Marginalization in Variable Selection. 用辅助变量,则能把parameter变成固定维数的问题。Under certain prior specifications for the regression coefficient \\(\\beta\\) and error variance \\(\\sigma^2\\),the \\(\\beta\\) coefficients can be analytically integrated out of the posterior. A gibbs sampler directly on model space is then available for \\(\\gamma\\)。也就是说主要关注选取哪个模型的问题的话可以把\\(\\beta\\)先积掉然后研究模型问题,等选定模型再估\\(\\beta\\)? Example:Marginalization in Finite Mixture of Multivariate Normal Models. 在Clusting问题里面,待估参数不是那么重要,所以Tadesse et al. (2005)提出来选取特定的先验,正态的系数可以积掉,然后对分几类做Reversible Jump,这样问题就被大大简化了。 细节等要用这块再说。 8.3.2 Centering and Order Methods Brooks et al.(2003c) introduce a class of methods to achieve the automatic scaling of the proposal density \\(q_{d_{k\\rightarrow k'}}(u)\\), based on “local” move proposal distribution, which are centered around the point of equal likelihood values under current and proposed models. 这块以后再看好了。。。从目前的文献来看,birth-death 和 split-merge 策略已经足够很多了?等要用再来看后来这个auto-adaptive的方法 "], +["optimal-proposal-distributions-and-adaptive-mcmc.html", "Chapter 9 Optimal Proposal Distributions and Adaptive MCMC 9.1 Intro 9.2 Optimal Scaling of Random-Walk Metropolis 9.3 Adaptive MCMC 9.4 Ergodicity of Adaptive MCMC 9.5 FAQ 9.6 Conclusion 9.7 A tutorial on adaptive MCMC", " Chapter 9 Optimal Proposal Distributions and Adaptive MCMC 9.1 Intro MH 算法需要选proposal,很自然,有些proposal比较好用。决定使用哪个proposal非常重要也非常难。这个问题经常使用一个ad hoc 方法试错。但是,这个问题可以尝试从理论角度估计出一个最优的proposal scalings 或者adaptive algorithms去自动的找到一个好的proposal。这章回顾了这几个可能性的方法。 9.1.1 MH algorithm Accept rate(虽然之前有但是既然写了就再抄一遍) \\[ \\alpha(\\mathbf{x}, \\mathbf{y})=\\left\\{\\begin{array}{ll}{\\min \\left\\{\\frac{\\pi(\\mathbf{y})}{\\pi(\\mathbf{x})} \\frac{q(\\mathbf{y}, \\mathbf{x})}{q(\\mathbf{x}, \\mathbf{y})}, 1\\right\\},} & {\\pi(\\mathbf{x}) q(\\mathbf{x}, \\mathbf{y})>0} \\\\ {1,} & {\\pi(\\mathbf{x}) q(\\mathbf{x}, \\mathbf{y})=0}\\end{array}\\right. \\] If the proposal is symmetric, than this becomes a simpler version: \\[ \\alpha(\\mathbf{x}, \\mathbf{y})=\\left\\{\\begin{array}{ll}{\\min \\left\\{\\frac{\\pi(\\mathbf{y})}{\\pi(\\mathbf{x})}, 1\\right\\},} & {\\pi(\\mathbf{x}) q(\\mathbf{x}, \\mathbf{y})>0} \\\\ {1,} & {\\pi(\\mathbf{x}) q(\\mathbf{x}, \\mathbf{y})=0}\\end{array}\\right. \\] 9.1.2 Optimal Scaling 当然最快的收敛方式是\\(q(x,y)=\\pi(y)\\) and in which case \\(\\alpha(x,y)=1\\). Then the convergence is immediate. But ofc in MCMC contex the \\(\\pi(y)\\) cannot be sampled directly. 最常见的是symmetric random-walk Metropolis algoithm(RMW)对称随机游走Metropolis。形式为\\(Y_{n+1}=X_n+Z_{n+1}\\), 而加的部分\\(\\{Z_n\\}\\) 是独立的服从某些对称分布(比如正态\\(N(0,\\sigma^2)\\)).在这种情况下,核心问题就变成如何选择proposal的scale。(选\\(\\sigma\\))。过小的话会让chain移动的过于缓慢。过大的话proposal就会经常被拒绝。必须要排除这些及短期看。(Goldilocks principle.) Metropolis在1953的文章就认识到了这一点,并且考虑\\(Z_{n} \\sim U[-\\alpha, \\alpha]\\). 最近,在选取最优proposal的scaling上有了显著的进步,可以得到渐进接受率。在某些情况下,可以精确的得到最优的scaling。 具体在4.2讨论。 9.1.3 Adaptive MCMC 一般来讲手动选取proposal,试错法。但是这很难,特别是在高维的情况。一个替代方法是adaptive MCMC。思想是asks the computer to automatically “learn” better parameter values “on the fly”.也就是说,在算法运行的时候。 假设\\(\\left\\{P_{\\gamma}\\right\\}_{\\gamma \\in \\mathcal{Y}}\\) 是一族Markov Chain,with stationary distribution \\(\\pi\\)。 比方说一个RWM, with increment distribution \\(N\\left(0, \\gamma^{2} I_{d}\\right)\\).Adaptive MCMC讲随机的更新\\(\\gamma\\)在每个迭代过程中,以便于去寻找最合适的值。 反直觉的是,adaptive MCMC并不总是保证stationarity of \\(\\pi\\) 。但是,如果adaptation设计的满足一定条件,stationarity是可以保证的。同时速度上的优势能保持。在4.3会讲。 9.1.4 Comparing Markov Chains 以上方法都是为了去找“better”或者“best” MCMC samplers, 但是在我们深入这些问题之前,我们必须先考虑一个问题,对于一个MCMC sampler,什么叫“好”? 假设\\(P_1\\)和\\(P_2\\)是两个Markov chain,并且都有stationary distribution \\(\\pi\\). 那么我们称“\\(P_1\\)收敛比\\(P_2\\)快”,如果\\(\\sup _{A}\\left|P_{1}^{n}(x, A)-\\pi(A)\\right| \\leq \\sup _{A}\\left|P_{2}^{n}(x, A)-\\pi(A)\\right|\\)" for all \\(n\\) and \\(x\\). 也可以说\\(P_1\\)比\\(P_2\\)方差小如果\\(\\mathbf{\\operatorname { V a r }}\\left(\\frac{1}{n} \\sum_{i=1}^{n} g\\left(X_{i}\\right)\\right)\\) 对\\(P_1\\)来说更小。这个方法关注于g以后的方差,并可能依赖于\\(g\\)的选取。也可能依赖于迭代次数n或者起始点。 一般我们假设Markov chain已经stationary了,那么对于足够大的n,有\\(\\mathbf{\\operatorname { Var }}\\left(\\frac{1}{n} \\sum_{i=1}^{n} g\\left(X_{i}\\right)\\right) \\approx \\frac{1}{n} \\mathbf{V} \\mathbf{a} \\mathbf{r}_{\\pi}(g) \\tau_{g}\\),where \\(\\tau_{g}=\\sum_{k=-\\infty}^{\\infty} \\operatorname{corr}\\left(g\\left(X_{0}\\right), g\\left(X_{k}\\right)\\right)=1+2 \\sum_{i=1}^{\\infty} \\operatorname{corr}\\left(g\\left(X_{0}\\right), g\\left(X_{i}\\right)\\right)\\) is the integrated autocorrelation time. 所以可以从\\(\\tau_g\\)的角度来判断。哪个链比较好。 另外一个角度是Markov chain是否能更快的探索整个状态空间。称为“\\(P_1\\) mixed faster than \\(P_2\\)”, if \\(\\mathrm{E}\\left[\\left(X_{n}-X_{n-1}\\right)^{2}\\right]\\) is larger under \\(P_1\\) than under \\(P_2\\). \\(\\mathrm{E}\\left[\\left(X_{n}-X_{n-1}\\right)^{2}\\right]\\) 经常用\\(\\frac{1}{n} \\sum_{i=1}^{n}\\left(X_{i}-X_{i-1}\\right)^{2}\\)或者\\(\\frac{1}{n-B} \\sum_{i=B}^{n}\\left(X_{i}-X_{i-1}\\right)^{2}\\)进行估计。 但是要注意的是,在这个估计下,拒绝的移动会导致\\(\\left(X_{n}-X_{n-1}\\right)^{2}=0\\),也就是说,拒绝减缓了chain,但是小步的移动也不会有很大的帮助。最好能找到有理由的,大步的移动。 由于以上几个角度,如果定义“更好”的Markov chain取决于具体要研究的问题。但是,在某些条件下,这几个准则可以等价,所以可以得到一致最优的proposal scaling选取。 9.2 Optimal Scaling of Random-Walk Metropolis Method: RWM with \\(Y_{n+1}=X_n+Z_{n+1}\\), where \\(\\{Z_i\\}\\) are i.i.d. with fixed symmetric density with some scaling parameter \\(\\sigma>0\\). 9.2.1 Basic principle A first observation is that if \\(\\sigma\\) is very samll, then virtually all proposed moves will be accepted, but the movement is very small, so overall the chain will not mix well. Similarly, if \\(\\sigma\\) is too large, then most moves will be rejected, the main will usually not move at all. What we need is reasonable-sized proposal moves together with a reasonably high acceptance probability. 9.2.2 Optimal Acceptance Rate as \\(d\\rightarrow \\infty\\) Major progress about optimal scalings was made by Roberts et al.(1997). (做MCMC中心极限定理那帮人) They considered RWM on \\(\\mathbf R^d\\) for very special target densities, of the form \\[ \\pi\\left(x_{1}, x_{2}, \\ldots, x_{d}\\right)=f\\left(x_{1}\\right) f\\left(x_{2}\\right) \\ldots f\\left(x_{d}\\right) \\] for some one-dimensional smooth density \\(f\\). That is, the target density is assumed to consist of i.i.d. components. Of course, this assumption is entirely unrealistic for MCMC, since it means that to sample from \\(\\pi\\) it suffices to sample each component separately from the one-dimensional density f(which is generally easy to do numerically). Under this assumption, and assuming proposal increment distribution of the form \\(N(0,\\sigma^2I_d)\\), Roberts proved the remarkable result that as \\(d\\rightarrow \\infty\\), the optimal acceptance rate is precisely 0.234. This is clearly a major refinement of the general principle that the acceptance rate should be far from 0 and far from 1. More precisely, their result is the following. Suppose that \\(\\sigma=\\ell/\\sqrt d\\) from some \\(\\ell>0\\). Then as \\(d\\rightarrow \\infty\\), if time is sppeded up by a factor of \\(d\\),and space is shrunk by a factor of \\(\\sqrt d\\), then each component of the Markov chain converges to a diffusion having stationary distribution f, and speed function given by \\(h(\\ell)=2\\ell^2\\Phi(-\\sqrt I \\ell)\\), where \\(\\Phi\\) is the cumulative distribution function of a standard normal, and \\(I\\) is a constant depending on \\(f\\), given in fact by \\(I=\\int_{-\\infty}^{\\infty}\\left[\\left(\\frac{f^{\\prime}(X)}{f(X)}\\right)^{2}\\right] f(x) d x\\). It follows that this diffusion is optimized (in terms of any of the criteria in previous section) when \\(\\ell\\) is chosen to maximize \\(h(\\ell)\\). It is computed numerically that this optimal value of \\(\\ell\\) is given by \\(\\ell_{\\mathrm{opt}} \\doteq 2.38 / \\sqrt{I}\\). Furthermore, the asymptotic (stationary) acceptance rate is given by \\(A(\\ell)=2 \\Phi(-\\sqrt{I} \\ell / 2)\\). Hence, the optimal acceptance rate is equal to \\(A\\left(\\ell_{\\mathrm{opt}}\\right) \\doteq 2 \\Phi(-2.38 / 2) \\doteq=0.234\\), which is where the figure 0.234 comes from. 这点没看懂,不过好像作者也没好好写,两个问题,第一个,因为有\\(\\sigma=\\ell/\\sqrt d\\) 当\\(d\\rightarrow \\infty\\) 时\\(\\sigma\\rightarrow 0\\). 这个收敛性就很奇怪?是什么东西呢 第二个问题Markov chain converges to a diffusion,这个diffusion是什么意思,然后这个speed function 又是什么呢,而diffusion is optimized,所以极大化这个speed function又是什么?h function versus A function又有啥意义,感觉从最后这个图来看更像是某种criteria或者mixing的速度的感觉。 9.2.3 Inhomogeneous Target Distributions Above result requires the strong assumption that \\(\\pi(x)=\\prod ^d_{i=1}f(x_i)\\),Roberts and Rosenthal considered inhomogeneous target densities of the form \\[ \\pi(\\mathbf{x})=\\prod_{i=1}^{d} C_{i} f\\left(C_{i} x_{i}\\right) \\] \\(\\{C_i\\}\\) are i.i.d. from some fixed distribution. They proved that in this case, the result previous still holds, except that the limiting diffusion speed is divided by an “inhomogeneity factor” of \\(b \\equiv \\mathrm{E}\\left(C_{i}^{2}\\right) /\\left(\\mathrm{E}\\left(C_{i}\\right)\\right)^{2} \\geq 1\\). In particular, the more inhomogeneous the target distibution, (the greater the variability of the \\(C_i\\)), the slower the resulting algorithm. As a special case, if the target distribution is \\(N(0,\\Sigma)\\) , by change of basis this is equivalent to the case of proposal increment \\(N(0,I_d)\\) and target distribution \\(N(0,\\Sigma\\Sigma_p^{-1})\\). In the corresponding eigenbasis, this target distribution is of the form, where now \\(C_i=\\sqrt\\lambda_i\\) with \\(\\{\\lambda_i\\}^d_{i=1}\\) the eigenvalues of the matrix \\(\\Sigma\\Sigma_p^{-1}\\). For large d, this approximately corresponds to the case where the \\(\\{C_i\\}\\) are random with \\(\\mathrm{E}\\left(C_{i}\\right)=\\frac{1}{d} \\sum_{j=1}^{d} \\sqrt{\\lambda_{j}}\\) and \\(\\mathrm{E}\\left(C_{i}^{2}\\right)=\\frac{1}{d} \\sum_{j=1}^{d} \\lambda_{j}\\). The inhomogeneity factor \\(b\\) then becomes \\[ b \\equiv \\frac{\\mathrm{E}\\left(C_{i}^{2}\\right)}{\\left(\\mathrm{E}\\left(C_{i}\\right)^{2}\\right.} \\approx \\frac{\\frac{1}{d} \\sum_{j=1}^{d} \\lambda_{j}}{\\left(\\frac{1}{d} \\sum_{j=1}^{d} \\sqrt{\\lambda_{j}}\\right)^{2}}=d \\frac{\\sum_{i=1}^{d} \\lambda_{j}}{\\left(\\sum_{j=1}^{d} \\sqrt{\\lambda_{j}}\\right)^{2}} \\] Sherlock(2006) did explicit finite-dimensional computations for the case of normal target distributions, and came to similar conclusions. 9.2.4 Metropolis-Adjusted Langevin Algorithm. Roberts and Tweedie(1996) and Roberts and Rosenthal(1998) considered the more sophisticated Metropolis-Adjusted Langevin algorithm (MALA). This algorithm is similar to RWM, except that the proposal increment distribution \\(Z_i\\sim N(0,\\sigma^2I)\\) is replaced by \\[ Z_{i} \\sim N\\left(\\frac{\\sigma^{2}}{2} \\nabla \\log \\pi\\left(X_{n}\\right), \\sigma^{2} I_{d}\\right). \\] Here the extra term \\(\\frac{\\sigma^2}{2}\\nabla \\log \\pi (X_n)\\), corresponding to the discrete-time approximation to the continuous-time Langevin diffusion for \\(\\pi\\), is an attempt to move in the direction in which the (smooth) target density \\(\\pi\\) is increasing. In this case, under some speed and scale, \\(A\\left(\\ell_{\\mathrm{opt}}\\right)=0.574\\) (as opposed to 0.234). This proves that the optimal proposal scaling \\(\\sigma\\) and the acceptance rate are both significantly larger for MALA than for RWM, indicating that MALA an improved algorithm with faster convergence. The catch, of course, is that the gradient of \\(\\pi\\) must be computed at each new state reached, which could be difficult and/or time-consuming. Thus, TWM is much more popular than MALA in practice. 这个Langevin diffusion有点意思,感觉很奇怪。 \\(X_t\\)是一个随机过程,因为是一个Markov chain,作为,所以\\(\\dot{X}=\\nabla \\log \\pi(X)+\\sqrt{2} \\dot{W}\\),而 也就是用\\(\\dot{X}=\\nabla \\log \\pi(X)+\\sqrt{2} \\dot{W}\\) 进行考虑。 从随机微分方程角度,因为这个东西确实是一个随机过程,所以可以用那套定义来做。其中有个langevin equation就是这个,所以大概可能也许描述如下: 如果我们有一个随机过程\\(X(t)\\),其平稳分布是\\(\\pi(\\cdot)\\),那这个平稳分布的langevin diffusion就应该是 \\(\\dot{X}=\\nabla \\log \\pi(X)+\\sqrt{2} \\dot{W}\\),而我们用MH算法构造的Markov Chain应该和这个随机过程\\(X(t)\\)尽量靠近,那么在\\(X_t\\)往其梯度方向移动就很靠近。 9.2.5 Numerical Examples Consider the case as \\(d=10\\). Target density \\(\\pi\\) is ten-dimensional normal with some covariance matrix \\(\\Sigma\\). Let M be the \\(d\\times d\\) matrix having diagonal elements 1, and off-diagonal elements given by the product of the row and column number divided by \\(d^2\\), that is, \\(m_{ii}=1\\), and \\(m_{ij}=ij/d^2\\) for \\(j\\neq i\\). Then let \\(\\Sigma^{-1}=M^2\\). Then let \\(\\Sigma^{-1}=M^{2}\\) (since M is symmetrix, \\(\\Sigma\\) is positive-definite), and let the target density \\(\\pi\\) be that of \\(N(0,\\Sigma)\\). The result of arthor is This result showed that if the sigma is too small (0.1), or too large(3.0), the result will be poor. The best medium select is with accept rate around 0.234. Then this confirms that, when scaling the increment covariance as \\(\\sigma I_d\\), it is optimal to find \\(\\sigma\\) to make the acceptance rate close to 0.234. 9.2.6 Inhomogeneous Covariance Consider a case for non-diagonal proposal increment, the same target density \\(N(0,\\Sigma)\\) but with \\(\\Sigma=diag(1^2,2^2,...,10^2)\\). This is interesting because the last coordinate now has the highest variance. Start the algorithms with the initial value \\(X_0=(1,0,0,...,0)\\). Usual RWM algorithm \\[ \\begin{array}{cccc}{\\sigma} & {\\text { Mean Acc. Rate }} & {\\text { Estimate }} & {\\text { RMSE }} \\\\ \\hline 0.7 & {0.230} & {114.8 \\pm 28.2} & {30.5} \\\\ \\hline\\end{array} \\] Even though \\(\\sigma\\) was well chosen, the resulting algorithm still converges poorly, leading to a poor estimate with large standard error and large RMSE. Next consider the modified algorithm with increment proposal equal to \\(N(0,\\sigma^2\\Sigma)\\), with result \\[ \\begin{array}{cccc}{\\sigma} & {\\text { Mean Acc. Rate }} & {\\text { Estimate }} & {\\text { RMSE }} \\\\ \\hline 0.7 & {0.294} & {100.25 \\pm 1.91} & {1.83} \\\\ \\hline\\end{array} \\] The increment proposal covariance proportional to the target covariance is very dramatic. Estimate is much better and very close to true value. That indicate that use increment proposals which mimic the covariance of the target distribution if at all possible. FAQ: Larger acceptance preferable? No. Showed above. Essential for acceptance rate be exactly 0.234? No. As the example shows, efficiency remains high when AR between 0.1 and 0.6. Are there asymptotic results relevant to finite-dimensional problems? Yes. Infinete dimensional results are good approximations to finite-dimensional situations 没看懂上面这个 Do these result hold for all taget distributions? No. They are only proved for very special cases. Simulation seem to suggest that they approximately hold in other cases too. Do these results hold for multimodal distributions? Yes, but only in principle. At least for distributions with independent (while perhaps multimodal). But the asymptotic acceptance rate is only respect to the entire target distribution. That is, if the sampler stuck in one mode, it may misrepresent the asymptotic acceptance rate. Do these results hold for,say, Metropolis-within-Gibbs algorithms? No, since they are proved for full-dimensional Metropolis updates only. Indeed, the Metropolis-within-Gibbs algorithm involves updating just one coordinate at a time, and thus essentially corresponds to the case \\(d=1\\). In that case, it appears that the optimal acceptance rate is usually closer to 0.44 than 0.234. 9.3 Adaptive MCMC 之前的章节讲了在某些criteria一个MCMC算法的proposal能有最优。但是问题的关键还没解决,如何找到这些最优解。就比如说如果我们确信接受率为0.234的时候最优,如何找到一个合适的proposal scaling从而得到这个接受率。 一个最常见的方法,就是试错法,如果AR太高,则降低\\(\\sigma\\),然后再来,如果太低就加高..这个方法一般来说都很有用,但是问题是比较耗时间,需要搜东调整。而且这个方法不能处理更复杂的进步,比如说让proposal的covariance matrix渐进的服从未知分布的真实covariance matrix\\(\\Sigma\\)。 As an alternative, 考虑一个算法能自己提升Markov chain.特别的,让\\(\\left\\{P_{\\gamma}\\right\\}_{y \\in \\mathcal{Y}}\\) 为一族Markov chain的转移核,都有一样的平稳分布\\(\\pi\\).让\\(\\Gamma_n\\) 是选取的kernel在第n次迭代,所以有: \\[ \\operatorname{Pr}\\left(X_{n+1} \\in A | X_{n}=x, \\Gamma_{n}=\\gamma, X_{n-1}, \\ldots, X_{0}, \\Gamma_{n-1}, \\ldots, \\Gamma_{0}\\right)=P_{\\gamma}(x, A) \\] for \\(n=0,1,2,...\\), \\(\\{\\Gamma_n\\}\\)是通过某种自适应的更新算法。理论上来说\\(\\Gamma_n\\) 可以依赖于整个Markov chain的历史\\(X_{n-1}, \\ldots, X_{0}, \\Gamma_{n-1}, \\ldots, \\Gamma_{0}\\), 虽然在实践中这一堆元素\\(\\left\\{\\left(X_{n}, \\Gamma_{n}\\right)\\right\\}_{n=0}^{\\infty}\\)是Markovian的。一般来讲这个算法非常容易实现,只需要一些额外的修改。 是否这些adaptive的方法会提高收敛依赖,显然依赖于选取的adaptive algorithm。更基础的问题,我们现在考虑的,是是否adaptation会破坏 收敛性。 9.4 Ergodicity of Adaptive MCMC Adaptive MCMC方法的遍历性。 一般来讲,既然每个Markov chain的\\(P_{\\gamma}\\)都收敛到了\\(\\pi\\),那么所有的自适应混合应该也收敛到\\(\\pi\\)才对。但是,不是这个情况。一个很简单的反例如下:(这里有三篇文献) let \\(\\mathcal{Y}=\\{1,2\\}\\), let \\(\\mathcal{X}=\\{1,2,3,4\\}\\), with \\(\\pi(1)=\\pi(3)=\\pi(4)=0.333\\) and \\(\\pi(2)=0.001\\). 设\\(P_\\gamma\\)是一个RWM algorithm,with proposal \\(Y_{n+1} \\sim U\\left\\{X_{n}-1, X_{n}+1\\right\\}\\) for \\(P_1\\), or \\(Y_{n+1} \\sim U\\left\\{X_{n}-2, X_{n}-1, X_{n}+1, X_{n}+2\\right\\}\\) for \\(P_2\\).(当然,任意跑出了\\(\\mathcal X\\)).定义一个adaptive策略如下 \\(\\Gamma_{n+1}=2\\) 如果nth proposal被接受了,否则\\(\\Gamma_{n+1}=1\\) . 这时候因为proposal是uniform,但是因为\\(\\pi(2)=0.001\\),所以MH 的accept ratio肯定不会高,所以会失败. 1-> 如果从\\(\\Gamma_2\\)开始,那么可能-2,-1,+1,+2,那50%的概率直接越界拒绝,切到1,+1的话到2但是MH ratio很低也容易被拒绝,只有+2才不会被stuck, 如果第一次失败了,留在1,但是就切到\\(\\Gamma_1\\),那只能跳到2,或者-1直接拒绝,但是2的概率也小,所以也会拒绝,所以就stuck在1上了。 根据反例,很重要有足够的条件保证\\(\\{X_n\\}\\)能收敛到\\(\\pi\\). 近些年有很多文献证明了adaptive MCMC在不同情况下的遍历性(各态历经性?)。 特别,Roberts and Rosenthal (2007) 证明了\\(\\lim _{n \\rightarrow \\infty} \\sup _{A \\subseteq \\mathcal{X}} \\| \\operatorname{Pr}\\left(X_{n} \\in A\\right)-\\pi(A) \\|=0\\),渐进收敛,并且有\\(\\lim _{n \\rightarrow \\infty} \\frac{1}{n} \\sum_{i=1}^{n} g\\left(X_{i}\\right)=\\pi(g)\\),对于所有有界的映射\\(g : \\mathcal{X} \\rightarrow \\mathbf{R}(\\mathrm{WLLN})\\),假设只有diminishing (a.k.a. vanishing) adaption condition \\[ \\lim _{n \\rightarrow \\infty} \\sup _{x \\in \\mathcal{X}}\\left\\|P_{\\Gamma_{n+1}}(x, \\cdot)-P_{\\Gamma_{n}}(x, \\cdot)\\right\\|=0 \\quad \\text { in probability } \\] 并且有containment(a.k.a. bounded convergence) condition \\[ \\left\\{M_{\\epsilon}\\left(X_{n}, \\Gamma_{n}\\right)\\right\\}_{n=0}^{\\infty} \\text { is bounded in probability, } \\epsilon>0 \\] where \\(M_{\\epsilon}(x, \\gamma)=\\inf \\left\\{n \\geq 1 :\\left\\|P_{\\gamma}^{n}(x, \\cdot)-\\pi(\\cdot)\\right\\| \\leq \\epsilon\\right\\}\\) is the convergence time of the kernel \\(P_\\gamma\\) when beginning in state \\(x\\in \\mathcal X\\). The previous equation is a technical condition which is satisfied for virtually all reasonable adaptive schemes. It holds whenever \\(\\mathcal X \\times \\mathcal Y\\) is finite, or is compact in some topology in which either the transition kernels \\(P_\\gamma\\), or the Metropolis-Hastings proposal kernels \\(Q_\\gamma\\),have jointly continuous densities. It also holds for adaptive RWM and Metropolis-within-Gibbs algorithms under very general conditions(Bai et al.,2008) So, in practice, the requirement , the previous equation can be largely ignored. By contrast, condition diminishing adaption condition is much more fundmental. It requires that the amount of adapting at the nth iteration goes to 0 as \\(n\\rightarrow \\infty\\).也就是说adapting过程中,当\\(n\\rightarrow \\infty\\) 时adapting的量到0,也就是不adapting了?原文的英文没太懂,啥叫amount of adapting,但是应该是这个理解。(Note that the sum of the adaptations can still be infinite,i.e. an infinite total amount of adaptation is still permissible, and it is not neccessarily required that the adaptive parameter \\(\\{\\Gamma_n\\}\\) converge to some fixed value.) 所以还是能有无限个proposal,proposal的scaling也不用收敛到一个常数。 比如说,如果算法adapts 在n次迭代有概率\\(p(n)\\) ,则这个条件自动满足如果\\(p(n)\\rightarrow 0.\\) 换种方式,如果\\(\\gamma\\)的选取基于一个empirical averageover iterations 1,通过n,那么这n次迭代是\\(O(1/n)\\) ,则goes to 0. 这段还是没懂,特别是最后这句话,\\(O(1/n)\\)这块是什么意思?还有nth iteration only with probability \\(p(n)\\)这个也没懂。 这些结论能让我们更新我们的参数\\(\\{\\Gamma_n\\}\\) 实际上任何方法我们希望,只要vanishing condition holds。那么这样,哪个adaption的方法最好? 9.4.1 Adaptive Metropolis 第一个重要的adaptive MCMC是adaptive Metropolis (AM) algorithm. (Haario et al.(2001))。这个算法是启发于观察diminishing adaption condition,对于Random Walk Metropolis (RWM) in \\(\\mathbf R^d\\) ,至少对于正态的目标分布,最优的是一个proposal covariance matrix of the form \\((2.38)^2/d\\) 乘以目标协方差矩阵\\(\\Sigma\\). 因为\\(\\Sigma\\) 不知道,所以用\\(\\Sigma_n\\) 进行估计,经验协方差矩阵\\(X_0,...,X_n\\). 则AM算法最后在第n次迭代使用的proposal分布如下: \\[ Y_{n+1} \\sim N\\left(X_{n},\\left[\\frac{(2.38)^{2}}{d}\\right] \\Sigma_{n}\\right) \\] 为了保证这个proposal covariances不会落到0(会违反vanishing/diminishing 条件)。有几种方法,比如在每次iteration对\\(\\Sigma_n\\)上加一个\\(\\epsilon I_d\\) \\(\\epsilon>0\\). 另外一个可能性是替代性的使用一个混合分布有如下形式: \\[ (1-\\beta) N\\left(X_{n},\\left[\\frac{(2.38)^{2}}{d}\\right] \\Sigma_{n}\\right)+\\beta N\\left(X_{n}, \\Sigma_{0}\\right) \\] 对于某些\\(0<\\beta<1\\) 和某些固定的非退化矩阵\\(\\Sigma_0\\).(比如说\\(\\Sigma_{0}=\\left[(0.1)^{2} / d\\right] I_{d}\\)). (其他版本,用某些固定的proposal distribution对一开始几次迭代,当经验协方差矩阵不是良定的,比如说一开始几次迭代,求不了经验分布函数(n<p))。 Since empirical estimates change at the nth iteration by only \\(O(1/n)\\), it follows that the diminishing adaption condition will be satisfied. Furthermore, the containment condition will certainly be satisfied if one restricts to compact regions, and in fact containment still holds provided the target density \\(\\pi\\) decays at least polynomially in each coordinate, a very mild assumption. So, AM is indeed a valid sampling algorithm. Computer simulation demonstrate AM algorithm will indeed “learn” the target covariance matrix, and approach an optimal algorithm, even in very high dimensions. This will converge considerably faster than a nonadapted RWM algorithm. 9.4.2 Adaptive Metropolis-within-Gibbs A standard alternative to the usual full-dimensional Metropolis algorithm is the “Metropolis-within_Gibbs” algorithm. To be specific, suppose that the ith coordinate is updated using a proposal increment distribution \\(N(0,e^{2 ls_i})\\), so \\(ls_i\\) is the log of the standard deviation of the increment. We would like to find optimal values of the \\(ls_i\\), whcih may of course be different for the different variables.由之前的章节可知,一个经验法则是选取accept rate approximately 0.44. 但是就算用这些方法,在高维条件下手动调整\\(ls_i\\)也非常困难。 一个方法(Roberts and Rosenthal,2009)去adapt \\(ls_i\\) 是拆分run into “batches”,比如说50个迭代。在第n个batch,更新\\(ls_i\\) by 加或者减一个adaptation amount \\(\\delta(n)\\). 这个adapting努力使得接受率十分靠近0.44.特别的,我们增加\\(ls_i\\) 用\\(\\delta(n)\\) 如果对分布variable i 的接受率高于0.44在第n个batch,或者降低\\(ls_i\\)如果比0.44低. 为了满足diminishing condition,我们需要\\(\\delta\\rightarrow 0\\);比如取\\(\\delta(n)=\\min \\left(0.01, n^{-1 / 2}\\right)\\). 而第二个条件很容易满足,只需要限制\\(ls_i\\) 在一个有限的区域\\([-M,M]\\). 但是,这还不够。由Bai et.al.2008, 这个条件几乎都能满足,如果target density \\(\\pi\\) decreases at least polynomially in each direction (a very mild condition). Hence, the restriction is once again not of practical concern. Simulations (Roberts and Rosenthal,2009) 表示adaptive Metropolis-within-Gibbs algorithm does a good job of correctly scaling the \\(ls_i\\) values. 即是维度高达500,leading to chains which mix much faster than those with pre-chosen proposal scalings. Preliminary general-purpose software to implement this algorithm is now available (Rosenthal,2007). 9.4.3 State-Dependent Proposal Scalings 另外一个方法让proposal scaling,也就是proposal的方差 基于目前的状态\\(X_n\\),使得,比如说,给定\\(X_n=x\\),我们可能提议\\(\\mathcal Y_{n+1}\\sim N(x,\\sigma^2_x)\\). 在这种情况,接受概率变为 \\[ \\alpha(x, y)=\\min \\left[1, \\frac{\\pi(y)}{\\pi(x)}\\left(\\frac{\\sigma_{x}}{\\sigma_{y}}\\right)^{d} \\exp \\left(-\\frac{1}{2}(x-y)^{2}\\left(\\sigma_{y}^{-2}-\\sigma_{x}^{-2}\\right)\\right)\\right] \\] \\(\\sigma_x\\)的函数形式可以有多种方法进行选择,去得到更快的收敛性。 比如说,在很多问题中,目标分布将会更分散当我们离初始点很远的时候。这种情况,可能更适合,让\\(\\sigma_x=e^{a}(1+|x|)^{b}\\),\\(a\\)和\\(b\\)adaptively的选取。比如说,我们能在此把链分为batches,每个batches包含50次迭代。每次迭代之后,算法会更新\\(a\\)通过加或者减\\(\\delta(n)\\) 使得接受率尽可能的接近0.234或者0.44. 这个算法同时也加或者减\\(\\delta(n)\\) 给 \\(b\\) 使得接受概率在这两个区间之内:\\(\\{x \\in \\mathcal{X} :|x|>C\\}\\),\\(\\{x \\in \\mathcal{X} :|x| \\leq C\\}\\)C是取定的常数。 再一次的,要满足diminishing adaptive condition得有\\(\\delta(n)\\rightarrow 0\\),并且有界性条件则非常容易满足。所以,者提供了一个方便的方法去给出一个useful functional form to \\(\\sigma_x\\),就算不知道a和b的值也不要紧。 (Roberts and Rosenthal 2009) 表示算法工作的很好,至少在简单的时候如此。 另外一个方法,有些时候被称为regional adaptive Metropolis algorithm(RAMA).使用有限次分割状态空间:\\(\\mathcal{X}=\\mathcal{X}_{1} \\dot{\\cup} \\ldots \\dot{\\cup} \\mathcal{X}_{m}\\). The proposal scaling is then giving by \\(\\sigma_x=e^{a_i}\\) whenever \\(x\\in \\mathcal X_i\\) with the acceptance probability in previous equation. 每个\\(a_i\\)的值is again adapted after each batch of iterations, by adding or subtracting \\(\\delta(n)\\) in an attempt to make the acceptance fraction of \\(\\mathcal X_i\\) close to 0.234.(作为一个特殊的情况,如果在一个batch里面没有访问到\\(\\mathcal X_i\\)),那么我们总是加一块\\(\\delta(n)\\)到\\(a_i\\)上,去避免\\(a_i\\)过小然后使得proposed move永远不move到\\(\\mathcal X_i\\)。 再一次的,这个算法在非常宽泛的条件下能够使得\\(\\delta(n)\\rightarrow 0\\). 最近(Craiu et al.(2009))的工作考虑特殊的改造RAMA,多个算法的拷贝同时运行去找到所有最大值点而不是困在一个极值点。他们的工作同时也允许proposal distribution是一个加权混合不同的\\(N\\left(x, e^{2 a_{i}}\\right)\\).,使得不同分割\\(\\{\\mathcal X_i\\}\\) 不完美的选取,使得有更复杂的RAMA-type算法得到更广泛的应用。 当然,Langevin(MALA)算法可能也被归类于一类状态依赖 scaling,当然也可能有adaptive的版本的MALA(Atchade,2006). 9.4.4 Limit Theorem 很多MCMC的一款能够用中可以使用如下Markov chain limit theorem,比如weak law of large numbers(WLLN), strong law of large numbers (SSLN), and central lmit theorem (CLT), in order to guarantee good asymptotic estimates and estimate stnadard errors. 所以一个很自然的问题就是adaptive MCMC这些极限定理还是否成立。 基于diminishing adaptation and containment 的假设,WLLN对于所有的 bounded functional都成立。 (Roberts and Rosenthal,2007,Thm3).所以至少意味着使用adaptive MCMC用于估计有界泛函的均值,将会以很高的概率得到一个准确的答案,如果运行足够长的链。 对于无界泛函,WLLN一般来说也成立,但是不总是成立。甚至对有界泛函,SLLN不一定能成立,同样的例子证明CLT可能也不成立。所以建议是估计MCMC的标准差对于adaptive MCMC来说就很challenging如果我们只假设diminishing adaptation and containment. 基于更强的假设,将会有更多的结论。比如说(文献),证明了对于adaptive MCMC算法各种情况下的极限定理,假设假设adaptive parameters converge to fixed values sufficiently quickly.他们也证明了这些adaptive algorithms将会继承很多渐进最优性质对于对应的固定参数方法、这些结论促进了adaptive MCMC的更多的应用。然而,浙西需要不同的technical条件,实践中会更难以验证。 9.5 FAQ 需要满足diminishing adaptation condition才能保证converge 不需要搞定所有technical condition才能用adaptive MCMC,只要满足diminishing adaptation,基本上就能渐进可用。 adaptive MCMC经常用于加速高维收敛。比如几百维的基因问题 需要adaptation过程设计的能特别的达到最优的parameter值吗,不需要,各态历经性在这方面完全不需要\\(\\{\\Gamma_n\\}\\) 收敛,只需要满足diminishing adaptation condition,只需要满足the possibility of infinite total adaptation. 但是,很多特定的adaptive MCMC algorithms proposed are indeed designed to attempt to converge to specific values. (比如说proposal scaling能使得渐进接受率是0.234) 可以adapt一定时间,然后就不adapt了,用最后adapt的参数抽?可以,因为每个kernel都是ergodic的,但是要adapt多久没有理论支持,比如说对于200维的数据,将会需要上百万的迭代adapt之后才会得到一个比较好的proposal covariance matrix。 只能用特定的几种adaptive MCMC吗?(adaptive Metropolis,adaptive Metropolis-within-Gibbs,RAMA,…)不是,什么情况都行,可以使用任意规则对\\(\\{\\Gamma_n\\}\\)进行adapt。只要满足adaptation diminishes,那么算法就有很大的可能性可行。最优挑战的部分就是找到sensible/clever adaptation rules. 有其他方法,不仅仅是adaptive MCMC,可以帮助算法“学习”如何去converge well? 当然有,比如说particle filters, population Monte Carlo, sequential Monte Carlo,可以都帮忙考虑作为尝试“学习”快速收敛的方法。但是这些算法的细节和这里提出的adaptive MCMC不一样。 9.6 Conclusion 9.7 A tutorial on adaptive MCMC (Andrieu and Thoms 2008) ergodic: It is guaranteed to eventually produce samples \\(\\{X_i\\}\\) distributed according to \\(\\pi\\) The aim of this paper is to review the theoretical underpinnings and recent methodological advances in the area of computer algorithms that aim to “optimise” such parameterised MCMC transition probabilities in order to lead to computationally efficient and reliable procedures. Such as tempering type algorithms. Discuss the choice of a criterion to optimise. Class of process called Controlled Markov chains Controlled Markov chain Monte Carlo Sample initial values \\(\\theta_0,X_0\\in \\Theta\\times X\\) Iteration \\(i+1\\) (i0), given \\(\\theta_i=\\theta_i(\\theta_0,X_0,...,X_i)\\) from iteration i Sample \\(X_{i+1}|(\\theta_0,X_0,...,X_i)\\sim P_{\\theta_i}(X_i,\\cdot)\\). Compte \\(\\theta_{i+1}=\\theta_{i+1}(\\theta_0,X_0,...,X_{i+1})\\) References "], +["hamiltonian-monte-carlo.html", "Chapter 10 Hamiltonian Monte Carlo 10.1 Discretizing Hamilton’s Equations: The leapfrog method. 10.2 MCMC from Hamiltonian Dynamics. 10.3 HMC in Practice and Theory 10.4 Extensions of and Variations on HMC", " Chapter 10 Hamiltonian Monte Carlo 10.0.1 Properties of Hamiltonian Dynamics Hamiltonian dynamics is reversible. That is, the mapping \\(T_{s}\\) from the state at time t,\\((q(t),p(t))\\) to the state at time \\(t+s,(q(t+s),p(t+s))\\), is one-to-one, so hence there exists an inverse, \\(T_{-s}\\). 可以从空间能量那一块来理解,由一个最普通的Hamilton的定义\\(H(q, p)=U(q)+K(p)\\),中,动能部分\\(K(p)=K(-p)\\),而由后文的\\(K(p)==p^{T} M^{-1} p / 2\\),则p是速度,K(p)是动能(kinetic energy),所以和p的方向无关。所以inverse mapping也可以通过对negating p之后再作用\\(T_s\\),然后再negating p。 在一元的例子中,正逆变换都是逆时针转s radians。 而Hamiltonian dynamics的reversibility对MCMC更新非常重要,这样使用dynamics更新的MCMC就能使稳定分布是需要的分布。最简单证明稳定性的方法就是使用Reversible。 10.0.2 Conservation of the Hamiltonian A second property of the dynamics is that it keeps the Hamiltonian invariant (i.e. conserved). \\[ \\frac{d H}{d t}=\\sum_{i=1}^{d}\\left[\\frac{d q_{i}}{d t} \\frac{\\partial H}{\\partial q_{i}}+\\frac{d p_{i}}{d t} \\frac{\\partial H}{\\partial p_{i}}\\right]=\\sum_{i=1}^{d}\\left[\\frac{\\partial H}{\\partial p_{i}} \\frac{\\partial H}{\\partial q_{i}}-\\frac{\\partial H}{\\partial q_{i}} \\frac{\\partial H}{\\partial p_{i}}\\right]=0 \\] 等式成立来源于Hamilton’s equations. 在一元的例子中,相当于是说旋转变换保持了Hamiltonian的值不变,是half the squared distance from the origin. 在Metropolis updates中,如果用 Hamiltonian dynamics作为proposal的话,也就是HMC,接受率是1如果H能保持invariant. 后面可以看到。但是在实践中只能保持H approximately invariant, 所以我们很难达到这个目标。 10.0.3 Volume preservation Hamiltonian dynamics的第三个基础性质是在(q,p)空间内他保持volume。(Liouville’s theorem的结论)。如果我们队映射\\(T_s\\)作用到一个在(q,p)空间的区域R上的一些点,有体积V,则R在\\(T_s\\)的像也有体积V。 在一维的例子中的体现就是映射只是旋转变化,明显不改变面基。当然也不改变区域的形状。后者则不总是成立。Hamiltonian dynamics 可能对区域在某个方向进行拉伸,当这个区域因为在其他方向非常集中,为了保证volume,所以进行拉伸。 在MCMC中,保证volume不变的意以不导致对于任意change in volume in the acceptance probability for Metropolis updates. 改变volume不会导致Metropolis updates的接受概率变化。如果我们考虑proposed新的状态,用一些随机的,非Hamiltonian,dynamics,我们可能会需要去计算Jacobian matrix for the mapping the dynamics defines的行列式,这可能做不到。 由之前的定义,divergence of the vector field have \\[ \\sum_{i=1}^{d}\\left[\\frac{\\partial}{\\partial q_{i}} \\frac{d q_{i}}{d t}+\\frac{\\partial}{\\partial p_{i}} \\frac{d p_{i}}{d t}\\right]=\\sum_{i=1}^{d}\\left[\\frac{\\partial}{\\partial q_{i}} \\frac{\\partial H}{\\partial p_{i}}-\\frac{\\partial}{\\partial p_{i}} \\frac{\\partial H}{\\partial q_{i}}\\right]=\\sum_{i=1}^{d}\\left[\\frac{\\partial^{2} H}{\\partial q_{i} \\partial p_{i}}-\\frac{\\partial^{2} H}{\\partial p_{i} \\partial q_{i}}\\right]=0 \\] 而有一个结论,A vector field with zero divergence can be shown to preserve volume. 如果不通过divergence,可以有一个不严格的证明。从之前提到的行列式的角度进行思考。 The volume preservation is equivalent to the determinant of the Jacobian matrix of \\(T_s\\) having absolute value one, which is related to the well-known role of this determinant in regard to the effect of transformations on definite integrals and on probability density functions. Jacobian matrix of \\(T_s\\), \\(2d\\times 2d\\), as a mapping of \\(z=(q,p)\\), will be written as \\(B_s\\). In general, \\(B_s\\) will depend on the values of \\(q\\) and \\(p\\) before the mapping. When \\(B_s\\) is diagonal, it is easy to see that the absolute values of its diagonal elements are the factors by which \\(T_s\\) wtrtches or compresses a region in each dimesnions, so that the product of these factos, which is equal to the absolute value of \\(det(B_s)\\), is the factor by which the volume of the region changes. Detail and general prove will not be put here, but, note that if rotate the coordinate system used, \\(B_s\\) would no longer be diagonal, but the determinant of \\(B_s\\) is invariant to such transformations, and so would still give the gactor by which the volume changes. Consider Volume preservation for Hamiltonian dynamics in one dimension. Approximate \\(T_\\delta\\) for \\(\\delta\\) near zero as follows: \\[ T_{\\delta}(q, p)=\\left[ \\begin{array}{c}{q} \\\\ {p}\\end{array}\\right]+\\delta \\left[ \\begin{array}{l}{d q / d t} \\\\ {d p / d t}\\end{array}\\right]+\\text { terms of order } \\delta^{2} \\text { or higher. } \\] Taking the time derivatives from the Hamiltonian equation, the Jacobian matrix can be written as \\[ B_{\\delta}=\\left[ \\begin{array}{cc}{1+\\delta \\frac{\\partial^{2} H}{\\partial q \\partial p}} & {\\delta \\frac{\\partial^{2} H}{\\partial p^{2}}} \\\\ {-\\delta \\frac{\\partial^{2} H}{\\partial q^{2}}} & {1-\\delta \\frac{\\partial^{2} H}{\\partial p \\partial q}}\\end{array}\\right]+\\text { terms of order } \\delta^{2} \\text { or higher. } \\] Then we can write the determinant of this matrix as \\[ \\begin{aligned} \\operatorname{det}\\left(B_{\\delta}\\right) &=1+\\delta \\frac{\\partial^{2} H}{\\partial q \\partial p}-\\delta \\frac{\\partial^{2} H}{\\partial p \\partial q}+\\text { terms of order } \\delta^{2} \\text { or higher } \\\\ &=1+\\text { terms of order } \\delta^{2} \\text { or higher. } \\end{aligned} \\] Since \\(\\log (1+x) \\approx x\\) for x near zero, \\(log det (B_\\delta)\\) is zero, except perhaps the terms of order \\(\\delta^2\\) or higher. Now consider \\(log det (B_s)\\) for some time interval s that is not close to zero let \\(\\delta=s/n\\) (现在就close to zero了),then可以把\\(T_s\\)分解成作用n次\\(T_\\delta\\), so \\(det (B_s)\\) is the n-fold product of \\(det(B_\\delta)\\) evaluated at n points along the trajectory. Then we have \\[ \\begin{aligned} \\log \\operatorname{det}\\left(B_{S}\\right) &=\\sum_{i=1}^{n} \\log \\operatorname{det}\\left(B_{\\delta}\\right) \\\\ &=\\sum_{i=1}^{n}\\left\\{\\text { terms of order } 1 / n^{2} \\text { or smaller }\\right\\} \\\\ &=\\text { terms of order } 1 / n \\text { or smaller. } \\end{aligned} \\] 所以对于不靠近0的\\(B_s\\), \\(log det (B_s)\\),也有log几乎为0当\\(n\\rightarrow 0\\). 当然在sum过程中,\\(B_\\delta\\)的值可能会依赖于i,也就是轨迹上的点(p,q)的变化会影响\\(T_s\\). Assumeing that trajectories are not singular, the variation in \\(B_\\delta\\) must be bounded along any particular trajectory. (这个没懂,轨迹非退化,那么就有界然后就能通过n收敛?) 所以就preserves volume。 当高于一维的情况,Jacobian matrix有如下形式 \\[ B_{8}=\\left[ \\begin{array}{c}{I+\\delta\\left[\\frac{\\partial^{2} H}{\\partial q_{j} \\partial p_{i}}\\right]} & {\\delta\\left[\\frac{\\partial^{2} H}{\\partial p_{j} \\partial p_{i}}\\right]} \\\\ {-\\delta\\left[\\frac{\\partial^{2} H}{\\partial q_{j} \\partial q_{i}}\\right]} & {I-\\delta\\left[\\frac{\\partial^{2} H}{\\partial p_{j} \\partial q_{i}}\\right]}\\end{array}\\right]+\\text { terms of order } \\delta^{2} \\text { or higher. } \\] 类似的形式,但是变成了分块矩阵。 高阶的项消掉了,剩下的项处理方式和1维的时候类似。 10.0.4 Symplecticness (辛?) volume不变性是Hamiltonian dynamics being symplectic的一个结论。假设\\(z=(q,p)\\), 定义J是\\(J=\\left[ \\begin{array}{cc}{0_{d \\times d}} & {I_{d \\times d}} \\\\ {-I_{d \\times d}} & {0_{d \\times d}}\\end{array}\\right]\\), symplecticness condition is that the Jacobian matrix, \\(B_s\\), of the mapping \\(T_s\\) satisfies \\[ B_{s}^{T} J^{-1} B_{s}=J^{-1} \\] This implies volume conservation, since \\(det(B^T_s)det(J^{-1})det(B_s)=det(J^{-1})\\) 则有\\(det(B_s)^2=1\\). 当d>1的时候,symplecticness condition is stronger than volume preservation. Hamiltonian dynamics and the symplecticness condition can be generalized to where J is any matrix for which \\(J^T=-J\\) and \\(det(j)\\neq 0\\). 在实践中,Reversibility, preservation of volume, and symplecticness可以确实被保证,也必须被保证。 10.1 Discretizing Hamilton’s Equations: The leapfrog method. 为了在计算机中实现,Hamilton’s equations must be approximated by discretizing time, using some small stepsize,\\(\\epsilon\\). 在0时刻开始,iteratively compute (approximately) the state at times \\(\\epsilon,2\\epsilon,3\\epsilon,etc.\\) In discussing how to do this, assume that the Hamiltonian has the form \\(H(q,p)=U(q)+K(p)\\). 虽然这个方法对各种定义的动能都适用,但是为了简化操作还是假设\\(K(p)=p^{T} M^{-1} p / 2\\). 然后M是diagonal的,对角元是\\(m_1,...,m_d\\), so that \\[ K(p)=\\sum_{i=1}^{d} \\frac{p_{i}^{2}}{2 m_{i}} \\] 相当于动能是每个方向上的动能之和。 10.1.0.1 Euler’s Method 最广为人知的逼近微分方程系统的解的方法是Euler’s method. 对于Hamilton’s equations, this method performs the following step, for each component of position and momentum, indexed by i=1,…,d: \\[ \\begin{array}{l}{p_{i}(t+\\varepsilon)=p_{i}(t)+\\varepsilon \\frac{d p_{i}}{d t}(t)=p_{i}(t)-\\varepsilon \\frac{\\partial U}{\\partial q_{i}}(q(t))} \\\\ {q_{i}(t+\\varepsilon)=q_{i}(t)+\\varepsilon \\frac{d q_{i}}{d t}(t)=q_{i}(t)+\\varepsilon \\frac{p_{i}(t)}{m_{i}}}\\end{array} \\] 对于每个分量的速度和(势能?),下一时刻的等于这一时刻加上一阶导乘以步长,然后通过Hamilton’s equation就等于另外一个量的变化量。(减去的势能等于增加的动能?增加的动能等于减去的势能?) 对于一般的Euler法解的轨迹并不好,轨迹叉到无穷上了,但是真实的轨道是一个圆、 10.1.1 Modification of Euler’s Method \\[ \\begin{array}{c}{p_{i}(t+\\varepsilon)=p_{i}(t)-\\varepsilon \\frac{\\partial U}{\\partial q_{i}}(q(t))} \\\\ {q_{i}(t+\\varepsilon)=q_{i}(t)+\\varepsilon \\frac{p_{i}(t+\\varepsilon)}{m_{i}}}\\end{array} \\] 变化是使用\\(p_i(t+\\epsilon)\\)代替了\\(p_i(t)\\)。也就是说使用了“目前的动量”代替了前一时刻的动量。 图中显示了虽然不完美,但是这个方法更靠近了真实的轨迹。 事实上,修改后的方法确实确保了volue,这样帮助避免发散到无穷或者螺旋地回到起点,这样就让volume expand to infinity or contracting to zero. 要探究modification of Euler’s method preserve volume, exactly despite the finite discretization of time, 注意这两个变换 from \\((q(t), p(t))\\) to \\(q(t),p(t+\\epsilon)\\) 通过modification的第一个等式,然后第二个等式是从\\((q(t), p(t+\\varepsilon))\\)变换到\\((q(t+\\varepsilon), p(t+\\varepsilon))\\). 这是“shear” transformation (剪羊毛?没懂),每次变换只变换一个值,要么\\(p_i\\),要么\\(q_i\\),这样只会变一个参数,而any shear transformation will preserve volume,因为这样变换的Jacobian只有一个元,并且值为1. 10.1.2 The leapfrog Method Leapfrog method可以得到更好的结果 \\[ \\begin{aligned} p_{i}(t+\\varepsilon / 2) &=p_{i}(t)-(\\varepsilon / 2) \\frac{\\partial U}{\\partial q_{i}}(q(t)) \\\\ q_{i}(t+\\varepsilon) &=q_{i}(t)+\\varepsilon \\frac{p_{i}(t+\\varepsilon / 2)}{m_{i}} \\\\ p_{i}(t+\\varepsilon) &=p_{i}(t+\\varepsilon / 2)-(\\varepsilon / 2) \\frac{\\partial U}{\\partial q_{i}}(q(t+\\varepsilon)) \\end{aligned} \\] 从冲量对应的变量开始半步更新,然后再重新开始做全更新。使用新的(更新了半步的)冲量变量,然后最后做剩下半步冲量变量的更新,使用新的位置变量(\\(q_i(t+\\epsilon)\\))。一个类似的方法可以对任何的动能函数成立,只需要用\\(\\partial K / \\partial p_{i}\\)替换\\(p_{i} / m_{i}\\). 这个方法当然也保证了volume,因为第一个更新通过第三个更新是shear transformation,由于对称性,这也是reversible的by simply negating p again. 10.1.3 Local and Global Error of discretization Methods. How the error from discretizing the dynamics behaves in the limit as the stepsize, \\(\\epsilon\\), goes to zero; Leimkuhler and Reich(2004) provide a much more detail discussion. The error goes to zero as \\(\\epsilon\\) goes to zero, so that any upper limit on the error will apply (apart from a usually unknown constant factor) to any differentiable function of state. For example, if the error for (q,p) is no more than order \\(\\epsilon^2\\), the error for \\(H(q,p)\\) will also be no more than order \\(\\epsilon^2\\). The local error is the error after one step, that moves from time t to time \\(t+\\epsilon\\). The global error is the error after simulating for some fixed time interval, s, which will require \\(s/epsilon\\) steps. If the local error is order \\(\\epsilon^p\\), the global error will be order \\(\\epsilon^{p-1}\\). If we instead fix \\(\\epsilon\\) and consider increasing the time, s, for which the trajectory is simulated, the error can in general increase exponentially with s. Interesting. However, this is often not what happens when simulating Hamiltonian dynamics with a symplectic method, as can be seen in Figure. The Euler method and its modification above have order \\(\\epsilon^2\\) local error and order \\(\\epsilon\\) global error. Leapfrog method has order \\(\\epsilon^3\\) local error and order \\(\\epsilon^2\\) global error. As shown by Leimkuhler and Reich, this difference is a consequence of leapfrog being reversible, since any reversible method must have global error that is of even order in \\(\\epsilon.\\) 10.2 MCMC from Hamiltonian Dynamics. 要用Hamiltonian dynamics去得到某个分布的sample需要把density function translate to potential energy function and introducing “momentum” variables to go with the original variables of interest (now seen as “position” variables). We can then simulate a Markov chain in which each iteration resamples the momentum and then does a Metropolis update with a proposal found using Hamiltonian dynamics. 10.2.1 Probability and the Hamiltonian: Canonical Distributions 可以把要抽的分布函数和一个潜在的能量函数通过 canonical distribution 联系起来。给定某些能量函数,\\(E(x)\\),对于某些状态,x,x of some physical system, the canonical distribution over states has probability or probability density function \\[ P(x)=\\frac{1}{Z}exp(\\frac{-E(x)}{T}) \\] Here, T is the temperature of the system (温度!这是要用上原子(量子?)领域的玩意了吗╮(╯_╰)╭),比如弄个热力学定律啥的。。。。。或者说温度和动能有某种对应关系? Z is normalizing constant needed for this function to sum or integrate to one. Viewing this the opposite way, if we are interested in some distribution with density function \\(P(x)\\), we can obtain it as a canonical distribution with \\(T=1\\) by setting \\(E(x)=-\\log P(x)-\\log Z\\). 为什么这么写是因为,P(x)的值域是(0,1),那么logP(x)则是 \\((-\\infty,0)\\), \\(-\\log P(x)-\\log Z\\) 则是\\((-\\log Z,\\infty)\\), as long as Z is any convenient positive constant, \\(-\\log Z\\)可以达到\\(-\\infty\\),所以能量函数的值域就是\\((-\\infty,\\infty)\\). 或者反过来说就是对能量函数来个log link function让他落到概率区间\\((-1,1)\\). A Hamiltonian is an energy function for the joint state of “position”, q, and “momentum”,p, and so defines a joint distribution for them as follows: \\[ P(q,p)=\\frac{1}{Z}exp(\\frac{-H(q,p)}{T}) \\] 所以一个Hamiltonian就是对当前位置的能量函数q,加上动量函数,p组合起来。所以可以定义一个联合分布如上。 Note that the invariance of H under Hamiltonian dynamics means that a Hamiltonian trajectory will (if simulated exactly) move within a hypersurface of cosntant probability density. 所以这里的Hamiltonian dynamics的不变形就表现在一个Hamiltonian 轨道将会在等常数概率的超平面上移动。 If \\(H(q,p)=U(q)+K(p)\\), the joint density is \\[ P(q,p)=\\frac{1}{Z}exp(\\frac{-U(q)}{T})exp(\\frac{-K(p)}{T}) \\] and we see that q and p are independent, and each have canonical distributions, with energy functions \\(U(q)\\) and \\(K(p)\\). We will use \\(q\\) to represent the variables of interest, and introduce p just to allow Hamiltonian dynamics to operate. 所以在加情况下的Hamiltonian,就能分成两个分布,把要抽的分布丢进q,然后找一个能形成Hamiltonian dynamics的q就行了。 在Bayesian统计中,模型参数的后验分布是一般比较关心的,所以这些参数会作为位置,q。把后验分布转化成canonical distribution (with T=1) using a potential energy function defined as \\[ U(q)=-\\log [\\pi(q) L(q | D)], \\] where \\(\\pi(q)\\) is the prior density, and \\(L(q|D)\\) is the likelihood function given data D. 10.2.2 The Hamiltonian Monte Carlo Algorithm 下一步就进入HMC。 HMC仅限于\\(\\mathbb R^d\\)上的连续分布,并且density function can be evaluated(perhaps up to an unknown normalizing constant). For the moment, It will also assume that the density is nonzero everywhere (but this is relaxed in following section). We must also be able to compute the partial derivatives of the log of the density function. These derivatives must therefore exist, except perhaps on a set of point with probability zero, for shich some arbitrary value could be returned. 条件,要求还行?连续,可以算density,可以算log density的偏导。不可导的点的概率测度应为0. HMC samples from the canonical distribution (就是上式P(q,p)。)q是the distribution of interest, as specified using the potential energy function \\(U(q)\\). 选择distribution of momentum variable,p, 独立于q,然后给定动能函数,K(p). 目前应用在HMC上的主要是二次能量函数,就之前提起来那个,可以让p是0均值的多元正态分布。最经常的设置是the components of p are specified to be independent, with component i having variance \\(m_i\\). The kinetic energy function producing this distribution (Setting T=1) is \\[ K(p)=\\sum_{i=1}^{d} \\frac{p_{i}^{2}}{2 m_{i}}. \\] 在5.4会讲\\(m_i\\)的选取对表现的影响。 10.2.2.1 The two steps of the HMC algorithm 每次HMC算法的迭代有两部。第一步only the momentum; the second may change both position and momentum. 第二步 may change both position and momentum. Both steps leave the canonical joint distribution of \\((q,p)\\) invariant, and hence their combination also leaves this distribution invariant. In the first step, new values for the momentum variables are randomly drawn from their Gaussian distribution, independently of the current values of the position variables. 由上一步的假设,有 \\[ p_i\\sim N(0,m_i) \\] 因为q没变,p来自于正确的条件分布(conditional on q,但是独立,所以无所谓)。所以这一步明显让canonical joint distribution invariant. 第二步则用Metropolis update,using Hamiltonian dynamics to propose a new state. Starting at current state (q,p), Hamiltonian dynamics is simulated for L steps using the leapfrog method (or other reversible method that preserves volume,所以基础的Euler法不行?因为没有preserve reversible), with stepsize of \\(\\epsilon\\). 这里,L和\\(\\epsilon\\)是模型参数,需要找一个能有好表现的。The momentum variables at the end of this L-step trajectory are then negated, giving a proposed state\\((q^*,p^*)\\). This proposed state is accepted at the next state of the Markov chain with probability \\[ \\min \\left[1, \\exp \\left(-H\\left(q^{*}, p^{*}\\right)+H(q, p)\\right)\\right]=\\min \\left[1, \\exp \\left(-U\\left(q^{*}\\right)+U(q)-K\\left(p^{*}\\right)+K(p)\\right)\\right] \\] The negation of the momentum variables at the end of the trajectory makes the Metropolis proposal symmetrical, as needed for the acceptance probability above to be valid. This negation need not be done in practice, since \\(K(p)=K(-p)\\), and the momentum will be replaced before it is used again.(第一步得重新抽。) 所以HMC可以是看做从q和p的联合分布中抽,Metropolis步用了Hamiltonian dynamics的proposal,使得(q,p)的概率分布保持不变,或者几乎不变。移动到(q,p)这一点只靠第一步的随机抽。对于q,Hamiltonian dynamics for \\((q,p)\\)可以产生q的值来自于不同的概率分布,或者等价于一个很不一样的potential energy,\\(U(q)\\).但是,重抽样动量变量依然很难得到q的一个合适的分布,不用重抽样,\\(H(q, p)=U(q)+K(p)\\)会是(几乎)常数,而\\(K(p)\\) and \\(U(q)\\)是非负的,那么U(q)将不会超出H(q,p)的初始值,如果没有对p的resampling。 所以这里的resampling就是指对p的正态抽样? 从他的代码来看,negate momentum的意思就是让p=-p,所以为什么要这样好像提了好几次但是没懂,硕士要让proposal symmetric? 为啥要这样啊 HMC = function (U, grad_U, epsilon, L, current_q) { q = current_q p = rnorm(length(q),0,1) # independent standard normal variates current_p = p # Make a half step for momentum at the beginning p = p - epsilon * grad_U(q) / 2 # Alternate full steps for position and momentum for (i in 1:L) { # Make a full step for the position q = q + epsilon * p # Make a full step for the momentum, except at end of trajectory if (i!=L) p = p - epsilon * grad_U(q) } # Make a half step for momentum at the end. p = p - epsilon * grad_U(q) / 2 # Negate momentum at end of trajectory to make the proposal symmetric p = -p # Evaluate potential and kinetic energies at start and end of trajectory current_U = U(current_q) current_K = sum(current_pˆ2) / 2 proposed_U = U(q) proposed_K = sum(pˆ2) / 2 # Accept or reject the state at end of trajectory, returning either # the position at the end of the trajectory or the initial position if (runif(1) < exp(current_U-proposed_U+current_K-proposed_K)) { return (q) # accept } else { return (current_q) # reject } } 10.2.2.2 Proof that HMC leaves the canonical distribution Invariant. 首先是证Metropolis update是reversible的,并且收敛到canonical distribution function for q and p.这是detail balance condition 思路是\\(A_k\\) 是\\((q,p)\\) 空间的一个分割,那么用Hamilton dynamic的Metropolis update就能把\\(A_k\\)映到\\(B_k\\),由于leapfrog steps的reversibility,所以\\(B_k\\)也是\\((q,p)\\)空间的一个分割,并且由于保volume,所以\\(A_k\\)的volume和\\(B_k\\)的相等。则有下式 \\[ P\\left(A_{i}\\right) T\\left(B_{j} | A_{i}\\right)=P\\left(B_{j}\\right) T\\left(A_{i} | B_{j}\\right) \\] 由于当\\(i\\neq j\\)时有,\\(T\\left(A_{i} | B_{j}\\right)=T\\left(B_{j} | A_{i}\\right)=0\\),而当\\(i=j\\)时,由于reversibility,所以成立。故满足detail-balance condition. 当\\(A_k\\)和\\(B_k\\)越来越小的时候,Hamiltonian变成在每个区域几乎是常数。令这个数是\\(H_X\\)在区域\\(X\\)中。同安G的,那么canonical probability density and the transition probabilities become effectively constant within each region as well. 那么上式就能变成 \\[ \\frac{V}{Z} \\exp \\left(-H_{A_{k}}\\right) \\min \\left[1, \\exp \\left(-H_{B_{k}}+H_{A_{k}}\\right)\\right]=\\frac{V}{Z} \\exp \\left(-H_{B_{k}}\\right) \\min \\left[1, \\exp \\left(-H_{A_{k}}+H_{B_{k}}\\right)\\right] \\] 这个式子更直接,因为保Volme,所以H不会变,都相等。 Detail-balance implies Metropolis update leaves the canonical distribution for q and p invariant. 下一步是说每一步的概率\\(P({B_k})\\)都相等, \\[ \\begin{aligned} P\\left(B_{k}\\right) R\\left(B_{k}\\right)+\\sum_{i} P\\left(A_{i}\\right) T\\left(B_{k} | A_{i}\\right) &=P\\left(B_{k}\\right) R\\left(B_{k}\\right)+\\sum_{i} P\\left(B_{k}\\right) T\\left(A_{i} | B_{k}\\right) \\\\ &=P\\left(B_{k}\\right) R\\left(B_{k}\\right)+P\\left(B_{k}\\right) \\sum_{i} T\\left(A_{i} | B_{k}\\right) \\\\ &=P\\left(B_{k}\\right) R\\left(B_{k}\\right)+P\\left(B_{k}\\right)\\left(1-R\\left(B_{k}\\right)\\right) \\\\ &=P\\left(B_{k}\\right) \\end{aligned} \\] The Metropolis update within HMC therefore leaves the canonical distribution invariant. 10.2.2.3 Ergodicity of HMC Typically, HMC will also be “ergodic”. Any value can be sampled for the momentum variables, which can then affect the position variables in arbitrary ways. However, ergodicity can fail if the L leapfrog steps in a trajectory produce an exact periodicity for some function of state. The one-dimensional example solutions are periodic with period \\(2\\pi\\). When \\(L\\epsilon\\) is exactly \\(2\\pi\\), HMC may return to the same possition coordinate. For nearby values of \\(L\\) and \\(\\epsilon\\), HMC may theoretically ergodic, but take very long time to move about the full state space. The potential nonergodicity problem can be solved by random choosing \\(\\epsilon\\) or L in a fairly small interval. Although in real problems interactions between variables typically prevent any exact periodicities from occurring, near periodicities might still slow HMC considerably. 10.2.3 Illustrations of HMC and Its Benefits Now illustrate some practical issues with HMC. 10.2.3.1 Trajectories for a Two-Dimensional Problem Example 1: Sampling from two variables which is bivariate Gaussian, mean of zero, standard deviations of one, and correlation 0.95. Regard this as “position” variables, and introduce two corresponding “momentum” variables which is also Gaussian (as previous demonstrate), with mean 0 and 1 sd, and 0 correlation. Then the Hamiltonian can be define as \\[ H(q, p)=q^{T} \\Sigma^{-1} q / 2+p^{T} p / 2, \\quad \\text { with } \\Sigma=\\left[ \\begin{array}{cc}{1} & {0.95} \\\\ {0.95} & {1}\\end{array}\\right] \\] 10.2.4 The benefit of avoiding random walks 最大的好处,HMC会几乎固定的朝一个方向走 10.2.5 Sampling from a 100-Dimensional Distribution 10.3 HMC in Practice and Theory Requires proper tuning of \\(L\\) and \\(\\epsilon\\). Then the tuning will be discussed below. How performance can be improved by using whatever knowledge is available regarding the scales of variables and their correlations. An additional benefit of HMC- its better scaling with dimensionality than simple Metropolis methods. 10.3.1 Effect of Linear Transformation If the variables beging sampled are transformed by multiplication by some nonsingular matrix, A. However, performance stays the same (except perhaps in terms of computation time per iteration), if at the same time the corresponding momentum variables are multiplied by \\((A^T)^{-1}\\). These facts provide insight into the operation of HMC. Also can help us improve performance when we have some knowledge of the scales and correlations of the variables. Let new variables be \\(q'=Aq\\). Then \\(P'(q')=P(A^{-1}q')/|det(A)|\\), where \\(P(q)\\) is the density for q. If the distribution for q is the canonical distribution for a potential energy function \\(U(q)\\), we can obtain the distribution for \\(q'\\) as the canonical distribution for \\(U'(q')=U(A^{-1}q')\\). 因为\\(|det(A)|\\) 是一个常数,我们需要不把log|deta(A)| 放到potential energy里面。 用之前的动能函数,但是把动量变量变成\\(p'=(A^T)^{-1}p\\),然后新的动能方程是\\(K'(p')=K(A^Tp')\\). 如果使用二次型的动能,\\(K(p)=p^{T} M^{-1} p / 2\\),则新的动能是 \\[ K^{\\prime}\\left(p^{\\prime}\\right)=\\left(A^{T} p^{\\prime}\\right)^{T} M^{-1}\\left(A^{T} p^{\\prime}\\right) / 2=\\left(p^{\\prime}\\right)^{T}\\left(A M^{-1} A^{T}\\right) p^{\\prime} / 2=\\left(p^{\\prime}\\right)^{T}\\left(M^{\\prime}\\right)^{-1} p^{\\prime} / 2 \\] 其中,\\(M^{\\prime}=\\left(A M^{-1} A^{T}\\right)^{-1}=\\left(A^{-1}\\right)^{T} M A^{-1}\\). 如果我哦们用动能变量的变形,则Hamiltonian Dynamics对于新的变量,\\((q',p')\\),足够重复原始的dynamics(对于(q,p))的,所以HMC的表现应该是一样的。为了证明这个,假设我们有对于\\((q',p')\\)的Hamiltonian dynamics,结果对于原来的变量则是 \\[ \\begin{aligned} \\frac{d q}{d t} &=A^{-1} \\frac{d q^{\\prime}}{d t}=A^{-1}\\left(M^{\\prime}\\right)^{-1} p^{\\prime}=A^{-1}\\left(A M^{-1} A^{T}\\right)\\left(A^{T}\\right)^{-1} p=M^{-1} p \\\\ \\frac{d p}{d t} &=A^{T} \\frac{d p^{\\prime}}{d t}=-A^{T} \\nabla U^{\\prime}\\left(q^{\\prime}\\right)=-A^{T}\\left(A^{-1}\\right)^{T} \\nabla U\\left(A^{-1} q^{\\prime}\\right)=-\\nabla U(q) \\end{aligned} \\] 结果和原始的Hamiltonian dynamics for (q,p)一样。 如果A是正交矩阵,\\(A^{-1}=A^{T}\\),则HMC的performance不会变,如果对q 和 p 乘以A。 如果我们给动量选择了一个旋转对称分布,with \\(M=mI\\).(momentum variables are independent, each having variance m).这些变换不会对能量函数造成变化,所以也不会对momentum的distribution造成变化。因为我们有\\(M^{\\prime}=\\left(A(m I)^{-1} A^{T}\\right)^{-1}=m I\\). 如果RWM的proposal也有rotation symmetric的性质(比如Gaussian with covariance matrix mI). Gibbs 则没有rotationally invariant. 但是,Gibbs对某个变量的rescaling是不变的。 Gibbs sampling is not rotationally invariant, nor is a scheme in which the Metropolis algorithm is used to update each variable in turn. However, Gibbs sampling is invariant to rescaling of the variables(transformation by a diagonal matrix), which is not true for HMC or random-walk Metropolis, unless the kinetic energy or proposal distribution is transformed in a corresponding way. Suppose that we have estimate, \\(\\Sigma\\), of the covariance matrix for q, and suppose also that q has at least a roughly Gaussian distribution. How can we use this information to improve the performance of HMC? One way is to transform the variables so that their covariance matrix is close to the identity, by finding the Cholesky decomposition, \\(\\Sigma=LL^T\\), which L being lower-triangular, and letting \\(q'=L^{-1}q\\). We then let our kinetic energy function be \\(K(p)=p^Tp/2\\). Since the momentum variables are independent, and the position variables are close to independent with variances close to one, HMC should perform well using trajectories with a small number of leapfrog steps, which will move all variables to a nearly independent point More realistically, the estimate \\(\\Sigma\\) may not be very good, but this transformation could still improve performance compared to using the same kinetic energy with the original q variables. An equivalent way to make use of the estimated covariance \\(\\Sigma\\) is to keep the original \\(q\\) variables, but use the kinetic energy function \\(K(p)=p^{T} \\Sigma p / 2\\), that is, we let the momentum variables have covariance \\(\\Sigma^{-1}\\). The equivalence can be seen by transforming this kinetic energy to correspond to a transformation to \\(q'=L^{-1}q\\), which gives \\(K(p')=(p')^TM'^{-1}p'\\) with \\(M^{\\prime}=\\left(L^{-1}\\left(L L^{T}\\right)\\left(L^{-1}\\right)^{T}\\right)^{-1}=I\\). 也就是说,如果我们知道position的covariance,那么通过线性变换把这个相关性除掉,然后再有动量变量设为独立的,那么就是两个独立变量的抽,这样HMC的trajectories就只用很小的leapfrog step就能遍历附近的独立的点。就算\\(\\Sigma\\)的估计不好,但是这个变换也能提高HMC的表现效果。 这个方法的限制是当矩阵很大的时候,矩阵运算很花时间。 所以对于高维问题一般用对角的covariance matrix \\(\\Sigma\\) ,相当于只考虑variable的scale而不考虑之间的相关性。 10.3.2 Tuning HMC 一个HMC应用的阻碍是需要选择合适的Leapfrog step, \\(\\epsilon\\), 以及Leapfrog steps的迭代次数,L。这两个参数共同决定了轨道的虚时间。大部分mcmc方法都有参数需要tuned,除了Gibbs,推出来全条件分布以后就能直接抽了。但是,tuning HMC比起其他简单的Metropolis会更难。 10.3.2.1 Preliminary Runs and Trace Plots 要Tuning HMC一般需要先尝试性的run几次去寻找\\(\\epsilon\\)和L。为了判断这几次run的效果如何, trace plots of quantities that are thought to be indicative of overall convergence should be examined. 但是尝试性的run也可能会造成misleading。因为\\(\\epsilon\\)和\\(L\\)的最佳选择不一定是第一次达到平稳的时候的选择。如果必要的话,HMC的每一步迭代中,\\(\\epsilon\\)和 \\(L\\)都进行随机选取。 建议多跑跑几次基于不同的随机性设置,这样比较隔绝的极大值点就更容易被探测到。和其他所有MCMC方法相比,HMC不会更少(或者更多)容易被隔离的极值点所影响。如果找到了有隔离的极值点存在,需要做一些事情去处理这个问题。只是把约束在单个极值点的链组合起来是不可取的,可以尝试使用沿着解集轨道“tempering”回火结合的HMC进行处理多个极值点问题。 10.3.2.2 What Stepsize? 选取一个合适的leapfrog stepsize \\(\\epsilon\\)也是很重要的。 太大的stepsize会导致很低的接受率,太小的stepsize会浪费很多计算时间。 幸运的是,stepsize的选择几乎与要走多少步(L)独立。Hamiltonian的误差,基本上不会因为多走几步而增加,所以stepsize如果足够小的话,这个dynamics会很稳定。 Stability的问题可以从一个一维的问题中看出 \\[ H(q, p)=\\frac{q^{2}}{2 \\sigma^{2}}+\\frac{p^{2}}{2} \\] q的分布是\\(N(0,\\sigma^2)\\) ,对这个系统的leapfrog step是一个线性映射从\\((q(t),p(t))\\)到\\((q(t+\\epsilon),p(t+\\epsilon))\\). 从之前Leapfrog方法的定义式中可以dehumidifier,这个线性变换可以表述成如下的矩阵形式: \\[ \\left[ \\begin{array}{c}{q(t+\\varepsilon)} \\\\ {p(t+\\varepsilon)}\\end{array}\\right]=\\left[ \\begin{array}{cc}{1-\\varepsilon^{2} / 2 \\sigma^{2}} & {\\varepsilon} \\\\ {-\\varepsilon / \\sigma^{2}+\\varepsilon^{3} / 4 \\sigma^{4}} & {1-\\varepsilon^{2} / 2 \\sigma^{2}}\\end{array}\\right] \\left[ \\begin{array}{l}{q(t)} \\\\ {p(t)}\\end{array}\\right] \\] 所以这个映射到底是会有一个稳定的轨道呢,还是会发散到无穷,由这个线性变换的矩阵的特征值所决定,而这个特征是 \\[ \\left(1-\\frac{\\varepsilon^{2}}{2 \\sigma^{2}}\\right) \\pm\\left(\\frac{\\varepsilon}{\\sigma}\\right) \\sqrt{\\varepsilon^{2} / 4 \\sigma^{2}-1} \\] 当\\(\\frac{\\epsilon}{\\sigma}>2\\)时,特征值是实数,所以存在至少一个绝对值大于1的根。此时,解的轨道使用Leapfrod 方法用这个\\(\\epsilon\\)算就会unstable。当\\(\\epsilon/\\sigma<2\\)时,特征值是复数,然后有如下性质: \\[ \\left(1-\\frac{\\varepsilon^{2}}{2 \\sigma^{2}}\\right)^{2}+\\left(\\frac{\\varepsilon^{2}}{\\sigma^{2}}\\right)\\left(1-\\frac{\\varepsilon^{2}}{4 \\sigma^{2}}\\right)=1 \\] 所以Trajectory此时就是stable的。 对于多元的问题,动能函数选择\\(K(p)=p^Tp/2\\),则stability的限制就会由distribution的宽度,在最受限制的方向。比如说对于Gaussian分布,这个东西将会是q的协方差矩阵最小特征值的平方根。而如果使用带质量阵的动能函数\\(K(p)=p^{T} M^{-1} p / 2\\),则可以使用之前提到的线性变换的方法变成上面这种情况再去求特征值。 如果使用一个会产生不稳定轨道的\\(\\epsilon\\),那么H的值就会随着L以指数级增大,并且会导致接受概率极端的小。对于低维的问题,使用一个稍微比stability limit小的\\(\\epsilon\\)也足够能得到一个比较好的接受率。但是对于高维就不一样,stepsize可能需要reduced further than this才能让error in H to a level that produces a good acceptance probability. 如果使用过大的\\(\\epsilon\\),那么可能会对HMC的表现产生非常不好的影响。在这种情况下,HMC会对tuning更加的敏感,甚至超过RWM。RWM需要选择一个scaling,作为random walk的deviation,但是表现的变化是相对很光滑的,而不像HMC当\\(\\epsilon\\)超出stability limit的范围之后,会急剧的下降。(但是高维的情况RWM也会变化的很剧烈,所以这个区别并不是很明显)。 当Stepsize太大导致的HMC表现急剧下降不是一个很重要的问题如果stability limit是一个常数,这个问题很明显来自于实验性的run,所以可以被修复。Real danger是stability limit may differ for several regions of the state space that all have substantial probability. 当preliminary runs开始于一个stability limit很大的区域,那么去一个稍微小一点的\\(\\epsilon\\)可能会比较合适。但是如果对于其他区域这个\\(\\epsilon\\)还是超过了stability limit,那么可能HMC永远不会访问这个区域,甚至这个区域有不可忽略的概率也不行,会导致非常严重的错误。为什么这会发生呢,考虑如果run ever does visit the region where the chosen \\(\\epsilon\\) would produce instability, it wills tay there for a very long time, since the acceptance probability with that \\(\\epsilon\\) will be very small.如果run到了这个区域,会因为接受概率一直很小所以就呆在这里很长时间。尽管HMC产出的是稳定分布,但是基本上不会从一个\\(\\epsilon\\)是stability的移动的这边。一个例子是如果从一个very light tails的分布进行抽样,(lighter than Gaussian distribution),for which the log of the density will fall faster than quadratically. In the tails, the gradient of the log density will be large, and a small stepsize will be needed for stability. 这个在Langevin method中有详细描述。 这个问题可以通过选取一个随机的\\(\\epsilon\\)进行减轻。即便这个分布的均值太大,合适的小的\\(\\epsilon\\)也有可能偶尔会选到。随机选择步长应该处于计算轨道之前,而不是每次更新轨道leapfrog step的时候。有一个“捷径”方法能够当随机选取的步长不合适的时候节约计算时间。 10.3.2.3 What Trajectory Length? 选取一个合适的轨道长度对于一个要系统的,而不是随机游走的探索状态空间的HMC非常重要。很多分布非常难以取样因为他们紧紧的在某些方向被约束住了,但是在其他方向的约束没那么紧。用足够长的轨道就能探索没那么多限制的方向。 但是轨道也可能过长。 对于更复杂的问题,不能用肉眼观察轨道的长度选取的如何,找到变量的线性组合使得约束最小会非常困难,甚至于不可能,当最小的方向其实是一个非线性曲线或者平面的时候。 所以对tajectory长度的试错在某些情况就很有必要。假设对于某些非常困难的问题,用一个L=100的轨道可能会合适starting point。如果实验性的run(和某些合适的\\(\\epsilon\\)),显示HMCHMC达到了几乎独立的点在一次迭代之后,一个更小的L值可能就应该在下一步的时候使用。如果是是L=100还是有很高的自相关的话,可能得在下一次迭代中尝试\\(L=1000\\). 之后会讨论随机变化的轨道长可能很有必要。这样可以避免选到一个靠近周期性的长度。 10.3.2.4 Using Multiple Stepsizes. 所以可以通过找相对的variable的scales去提高HMC的表现。这可以通过两个等价的方法得到。第一个是如果\\(s_i\\)是\\(q_i\\)合适的scale,我们可以变换q,使得\\(q_{i}^{\\prime}=q_{i} / s_{i}\\).或者也可以用另外一个动能函数\\(K(p)=p^TM^{-1}p\\),其中M的对角元是\\(m_i=1/s_i^2\\). 第三种等价方式是最方便的,是使用不同的stepsizes for different pairs of position and momentum variables. 考虑一个leapfrog update with \\(m_i=1/s_i^2\\): \\[ \\begin{aligned} p_{i}(t+\\varepsilon / 2) &=p_{i}(t)-(\\varepsilon / 2) \\frac{\\partial U}{\\partial q_{i}}(q(t)) \\\\ q_{i}(t+\\varepsilon) &=q_{i}(t)+\\varepsilon s_{i}^{2} p_{i}(t+\\varepsilon / 2) \\\\ p_{i}(t+\\varepsilon) &=p_{i}(t+\\varepsilon / 2)-(\\varepsilon / 2) \\frac{\\partial U}{\\partial q_{i}}(q(t+\\varepsilon)) \\end{aligned} \\] Define \\((q^{(0)},p^{(0)})\\) 为leapfrog的开始点,然后定义\\(\\left(q^{(1)}, p^{(1)}\\right)\\) to be the final state (i.e.\\((q(t+\\varepsilon), p(t+\\varepsilon))\\)),定义\\(p^{(1/2)}\\)作为半途的动量。我们现在重新写leapfrog step如下: \\[ \\begin{aligned} p_{i}^{(1 / 2)} &=p_{i}^{(0)}-(\\varepsilon / 2) \\frac{\\partial U}{\\partial q_{i}}\\left(q^{(0)}\\right) \\\\ q_{i}^{(1)} &=q_{i}^{(0)}+\\varepsilon s_{i}^{2} p_{i}^{(1 / 2)} \\\\ p_{i}^{(1)} &=p_{i}^{(1 / 2)}-(\\varepsilon / 2) \\frac{\\partial U}{\\partial q_{i}}\\left(q^{(1)}\\right) \\end{aligned} \\] 这是一个标准的leapfrog更新过程,以下就加上rescaled momentum variables: \\(\\tilde{p}_{i}=s_{i} p_{i}\\) 并且有步长\\(\\epsilon_i=s_i\\epsilon\\), 我们可以写leapfrog update如下: \\[ \\begin{aligned} \\tilde{p}_{i}^{(1 / 2)} &=\\tilde{p}_{i}^{(0)}-\\left(\\varepsilon_{i} / 2\\right) \\frac{\\partial U}{\\partial q_{i}}\\left(q^{(0)}\\right) \\\\ q_{i}^{(1)} &=q_{i}^{(0)}+\\varepsilon_{i} \\tilde{p}_{i}^{(1 / 2)} \\\\ \\tilde{p}_{i}^{(1)} &=\\tilde{p}_{i}^{(1 / 2)}-\\left(\\varepsilon_{i} / 2\\right) \\frac{\\partial U}{\\partial q_{i}}\\left(q^{(1)}\\right) \\end{aligned} \\] 这就是像一个leapfrog update对于\\(m_i=1\\),但是对于不同的\\((q_i,p_i)\\)对有不同的stepsize。当然,这个连续的\\((q,\\tilde p)\\) 不在被解释为一个Hamiltonian dynamics在一致的时间点,但是没有结论说使用这些HMC的轨道。注意当我们在计算轨道之前抽栋梁的时候,每一个\\(\\tilde p_i\\) 是独立的来自于Gaussian分布(\\(N(0,I)\\)),而不是\\(s_i\\). 这个多个stepsize的方法一般来说是最便利的,特别是估计的scales,\\(s_i\\)不是固定的,然后momentum只是partially refreshed。 10.3.3 Combining HMC with Other MCMC Updates 对于某些问题,MCMC孤立的使用HMC是不可能或者不受欢迎的。两个情况Non-HMC更新将会很有必要,1当某些变量是离散的,2当log probability density的导数很难计算或者不可能计算的时候。 这种情况的时候,HMC可以只对其他变量生效。另外一个例子是当特殊的MCMC更新过程可以帮助MCMC收敛,同时HMC不会,比如说,从两个孤立的极值点之间移动。但是并不会完全代替HMC。下面将会讨论,比如说Bayesian hierarchical models可能用HMC结合Gibbs最好。 在这些情况里,一个或者多个HMC updates对于所有变量,或者变量的子集可以代替一个或者多个更新,从而使得对于全部变量的联合分布保持稳定invariant. 没看懂 The HMC updates can be viewed as either leaving this same joint distribution invariant, or as leaving invariant the conditional distribution of the variables that HMC changes, given the current values of the variables that are fixed during the HMC udpate. These are equivalent views. When both HMC and other updates are used, it may be best to use shorter trajectories for HMC than would be used if only HMC were being done. This allows the other updates to be done more often, which presumably helps sampling. Finding the optimal tradeoff is likely to be difficult, however, A variation on HMC that reduces the need for such a tradeoff is described below. 10.3.4 Scaling with Dimensionality Main benefits of HMC was illustrated, its ability to avoid the inefficient exploration of the state space via a random walk. 10.3.4.1 Creating Distributions of Increasing Dimensionality by Replication How performance scales with dimensionality, assume something about how the distribution changes with dimensionality, \\(d\\). Dimensionality increases by adding independent replicas of variables- that is, the potential energy function for \\(q=(q_1,...,q_d)\\) has the form \\(U(q)=\\sum u_i(q_i)\\), for functions \\(u_i\\) drawn independently from some distribution. 实际情况肯定不是这样,但是这样可以作为一个有道理的模型来看看提高维度对某些问题的影响。 虽然在独立的情况下Gibbs表现的非常好,对每个变量进行Metropolis update 也很好,但是这两个算法都是not invariant to rotation. 所以对于某些特殊问题表现可能就不会太好。 10.3.4.2 Scaling of HMC and Random-Walk Metropolis Discuss informally how well HMC and random-walk Metropolis scale with dimension, loosely following Creutz. The following relationship holds when any Metropolis-style algorithm is used to sample a density \\(P(x)=(1 / Z) \\exp (-E(x))\\): \\[ 1=\\mathrm{E}\\left[P\\left(x^{*}\\right) / P(x)\\right]=\\mathrm{E}\\left[\\exp \\left(-\\left(E\\left(x^{*}\\right)-E(x)\\right)\\right)\\right]=\\mathrm{E}[\\exp (-\\Delta)] \\] 当x是目前的状态,假设分布是\\(P(x)\\),\\(x^*\\) 是proposed state,任意的\\(\\Delta=E\\left(x^{*}\\right)-E(x)\\). Jensen’s inequality暗示了能量的区别是非负的 \\[ \\mathrm{E}[\\Delta] \\geq 0 \\] 不等式一般情况下是严格的。 有个问题,怎么通过Jesen不等式得到这个? Jesen不等式的形式:\\(\\varphi(\\mathrm{E}[X]) \\leq \\mathrm{E}[\\varphi(X)]\\) 怎么和\\(\\Delta\\)那个形式联系起来?感觉好奇怪.想复杂了,就是因为那个1的问题,然后用jesen不等式,也就是函数在里面是1,所以函数在外面就大于等于log(1)=0 When \\(U(q)=\\sum u_i(q_i)\\) and proposals are produced independently for each i, we can apply these relationship either to a single variable (or pair of variables) or to the entire state. For a single variable(pair), write \\(\\Delta_1\\) for \\(E(x^*)-E(x)\\), with \\(x=q_i\\) and \\(E(x)=u_i(q_i)\\) 中间过程没怎么看懂,看看结论先。 With increasing dimension by replicating variables will lead to increasing energy differences, since \\(\\Delta _d\\) is the su of \\(\\Delta_1\\) for each variable, each of which has positive mean. This will lead to a decrease in the acceptance probability , which is \\(min(1,exp(-\\Delta_d))\\) unless the width of the proposal distribution or the leapfrog stepsize is decreased to compensate. 后面的大致意思是对于RWM维度越高,那么对于目前状态和proposal的state,potential energy的差异是每个变量造成的差异之和。如果使用固定的标准差\\(\\zeta\\),potential energy difference的mean and variance随着d增大线性增长。这样会导致很低的接受率。所以为了保证能接受的表现,标准差应该随着d增加而降低,iteration次数应该需要保证几乎独立的点。 而和RWM类似,HMC的话,q是独立的,使用能量函数\\(K(p)=\\Sigma p_{i}^{2} / 2\\),不同的\\((q_i,p_i)\\) 组都能服从Hamiltonian dynamics通过potential energy 用q,Kinetic energy用p。而解轨道中的虚拟世界虚拟时间不会随着维度增加而增加(我觉得是指\\(\\epsilon L\\)不受维度影响?)但是使用leapfrog方法得到的proposal state的接受概率是the sum of the errors pertaining to each \\((q_i,p_i)\\) pair. 对于固定的步长\\(\\epsilon\\),一个固定的轨道长度,\\(\\epsilon L\\), mean and variance of the error in H grow linearly with d. This will also lead a progressively lower acceptance rate as dimensionality increases. 所以为了判断哪个方法更好需要决定我们需要多迅速的改变\\(\\zeta\\) 和 \\(\\epsilon\\) 当d 增加的时候。当d增加的时候,那么\\(\\eta\\)和\\(\\epsilon\\)会趋向于0,同时\\(\\Delta_1\\) 将会也趋于0。使用二阶逼近\\(exp(-\\Delta_1)\\),则有\\(1-\\Delta_{1}+\\Delta_{1}^{2} / 2\\) ,则带入上式就有 \\[ \\mathrm{E}\\left[\\Delta_{1}\\right] \\approx \\frac{\\mathrm{E}\\left[\\Delta_{1}^{2}\\right]}{2} \\] 则对于\\(\\Delta_1\\)方差,是均值的2倍,那么对于\\(\\Delta_d\\)的方差也是其2倍均值。(even when \\(\\Delta_d\\) is not small). 为了达到一个好的接受率,我们必须保证\\(\\Delta_d\\)在1附近,since a large mean will not be saved by a similarly large standard deviation(which would produce fairly frequent acceptances as \\(\\Delta_d\\) occasionally takes on a negative value). It follows that \\(E[\\Delta_d]\\) is proportional to \\(d^2_\\zeta\\), wo we can maintain a reasonable acceptance rate by letting \\(\\zeta\\) be proportional to \\(d^{-1/2}\\). The number of iterations needed to reach a nearly independent point will be proportional to \\(\\zeta^{-1/2}\\). The number of iterations needed to reach a nearly independent point will be proportional to \\(\\zeta^{-2}\\), which will be proportional to d. 10.3.4.3 Optimal Acceptance Rates 继续扩展以上的理论,我们可决定如果基于最优的tuning parameter\\(\\zeta\\)或者\\(\\epsilon\\),对应的接受率是多少。 为了找到这个接受率,我们先注意Metropolis methods满足Detailed Balance条件,接受概率对于正负\\(\\Delta_d\\)是一样的。因为所有的proposal对于负的\\(\\Delta_d\\) 都接受,所以接受率一般来说是简单的两倍概率对于proposal有一个负的\\(\\Delta_d\\).对于一个很大的d,中心极限定理暗示了\\(\\Delta_d\\)的分布是Gaussian,因为其为d个独立的\\(\\Delta_1\\)的值。 因为\\(\\Delta_d\\)的方差是二倍均值,\\(E[\\Delta_d]=\\mu\\). 则接受概率能写成如下形式: 对于大的d有: \\[ P(\\text { accept })=2 \\Phi\\left(\\frac{(0-\\mu)}{\\sqrt{2 \\mu}}\\right)=2 \\Phi(-\\sqrt{\\mu / 2})=a(\\mu) \\] \\(\\Phi(z)\\) 是标准高斯的累计分布函数。 对于RWM,得到一个独立的点的花销与\\(1/(a\\zeta^2)\\)成比例。我们可以从上面看出\\(\\mu=E[\\Delta_d]\\) 与\\(\\zeta^2\\)成比例,所以花销服从以下比例 \\[ C_{\\mathrm{rw}} \\propto \\frac{1}{(a(\\mu) \\mu)} \\] 数值计算发现这个极小当\\(\\mu=2.8\\)和\\(a(\\mu)=0.23\\) The same optimal \\(23\\%\\) acceptance rate for random-walk Metropolis was previously obtained using a more formal analysis by Roberts et al (1997). The optimal 65% acceptance rate for HMC . 10.3.4.4 Exploring the Distribution of Potential Energy 更好的Scaling会让HMC的表现取决于resampling of the momentum variables. 我们可以看到考虑how well the methods explore the distribution of the potential energy, \\(U(q)=\\sum u_i(q_i)\\) .用动量的变量去探索能量函数。 等等,能量不应该是distribution of interest吗?得返回去看看 Because \\(U(q)\\) is a sum of d independent terms, its standard deviation will grow in proportion to \\(d^{1/2}\\). 根据Caracciolo et al.(1994),expected change in potential energy from a single Metropolis update will be no more than order 1- intuitively, large upwards canges are unlikely to be accepted, and since Metropolis updates satisfy detailed balance, large downward changes must also be rare(in equilibrium). Because changes in \\(U\\) will follow a random walk (due again to detailed balance). 因为U中的改变将会服从一次随机游走(也是一万年detailed balance),it will take at least order \\((d^{1/2}/1)^2=d\\) Metropolis updates to explore the distribution of \\(U\\). 在HMC的第一步中,动量的重抽量一般贵改变潜在能量by an amount that is proportional to \\(d^{1/2}\\),因为kinetic energy is also a sum of d independent terms, and hence has standard deviation that grows as \\(d^{1/2}\\) (more precisely, its standard deviation is \\(d^{1/2}/2^{1/2}\\)). If the second step of HMC proposes a distant point this change in kinetic energy (and hence in H) will tend, by the end of the trajectory, to have become equally split between kinetic and potential energy. If the endpoint of this trajectory is accepted, the change in potential energy from a single HMC iteration will be proportional to \\(d^{1/2}\\), comparable to its overall range of variation. So, in contrast to random-walk Metropolis, we may hope that only a few HMC iterations will be sufficient to move to a nearly independent point, even for high-dimensional distributions. Analyzing how well method explore the distribution of \\(U\\) can also provide insight into their performance on distributions that are not well modeled by replication of variables, as we will see in the next section. 这个order又是怎么一回事。。。 10.3.5 HMC for Hierarchical Models Many Bayesian models are defined hierarchically. A large set of low-level parameters have prior distributions that are determined by fewer higher-level “hyperparameters,” which in turn may have priors determined by fewer higher-level“hyperparameters,” which in turn may have priors determined by yet-higher-level hyperparameters. 可以应用HMC于这些模型通过一个很显然的办法: (注意对variance hyperparameters做一个log变换,使得无限制)。然而,最好应用HMC仅仅对lower-level parameters, 原因以下会讨论,(5.4.3一般性的讨论了如何applying HMC对variable的子集。) Bayesian neural network model会作为一个例子。这些模型一般来讲有几组low-level parameters,每个associated variance hyperparameter. The posterior distribution of these hyperparameters reflects important aspects of the data,比如说哪些变量和目标任务最相关。The efficiency with which values for these hyperparameters are sampled from the posterior distribution can often determine the overall efficiency of the MCMC method. 抽样hyperparameter的efficiency决定了MCMC的全局效率?? I use HMC only for the low-level parameters in Bayesian neural network models, with the hyperparameters being fixed during an HMC update. These HMC updates alternate with Gibbs sampling updates of the hyperparameters, which (in the simpler versions of the models) are independent given the low-level parameters, and have conditional distributions of standard form. 所以用HMC的只是“低阶”变量,超参数则用Gibbs更新,一个混合的过程。The leapfrog stepsizes used can be set using heuristic that are based on the current hyperparameter values. (I use the multiple stepsize approach described at the end of Section 5.4.2, equivalent to using different mass values, \\(m_i\\), for different parameters.)leapfrog方法的步长用启发式方法进行决定,基于目前的超参数。比如说,the size of the network “weights” on connections out of a “hidden unit” determine how sensitive the likelihood function is to changes in weights on connections into the hidden unit; the variance of the weights on these outgoing connections. If the hyperparameters were changed by the same HMC updates as change the lower-level parameters, using them to set stepsizes would not be valid, since a reversed trajectory would use different stepsizes, and hence not retrace the original trajectory. Without a good way to set setpsizes, HMC for the low-level parameters would likely be much less efficient. 真不是很懂好奇怪,大概emmmm,这样更新会有点问题,所以只用一部分? Choo(2000) bypassed (跳过了?) this problem by using a modification of HMC in which trajectories are simulated by alternating leapfrog steps tha update only the hyperparameters with leapfrog steps that update only the low-level parameters. This procedure maintains both reversibility and volume preservation. However, performance did not improve as hoped because of a second issue with hierarchical models. In these Bayesian neural network models, and many other hierarchical models, the joint distribution of both low-level parameters and hyperparameters is highly skewed. 10.4 Extensions of and Variations on HMC Modify the basic HMC algorithm can be done in many ways. Improve its efficiency, make it useful for a wider range of distribution. Alternatives to the leapfrog discretization of Hamilton’s equation will be discussed. Also how the HMC handle distributions with constraints on the variables. Another algorithm that leapfrog step only done once, which may be useful when not all variables are updated by HMC. Computing a short cut method that avoids computing the whole trajectory when the stepsize is inappropriate. 10.4.1 Discretization by Splitting: Handling constraints and Other Applications Leapfrog method 不是解决Hamilton’s等式,保证volume和reversible的唯一方法。所以也有其他方法可以用于HMC。 很多“symplectic integration method”被设计出来了,很多用于其他方面的应用而不是HMC(比如模拟太阳系几百万年的运行用于检验稳定性。)设计一个有更高阶的准确度的方法是可能的,比如(McLachlan and Atela,1992)。使用这些方法在HMC上,当维度增加时HMC的会渐进地产出更好的结果。尽管如此,还是值得研究Hamiltonian dynamics如何被模拟,这样才能指出如何处理variable的constrain,也可以提高探索偏分析的结果。 10.4.1.1 Splitting the Hamiltonian Many symplectic discretization of Hamiltonian dynamics can be derived by “splitting” the Hamiltonian into several terms, and then, for each term in succession, simulating the dynamics defined by that term for some small time step, 很多辛离散化Hamiltonian dynamics可以用“splitting” Hamiltonian 成为几项导出,然后对于每项一连串的模拟dynamics由定义一些小的时间步骤,重复这个过程知道达到要求的总模拟时间。如果每一项的模拟可以解析的得到,那么我们可以得到辛逼近对dynamics如果可以做到的话。 这个更一般的方法可以从Leimkuhler and Reich (2004,section4.2)和(Sexton and Weingarten (1992))找到. 假设Hamiltonian可以写成k个项的和 \\[ H(q, p)=H_{1}(q, p)+H_{2}(q, p)+\\cdots+H_{k-1}(q, p)+H_{k}(q, p) \\] 假设我们能 exactly 实现Hamiltonian dynamics 基于每个\\(H_i\\),for \\(i=1,...,k\\), with \\(T_{i,\\epsilon}\\) 作为一个映射定义在应用dynamics基于\\(H_i\\) for time \\(\\epsilon\\). As shown by Leimkuhler and Reich, 如果\\(H_i\\) 二阶可微,那么这些映射的组合\\(T_{1, \\varepsilon} \\circ T_{2, \\varepsilon} \\circ \\cdots \\circ T_{k-1, \\varepsilon} \\circ T_{k, \\varepsilon}\\),是一个可行的离散化Hamiltonian dynamics 基于H。 这个会产生exact dynamics in the limit \\(\\epsilon\\) goes to zero, with global error of order \\(\\epsilon\\) or less. 更多的,这个离散化将会保volume,也是symplectic的,因为这些性质对每个\\(T_{i,\\epsilon}\\)都成立。这个离散化也是reversible的,如果\\(H_{i}(q, p)=H_{k-i+1}(q, p)\\).就如前文所提到的,所有的reversible的方法都有偶数阶的global error in \\(\\epsilon\\),意味着global error must be of order \\(\\epsilon^2\\),或者更好。 我们可以推出leapfrog方法来自于对称分解Hamiltonian,如果有\\(H(q, p)=U(q)+K(p)\\),那么我们可以把Hamiltonian写成 \\[ H(q, p)=\\frac{U(q)}{2}+K(p)+\\frac{U(q)}{2} \\] 与之对应的分解是\\(H_{1}(q, p)=H_{3}(q, p)=U(q) / 2\\)和 \\(H_{2}(q, p)=K(p)\\),则此时Hamiltonian dynamics基于\\(H_1\\)就是 \\[ \\begin{aligned} \\frac{d q_{i}}{d t} &=\\frac{\\partial H_{1}}{\\partial p_{i}}=0 \\\\ \\frac{d p_{i}}{d t} &=-\\frac{\\partial H_{1}}{\\partial q_{i}}=-\\frac{1}{2} \\frac{\\partial U}{\\partial q_{i}} \\end{aligned} \\] 应用这个dynamics for time \\(\\epsilon\\),just adds \\(-(\\epsilon/2)\\partial U / \\partial q_{i}\\) to each \\(p_i\\), which is the first part of a leapfrog step. The dynamics based on \\(H_2\\) is as follows: \\[ \\begin{aligned} \\frac{d q_{i}}{d t} &=\\frac{\\partial H_{2}}{\\partial p_{i}}=\\frac{\\partial K}{\\partial p_{i}} \\\\ \\frac{d p_{i}}{d t} &=-\\frac{\\partial H_{2}}{\\partial q_{i}}=0 \\end{aligned} \\] If \\(K(p)=\\frac{1}{2}\\sum p_i^2/m_i\\), applying this dynamics for time \\(\\epsilon\\) results in adding \\(\\epsilon p_i/m_i\\) to each \\(q_i\\) which is the second part of a leapfrog step. Finally, \\(H_3\\) produces the third part of a leapfrog step (Equation 5.20), which is the same as the first part, since \\(H_3=H_1\\). 以下感觉吃力了,需要重新理清楚之前的理论才好深入,所以先泛读一遍吧,笔记就不写了 10.4.1.2 Handling Constraints "], +["bayes-variable-selection.html", "Chapter 11 Bayes variable selection 11.1 Prior Specification 11.2 Summaries the posterior distribution and model averaged inference 11.3 Numerical Methods", " Chapter 11 Bayes variable selection The model uncertainty problem with the \\(2^p\\) competing models: \\[ M_{\\gamma} : \\boldsymbol{y}=\\alpha \\mathbf{1}_{n}+\\boldsymbol{X}_{\\gamma} \\boldsymbol{\\beta}_{\\gamma}+\\boldsymbol{\\varepsilon} \\] The Null model: \\[ M_{0} : y=\\alpha 1_{n}+\\varepsilon \\] Assuming that one of the models in \\(\\mathcal M\\) is the true model, the posterior probability of any model is \\[ \\operatorname{Pr}\\left(M_{\\gamma^{*}} | \\boldsymbol{y}\\right)=\\frac{m_{\\gamma *}(\\boldsymbol{y}) \\operatorname{Pr}\\left(M_{\\gamma^{*}}\\right)}{\\sum_{\\gamma} m_{\\gamma}(\\boldsymbol{y}) \\operatorname{Pr}\\left(M_{\\gamma}\\right)} \\] where \\(Pr(M_\\gamma)\\) is the prior probability of \\(M_\\gamma\\) and \\(m_\\gamma\\) is the integrated likelihood with respect to the prior \\(\\pi_\\gamma\\): \\[ m_{\\gamma}(\\boldsymbol{y})=\\int f_{\\gamma}\\left(\\boldsymbol{y} | \\boldsymbol{\\beta}_{\\gamma}, \\alpha, \\sigma\\right) \\pi_{\\gamma}\\left(\\boldsymbol{\\beta}_{\\gamma}, \\alpha, \\sigma^{2}\\right) d \\boldsymbol{\\beta}_{\\gamma} d \\alpha d \\sigma^{2} \\] 也就是把所有不确定因素积掉以后,y的分布函数,also called the (prior) marginal likelihood. For \\(\\gamma=0\\) this integrated lkelihood becomes: \\[ m_{0}(\\boldsymbol{y})=\\int f_{0}(\\boldsymbol{y} | \\alpha, \\sigma) \\pi_{0}\\left(\\alpha, \\sigma^{2}\\right) d \\alpha d \\sigma^{2} \\] 可以用Bayes factor替换 integrated likelihood function: \\[ \\operatorname{Pr}\\left(M_{\\gamma^{*}} | \\boldsymbol{y}\\right)=\\frac{B_{\\gamma^{*}}(\\boldsymbol{y}) \\operatorname{Pr}\\left(M_{\\gamma^{*}}\\right)}{\\sum_{\\gamma} B_{\\gamma}(\\boldsymbol{y}) \\operatorname{Pr}\\left(M_{\\gamma}\\right)} \\] As stated in the introduction, we are mainly interested in software that implements the formal Bayesian answer which implies that we use the posterior distribution. Due to the following three aspects The priors that the package accomodates, that is, \\(\\pi_{\\gamma}\\left(\\boldsymbol{\\beta}_{\\gamma}, \\alpha, \\sigma^{2}\\right)\\) and \\(Pr(M_\\gamma)\\) the tools provided to summarize the posterior distribution and obtain model averaged inference the numerical methods implemented to compute the posterior distribution 11.1 Prior Specification The two inputs that are needed to obtain the posterior distribution are \\(\\pi_\\gamma\\) and \\(Pr(M_r)\\) the \\(2^p\\) prior distributions for the parameters within each model and the prior distribution over the model space, respectively. 不失一般性,先验分布\\(\\pi_\\gamma\\) 可以写成 \\[ \\pi_{\\gamma}\\left(\\boldsymbol{\\beta}_{\\gamma}, \\alpha, \\sigma^{2}\\right)=\\pi_{\\gamma}\\left(\\boldsymbol{\\beta}_{\\gamma} | \\alpha, \\sigma^{2}\\right) \\pi_{\\gamma}\\left(\\alpha, \\sigma^{2}\\right) \\] ,也就是coefficient,intercept,和variance的先验, 基于最方便的方法,基础Jeffreys’ prior is used for the parameters that are common to all models: \\[ \\pi_{\\gamma}\\left(\\alpha, \\sigma^{2}\\right)=\\sigma^{-2} \\] 对于\\(\\beta\\),则要么使用正态,要么使用mixture正态,中心点在0. (“by reasons of similarity”,Jeffreys,1961) and scaled by \\(\\sigma^{2}\\left(\\boldsymbol{X}_{\\gamma}^{t} \\boldsymbol{X}_{\\gamma}\\right)^{-1}\\), “a matrix suggested by the form of the information matrix.” times a factor g, normally called a “g-prior”. 目前的研究表明这样的方便的prior拥有一系列的最优的性质可以扩展到对超参数g做特殊的先验。 Among these properties are invariance under affine transformations of the covariates, several types of predictive matching and consistency ( Bayarri et al., 2002). The specification of g has inspired many interesting studies in the literature. Of these, we have collected the most popular one in Table1. Relatedd with the conventional priors, which inspired by asymptotically reproducing the popular Bayesian Information Criterion (Schwarz,1978). Raftery proposes using the same covariance matrix as the Unit Information Prior, but with mean the maximum likelihood estimator \\(\\hat\\beta_\\gamma\\) (instead of the zero mean of the conventional prior). Other priors specifically used in model uncertainty problems are the spike and slab priors. Assume that the components of \\(\\beta\\) are independent, each having a mixture of two distributions: one highly concentrated on zero (the spike) and the other one quite disperse (the slab). There are two different developments of this idea in the literature. There are two different developments of this idea. Original version is Mitchell and Beauchamp (1988), the spike is a degenerate distribution at zero so this fits with what we have called the formal approach. Another proposal by George and McCulloch (1993) which the spike is a continuous distribution with a small variance also received a lot of attention, perhaps for computational advantages. 模型空间的prior,\\(\\mathcal M\\), 一个非常受欢迎的起点是 \\[ Pr(M_\\gamma|\\theta)=\\theta^{p_\\gamma}(1-\\theta)^{p-p_\\gamma} \\] where \\(p_\\gamma\\) is the number of covariates in \\(M_\\gamma\\) and the hyperparameter \\(\\theta\\in(0,1)\\) has the interpretation of the common probability that a given variable is included (independently of all others). Among the most popular default choice for \\(\\theta\\) are Fixed \\(\\theta=1/2\\), which assign equal prior probability to each model i.e. \\(\\operatorname{Pr}\\left(M_{\\gamma}\\right)=1 / 2^{p}\\) Random \\(\\theta \\sim \\operatorname{Unif}(0,1)\\), giving euqal probability to each possible number of covariates or model size. 一般来说,固定的\\(\\theta\\)会在多样性上表现非常差,在测试中,伪造的解释变量经常在结果中出现,然后lead to 更有信息的先验。这套个情况可以用随机的\\(\\theta\\)进行避免,第二个proposal 见Scott and Berger (2010.) Lay and Steel(2009) 考虑使用\\(\\theta\\sim Beta(1,b)\\) 可以导出binomial-beta 先验 for the number of covariates in the model or the model size,W: \\[ \\operatorname{Pr}(W=w | b) \\propto \\left( \\begin{array}{c}{p} \\\\ {w}\\end{array}\\right) \\Gamma(1+w) \\Gamma(b+p-w), w=0,1, \\ldots, p \\] 注意\\(b=1\\) 时退化到uniform prior on \\(\\theta\\) and also on \\(W\\). Ley and Steel (2009), this setting is useful to incorporate prior information about the mean model size, say \\(w^*\\). This would translate into \\(b=(p-w^*)/w^*\\). 11.2 Summaries the posterior distribution and model averaged inference The simplest summary of the posterior model distribution is its mode \\[ \\underset{\\gamma}{\\arg \\max } \\operatorname{Pr}\\left(M_{\\gamma} | \\boldsymbol{y}\\right) \\] This model is the model most supported by the information (data and prior) This is normally called HPM(jighest posterior model) or MAP (maximum a posteriori) model. When p is moderate to large, posterior inclusion probabilities (PIP) are very useful. \\[ \\operatorname{Pr}\\left(\\gamma_{i}=1 | \\boldsymbol{y}\\right)=\\sum_{x_{i} \\in M_{\\gamma}} \\operatorname{Pr}\\left(M_{\\gamma} | \\boldsymbol{y}\\right) \\] 这个可以理解成每个变量解释response的重要性。 这个概率也可以用于定义另外一个summary,叫median probability model(MPM) which is the model containing the covariates with inclusion probability larger than 0.5. This model in some case is optimal for prediction. 11.3 Numerical Methods Bayes Lasso: 目前已知问题和需要解决的问题: 预计写作时间:19分钟 首先,解决了一半的问题:\\(\\beta\\) 的先验问题。 由Bayes Lasso的完整版notes(相对于论文) 其中,Hierachical model部分,\\(\\beta|\\tau_1^2,…,\\tau^2_p\\sim N_p(0_p,D_\\tau)\\) 是成立的(根据原始式子推出来的),但是和论文suggest的hierarchical representation of the full model: \\[ \\begin{aligned} \\boldsymbol{y} | \\mu, \\boldsymbol{X}, \\boldsymbol{\\beta}, \\sigma^{2} & \\sim N_{n}\\left(\\mu \\mathbf{1}_{n}+\\boldsymbol{X} \\boldsymbol{\\beta}, \\sigma^{2} \\boldsymbol{I}_{n}\\right) \\\\ \\boldsymbol{\\beta} | \\tau_{1}^{2}, \\ldots, \\tau_{p}^{2}, \\sigma^{2} & \\sim N_{p}\\left(\\mathbf{0}_{p}, \\sigma^{2} \\boldsymbol{D}_{\\tau}\\right), \\qquad \\boldsymbol{D}_{\\tau}=\\operatorname{diag}\\left(\\tau_{1}^{2}, \\ldots, \\tau_{p}^{2}\\right) \\\\ \\tau_{1}^{2}, \\ldots, \\tau_{p}^{2} & \\sim \\prod_{j=1}^{p} \\frac{\\lambda^{2}}{2} e^{-\\lambda^{2} \\tau_{j}^{2} / 2} d \\tau_{j}^{2}, \\quad \\tau_{1}^{2}, \\ldots, \\tau_{p}^{2}>0 \\\\ \\sigma^{2} & \\sim \\pi\\left(\\sigma^{2}\\right) d \\sigma^{2} \\end{aligned} \\] 有细微的差别:差了一个\\(\\sigma^2\\). 在该notes中的描述则是,第一个形式是成立的,但是会有问题: “possibility of a non-unimodal posterior”,在Section 4 unimodal 问题里有详细描述,然后对\\(\\tau^2_1,…,\\tau^2_p\\) 有其他先验的话会形成其他和Lasso有关的方法,在Section 6. 为什么会有不含\\(\\sigma^2\\) 的形式呢?主要是因为来源于最初的方程式 \\[ \\frac{a}{2} e^{-a|z|}=\\int_{0}^{\\infty} \\frac{1}{\\sqrt{2 \\pi s}} e^{-z^{2} /(2 s)} \\frac{a^{2}}{2} e^{-a^{2} s / 2} d s, \\quad a>0 \\] 如果把Laplace先验写成\\(\\pi(\\boldsymbol{\\beta})=\\prod_{j=1}^{p} \\frac{\\lambda}{2} e^{-\\lambda\\left|\\beta_{j}\\right|}\\) 的话,那么\\(a\\) 就是 \\(\\lambda\\) , \\(\\beta\\) 的密度很自然的与\\(\\sigma\\) 无关,是\\(N(0,\\tau_j^2)\\) 如果把Laplace先验写成\\(\\pi\\left(\\boldsymbol{\\beta} | \\sigma^{2}\\right)=\\prod_{j=1}^{p} \\frac{\\lambda}{2 \\sqrt{\\sigma^{2}}} e^{-\\lambda\\left|\\beta_{j}\\right| / \\sqrt{\\sigma^{2}}}\\) 的话,那么a就是\\(\\frac{\\lambda}{\\sqrt{\\sigma^2}}\\) ,则 \\(\\tau_j\\) 的分布会带上 \\(\\sigma^2\\) , 因为,这两种情况都和Full model对不上号。。。。 假设以上先不管,full mode成立,那么就可以写出来joint density \\[ \\begin{array}{l}{f\\left(\\boldsymbol{y} | \\mu, \\boldsymbol{\\beta}, \\sigma^{2}\\right) \\pi\\left(\\sigma^{2}\\right) \\pi(\\mu) \\prod_{j=1}^{p} \\pi\\left(\\beta_{j} | \\tau_{j}^{2}, \\sigma^{2}\\right) \\pi\\left(\\tau_{j}^{2}\\right)=} \\\\ {\\qquad \\frac{1}{\\left(2 \\pi \\sigma^{2}\\right)^{n / 2}} e^{-\\frac{1}{2 \\sigma^{2}}\\left(\\boldsymbol{y}-\\mu \\mathbf{1}_{n}-\\boldsymbol{X} \\boldsymbol{\\beta}\\right)^{\\top}\\left(\\boldsymbol{y}-\\mu \\mathbf{1}_{n}-\\boldsymbol{X} \\boldsymbol{\\beta}\\right)}} \\\\ { \\times \\frac{\\gamma^{a}}{\\Gamma(a)}\\left(\\sigma^{2}\\right)^{-a-1} e^{-\\gamma / \\sigma^{2}} \\prod_{j=1}^{p} \\frac{1}{\\left(2 \\pi \\sigma^{2} \\tau_{j}^{2}\\right)^{1 / 2}} e^{-\\frac{1}{2 \\sigma^{2} \\tau_{j}^{2}} \\beta_{j}^{2}} \\frac{\\lambda^{2}}{2} e^{-\\lambda^{2} \\tau_{j}^{2} / 2}}\\end{array} \\] 这个很清晰,第一块是\\(\\boldsymbol y\\) 的Normal,第二块是\\(\\sigma^2\\) 的inverse-gamma,第三块是\\(\\beta_j\\) 的Normal hierachical prior with 超参数\\(\\tau_j\\) 服从$f()= e{-{2} _{j}^{2} / 2} $. 19分钟写到这,没写完,再来19分钟 然后就是用\\(\\overline y\\) 作为\\(y\\) 的均值,重新写一下Normal里面关于Y和\\(\\mu\\) 的二次型如下: \\[ \\begin{aligned}\\left(\\boldsymbol{y}-\\mu \\mathbf{1}_{n}-\\boldsymbol{X} \\boldsymbol{\\beta}\\right)^{\\top}\\left(\\boldsymbol{y}-\\mu \\mathbf{1}_{n}-\\boldsymbol{X} \\boldsymbol{\\beta}\\right) &=\\left(\\overline{y} \\mathbf{1}_{n}-\\mu \\mathbf{1}_{n}\\right)^{\\top}\\left(\\overline{y} \\mathbf{1}_{n}-\\mu \\mathbf{1}_{n}\\right)+(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})^{\\top}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta}) \\\\ &=n(\\overline{y}-\\boldsymbol{\\mu})^{2}+(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})^{\\top}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta}) \\end{aligned} \\] 所以就把关于\\(\\mu\\)的部分和关于\\(y,X,\\beta\\) 的部分分开了,可以看到,关于\\(\\mu\\)的部分还是一个二次型,配合前面的系数,可以凑成一个Normal的形式,有均值\\(\\overline y\\) 和方差\\(\\sigma^2/n\\),然后把\\(\\mu\\)积掉,因为是正态,还分开了,所以积分就是1,相当于把这项拿走不影响其他项,得到一个新的Marginal only over \\(\\mu\\) 的joint density, which is proportional to \\[ \\frac{1}{\\left(\\sigma^{2}\\right)^{(n-1) / 2}} e^{-\\frac{1}{2 \\sigma^{2}}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})^{\\top}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})}\\left(\\sigma^{2}\\right)^{-a-1} e^{-\\gamma / \\sigma^{2}} \\prod_{j=1}^{p} \\frac{1}{\\left(\\sigma^{2} \\tau_{j}^{2}\\right)^{1 / 2}} e^{-\\frac{1}{2 \\sigma^{2} \\tau_{j}^{2}} \\beta_{j}^{2}} e^{-\\lambda^{2} \\tau_{j}^{2} / 2} \\] This expression depends on \\(\\boldsymbol y\\) only through \\(\\tilde{\\boldsymbol{y}}\\) . The conjugacy of the other parameters remains unaffected. 然后就能构建Gibbs \\(\\beta\\) \\[ -\\frac{1}{2 \\sigma^{2}}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})^{\\top}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})-\\frac{1}{2 \\sigma^{2}} \\boldsymbol{\\beta}^{\\top} \\boldsymbol{D}_{\\tau}^{-1} \\boldsymbol{\\beta}=-\\frac{1}{2 \\sigma^{2}}\\left\\{\\boldsymbol{\\beta}^{\\top}\\left(\\boldsymbol{X}^{\\top} \\boldsymbol{X}+\\boldsymbol{D}_{\\tau}^{-1}\\right) \\boldsymbol{\\beta}-2 \\tilde{\\boldsymbol{y}}^{\\top} \\boldsymbol{X} \\boldsymbol{\\beta}+\\tilde{\\boldsymbol{y}}^{\\top} \\tilde{\\boldsymbol{y}}\\right\\} \\] \\(\\beta\\) involve part is quadratic form with a linear form, which is proportional to Normal density. Letting \\(\\boldsymbol{A}=\\boldsymbol{X}^{\\top} \\boldsymbol{X}+\\boldsymbol{D}_{\\tau}^{-1}\\) , Then the equation upon can be transformed to \\[ \\boldsymbol{\\beta}^{\\top} \\boldsymbol{A} \\boldsymbol{\\beta}-2 \\tilde{\\boldsymbol{y}}^{\\top} \\boldsymbol{X} \\boldsymbol{\\beta}+\\tilde{\\boldsymbol{y}}^{\\top} \\tilde{\\boldsymbol{y}}=\\left(\\boldsymbol{\\beta}-\\boldsymbol{A}^{-1} \\boldsymbol{X}^{\\top} \\tilde{\\boldsymbol{y}}\\right)^{\\top} \\boldsymbol{A}\\left(\\boldsymbol{\\beta}-\\boldsymbol{A}^{-1} \\boldsymbol{X}^{\\top} \\tilde{\\boldsymbol{y}}\\right)+\\tilde{\\boldsymbol{y}}^{\\mathrm{T}}\\left(\\boldsymbol{I}_{n}-\\boldsymbol{X} \\boldsymbol{A}^{-1} \\boldsymbol{X}^{\\top}\\right) \\tilde{\\boldsymbol{y}} \\] That is, \\(\\boldsymbol \\beta\\) is conditionally multivariate normal with mean \\(A^{-1} \\boldsymbol{X}^{\\top} \\tilde{\\boldsymbol{y}}\\) and variance \\(\\sigma^{2} \\boldsymbol{A}^{-1}\\). 这块应该可以照常用,只要把\\(\\sigma^2\\) 那块的东西处理掉, \\(\\sigma^2\\) 因为要建模,所以这块应该全部要换成Normal和建模的形式 \\[ \\left(\\sigma^{2}\\right)^{-(n-1) / 2-p / 2-a-1} \\exp \\left\\{-\\frac{1}{\\sigma^{2}}\\left((\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})^{\\mathrm{T}}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta}) / 2+\\boldsymbol{\\beta}^{\\top} \\boldsymbol{D}_{\\tau}^{-1} \\boldsymbol{\\beta} / 2+\\gamma\\right)\\right\\} \\] 这个是conditionally inverse gamma的形式,with shape \\((n-1) / 2+p / 2+a\\) and scale parameter \\((\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})^{\\mathrm{T}}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta}) / 2+\\boldsymbol{\\beta}^{\\top} \\boldsymbol{D}_{\\tau}^{-1} \\boldsymbol{\\beta} / 2+\\gamma\\) . \\(\\tau_j^2\\) for \\(j=1,…,p\\) Density is: \\[ \\left(\\tau_{j}^{2}\\right)^{-1 / 2} \\exp \\left\\{-\\frac{1}{2}\\left(\\frac{\\beta_{j}^{2} / \\sigma^{2}}{\\tau_{j}^{2}}+\\lambda^{2} \\tau_{j}^{2}\\right)\\right\\} \\] Proportional to the density of the reciprocal of an inverse Gaussian random variable. 第四节,等等,怎么变成日常笔记了, The posterior distribution The joint posterior distribution of $$ and \\(\\sigma^2\\) is proportional to \\[ \\left(\\sigma^{2}\\right)^{-(n+p-1) / 2-a-1} \\exp \\left\\{-\\frac{1}{\\sigma^{2}}\\left((\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})^{\\top}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta}) / 2+\\gamma\\right)-\\frac{\\lambda}{\\sqrt{\\sigma^{2}}} \\sum_{j=1}^{p}\\left|\\beta_{j}\\right|\\right\\} \\] 从这里面来看,至少很像最小二乘+罚函数的形式。 We may safety let \\(a=0\\) . 这句话没太懂: Assuming that the data do not admit a perfect linear fit (i.e. \\(\\tilde y\\) is not in the column space of \\(\\boldsymbol X\\) ), also let \\(\\gamma=0\\) .This corresponds to using the non-informative scale-invariant prior \\(1/\\sigma^2\\) on \\(\\sigma^2\\) . \\(\\boldsymbol X\\) matrix is, of course, unitless because of its scaling. For comparison, the joint posterior distribution of $$ and \\(\\sigma^2\\) under prior (1), 也就是不涉及\\(\\sigma^2\\) 的prior,with some independent prior \\(\\pi(\\sigma^2)\\) on \\(\\sigma^2\\), is proportional to \\[ \\pi\\left(\\sigma^{2}\\right)\\left(\\sigma^{2}\\right)^{-(n-1) / 2} \\exp \\left\\{-\\frac{1}{2 \\sigma^{2}}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})^{\\mathrm{T}}(\\tilde{\\boldsymbol{y}}-\\boldsymbol{X} \\boldsymbol{\\beta})-\\lambda \\sum_{j=1}^{p}\\left|\\beta_{j}\\right|\\right\\} \\] 所以果然是为了合并同类项吗 In this case, \\(\\lambda\\) has units that are the reciprocal of the units of the response, and any change in units will require a corresponding change in \\(\\lambda\\) to produce the equivalent Bayesian solution. 这段话应该很重要,关于unit change的,类似于给\\(\\beta\\) 加了一个 rescaling ? 标记一下,这里值得重点研究一下: 这个unit,和scaling究竟是什么情况?如果要挪用的话得用什么?因为毕竟没有我的模型里面没有\\(\\sigma^2\\),能不能不管这个问题,得付出什么代价?要管的话应该用什么? 然后第二个问题就是multi-modal的问题,下次来继续说。 重新开始 第二个模型可能会有多于一个极值点。 Taking \\(p=1,n=10,X^TX=1,X^T\\tilde y=5,\\tilde y^Ty=26,\\lambda=3\\). The mode on the lower right is near the least-squares solution \\(\\beta=5,\\sigma^2=1/8\\), while the mode on the upper left is near the values \\(\\beta=0,\\sigma^2=26/9\\) that would be estimated for the selected model in which \\(\\beta\\) is set to zero. The crease in the upper left mode along the line \\(\\beta=0\\) is a feature produced by the “sharp corners” of the \\(L_1\\) penalty. Not surprisingly, the marginal density of \\(\\beta\\) is also bimodal. When \\(p>1\\), it may be possible to have more than two modes, though we have not investigated this. 所以这两个峰不是莫名其妙的两个峰,是分别是OLS的结果和,不取该\\(\\beta\\) 的结果。 所以好像直接用Bayes Lasso会出问题,得改,但是设计\\(\\sigma^2\\) 感觉会变得很麻烦。。。 多个峰值会导致计算和概念上的问题,这种情况下,单点的后验均值,中位数,或者最大值点是否能总结一个双峰后验的要素是有疑问的。一个更好的summary将会是分别测量每个峰值点 下一步是选取\\(\\lambda\\) ,original的lasso是通过cross-validation,generalised cross-validation, and ideas based on Stein’s unbiased risk estimate. Bayesian Lasso offers some uniquely Bayesian alternatives: empirical Bayes via marginal (Type 2) maximum likelihood, and use of an appropriate hyperprior. 这是啥,不懂得看看 11.3.1 Empirical Bayes by Marginal Maximum Likelihood Finish "], +["advanced-r.html", "Chapter 12 Advanced R 12.1 Data.frame", " Chapter 12 Advanced R The book of Hardley Wickham ## Fundmental 12.0.1 Vector Vector: Basic structure: two flavours: atomic vectors and lists: The common properties: Type: typeof() 这个vector的type是啥 Length, length() 这个vector有多少个元素 Attributes, attributes() additional arbitrary metadata: 这是啥?额外的信息? 所有的元素对于一个atomic vector 必须是一种类型,对比list,则每个元素可以是不同的类型。 NB:is.vector()不检测一个对象是否是vector,而是仅当对象是一个除了名字以外没有attributes的vector的话,返回TRUE。用 is.atomic()||is.list(x)去检测是否一个对象确实是vector。 12.0.1.1 Atomic vectors 有四类atomic vectors我将会仔细的讨论:logical, integer, double(也叫numeric), 和character. 有两类稀有的类型不会深入讨论:complex 和 raw。 Atomic vectors经常使用c()进行创建,表示combine: db1_var=c(1,2.5,4.5) # L表示整数,因为r默认数字的结构为numeric(double) int_var=c(1L,6L,10L) #使用TRUE和FALSE(或者 T 和 F)用来创建一个逻辑vectors log_var=c(TRUE,FALSE,T,F) chr_var=c("these are","some strings") Atomic vector总是扁平的,甚至你用堆叠的c() Atomic vectors are always flat, even if you nect c()’s: c(1,c(2,c(3,4))) ## [1] 1 2 3 4 #和这个一样 c(1,2,3,4) ## [1] 1 2 3 4 12.0.2 Types and tests: 给定一个vector,你能决定其类型,使用 typeof(),或者检查其是不是个特定的类型,使用“is” 函数:is.character(),is.double(),is.logical(),is.integer(),or, more generally, is.atomic(). 注意is.numeric()是一般性的检查是否是“数字”的vector,并且返回TRUE如果是integer和double。虽然叫numeric但是不是检测double的。 12.0.3 Coercion 所有的atomic vector的元素必须是一种类型,所以如果你尝试组合不同类型的话,这些数据会被coerced 成为最复杂的那种类型,序是:logical,integer,double和character. 比如说,组合一个character和一个integer可以得到一个character: str(c("a",1)) ## chr [1:2] "a" "1" 所以这里coerced应该是强制转换的意思。一个逻辑vector强制转换成integer或者double的时候TRUE变成1,FALSE变成0.一个常用的技巧是和sum()以及mean()组合使用。 类型强制转换一般自动产生,大部分数学函数会强制转换成double或者integer,大部分的逻辑操作符会强制转换为logical。如果coercion可能损失信息的时候,一般会得到一个警告信息。如果会产生confusion,可以用as系列函数as.character(), as.double(), as.integer(), or as.logical(). 12.1 Data.frame df=data.frame(x=1:3,y=c("a","b","c")) str(df) data.frame()’s default behaviour which turns strings into factors. Using stringAsFactors=FALSE to suppress this behaviour, Combine data frames using cbind() and rbind() plyr::rbind.fill() 12.1.1 Ordering (integer subsetting) x=c("b","c","a") x[order(x)] ## [1] "a" "b" "c" df=data.frame(x=rep(1:3,each=2),y=6:1,z=letters[1:6]) df2=df[sample(nrow(df)),3:1] df2[order(df2$x),] ## z y x ## 2 b 5 1 ## 1 a 6 1 ## 4 d 3 2 ## 3 c 4 2 ## 6 f 1 3 ## 5 e 2 3 df2[,order(names(df2))] ## x y z ## 4 2 3 d ## 3 2 4 c ## 6 3 1 f ## 2 1 5 b ## 1 1 6 a ## 5 3 2 e df=data.frame(x=1:3,y=3:1,z=letters[1:3]) 12.1.2 Calling a function given a list of arguments list of function arguments: args=list(1:10, na.rm=TRUE) which is equivalent to mean(1:10,na.rm=TRUE) "], +["numeric-derivatives.html", "Chapter 13 Numeric Derivatives", " Chapter 13 Numeric Derivatives Compute \\(f'(x)\\), definition of the derivative, the limit as \\(h\\rightarrow 0\\) of \\[ f^{\\prime}(x) \\approx \\frac{f(x+h)-f(x)}{h} \\] The above procedure is almost guaranteed to produce inaccurate results. When the function \\(f\\) is fiercely expensive to compute. There are two sources of error in this approach truncation error and roundoff error. The truncation error comes from higher terms in the Taylor series expansion: \\[ f(x+h)=f(x)+h f^{\\prime}(x)+\\frac{1}{2} h^{2} f^{\\prime \\prime}(x)+\\frac{1}{6} h^{3} f^{\\prime \\prime \\prime}(x)+\\cdots \\] whence \\[ \\frac{f(x+h)-f(x)}{h}=f^{\\prime}+\\frac{1}{2} h f^{\\prime \\prime}+\\cdots \\] The roundoff error has various contributions. First is in \\(h\\): Suppose, by way ofo an example, that you are at a point \\(x=10.3\\) and you blindly choose \\(h=0.0001\\). Neither \\(x=10.3\\) nor \\(x+h=10.30001\\) is a number with an exact representation in binary; each is therefore represented with some fractional error characteristic of the machine’s floating-point format, \\(\\epsilon_m\\), whose value in single precision may be \\(\\sim 10^{-7}\\). The error in the effective value of h, namely the difference between \\(x+h\\) and \\(x\\) as represented in the machine, is therefore on the order of \\(\\epsilon_m x\\), which implies a fractional error in h of order \\(\\sim \\epsilon_mx/h\\sim10^{-2}\\)! By equation, this immediately implies at least the same large fractional error in the derivative. roundoff error: 舍入误差。假设点x=10.3,然后选取h=0.0001,但是在计算机里保存的既不是x=10.3也不是x+h=10.30001,每个数都会表示成另外一个有相对误差的,相对误差则由计算机的浮点格式,\\(\\epsilon_m\\),所决定,单精度浮点数则在\\(10^{-7}\\). 在h有效值的误差,也就是x+h和x在计算机中表现的值的误差,则其阶为\\(\\epsilon_m x\\),这个表明了h的相对误差的阶是\\(\\sim \\epsilon_m x/h\\sim 10^{-2}\\) ,就很高。 这样,第一课就是,总选择h使得x+h和x的差是一个确定的表示数,this can usually be accomplished by the program steps: temp=x+h h=temp-x "], +["imputation.html", "Chapter 14 Imputation", " Chapter 14 Imputation 处理缺失值问题。 Multiple imputation method. "], +["references.html", "References", " References "], +["r-web-scrape.html", "Chapter 15 R web scrape", " Chapter 15 R web scrape Web scrape用 library(rvest) ## Warning: package 'rvest' was built under R version 3.5.2 ## Loading required package: xml2 lego_movie <- read_html("http://www.imdb.com/title/tt1490017/") rating <- lego_movie %>% html_nodes("strong span") %>% # 这个"strong span"是指什么意思? html_text() %>% as.numeric() rating ## [1] 7.8 #> [1] 7.8 上述的问题回答是由于: knitr::include_graphics(rep("pic/scrape_2.png", 1)) 含有7.8的html元素就是strong 下面的span的元素。 根据这篇tutorial 提取class=""中的元素可以用如下代码 html_nodes('.pagination-page') 学这段写一下: a=read_html("https://d.cosx.org/d/420739-r") html_nodes(a,".PostMention") ## {xml_nodeset (27)} ## [1] <a href="https://d.cosx.org/d/420739/1" class="PostMention" data-id ... ## [2] <a href="https://d.cosx.org/d/420739/1" class="PostMention" data-id ... ## [3] <a href="https://d.cosx.org/d/420739/2" class="PostMention" data-id ... ## [4] <a href="https://d.cosx.org/d/420739/3" class="PostMention" data-id ... ## [5] <a href="https://d.cosx.org/d/420739/3" class="PostMention" data-id ... ## [6] <a href="https://d.cosx.org/d/420739/6" class="PostMention" data-id ... ## [7] <a href="https://d.cosx.org/d/420739/6" class="PostMention" data-id ... ## [8] <a href="https://d.cosx.org/d/420739/7" class="PostMention" data-id ... ## [9] <a href="https://d.cosx.org/d/420739/8" class="PostMention" data-id ... ## [10] <a href="https://d.cosx.org/d/420739/7" class="PostMention" data-id ... ## [11] <a href="https://d.cosx.org/d/420739/10" class="PostMention" data-i ... ## [12] <a href="https://d.cosx.org/d/420739/7" class="PostMention" data-id ... ## [13] <a href="https://d.cosx.org/d/420739/9" class="PostMention" data-id ... ## [14] <a href="https://d.cosx.org/d/420739/9" class="PostMention" data-id ... ## [15] <a href="https://d.cosx.org/d/420739/9" class="PostMention" data-id ... ## [16] <a href="https://d.cosx.org/d/420739/18" class="PostMention" data-i ... ## [17] <a href="https://d.cosx.org/d/420739/12" class="PostMention" data-i ... ## [18] <a href="https://d.cosx.org/d/420739/27" class="PostMention" data-i ... ## [19] <a href="https://d.cosx.org/d/420739/25" class="PostMention" data-i ... ## [20] <a href="https://d.cosx.org/d/420739/13" class="PostMention" data-i ... ## ... 就得到分开的有PostMention类的元素 PostMentionUser=html_nodes(a,".PostMention")%>% html_text() head(PostMentionUser) ## [1] "yihui" "yihui" "tctcab" "yihui" "yihui" "Jiena" 得到PostMention里面的元素,应该是回复中带@的部分? html_nodes(a,".Post-body")%>% html_text()->body head(body) ## [1] "\\n 我现在已经不太参与邮件列表里的讨论了,不过我还是想看看里面最热门的讨论大概都是关于什么话题的。要是有人有兴趣,邮件列表的数据可以从网页上爬,比如 https://stat.ethz.ch/pipermail/r-devel/ 按 Thread 或 Subject 排列的页面爬一下就知道每个主题下面的回帖数量。最终我想要的数据就三列:主题、链接、回帖数量。如:\\n\\nsubject, link, count\\n\\"[R] Open a file which name contains a tilde\\", \\"https://stat.ethz.ch/pipermail/r-devel/2019-June/077961.html\\", 18if(\\"undefined\\"!==typeof hljs)hljs._ha();else if(\\"undefined\\"===typeof hljsLoading){hljsLoading=1;var a=document.getElementsByTagName(\\"head\\")[0],e=document.createElement(\\"link\\");e.type=\\"text/css\\";e.rel=\\"stylesheet\\";e.href=\\"//cdn.bootcss.com/highlight.js/9.12.0/styles/github.min.css\\";a.appendChild(e);e=document.createElement(\\"script\\");e.type=\\"text/javascript\\";e.onload=function(){var d={},f=0;hljs._hb=function(b){b.removeAttribute(\\"data-hljs\\");var c=b.innerHTML;c in d?b.innerHTML=d[c]:(7<++f&&(d={},f=0),hljs.highlightBlock(b.firstChild),d[c]=b.innerHTML)};hljs._ha=function(){for(var b=document.querySelectorAll(\\"pre[data-hljs]\\"),c=b.length;0<c;)hljs._hb(b.item(--c))};hljs._ha()};e.async=!0;e.src=\\"//uploads.cosx.org/static/hljs/highlight.pack.js\\";a.appendChild(e)}上面是开发者列表,还有普通用户列表:https://stat.ethz.ch/pipermail/r-help/\\n\\n我之所以想看这个数据是因为我有个猜测想验证一下:凭我的肉眼观察,我感觉多数“热门”讨论都是极其琐碎无聊的事情,根本不值得费那么多口舌(比如上面那个例子)。为了防止我看歪了,我想看一眼全局数据。希望有壮士能写个脚本。其实第一次本地爬完之后,以后可以在 Travis 上定期爬新的数据,把数据自动更新到 Github 或什么地方。\\n " ## [2] "\\n yihui \\n\\n用xml2做了一个:Rmarkdown效果\\n\\n这种结构比较标准的html还是挺好处理的\\n\\n不过我只爬了一个月,仔细一看同一个thread在多个月里都会出现,要合并也可以,不过数据清理已经超出了这个爬虫的功能范围哈哈哈,暂时在此收手了。\\n " ## [3] "\\n yihui \\n爬好了(从97年4月到目前的数据),戳这:https://github.com/jienagu/tidyverse_examples/blob/master/web_scraping_r_devel.R \\n结果(已经排序好了)如下:\\n> head(everything2)\\n link Link_url count\\n1 [Rd] [RFC] A case for freezing CRAN https://stat.ethz.ch/pipermail/r-devel/2014-March/068548.html 64\\n2 [Rd] CRAN policies https://stat.ethz.ch/pipermail/r-devel/2012-March/063678.html 51\\n3 [Rd] surprising behaviour of names<- https://stat.ethz.ch/pipermail/r-devel/2009-March/052522.html 49\\n4 [Rd] legitimate use of ::: https://stat.ethz.ch/pipermail/r-devel/2013-August/067180.html 45\\n5 [Rd] Computer algebra in R - would that be an idea?? https://stat.ethz.ch/pipermail/r-devel/2005-July/033940.html 40\\n6 [Rd] if(--as-cran)? https://stat.ethz.ch/pipermail/r-devel/2012-September/064760.html 39if(\\"undefined\\"!==typeof hljs)hljs._ha();else if(\\"undefined\\"===typeof hljsLoading){hljsLoading=1;var a=document.getElementsByTagName(\\"head\\")[0],e=document.createElement(\\"link\\");e.type=\\"text/css\\";e.rel=\\"stylesheet\\";e.href=\\"//cdn.bootcss.com/highlight.js/9.12.0/styles/github.min.css\\";a.appendChild(e);e=document.createElement(\\"script\\");e.type=\\"text/javascript\\";e.onload=function(){var d={},f=0;hljs._hb=function(b){b.removeAttribute(\\"data-hljs\\");var c=b.innerHTML;c in d?b.innerHTML=d[c]:(7<++f&&(d={},f=0),hljs.highlightBlock(b.firstChild),d[c]=b.innerHTML)};hljs._ha=function(){for(var b=document.querySelectorAll(\\"pre[data-hljs]\\"),c=b.length;0<c;)hljs._hb(b.item(--c))};hljs._ha()};e.async=!0;e.src=\\"//uploads.cosx.org/static/hljs/highlight.pack.js\\";a.appendChild(e)}" ## [4] "\\n tctcab 不错不错!好了,那就请别的壮士继续背这口锅吧,一是把历史数据爬完,二是在数据里加一列,就是帖子的时间(年月即可)。数据弄好后,如果有相邻月份的帖子标题相同,那就把前一个月的计数加到后一个月的计数上。\\n " ## [5] "\\n yihui \\n年月并没有在xml里出现,不过那个thread的地址里有年月信息,要整合也不難,\\n\\n既然都做到这步了那就顺便走完吧,屁股没擦干净留给别人总不大好哈哈哈\\n " ## [6] "\\n yihui \\n\\n研究了一下合并帖子,觉得并不简单,因为:\\n - 同一个串下面标题有可能不一致\\n - 不同串有可能名字一样\\n\\n尝试了合并标题一样的帖子,但不一定对。比如04,06,07年都有个名为“[Rd] Wish list”的串\\n\\n另外就是抓取的时候一不留神就连接错误,所以我先把thread.html下载到本地再提取信息,速度也快了不少(8分钟到19秒)\\n\\n结果:\\n\\nRmarkdown效果\\n\\n合并串之后可以看出结果跟Jiena 的前十名还是有点细微差别:\\n\\n# A tibble: 10 x 4\\n title link reps year_mon \\n <chr> <chr> <int> <chr> \\n 1 \\"[Rd] [RFC] A case for freezi… https://stat.ethz.ch/pipermai… 69 2014-March\\n 2 \\"[Rd] Wish list\\\\n\\" https://stat.ethz.ch/pipermai… 62 2007-Janu…\\n 3 \\"[Rd] CRAN policies\\\\n\\" https://stat.ethz.ch/pipermai… 51 2012-March\\n 4 \\"[Rd] legitimate use of :::\\\\n\\" https://stat.ethz.ch/pipermai… 48 2014-May \\n 5 \\"[Rd] NEWS.md support on CRAN… https://stat.ethz.ch/pipermai… 48 2015-May \\n 6 \\"[Rd] surprising behaviour of… https://stat.ethz.ch/pipermai… 48 2009-March\\n 7 \\"[Rd] declaring package depen… https://stat.ethz.ch/pipermai… 42 2013-Sept…\\n 8 \\"[Rd] if(--as-cran)?\\\\n\\" https://stat.ethz.ch/pipermai… 42 2012-Sept…\\n 9 \\"[Rd] Bias in R's random inte… https://stat.ethz.ch/pipermai… 37 2018-Sept…\\n10 \\"[Rd] R 3.0, Rtools3.0,l Wind… https://stat.ethz.ch/pipermai… 36 2013-Aprilif(\\"undefined\\"!==typeof hljs)hljs._ha();else if(\\"undefined\\"===typeof hljsLoading){hljsLoading=1;var a=document.getElementsByTagName(\\"head\\")[0],e=document.createElement(\\"link\\");e.type=\\"text/css\\";e.rel=\\"stylesheet\\";e.href=\\"//cdn.bootcss.com/highlight.js/9.12.0/styles/github.min.css\\";a.appendChild(e);e=document.createElement(\\"script\\");e.type=\\"text/javascript\\";e.onload=function(){var d={},f=0;hljs._hb=function(b){b.removeAttribute(\\"data-hljs\\");var c=b.innerHTML;c in d?b.innerHTML=d[c]:(7<++f&&(d={},f=0),hljs.highlightBlock(b.firstChild),d[c]=b.innerHTML)};hljs._ha=function(){for(var b=document.querySelectorAll(\\"pre[data-hljs]\\"),c=b.length;0<c;)hljs._hb(b.item(--c))};hljs._ha()};e.async=!0;e.src=\\"//uploads.cosx.org/static/hljs/highlight.pack.js\\";a.appendChild(e)}" 就得到回帖的内容, html_nodes(a,".PostUser") ## {xml_nodeset (0)} html_nodes(a,".username") ## {xml_nodeset (0)} 但是这个咋就不对。。。。奇怪。 是因为元素叠起来的缘故吗?应该不是奇奇怪怪的,明天再看。 "], +["guide-to-scientific-computing-in-c.html", "Chapter 16 Guide to Scientific Computing in C++ 16.1 Basics 16.2 Basics in C++ 16.3 Redirect Console Output to File 16.4 Pointer 16.5 Functions 16.6 Classess 16.7 Using Makefiles to Compile Multiple Files 16.8 类的继承 16.9 模板 16.10 Class for linear algebra", " Chapter 16 Guide to Scientific Computing in C++ 16.1 Basics first compile in command line: g++ -o HelloWorld HelloWorld.cpp 也可以直接编译HelloWorld.cpp,默认的输出名字是a.out g++ HelloWorld.cpp compile flag: 让编译器保留debug信息,然后不优化代码,这样能进行debug,用“-g” flag g++ -g -o HelloWorld HelloWorld.cpp 让编译器链接library g++ -lm -o HelloWorld HelloWorld.cpp 16.2 Basics in C++ Assert语句进行简单的error抛出: #include <iostream> #include <cassert> #include <cmath> int main(int argc, char * argv[]) { double a; std::cout << "Enter a non-negative number\\n"; std::cin >> a; assert(a >= 0.0); std::cout << "The square root of "<< a; std::cout << " is " << sqrt(a) << "\\n"; return 0; } 16.3 Redirect Console Output to File Require additional header file fstream: include Declare an output stream variable write_output as std::ofstream Specify filename Output.dat Check the file is opened successfully: use assert Use write_output instead of std::cout : Close the file handle. In order to finish these job, the code is #include <cassert> #include <iostream> #include <fstream> int main(int argc, char * argv[]) { double x[3] = {0.0, 1.0, 0.0}; double y[3] = {0.0, 0.0, 1.0}; std::ofstream write_output("Output.dat"); assert(write_output.is_open()); for (int i=0; i<3; i++) { write_output << x[i] << " " << y[i] << "\\n"; } write_output.close(); return 0; } In scientific computing, the precision is crucial, they can be set as: #include <iostream> #include <fstream> int main(int argc, char * argv[]) { double x = 1.8364238; std::ofstream write_output("Output.dat"); write_output.precision(3); // 3 sig figs write_output << x << "\\n"; write_output.precision(5); // 5 sig figs write_output << x << "\\n"; write_output.precision(10); // 10 sig figs write_output << x << "\\n"; write_output.close(); return 0; } It’s similar for read file in stream with cout, just change cout to std::ifstream read_file. The example code in the book is: #include <cassert> #include <iostream> #include <fstream> int main(int argc, char * argv[]) { double x[6], y[6]; std::ifstream read_file("Output.dat"); assert(read_file.is_open()); for (int i=0; i<6; i++) { read_file >> x[i] >> y[i]; } read_file.close(); return 0; } This is the simplest code that just loop the given length as 6, we want read until it is end. Then we need write until get read_file.oef(), so we use while instead of for. while(!read_file.oef()) 16.3.1 Reading from the Command Line 感觉这是命令行管道的东西的感觉。 User want to set some parameters when executing the code. Remind of the definition of main function int main(int argc, char * argv[]) The argc and argv function will be used to finish this purpose. Addition header filecstdlib and function atoi and atof will be sued. #include <iostream> #include <cstdlib> int main(int argc, char *argv[]) { std::cout << "Number of command line arguments = " << argc << "\\n"; for (int i=0; i<argc; i++) { std::cout << "Argument " << i << " = " << argv[i]; std::cout << "\\n"; } std::string program_name = argv[0]; int number_of_nodes = atoi(argv[1]); double conductivity = atof(argv[2]); std::cout << "Program name = " << program_name << "\\n"; std::cout << "Number of nodes = " << number_of_nodes; std::cout << "\\n"; std::cout << "Conductivity = " << conductivity << "\\n"; return 0; } 运行如上程序 ./CommandLineCode 100 5.0 在这个函数里,那一整行命令行的东西都作为参数被参数argv传进去了。而argc是统计有多少个分割的元素 16.4 Pointer Pointer Aliasing 注意同一块区域操作,比如A=A*B(都是矩阵)时,如果直接算会出问题,因为算后面的会依赖前面的值。 Safe Dynamic allocation: 可能会出现没有足够多的内存进行分配,要么是因为数组下标越界或者为负数,要么是因为没有足够多的物理内存。可以通过抛出异常 double *p_x; p_x=new double[10000]; assert(p_x!=NULL) 进行检测。 Every new Has a delete 在动态分配内存的时候要注意,所有分配的内存都需要被释放。当在循环内部使用动态分配内存的时候,这个问题尤其值得注意。比如: for (int i=0; i<10000; i++) { double ** A; A = new double * [50]; for (int j=0; j<50; j++) { A[j] = new double [50]; } } 这样每次运行这样的代码,新的内存就会被分配。但是不会释放。于是每次运行这段程序,就有一块内存不再可用。如果没有垃圾处理机制的话,内存就会很快被占满。 16.5 Functions 16.5.1 Use of Pointers as function arguments. 100页完成。明天继续100页看完。 16.6 Classess 16.6.1 Header Files 应该对引入头文件非常小心。如果include了多次会造成问题。特别是在多个cpp文件上工作时,不经意间就会造成一些问题。为了防止多次included,需要如下的代码: 如果没有被定义,则定义。也就是加一个if语句 使用宏#ifndef如果没有被定义,然后通过#endif结束: #ifndef EXAMPLECLASSHEADERDEF // only if macro // EXAMPLECLASSHEADERDEF not // defined execute lines of // code until #endif // statement #define EXAMPLECLASSHEADERDEF // define the macro // EXAMPLECLASSHEADERDEF. // Ensures that this code is // only compiled once, no // matter how many times it // is included class ExampleClass { lines of code // body of header file }; #endif // need one of these for every #ifndef statement 16.7 Using Makefiles to Compile Multiple Files P94 of Guide to Scientific Computing in C++ 目的:We would rather not compile all of these classes separately every time one file is modified slightly. Class1.o : Class1.cpp Class1.hpp g++ -c -O Class1.cpp Class1.o : Class1.cpp Class1.hpp g++ -c -O Class1.cpp Class2.o : Class2.cpp Class2.hpp g++ -c -O Class2.cpp UseClasses.o : UseClasses.cpp Class1.hpp Class2.hpp g++ -c -O UseClasses.cpp UseClasses : Class1.o Class2.o UseClasses.o g++ -O -o UseClasses Class1.o Class2.o UseClasses.o 定义一个类: #ifndef COMPLEXNUMBERHEADERDEF #define COMPLEXNUMBERHEADERDEF //>>>>>如果没定义的话,就定义 #include <iostream> class ComplexNumber { private: double mRealPart; //real part double mImaginaryPart; //imaginary part大概是虚数部分? public: ComplexNumber(); //初始化函数,让实部和虚部为0 ComplexNumber(double x, double y); double CalculateModulus() const; // 返回一个双精度浮点数包含了这个复数的模 double CalculateArgument() const;// ComplexNumber CalculatePower(double n) const; ComplexNumber& operator=(const ComplexNumber& z); //这个是&的要注意,做了运算符重载? ComplexNumber operator-() const; ComplexNumber operator+(const ComplexNumber& z) const; ComplexNumber operator-(const ComplexNumber& z) const; friend std::ostream& operator<<(std::ostream& output, const ComplexNumber& z); }; #endif 一个新的东西: ComplexNumber& ComplexNumber:: operator=(const ComplexNumber& z) { mRealPart = z.mRealPart; mImaginaryPart = z.mImaginaryPart; return * this; } 这里面出现了两个&,需要看一眼是啥意思: 这里是运算符重载,重载了“=”运算符 用了 const 保证这个赋值时是拷贝而不是alter重名 所以返回的是这个类自己的地址。 然后因为是赋值,所以指向了给定值,然后给了一个const,所以不会被改变。 下面的“-”的重载就没用取地址和指针,这个“-”是取符号不是“减”。 ComplexNumber ComplexNumber::operator-() const { ComplexNumber w; w.mRealPart = -mRealPart; w.mImaginaryPart = -mImaginaryPart; return w; } 然后 是加运算符 // Overloading the binary + operator ComplexNumber ComplexNumber::operator+(const ComplexNumber& z) const { ComplexNumber w; w.mRealPart = mRealPart + z.mRealPart; w.mImaginaryPart = mImaginaryPart + z.mImaginaryPart; return w; } 这里的参数又用了const ComplexNumber& z Point &pt2=pt1; 定义了pt2为pt1的引用。通过这样的定义,pt1和pt2表示同一对象。 所以大部分都是引用的感觉。。。 所以上面是引用了z,所以是z的正常调用(而且不能更改z),然后把值丢到一个新的w里面。 这叫常引用,可以不开辟新的内存空间,而又不会影响原值。 16.8 类的继承 #ifndef EBOOKHEADERDEF #define EBOOKHEADERDEF #include <string> #include "Book.hpp" class Ebook: public Book { public: Ebook(); std::string hiddenUrl; }; #endif 这是继承的基础语法 新的东西是覆盖了初始构造函数的新构造函数Ebook(),和一个std::string的类成员hiddenUrl. 三种继承的解释 16.8.1 继承类的实时多态 Run-Time Polymorphism #ifndef GUESTDEF #define GUESTDEF #include <string> class Guest { public: std::string name, roomType, arrivalDate; int numberOfNights; double telephoneBill; virtual double CalculateBill(); }; #endif 关注点在virtual关键字,是告诉编译器这个方法可能被继承的类覆盖了。 这就是虚函数了。。 16.9 模板 函数返回引用和返回值的区别,关键就是有没有多搞一个零时变量。 一个简单的模板的例子: #include <iostream> template<class T> T GetMaximum(T number1, T number2); int main(int argc, char * argv[]) { std::cout << GetMaximum<int>(10, -2) << "\\n"; std::cout << GetMaximum<double>(-4.6, 3.5) << "\\n"; return 0; } template<class T> T GetMaximum(T number1, T number2) { T result; if (number1 > number2) { result = number1; } else { //number1 <= number2 result = number2; } return result; } 相当于T是占位符,然后使用的时候用尖括号<>把类型告诉程序。 16.9.1 Brief Survey of the Standard Template Library 16.9.1.1 Vector 16.10 Class for linear algebra 每个new 需要匹配一个 delete. "], +["rcpp.html", "Chapter 17 Rcpp", " Chapter 17 Rcpp 根据实验,见jmcmBoost项目里面的试验,代码是: // [[Rcpp::export]] void ReceiveAlist(Rcpp::List theta0) { double beta0=Rcpp::as<double>(theta0["beta"]); double gamma0=Rcpp::as<double>(theta0["gamma"]); double xi0=Rcpp::as<double>(theta0["xi"]); cout<<beta0<<endl; cout<<gamma0<<endl; cout<<xi0<<endl; } /*** R ReceiveAlist(list(beta=c(0,1),gamma=c(1,1),xi=c(1,2))) ReceiveAlist(list(beta=1,gamma=1,xi=1)) */ 第一个报错,第二个可以正确输出。 因为as的数据类型必须是一个scala,不能给一个Numeric Vector 所以问题是如何把这个转换成一个c++的变量 Sys.setenv("PKG_CXXFLAGS"="-std=c++11") 要用auto打印std::vector的话得这样,是c++11标准 理论上应该是用一个c++的STL的vector容器来搞 为啥那本书里面没有,好奇怪 所以进过谷歌,stackexchange的问题 所以解决方案是:: std::vector<double> b = as<std::vector<double> >(a) 于是代码就改成如下: void ReceiveAlist(Rcpp::List theta0) { std::vector<double> beta= Rcpp::as<std::vector<double> >(theta0["beta"]); //double beta0=Rcpp::as<double>(theta0["beta"]); std::vector<double> gamma= Rcpp::as<std::vector<double> >(theta0["gamma"]); //double gamma0=Rcpp::as<double>(theta0["gamma"]); std::vector<double> xi= Rcpp::as<std::vector<double> >(theta0["xi"]); //double xi0=Rcpp::as<double>(theta0["xi"]); //cout<<beta0<<endl; //cout<<gamma0<<endl; // cout<<xi0<<endl; for (auto i: beta) std::cout << i << ' '; std::cout<<endl; for (auto i: gamma) std::cout << i << ' '; std::cout<<endl; for (auto i: xi) std::cout << i << ' '; std::cout<<endl; } 然后结合之前的 Rcpp::List Jmcmoutput2(NumericVector x){ Rcpp::List a=Rcpp::List::create( // with static names Rcpp::CharacterVector::create("cc", "dd"), Rcpp::CharacterVector::create("ee", "ff") ); double cc=0; Rcpp::NumericVector a0=x[1]; double a2= Rcpp::as<double>(a0); return Rcpp::List::create(Rcpp::Named("beta") = Rcpp::wrap(cc), Rcpp::Named("gamma") = Rcpp::wrap(a2), Rcpp::Named("lambda") = x ); } 这就能返回一个R的数据结构list,接口的问题就基本上解决了。 "] ] diff --git a/docs/section-5.html b/docs/section-5.html index aadce8d..bcb33ac 100644 --- a/docs/section-5.html +++ b/docs/section-5.html @@ -25,7 +25,7 @@ - + @@ -136,6 +136,8 @@
                • 2 Mathematical statistic Trick
                • 3 Statistician Tool Box