38  高斯过程回归

本章主要内容分三大块,分别是多元正态分布、二维高斯过程和高斯过程回归。根据高斯过程的定义,我们知道多元正态分布和高斯过程有紧密的联系,首先介绍多元正态分布的定义、随机数模拟和分布拟合。二维高斯过程很有代表性,常用于空间统计中,而一维高斯过程常用于时间序列分析中,因而,介绍高斯过程的定义,二维高斯过程的模拟和参数拟合。在后续的高斯过程回归中,以朗格拉普岛的核辐射数据为例,建立泊松空间广义线性混合效应模型(响应变量非高斯的高斯过程回归模型),随机过程看作一组存在相关性的随机变量,这一组随机变量视为模型中的随机效应。

38.1 多元正态分布

设随机向量 \(\bm{X} = (X_1, X_2, \cdots, X_p)^{\top}\) 服从多元正态分布 \(\mathrm{MVN}(\bm{\mu}, \Sigma)\) ,其联合密度函数如下

\[ \begin{aligned} p(\boldsymbol x) = (2\pi)^{-\frac{p}{2}} |\Sigma|^{-\frac12} \exp\left\{ -\frac12 (\boldsymbol x - \boldsymbol \mu)^T \Sigma^{-1} (\boldsymbol x - \boldsymbol \mu) \right\}, \ \boldsymbol x \in \mathbb{R}^p \end{aligned} \]

其中,协方差矩阵 \(\Sigma\) 是正定的,其 Cholesky 分解为 \(\Sigma = CC^{\top}\) ,这里 \(C\) 为下三角矩阵。设 \(\bm{Z} = (Z_1, Z_2, \cdots, Z_p)^{\top}\) 服从 \(p\) 元标准正态分布 \(\mathrm{MVN}(\bm{0}, I)\) ,则 \(\bm{X} = \bm{\mu} + C\bm{Z}\) 服从多元正态分布 \(\mathrm{MVN}(\bm{\mu}, \Sigma)\)

38.1.1 多元正态分布模拟

可以用 Stan 函数 multi_normal_cholesky_rng 生成随机数模拟多元正态分布。

data {
  int<lower=1> N;
  int<lower=1> D;
  vector[D] mu;
  matrix[D, D] Sigma;
}
transformed data {
  matrix[D, D] L_K = cholesky_decompose(Sigma);
}
parameters {
}
model {
}
generated quantities {
  array[N] vector[D] yhat;
  for (n in 1:N){
    yhat[n] = multi_normal_cholesky_rng(mu, L_K); 
  }
}

上述代码块可以同时模拟多组服从多元正态分布的随机数。其中,参数块 parameters 和模型块 model 是空白的,这是因为模拟随机数不涉及模型推断,只是采样。核心部分 generated quantities 代码块负责生成随机数。

# 给定二元正态分布的参数值
multi_normal_d <- list(
  N = 1, # 一组随机数
  D = 2, # 维度
  mu = c(3, 2), # 均值向量
  Sigma = rbind(c(4, 1), c(1, 1)) # 协方差矩阵
)
library(cmdstanr)
# 编译多元正态分布模型
mod_multi_normal <- cmdstan_model(
  stan_file = "code/multi_normal_simu.stan",
  compile = TRUE, cpp_options = list(stan_threads = TRUE)
)

抽样生成 1000 个服从二元正态分布的随机数。

simu_multi_normal <- mod_multi_normal$sample(
  data = multi_normal_d,
  iter_warmup = 500,    # 每条链预处理迭代次数
  iter_sampling = 1000, # 样本量
  chains = 1,           # 马尔科夫链的数目
  parallel_chains = 1,  # 指定 CPU 核心数,可以给每条链分配一个
  threads_per_chain = 1, # 每条链设置一个线程
  show_messages = FALSE, # 不显示迭代的中间过程
  refresh = 0,        # 不显示采样的进度
  fixed_param = TRUE, # 固定参数
  output_dir = "data-raw/",
  seed = 20232023     # 设置随机数种子,不要使用 set.seed() 函数
)

值得注意,这里,不需要设置参数初始值,但要设置 fixed_param = TRUE,表示根据模型生成模拟数据。

# 原始数据
simu_multi_normal$draws(variables = "yhat", format = "array")
# A draws_array: 1000 iterations, 1 chains, and 2 variables
, , variable = yhat[1,1]

         chain
iteration   1
        1 2.6
        2 5.3
        3 1.4
        4 3.1
        5 7.6

, , variable = yhat[1,2]

         chain
iteration    1
        1 0.65
        2 3.70
        3 4.36
        4 1.85
        5 2.32

# ... with 995 more iterations
# 数据概览
simu_multi_normal$summary(.num_args = list(sigfig = 4, notation = "dec"))
# A tibble: 2 × 10
  variable   mean median    sd    mad      q5   q95  rhat ess_bulk ess_tail
  <chr>     <dbl>  <dbl> <dbl>  <dbl>   <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 yhat[1,1] 3.076  2.979 2.038 1.916  -0.3290 6.535 1.001   1140.    1023. 
2 yhat[1,2] 1.990  1.964 1.018 0.9765  0.2323 3.680 1.002    966.5    982.1

以生成第一个服从二元正态分布的随机数(样本点)为例,这个随机数是通过采样获得的,采样过程中产生一个采样序列,采样序列的轨迹和分布如下:

library(ggplot2)
library(bayesplot)
mcmc_trace(simu_multi_normal$draws(c("yhat[1,1]", "yhat[1,2]")),
  facet_args = list(
    labeller = ggplot2::label_parsed,
    strip.position = "top", ncol = 1
  )
) + theme_bw(base_size = 12)

mcmc_dens(simu_multi_normal$draws(c("yhat[1,1]", "yhat[1,2]")),
  facet_args = list(
    labeller = ggplot2::label_parsed,
    strip.position = "top", ncol = 1
  )
) + theme_bw(base_size = 12)
图 38.1: 采样序列的轨迹和分布
图 38.2: 采样序列的轨迹和分布

这就是一组来自二元正态分布的随机数。

mcmc_scatter(simu_multi_normal$draws(c("yhat[1,1]", "yhat[1,2]"))) +
  theme_bw(base_size = 12) +
  labs(x = expression(x[1]), y = expression(x[2]))
图 38.3: 生成二元正态分布的随机数

提取采样数据,整理成矩阵。

# 抽取原始采样数据
yhat <- simu_multi_normal$draws(c("yhat[1,1]", "yhat[1,2]"))
# 合并多条链
yhat_mean <- apply(yhat, c(1, 3), mean)
# 整理成二维矩阵
x <- as.matrix(yhat_mean)
# 样本均值
colMeans(x)
yhat[1,1] yhat[1,2] 
 3.076281  1.990465 
# 样本方差-协方差矩阵
var(x)
          yhat[1,1] yhat[1,2]
yhat[1,1]  4.152065  1.031544
yhat[1,2]  1.031544  1.035985

38.1.2 多元正态分布拟合

一般地,协方差矩阵的 Cholesky 分解的矩阵表示如下:

\[ \begin{aligned} \Sigma &= \begin{bmatrix} \sigma^2_1 & \rho_{12}\sigma_1\sigma_2 & \rho_{13}\sigma_1\sigma_3 \\ \rho_{12}\sigma_1\sigma_2 & \sigma_2^2 & \rho_{23}\sigma_2\sigma_3 \\ \rho_{13}\sigma_1\sigma_3 & \rho_{23}\sigma_2\sigma_3 & \sigma_3^2 \end{bmatrix} \\ & = \begin{bmatrix} \sigma_1 & 0 & 0 \\ 0 & \sigma_2 & 0 \\ 0 & 0 & \sigma_3 \end{bmatrix} \underbrace{ \begin{bmatrix} 1 & \rho_{12} & \rho_{13} \\ \rho_{12} & 1 & \rho_{23} \\ \rho_{13} & \rho_{23} & 1 \end{bmatrix} }_{R} \begin{bmatrix} \sigma_1 & 0 & 0 \\ 0 & \sigma_2 & 0 \\ 0 & 0 & \sigma_3 \end{bmatrix} \\ & = \begin{bmatrix} \sigma_1 & 0 & 0 \\ 0 & \sigma_2 & 0 \\ 0 & 0 & \sigma_3 \end{bmatrix} \underbrace{L_u L_u^{\top}}_{R} \begin{bmatrix} \sigma_1 & 0 & 0 \\ 0 & \sigma_2 & 0 \\ 0 & 0 & \sigma_3 \end{bmatrix} \end{aligned} \]

data {
  int<lower=1> N; // number of observations
  int<lower=1> K; // dimension of observations
  array[N] vector[K] y; // observations: a list of N vectors (each has K elements)
}
parameters {
  vector[K] mu;
  cholesky_factor_corr[K] Lcorr; // cholesky factor (L_u matrix for R)
  vector<lower=0>[K] sigma;
}
transformed parameters {
  corr_matrix[K] R; // correlation matrix
  cov_matrix[K] Sigma; // VCV matrix
  R = multiply_lower_tri_self_transpose(Lcorr); // R = Lcorr * Lcorr'
  Sigma = quad_form_diag(R, sigma); // quad_form_diag: diag_matrix(sig) * R * diag_matrix(sig)
}
model {
  sigma ~ cauchy(0, 5); // prior for sigma
  Lcorr ~ lkj_corr_cholesky(2.0); // prior for cholesky factor of a correlation matrix
  y ~ multi_normal(mu, Sigma);
}

代码中, 核心部分是关于多元正态分布的协方差矩阵的参数化,先将协方差矩阵中的方差和相关矩阵剥离,然后利用 Cholesky 分解将相关矩阵分解。在 Stan 里,这是一套高效的组合。

矩阵 \(L_u\) 是相关矩阵 \(R\) 的 Cholesky 分解的结果,在贝叶斯框架内,参数都是随机的,相关矩阵是一个随机矩阵,矩阵 \(L_u\) 是一个随机矩阵,它的分布用 Stan 代码表示为如下:

L ~ lkj_corr_cholesky(2.0); # implies L * L' ~ lkj_corr(2.0);

LKJ 分布有一个参数 \(\eta\) ,此处 \(\eta = 2\) ,意味着变量之间的相关性较弱,LKJ 分布的概率密度函数正比于相关矩阵的行列式的 \(\eta-1\) 次幂 \((\det{R})^{\eta-1}\),LKJ 分布的详细说明见Lewandowski-Kurowicka-Joe (LKJ) distribution

有了上面的背景知识,下面先在 R 环境中模拟一组来自多元正态分布的样本。

set.seed(20232023)
# 均值
mu <- c(1, 2, -5) 
# 相关矩阵 (R)
R <- matrix(c(
  1, 0.7, 0.2, 
  0.7, 1, -0.5,
  0.2, -0.5, 1
), 3)
# sd1 = 0.5, sd2 = 1.2, sd3 = 2.3
sigmas <- c(0.5, 1.2, 2.3) 
# 方差-协方差矩阵
Sigma <- diag(sigmas) %*% R %*% diag(sigmas) 
# 模拟 1000 个样本数据
dat <- MASS::mvrnorm(1000, mu = mu, Sigma = Sigma) 

根据 1000 个样本点,估计多元正态分布的均值参数和方差协方差参数。

# 来自多元正态分布的一组观测数据
multi_normal_chol_d <- list(
  N = 1000, # 样本量
  K = 3,    # 三维
  y = dat
)
# 编译多元正态分布模型
mod_multi_normal_chol <- cmdstan_model(
  stan_file = "code/multi_normal_fitted.stan",
  compile = TRUE, cpp_options = list(stan_threads = TRUE)
)
# 拟合多元正态分布模型
fit_multi_normal <- mod_multi_normal_chol$sample(
  data = multi_normal_chol_d,
  iter_warmup = 500,    # 每条链预处理迭代次数
  iter_sampling = 1000, # 每条链采样次数
  chains = 2,           # 马尔科夫链的数目
  parallel_chains = 1,  # 指定 CPU 核心数
  threads_per_chain = 1,  # 每条链设置一个线程
  show_messages = FALSE,  # 不显示迭代的中间过程
  refresh = 0,            # 不显示采样的进度
  output_dir = "data-raw/",
  seed = 20232023     # 设置随机数种子
)

均值向量 \(\bm{\mu}\) 和协方差矩阵 \(\Sigma\) 估计结果如下:

fit_multi_normal$summary(c("mu", "Sigma"), .num_args = list(sigfig = 3, notation = "dec"))
# A tibble: 12 × 10
   variable     mean median     sd    mad     q5    q95  rhat ess_bulk ess_tail
   <chr>       <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <dbl>    <dbl>    <dbl>
 1 mu[1]       1.02   1.02  0.0162 0.0154  0.999  1.05   1.00    1605.    1250.
 2 mu[2]       2.00   2.00  0.0385 0.0386  1.93   2.06   1.00    1300.    1138.
 3 mu[3]      -4.86  -4.86  0.0703 0.0724 -4.98  -4.75   1.00    1737.    1549.
 4 Sigma[1,1]  0.250  0.249 0.0110 0.0108  0.231  0.268  1.00    1847.    1336.
 5 Sigma[2,1]  0.427  0.426 0.0235 0.0234  0.388  0.466  1.00    1507.    1261.
 6 Sigma[3,1]  0.210  0.209 0.0361 0.0363  0.151  0.271  1.00    1472.    1202.
 7 Sigma[1,2]  0.427  0.426 0.0235 0.0234  0.388  0.466  1.00    1507.    1261.
 8 Sigma[2,2]  1.47   1.47  0.0649 0.0635  1.37   1.58   1.01    1506.    1265.
 9 Sigma[3,2] -1.41  -1.41  0.0955 0.0957 -1.57  -1.26   1.00    1643.    1572.
10 Sigma[1,3]  0.210  0.209 0.0361 0.0363  0.151  0.271  1.00    1472.    1202.
11 Sigma[2,3] -1.41  -1.41  0.0955 0.0957 -1.57  -1.26   1.00    1643.    1572.
12 Sigma[3,3]  5.28   5.27  0.225  0.221   4.92   5.67   1.00    1605.    1603.

均值向量 \(\bm{\mu} = (\mu_1,\mu_2,\mu_3)^{\top}\) 各个分量及其两两相关性,如下图所示。

mcmc_pairs(
  fit_multi_normal$draws(c("mu[1]", "mu[2]", "mu[3]")),
  diag_fun = "dens", off_diag_fun = "hex"
)
图 38.4: 三元正态分布

38.2 二维高斯过程

高斯过程定义

38.2.1 二维高斯过程模拟

二维高斯过程 \(\mathcal{S}\) 的均值向量为 0 向量,自协方差函数为指数型,如下

\[ \mathsf{Cov}\{S(x_i), S(x_j)\} = \sigma^2 \exp\big( -\frac{\|x_i -x_j\|_{2}}{\phi} \big) \]

其中,不妨设参数 \(\sigma = 10, \phi = 1\) 。模拟高斯过程的 Stan 代码如下

data {
  int<lower=1> N;
  int<lower=1> D;
  array[N] vector[D] X;
  vector[N] mu;
  real<lower=0> sigma;
  real<lower=0> phi;
}
transformed data {
  real delta = 1e-9;
  matrix[N, N] L;
  matrix[N, N] K = gp_exponential_cov(X, sigma, phi) + diag_matrix(rep_vector(delta, N));
  L = cholesky_decompose(K);
}
parameters {
  vector[N] eta;
}
model {
  eta ~ std_normal();
}
generated quantities {
  vector[N] y;
  y = mu + L * eta;
}

在二维规则网格上采样,采样点数量为 225。

n <- 15
gaussian_process_d <- list(
  N = n^2,
  D = 2,
  mu = rep(0, n^2),
  sigma = 10,
  phi = 1,
  X = expand.grid(x1 = 1:n / n, x2 = 1:n / n)
)
# 编译二维高斯过程模型
mod_gaussian_process_simu <- cmdstan_model(
  stan_file = "code/gaussian_process_simu.stan",
  compile = TRUE, cpp_options = list(stan_threads = TRUE)
)

模拟 1 个样本,因为是模拟数据,不需要设置多条链。

fit_multi_normal_gp <- mod_gaussian_process_simu$sample(
  data = gaussian_process_d,
  iter_warmup = 500,       # 每条链预处理迭代次数
  iter_sampling = 1000,    # 样本量
  chains = 1,              # 马尔科夫链的数目
  parallel_chains = 1,    # 指定 CPU 核心数
  threads_per_chain = 1,  # 每条链设置一个线程
  show_messages = FALSE,  # 不显示迭代的中间过程
  refresh = 0,            # 不显示采样的进度
  output_dir = "data-raw/",
  seed = 20232023         # 设置随机数种子
)

位置 1 和 2 处的随机变量的迭代轨迹,均值为 0 ,标准差 10 左右。

mcmc_trace(fit_multi_normal_gp$draws(c("y[1]", "y[2]")),
  facet_args = list(
    labeller = ggplot2::label_parsed,
    strip.position = "top", ncol = 1
  )
) + theme_bw(base_size = 12)
图 38.5: 位置 1 和 2 处的迭代轨迹

位置 1 处的随机变量及其分布

y1 <- fit_multi_normal_gp$draws(c("y[1]"), format = "draws_array")
# 合并链条结果
y1_mean <- apply(y1, c(1, 3), mean)
# y[1] 的方差
var(y1_mean)
         y[1]
y[1] 100.5436
# y[1] 的标准差
sd(y1_mean)
[1] 10.02714

100 次迭代获得 100 个样本点,每次迭代采集一个样本点,每个样本点是一个 225 维的向量。

# 抽取原始的采样数据
y_array <- fit_multi_normal_gp$draws(variables = "y", format = "array")
# 合并链条
y_mean <- apply(y_array, c(1, 3), mean)

从 100 次迭代中任意提取某一个样本点,比如预采样之后的第一次下迭代的结果,接着整理数据。

# 整理数据
sim_gp_data <- cbind.data.frame(gaussian_process_d$X, ysim = y_mean[1, ])

绘制二维高斯过程图形。

ggplot(data = sim_gp_data, aes(x = x1, y = x2)) +
  geom_point(aes(color = ysim)) +
  scale_color_distiller(palette = "Spectral") +
  theme_bw() +
  labs(x = expression(x[1]), y = expression(x[2]))
图 38.6: 二维高斯过程

38.2.2 二维高斯过程拟合

二维高斯过程拟合代码如下

data {
  int<lower=1> N;
  int<lower=1> D;
  array[N] vector[D] x;
  vector[N] y;
}
transformed data {
  real delta = 1e-9;
  vector[N] mu = rep_vector(0, N);
}
parameters {
  real<lower=0> phi;
  real<lower=0> sigma;
}
model {
  matrix[N, N] L_K;
  {
    matrix[N, N] K = gp_exponential_cov(x, sigma, phi) + diag_matrix(rep_vector(delta, N));
    L_K = cholesky_decompose(K);
  }
  
  phi ~ std_normal();
  sigma ~ std_normal();

  y ~ multi_normal_cholesky(mu, L_K);
}
# 二维高斯过程模型
gaussian_process_d <- list(
  D = 2,
  N = nrow(sim_gp_data), # 观测记录的条数
  x = sim_gp_data[, c("x1", "x2")],
  y = sim_gp_data[, "ysim"]
)

nchains <- 2
set.seed(20232023)
# 给每条链设置不同的参数初始值
inits_gaussian_process <- lapply(1:nchains, function(i) {
  list(
    sigma = runif(1),
    phi = runif(1)
  )
})

# 编译模型
mod_gaussian_process <- cmdstan_model(
  stan_file = "code/gaussian_process_fitted.stan",
  compile = TRUE, cpp_options = list(stan_threads = TRUE)
)

# 拟合二维高斯过程
fit_gaussian_process <- mod_gaussian_process$sample(
  data = gaussian_process_d,     # 观测数据
  init = inits_gaussian_process, # 迭代初值
  iter_warmup = 1000,   # 每条链预处理迭代次数
  iter_sampling = 2000, # 每条链总迭代次数
  chains = nchains,     # 马尔科夫链的数目
  parallel_chains = 2,  # 指定 CPU 核心数,可以给每条链分配一个
  threads_per_chain = 2, # 每条链设置一个线程
  show_messages = FALSE, # 不显示迭代的中间过程
  refresh = 0,           # 不显示采样的进度
  output_dir = "data-raw/",
  seed = 20232023        # 设置随机数种子,不要使用 set.seed() 函数
)
# 诊断
fit_gaussian_process$diagnostic_summary()
$num_divergent
[1] 0 0

$num_max_treedepth
[1] 0 0

$ebfmi
[1] 1.097944 1.083674

输出结果

fit_gaussian_process$summary()
# A tibble: 3 × 10
  variable     mean   median     sd    mad       q5      q95  rhat ess_bulk
  <chr>       <dbl>    <dbl>  <dbl>  <dbl>    <dbl>    <dbl> <dbl>    <dbl>
1 lp__     -354.    -354.    0.964  0.703  -356.    -353.     1.00    1395.
2 phi         0.308    0.302 0.0503 0.0486    0.235    0.400  1.00    1196.
3 sigma       5.25     5.24  0.389  0.375     4.65     5.94   1.00    1246.
# ℹ 1 more variable: ess_tail <dbl>

38.3 高斯过程回归

38.3.1 模型介绍

朗格拉普岛是位于太平洋上的一个小岛,因美国在比基尼群岛的氢弹核试验受到严重的核辐射影响,数十年之后,科学家登岛采集核辐射强度数据以评估当地居民重返该岛的可能性。朗格拉普岛是一个十分狭长且占地面积只有几平方公里的小岛。

根据 \({}^{137}\mathrm{Cs}\) 放出伽马射线,在 \(n=157\) 个采样点,分别以时间间隔 \(t_i\) 测量辐射量 \(y(x_i)\),建立泊松型空间广义线性混合效应模型(Diggle, Tawn, 和 Moyeed 1998)

\[ \begin{aligned} \log\{\lambda(x_i)\} & = \beta + S(x_{i})\\ y(x_{i}) &\sim \mathrm{Poisson}\big(t_i\lambda(x_i)\big) \end{aligned} \]

其中,\(\beta\) 表示截距,相当于平均水平,\(\lambda(x_i)\) 表示位置 \(x_i\) 处的辐射强度,\(S(x_{i})\) 表示位置 \(x_i\) 处的空间效应,\(S(x),x \in \mathcal{D} \subset{\mathbb{R}^2}\) 是二维平稳空间高斯过程 \(\mathcal{S}\) 的具体实现。 \(\mathcal{D}\) 表示研究区域,可以理解为朗格拉普岛,它是二维实平面 \(\mathbb{R}^2\) 的子集。

随机过程 \(S(x)\) 的自协方差函数常用的有指数型、幂二次指数型(高斯型)和梅隆型,形式如下:

\[ \begin{aligned} \mathsf{Cov}\{S(x_i), S(x_j)\} &= \sigma^2 \exp\big( -\frac{\|x_i -x_j\|_{2}}{\phi} \big) \\ \mathsf{Cov}\{ S(x_i), S(x_j) \} &= \sigma^2 \exp\big( -\frac{\|x_i -x_j\|_{2}^{2}}{2\phi^2} \big) \\ \mathsf{Cov}\{ S(x_i), S(x_j) \} &= \sigma^2 \frac{2^{1 - \nu}}{\Gamma(\nu)} \left(\sqrt{2\nu}\frac{\|x_i -x_j\|_{2}}{\phi}\right)^{\nu} K_{\nu}\left(\sqrt{2\nu}\frac{\|x_i -x_j\|_{2}}{\phi}\right) \\ K_{\nu}(x) &= \int_{0}^{\infty}\exp(-x \cosh t) \cosh (\nu t) \mathrm{dt} \end{aligned} \]

待估参数:代表方差的 \(\sigma^2\) 和代表范围的 \(\phi\) 。当 \(\nu = 1/2\) 时,梅隆型退化为指数型。

38.3.2 观测数据

# 加载数据
rongelap <- readRDS(file = "data/rongelap.rds")
rongelap_coastline <- readRDS(file = "data/rongelap_coastline.rds")
# 准备输入数据
rongelap_poisson_d <- list(
  N = nrow(rongelap), # 观测记录的条数
  D = 2, # 2 维坐标
  X = rongelap[, c("cX", "cY")] / 6000, # N x 2 矩阵
  y = rongelap$counts, # 响应变量
  offsets = rongelap$time # 漂移项
)
# 准备参数初始化数据
set.seed(20232023)
nchains <- 2 # 2 条迭代链
inits_data_poisson <- lapply(1:nchains, function(i) {
  list(
    beta = rnorm(1),
    sigma = runif(1),
    phi = runif(1),
    lambda = rnorm(157)
  )
})

38.3.3 预测数据

预测未采样的位置的核辐射强度,根据海岸线数据网格化全岛,以格点代表未采样的位置

library(sf)
library(abind)
library(stars)
# 类型转化
rongelap_sf <- st_as_sf(rongelap, coords = c("cX", "cY"), dim = "XY")
rongelap_coastline_sf <- st_as_sf(rongelap_coastline, coords = c("cX", "cY"), dim = "XY")
rongelap_coastline_sfp <- st_cast(st_combine(st_geometry(rongelap_coastline_sf)), "POLYGON")
# 添加缓冲区
rongelap_coastline_buffer <- st_buffer(rongelap_coastline_sfp, dist = 50)
# 构造带边界约束的网格
rongelap_coastline_grid <- st_make_grid(rongelap_coastline_buffer, n = c(150, 75))
# 将 sfc 类型转化为 sf 类型
rongelap_coastline_grid <- st_as_sf(rongelap_coastline_grid)
rongelap_coastline_buffer <- st_as_sf(rongelap_coastline_buffer)
rongelap_grid <- rongelap_coastline_grid[rongelap_coastline_buffer, op = st_intersects]
# 计算网格中心点坐标
rongelap_grid_centroid <- st_centroid(rongelap_grid)
# 共计 1612 个预测点
rongelap_grid_df <- as.data.frame(st_coordinates(rongelap_grid_centroid))
colnames(rongelap_grid_df) <- c("cX", "cY")

未采样的位置 rongelap_grid_df

head(rongelap_grid_df)
         cX        cY
1 -5685.942 -3606.997
2 -5643.145 -3606.997
3 -5600.347 -3606.997
4 -5557.549 -3606.997
5 -5514.751 -3606.997
6 -5471.953 -3606.997

朗格拉普岛网格化生成格点

ggplot() +
  geom_point(data = rongelap_grid_df, aes(x = cX, y = cY), cex = 0.3) +
  geom_path(data = rongelap_coastline, aes(x = cX, y = cY)) +
  coord_fixed() +
  theme_bw() +
  labs(x = "横坐标(米)", y = "纵坐标(米)")
图 38.7: 朗格拉普岛

38.3.4 模型编码

指定各个参数 \(\beta,\sigma,\phi\) 的先验分布

\[ \begin{aligned} \beta &\sim \mathrm{std\_normal}(0,1) \\ \sigma &\sim \mathrm{inv\_gamma}(5,5) \\ \phi &\sim \mathrm{half\_std\_normal}(0,1) \\ \bm{\lambda} | \beta,\sigma &\sim \mathrm{multivariate\_normal}(\bm{\beta}, \sigma^2 \Sigma) \\ \bm{y} | \bm{\lambda} &\sim \mathrm{poisson\_log}\big(\log(\text{offsets})+\bm{\lambda}\big) \end{aligned} \]

其中,\(\beta,\sigma,\phi,\Sigma\) 的含义同前,\(\lambda\) 代表辐射强度,\(\mathrm{offsets}\) 代表漂移项,这里是时间段,\(\bm{y}\) 表示观测的辐射粒子数,\(\mathrm{poisson\_log}\) 表示泊松分布的对数参数化,将频率参数 rate 的对数 \(\lambda\) 作为参数,详见 Stan 函数手册中泊松分布的对数函数表示

data {
  int<lower=1> N;
  int<lower=1> D;
  array[N] vector[D] X;
  array[N] int<lower = 0> y;
  vector[N] offsets;
}
transformed data {
  real delta = 1e-12;
  vector[N] log_offsets = log(offsets);
}
parameters {
  real beta;
  real<lower=0> sigma;
  real<lower=0> phi;
  vector[N] lambda;
}
transformed parameters {
  vector[N] mu = rep_vector(beta, N);
}
model {
  matrix[N, N] L_K;
  {
    matrix[N, N] K = gp_exponential_cov(X, sigma, phi) + diag_matrix(rep_vector(delta, N));
    L_K = cholesky_decompose(K);
  }
  
  beta ~ std_normal();
  sigma ~ inv_gamma(5, 5);
  phi ~ std_normal();
  
  lambda ~ multi_normal_cholesky(mu, L_K);
  y ~ poisson_log(log_offsets + lambda);
}
# 编译模型
mod_rongelap_poisson <- cmdstan_model(
  stan_file = "code/rongelap_poisson_processes.stan",
  compile = TRUE, cpp_options = list(stan_threads = TRUE)
)
# 泊松对数模型
fit_rongelap_poisson <- mod_rongelap_poisson$sample(
  data = rongelap_poisson_d,  # 观测数据
  init = inits_data_poisson,  # 迭代初值
  iter_warmup = 500,    # 每条链预处理迭代次数
  iter_sampling = 1000, # 每条链总迭代次数
  chains = nchains,     # 马尔科夫链的数目
  parallel_chains = 2,  # 指定 CPU 核心数,可以给每条链分配一个
  threads_per_chain = 2, # 每条链设置一个线程
  show_messages = FALSE, # 不显示迭代的中间过程
  refresh = 0,           # 不显示采样的进度
  output_dir = "data-raw/",
  seed = 20232023
)
# 诊断
fit_rongelap_poisson$diagnostic_summary()
$num_divergent
[1] 0 0

$num_max_treedepth
[1] 0 0

$ebfmi
[1] 1.078629 1.048734
# 泊松对数模型
fit_rongelap_poisson$summary(
  variables = c("lp__", "beta", "sigma", "phi"),
  .num_args = list(sigfig = 3, notation = "dec")
)
# A tibble: 4 × 10
  variable         mean       median     sd      mad           q5          q95
  <chr>           <dbl>        <dbl>  <dbl>    <dbl>        <dbl>        <dbl>
1 lp__     3402193.     3402190      9.44   14.8     3402180      3402210     
2 beta           1.78         1.79   0.139   0.116         1.56         1.97  
3 sigma          0.633        0.611  0.112   0.0749        0.516        0.817 
4 phi            0.0267       0.0233 0.0158  0.00780       0.0144       0.0468
# ℹ 3 more variables: rhat <dbl>, ess_bulk <dbl>, ess_tail <dbl>
# 参数的迭代轨迹
mcmc_trace(fit_rongelap_poisson$draws(c("sigma", "phi")),
           facet_args = list(
             labeller = ggplot2::label_parsed,
             strip.position = "top", ncol = 1
           )
) + theme_bw(base_size = 12)
图 38.8: \(\sigma\)\(\phi\) 的迭代轨迹
# 参数的后验分布
mcmc_dens(fit_rongelap_poisson$draws(c("sigma", "phi")),
          facet_args = list(
            labeller = ggplot2::label_parsed,
            strip.position = "top", ncol = 1
          )
) + theme_bw(base_size = 12)
图 38.9: \(\sigma\)\(\phi\) 的后验分布

38.3.5 预测分布

核辐射预测模型的 Stan 代码

functions {
  vector gp_pred_rng(array[] vector x2,
                     vector lambda,
                     array[] vector x1,
                     real beta,
                     real sigma,
                     real phi,
                     real delta) {
    int N1 = rows(lambda);
    int N2 = size(x2);
    vector[N2] f2;
    {
      matrix[N1, N1] L_K;
      vector[N1] K_div_lambda;
      matrix[N1, N2] k_x1_x2;
      matrix[N1, N2] v_pred;
      vector[N2] f2_mu;
      matrix[N2, N2] cov_f2;
      matrix[N2, N2] diag_delta;
      matrix[N1, N1] K;
      K = gp_exponential_cov(x1, sigma, phi);
      L_K = cholesky_decompose(K);
      K_div_lambda = mdivide_left_tri_low(L_K, lambda - beta);
      K_div_lambda = mdivide_right_tri_low(K_div_lambda', L_K)';
      k_x1_x2 = gp_exponential_cov(x1, x2, sigma, phi);
      f2_mu = beta + (k_x1_x2' * K_div_lambda);
      v_pred = mdivide_left_tri_low(L_K, k_x1_x2);
      cov_f2 = gp_exponential_cov(x2, sigma, phi) - v_pred' * v_pred;
      diag_delta = diag_matrix(rep_vector(delta, N2));

      f2 = multi_normal_rng(f2_mu, cov_f2 + diag_delta);
    }
    return f2;
  }
}
data {
  int<lower=1> D;
  int<lower=1> N1;
  array[N1] vector[D] x1;
  array[N1] int<lower = 0> y1;
  vector[N1] offsets1;
  int<lower=1> N2;
  array[N2] vector[D] x2;
  vector[N2] offsets2;
}
transformed data {
  real delta = 1e-12;
  vector[N1] log_offsets1 = log(offsets1);
  vector[N2] log_offsets2 = log(offsets2);
  
  int<lower=1> N = N1 + N2;
  array[N] vector[D] x;
  
  for (n1 in 1:N1) {
    x[n1] = x1[n1];
  }
  for (n2 in 1:N2) {
    x[N1 + n2] = x2[n2];
  }
}
parameters {
  real beta;
  real<lower=0> sigma;
  real<lower=0> phi;
  vector[N1] lambda1;
}
transformed parameters {
  vector[N1] mu = rep_vector(beta, N1);
}
model {
  matrix[N1, N1] L_K;
  {
    matrix[N1, N1] K = gp_exponential_cov(x1, sigma, phi) + diag_matrix(rep_vector(delta, N1));
    L_K = cholesky_decompose(K);
  }

  beta ~ std_normal();
  sigma ~ inv_gamma(5, 5);
  phi ~ std_normal();
  
  lambda1 ~ multi_normal_cholesky(mu, L_K);
  y1 ~ poisson_log(log_offsets1 + lambda1);
}
generated quantities {
  vector[N1] yhat;     // Posterior predictions for each location
  vector[N1] log_lik;  // Log likelihood for each location
  vector[N1] RR1 = log_offsets1 + lambda1;
  
  for(n in 1:N1) {
    log_lik[n] = poisson_log_lpmf(y1[n] | RR1[n]);
    yhat[n] = poisson_log_rng(RR1[n]);
  }
  
  vector[N2] ypred; 
  vector[N2] lambda2 = gp_pred_rng(x2, lambda1, x1, beta, sigma, phi, delta);
  vector[N2] RR2 = log_offsets2 + lambda2;
  
  for(n in 1:N2) {
    ypred[n] = poisson_log_rng(RR2[n]);
  }
}

准备数据、拟合模型

# 固定漂移项
rongelap_grid_df$time <- 100
# 对数高斯模型
rongelap_poisson_pred_d <- list(
  D = 2,
  N1 = nrow(rongelap), # 观测记录的条数
  x1 = rongelap[, c("cX", "cY")] / 6000,
  y1 = rongelap[, "counts"],
  offsets1 = rongelap[, "time"],
  N2 = nrow(rongelap_grid_df), # 2 维坐标
  x2 = rongelap_grid_df[, c("cX", "cY")] / 6000,
  offsets2 = rongelap_grid_df[, "time"]
)
# 迭代链数目
nchains <- 2
# 给每条链设置不同的参数初始值
inits_data_poisson_pred <- lapply(1:nchains, function(i) {
  list(
    beta = rnorm(1),
    sigma = runif(1),
    phi = runif(1),
    lambda = rnorm(157)
  )
})
# 编译模型
mod_rongelap_poisson_pred <- cmdstan_model(
  stan_file = "code/rongelap_poisson_pred.stan",
  compile = TRUE, cpp_options = list(stan_threads = TRUE)
)
# 泊松模型
fit_rongelap_poisson_pred <- mod_rongelap_poisson_pred$sample(
  data = rongelap_poisson_pred_d,   # 观测数据
  init = inits_data_poisson_pred,   # 迭代初值
  iter_warmup = 500,            # 每条链预处理迭代次数
  iter_sampling = 1000,         # 每条链总迭代次数
  chains = nchains,             # 马尔科夫链的数目
  parallel_chains = 2,      # 指定 CPU 核心数,可以给每条链分配一个
  threads_per_chain = 2,    # 每条链设置一个线程
  show_messages = FALSE,    # 不显示迭代的中间过程
  refresh = 0,              # 不显示采样的进度
  output_dir = "data-raw/",
  seed = 20232023           # 设置随机数种子,不要使用 set.seed() 函数
)
# 诊断信息
fit_rongelap_poisson_pred$diagnostic_summary()
$num_divergent
[1] 0 0

$num_max_treedepth
[1] 0 0

$ebfmi
[1] 1.096103 1.065450

参数的后验估计

fit_rongelap_poisson_pred$summary(variables = c("beta", "sigma", "phi"))
# A tibble: 3 × 10
  variable   mean median     sd     mad     q5    q95  rhat ess_bulk ess_tail
  <chr>     <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl> <dbl>    <dbl>    <dbl>
1 beta     1.77   1.79   0.145  0.120   1.53   1.97    1.00    1423.     647.
2 sigma    0.638  0.610  0.116  0.0789  0.515  0.851   1.01     855.     474.
3 phi      0.0271 0.0233 0.0152 0.00787 0.0141 0.0513  1.00     763.     539.

模型评估 LOO-CV

fit_rongelap_poisson_pred$loo(variables = "log_lik", cores = 2)
Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.

Computed from 2000 by 157 log-likelihood matrix

         Estimate  SE
elpd_loo   -933.7 3.9
p_loo       119.2 2.8
looic      1867.4 7.8
------
Monte Carlo SE of elpd_loo is NA.

Pareto k diagnostic values:
                         Count Pct.    Min. n_eff
(-Inf, 0.5]   (good)       0    0.0%   <NA>      
 (0.5, 0.7]   (ok)        18   11.5%   58        
   (0.7, 1]   (bad)      110   70.1%   8         
   (1, Inf)   (very bad)  29   18.5%   5         
See help('pareto-k-diagnostic') for details.

检查辐射强度分布的拟合效果

# 抽取 yrep 数据
yrep <- fit_rongelap_poisson_pred$draws(variables = "yhat", format = "draws_matrix")
# Posterior predictive checks
pp_check(rongelap$counts / rongelap$time,
  yrep = sweep(yrep[1:50, ], MARGIN = 2, STATS = rongelap$time, FUN = `/`),
  fun = ppc_dens_overlay
) +
  theme_classic()
图 38.10: 后验预测诊断图(密度图)

后 1000 次迭代是平稳的,可取任意一个链条的任意一次迭代,获得采样点处的预测值

yhat_array <- fit_rongelap_poisson_pred$draws(variables = "yhat", format = "array")
lambda1_array <- fit_rongelap_poisson_pred$draws(variables = "lambda1", format = "array")
rongelap_sf$lambda <- as.vector(lambda1_array[1,1,])
rongelap_sf$yhat <- as.vector(yhat_array[1,1,])

数据集 rongelap_sf 的概况

rongelap_sf
Simple feature collection with 157 features and 4 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -6050 ymin: -3430 xmax: -50 ymax: 0
CRS:           NA
First 10 features:
   counts time            geometry    lambda yhat
1      75  300 POINT (-6050 -3270) -1.220190   90
2     371  300 POINT (-6050 -3165)  0.183427  361
3    1931  300 POINT (-5925 -3320)  1.842720 1918
4    4357  300 POINT (-5925 -3165)  2.670960 4370
5    2114  300 POINT (-5800 -3350)  1.951680 2136
6    2318  300 POINT (-5800 -3165)  2.053260 2324
7    1975  300 POINT (-5625 -3350)  1.877460 2006
8    1912  300 POINT (-5700 -3260)  1.857760 1940
9    1902  300 POINT (-5700 -3220)  1.815000 1881
10   1882  300 POINT (-5700 -3180)  1.840230 1837

观测值和预测值的情况

summary(rongelap_sf$counts / rongelap_sf$time)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.250   5.890   7.475   7.604   9.363  15.103 
summary(rongelap_sf$yhat / rongelap_sf$time)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.300   5.820   7.373   7.609   9.573  15.383 

展示采样点处的预测值

代码
ggplot(data = rongelap_sf)+
  geom_sf(aes(color = yhat / time), cex = 0.5) +
  scale_colour_viridis_c(option = "C", breaks = 3*0:5,
    guide = guide_colourbar(
      barwidth = 15, barheight = 1.5,
      title.position = "top" # 图例标题位于图例上方
    )) +
  theme_bw() +
  labs(x = "横坐标(米)", y = "纵坐标(米)", colour = "辐射强度") +
  theme(
    legend.position = c(0.75, 0.1),
    legend.direction = "horizontal",
    legend.background = element_blank()
  )
图 38.11: 朗格拉普岛核辐射强度的分布

未采样点的预测

# 后验估计
ypred_tbl <- fit_rongelap_poisson_pred$summary(variables = "ypred", "mean")
rongelap_grid_df$ypred <- ypred_tbl$mean
# 查看预测结果
head(rongelap_grid_df)
         cX        cY time    ypred
1 -5685.942 -3606.997  100 748.5820
2 -5643.145 -3606.997  100 764.2190
3 -5600.347 -3606.997  100 769.1420
4 -5557.549 -3606.997  100 783.6340
5 -5514.751 -3606.997  100 798.8565
6 -5471.953 -3606.997  100 819.4740
# 预测值的分布范围
summary(rongelap_grid_df$ypred / rongelap_grid_df$time)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.495   6.194   7.411   7.311   8.531  12.806 

转化数据类型,去掉缓冲区内的预测位置,准备绘制辐射强度预测值的分布

rongelap_grid_sf <- st_as_sf(rongelap_grid_df, coords = c("cX", "cY"), dim = "XY")
rongelap_grid_stars <- st_rasterize(rongelap_grid_sf, nx = 150, ny = 75)
rongelap_stars <- st_crop(x = rongelap_grid_stars, y = rongelap_coastline_sfp)
代码
# 虚线框数据
dash_sfp <- st_polygon(x = list(rbind(
  c(-6000, -3600),
  c(-6000, -2600),
  c(-5000, -2600),
  c(-5000, -3600),
  c(-6000, -3600)
)), dim = "XY")
# 主体内容
p3 <- ggplot() +
  geom_stars(
    data = rongelap_stars, na.action = na.omit,
    aes(fill = ypred / time)
  ) +
  # 海岸线
  geom_sf(
    data = rongelap_coastline_sfp,
    fill = NA, color = "gray30", linewidth = 0.5
  ) +
  # 图例
  scale_fill_viridis_c(
    option = "C", breaks = 0:13,
    guide = guide_colourbar(
      barwidth = 15, barheight = 1.5,
      title.position = "top" # 图例标题位于图例上方
    )
  ) +
  # 虚线框
  geom_sf(data = dash_sfp, fill = NA, linewidth = 0.75, lty = 2) +
  # 箭头
  geom_segment(
    data = data.frame(x = -5500, xend = -5000, y = -2600, yend = -2250),
    aes(x = x, y = y, xend = xend, yend = yend),
    arrow = arrow(length = unit(0.03, "npc"))
  ) +
  theme_bw() +
  labs(x = "横坐标(米)", y = "纵坐标(米)", fill = "辐射强度") +
  theme(
    legend.position = c(0.75, 0.1),
    legend.direction = "horizontal",
    legend.background = element_blank()
  )

p4 <- ggplot() +
  geom_stars(
    data = rongelap_stars, na.action = na.omit,
    aes(fill = ypred / time), show.legend = FALSE
  ) +
  geom_sf(
    data = rongelap_coastline_sfp,
    fill = NA, color = "gray30", linewidth = 0.75
  ) +
  scale_fill_viridis_c(option = "C", breaks = 0:13) +
  # 虚线框
  geom_sf(data = dash_sfp, fill = NA, linewidth = 0.75, lty = 2) +
  theme_void() +
  coord_sf(expand = FALSE, xlim = c(-6000, -5000), ylim = c(-3600, -2600))
# 叠加图形
p3
print(p4, vp = grid::viewport(x = .3, y = .65, width = .45, height = .45))
图 38.12: 朗格拉普岛核辐射强度的分布

38.4 总结

从模型是否含有块金效应、不同的自相关函数和参数估计方法等方面比较。

library(nlme)
# 高斯分布、指数型自相关结构
fit_exp_reml <- gls(log(counts / time) ~ 1,
  correlation = corExp(value = 200, form = ~ cX + cY, nugget = FALSE),
  data = rongelap, method = "REML"
)
fit_exp_ml <- gls(log(counts / time) ~ 1,
  correlation = corExp(value = 200, form = ~ cX + cY, nugget = FALSE),
  data = rongelap, method = "ML"
)
fit_exp_reml_nugget <- gls(log(counts / time) ~ 1,
  correlation = corExp(value = c(200, 0.1), form = ~ cX + cY, nugget = TRUE),
  data = rongelap, method = "REML"
)
fit_exp_ml_nugget <- gls(log(counts / time) ~ 1,
  correlation = corExp(value = c(200, 0.1), form = ~ cX + cY, nugget = TRUE),
  data = rongelap, method = "ML"
)

# 高斯分布、高斯型自相关结构
fit_gaus_reml <- gls(log(counts / time) ~ 1,
  correlation = corGaus(value = 200, form = ~ cX + cY, nugget = FALSE),
  data = rongelap, method = "REML"
)
fit_gaus_ml <- gls(log(counts / time) ~ 1,
  correlation = corGaus(value = 200, form = ~ cX + cY, nugget = FALSE),
  data = rongelap, method = "ML"
)
fit_gaus_reml_nugget <- gls(log(counts / time) ~ 1,
  correlation = corGaus(value = c(200, 0.1), form = ~ cX + cY, nugget = TRUE),
  data = rongelap, method = "REML"
)
fit_gaus_ml_nugget <- gls(log(counts / time) ~ 1,
  correlation = corGaus(value = c(200, 0.1), form = ~ cX + cY, nugget = TRUE),
  data = rongelap, method = "ML"
)

汇总结果见下表。

表格 38.1: 不同模型与参数估计方法的比较
响应变量分布 空间自相关结构 块金效应 估计方法 \(\beta\) \(\sigma^2\) \(\phi\) 对数似然值
高斯分布 指数型 REML 1.826 0.3172 110.8 -89.07
高斯分布 指数型 ML 1.828 0.3064 105.4 -87.56
高斯分布 指数型 0.03598 REML 1.813 0.2935 169.7472 -88.22
高斯分布 指数型 0.03312 ML 1.828 0.2779 150.1324 -86.88
高斯分布 高斯型 REML 1.878 0.2523 41.96 -100.7
高斯分布 高斯型 ML 1.879 0.25 41.81 -98.62
高斯分布 高斯型 0.07055 REML 1.831 0.2532 139.1431 -84.91
高斯分布 高斯型 0.07053 ML 1.832 0.2459 137.0980 -83.32

相比于其他参数,REML 和 ML 估计方法对参数 \(\phi\) 影响很大,ML 估计的 \(\phi\) 和对数似然函数值更大。高斯型自相关结构中,REML 和 ML 估计方法对参数 \(\phi\) 的估计结果差不多。函数 gls() 对初值要求不高,以上初值选取比较随意,只是符合要求函数定义。

对普通用户来说,想要流畅地使用 Stan 框架,需要面对很多挑战。

  1. 软件安装和配置过程复杂。rstan 包内置的 Stan 版本常低于最新发布的 Stan 版本。
  2. 编译和运行模型的参数控制选项很多。编译模型,OpenCL 和多线程支持,HMC(NUTS)、L-BFGS 和 VI 三大推理算法的参数设置
  3. 模型参数先验分布设置技巧高。模型参数的先验对数据的依赖非常高,仅对线性和广义线性模型依赖较小。即使是面对模拟的简单广义线性混合效应模型,抽样过程也发散严重。
  4. 面对大规模数据扩展困难。以朗格拉普岛的核污染预测任务为例,处理 157 维的积分显得吃力,对 1600 个参数的后验分布模拟和推断低效。

2020 年 Stan 大会 Wade Brorsen 介绍采用 Stan 实现的贝叶斯克里金(Kriging)平滑算法估计和预测各郡县的作物产量。Stan 实现的贝叶斯空间分层正态模型,回归参数随空间区域位置变化,参数的先验分布与空间区域相关,引入大量带超参数的先验分布,运行效率不高,跑模型花费很多时间。假定所有的参数随空间位置变化,模型参数个数瞬间爆炸,跑模型花费 31 天 (Niyizibi, Brorsen, 和 Park 2018)

Stan 总有些优势吧!

  • Stan 非常灵活。Stan 同时是一门概率编程语言,只要统计模型可以被 Stan 编码,理论上就可以编译、运行、获得结果。
  • Stan 功能很多。Stan 还可以解刚性的常微分方程、积分方程等。 非常灵活,非常适合学术研究工作者,计算层面,可以方便地在前人的工作上扩展。
  • Stan 文档很全。函数手册 提供 Stan 内建的各类函数说明。编程手册 提供 Stan 编程语法、程序块的说明,教用户如何使用 Stan 写代码。用户手册 提供 Stan 支持的各类统计模型、代数和微分方程的使用示例。

38.5 习题

  1. 对核辐射污染数据,建立对数高斯过程模型,用 Stan 编码模型,预测全岛的核辐射强度分布。

    \[ \begin{aligned} \beta &\sim \mathrm{std\_normal}(0,1) \\ \sigma &\sim \mathrm{inv\_gamma}(5,5) \\ \phi &\sim \mathrm{half\_std\_normal}(0,1) \\ \tau &\sim \mathrm{half\_std\_normal}(0,1) \\ \bm{y} &\sim \mathrm{multivariate\_normal}(\bm{\beta}, \sigma^2 \Sigma+ \tau^2 I) \end{aligned} \]

    其中,\(\beta\) 代表截距,先验分布为标准正态分布,\(\sigma\) 代表高斯过程的方差参数(信号),先验分布为逆伽马分布,\(\phi\) 代表高斯过程的范围参数,先验分布为半标准正态分布,\(y\) 代表辐射强度的对数,给定参数和数据的条件分布为多元正态分布,\(\Sigma\) 代表协方差矩阵,\(I\) 代表与采样点数量相同的单位矩阵, \(\tau^2\) 是块金效应。

    functions {
      vector gp_pred_rng(array[] vector x2,
                         vector y1,
                         array[] vector x1,
                         real sigma,
                         real phi,
                         real tau,
                         real delta) {
        int N1 = rows(y1);
        int N2 = size(x2);
        vector[N2] f2;
        {
          matrix[N1, N1] L_K;
          vector[N1] K_div_y1;
          matrix[N1, N2] k_x1_x2;
          matrix[N1, N2] v_pred;
          vector[N2] f2_mu;
          matrix[N2, N2] cov_f2;
          matrix[N2, N2] diag_delta;
          matrix[N1, N1] K;
          K = gp_exponential_cov(x1, sigma, phi);
          for (n in 1:N1) {
            K[n, n] = K[n, n] + square(tau);
          }
          L_K = cholesky_decompose(K);
          K_div_y1 = mdivide_left_tri_low(L_K, y1);
          K_div_y1 = mdivide_right_tri_low(K_div_y1', L_K)';
          k_x1_x2 = gp_exponential_cov(x1, x2, sigma, phi);
          f2_mu = (k_x1_x2' * K_div_y1);
          v_pred = mdivide_left_tri_low(L_K, k_x1_x2);
          cov_f2 = gp_exponential_cov(x2, sigma, phi) - v_pred' * v_pred;
          diag_delta = diag_matrix(rep_vector(delta, N2));
    
          f2 = multi_normal_rng(f2_mu, cov_f2 + diag_delta);
        }
        return f2;
      }
    }
    data {
      int<lower=1> D;
      int<lower=1> N1;
      array[N1] vector[D] x1;
      vector[N1] y1;
      int<lower=1> N2;
      array[N2] vector[D] x2;
    }
    transformed data {
      real delta = 1e-9;
    }
    parameters {
      real beta;
      real<lower=0> phi;
      real<lower=0> sigma;
      real<lower=0> tau;
    }
    transformed parameters {
      vector[N1] mu = rep_vector(beta, N1);
    }
    model {
      matrix[N1, N1] L_K;
      {
        matrix[N1, N1] K = gp_exponential_cov(x1, sigma, phi);
        real sq_tau = square(tau);
    
        // diagonal elements
        for (n1 in 1:N1) {
          K[n1, n1] = K[n1, n1] + sq_tau;
        }
    
        L_K = cholesky_decompose(K);
      }
    
      beta ~ std_normal();
      phi ~ std_normal();
      sigma ~ inv_gamma(5, 5);
      tau ~ std_normal();
    
      y1 ~ multi_normal_cholesky(mu, L_K);
    }
    generated quantities {
      vector[N2] f2;
      vector[N2] ypred;
    
      f2 = gp_pred_rng(x2, y1, x1, sigma, phi, tau, delta);
      for (n2 in 1:N2) {
        ypred[n2] = normal_rng(f2[n2], tau);
      }
    }

    代码中,gp_exponential_cov 表示空间相关性结构选择了指数型,详见 Stan 函数手册中的指数型核函数表示cholesky_decompose 表示对协方差矩阵做 Cholesky 分解,分解出来的下三角矩阵作为多元正态分布的参数,详见 Stan 函数手册中的 Cholesky 分解multi_normal_cholesky 表示基于 Cholesky 分解的多元正态分布。详见 Stan 函数手册中的多元正态分布的 Cholesky 参数化表示。

    代码
    set.seed(20232023)
    nchains <- 2 # 2 条迭代链
    # 给每条链设置不同的参数初始值
    inits_data_gaussian <- lapply(1:nchains, function(i) {
      list(
        beta = rnorm(1),
        sigma = runif(1),
        phi = runif(1),
        tau = runif(1)
      )
    })
    
    # 对数高斯模型
    rongelap_gaussian_d <- list(
      N1 = nrow(rongelap), # 观测记录的条数
      N2 = nrow(rongelap_grid_df),
      D = 2, # 2 维坐标
      x1 = rongelap[, c("cX", "cY")] / 6000, # N x 2 坐标矩阵
      x2 = rongelap_grid_df[, c("cX", "cY")] / 6000,
      y1 = log(rongelap$counts / rongelap$time) # N 向量
    )
    # 编码
    mod_rongelap_gaussian <- cmdstan_model(
      stan_file = "code/gaussian_process_pred.stan",
      compile = TRUE, cpp_options = list(stan_threads = TRUE)
    )
    
    # 对数高斯模型
    fit_rongelap_gaussian <- mod_rongelap_gaussian$sample(
      data = rongelap_gaussian_d,   # 观测数据
      init = inits_data_gaussian,   # 迭代初值
      iter_warmup = 500,            # 每条链预处理迭代次数
      iter_sampling = 1000,         # 每条链总迭代次数
      chains = nchains,             # 马尔科夫链的数目
      parallel_chains = 2,      # 指定 CPU 核心数,可以给每条链分配一个
      threads_per_chain = 1,    # 每条链设置一个线程
      show_messages = FALSE,    # 不显示迭代的中间过程
      refresh = 0,              # 不显示采样的进度
      output_dir = "data-raw/",
      seed = 20232023           # 设置随机数种子,不要使用 set.seed() 函数
    )
    
    # 诊断
    fit_rongelap_gaussian$diagnostic_summary()
    # 对数高斯模型
    fit_rongelap_gaussian$summary(
      variables = c("lp__", "beta", "sigma", "phi", "tau"),
      .num_args = list(sigfig = 4, notation = "dec")
    )
    
    # 未采样的位置的核辐射强度预测值
    ypred <- fit_rongelap_gaussian$summary(variables = "ypred", "mean")
    # 预测值
    rongelap_grid_df$ypred <- exp(ypred$mean)
    # 整理数据
    rongelap_grid_sf <- st_as_sf(rongelap_grid_df, coords = c("cX", "cY"), dim = "XY")
    rongelap_grid_stars <- st_rasterize(rongelap_grid_sf, nx = 150, ny = 75)
    rongelap_stars <- st_crop(x = rongelap_grid_stars, y = rongelap_coastline_sfp)
    
    # 虚线框数据
    dash_sfp <- st_polygon(x = list(rbind(
      c(-6000, -3600),
      c(-6000, -2600),
      c(-5000, -2600),
      c(-5000, -3600),
      c(-6000, -3600)
    )), dim = "XY")
    # 主体内容
    p3 <- ggplot() +
      geom_stars(
        data = rongelap_stars, na.action = na.omit,
        aes(fill = ypred)
      ) +
      # 海岸线
      geom_sf(
        data = rongelap_coastline_sfp,
        fill = NA, color = "gray30", linewidth = 0.5
      ) +
      # 图例
      scale_fill_viridis_c(
        option = "C", breaks = 0:12,
        guide = guide_colourbar(
          barwidth = 15, barheight = 1.5,
          title.position = "top" # 图例标题位于图例上方
        )
      ) +
      # 虚线框
      geom_sf(data = dash_sfp, fill = NA, linewidth = 0.75, lty = 2) +
      # 箭头
      geom_segment(
        data = data.frame(x = -5500, xend = -5000, y = -2600, yend = -2250),
        aes(x = x, y = y, xend = xend, yend = yend),
        arrow = arrow(length = unit(0.03, "npc"))
      ) +
      theme_bw() +
      labs(x = "横坐标(米)", y = "纵坐标(米)", fill = "辐射强度") +
      theme(
        legend.position = c(0.75, 0.1),
        legend.direction = "horizontal",
        legend.background = element_blank()
      )
    
    p4 <- ggplot() +
      geom_stars(
        data = rongelap_stars, na.action = na.omit,
        aes(fill = ypred), show.legend = FALSE
      ) +
      geom_sf(
        data = rongelap_coastline_sfp,
        fill = NA, color = "gray30", linewidth = 0.75
      ) +
      scale_fill_viridis_c(option = "C", breaks = 0:12) +
      # 虚线框
      geom_sf(data = dash_sfp, fill = NA, linewidth = 0.75, lty = 2) +
      theme_void() +
      coord_sf(expand = FALSE, xlim = c(-6000, -5000), ylim = c(-3600, -2600))
    # 叠加图形
    p3
    print(p4, vp = grid::viewport(x = .3, y = .65, width = .45, height = .45))