贝叶斯建模 brms 包学习笔记。大部分内容修改自知乎文章,补充了在侵蚀泥沙指纹因子模型中的尝试案例,以及官方文档中自定义分布族(custom family)的案例。
# 加载包
library(dplyr)
library(brms)
library(ggplot2)
library(magrittr)
library(data.table)
名词解释:
结果分析:as_draws_df(fit_res) 返回后验样本数据框,其中以 b_ 开头的列是回归系数的后验样本,sigma 等为分布参数的后验样本,lprior 与 lp__ 分别是先验和(未归一化的)后验的对数密度,.chain、.iteration、.draw 为链与抽样编号等元信息列。
侵蚀泥沙来源的指纹因子识别问题
模型:
基于线性混合(mixing)模型的 brms 贝叶斯求解。线性混合模型为 \(X^\mathrm{T}P=T\),其中 \(P\) 为 m 行 1 列的向量,代表每个源对汇的贡献系数;\(X\) 为 m 行 n 列的指纹因子矩阵,m 为源类型个数,n 为指纹因子个数;\(T\) 为 n 行 1 列的汇指纹因子向量。并且有 \(0\leq P_s \leq 1\),\(\sum_{s=1}^{m}{P_s}=1\)。
注意,贡献比约束 \(\sum_{s=1}^{m}{P_s}=1\) 很难直接在回归中实现,所以此处将其作为一个额外方程带入模型:即求解时在设计矩阵(\(X^\mathrm{T}\))的第一行插入元素全为 1 的向量,对应的汇观测值也取 1。
本例数据为火烧迹地指纹因子数据。US、BS、CB 三个源在三个汇(M1、M2、M3)中的设计贡献比,按下文 mae 函数中 p1、p2、p3 的顺序分别为 1/3:1/3:1/3、0.4:0.4:0.2 和 0.6:0.2:0.2。
# 定义一个数据融合的方法,是 scus 中 mergeData 函数的简化版,只支持均值融合
mergeData_s <- function(data){
  validData <- data
  allTracer <- names(data)[-(1:3)]      # 第 4 列起为指纹因子列
  # 每个 class 的类型信息(第 3 列),并加上 id
  resfront <- validData[, lapply(.SD, unique), .SDcols = 3, by = "class"] %>% .[, id := 1:nrow(.)]
  # 各指纹因子按 class 取均值
  outData <- validData[, lapply(.SD, mean), .SDcols = -(1:3), by = "class"]
  res <- resfront[outData, on = "class"] %>% .[, .SD, .SDcols = c("id", "class", "type", allTracer)]
  return(res)
}
# 读取数据
data_merge <- fread("./data/data-scus-raw.csv") %>% mergeData_s()
# 10组指纹因子
finger_tsp <- c("xlf", "a_star")
finger_cm <- c("a_star", "xhf", "xlf", "cs137", "pb210", "L_star","Y", "X")
finger_cr80 <- c("a_star", "xhf", "xlf", "cs137")
finger_frn <- c("cs137", "pb210")
finger_mag <- c("xhf","xlf")
finger_col <- c("X", "Y", "Z", "L_star", "a_star", "b_star", "c_star", "h_star")
finger_set <- list(c(finger_tsp), c(finger_cm), c(finger_cr80),
c(finger_frn), c(finger_mag), c(finger_col),
c(finger_frn, finger_mag),
c(finger_frn, finger_col),
c(finger_mag, finger_col),
c(finger_frn, finger_mag, finger_col))
# 筛选源汇和指纹因子
mix <- 1 #选择使用哪个汇建模
finger <- finger_cm #选择使用哪组复合指纹
sou_name <- data_merge$class[1:3]
tar_name <- data_merge$class[4:6]
dat_sou <- data_merge[1:3, .SD, .SDcols=finger] %>% t() %>% data.table() %>% set_colnames(sou_name)
dat_tar <- data_merge[4:6, .SD, .SDcols=finger] %>% t() %>% data.table() %>% set_colnames(tar_name)
# 生成模型数据
dat_model <- matrix(1, nrow = 1, ncol = length(sou_name)+1) %>%
data.table() %>%
set_colnames(c(sou_name,tar_name[mix])) %>%
rbind(., cbind(dat_sou,dat_tar[,.SD,.SDcols = mix]))
# 回归公式字符型
formula_str <- paste0(tar_name[mix], " ~ 0 + ", paste0(sou_name, collapse = " + "))
# 建立贝叶斯模型
fit_finger <- brm(
formula = as.formula(formula_str), #回归公式,无截距项
data = dat_model, #数据
family = gaussian(), #正态分布假设(线性回归)
chains = 4, #MCMC 链数
iter = 2000, #每条链的迭代次数
warmup = 1000, #预热步数
cores = 2, #使用的 CPU 核心数
silent = TRUE,
control = list(adapt_delta = 0.95) # 增加 adapt_delta(结果提示发散时)
)
fit结果
Family: gaussian
Links: mu = identity; sigma = identity
Formula: M1 ~ 0 + BS + US + CB
Data: dat_model (Number of observations: 9)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
BS 0.26 0.01 0.24 0.29 1.01 707 899
US 0.49 0.05 0.40 0.57 1.01 592 684
CB 0.24 0.04 0.18 0.31 1.01 593 649
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 0.51 0.26 0.26 1.17 1.01 729 729
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
结果分析
注意:可以用 pairs(fit_finger) 检查各参数后验样本的两两联合分布,以诊断参数相关性与采样发散问题。
# 查看结果MAE
mae <- function(a, b, c, mix){
if(mix==1){
p1 <- p2 <- p3 <- 1/3
} else if(mix==2){
p1 <- 0.4;p2 <- 0.4;p3 <- 0.2
} else {
p1 <- 0.6;p2 <- 0.2;p3 <- 0.2
}
res <- mean(c(abs(p1-a), abs(p2-b), abs(p3-c)))*100
return(res)
}
# 对比:本组设置下 brms 的 MAE 略低于 scus
# mae(0.2551262,0.5068927,0.2379811, mix) #scus:11.57062%
# mae(0.26, 0.49,0.24,mix) #brms:10.77778%
# 提取后验样本
posterior <- as_draws_df(fit_finger)
dat_res <- posterior[,seq_along(sou_name)] %>% setDT() %>% melt()
# 后验结果画图
ggplot(dat_res, aes(x=value, fill=variable))+
geom_density(show.legend = F)+
facet_wrap("variable", ncol=2, scales="free")+
theme_bw()
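除密度图外,也可以直接基于后验样本汇总各源贡献系数的均值与 95% 可信区间。下面是一个基于上面 dat_res(variable/value 两列的长表)的简单示意:
# 各源贡献系数的后验均值与 95% 可信区间
dat_res[, .(mean  = mean(value),
            lower = quantile(value, 0.025),
            upper = quantile(value, 0.975)),
        by = variable]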
与上面的模型不同,这里显式约束了贡献比的性质(取值在 0 到 1 之间、且和为 1),并采用非线性模型(softmax 参数化)估计结果。
# 数据融合方法(与上文相同)与读取数据
mergeData_s <- function(data){
  validData <- data
  allTracer <- names(data)[-(1:3)]
  resfront <- validData[, lapply(.SD, unique), .SDcols = 3, by = "class"] %>% .[, id := 1:nrow(.)]
  outData <- validData[, lapply(.SD, mean), .SDcols = -(1:3), by = "class"]
  res <- resfront[outData, on = "class"] %>% .[, .SD, .SDcols = c("id", "class", "type", allTracer)]
  return(res)
}
data_merge <- fread("./data/data-scus-raw.csv") %>% mergeData_s()
# 10组指纹因子
finger_tsp <- c("xlf", "a_star")
finger_cm <- c("a_star", "xhf", "xlf", "cs137", "pb210", "L_star","Y", "X")
finger_cr80 <- c("a_star", "xhf", "xlf", "cs137")
finger_frn <- c("cs137", "pb210")
finger_mag <- c("xhf","xlf")
finger_col <- c("X", "Y", "Z", "L_star", "a_star", "b_star", "c_star", "h_star")
finger_set <- list(c(finger_tsp), c(finger_cm), c(finger_cr80),
c(finger_frn), c(finger_mag), c(finger_col),
c(finger_frn, finger_mag),
c(finger_frn, finger_col),
c(finger_mag, finger_col),
c(finger_frn, finger_mag, finger_col))
# 指纹因子
mix <- 2 #选择使用哪个汇建模
finger <- finger_cm #选择使用哪组复合指纹
# 筛选源汇
sou_name <- data_merge$class[1:3]
tar_name <- data_merge$class[4:6]
dat_sou <- data_merge[1:3, .SD, .SDcols=finger] %>% t() %>% data.table() %>% set_colnames(sou_name)
dat_tar <- data_merge[4:6, .SD, .SDcols=finger] %>% t() %>% data.table() %>% set_colnames(tar_name)
# 生成模型数据
dat_model <- cbind(dat_sou,dat_tar[,.SD,.SDcols = mix])
定义公式时,部分参数含义如下
nl = TRUE
表示非线性模型b1,b2,b3
表示需要求解的结果,是自定义的名字b ~ 1
表示b是常数项(截距项),与自变量无关。b ~ x1
表示b是变量 x1 的线性函数。b ~ x1 + x2
表示b是 x1 和 x2 的线性函数。b ~ (1 | group)
表示b是一个随机效应,随 group
变化。# 非线性公式:简化定义
formula <- bf(
M2 ~ (exp(b1) / (exp(b1) + exp(b2) + exp(b3))) * BS +
(exp(b2) / (exp(b1) + exp(b2) + exp(b3))) * US +
(exp(b3) / (exp(b1) + exp(b2) + exp(b3))) * CB,
b1 + b2 + b3 ~ 1,
nl = TRUE
)
关于先验分布选择的问题:
均匀分布:一种无信息先验,因为它对所有可能的值赋予相同的概率密度。
优点是,完全对称,没有任何偏好。
缺点是:区间范围是硬性限制;且在采样效率上,均匀分布可能不如其他分布(如正态分布)。
正态分布:是一种弱信息先验,表示你假设参数的值在 0 附近,但允许较大的灵活性。
优点是,没有硬性范围限制,适合参数的真实值可能超出某个区间的情况。并且由于连续光滑,计算更高效。
缺点是,不是完全无信息的,因为它假设参数的值更可能集中在0附近。若参数真实值远离0,则需要调整均值和标准差。
# 先验
priors <- c(
prior(normal(0, 1), nlpar = "b1"), # b1 的先验
prior(normal(0, 1), nlpar = "b2"), # b2 的先验
prior(normal(0, 1), nlpar = "b3") # b3 的先验
)
# 经尝试,均匀先验的结果经常不收敛;虽然理论上 b 取值 -10 到 10 已足以覆盖变换后几乎所有可能的贡献比组合
# priors <- c(
# prior(uniform(-10,10), nlpar = "b1"), # b1 的先验
# prior(uniform(-10,10), nlpar = "b2"), # b2 的先验
# prior(uniform(-10,10), nlpar = "b3") # b3 的先验
# )
建模求解
# 模型拟合
fit_finger_nolinear <- brm(
formula = formula, # 非线性公式
data = dat_model, # 数据
family = gaussian(), # 正态分布假设
prior = priors, # 设置先验
chains = 4, # MCMC 链数
iter = 2000, # 每条链的迭代次数
warmup = 1000, # 预热步数
cores = 2, # 使用的 CPU 核心数
control = list(adapt_delta = 0.95) # 调整控制参数
)
# 提取固定效应(非线性参数)的后验汇总
fixed_effects <- fixef(fit_finger_nolinear)
# 提取 beta1, beta2, beta3 的估计值
b1 <- fixed_effects[1,1];b1
b2 <- fixed_effects[2,1];b2
b3 <- fixed_effects[3,1];b3
# 求解对应贡献比
res_p1 <- exp(b1) / (exp(b1) + exp(b2) + exp(b3));res_p1
res_p2 <- exp(b2) / (exp(b1) + exp(b2) + exp(b3));res_p2
res_p3 <- exp(b3) / (exp(b1) + exp(b2) + exp(b3));res_p3
sum(res_p1, res_p2, res_p3)
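上面用 fixef 的后验均值再做 softmax 得到的只是点估计;若想得到贡献比本身的后验分布(均值与可信区间),可以对每条后验样本分别做 softmax 变换。下面是一个示意,其中非线性参数在后验样本中的列名(如 b_b1_Intercept)可先用 variables(fit_finger_nolinear) 确认,p_draws、den 为临时命名:
# 基于后验样本逐条计算贡献比,得到贡献比的后验分布(示意)
draws <- as_draws_df(fit_finger_nolinear)
den <- exp(draws$b_b1_Intercept) + exp(draws$b_b2_Intercept) + exp(draws$b_b3_Intercept)
p_draws <- data.table(p1 = exp(draws$b_b1_Intercept) / den,
                      p2 = exp(draws$b_b2_Intercept) / den,
                      p3 = exp(draws$b_b3_Intercept) / den)
p_draws[, lapply(.SD, mean)]                       # 贡献比的后验均值
p_draws[, lapply(.SD, quantile, c(0.025, 0.975))]  # 95% 可信区间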
# 查看MAE结果
mae <- function(a, b, c, mix){
if(mix==1){
p1 <- p2 <- p3 <- 1/3
} else if(mix==2){
p1 <- 0.4;p2 <- 0.4;p3 <- 0.2
} else {
p1 <- 0.6;p2 <- 0.2;p3 <- 0.2
}
res <- mean(c(abs(p1-a), abs(p2-b), abs(p3-c)))*100
return(res)
}
# 结果比同情况下的scus结果更好(多次实验发现,MAE互有胜负)
mae(0.32812531, 0.60558663, 0.06628806, mix) #scus:13.70578%
mae(res_p1, res_p2, res_p3, mix) #brms:12.98551%
fit结果(注意,当 Rhat 超过 1.05 时,说明链未收敛、结果不可靠,需要重新计算)
Family: gaussian
Links: mu = identity; sigma = identity
Formula: M2 ~ (exp(b1)/(exp(b1) + exp(b2) + exp(b3))) * US + (exp(b2)/(exp(b1) + exp(b2) + exp(b3))) * BS + (exp(b3)/(exp(b1) + exp(b2) + exp(b3))) * CB
b1 ~ 1
b2 ~ 1
b3 ~ 1
Data: dat_model (Number of observations: 8)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
b1_Intercept 0.93 0.62 -0.29 2.19 1.01 946 1297
b2_Intercept 0.36 0.59 -0.81 1.56 1.01 869 1210
b3_Intercept -1.20 0.71 -2.59 0.20 1.00 961 1095
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 0.66 0.27 0.35 1.28 1.00 1237 1337
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
案例:基于鸢尾花数据集iris,推测setosa类鸢尾花的 Sepal.Length
模型:\(Y = \beta_0+\epsilon\)。因为只有截距项,所以截距的估计值即为均值,随机误差 \(\epsilon\) 的标准差也就是 \(Y\) 的标准差。
生成模型:\(y_i \sim N(\mu,\sigma)\)
先验分布:\(\mu \sim N(5, 2)\), \(\sigma \sim exp(1)\)
这里将 \(\sigma\) 的先验定义为指数分布,有以下原因:非负性、简洁性、正则化(对较小的标准差值有更强的先验倾向,即倾向于认为误差不会特别大)。
可用的先验分布形式还有:
prior(normal(0, 1), class = "sigma", lb = 0)
prior(cauchy(0, 1), class = "sigma", lb = 0)
prior(gamma(2, 0.5), class = "sigma")
prior(student_t(3, 0, 1), class = "sigma", lb = 0)
prior(uniform(0, 10), class = "sigma")
prior(inverse_gamma(2, 0.5), class = "sigma")
在 brms 中,class 参数用于指定先验分布应用于模型的哪个部分或参数类别,不同的 class 值对应模型中特定类型的参数。这是 brms 中定义先验分布的重要部分,因为复杂模型可能有多种参数需要指定先验。常用的 class 取值如下:
class = "Intercept":截距项;
class = "b":回归系数(固定效应);
class = "sigma":残差标准差,如 prior(exponential(1), class = "sigma");
class = "sd":组水平(随机)效应的标准差,如 prior(cauchy(0, 2), class = "sd");
class = "cor":组水平效应的相关矩阵,如 prior(lkj(2), class = "cor");
class = "shape":形状参数(如负二项、Gamma 分布),如 prior(normal(2, 1), class = "shape");
class = "nu":自由度参数(如 Student-t 分布),如 prior(gamma(2, 0.1), class = "nu");
class = "threshold":序次模型的阈值(切点),如 prior(normal(0, 10), class = "threshold")。
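在为具体模型设定先验前,可以用 get_prior() 查看该模型公式下有哪些 class 可以设置先验,以及各自的默认先验。下面以本文紧接着用到的 iris 截距模型为例给出一个最小示意:
# 查看某模型可设置先验的参数类别(class)及默认先验(示意)
get_prior(Sepal.Length ~ 1, data = iris %>% filter(Species == "setosa"))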
# 准备数据
data <- iris %>% filter(Species == 'setosa')
# 设定先验分布
myprior <- c(prior(normal(5, 2), class = 'Intercept'),
prior(exponential(1), class = 'sigma'))
# 建模预测(这里不运行这个模型了,太慢而且文字信息太多)
# fit_msd <- brm(Sepal.Length ~ 1, prior = myprior, data = data)
# 查看后验分布和 MCMC 过程
# 提取后验样本
# posterior <- as_draws_df(fit_msd)
# 查看截距项的后验分布
# hist(posterior$b_Intercept, main = "Posterior Distribution of Intercept", xlab = "Intercept")
# 或者直接画图
# plot(fit_msd)
fit结果如下:
Family: gaussian
Links: mu = identity; sigma = identity
Formula: Sepal.Length ~ 1
Data: data (Number of observations: 50)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 5.01 0.05 4.91 5.10 1.00 3070 2688
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 0.36 0.04 0.30 0.44 1.00 3296 2605
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
结果表明:均值估计为 5.01,可信区间为 4.91-5.10;标准差估计为 0.36,可信区间为 0.30-0.44。其中需要注意的是,这里给出的是贝叶斯 95% 可信区间(credible interval)。
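由于模型只有截距项,可以用样本均值和标准差做一个简单的对照检查(示意,data 即上面筛选出的 setosa 数据):
# 与经典估计对比:截距的后验均值应接近样本均值,sigma 应接近样本标准差
mean(data$Sepal.Length)  # 约 5.006
sd(data$Sepal.Length)    # 约 0.352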
案例:假设有A,B两款药;100个服用A药的人中有30个康复了,50个服用B药的人有20个康复。我们想估算A和B两类药的有效率,以及两类药的有效率的差值。
模型:
生成模型:\(y_{A} \sim Binomial(100, p_A)\), \(y_{B} \sim Binomial(50, p_B)\), \(p_A=\alpha\), \(p_B=\alpha + \beta\)
先验分布:这里使用了程序默认的先验分布。可以通过fit_pro$prior查看
注意:
# 准备数据
data <- tibble(size = c(100, 50),
success = c(30, 20),
type = factor(c("A", "B")))
# 建模预测(这里不运行这个模型了,太慢而且文字信息太多)
# fit_pro <- brm(success|trials(size) ~ type,
# family = binomial(link = 'identity'),
# data = data)
# fit_pro
fit结果如下
Family: binomial
Links: mu = identity
Formula: success | trials(size) ~ type
Data: data (Number of observations: 2)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 0.30 0.05 0.22 0.40 1.00 4485 3168
typeB 0.10 0.08 -0.06 0.26 1.00 2083 2437
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
结果表明:A类药品的有效率为0.3,可信区间为0.22到0.40;B类药品比A类药品有效率高0.1,可信区间为-0.06到0.26.
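原案例还希望估计两类药有效率的差值。由于采用 identity 连接,typeB 系数正是 \(p_B-p_A\),因此可以直接基于其后验样本计算差值的可信区间以及 B 优于 A 的后验概率。下面是一个示意(假设 fit_pro 已拟合,故与上面一样保持注释状态):
# 估算两药有效率差值及 B 优于 A 的后验概率(示意)
# post <- as_draws_df(fit_pro)
# quantile(post$b_typeB, c(0.025, 0.5, 0.975))  # 差值的后验分位数
# mean(post$b_typeB > 0)                        # P(p_B > p_A | 数据)
# hypothesis(fit_pro, "typeB > 0")              # 也可用 brms 的 hypothesis() 接口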
通过 prior_summary(fit_pro) 查看先验分布,结果如下。其中,flat 表示非信息性先验,即不包含任何有用的先验信息;student_t(3, 25, 7.4) 表示截距参数的先验是一个 Student-t 分布,自由度为 3,位置参数为 25,尺度参数为 7.4。class 列中的 b 表示回归系数,Intercept 表示截距项。source 列中,default 表示该先验是 brms 的默认设置,vectorized 表示该先验是向量化地应用到多个回归系数上的。
prior class coef group resp dpar nlpar lb ub source
(flat) b default
(flat) b typeB (vectorized)
student_t(3, 25, 7.4) Intercept default
案例:用一元线性模型对冰淇凌销量\(y_i\)和气温\(x_i\)之间的关系进行建模。其中气温是自变量,销量是因变量。共有三个待估参数,分别是气温的系数\(\beta\),截距\(\alpha\),以及条件分布的标准差\(\sigma\)(假设为正态分布)。
模型:
生成模型:\(y_i \sim N(\mu_i,\sigma)\), \(\mu_i=\alpha+\beta x_i\),即线性回归模型
先验分布:默认分布
data <- data.frame(
temp = c( 11.9, 14.2, 15.2, 16.4, 17.2, 18.1,
18.5, 19.4, 22.1, 22.6, 23.4, 25.1),
units = c( 185L, 215L, 332L, 325L, 408L, 421L,
406L, 412L, 522L, 445L, 544L, 614L)
)
# fit_linreg <- brm(units ~ temp, data = data)
# fit_linreg
fit结果如下
Family: gaussian
Links: mu = identity; sigma = identity
Formula: units ~ temp
Data: data (Number of observations: 12)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -157.95 63.85 -282.11 -27.44 1.00 2909 2253
temp 30.01 3.36 23.08 36.70 1.00 2956 2260
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 43.68 11.49 27.32 71.30 1.00 2077 2158
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
注意,此处的 Family 为高斯分布,link 为 identity,即线性预测项不经变换直接作为均值。
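拟合完成后,也可以对新的气温值做预测:posterior_epred 给出均值的后验分布,posterior_predict 则额外包含观测误差。下面是一个示意(假设 fit_linreg 已拟合,newdat 为示例变量名,故保持注释状态):
# 对新气温下的销量进行预测(示意)
# newdat <- data.frame(temp = 26)
# posterior_epred(fit_linreg, newdata = newdat) %>% quantile(c(0.025, 0.5, 0.975))   # 均值的后验区间
# posterior_predict(fit_linreg, newdata = newdat) %>% quantile(c(0.025, 0.5, 0.975)) # 含观测误差的预测区间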
案例:在上面冰激凌的例子中,考虑到因变量是计数型的,想把分布族从正态改为泊松。和 R 自带的 glm 函数一样,把模型改写成以下形式就可以:
模型:
生成模型:\(y_i \sim Poisson(\mu_i)\), \(log(\mu_i)=\alpha+\beta x_i\),即泊松回归模型
先验分布:默认分布
注意:R 中计算阶乘用 factorial(3),计算组合数用 choose(5, 2),计算排列数用 factorial(5)/factorial(5-3)。
data <- data.frame(
temp = c( 11.9, 14.2, 15.2, 16.4, 17.2, 18.1,
18.5, 19.4, 22.1, 22.6, 23.4, 25.1),
units = c( 185L, 215L, 332L, 325L, 408L, 421L,
406L, 412L, 522L, 445L, 544L, 614L)
)
fit_pos_simp <- brm(units ~ temp,
data = data,
family = poisson(link = 'log'))
fit结果如下
Family: poisson
Links: mu = log
Formula: units ~ temp
Data: data (Number of observations: 12)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 4.54 0.08 4.39 4.69 1.00 2552 2895
temp 0.08 0.00 0.07 0.08 1.00 3061 2742
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
注意,原模型拟合的结果,实际上是 \(\lambda\) 与 x 的关系,即 y 的均值与 x 的关系。例如气温为 20 度时,销量的期望约为 \(exp(4.54+0.08\times 20)\approx 464\)。
从回归结果中可以看出模型对数据的拟合情况:
# 提取 conditional_effects 数据
effects <- conditional_effects(fit_pos_simp) # 注意若不赋值,这个函数本身就画图,但是不画数据点
effect_data <- as.data.frame(effects$temp) # 提取与 temp 相关的效果
# 画图
ggplot() +
geom_point(data = data, aes(x = temp, y = units), color = "blue", size = 2) +
geom_line(data = effect_data, aes(x = temp, y = estimate__), color = "red", linewidth = 1) +
geom_ribbon(data = effect_data, aes(x = temp, ymin = lower__, ymax = upper__),
fill = "red", alpha = 0.2)
注意,虽然 fit_pos_simp 的结果可以转化为 geom_function(fun=\(x) exp(4.54+0.08*x)),但是下面的图与上面的并不相同,原因可能是 conditional_effects 使用了后验分布来计算均值,而 geom_function 使用的是点估计值。
ggplot(data, aes(temp, units))+
geom_point()+
# geom_function(fun= ~ exp(4.54+0.08*x))
# geom_function(fun=function(x) exp(4.54+0.08*x))
geom_function(fun=\(x) exp(4.54+0.08*x))
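与上面的讨论对应,如果希望曲线反映完整后验而不是点估计,可以对一组新温度值计算 posterior_epred,再对每个温度取后验均值。下面是一个示意(newdat、epred 为示例变量名):
# 基于后验期望绘制均值曲线,而不是用点估计代入(示意)
newdat <- data.frame(temp = seq(11, 26, by = 0.5))
epred <- posterior_epred(fit_pos_simp, newdata = newdat)  # 行为后验样本,列为新观测
newdat$mu <- colMeans(epred)                              # 每个温度下均值 lambda 的后验均值
ggplot(data, aes(temp, units)) +
  geom_point() +
  geom_line(data = newdat, aes(temp, mu), color = "red")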
此外,贝叶斯建模后,可基于所得模型(或者说后验分布)对原样本中的冰淇凌销量进行再预测。将预测结果和实际情况比较,从而对模型的拟合情况进行评估。下图中的曲线为概率密度曲线。粗实线是样本中冰淇凌的实际销量分布情况,细实线是基于所得泊松模型多次抽样模拟的结果。
# 模型预测结果的后验检验图
pp_check(fit_pos_simp)
案例:假设景区有一个钓鱼的项目,游客团体们组队去钓鱼。游客团体有两类,一类是专心想钓鱼的,另一类是纯粹不想钓鱼的(例如只是去鱼塘欣赏风景)。前者钓到鱼的数量服从泊松分布,后者钓到鱼的数量恒等于零,导致统计到的钓鱼数量分布相较泊松分布有了更多的零,故名"零膨胀"。
这里使用零膨胀泊松(Zero-Inflated Poisson, ZIP)拟合数据
这个数据集来自加州大学洛杉矶分校(UCLA)数字研究与教育研究所(https://stats.oarc.ucla.edu/r/dae/zip/),包含到州立公园钓鱼旅行的信息。数据已经保存到本地,共有 8 列,本例主要用到 count(捕鱼数量)、persons(团队人数)、child(孩子数量)和 camper(是否携带露营车)这几列。
注意这组数据中,nofish 列没有使用:无论其取值为 0 还是 1,对应的 count 都存在非 0 数据,两者是矛盾的。
模型:
生成模型:\(y_i\sim ZIPoisson(p_i, \lambda_i)\), \(p_i=\alpha_p\)(常数), \(log(\lambda_i)=\alpha_\lambda+x_i^\prime \beta_\lambda\),其中 \(p_i\) 是必然捕不到鱼的概率(zi)
先验分布:默认分布
注意:分布族由 poisson 变为了 zero_inflated_poisson。
# zinb <- read.csv("http://stats.idre.ucla.edu/stat/data/fish.csv")
zinb <- read.csv("./data/zinb.csv")
dat_zero <- zinb %>% mutate(camper = factor(camper, labels = c("no", "yes")))
fit_pos_zero <- brm(count ~ persons + child + camper,
data = dat_zero,
family = zero_inflated_poisson("log"))
fit结果如下
Family: zero_inflated_poisson
Links: mu = log; zi = identity
Formula: count ~ persons + child + camper
Data: dat_zero (Number of observations: 250)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -1.02 0.17 -1.36 -0.68 1.00 2869 2729
persons 0.88 0.04 0.79 0.96 1.00 3016 2550
child -1.37 0.09 -1.55 -1.18 1.00 2567 2593
camperyes 0.80 0.10 0.61 0.99 1.00 3030 2203
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
zi 0.41 0.04 0.32 0.49 1.00 2775 2418
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
结果中:
zi是ZIP模型中的零膨胀参数,表示数据中额外零值出现的概率。ZIP模型包括确定性0(零膨胀部分)以及泊松分布(普通泊松部分),用公式表示为\(P(Y=0)=zi+(1-zi)P_{poisson}(Y=0)\)。zi是必然情况,在本例中为41%,也就是说41%必然为0,剩余的59%才服从泊松分布,这部分仍有可能为0。
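按上面的公式可以做一个简单的数值演算(取结果中的 zi = 0.41;lambda = 2 只是假设的示例值):
# ZIP 模型中 P(Y = 0) 的数值示例
zi <- 0.41; lambda <- 2
zi + (1 - zi) * dpois(0, lambda)  # P(Y = 0) ≈ 0.49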
其余变量的边际效应,可用如下代码绘制出来;结果表明团队人数越多、孩子越少、携带露营车(camper = yes)的团队,捕到的鱼越多:
ME <- conditional_effects(fit_pos_zero)
#下面主要是为了把每个变量的边际效应图都提取出来,合并后方便展示
p1 <- plot(ME, plot = FALSE)[[1]] + theme_classic()
p2 <- plot(ME, plot = FALSE)[[2]] + theme_classic()
p3 <- plot(ME, plot = FALSE)[[3]] + theme_classic()
library(cowplot)
plot_grid(p1, p2, p3)
案例:对于上文的案例,我们可以假设,带孩子的团更可能根本不去钓鱼。检验该假设需要查看孩子数量 child 对"必然不钓鱼"概率 zi 有没有解释力。我们可以将该设定和上面的零膨胀模型一起估计,brm 中只需要用 bf 将两个公式合并在一起就可以。
模型:
生成模型:\(y_i\sim ZIPoisson(p_i, \lambda_i)\), \(logit(p_i)=\alpha_p+\beta_p child\), \(log(\lambda_i)=\alpha_\lambda+x_i^\prime \beta_\lambda\),其中 \(p_i\) 是必然捕不到鱼的概率(zi)
先验分布:默认分布
注意,与上面模型的不同之处在于,多了一个zi ~ child
# fit_pos_zero2 <- brm(bf(count ~ persons + child + camper, zi ~ child),
# data = zinb,
# family = zero_inflated_poisson())
#
# fit_pos_zero2
fit结果:
Family: zero_inflated_poisson
Links: mu = log; zi = logit
Formula: count ~ persons + child + camper
zi ~ child
Data: zinb (Number of observations: 250)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -1.08 0.19 -1.45 -0.73 1.00 2739 2803
zi_Intercept -0.96 0.26 -1.49 -0.48 1.00 3584 2871
persons 0.89 0.05 0.80 0.99 1.00 2686 2415
child -1.18 0.10 -1.37 -0.99 1.00 2590 2394
camper 0.78 0.09 0.59 0.97 1.00 3700 2637
zi_child 1.21 0.28 0.66 1.78 1.00 3613 2431
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
结果表明:Links: mu = log; zi = logit,说明一旦对 zi 建模,brm 默认给 zi 一个 logit 连接,因此对 zi_child 的解释也应放在 logit 尺度上:团队每多一个 child,zi 的几率(odds)变为原来的 \(exp(1.21)\approx 3.35\) 倍,而不是概率本身乘以 3.35。例如,没有孩子的团队 \(zi = logit^{-1}(-0.96)\approx 0.28\),有一个孩子的团队 \(zi = logit^{-1}(-0.96+1.21)\approx 0.56\)。
上面的泊松分布,实际上是广义线性模型中的一种。广义线性模型可表示为如下形式:
首先,个体\(i\)的因变量\(y_i\)来自某一个分布族family:\(y_i\sim family(\theta_{1i}, \theta_{2i}, \theta_{3i}...)\)
其中,\(\theta_i\)是该分布族的参数。分布族不同,参数的数量也不同:例如正态分布有两个参数(均值与方差);泊松分布只有一个参数(它的均值就是方差);零膨胀泊松分布有两个参数,一个是因变量恒为 0 的概率,一个是泊松部分的均值。
对于任意模型,都有线性组合 \(\eta=X\beta\)。为了将该组合与分布族的参数联系起来,需要连接函数 link:\(link(\theta_i)=\eta_i\)。
对于正态分布的均值来说,连接函数就是恒等函数,因此线性回归模型中 family 是 gaussian,mu 和 sigma 的 link 都是 identity;对于泊松分布来说,连接函数是 \(log(\cdot)\);对于零膨胀泊松模型来说,由于有两个参数,所以有两个连接函数:均值 \(\mu\) 的是 \(log(\cdot)\),zi 被建模时的连接函数是 logit(见上例)。
因此,只需要设定好 family 和 link 参数,就可以拟合各类广义线性模型了。brms 提供了大多数常用分布族;当然,我们也可以手动添加自定义分布族。
常用的模型举例如下,其他各类分布族与连接函数,可以通过??brmsfamily查看。
# # 二元logistic回归
# brm(y ~ x, data = data, family = binomial(link = 'logit'))
#
# # 多分类logistic回归
# brm(y ~ x, data = data, family = multinomial(link = 'logit'))
#
# # 序次logistic回归
# brm(y ~ x, data = data, family = cumulative(link = 'logit'))
Pooling model(池化模型)是一种在统计学、机器学习和计量经济学中常用的方法,它假设不同的个体或组的数据可以合并(pooled)起来以进行分析。通过池化,模型将多个数据来源的特征看作来自相同分布,从而简化建模和参数估计。
案例:这里使用的是明尼苏达州居民住宅氡含量的数据。因变量为住宅的氡含量radon,自变量为房屋楼层floor(0 = basement, 1 = first floor)。在数据结构上,作为观测对象的住房嵌套于其所在郡county。我们希望通过建模探究房屋楼层floor对氡含量radon的影响。注意这里只筛选了state为MN的样本。
模型:
基础线性模型为:\(radon=\alpha+\beta_1floor+\epsilon\)
# 原始数据连接:https://github.com/fonnesbeck/multilevel_modeling/blob/master/data/srrs2.dat
srrs2 <- read.table("./data/srrs2.dat", header = T, sep = ',') %>%
as_tibble() %>%
filter(state == 'MN') %>%
mutate(radon = ifelse(activity == 0, .1, activity), county = factor(county))
# 频率法建模
# fit_pool_freq <- lm(radon ~ floor, data = srrs2)
# 贝叶斯建模
# fit_pool_baye <- brm(radon ~ floor, data = srrs2)
fit结果对比,差异不大
频率模型
Call:
lm(formula = radon ~ floor, data = srrs2)
Coefficients:
(Intercept) floor
5.066 -1.785
贝叶斯模型
Family: gaussian
Links: mu = identity; sigma = identity
Formula: radon ~ floor
Data: srrs2 (Number of observations: 919)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 5.06 0.16 4.75 5.38 1.00 5085 3081
floor -1.79 0.40 -2.58 -1.01 1.00 4125 3157
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 4.44 0.11 4.23 4.65 1.00 4670 3218
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
模型:
生成模型:\(radon=\alpha+\beta_1 floor+\beta_2 county_{ANOKA}+\beta_3 county_{BECKER}+...+\beta_{85} county_{YELLOW\ MEDICINE}+\epsilon\)
该模型假设每个郡都有一个独立的截距,或者说自带一个固定效应。其缺陷在于,当某郡的住房样本量比较小时,对该郡固定效应的估计就很容易出现极端值,或者说估计不准确(尽管在保证参数无偏性上,固定效应比随机效应的设定更具优势)。
注意,数据中 county 为字符型,已在预处理中转换为因子(factor),建模时会被自动展开为各郡的虚拟变量。
# 频率法建模
# fit_pool_freq_fix <- lm(radon ~ floor+county, data = srrs2)
# 贝叶斯建模
# fit_pool_baye_fix <- brm(radon ~ floor+county, data = srrs2)
fit结果对比,差异不大
频率模型
Call:
lm(formula = radon ~ floor + county, data = srrs2)
Coefficients:
(Intercept) floor countyANOKA
2.7114 -2.5455 0.3970
countyBECKER countyBELTRAMI countyBENTON
2.3190 2.7718 1.7000
countyBIG STONE countyBLUE EARTH countyBROWN
2.2220 5.3023 4.3864
(略)
贝叶斯模型
Family: gaussian
Links: mu = identity; sigma = identity
Formula: radon ~ floor + county
Data: srrs2 (Number of observations: 919)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 2.71 2.12 -1.63 6.63 1.10 34 166
floor -2.55 0.42 -3.37 -1.71 1.00 3828 2882
countyANOKA 0.38 2.19 -3.75 4.75 1.10 35 181
countyBECKER 2.28 3.19 -4.01 8.46 1.04 71 567
countyBELTRAMI 2.79 2.66 -2.22 8.16 1.06 51 350
countyBENTON 1.69 2.97 -4.18 7.50 1.05 53 433
countyBIGSTONE 2.20 3.19 -4.03 8.38 1.04 71 591
countyBLUEEARTH 5.28 2.40 0.86 10.04 1.08 42 223
countyBROWN 4.42 2.95 -1.31 10.19 1.05 58 424
(略)
模型设定
\(radon_i\sim N(\alpha_{j[i]}+\beta_{1j[i]}floor_i,\ \sigma^2)\), \[\begin{pmatrix}\alpha_j\\ \beta_{1j}\end{pmatrix}\sim N\left(\begin{pmatrix}\mu_{\alpha}\\ \mu_{\beta_{1}}\end{pmatrix},\ \begin{pmatrix} \sigma^2_{\alpha} & \rho\sigma_{\alpha}\sigma_{\beta_{1}} \\ \rho\sigma_{\alpha}\sigma_{\beta_{1}} & \sigma^2_{\beta_{1}} \end{pmatrix}\right)\], for county \(j = 1,2,3,...,J\)
顾名思义,就是对截距和系数项设置了随机效应,假设不同郡的系数和截距都不一样。该设定和固定效应的区别是,我们并不对每个郡的截距与斜率进行单独估计,而是充分利用了数据总体所包含的信息。简单来说,就是每个县都有自己的均值,但是一个县获取的信息可以帮助我们估计其他县的值。
注意,设置随机效应的方法为 (1 + floor|county),代表对截距项以及 floor 项在 county 层面设置了随机效应。
# 频率法建模
library(lme4)
fit_pool_freq_rad <- lmer(radon ~ floor + (1 + floor|county), data = srrs2)
# 贝叶斯模型
# fit_pool_baye_rad <- brm(formula = radon ~ floor + (1+floor|county), data = srrs2)
# coef(fit_pool_baye_rad) #可以查看对不同郡截距与系数的估计。
# as_draws_df(fit_pool_baye_rad) #可以直接提取后验分布样本。
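下面给出一个查看部分合并(partial pooling)效果的示意:随机效应模型对样本量小的郡的估计会向总体均值收缩,可与固定效应模型中对应郡的系数做对比(假设上面的 fit_pool_baye_rad、fit_pool_baye_fix 均已拟合,county_coef 为示例变量名,故保持注释状态):
# 随机效应模型中各郡截距与斜率的(收缩后)估计
# county_coef <- coef(fit_pool_baye_rad)$county
# head(county_coef[, , "Intercept"])   # 各郡截距的估计与可信区间
# head(county_coef[, , "floor"])       # 各郡 floor 系数的估计与可信区间
# 固定效应模型(fit_pool_baye_fix)中对应郡的虚拟变量系数更容易出现极端值,可据此对比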
fit结果对比
频率模型
Linear mixed model fit by REML ['lmerMod']
Formula: radon ~ floor + (1 + floor | county)
Data: srrs2
REML criterion at convergence: 5273.721
Random effects:
Groups Name Std.Dev. Corr
county (Intercept) 2.246
floor 1.910 -0.98
Residual 4.076
Number of obs: 919, groups: county, 85
Fixed Effects:
(Intercept) floor
5.774 -2.473
贝叶斯模型
Family: gaussian
Links: mu = identity; sigma = identity
Formula: radon ~ floor + (1 + floor | county)
Data: srrs2 (Number of observations: 919)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Multilevel Hyperparameters:
~county (Number of levels: 85)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 2.17 0.31 1.59 2.81 1.00 1242 1811
sd(floor) 1.92 0.54 0.87 2.98 1.00 1382 1441
cor(Intercept,floor) -0.84 0.14 -0.99 -0.50 1.00 2038 2068
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 5.74 0.32 5.12 6.37 1.00 1291 2461
floor -2.37 0.47 -3.30 -1.43 1.00 2716 3104
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 4.08 0.10 3.88 4.28 1.00 5690 2613
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
brms 包模型的主要结构包括:
family:\(y_i\sim family(\theta_{1i}, \theta_{2i}, \theta_{3i}...)\);
link:\(link(\theta_i)=\eta_i\)。
其中,自变量与因变量的关系可以自定义。例如下面生成的数据中,有以下关系:\(y_i \sim N(b_1 e^{b_2 x_i},\ \sigma)\),其中 \(b_1=2\),\(b_2=0.75\)。
注意:
bf 表示贝叶斯模型公式,支持指定固定效应、随机效应以及非线性关系;
nl = TRUE 表示模型是非线性的,\(\eta\) 的形式由用户自己设定;
b1 ~ 1, b2 ~ 1 告诉程序,b1 和 b2 都是待估参数,且不需要用其他变量对这两个参数进行估计(即只有截距 ~ 1);
prior 函数将 b1、b2 的先验分布定义为正态分布。
# 生成数据
b <- c(2, 0.75)
x <- rnorm(100)
y <- rnorm(100, mean = b[1] * exp(b[2] * x))
dat_custom <- data.frame(x, y)
# 设定先验分布
prior1 <- prior(normal(1, 2), nlpar = "b1") + prior(normal(0, 2), nlpar = "b2")
# 拟合
# fit_custom <- brm(bf(y ~ b1 * exp(b2 * x),
# nl = TRUE,
# b1 ~ 1,
# b2 ~ 1),
# data = dat_custom,
# prior = prior1)
fit结果:
Family: gaussian
Links: mu = identity; sigma = identity
Formula: y ~ b1 * exp(b2 * x)
b1 ~ 1
b2 ~ 1
Data: dat_custom (Number of observations: 100)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
b1_Intercept 1.90 0.10 1.69 2.10 1.00 1727 2057
b2_Intercept 0.79 0.03 0.74 0.85 1.00 1726 2194
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 1.00 0.07 0.87 1.16 1.00 2213 2035
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
结果表明,b1的估计值为1.90,b2的估计值为0.79
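估计值与生成数据时的真实参数 b = c(2, 0.75) 很接近。还可以把拟合曲线与模拟数据点画在一起做个直观检查,示意如下(假设 fit_custom 已拟合,故保持注释状态):
# 将拟合曲线与模拟数据点对比(示意)
# plot(conditional_effects(fit_custom), points = TRUE)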
案例数据来自 lme4 包的 cbpp 数据集,描述了非洲牛 CBPP 病(牛传染性胸膜肺炎)的传播。其中有四个变量:herd(牛群编号)、incidence(新发病例数)、size(牛群规模)以及 period(观测时期)。
基准模型:
普通二项式模型:对于事件数 \(y\)(本例中为 incidence)和总试验次数 \(T\)(即 size),二项分布的概率质量函数为 \(P(y|T,p)= \binom{T}{y}p^y(1-p)^{T-y}\),其中 \(p\) 是成功概率。经典的二项式模型在 logit 尺度上预测 \(p_i\):对每个观测值 \(i\),成功的概率为 \(p_i=\frac{exp(\eta_i)}{1+exp(\eta_i)}\),其中 \(\eta_i\) 是观测值 \(i\) 的线性预测项。
在 brms 包中,上述过程可表示为如下公式。其中,公式左侧的 incidence | trials(size) 通过 | 传入试验次数,可以看作是 incidence/size,即成功率。
# fit_offical1 <- brm(incidence | trials(size) ~ period + (1|herd),
# data = cbpp,
# family = binomial())
fit结果为
Family: binomial
Links: mu = logit
Formula: incidence | trials(size) ~ period + (1 | herd)
Data: cbpp (Number of observations: 56)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Multilevel Hyperparameters:
~herd (Number of levels: 15)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 0.76 0.23 0.40 1.29 1.00 1347 1893
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -1.40 0.26 -1.95 -0.92 1.00 1852 2195
period2 -1.01 0.31 -1.64 -0.39 1.00 4916 3139
period3 -1.15 0.33 -1.83 -0.53 1.00 4634 3196
period4 -1.61 0.44 -2.56 -0.79 1.00 4632 3188
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
结果表明:
截距对应 period1,-1.40 的意义是 \(\eta=-1.40\),带入概率的 link 式 \(p_i=\frac{exp(\eta_i)}{1+exp(\eta_i)}\),求得 period1 的发病概率约为 0.20;类似地,period2 时 \(\eta=-1.40-1.01=-2.41\),带入公式得 p2≈0.08;同理 p3≈0.07(\(\eta=-1.40-1.15\)),p4≈0.05(\(\eta=-1.40-1.61\))。期望的发病数 incidence 即为 \(size_i\times p_i\)。
上面模型的一个缺点是:在二项分布假设下,方差被固定为 \(Var(y_i)=T_ip_i(1-p_i)\),所有超过这个值的方差都不能被模型解释,这被称作过度散布(overdispersion)。可以通过自定义分布族来解决。
模型:
Beta-binomial 模型是广义的二项分布,有额外的参数处理过度散布。在 Beta 二项分布中,不再直接估计概率 \(p_i\),而是定义两个 Beta 分布的超参数 \(\alpha>0\)、\(\beta>0\),则 \(p_i\sim Beta(\alpha_i, \beta_i)\)。
由于 \(\alpha_i\) 和 \(\beta_i\) 两个参数的含义较难解释,因此写成另一种形式:令 \(\mu_i\in[0,1]\),\(\phi_i>0\),且 \(\alpha_i=\mu_i\phi_i\),\(\beta_i=(1-\mu_i)\phi_i\),则原 Beta 分布可写为 \(p_i\sim Beta(\mu_i\phi_i,\ (1-\mu_i)\phi_i)\)。
这样,参数 \(\mu\) 和 \(\phi\) 分别对应均值和精度参数,并且 \(\mu_i=\frac{exp(\eta_i)}{1+exp(\eta_i)}\)。于是仍然像原始二项模型一样,通过变换后的线性预测项来预测期望概率,同时可以通过参数 \(\phi\) 解释过度散布问题。
然后使用 custom_family 定义分布族:
beta_binomial2 <- custom_family(
"beta_binomial2", #自定义分布族名字
dpars = c("mu", "phi"), #分布族需要哪些参数?
links = c("logit", "log"), #分布族参数的连接函数
lb = c(0, 0), #参数的下限
ub = c(1, NA), #参数上限
type = "int", #分布的类型:连续还是离散?
vars = "vint1[n]" #其他不需要估计的参数(命名为vint)——在本例中,即独立重复实验的次数T
)
其次,由于 brms 是对 RStan 的封装,所以自定义分布族后,需要以 Stan 代码给出:(1)该分布密度的对数(lpmf),以及(2)生成随机样本的函数(rng)。对 beta_binomial2 分布来说这很直接,因为 Stan 中已经实现了普通的 beta_binomial 分布。模型拟合只需要 beta_binomial2_lpmf,但 beta_binomial2_rng 在后处理(post-processing)时会很方便。
stan_funs <- "
real beta_binomial2_lpmf(int y, real mu, real phi, int T) {
return beta_binomial_lpmf(y | T, mu * phi, (1 - mu) * phi);
}
int beta_binomial2_rng(real mu, real phi, int T) {
return beta_binomial_rng(T, mu * phi, (1 - mu) * phi);
}
"
下面模型公式中的 vint(size) 用于提供试验次数的信息。为了传入试验次数(一个整数变量),需要使用加法参数 vint(),该参数只能在自定义分布族中使用;类似地,如果需要传入包含实数的额外向量,则使用 vreal()。实际上,对于这个特殊的例子,本可以像基本二项模型那样更优雅地使用加法参数 trials() 而不是 vint();但由于官方 vignette 旨在给出总体概述,这里采用更一般的方法。
# 传递给Stan的用户定义变量
stanvars <- stanvar(scode = stan_funs, block = "functions")
最后正常拟合就可以了:
fit_offical2 <- brm(
incidence | vint(size) ~ period + (1|herd), data = cbpp,
family = beta_binomial2, stanvars = stanvars
)
fit结果:
Family: beta_binomial2
Links: mu = logit; phi = identity
Formula: incidence | vint(size) ~ period + (1 | herd)
Data: cbpp (Number of observations: 56)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Multilevel Hyperparameters:
~herd (Number of levels: 15)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 0.38 0.26 0.02 0.97 1.01 991 1798
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -1.35 0.26 -1.87 -0.85 1.00 3541 2346
period2 -0.99 0.41 -1.83 -0.22 1.00 3164 2992
period3 -1.25 0.45 -2.18 -0.42 1.00 3296 2719
period4 -1.53 0.52 -2.63 -0.59 1.00 3403 2886
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
phi 17.63 17.69 5.55 57.66 1.00 1400 1186
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
结果表明,period 系数的不确定性比基准模型要大一些,这是模型中加入了过度散布参数 \(\phi\) 的结果。除此之外,两个模型的结果看起来非常相似。
有三个有用、但需要用户额外定义的后处理函数:log_lik、posterior_predict 和 posterior_epred。这些方法不仅本身有用,还为其他后处理方法提供基础,例如通过 loo 方法实现的近似留一交叉验证(LOO-CV)就依赖于 log_lik 函数。
为了能够进行 log_lik 计算,用户需要为自定义分布族定义一个 log_lik_<family-name> 函数。本例中,对于 Beta-Binomial 模型,函数应命名为 log_lik_beta_binomial2,它需要两个输入参数:i(观测的索引)和 prep(数据存储在 prep$data 中,模型参数存储在 prep$dpars 中)。通常,被预测的参数(如 mu)是一个 S×N 矩阵(其中 S 是后验样本数量,N 是观测数量),而未被预测的参数(如 \(\phi\))是一个长度为 S 的向量。
有两种方式获得可在 R 中调用的对数似然(及随机抽样)函数:一种是完全在 R 中自己定义,优点是稳定、不依赖编译;另一种是通过 expose_functions 暴露 Stan 中已定义的函数,使其可以直接在 R 中使用,优点是方便。
# 使用Stan暴露函数的方法,后期比如pp_check(fit_offical2)中会用到暴露的函数
expose_functions(fit_offical2, vectorize = TRUE)
# 使用自定义方法
log_lik_beta_binomial2 <- function(i, prep) {
mu <- brms::get_dpar(prep, "mu", i = i)
phi <- brms::get_dpar(prep, "phi", i = i)
trials <- prep$data$vint1[i]
y <- prep$data$Y[i]
beta_binomial2_lpmf(y, mu, phi, trials)
}
定义log-likelihood函数后,就可以比较两个模型的拟合效果了
# 模型对比
# loo(fit_offical1, fit_offical2)
对比结果:
Output of model 'fit_offical1':
Computed from 4000 by 56 log-likelihood matrix.
Estimate SE
elpd_loo -100.6 10.4
p_loo 22.7 4.7
looic 201.2 20.9
------
MCSE of elpd_loo is NA.
MCSE and ESS estimates assume MCMC draws (r_eff in [0.4, 1.6]).
Pareto k diagnostic values:
Count Pct. Min. ESS
(-Inf, 0.7] (good) 48 85.7% 264
(0.7, 1] (bad) 6 10.7% <NA>
(1, Inf) (very bad) 2 3.6% <NA>
See help('pareto-k-diagnostic') for details.
Output of model 'fit_offical2':
Computed from 4000 by 56 log-likelihood matrix.
Estimate SE
elpd_loo -95.0 8.3
p_loo 10.8 2.0
looic 190.0 16.5
------
MCSE of elpd_loo is NA.
MCSE and ESS estimates assume MCMC draws (r_eff in [0.4, 1.1]).
Pareto k diagnostic values:
Count Pct. Min. ESS
(-Inf, 0.7] (good) 55 98.2% 118
(0.7, 1] (bad) 1 1.8% <NA>
(1, Inf) (very bad) 0 0.0% <NA>
See help('pareto-k-diagnostic') for details.
Model comparisons:
elpd_diff se_diff
fit_offical2 0.0 0.0
fit_offical1 -5.6 4.5
结果表明,ELPD 越大的模型拟合越好,因此第二个 beta-binomial 模型优于基准模型。
与 log_lik 类似,要使用 pp_check 等基于后验预测样本的检查,还需要为自定义分布族定义 posterior_predict 方法:
posterior_predict_beta_binomial2 <- function(i, prep, ...) {
mu <- brms::get_dpar(prep, "mu", i = i)
phi <- brms::get_dpar(prep, "phi", i = i)
trials <- prep$data$vint1[i]
beta_binomial2_rng(mu, phi, trials)
}
后验检查
# 模型预测结果的后验检验图
pp_check(fit_offical2)
最后,还需要定义 posterior_epred 方法,用于计算因变量均值的后验期望:
posterior_epred_beta_binomial2 <- function(prep) {
mu <- brms::get_dpar(prep, "mu")
trials <- prep$data$vint1
trials <- matrix(trials, nrow = nrow(mu), ncol = ncol(mu), byrow = TRUE)
mu * trials
}
直接依赖 posterior_epred 的后处理过程是 conditional_effects,可以用它可视化预测变量的效应。
# 预测效果可视化
# 为了便于解释,我们将size设置为1,以便上图的y轴表示概率。
conditional_effects(fit_offical2, conditions = data.frame(size = 1))