R语言变量选择向前选择、向后剔除、逐步回归
·
向前选择
# =========================================================
# 多元线性回归变量选择:向前选择法 Forward Selection
# =========================================================
# =========================================================
# 1. 构造十维多元回归模拟数据
# =========================================================
set.seed(123)
# 样本量
n <- 200
# 自变量个数
p <- 10
# 生成自变量矩阵
X_raw <- matrix(rnorm(n * p, mean = 0, sd = 1), nrow = n, ncol = p)
# 设置变量名
colnames(X_raw) <- paste0("X", 1:p)
# 真实截距
beta0_true <- 5
# 真实回归系数
# X4 和 X7 的真实系数为 0,表示它们是无效变量
beta_true <- c(
2.0, # X1
-1.5, # X2
3.0, # X3
0.0, # X4
1.2, # X5
-2.0, # X6
0.0, # X7
0.8, # X8
1.5, # X9
-1.0 # X10
)
# 随机误差
epsilon <- rnorm(n, mean = 0, sd = 1)
# 因变量
Y <- beta0_true + X_raw %*% beta_true + epsilon
Y <- as.vector(Y)
cat("\n========== 真实模型信息 ==========\n")
cat("真实有效变量:X1, X2, X3, X5, X6, X8, X9, X10\n")
cat("真实无效变量:X4, X7\n")
# =========================================================
# 2. 手动 OLS 拟合函数
# =========================================================
manual_ols_by_vars <- function(X_raw, Y, selected_vars) {
X_raw <- as.matrix(X_raw)
Y <- matrix(Y, ncol = 1)
n <- length(Y)
# 如果没有选择变量,只拟合截距模型
if (length(selected_vars) == 0) {
X_design <- matrix(1, nrow = n, ncol = 1)
colnames(X_design) <- "Intercept"
} else {
X_selected <- X_raw[, selected_vars, drop = FALSE]
X_design <- cbind(1, X_selected)
colnames(X_design) <- c("Intercept", selected_vars)
}
# 参数个数,包括截距项
k <- ncol(X_design)
# OLS 估计:beta_hat = (X'X)^(-1) X'Y
beta_hat <- solve(t(X_design) %*% X_design) %*% t(X_design) %*% Y
# 拟合值
Y_hat <- X_design %*% beta_hat
# 残差
residuals <- Y - Y_hat
# 残差平方和
SSE <- sum(residuals^2)
# 总平方和
SST <- sum((Y - mean(Y))^2)
# 回归平方和
SSR <- SST - SSE
# R方
R2 <- 1 - SSE / SST
# 调整后的R方
adj_R2 <- 1 - (SSE / (n - k)) / (SST / (n - 1))
# AIC 和 BIC
AIC_value <- n * log(SSE / n) + 2 * k
BIC_value <- n * log(SSE / n) + k * log(n)
result <- list(
selected_vars = selected_vars,
X_design = X_design,
beta_hat = beta_hat,
fitted_values = Y_hat,
residuals = residuals,
SSE = SSE,
SSR = SSR,
SST = SST,
R2 = R2,
adj_R2 = adj_R2,
AIC = AIC_value,
BIC = BIC_value,
k = k
)
return(result)
}
# =========================================================
# 3. 向前选择函数
# =========================================================
# 思路:
# 1. 从空模型开始;
# 2. 每一步尝试加入一个尚未进入模型的变量;
# 3. 选择使 AIC 或 BIC 最小的变量;
# 4. 如果加入该变量后准则下降,则保留;
# 5. 如果没有变量能继续降低准则,则停止。
# =========================================================
forward_selection <- function(X_raw, Y, criterion = "BIC") {
all_vars <- colnames(X_raw)
# 初始模型为空模型
selected_vars <- character(0)
remaining_vars <- all_vars
# 当前模型
current_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
current_score <- current_fit[[criterion]]
# 保存选择路径
selection_path <- data.frame(
step = 0,
action = "Start",
added_variable = "-",
selected_variables = "-",
AIC = current_fit$AIC,
BIC = current_fit$BIC,
R2 = current_fit$R2,
adj_R2 = current_fit$adj_R2
)
step <- 1
repeat {
# 如果没有剩余变量,停止
if (length(remaining_vars) == 0) {
break
}
candidate_table <- data.frame()
candidate_fit_list <- list()
# 尝试加入每一个剩余变量
for (var in remaining_vars) {
candidate_vars <- c(selected_vars, var)
fit <- manual_ols_by_vars(X_raw, Y, candidate_vars)
candidate_table <- rbind(
candidate_table,
data.frame(
candidate_variable = var,
selected_variables = paste(candidate_vars, collapse = ", "),
AIC = fit$AIC,
BIC = fit$BIC,
R2 = fit$R2,
adj_R2 = fit$adj_R2
)
)
candidate_fit_list[[var]] <- fit
}
# 找到当前步骤中最优的候选变量
best_index <- which.min(candidate_table[[criterion]])
best_var <- candidate_table$candidate_variable[best_index]
best_fit <- candidate_fit_list[[best_var]]
best_score <- best_fit[[criterion]]
# 判断加入变量后准则是否下降
if (best_score < current_score) {
selected_vars <- c(selected_vars, best_var)
remaining_vars <- setdiff(remaining_vars, best_var)
current_fit <- best_fit
current_score <- best_score
selection_path <- rbind(
selection_path,
data.frame(
step = step,
action = "Add",
added_variable = best_var,
selected_variables = paste(selected_vars, collapse = ", "),
AIC = current_fit$AIC,
BIC = current_fit$BIC,
R2 = current_fit$R2,
adj_R2 = current_fit$adj_R2
)
)
step <- step + 1
} else {
# 如果加入任何变量都不能降低 AIC 或 BIC,则停止
break
}
}
final_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
result <- list(
method = paste("Forward Selection by", criterion),
criterion = criterion,
selected_vars = selected_vars,
final_fit = final_fit,
selection_path = selection_path
)
return(result)
}
# =========================================================
# 4. 分别使用 AIC 和 BIC 进行向前选择
# =========================================================
forward_AIC <- forward_selection(X_raw, Y, criterion = "AIC")
forward_BIC <- forward_selection(X_raw, Y, criterion = "BIC")
# =========================================================
# 5. 输出 AIC 向前选择结果
# =========================================================
cat("\n\n========== AIC 向前选择路径 ==========\n")
print(forward_AIC$selection_path)
cat("\n========== AIC 向前选择最终结果 ==========\n")
cat("选择变量:", paste(forward_AIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(forward_AIC$selected_vars), "\n")
cat("最终 AIC =", forward_AIC$final_fit$AIC, "\n")
cat("最终 BIC =", forward_AIC$final_fit$BIC, "\n")
cat("最终 R2 =", forward_AIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", forward_AIC$final_fit$adj_R2, "\n")
cat("\nAIC 向前选择模型回归系数:\n")
AIC_coef_table <- data.frame(
variable = colnames(forward_AIC$final_fit$X_design),
estimate = as.vector(forward_AIC$final_fit$beta_hat)
)
print(AIC_coef_table)
# =========================================================
# 6. 输出 BIC 向前选择结果
# =========================================================
cat("\n\n========== BIC 向前选择路径 ==========\n")
print(forward_BIC$selection_path)
cat("\n========== BIC 向前选择最终结果 ==========\n")
cat("选择变量:", paste(forward_BIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(forward_BIC$selected_vars), "\n")
cat("最终 AIC =", forward_BIC$final_fit$AIC, "\n")
cat("最终 BIC =", forward_BIC$final_fit$BIC, "\n")
cat("最终 R2 =", forward_BIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", forward_BIC$final_fit$adj_R2, "\n")
cat("\nBIC 向前选择模型回归系数:\n")
BIC_coef_table <- data.frame(
variable = colnames(forward_BIC$final_fit$X_design),
estimate = as.vector(forward_BIC$final_fit$beta_hat)
)
print(BIC_coef_table)
# =========================================================
# 7. 与真实模型进行比较
# =========================================================
true_nonzero_vars <- c("X1", "X2", "X3", "X5", "X6", "X8", "X9", "X10")
true_zero_vars <- c("X4", "X7")
cat("\n\n========== AIC 向前选择与真实模型比较 ==========\n")
cat("AIC 选中变量:", paste(forward_AIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("AIC 是否错误保留 X4 或 X7:",
any(forward_AIC$selected_vars %in% true_zero_vars), "\n")
cat("AIC 是否遗漏真实有效变量:",
any(!(true_nonzero_vars %in% forward_AIC$selected_vars)), "\n")
cat("\n========== BIC 向前选择与真实模型比较 ==========\n")
cat("BIC 选中变量:", paste(forward_BIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("BIC 是否错误保留 X4 或 X7:",
any(forward_BIC$selected_vars %in% true_zero_vars), "\n")
cat("BIC 是否遗漏真实有效变量:",
any(!(true_nonzero_vars %in% forward_BIC$selected_vars)), "\n")
# =========================================================
# 8. 绘制向前选择路径图
# =========================================================
par(mfrow = c(1, 2))
plot(
forward_AIC$selection_path$step,
forward_AIC$selection_path$AIC,
type = "b",
pch = 19,
col = "blue",
xlab = "选择步骤",
ylab = "AIC",
main = "AIC 向前选择路径"
)
plot(
forward_BIC$selection_path$step,
forward_BIC$selection_path$BIC,
type = "b",
pch = 19,
col = "red",
xlab = "选择步骤",
ylab = "BIC",
main = "BIC 向前选择路径"
)
par(mfrow = c(1, 1))
# =========================================================
# 9. 最终结论
# =========================================================
cat("\n\n========== 最终结论 ==========\n")
cat("AIC 向前选择最终变量:",
paste(forward_AIC$selected_vars, collapse = ", "), "\n")
cat("BIC 向前选择最终变量:",
paste(forward_BIC$selected_vars, collapse = ", "), "\n")
cat("\n如果最终剔除了 X4 和 X7,并保留了 X1, X2, X3, X5, X6, X8, X9, X10,\n")
cat("说明向前选择法能够较好识别真实有效变量。\n")
向后剔除
# =========================================================
# 多元线性回归变量选择:向后剔除法 Backward Elimination
# 案例:十维多元线性回归
# =========================================================
# =========================================================
# 1. 构造十维多元回归模拟数据
# =========================================================
set.seed(123)
# 样本量
n <- 200
# 自变量个数
p <- 10
# 生成自变量矩阵
X_raw <- matrix(rnorm(n * p, mean = 0, sd = 1), nrow = n, ncol = p)
# 设置变量名
colnames(X_raw) <- paste0("X", 1:p)
# 真实截距
beta0_true <- 5
# 真实回归系数
# X4 和 X7 的真实系数为 0,表示它们是无效变量
beta_true <- c(
2.0, # X1
-1.5, # X2
3.0, # X3
0.0, # X4
1.2, # X5
-2.0, # X6
0.0, # X7
0.8, # X8
1.5, # X9
-1.0 # X10
)
# 随机误差
epsilon <- rnorm(n, mean = 0, sd = 1)
# 因变量
Y <- beta0_true + X_raw %*% beta_true + epsilon
Y <- as.vector(Y)
cat("\n========== 真实模型信息 ==========\n")
cat("真实有效变量:X1, X2, X3, X5, X6, X8, X9, X10\n")
cat("真实无效变量:X4, X7\n")
# =========================================================
# 2. 手动 OLS 拟合函数
# =========================================================
manual_ols_by_vars <- function(X_raw, Y, selected_vars) {
X_raw <- as.matrix(X_raw)
Y <- matrix(Y, ncol = 1)
n <- length(Y)
# 如果没有选择变量,只拟合截距模型
if (length(selected_vars) == 0) {
X_design <- matrix(1, nrow = n, ncol = 1)
colnames(X_design) <- "Intercept"
} else {
X_selected <- X_raw[, selected_vars, drop = FALSE]
X_design <- cbind(1, X_selected)
colnames(X_design) <- c("Intercept", selected_vars)
}
# 参数个数,包括截距项
k <- ncol(X_design)
# OLS 估计:beta_hat = (X'X)^(-1) X'Y
beta_hat <- solve(t(X_design) %*% X_design) %*% t(X_design) %*% Y
# 拟合值
Y_hat <- X_design %*% beta_hat
# 残差
residuals <- Y - Y_hat
# 残差平方和
SSE <- sum(residuals^2)
# 总平方和
SST <- sum((Y - mean(Y))^2)
# 回归平方和
SSR <- SST - SSE
# R方
R2 <- 1 - SSE / SST
# 调整后的R方
adj_R2 <- 1 - (SSE / (n - k)) / (SST / (n - 1))
# AIC 和 BIC
AIC_value <- n * log(SSE / n) + 2 * k
BIC_value <- n * log(SSE / n) + k * log(n)
result <- list(
selected_vars = selected_vars,
X_design = X_design,
beta_hat = beta_hat,
fitted_values = Y_hat,
residuals = residuals,
SSE = SSE,
SSR = SSR,
SST = SST,
R2 = R2,
adj_R2 = adj_R2,
AIC = AIC_value,
BIC = BIC_value,
k = k
)
return(result)
}
# =========================================================
# 3. 向后剔除函数
# =========================================================
# 思路:
# 1. 从完整模型开始;
# 2. 每一步尝试删除当前模型中的一个变量;
# 3. 选择使 AIC 或 BIC 最小的删除方案;
# 4. 如果删除该变量后准则下降,则删除;
# 5. 如果删除任何变量都不能继续降低准则,则停止。
# =========================================================
backward_elimination <- function(X_raw, Y, criterion = "BIC") {
all_vars <- colnames(X_raw)
# 初始模型为完整模型
selected_vars <- all_vars
# 当前模型
current_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
current_score <- current_fit[[criterion]]
# 保存剔除路径
selection_path <- data.frame(
step = 0,
action = "Start",
removed_variable = "-",
selected_variables = paste(selected_vars, collapse = ", "),
AIC = current_fit$AIC,
BIC = current_fit$BIC,
R2 = current_fit$R2,
adj_R2 = current_fit$adj_R2
)
step <- 1
repeat {
# 如果当前模型已经没有变量,则停止
if (length(selected_vars) == 0) {
break
}
candidate_table <- data.frame()
candidate_fit_list <- list()
# 尝试删除当前模型中的每一个变量
for (var in selected_vars) {
candidate_vars <- setdiff(selected_vars, var)
fit <- manual_ols_by_vars(X_raw, Y, candidate_vars)
if (length(candidate_vars) == 0) {
vars_text <- "-"
} else {
vars_text <- paste(candidate_vars, collapse = ", ")
}
candidate_table <- rbind(
candidate_table,
data.frame(
removed_variable = var,
selected_variables = vars_text,
AIC = fit$AIC,
BIC = fit$BIC,
R2 = fit$R2,
adj_R2 = fit$adj_R2
)
)
candidate_fit_list[[var]] <- fit
}
# 找到当前步骤中最优的删除变量
best_index <- which.min(candidate_table[[criterion]])
best_removed_var <- candidate_table$removed_variable[best_index]
best_fit <- candidate_fit_list[[best_removed_var]]
best_score <- best_fit[[criterion]]
# 判断删除变量后准则是否下降
if (best_score < current_score) {
selected_vars <- setdiff(selected_vars, best_removed_var)
current_fit <- best_fit
current_score <- best_score
if (length(selected_vars) == 0) {
vars_text <- "-"
} else {
vars_text <- paste(selected_vars, collapse = ", ")
}
selection_path <- rbind(
selection_path,
data.frame(
step = step,
action = "Remove",
removed_variable = best_removed_var,
selected_variables = vars_text,
AIC = current_fit$AIC,
BIC = current_fit$BIC,
R2 = current_fit$R2,
adj_R2 = current_fit$adj_R2
)
)
step <- step + 1
} else {
# 如果删除任何变量都不能降低 AIC 或 BIC,则停止
break
}
}
final_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
result <- list(
method = paste("Backward Elimination by", criterion),
criterion = criterion,
selected_vars = selected_vars,
final_fit = final_fit,
selection_path = selection_path
)
return(result)
}
# =========================================================
# 4. 分别使用 AIC 和 BIC 进行向后剔除
# =========================================================
backward_AIC <- backward_elimination(X_raw, Y, criterion = "AIC")
backward_BIC <- backward_elimination(X_raw, Y, criterion = "BIC")
# =========================================================
# 5. 输出 AIC 向后剔除结果
# =========================================================
cat("\n\n========== AIC 向后剔除路径 ==========\n")
print(backward_AIC$selection_path)
cat("\n========== AIC 向后剔除最终结果 ==========\n")
cat("选择变量:", paste(backward_AIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(backward_AIC$selected_vars), "\n")
cat("最终 AIC =", backward_AIC$final_fit$AIC, "\n")
cat("最终 BIC =", backward_AIC$final_fit$BIC, "\n")
cat("最终 R2 =", backward_AIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", backward_AIC$final_fit$adj_R2, "\n")
cat("\nAIC 向后剔除模型回归系数:\n")
AIC_coef_table <- data.frame(
variable = colnames(backward_AIC$final_fit$X_design),
estimate = as.vector(backward_AIC$final_fit$beta_hat)
)
print(AIC_coef_table)
# =========================================================
# 6. 输出 BIC 向后剔除结果
# =========================================================
cat("\n\n========== BIC 向后剔除路径 ==========\n")
print(backward_BIC$selection_path)
cat("\n========== BIC 向后剔除最终结果 ==========\n")
cat("选择变量:", paste(backward_BIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(backward_BIC$selected_vars), "\n")
cat("最终 AIC =", backward_BIC$final_fit$AIC, "\n")
cat("最终 BIC =", backward_BIC$final_fit$BIC, "\n")
cat("最终 R2 =", backward_BIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", backward_BIC$final_fit$adj_R2, "\n")
cat("\nBIC 向后剔除模型回归系数:\n")
BIC_coef_table <- data.frame(
variable = colnames(backward_BIC$final_fit$X_design),
estimate = as.vector(backward_BIC$final_fit$beta_hat)
)
print(BIC_coef_table)
# =========================================================
# 7. 与真实模型进行比较
# =========================================================
true_nonzero_vars <- c("X1", "X2", "X3", "X5", "X6", "X8", "X9", "X10")
true_zero_vars <- c("X4", "X7")
cat("\n\n========== AIC 向后剔除与真实模型比较 ==========\n")
cat("AIC 选中变量:", paste(backward_AIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("AIC 是否错误保留 X4 或 X7:",
any(backward_AIC$selected_vars %in% true_zero_vars), "\n")
cat("AIC 是否遗漏真实有效变量:",
any(!(true_nonzero_vars %in% backward_AIC$selected_vars)), "\n")
cat("\n========== BIC 向后剔除与真实模型比较 ==========\n")
cat("BIC 选中变量:", paste(backward_BIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("BIC 是否错误保留 X4 或 X7:",
any(backward_BIC$selected_vars %in% true_zero_vars), "\n")
cat("BIC 是否遗漏真实有效变量:",
any(!(true_nonzero_vars %in% backward_BIC$selected_vars)), "\n")
# =========================================================
# 8. 绘制向后剔除路径图
# =========================================================
par(mfrow = c(1, 2))
plot(
backward_AIC$selection_path$step,
backward_AIC$selection_path$AIC,
type = "b",
pch = 19,
col = "blue",
xlab = "剔除步骤",
ylab = "AIC",
main = "AIC 向后剔除路径"
)
plot(
backward_BIC$selection_path$step,
backward_BIC$selection_path$BIC,
type = "b",
pch = 19,
col = "red",
xlab = "剔除步骤",
ylab = "BIC",
main = "BIC 向后剔除路径"
)
par(mfrow = c(1, 1))
# =========================================================
# 9. 最终结论
# =========================================================
cat("\n\n========== 最终结论 ==========\n")
cat("AIC 向后剔除最终变量:",
paste(backward_AIC$selected_vars, collapse = ", "), "\n")
cat("BIC 向后剔除最终变量:",
paste(backward_BIC$selected_vars, collapse = ", "), "\n")
cat("\n如果最终剔除了 X4 和 X7,并保留了 X1, X2, X3, X5, X6, X8, X9, X10,\n")
cat("说明向后剔除法能够较好识别真实有效变量。\n")
逐步回归
# =========================================================
# 多元线性回归变量选择:逐步回归 Stepwise Selection
# 案例:十维多元线性回归
# =========================================================
options(stringsAsFactors = FALSE)
# =========================================================
# 1. 构造十维多元回归模拟数据
# =========================================================
set.seed(123)
# 样本量
n <- 200
# 自变量个数
p <- 10
# 生成自变量矩阵
X_raw <- matrix(rnorm(n * p, mean = 0, sd = 1), nrow = n, ncol = p)
# 设置变量名
colnames(X_raw) <- paste0("X", 1:p)
# 真实截距
beta0_true <- 5
# 真实回归系数
# X4 和 X7 的真实系数为 0,表示它们是无效变量
beta_true <- c(
2.0, # X1
-1.5, # X2
3.0, # X3
0.0, # X4
1.2, # X5
-2.0, # X6
0.0, # X7
0.8, # X8
1.5, # X9
-1.0 # X10
)
# 随机误差
epsilon <- rnorm(n, mean = 0, sd = 1)
# 因变量
Y <- beta0_true + X_raw %*% beta_true + epsilon
Y <- as.vector(Y)
cat("\n========== 真实模型信息 ==========\n")
cat("真实有效变量:X1, X2, X3, X5, X6, X8, X9, X10\n")
cat("真实无效变量:X4, X7\n")
# =========================================================
# 2. 手动 OLS 拟合函数
# =========================================================
manual_ols_by_vars <- function(X_raw, Y, selected_vars) {
X_raw <- as.matrix(X_raw)
Y <- matrix(Y, ncol = 1)
n <- length(Y)
# 如果没有选择变量,只拟合截距模型
if (length(selected_vars) == 0) {
X_design <- matrix(1, nrow = n, ncol = 1)
colnames(X_design) <- "Intercept"
} else {
X_selected <- X_raw[, selected_vars, drop = FALSE]
X_design <- cbind(1, X_selected)
colnames(X_design) <- c("Intercept", selected_vars)
}
# 参数个数,包括截距项
k <- ncol(X_design)
# OLS 估计:beta_hat = (X'X)^(-1) X'Y
beta_hat <- solve(t(X_design) %*% X_design) %*% t(X_design) %*% Y
# 拟合值
Y_hat <- X_design %*% beta_hat
# 残差
residuals <- Y - Y_hat
# 残差平方和
SSE <- sum(residuals^2)
# 总平方和
SST <- sum((Y - mean(Y))^2)
# 回归平方和
SSR <- SST - SSE
# R方
R2 <- 1 - SSE / SST
# 调整后的R方
adj_R2 <- 1 - (SSE / (n - k)) / (SST / (n - 1))
# AIC 和 BIC
AIC_value <- n * log(SSE / n) + 2 * k
BIC_value <- n * log(SSE / n) + k * log(n)
result <- list(
selected_vars = selected_vars,
X_design = X_design,
beta_hat = beta_hat,
fitted_values = Y_hat,
residuals = residuals,
SSE = SSE,
SSR = SSR,
SST = SST,
R2 = R2,
adj_R2 = adj_R2,
AIC = AIC_value,
BIC = BIC_value,
k = k
)
return(result)
}
# =========================================================
# 3. 逐步回归函数
# =========================================================
# 思路:
# 1. 从空模型开始;
# 2. 每一步同时尝试:
# 加入一个尚未进入模型的变量;
# 删除一个已经进入模型的变量;
# 3. 选择使 AIC 或 BIC 最小的操作;
# 4. 如果该操作能使准则下降,则执行;
# 5. 如果加入或删除都不能继续降低准则,则停止。
# =========================================================
stepwise_selection <- function(X_raw, Y, criterion = "BIC") {
all_vars <- colnames(X_raw)
# 初始模型为空模型
selected_vars <- character(0)
# 当前模型
current_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
current_score <- current_fit[[criterion]]
# 保存选择路径
selection_path <- data.frame(
step = 0,
action = "Start",
changed_variable = "-",
selected_variables = "-",
AIC = current_fit$AIC,
BIC = current_fit$BIC,
R2 = current_fit$R2,
adj_R2 = current_fit$adj_R2
)
step <- 1
repeat {
candidate_table <- data.frame()
candidate_fit_list <- list()
candidate_vars_list <- list()
candidate_id <- 1
# -----------------------------------------------------
# 3.1 尝试加入变量
# -----------------------------------------------------
remaining_vars <- setdiff(all_vars, selected_vars)
if (length(remaining_vars) > 0) {
for (var in remaining_vars) {
candidate_vars <- c(selected_vars, var)
fit <- manual_ols_by_vars(X_raw, Y, candidate_vars)
candidate_table <- rbind(
candidate_table,
data.frame(
action = "Add",
changed_variable = var,
selected_variables = paste(candidate_vars, collapse = ", "),
AIC = fit$AIC,
BIC = fit$BIC,
R2 = fit$R2,
adj_R2 = fit$adj_R2
)
)
candidate_fit_list[[candidate_id]] <- fit
candidate_vars_list[[candidate_id]] <- candidate_vars
candidate_id <- candidate_id + 1
}
}
# -----------------------------------------------------
# 3.2 尝试删除变量
# -----------------------------------------------------
if (length(selected_vars) > 0) {
for (var in selected_vars) {
candidate_vars <- setdiff(selected_vars, var)
fit <- manual_ols_by_vars(X_raw, Y, candidate_vars)
if (length(candidate_vars) == 0) {
vars_text <- "-"
} else {
vars_text <- paste(candidate_vars, collapse = ", ")
}
candidate_table <- rbind(
candidate_table,
data.frame(
action = "Remove",
changed_variable = var,
selected_variables = vars_text,
AIC = fit$AIC,
BIC = fit$BIC,
R2 = fit$R2,
adj_R2 = fit$adj_R2
)
)
candidate_fit_list[[candidate_id]] <- fit
candidate_vars_list[[candidate_id]] <- candidate_vars
candidate_id <- candidate_id + 1
}
}
# -----------------------------------------------------
# 3.3 找到当前步骤中最优的操作
# -----------------------------------------------------
best_index <- which.min(candidate_table[[criterion]])
best_action <- candidate_table$action[best_index]
best_changed_var <- candidate_table$changed_variable[best_index]
best_fit <- candidate_fit_list[[best_index]]
best_vars <- candidate_vars_list[[best_index]]
best_score <- best_fit[[criterion]]
# -----------------------------------------------------
# 3.4 判断准则是否下降
# -----------------------------------------------------
if (best_score < current_score) {
selected_vars <- best_vars
current_fit <- best_fit
current_score <- best_score
if (length(selected_vars) == 0) {
vars_text <- "-"
} else {
vars_text <- paste(selected_vars, collapse = ", ")
}
selection_path <- rbind(
selection_path,
data.frame(
step = step,
action = best_action,
changed_variable = best_changed_var,
selected_variables = vars_text,
AIC = current_fit$AIC,
BIC = current_fit$BIC,
R2 = current_fit$R2,
adj_R2 = current_fit$adj_R2
)
)
step <- step + 1
} else {
# 如果加入或删除任何变量都不能降低 AIC 或 BIC,则停止
break
}
}
final_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
result <- list(
method = paste("Stepwise Selection by", criterion),
criterion = criterion,
selected_vars = selected_vars,
final_fit = final_fit,
selection_path = selection_path
)
return(result)
}
# =========================================================
# 4. 分别使用 AIC 和 BIC 进行逐步回归
# =========================================================
stepwise_AIC <- stepwise_selection(X_raw, Y, criterion = "AIC")
stepwise_BIC <- stepwise_selection(X_raw, Y, criterion = "BIC")
# =========================================================
# 5. 输出 AIC 逐步回归结果
# =========================================================
cat("\n\n========== AIC 逐步回归路径 ==========\n")
print(stepwise_AIC$selection_path)
cat("\n========== AIC 逐步回归最终结果 ==========\n")
cat("选择变量:", paste(stepwise_AIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(stepwise_AIC$selected_vars), "\n")
cat("最终 AIC =", stepwise_AIC$final_fit$AIC, "\n")
cat("最终 BIC =", stepwise_AIC$final_fit$BIC, "\n")
cat("最终 R2 =", stepwise_AIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", stepwise_AIC$final_fit$adj_R2, "\n")
cat("\nAIC 逐步回归模型回归系数:\n")
AIC_coef_table <- data.frame(
variable = colnames(stepwise_AIC$final_fit$X_design),
estimate = as.vector(stepwise_AIC$final_fit$beta_hat)
)
print(AIC_coef_table)
# =========================================================
# 6. 输出 BIC 逐步回归结果
# =========================================================
cat("\n\n========== BIC 逐步回归路径 ==========\n")
print(stepwise_BIC$selection_path)
cat("\n========== BIC 逐步回归最终结果 ==========\n")
cat("选择变量:", paste(stepwise_BIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(stepwise_BIC$selected_vars), "\n")
cat("最终 AIC =", stepwise_BIC$final_fit$AIC, "\n")
cat("最终 BIC =", stepwise_BIC$final_fit$BIC, "\n")
cat("最终 R2 =", stepwise_BIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", stepwise_BIC$final_fit$adj_R2, "\n")
cat("\nBIC 逐步回归模型回归系数:\n")
BIC_coef_table <- data.frame(
variable = colnames(stepwise_BIC$final_fit$X_design),
estimate = as.vector(stepwise_BIC$final_fit$beta_hat)
)
print(BIC_coef_table)
# =========================================================
# 7. 与真实模型进行比较
# =========================================================
true_nonzero_vars <- c("X1", "X2", "X3", "X5", "X6", "X8", "X9", "X10")
true_zero_vars <- c("X4", "X7")
cat("\n\n========== AIC 逐步回归与真实模型比较 ==========\n")
cat("AIC 选中变量:", paste(stepwise_AIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("AIC 是否错误保留 X4 或 X7:",
any(stepwise_AIC$selected_vars %in% true_zero_vars), "\n")
cat("AIC 是否遗漏真实有效变量:",
any(!(true_nonzero_vars %in% stepwise_AIC$selected_vars)), "\n")
cat("\n========== BIC 逐步回归与真实模型比较 ==========\n")
cat("BIC 选中变量:", paste(stepwise_BIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("BIC 是否错误保留 X4 或 X7:",
any(stepwise_BIC$selected_vars %in% true_zero_vars), "\n")
cat("BIC 是否遗漏真实有效变量:",
any(!(true_nonzero_vars %in% stepwise_BIC$selected_vars)), "\n")
# =========================================================
# 8. 绘制逐步回归路径图
# =========================================================
par(mfrow = c(1, 2))
plot(
stepwise_AIC$selection_path$step,
stepwise_AIC$selection_path$AIC,
type = "b",
pch = 19,
col = "blue",
xlab = "逐步回归步骤",
ylab = "AIC",
main = "AIC 逐步回归路径"
)
plot(
stepwise_BIC$selection_path$step,
stepwise_BIC$selection_path$BIC,
type = "b",
pch = 19,
col = "red",
xlab = "逐步回归步骤",
ylab = "BIC",
main = "BIC 逐步回归路径"
)
par(mfrow = c(1, 1))
# =========================================================
# 9. 最终结论
# =========================================================
cat("\n\n========== 最终结论 ==========\n")
cat("AIC 逐步回归最终变量:",
paste(stepwise_AIC$selected_vars, collapse = ", "), "\n")
cat("BIC 逐步回归最终变量:",
paste(stepwise_BIC$selected_vars, collapse = ", "), "\n")
cat("\n如果最终剔除了 X4 和 X7,并保留了 X1, X2, X3, X5, X6, X8, X9, X10,\n")
cat("说明逐步回归能够较好识别真实有效变量。\n")
AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。
更多推荐

所有评论(0)