向前选择

# =========================================================
# 多元线性回归变量选择:向前选择法 Forward Selection
# =========================================================


# =========================================================
# 1. 构造十维多元回归模拟数据
# =========================================================

set.seed(123)

# 样本量
n <- 200

# 自变量个数
p <- 10

# 生成自变量矩阵
X_raw <- matrix(rnorm(n * p, mean = 0, sd = 1), nrow = n, ncol = p)

# 设置变量名
colnames(X_raw) <- paste0("X", 1:p)

# 真实截距
beta0_true <- 5

# 真实回归系数
# X4 和 X7 的真实系数为 0,表示它们是无效变量
beta_true <- c(
  2.0,     # X1
  -1.5,    # X2
  3.0,     # X3
  0.0,     # X4
  1.2,     # X5
  -2.0,    # X6
  0.0,     # X7
  0.8,     # X8
  1.5,     # X9
  -1.0     # X10
)

# 随机误差
epsilon <- rnorm(n, mean = 0, sd = 1)

# 因变量
Y <- beta0_true + X_raw %*% beta_true + epsilon
Y <- as.vector(Y)


cat("\n========== 真实模型信息 ==========\n")
cat("真实有效变量:X1, X2, X3, X5, X6, X8, X9, X10\n")
cat("真实无效变量:X4, X7\n")


# =========================================================
# 2. 手动 OLS 拟合函数
# =========================================================

manual_ols_by_vars <- function(X_raw, Y, selected_vars) {
  
  X_raw <- as.matrix(X_raw)
  Y <- matrix(Y, ncol = 1)
  
  n <- length(Y)
  
  # 如果没有选择变量,只拟合截距模型
  if (length(selected_vars) == 0) {
    
    X_design <- matrix(1, nrow = n, ncol = 1)
    colnames(X_design) <- "Intercept"
    
  } else {
    
    X_selected <- X_raw[, selected_vars, drop = FALSE]
    X_design <- cbind(1, X_selected)
    colnames(X_design) <- c("Intercept", selected_vars)
  }
  
  # 参数个数,包括截距项
  k <- ncol(X_design)
  
  # OLS 估计:beta_hat = (X'X)^(-1) X'Y
  beta_hat <- solve(t(X_design) %*% X_design) %*% t(X_design) %*% Y
  
  # 拟合值
  Y_hat <- X_design %*% beta_hat
  
  # 残差
  residuals <- Y - Y_hat
  
  # 残差平方和
  SSE <- sum(residuals^2)
  
  # 总平方和
  SST <- sum((Y - mean(Y))^2)
  
  # 回归平方和
  SSR <- SST - SSE
  
  # R方
  R2 <- 1 - SSE / SST
  
  # 调整后的R方
  adj_R2 <- 1 - (SSE / (n - k)) / (SST / (n - 1))
  
  # AIC 和 BIC
  AIC_value <- n * log(SSE / n) + 2 * k
  BIC_value <- n * log(SSE / n) + k * log(n)
  
  result <- list(
    selected_vars = selected_vars,
    X_design = X_design,
    beta_hat = beta_hat,
    fitted_values = Y_hat,
    residuals = residuals,
    SSE = SSE,
    SSR = SSR,
    SST = SST,
    R2 = R2,
    adj_R2 = adj_R2,
    AIC = AIC_value,
    BIC = BIC_value,
    k = k
  )
  
  return(result)
}


# =========================================================
# 3. 向前选择函数
# =========================================================
# 思路:
# 1. 从空模型开始;
# 2. 每一步尝试加入一个尚未进入模型的变量;
# 3. 选择使 AIC 或 BIC 最小的变量;
# 4. 如果加入该变量后准则下降,则保留;
# 5. 如果没有变量能继续降低准则,则停止。
# =========================================================

forward_selection <- function(X_raw, Y, criterion = "BIC") {
  
  all_vars <- colnames(X_raw)
  
  # 初始模型为空模型
  selected_vars <- character(0)
  remaining_vars <- all_vars
  
  # 当前模型
  current_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
  current_score <- current_fit[[criterion]]
  
  # 保存选择路径
  selection_path <- data.frame(
    step = 0,
    action = "Start",
    added_variable = "-",
    selected_variables = "-",
    AIC = current_fit$AIC,
    BIC = current_fit$BIC,
    R2 = current_fit$R2,
    adj_R2 = current_fit$adj_R2
  )
  
  step <- 1
  
  repeat {
    
    # 如果没有剩余变量,停止
    if (length(remaining_vars) == 0) {
      break
    }
    
    candidate_table <- data.frame()
    candidate_fit_list <- list()
    
    # 尝试加入每一个剩余变量
    for (var in remaining_vars) {
      
      candidate_vars <- c(selected_vars, var)
      
      fit <- manual_ols_by_vars(X_raw, Y, candidate_vars)
      
      candidate_table <- rbind(
        candidate_table,
        data.frame(
          candidate_variable = var,
          selected_variables = paste(candidate_vars, collapse = ", "),
          AIC = fit$AIC,
          BIC = fit$BIC,
          R2 = fit$R2,
          adj_R2 = fit$adj_R2
        )
      )
      
      candidate_fit_list[[var]] <- fit
    }
    
    # 找到当前步骤中最优的候选变量
    best_index <- which.min(candidate_table[[criterion]])
    best_var <- candidate_table$candidate_variable[best_index]
    best_fit <- candidate_fit_list[[best_var]]
    best_score <- best_fit[[criterion]]
    
    # 判断加入变量后准则是否下降
    if (best_score < current_score) {
      
      selected_vars <- c(selected_vars, best_var)
      remaining_vars <- setdiff(remaining_vars, best_var)
      
      current_fit <- best_fit
      current_score <- best_score
      
      selection_path <- rbind(
        selection_path,
        data.frame(
          step = step,
          action = "Add",
          added_variable = best_var,
          selected_variables = paste(selected_vars, collapse = ", "),
          AIC = current_fit$AIC,
          BIC = current_fit$BIC,
          R2 = current_fit$R2,
          adj_R2 = current_fit$adj_R2
        )
      )
      
      step <- step + 1
      
    } else {
      
      # 如果加入任何变量都不能降低 AIC 或 BIC,则停止
      break
    }
  }
  
  final_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
  
  result <- list(
    method = paste("Forward Selection by", criterion),
    criterion = criterion,
    selected_vars = selected_vars,
    final_fit = final_fit,
    selection_path = selection_path
  )
  
  return(result)
}


# =========================================================
# 4. 分别使用 AIC 和 BIC 进行向前选择
# =========================================================

forward_AIC <- forward_selection(X_raw, Y, criterion = "AIC")
forward_BIC <- forward_selection(X_raw, Y, criterion = "BIC")


# =========================================================
# 5. 输出 AIC 向前选择结果
# =========================================================

cat("\n\n========== AIC 向前选择路径 ==========\n")
print(forward_AIC$selection_path)

cat("\n========== AIC 向前选择最终结果 ==========\n")
cat("选择变量:", paste(forward_AIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(forward_AIC$selected_vars), "\n")
cat("最终 AIC =", forward_AIC$final_fit$AIC, "\n")
cat("最终 BIC =", forward_AIC$final_fit$BIC, "\n")
cat("最终 R2 =", forward_AIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", forward_AIC$final_fit$adj_R2, "\n")

cat("\nAIC 向前选择模型回归系数:\n")
AIC_coef_table <- data.frame(
  variable = colnames(forward_AIC$final_fit$X_design),
  estimate = as.vector(forward_AIC$final_fit$beta_hat)
)
print(AIC_coef_table)


# =========================================================
# 6. 输出 BIC 向前选择结果
# =========================================================

cat("\n\n========== BIC 向前选择路径 ==========\n")
print(forward_BIC$selection_path)

cat("\n========== BIC 向前选择最终结果 ==========\n")
cat("选择变量:", paste(forward_BIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(forward_BIC$selected_vars), "\n")
cat("最终 AIC =", forward_BIC$final_fit$AIC, "\n")
cat("最终 BIC =", forward_BIC$final_fit$BIC, "\n")
cat("最终 R2 =", forward_BIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", forward_BIC$final_fit$adj_R2, "\n")

cat("\nBIC 向前选择模型回归系数:\n")
BIC_coef_table <- data.frame(
  variable = colnames(forward_BIC$final_fit$X_design),
  estimate = as.vector(forward_BIC$final_fit$beta_hat)
)
print(BIC_coef_table)


# =========================================================
# 7. 与真实模型进行比较
# =========================================================

true_nonzero_vars <- c("X1", "X2", "X3", "X5", "X6", "X8", "X9", "X10")
true_zero_vars <- c("X4", "X7")

cat("\n\n========== AIC 向前选择与真实模型比较 ==========\n")
cat("AIC 选中变量:", paste(forward_AIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("AIC 是否错误保留 X4 或 X7:",
    any(forward_AIC$selected_vars %in% true_zero_vars), "\n")
cat("AIC 是否遗漏真实有效变量:",
    any(!(true_nonzero_vars %in% forward_AIC$selected_vars)), "\n")

cat("\n========== BIC 向前选择与真实模型比较 ==========\n")
cat("BIC 选中变量:", paste(forward_BIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("BIC 是否错误保留 X4 或 X7:",
    any(forward_BIC$selected_vars %in% true_zero_vars), "\n")
cat("BIC 是否遗漏真实有效变量:",
    any(!(true_nonzero_vars %in% forward_BIC$selected_vars)), "\n")


# =========================================================
# 8. 绘制向前选择路径图
# =========================================================

par(mfrow = c(1, 2))

plot(
  forward_AIC$selection_path$step,
  forward_AIC$selection_path$AIC,
  type = "b",
  pch = 19,
  col = "blue",
  xlab = "选择步骤",
  ylab = "AIC",
  main = "AIC 向前选择路径"
)

plot(
  forward_BIC$selection_path$step,
  forward_BIC$selection_path$BIC,
  type = "b",
  pch = 19,
  col = "red",
  xlab = "选择步骤",
  ylab = "BIC",
  main = "BIC 向前选择路径"
)

par(mfrow = c(1, 1))


# =========================================================
# 9. 最终结论
# =========================================================

cat("\n\n========== 最终结论 ==========\n")

cat("AIC 向前选择最终变量:",
    paste(forward_AIC$selected_vars, collapse = ", "), "\n")

cat("BIC 向前选择最终变量:",
    paste(forward_BIC$selected_vars, collapse = ", "), "\n")

cat("\n如果最终剔除了 X4 和 X7,并保留了 X1, X2, X3, X5, X6, X8, X9, X10,\n")
cat("说明向前选择法能够较好识别真实有效变量。\n")

向后剔除

# =========================================================
# 多元线性回归变量选择:向后剔除法 Backward Elimination
# 案例:十维多元线性回归
# =========================================================


# =========================================================
# 1. 构造十维多元回归模拟数据
# =========================================================

set.seed(123)

# 样本量
n <- 200

# 自变量个数
p <- 10

# 生成自变量矩阵
X_raw <- matrix(rnorm(n * p, mean = 0, sd = 1), nrow = n, ncol = p)

# 设置变量名
colnames(X_raw) <- paste0("X", 1:p)

# 真实截距
beta0_true <- 5

# 真实回归系数
# X4 和 X7 的真实系数为 0,表示它们是无效变量
beta_true <- c(
  2.0,     # X1
  -1.5,    # X2
  3.0,     # X3
  0.0,     # X4
  1.2,     # X5
  -2.0,    # X6
  0.0,     # X7
  0.8,     # X8
  1.5,     # X9
  -1.0     # X10
)

# 随机误差
epsilon <- rnorm(n, mean = 0, sd = 1)

# 因变量
Y <- beta0_true + X_raw %*% beta_true + epsilon
Y <- as.vector(Y)


cat("\n========== 真实模型信息 ==========\n")
cat("真实有效变量:X1, X2, X3, X5, X6, X8, X9, X10\n")
cat("真实无效变量:X4, X7\n")


# =========================================================
# 2. 手动 OLS 拟合函数
# =========================================================

manual_ols_by_vars <- function(X_raw, Y, selected_vars) {
  
  X_raw <- as.matrix(X_raw)
  Y <- matrix(Y, ncol = 1)
  
  n <- length(Y)
  
  # 如果没有选择变量,只拟合截距模型
  if (length(selected_vars) == 0) {
    
    X_design <- matrix(1, nrow = n, ncol = 1)
    colnames(X_design) <- "Intercept"
    
  } else {
    
    X_selected <- X_raw[, selected_vars, drop = FALSE]
    X_design <- cbind(1, X_selected)
    colnames(X_design) <- c("Intercept", selected_vars)
  }
  
  # 参数个数,包括截距项
  k <- ncol(X_design)
  
  # OLS 估计:beta_hat = (X'X)^(-1) X'Y
  beta_hat <- solve(t(X_design) %*% X_design) %*% t(X_design) %*% Y
  
  # 拟合值
  Y_hat <- X_design %*% beta_hat
  
  # 残差
  residuals <- Y - Y_hat
  
  # 残差平方和
  SSE <- sum(residuals^2)
  
  # 总平方和
  SST <- sum((Y - mean(Y))^2)
  
  # 回归平方和
  SSR <- SST - SSE
  
  # R方
  R2 <- 1 - SSE / SST
  
  # 调整后的R方
  adj_R2 <- 1 - (SSE / (n - k)) / (SST / (n - 1))
  
  # AIC 和 BIC
  AIC_value <- n * log(SSE / n) + 2 * k
  BIC_value <- n * log(SSE / n) + k * log(n)
  
  result <- list(
    selected_vars = selected_vars,
    X_design = X_design,
    beta_hat = beta_hat,
    fitted_values = Y_hat,
    residuals = residuals,
    SSE = SSE,
    SSR = SSR,
    SST = SST,
    R2 = R2,
    adj_R2 = adj_R2,
    AIC = AIC_value,
    BIC = BIC_value,
    k = k
  )
  
  return(result)
}


# =========================================================
# 3. 向后剔除函数
# =========================================================
# 思路:
# 1. 从完整模型开始;
# 2. 每一步尝试删除当前模型中的一个变量;
# 3. 选择使 AIC 或 BIC 最小的删除方案;
# 4. 如果删除该变量后准则下降,则删除;
# 5. 如果删除任何变量都不能继续降低准则,则停止。
# =========================================================

backward_elimination <- function(X_raw, Y, criterion = "BIC") {
  
  all_vars <- colnames(X_raw)
  
  # 初始模型为完整模型
  selected_vars <- all_vars
  
  # 当前模型
  current_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
  current_score <- current_fit[[criterion]]
  
  # 保存剔除路径
  selection_path <- data.frame(
    step = 0,
    action = "Start",
    removed_variable = "-",
    selected_variables = paste(selected_vars, collapse = ", "),
    AIC = current_fit$AIC,
    BIC = current_fit$BIC,
    R2 = current_fit$R2,
    adj_R2 = current_fit$adj_R2
  )
  
  step <- 1
  
  repeat {
    
    # 如果当前模型已经没有变量,则停止
    if (length(selected_vars) == 0) {
      break
    }
    
    candidate_table <- data.frame()
    candidate_fit_list <- list()
    
    # 尝试删除当前模型中的每一个变量
    for (var in selected_vars) {
      
      candidate_vars <- setdiff(selected_vars, var)
      
      fit <- manual_ols_by_vars(X_raw, Y, candidate_vars)
      
      if (length(candidate_vars) == 0) {
        vars_text <- "-"
      } else {
        vars_text <- paste(candidate_vars, collapse = ", ")
      }
      
      candidate_table <- rbind(
        candidate_table,
        data.frame(
          removed_variable = var,
          selected_variables = vars_text,
          AIC = fit$AIC,
          BIC = fit$BIC,
          R2 = fit$R2,
          adj_R2 = fit$adj_R2
        )
      )
      
      candidate_fit_list[[var]] <- fit
    }
    
    # 找到当前步骤中最优的删除变量
    best_index <- which.min(candidate_table[[criterion]])
    best_removed_var <- candidate_table$removed_variable[best_index]
    best_fit <- candidate_fit_list[[best_removed_var]]
    best_score <- best_fit[[criterion]]
    
    # 判断删除变量后准则是否下降
    if (best_score < current_score) {
      
      selected_vars <- setdiff(selected_vars, best_removed_var)
      
      current_fit <- best_fit
      current_score <- best_score
      
      if (length(selected_vars) == 0) {
        vars_text <- "-"
      } else {
        vars_text <- paste(selected_vars, collapse = ", ")
      }
      
      selection_path <- rbind(
        selection_path,
        data.frame(
          step = step,
          action = "Remove",
          removed_variable = best_removed_var,
          selected_variables = vars_text,
          AIC = current_fit$AIC,
          BIC = current_fit$BIC,
          R2 = current_fit$R2,
          adj_R2 = current_fit$adj_R2
        )
      )
      
      step <- step + 1
      
    } else {
      
      # 如果删除任何变量都不能降低 AIC 或 BIC,则停止
      break
    }
  }
  
  final_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
  
  result <- list(
    method = paste("Backward Elimination by", criterion),
    criterion = criterion,
    selected_vars = selected_vars,
    final_fit = final_fit,
    selection_path = selection_path
  )
  
  return(result)
}


# =========================================================
# 4. 分别使用 AIC 和 BIC 进行向后剔除
# =========================================================

backward_AIC <- backward_elimination(X_raw, Y, criterion = "AIC")
backward_BIC <- backward_elimination(X_raw, Y, criterion = "BIC")


# =========================================================
# 5. 输出 AIC 向后剔除结果
# =========================================================

cat("\n\n========== AIC 向后剔除路径 ==========\n")
print(backward_AIC$selection_path)

cat("\n========== AIC 向后剔除最终结果 ==========\n")
cat("选择变量:", paste(backward_AIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(backward_AIC$selected_vars), "\n")
cat("最终 AIC =", backward_AIC$final_fit$AIC, "\n")
cat("最终 BIC =", backward_AIC$final_fit$BIC, "\n")
cat("最终 R2 =", backward_AIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", backward_AIC$final_fit$adj_R2, "\n")

cat("\nAIC 向后剔除模型回归系数:\n")
AIC_coef_table <- data.frame(
  variable = colnames(backward_AIC$final_fit$X_design),
  estimate = as.vector(backward_AIC$final_fit$beta_hat)
)
print(AIC_coef_table)


# =========================================================
# 6. 输出 BIC 向后剔除结果
# =========================================================

cat("\n\n========== BIC 向后剔除路径 ==========\n")
print(backward_BIC$selection_path)

cat("\n========== BIC 向后剔除最终结果 ==========\n")
cat("选择变量:", paste(backward_BIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(backward_BIC$selected_vars), "\n")
cat("最终 AIC =", backward_BIC$final_fit$AIC, "\n")
cat("最终 BIC =", backward_BIC$final_fit$BIC, "\n")
cat("最终 R2 =", backward_BIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", backward_BIC$final_fit$adj_R2, "\n")

cat("\nBIC 向后剔除模型回归系数:\n")
BIC_coef_table <- data.frame(
  variable = colnames(backward_BIC$final_fit$X_design),
  estimate = as.vector(backward_BIC$final_fit$beta_hat)
)
print(BIC_coef_table)


# =========================================================
# 7. 与真实模型进行比较
# =========================================================

true_nonzero_vars <- c("X1", "X2", "X3", "X5", "X6", "X8", "X9", "X10")
true_zero_vars <- c("X4", "X7")

cat("\n\n========== AIC 向后剔除与真实模型比较 ==========\n")
cat("AIC 选中变量:", paste(backward_AIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("AIC 是否错误保留 X4 或 X7:",
    any(backward_AIC$selected_vars %in% true_zero_vars), "\n")
cat("AIC 是否遗漏真实有效变量:",
    any(!(true_nonzero_vars %in% backward_AIC$selected_vars)), "\n")

cat("\n========== BIC 向后剔除与真实模型比较 ==========\n")
cat("BIC 选中变量:", paste(backward_BIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("BIC 是否错误保留 X4 或 X7:",
    any(backward_BIC$selected_vars %in% true_zero_vars), "\n")
cat("BIC 是否遗漏真实有效变量:",
    any(!(true_nonzero_vars %in% backward_BIC$selected_vars)), "\n")


# =========================================================
# 8. 绘制向后剔除路径图
# =========================================================

par(mfrow = c(1, 2))

plot(
  backward_AIC$selection_path$step,
  backward_AIC$selection_path$AIC,
  type = "b",
  pch = 19,
  col = "blue",
  xlab = "剔除步骤",
  ylab = "AIC",
  main = "AIC 向后剔除路径"
)

plot(
  backward_BIC$selection_path$step,
  backward_BIC$selection_path$BIC,
  type = "b",
  pch = 19,
  col = "red",
  xlab = "剔除步骤",
  ylab = "BIC",
  main = "BIC 向后剔除路径"
)

par(mfrow = c(1, 1))


# =========================================================
# 9. 最终结论
# =========================================================

cat("\n\n========== 最终结论 ==========\n")

cat("AIC 向后剔除最终变量:",
    paste(backward_AIC$selected_vars, collapse = ", "), "\n")

cat("BIC 向后剔除最终变量:",
    paste(backward_BIC$selected_vars, collapse = ", "), "\n")

cat("\n如果最终剔除了 X4 和 X7,并保留了 X1, X2, X3, X5, X6, X8, X9, X10,\n")
cat("说明向后剔除法能够较好识别真实有效变量。\n")

逐步回归

# =========================================================
# 多元线性回归变量选择:逐步回归 Stepwise Selection
# 案例:十维多元线性回归
# =========================================================

options(stringsAsFactors = FALSE)


# =========================================================
# 1. 构造十维多元回归模拟数据
# =========================================================

set.seed(123)

# 样本量
n <- 200

# 自变量个数
p <- 10

# 生成自变量矩阵
X_raw <- matrix(rnorm(n * p, mean = 0, sd = 1), nrow = n, ncol = p)

# 设置变量名
colnames(X_raw) <- paste0("X", 1:p)

# 真实截距
beta0_true <- 5

# 真实回归系数
# X4 和 X7 的真实系数为 0,表示它们是无效变量
beta_true <- c(
  2.0,     # X1
  -1.5,    # X2
  3.0,     # X3
  0.0,     # X4
  1.2,     # X5
  -2.0,    # X6
  0.0,     # X7
  0.8,     # X8
  1.5,     # X9
  -1.0     # X10
)

# 随机误差
epsilon <- rnorm(n, mean = 0, sd = 1)

# 因变量
Y <- beta0_true + X_raw %*% beta_true + epsilon
Y <- as.vector(Y)

cat("\n========== 真实模型信息 ==========\n")
cat("真实有效变量:X1, X2, X3, X5, X6, X8, X9, X10\n")
cat("真实无效变量:X4, X7\n")


# =========================================================
# 2. 手动 OLS 拟合函数
# =========================================================

manual_ols_by_vars <- function(X_raw, Y, selected_vars) {
  
  X_raw <- as.matrix(X_raw)
  Y <- matrix(Y, ncol = 1)
  
  n <- length(Y)
  
  # 如果没有选择变量,只拟合截距模型
  if (length(selected_vars) == 0) {
    
    X_design <- matrix(1, nrow = n, ncol = 1)
    colnames(X_design) <- "Intercept"
    
  } else {
    
    X_selected <- X_raw[, selected_vars, drop = FALSE]
    X_design <- cbind(1, X_selected)
    colnames(X_design) <- c("Intercept", selected_vars)
  }
  
  # 参数个数,包括截距项
  k <- ncol(X_design)
  
  # OLS 估计:beta_hat = (X'X)^(-1) X'Y
  beta_hat <- solve(t(X_design) %*% X_design) %*% t(X_design) %*% Y
  
  # 拟合值
  Y_hat <- X_design %*% beta_hat
  
  # 残差
  residuals <- Y - Y_hat
  
  # 残差平方和
  SSE <- sum(residuals^2)
  
  # 总平方和
  SST <- sum((Y - mean(Y))^2)
  
  # 回归平方和
  SSR <- SST - SSE
  
  # R方
  R2 <- 1 - SSE / SST
  
  # 调整后的R方
  adj_R2 <- 1 - (SSE / (n - k)) / (SST / (n - 1))
  
  # AIC 和 BIC
  AIC_value <- n * log(SSE / n) + 2 * k
  BIC_value <- n * log(SSE / n) + k * log(n)
  
  result <- list(
    selected_vars = selected_vars,
    X_design = X_design,
    beta_hat = beta_hat,
    fitted_values = Y_hat,
    residuals = residuals,
    SSE = SSE,
    SSR = SSR,
    SST = SST,
    R2 = R2,
    adj_R2 = adj_R2,
    AIC = AIC_value,
    BIC = BIC_value,
    k = k
  )
  
  return(result)
}


# =========================================================
# 3. 逐步回归函数
# =========================================================
# 思路:
# 1. 从空模型开始;
# 2. 每一步同时尝试:
#    加入一个尚未进入模型的变量;
#    删除一个已经进入模型的变量;
# 3. 选择使 AIC 或 BIC 最小的操作;
# 4. 如果该操作能使准则下降,则执行;
# 5. 如果加入或删除都不能继续降低准则,则停止。
# =========================================================

stepwise_selection <- function(X_raw, Y, criterion = "BIC") {
  
  all_vars <- colnames(X_raw)
  
  # 初始模型为空模型
  selected_vars <- character(0)
  
  # 当前模型
  current_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
  current_score <- current_fit[[criterion]]
  
  # 保存选择路径
  selection_path <- data.frame(
    step = 0,
    action = "Start",
    changed_variable = "-",
    selected_variables = "-",
    AIC = current_fit$AIC,
    BIC = current_fit$BIC,
    R2 = current_fit$R2,
    adj_R2 = current_fit$adj_R2
  )
  
  step <- 1
  
  repeat {
    
    candidate_table <- data.frame()
    candidate_fit_list <- list()
    candidate_vars_list <- list()
    
    candidate_id <- 1
    
    # -----------------------------------------------------
    # 3.1 尝试加入变量
    # -----------------------------------------------------
    
    remaining_vars <- setdiff(all_vars, selected_vars)
    
    if (length(remaining_vars) > 0) {
      
      for (var in remaining_vars) {
        
        candidate_vars <- c(selected_vars, var)
        fit <- manual_ols_by_vars(X_raw, Y, candidate_vars)
        
        candidate_table <- rbind(
          candidate_table,
          data.frame(
            action = "Add",
            changed_variable = var,
            selected_variables = paste(candidate_vars, collapse = ", "),
            AIC = fit$AIC,
            BIC = fit$BIC,
            R2 = fit$R2,
            adj_R2 = fit$adj_R2
          )
        )
        
        candidate_fit_list[[candidate_id]] <- fit
        candidate_vars_list[[candidate_id]] <- candidate_vars
        
        candidate_id <- candidate_id + 1
      }
    }
    
    
    # -----------------------------------------------------
    # 3.2 尝试删除变量
    # -----------------------------------------------------
    
    if (length(selected_vars) > 0) {
      
      for (var in selected_vars) {
        
        candidate_vars <- setdiff(selected_vars, var)
        fit <- manual_ols_by_vars(X_raw, Y, candidate_vars)
        
        if (length(candidate_vars) == 0) {
          vars_text <- "-"
        } else {
          vars_text <- paste(candidate_vars, collapse = ", ")
        }
        
        candidate_table <- rbind(
          candidate_table,
          data.frame(
            action = "Remove",
            changed_variable = var,
            selected_variables = vars_text,
            AIC = fit$AIC,
            BIC = fit$BIC,
            R2 = fit$R2,
            adj_R2 = fit$adj_R2
          )
        )
        
        candidate_fit_list[[candidate_id]] <- fit
        candidate_vars_list[[candidate_id]] <- candidate_vars
        
        candidate_id <- candidate_id + 1
      }
    }
    
    
    # -----------------------------------------------------
    # 3.3 找到当前步骤中最优的操作
    # -----------------------------------------------------
    
    best_index <- which.min(candidate_table[[criterion]])
    
    best_action <- candidate_table$action[best_index]
    best_changed_var <- candidate_table$changed_variable[best_index]
    best_fit <- candidate_fit_list[[best_index]]
    best_vars <- candidate_vars_list[[best_index]]
    best_score <- best_fit[[criterion]]
    
    
    # -----------------------------------------------------
    # 3.4 判断准则是否下降
    # -----------------------------------------------------
    
    if (best_score < current_score) {
      
      selected_vars <- best_vars
      current_fit <- best_fit
      current_score <- best_score
      
      if (length(selected_vars) == 0) {
        vars_text <- "-"
      } else {
        vars_text <- paste(selected_vars, collapse = ", ")
      }
      
      selection_path <- rbind(
        selection_path,
        data.frame(
          step = step,
          action = best_action,
          changed_variable = best_changed_var,
          selected_variables = vars_text,
          AIC = current_fit$AIC,
          BIC = current_fit$BIC,
          R2 = current_fit$R2,
          adj_R2 = current_fit$adj_R2
        )
      )
      
      step <- step + 1
      
    } else {
      
      # 如果加入或删除任何变量都不能降低 AIC 或 BIC,则停止
      break
    }
  }
  
  final_fit <- manual_ols_by_vars(X_raw, Y, selected_vars)
  
  result <- list(
    method = paste("Stepwise Selection by", criterion),
    criterion = criterion,
    selected_vars = selected_vars,
    final_fit = final_fit,
    selection_path = selection_path
  )
  
  return(result)
}


# =========================================================
# 4. 分别使用 AIC 和 BIC 进行逐步回归
# =========================================================

stepwise_AIC <- stepwise_selection(X_raw, Y, criterion = "AIC")
stepwise_BIC <- stepwise_selection(X_raw, Y, criterion = "BIC")


# =========================================================
# 5. 输出 AIC 逐步回归结果
# =========================================================

cat("\n\n========== AIC 逐步回归路径 ==========\n")
print(stepwise_AIC$selection_path)

cat("\n========== AIC 逐步回归最终结果 ==========\n")
cat("选择变量:", paste(stepwise_AIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(stepwise_AIC$selected_vars), "\n")
cat("最终 AIC =", stepwise_AIC$final_fit$AIC, "\n")
cat("最终 BIC =", stepwise_AIC$final_fit$BIC, "\n")
cat("最终 R2 =", stepwise_AIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", stepwise_AIC$final_fit$adj_R2, "\n")

cat("\nAIC 逐步回归模型回归系数:\n")
AIC_coef_table <- data.frame(
  variable = colnames(stepwise_AIC$final_fit$X_design),
  estimate = as.vector(stepwise_AIC$final_fit$beta_hat)
)
print(AIC_coef_table)


# =========================================================
# 6. 输出 BIC 逐步回归结果
# =========================================================

cat("\n\n========== BIC 逐步回归路径 ==========\n")
print(stepwise_BIC$selection_path)

cat("\n========== BIC 逐步回归最终结果 ==========\n")
cat("选择变量:", paste(stepwise_BIC$selected_vars, collapse = ", "), "\n")
cat("变量个数:", length(stepwise_BIC$selected_vars), "\n")
cat("最终 AIC =", stepwise_BIC$final_fit$AIC, "\n")
cat("最终 BIC =", stepwise_BIC$final_fit$BIC, "\n")
cat("最终 R2 =", stepwise_BIC$final_fit$R2, "\n")
cat("最终调整后 R2 =", stepwise_BIC$final_fit$adj_R2, "\n")

cat("\nBIC 逐步回归模型回归系数:\n")
BIC_coef_table <- data.frame(
  variable = colnames(stepwise_BIC$final_fit$X_design),
  estimate = as.vector(stepwise_BIC$final_fit$beta_hat)
)
print(BIC_coef_table)


# =========================================================
# 7. 与真实模型进行比较
# =========================================================

true_nonzero_vars <- c("X1", "X2", "X3", "X5", "X6", "X8", "X9", "X10")
true_zero_vars <- c("X4", "X7")

cat("\n\n========== AIC 逐步回归与真实模型比较 ==========\n")
cat("AIC 选中变量:", paste(stepwise_AIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("AIC 是否错误保留 X4 或 X7:",
    any(stepwise_AIC$selected_vars %in% true_zero_vars), "\n")
cat("AIC 是否遗漏真实有效变量:",
    any(!(true_nonzero_vars %in% stepwise_AIC$selected_vars)), "\n")

cat("\n========== BIC 逐步回归与真实模型比较 ==========\n")
cat("BIC 选中变量:", paste(stepwise_BIC$selected_vars, collapse = ", "), "\n")
cat("真实有效变量:", paste(true_nonzero_vars, collapse = ", "), "\n")
cat("BIC 是否错误保留 X4 或 X7:",
    any(stepwise_BIC$selected_vars %in% true_zero_vars), "\n")
cat("BIC 是否遗漏真实有效变量:",
    any(!(true_nonzero_vars %in% stepwise_BIC$selected_vars)), "\n")


# =========================================================
# 8. 绘制逐步回归路径图
# =========================================================

par(mfrow = c(1, 2))

plot(
  stepwise_AIC$selection_path$step,
  stepwise_AIC$selection_path$AIC,
  type = "b",
  pch = 19,
  col = "blue",
  xlab = "逐步回归步骤",
  ylab = "AIC",
  main = "AIC 逐步回归路径"
)

plot(
  stepwise_BIC$selection_path$step,
  stepwise_BIC$selection_path$BIC,
  type = "b",
  pch = 19,
  col = "red",
  xlab = "逐步回归步骤",
  ylab = "BIC",
  main = "BIC 逐步回归路径"
)

par(mfrow = c(1, 1))


# =========================================================
# 9. 最终结论
# =========================================================

cat("\n\n========== 最终结论 ==========\n")

cat("AIC 逐步回归最终变量:",
    paste(stepwise_AIC$selected_vars, collapse = ", "), "\n")

cat("BIC 逐步回归最终变量:",
    paste(stepwise_BIC$selected_vars, collapse = ", "), "\n")

cat("\n如果最终剔除了 X4 和 X7,并保留了 X1, X2, X3, X5, X6, X8, X9, X10,\n")
cat("说明逐步回归能够较好识别真实有效变量。\n")

Logo

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。

更多推荐