Day 9 Hands-On Coding: Random Forest Hyperparameter Tuning and Time Series Cross-Validation

Objectives

  1. Understand how each hyperparameter affects the model
  2. Master the correct use of TimeSeriesSplit
  3. Tune hyperparameters with GridSearchCV and RandomizedSearchCV
  4. Diagnose the model with learning curves and validation curves
  5. Find the best parameter combination

1. Import the Required Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    TimeSeriesSplit, GridSearchCV, RandomizedSearchCV,
    learning_curve, validation_curve, cross_val_score
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, classification_report
)
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Keep text.usetex disabled so the script does not depend on a LaTeX installation
plt.rcParams['text.usetex'] = False
# Font settings (SimHei / Microsoft YaHei cover CJK glyphs; DejaVu Sans is the fallback)
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

2. Load Data and Build Features

def generate_financial_data(ts_code):
    """生成金融数据"""
    
    data_path = Path(r"E:\AppData\quant_trade\klines\kline2014-2024")
    kline_file = data_path / f"{ts_code}.csv"
    
    df = pd.read_csv(kline_file, usecols=["trade_date", "close", "vol"],
                     parse_dates=["trade_date"])\
            .rename(columns={"vol": "volume"})\
            .sort_values(by=["trade_date"])\
            .reset_index(drop=True)
    
    df['return'] = df['close'].pct_change()
    
    # Technical indicators
    # RSI (simple rolling-mean variant, computed here on daily returns)
    delta = df['return'].fillna(0)
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = -delta.where(delta < 0, 0).rolling(14).mean()
    rs = gain / (loss + 1e-10)
    df['rsi'] = 100 - (100 / (1 + rs))
    
    # MACD
    ema12 = df['close'].ewm(span=12, adjust=False).mean()
    ema26 = df['close'].ewm(span=26, adjust=False).mean()
    df['macd'] = ema12 - ema26
    df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    
    # Moving-average ratio
    df['ma5'] = df['close'].rolling(5).mean()
    df['ma20'] = df['close'].rolling(20).mean()
    df['ma_ratio'] = df['ma5'] / df['ma20'] - 1
    
    # Volatility
    df['volatility'] = df['return'].rolling(20).std()
    
    # Volume ratio
    df['volume_ratio'] = df['volume'] / df['volume'].rolling(10).mean()
    
    # Momentum features (lagged returns)
    for lag in [1, 2, 3, 5, 10]:
        df[f'momentum_{lag}'] = df['return'].shift(lag).fillna(0)
    
    # Target: whether the daily return 3 trading days ahead is positive
    df['target'] = (df['return'].shift(-3) > 0).astype(int)
    
    # Drop rows with missing values
    df = df.dropna()
    
    return df

# Load the data
ts_code = "600519.SH"
df = generate_financial_data(ts_code)
print(f"数据形状: {df.shape}")

# Feature selection
feature_cols = ['rsi', 'macd', 'macd_signal', 'ma_ratio', 'volatility', 
                'volume_ratio', 'momentum_1', 'momentum_2', 'momentum_3', 
                'momentum_5', 'momentum_10']
X = df[feature_cols]
y = df['target']

print(f"特征数量: {len(feature_cols)}")
print(f"样本数量: {len(X)}")
print(f"目标分布: {y.value_counts(normalize=True)}")

# Chronological train/test split
split_idx = int(len(X) * 0.7)
X_train = X[:split_idx]
X_test = X[split_idx:]
y_train = y[:split_idx]
y_test = y[split_idx:]

print(f"\n训练集: {len(X_train)} 样本")
print(f"测试集: {len(X_test)} 样本")

# Standardization (random forests do not need it; computed here only for comparison and not used below)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Data shape: (2452, 18)
Number of features: 11
Number of samples: 2452
Target distribution: target
1    0.504894
0    0.495106
Name: proportion, dtype: float64

Training set: 1716 samples
Test set: 736 samples

3. TimeSeriesSplit in Detail

3.1 Visualizing TimeSeriesSplit

sklearn.model_selection.TimeSeriesSplit is the cross-validation splitter in Scikit-learn designed specifically for time series data. Unlike an ordinary KFold, it strictly respects temporal order: the training fold always comes before the test fold in time, which prevents "future data leakage" (using future information to predict the past).

TimeSeriesSplit uses an expanding-window strategy.

  • Training fold: grows with each successive split, accumulating more and more history.
  • Test fold: always lies after the training fold, and its size is usually fixed.

Key parameters:

Parameter       | Default | Description
n_splits        | 5       | Number of splits (i.e., how many train/test folds are produced).
test_size       | None    | Size of each test fold; if None, it defaults to n_samples // (n_splits + 1).
max_train_size  | None    | Maximum length of the training fold; useful for simulating a sliding window and keeping training time bounded.
gap             | 0       | Number of samples skipped between the training and test folds, to reduce leakage from short-term autocorrelation (e.g., using yesterday's data to predict tomorrow).
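
To make test_size, gap, and max_train_size concrete, here is a minimal sketch on a toy array (the 20-sample length and the parameter values are illustrative assumptions, not values used elsewhere in this post):

# Minimal sketch: how test_size, gap and max_train_size shape the folds
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X_demo = np.arange(20).reshape(-1, 1)  # 20 dummy samples in time order

# Expanding window with a 2-sample gap between train and test
tscv_gap = TimeSeriesSplit(n_splits=3, test_size=4, gap=2)
for i, (tr, te) in enumerate(tscv_gap.split(X_demo), start=1):
    print(f"Fold {i}: train {tr[0]}-{tr[-1]}, test {te[0]}-{te[-1]}")

# Sliding window: cap the training fold at 8 samples
tscv_slide = TimeSeriesSplit(n_splits=3, test_size=4, max_train_size=8)
for i, (tr, te) in enumerate(tscv_slide.split(X_demo), start=1):
    print(f"Fold {i}: train {tr[0]}-{tr[-1]} (len={len(tr)}), test {te[0]}-{te[-1]}")
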
def visualize_timeseries_split(X, n_splits=5, test_size=100):
    """可视化TimeSeriesSplit的分割方式"""
    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
    
    fig, axes = plt.subplots(n_splits, 1, figsize=(14, 2*n_splits))
    
    for idx, (train_idx, test_idx) in enumerate(tscv.split(X)):
        # Build a per-sample color array
        colors = np.full(len(X), 'lightgray')
        colors[train_idx] = 'blue'
        colors[test_idx] = 'red'
        
        axes[idx].bar(range(len(X)), [1]*len(X), color=colors, width=1)
        axes[idx].set_ylabel(f'Fold {idx+1}')
        axes[idx].set_xlim(0, len(X))
        axes[idx].set_yticks([])
        axes[idx].axvline(x=train_idx[-1], color='black', linestyle='--', alpha=0.5)
        
    axes[-1].set_xlabel('Sample index (chronological order)')
    fig.suptitle('TimeSeriesSplit visualization (blue = train, red = test)', fontsize=14)
    plt.tight_layout()
    plt.show()

# Visualize
print("How TimeSeriesSplit works:")
print("- The training fold always precedes the test fold")
print("- The training fold grows with each split")
print("- Test folds do not overlap")
visualize_timeseries_split(X_train, n_splits=4, test_size=100)
How TimeSeriesSplit works:
- The training fold always precedes the test fold
- The training fold grows with each split
- Test folds do not overlap

[Figure: TimeSeriesSplit folds (blue = train, red = test)]

3.2 Plain K-Fold vs. TimeSeriesSplit

from sklearn.model_selection import KFold

def compare_cv_methods(X, y, model, n_splits=5):
    """对比普通K折和时间序列交叉验证"""
    
    # Plain K-fold (shuffled)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    kf_scores = cross_val_score(model, X, y, cv=kf, scoring='roc_auc')
    
    # Time series cross-validation
    tscv = TimeSeriesSplit(n_splits=n_splits)
    tscv_scores = cross_val_score(model, X, y, cv=tscv, scoring='roc_auc')
    
    print("="*60)
    print("交叉验证方法对比")
    print("="*60)
    print(f"普通K折 (shuffle=True):")
    print(f"  AUC均值: {kf_scores.mean():.4f} ± {kf_scores.std():.4f}")
    print(f"\nTimeSeriesSplit:")
    print(f"  AUC均值: {tscv_scores.mean():.4f} ± {tscv_scores.std():.4f}")
    print(f"\n⚠️ 普通K折在时间序列数据上会引入前视偏差,结果过于乐观!")
    
    return kf_scores, tscv_scores

# Comparison test (small subset for a quick demo)
X_small = X_train[:500]
y_small = y_train[:500]
rf_small = RandomForestClassifier(n_estimators=50, random_state=42)

kf_scores, tscv_scores = compare_cv_methods(X_small, y_small, rf_small)
============================================================
Cross-validation method comparison
============================================================
Plain K-fold (shuffle=True):
  Mean AUC: 0.4991 ± 0.0183

TimeSeriesSplit:
  Mean AUC: 0.5323 ± 0.0505

⚠️ Plain K-fold shuffles samples across time and leaks future information (look-ahead bias); its score is not a trustworthy estimate on time series data!

4. Single-Parameter Impact Analysis

4.1 Effect of n_estimators

def analyze_n_estimators(X_train, y_train, X_test, y_test, n_range=range(10, 301, 20)):
    """分析n_estimators对模型性能的影响"""
    train_scores = []
    test_scores = []
    oob_scores = []
    times = []
    
    for n in n_range:
        start_time = time.time()
        rf = RandomForestClassifier(n_estimators=n, oob_score=True, 
                                    n_jobs=-1, random_state=42)
        rf.fit(X_train, y_train)
        times.append(time.time() - start_time)
        
        train_scores.append(accuracy_score(y_train, rf.predict(X_train)))
        test_scores.append(accuracy_score(y_test, rf.predict(X_test)))
        oob_scores.append(rf.oob_score_)
    
    # Visualization
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    
    # Accuracy curves
    axes[0, 0].plot(n_range, train_scores, 'b-o', label='Train', linewidth=2)
    axes[0, 0].plot(n_range, test_scores, 'r-s', label='Test', linewidth=2)
    axes[0, 0].set_xlabel('n_estimators')
    axes[0, 0].set_ylabel('Accuracy')
    axes[0, 0].set_title('Effect of n_estimators on accuracy')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)
    
    # OOB score
    axes[0, 1].plot(n_range, oob_scores, 'g-o', linewidth=2)
    axes[0, 1].set_xlabel('n_estimators')
    axes[0, 1].set_ylabel('OOB Score')
    axes[0, 1].set_title('Effect of n_estimators on OOB score')
    axes[0, 1].grid(True, alpha=0.3)
    
    # Marginal gain
    marginal_gain = np.diff(test_scores)
    axes[1, 0].plot(n_range[1:], marginal_gain, 'm-o', linewidth=2)
    axes[1, 0].axhline(y=0.001, color='r', linestyle='--', label='Marginal-gain threshold')
    axes[1, 0].set_xlabel('n_estimators')
    axes[1, 0].set_ylabel('Accuracy gain')
    axes[1, 0].set_title('Marginal-gain analysis')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)
    
    # Training time
    axes[1, 1].plot(n_range, times, 'c-o', linewidth=2)
    axes[1, 1].set_xlabel('n_estimators')
    axes[1, 1].set_ylabel('Training time (s)')
    axes[1, 1].set_title('Training time vs. n_estimators')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Recommendation
    best_n = n_range[np.argmax(test_scores)]
    print(f"Best n_estimators: {best_n}")
    print(f"Corresponding test accuracy: {max(test_scores):.4f}")
    
    return best_n

# Analyze n_estimators
best_n = analyze_n_estimators(X_train, y_train, X_test, y_test)

[Figure: effect of n_estimators on accuracy, OOB score, marginal gain, and training time]

Best n_estimators: 90
Corresponding test accuracy: 0.5109

4.2 Effect of max_depth

def analyze_max_depth(X_train, y_train, X_test, y_test, depth_range=range(3, 31, 3)):
    """分析max_depth对模型性能的影响"""
    train_scores = []
    test_scores = []
    oob_scores = []
    
    for depth in depth_range:
        rf = RandomForestClassifier(n_estimators=best_n, max_depth=depth,
                                    oob_score=True, n_jobs=-1, random_state=42)
        rf.fit(X_train, y_train)
        
        train_scores.append(accuracy_score(y_train, rf.predict(X_train)))
        test_scores.append(accuracy_score(y_test, rf.predict(X_test)))
        oob_scores.append(rf.oob_score_)
    
    # Visualization
    plt.figure(figsize=(12, 6))
    plt.plot(depth_range, train_scores, 'b-o', label='Train', linewidth=2, markersize=8)
    plt.plot(depth_range, test_scores, 'r-s', label='Test', linewidth=2, markersize=8)
    plt.plot(depth_range, oob_scores, 'g-^', label='OOB', linewidth=2, markersize=8)
    plt.xlabel('max_depth')
    plt.ylabel('Accuracy')
    plt.title('Effect of max_depth on model performance')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Mark the best depth
    best_depth = depth_range[np.argmax(test_scores)]
    plt.axvline(x=best_depth, color='purple', linestyle='--', 
                label=f'Best depth = {best_depth}')
    plt.legend()
    plt.show()
    
    print(f"最佳max_depth: {best_depth}")
    print(f"对应测试准确率: {max(test_scores):.4f}")
    
    # 过拟合判断
    overfit_gap = np.array(train_scores) - np.array(test_scores)
    max_gap_idx = np.argmax(overfit_gap)
    if overfit_gap[max_gap_idx] > 0.05:
        print(f"⚠️ 当depth={depth_range[max_gap_idx]}时,过拟合差距为{overfit_gap[max_gap_idx]:.4f}")
    
    return best_depth

# Analyze max_depth
best_depth = analyze_max_depth(X_train, y_train, X_test, y_test)

[Figure: effect of max_depth on train/test/OOB accuracy]

Best max_depth: 15
Corresponding test accuracy: 0.5190
⚠️ At depth=18 the train-test gap is 0.5000 (overfitting)

4.3 Effect of min_samples_split

def analyze_min_samples_split(X_train, y_train, X_test, y_test, 
                              split_range=[3, 5, 10, 20, 50, 100]):
    """分析min_samples_split对模型性能的影响"""
    train_scores = []
    test_scores = []
    
    for min_split in split_range:
        rf = RandomForestClassifier(n_estimators=best_n, max_depth=best_depth,
                                    min_samples_split=min_split,
                                    n_jobs=-1, random_state=42)
        rf.fit(X_train, y_train)
        
        train_scores.append(accuracy_score(y_train, rf.predict(X_train)))
        test_scores.append(accuracy_score(y_test, rf.predict(X_test)))
    
    # Visualization
    plt.figure(figsize=(10, 6))
    plt.plot(split_range, train_scores, 'b-o', label='Train', linewidth=2, markersize=8)
    plt.plot(split_range, test_scores, 'r-s', label='Test', linewidth=2, markersize=8)
    plt.xscale('log')
    plt.xlabel('min_samples_split (log scale)')
    plt.ylabel('Accuracy')
    plt.title('Effect of min_samples_split on model performance')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    best_split = split_range[np.argmax(test_scores)]
    plt.axvline(x=best_split, color='purple', linestyle='--', 
                label=f'Best value = {best_split}')
    plt.legend()
    plt.show()
    
    print(f"最佳min_samples_split: {best_split}")
    print(f"对应测试准确率: {max(test_scores):.4f}")
    
    return best_split

# Analyze min_samples_split
best_split = analyze_min_samples_split(X_train, y_train, X_test, y_test)

[Figure: effect of min_samples_split on train/test accuracy]

Best min_samples_split: 20
Corresponding test accuracy: 0.5177

5. Hyperparameter Tuning with Time Series Cross-Validation

5.1 Grid Search with TimeSeriesSplit

def grid_search_with_timeseries(X_train, y_train, param_grid, n_splits=5):
    """使用TimeSeriesSplit进行网格搜索"""
    
    # Time series cross-validation splitter
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    # Base model
    rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)
    
    # Grid search
    grid_search = GridSearchCV(
        rf_base, param_grid, 
        cv=tscv, 
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )
    
    start_time = time.time()
    grid_search.fit(X_train, y_train)
    elapsed_time = time.time() - start_time
    
    print(f"\n网格搜索完成,耗时: {elapsed_time:.2f}秒")
    print(f"最佳参数: {grid_search.best_params_}")
    print(f"最佳CV AUC: {grid_search.best_score_:.4f}")
    
    return grid_search

# Parameter grid (coarse search)
param_grid_coarse = {
    'max_depth': [10, 15, 20, 25, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4]
}

print("开始粗调网格搜索...")
print(f"参数组合数: {len(param_grid_coarse['max_depth']) * len(param_grid_coarse['min_samples_split']) * len(param_grid_coarse['min_samples_leaf'])}")

grid_search_coarse = grid_search_with_timeseries(X_train, y_train, param_grid_coarse)
Starting coarse grid search...
Number of parameter combinations: 60
Fitting 5 folds for each of 60 candidates, totalling 300 fits

Grid search finished in 42.71s
Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best CV AUC: 0.5096

5.2 Fine Search

# Narrow the parameter ranges around the coarse-search result
best_params = grid_search_coarse.best_params_
print(f"\nBest parameters from the coarse search: {best_params}")

# Fine-search parameter grid
param_grid_fine = {
    'max_depth': [best_params['max_depth'] - 2, best_params['max_depth'] - 1,
                  best_params['max_depth'], 
                  best_params['max_depth'] + 1, best_params['max_depth'] + 2],
    'min_samples_split': [max(2, best_params['min_samples_split'] - 2),
                          max(2, best_params['min_samples_split'] - 1),
                          best_params['min_samples_split'],
                          best_params['min_samples_split'] + 1,
                          best_params['min_samples_split'] + 2],
    'min_samples_leaf': [max(1, best_params['min_samples_leaf'] - 1),
                         best_params['min_samples_leaf'],
                         best_params['min_samples_leaf'] + 1]
}

# Keep only valid parameter values
param_grid_fine['max_depth'] = [d for d in param_grid_fine['max_depth'] if d >= 1]
param_grid_fine['min_samples_split'] = [s for s in param_grid_fine['min_samples_split'] if s >= 2]
param_grid_fine['min_samples_leaf'] = [l for l in param_grid_fine['min_samples_leaf'] if l >= 1]

print(f"\n精调参数网格: {param_grid_fine}")

grid_search_fine = grid_search_with_timeseries(X_train, y_train, param_grid_fine)
Best parameters from the coarse search: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}

Fine-search parameter grid: {'max_depth': [8, 9, 10, 11, 12], 'min_samples_split': [3, 4, 5, 6, 7], 'min_samples_leaf': [1, 1, 2]}
Fitting 5 folds for each of 75 candidates, totalling 375 fits

Grid search finished in 42.87s
Best parameters: {'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best CV AUC: 0.5124
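
Note that the fine grid above contains a duplicated min_samples_leaf value ([1, 1, 2]), which wastes fits. A small sketch of one way to deduplicate the lists before searching (dedupe_grid is a hypothetical helper, not part of the original script):

# Deduplicate each parameter list before passing the grid to GridSearchCV
def dedupe_grid(grid):
    return {name: sorted(set(values)) for name, values in grid.items()}

param_grid_fine = dedupe_grid(param_grid_fine)
print(param_grid_fine)
# e.g. {'max_depth': [8, 9, 10, 11, 12], 'min_samples_split': [3, 4, 5, 6, 7], 'min_samples_leaf': [1, 2]}
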

5.3 Randomized Search (Large Parameter Space)

from scipy.stats import randint

# Parameter distributions
param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(5, 50),
    'min_samples_split': randint(2, 30),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Randomized search
tscv = TimeSeriesSplit(n_splits=5)
rf_random = RandomForestClassifier(random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(
    rf_random, param_dist,
    n_iter=50,  # sample 50 parameter combinations
    cv=tscv,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

print("开始随机搜索(50次迭代)...")
start_time = time.time()
random_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time

print(f"\n随机搜索完成,耗时: {elapsed_time:.2f}秒")
print(f"最佳参数: {random_search.best_params_}")
print(f"最佳CV AUC: {random_search.best_score_:.4f}")
Starting randomized search (50 iterations)...
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Randomized search finished in 75.28s
Best parameters: {'bootstrap': True, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 19, 'min_samples_split': 8, 'n_estimators': 70}
Best CV AUC: 0.5163
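
Besides best_params_, it is worth inspecting the top few candidates to see how close they score to one another. A minimal sketch using the standard cv_results_ attribute (the particular columns selected below are just an assumption about what is interesting):

# Minimal sketch: look at the top 5 candidates of the randomized search
cv_results = pd.DataFrame(random_search.cv_results_)
top5 = (cv_results
        .sort_values('rank_test_score')
        .loc[:, ['rank_test_score', 'mean_test_score', 'std_test_score', 'params']]
        .head(5))
print(top5.to_string(index=False))
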

6. Learning Curves and Validation Curves

6.1 Learning Curve

def plot_learning_curve(estimator, X, y, cv, title="Learning curve"):
    """Plot a learning curve"""
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
        scoring='roc_auc'
    )
    
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    
    plt.figure(figsize=(10, 6))
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, 
                     alpha=0.1, color='blue')
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, 
                     alpha=0.1, color='red')
    plt.plot(train_sizes, train_mean, 'b-o', label='Train', linewidth=2)
    plt.plot(train_sizes, test_mean, 'r-s', label='Validation', linewidth=2)
    plt.xlabel('Number of training samples')
    plt.ylabel('AUC')
    plt.title(title)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.show()
    
    # Diagnosis
    final_gap = train_mean[-1] - test_mean[-1]
    print(f"Final training AUC: {train_mean[-1]:.4f}")
    print(f"Final validation AUC: {test_mean[-1]:.4f}")
    print(f"Gap: {final_gap:.4f}")
    
    if final_gap > 0.1:
        print("⚠️ The model may be overfitting; add regularization")
    elif train_mean[-1] < 0.6:
        print("⚠️ The model may be underfitting; increase model capacity")
    else:
        print("✅ The model fits well")

# Use the model with the best parameters
best_rf = random_search.best_estimator_
tscv_plot = TimeSeriesSplit(n_splits=5)
plot_learning_curve(best_rf, X_train, y_train, tscv_plot, 
                   f"Learning curve (n_estimators={best_rf.n_estimators})")

[Figure: learning curve, train vs. validation AUC]

Final training AUC: 0.8100
Final validation AUC: 0.5101
Gap: 0.2999
⚠️ The model may be overfitting; add regularization

6.2 Validation Curves

def plot_validation_curve(estimator, X, y, param_name, param_range, cv, title="Validation curve"):
    """Plot a validation curve"""
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring='roc_auc', n_jobs=-1
    )
    
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    
    plt.figure(figsize=(10, 6))
    plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, 
                     alpha=0.1, color='blue')
    plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, 
                     alpha=0.1, color='red')
    plt.plot(param_range, train_mean, 'b-o', label='Train', linewidth=2)
    plt.plot(param_range, test_mean, 'r-s', label='Validation', linewidth=2)
    plt.xlabel(param_name)
    plt.ylabel('AUC')
    plt.title(title)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    
    # Mark the best value
    best_idx = np.argmax(test_mean)
    best_val = param_range[best_idx]
    plt.axvline(x=best_val, color='green', linestyle='--', 
                label=f'Best value = {best_val}')
    plt.legend()
    plt.show()
    
    print(f"最佳{param_name}: {best_val}")
    print(f"对应验证AUC: {test_mean[best_idx]:.4f}")

# Validation curves for each parameter
tscv_val = TimeSeriesSplit(n_splits=5)

# Validation curve for max_depth
param_range_depth = [5, 10, 15, 20, 25, 30]
plot_validation_curve(RandomForestClassifier(n_estimators=best_n, random_state=42),
                     X_train, y_train, 'max_depth', param_range_depth, tscv_val,
                     'Validation curve - max_depth')

# Validation curve for min_samples_split
param_range_split = [2, 5, 10, 20, 50, 100]
plot_validation_curve(RandomForestClassifier(n_estimators=best_n, random_state=42),
                     X_train, y_train, 'min_samples_split', param_range_split, tscv_val,
                     'Validation curve - min_samples_split')

# Validation curve for min_samples_leaf
param_range_leaf = [1, 2, 4, 6, 8, 10]
plot_validation_curve(RandomForestClassifier(n_estimators=best_n, random_state=42),
                     X_train, y_train, 'min_samples_leaf', param_range_leaf, tscv_val,
                     'Validation curve - min_samples_leaf')

[Figure: validation curve - max_depth]

Best max_depth: 5
Corresponding validation AUC: 0.5027

[Figure: validation curve - min_samples_split]

Best min_samples_split: 100
Corresponding validation AUC: 0.5143

[Figure: validation curve - min_samples_leaf]

Best min_samples_leaf: 4
Corresponding validation AUC: 0.5049

7. Final Model Evaluation

7.1 Train the Final Model with the Best Parameters

best_params_final = random_search.best_params_
print("="*60)
print("最终模型参数")
print("="*60)
for param, value in best_params_final.items():
    print(f"{param}: {value}")

# Train the final model
rf_final = RandomForestClassifier(**best_params_final, random_state=42, n_jobs=-1)
rf_final.fit(X_train, y_train)

# Predict
y_pred_final = rf_final.predict(X_test)
y_proba_final = rf_final.predict_proba(X_test)[:, 1]

# Evaluate
print("\n" + "="*60)
print("Final model performance on the test set")
print("="*60)
print(f"Accuracy: {accuracy_score(y_test, y_pred_final):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_final):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_final):.4f}")
print(f"F1 score: {f1_score(y_test, y_pred_final):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_proba_final):.4f}")
============================================================
Final model parameters
============================================================
bootstrap: True
max_depth: 7
max_features: sqrt
min_samples_leaf: 19
min_samples_split: 8
n_estimators: 70

============================================================
Final model performance on the test set
============================================================
Accuracy: 0.4986
Precision: 0.4747
Recall: 0.6243
F1 score: 0.5393
AUC: 0.5006

7.2 Before vs. After Tuning

# Model with default parameters
rf_default = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_default.fit(X_train, y_train)
y_proba_default = rf_default.predict_proba(X_test)[:, 1]

# Comparison
print("="*60)
print("Performance before vs. after tuning")
print("="*60)
print(f"{'Metric':<12} {'Default':<12} {'Tuned':<12} {'Change':<12}")
print("-"*48)

# Compute each metric once for the default and the tuned model
metrics = [
    ('Accuracy',
     accuracy_score(y_test, rf_default.predict(X_test)),
     accuracy_score(y_test, y_pred_final)),
    ('AUC',
     roc_auc_score(y_test, y_proba_default),
     roc_auc_score(y_test, y_proba_final)),
]
for metric_name, default_val, final_val in metrics:
    improvement = final_val - default_val
    print(f"{metric_name:<12} {default_val:<12.4f} {final_val:<12.4f} {improvement:+.4f}")

# ROC curve comparison
plt.figure(figsize=(10, 8))

fpr_default, tpr_default, _ = roc_curve(y_test, y_proba_default)
fpr_final, tpr_final, _ = roc_curve(y_test, y_proba_final)

plt.plot(fpr_default, tpr_default, 'b-', linewidth=2, 
         label=f'Default parameters (AUC={roc_auc_score(y_test, y_proba_default):.4f})')
plt.plot(fpr_final, tpr_final, 'r-', linewidth=2, 
         label=f'Tuned (AUC={roc_auc_score(y_test, y_proba_final):.4f})')
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random classifier')

plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.title('ROC curves before vs. after tuning')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.show()
============================================================
Performance before vs. after tuning
============================================================
Metric       Default      Tuned        Change      
------------------------------------------------
Accuracy     0.5041       0.4986       -0.0054
AUC          0.5097       0.5006       -0.0091

[Figure: ROC curves, default vs. tuned model]

8. Feature Importance Analysis

# Feature importances
importances_final = rf_final.feature_importances_
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': importances_final
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(importance_df['feature'], importance_df['importance'])
plt.xlabel('Importance')
plt.title(f'Feature importances of the final model (n_estimators={rf_final.n_estimators})')
for i, (_, row) in enumerate(importance_df.iterrows()):
    plt.text(row['importance'] + 0.002, i, f"{row['importance']:.4f}", va='center')
plt.gca().invert_yaxis()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("特征重要性排序:")
print(importance_df.to_string(index=False))

[Figure: feature importance bar chart]

Feature importance ranking:
     feature  importance
  momentum_3    0.133388
  momentum_1    0.120860
  momentum_5    0.104773
 momentum_10    0.097087
  momentum_2    0.087876
 macd_signal    0.084436
         rsi    0.078785
    ma_ratio    0.077452
        macd    0.074416
volume_ratio    0.072558
  volatility    0.068370

9. Summary

  1. Core hyperparameters:

    • n_estimators: number of trees; affects stability and training time
    • max_depth: maximum tree depth; the most important regularization parameter
    • min_samples_split: minimum number of samples required to split a node
    • min_samples_leaf: minimum number of samples required in a leaf node
    • max_features: size of the feature subset; controls tree diversity
  2. Time series cross-validation:

    • Use TimeSeriesSplit instead of plain K-fold
    • The training fold always comes before the test fold
    • A gap can be set to prevent leakage from short-term dependence
  3. Tuning strategy:

    • Coarse search → fine search → joint tuning
    • Use validation curves to see how each parameter behaves
    • Use learning curves to diagnose overfitting / underfitting
  4. Advice for quantitative trading:

    • Use TimeSeriesSplit to avoid look-ahead bias
    • Re-tune periodically (every quarter or half year)
    • Keep a held-out validation set for the final evaluation
  5. Extension exercises

    • Exercise 1: Try different feature-subset sizes (max_features) and observe the effect on performance
    • Exercise 2: Run RandomizedSearchCV over a larger space (n_iter=100)
    • Exercise 3: Implement rolling-window cross-validation with a fixed window size (see the sketch after the quick-reference table below)
    • Exercise 4: Validate the tuning workflow on other real stock data
  6. Quantitative takeaways

    • Cross-validation on time series data must respect temporal order
    • Hyperparameter tuning can improve performance, but as the comparison in section 7.2 shows, the gains are not guaranteed
    • Balance model complexity against generalization ability
    • Test on an independent validation set before live trading
  7. Appendix: parameter quick-reference table

# ┌─────────────────────────────────────────────────────────────────────────┐
# │                  Random Forest Parameter Quick Reference                │
# ├─────────────────────────────────────────────────────────────────────────┤
# │                                                                         │
# │ Parameter          │ Default    │ Tuning range       │ Role                   │
# │ ───────────────────┼────────────┼────────────────────┼────────────────────────│
# │ n_estimators       │ 100        │ 50-500             │ Number of trees        │
# │ max_depth          │ None       │ 5-30               │ Maximum depth          │
# │ min_samples_split  │ 2          │ 2-20               │ Min samples to split   │
# │ min_samples_leaf   │ 1          │ 1-10               │ Min samples per leaf   │
# │ max_features       │ sqrt       │ sqrt, log2, None   │ Feature-subset size    │
# │ bootstrap          │ True       │ True, False        │ Use bootstrap sampling │
# │ oob_score          │ False      │ True               │ Compute OOB score      │
# │                                                                         │
# └─────────────────────────────────────────────────────────────────────────┘
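
For exercise 3 above, one simple way to get rolling-window (fixed window size) cross-validation is TimeSeriesSplit with max_train_size. A minimal sketch under that assumption, reusing X_train and y_train from section 2 (the window and test sizes below are illustrative, not tuned):

# Minimal sketch for exercise 3: rolling-window CV via max_train_size
rolling_cv = TimeSeriesSplit(n_splits=5, test_size=100, max_train_size=500)
fold_aucs = []
for fold, (tr_idx, te_idx) in enumerate(rolling_cv.split(X_train), start=1):
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train.iloc[tr_idx], y_train.iloc[tr_idx])
    proba = model.predict_proba(X_train.iloc[te_idx])[:, 1]
    auc = roc_auc_score(y_train.iloc[te_idx], proba)
    fold_aucs.append(auc)
    print(f"Fold {fold}: train size = {len(tr_idx)}, AUC = {auc:.4f}")
print(f"Mean AUC: {np.mean(fold_aucs):.4f}")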