01-编程基础与数学基石：微积分

xiaotao131

525人浏览 · 2026-04-17 00:19:56

xiaotao131 · 2026-04-17 00:19:56 发布

在这里插入图片描述

微积分：AI模型训练的优化引擎

一、为什么AI需要微积分？

1.1 微积分在AI中的核心地位

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

print("=" * 60)
print("微积分在AI中的应用场景")
print("=" * 60)

applications = {
    "梯度下降": "用导数找到损失函数的最小值",
    "反向传播": "用链式法则计算神经网络各层的梯度",
    "学习率调度": "理解梯度变化调整步长",
    "优化器设计": "Adam/SGD等基于梯度信息",
    "激活函数": "需要可导才能反向传播"
}

for app, desc in applications.items():
    print(f"\n📌 {app}:")
    print(f"   {desc}")

# 直观示例：用导数找到函数最小值
def f(x):
    """Evaluate the demo objective f(x) = x^2 + 2x + 1.

    Works on scalars and NumPy arrays alike (element-wise for arrays).
    """
    quadratic_term = x**2
    linear_term = 2*x
    return quadratic_term + linear_term + 1

def df(x):
    """Return the derivative of f(x) = x^2 + 2x + 1, i.e. 2x + 2."""
    slope_from_square = 2*x
    return slope_from_square + 2

x = np.linspace(-4, 2, 100)
y = f(x)

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(x, y, 'b-', linewidth=2, label='f(x) = x² + 2x + 1')
plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('函数曲线：我们要找到最小值')
plt.grid(True, alpha=0.3)
plt.legend()

# 梯度下降过程
x_current = 3
learning_rate = 0.3
history_x = [x_current]
history_y = [f(x_current)]

for _ in range(10):
    gradient = df(x_current)
    x_current = x_current - learning_rate * gradient
    history_x.append(x_current)
    history_y.append(f(x_current))

plt.subplot(1, 2, 2)
plt.plot(x, y, 'b-', linewidth=2, label='f(x)')
plt.plot(history_x, history_y, 'ro-', markersize=8, label='梯度下降路径')
plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title(f'梯度下降找到最小值 x={history_x[-1]:.3f}')
plt.grid(True, alpha=0.3)
plt.legend()

plt.tight_layout()
plt.show()

print(f"\n✨ 梯度下降过程:")
for i, (x_val, y_val) in enumerate(zip(history_x, history_y)):
    print(f"   第{i}步: x={x_val:.3f}, f(x)={y_val:.3f}")

二、导数：变化率的度量

2.1 导数的直观理解

导数 = 函数在某点的瞬时变化率 = 切线的斜率

# 导数的几何意义
def f1(x):
    """Return x squared; the parabola used to illustrate tangent lines."""
    return pow(x, 2)

def f1_derivative(x):
    """Return the derivative of x^2 at x, i.e. the tangent slope 2x."""
    tangent_slope = 2*x
    return tangent_slope

x_point = 1.5
x = np.linspace(-2, 3, 100)
y = f1(x)

# 计算切线
tangent_x = np.linspace(x_point - 1, x_point + 1, 50)
tangent_y = f1(x_point) + f1_derivative(x_point) * (tangent_x - x_point)

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(x, y, 'b-', linewidth=2, label='f(x) = x²')
plt.plot(tangent_x, tangent_y, 'r--', linewidth=2, label=f'切线 (斜率={f1_derivative(x_point):.1f})')
plt.plot(x_point, f1(x_point), 'ro', markersize=10)
plt.axhline(y=0, color='black', alpha=0.3)
plt.axvline(x=0, color='black', alpha=0.3)
plt.grid(True, alpha=0.3)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('导数 = 切线的斜率')
plt.legend()

# 不同点的导数
points = [-1.5, 0, 1.5]
plt.subplot(1, 2, 2)
plt.plot(x, y, 'b-', linewidth=2)

for xp in points:
    slope = f1_derivative(xp)
    tangent_x = np.linspace(xp - 0.8, xp + 0.8, 50)
    tangent_y = f1(xp) + slope * (tangent_x - xp)
    plt.plot(tangent_x, tangent_y, '--', linewidth=1.5, label=f'x={xp}, 斜率={slope:.1f}')
    plt.plot(xp, f1(xp), 'ro', markersize=8)

plt.grid(True, alpha=0.3)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('不同点的导数（斜率）不同')
plt.legend()

plt.tight_layout()
plt.show()

print("\n📐 导数的解读:")
print("   导数为正 → 函数在增加（向右走会变大）")
print("   导数为负 → 函数在减少（向右走会变小）")
print("   导数为零 → 可能是极值点（最小或最大）")

2.2 导数在AI中的应用：学习率

# 学习率的影响：太大不收敛，太小收敛慢
def loss_function(w):
    """Evaluate the toy loss L(w) = w^2 + 2w + 5 for a scalar or array `w`."""
    curvature_part = w**2
    tilt_part = 2*w
    return curvature_part + tilt_part + 5

def gradient(w):
    """Return dL/dw = 2w + 2 for the toy loss L(w) = w^2 + 2w + 5."""
    doubled = 2*w
    return doubled + 2

# 测试不同学习率
learning_rates = [0.05, 0.3, 0.8, 1.2]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for idx, lr in enumerate(learning_rates):
    ax = axes[idx // 2, idx % 2]
    
    w = 3.0  # 初始值
    history = [w]
    
    for _ in range(20):
        grad = gradient(w)
        w = w - lr * grad
        history.append(w)
    
    w_range = np.linspace(-4, 4, 100)
    loss_range = loss_function(w_range)
    
    ax.plot(w_range, loss_range, 'b-', linewidth=2)
    ax.plot(history, [loss_function(w) for w in history], 'ro-', markersize=6)
    ax.set_xlabel('权重 w')
    ax.set_ylabel('损失')
    ax.set_title(f'学习率 = {lr}')
    ax.grid(True, alpha=0.3)
    
    if lr > 1:
        ax.text(0.5, 0.8, '⚠️ 太大，发散！', transform=ax.transAxes, 
                fontsize=12, color='red', ha='center')
    elif lr < 0.1:
        ax.text(0.5, 0.8, '🐢 太小，收敛慢', transform=ax.transAxes,
                fontsize=12, color='orange', ha='center')
    else:
        ax.text(0.5, 0.8, '✅ 合适，快速收敛', transform=ax.transAxes,
                fontsize=12, color='green', ha='center')

plt.suptitle('学习率对梯度下降的影响', fontsize=14)
plt.tight_layout()
plt.show()

print("\n💡 学习率选择建议:")
print("   学习率太大 → 震荡/发散（跨过最小值）")
print("   学习率太小 → 收敛太慢（需要很多步）")
print("   合适的学习率 → 平稳下降到最小值")

三、偏导数：多变量函数的梯度

3.1 从单变量到多变量

# 二元函数：f(x, y) = x² + y²
def f_2d(x, y):
    """Evaluate the paraboloid f(x, y) = x^2 + y^2 (element-wise for arrays)."""
    x_squared = x**2
    y_squared = y**2
    return x_squared + y_squared

# 偏导数：对x求导时把y当常数，对y求导时把x当常数
def df_dx(x, y):
    """Return the partial derivative of x^2 + y^2 with respect to x, i.e. 2x.

    `y` is treated as a constant and does not affect the result; it is kept
    in the signature so both partial derivatives are called the same way.
    """
    partial_x = 2*x
    return partial_x

def df_dy(x, y):
    """Return the partial derivative of x^2 + y^2 with respect to y, i.e. 2y.

    `x` is treated as a constant and does not affect the result; it is kept
    in the signature so both partial derivatives are called the same way.
    """
    partial_y = 2*y
    return partial_y

# 可视化
x = np.linspace(-3, 3, 50)
y = np.linspace(-3, 3, 50)
X, Y = np.meshgrid(x, y)
Z = f_2d(X, Y)

fig = plt.figure(figsize=(14, 5))

# 3D曲面
ax1 = fig.add_subplot(1, 2, 1, projection='3d')
surf = ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('f(x,y) = x² + y²')

# 等高线 + 梯度向量
ax2 = fig.add_subplot(1, 2, 2)
contour = ax2.contour(X, Y, Z, levels=20, cmap='viridis')
ax2.clabel(contour, inline=True, fontsize=8)

# 在几个点绘制梯度向量
points = [(-2, -1), (2, -1), (-2, 1), (2, 1), (0, 0)]
for px, py in points:
    gx = df_dx(px, py)
    gy = df_dy(px, py)
    # 梯度方向指向增长最快的方向
    ax2.quiver(px, py, gx, gy, color='red', angles='xy', 
               scale_units='xy', scale=5, width=0.02)
    ax2.plot(px, py, 'bo', markersize=8)

ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('等高线 + 梯度向量（指向最陡上升方向）')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n📊 偏导数的解读:")
print("   ∂f/∂x = 2x: 在x方向的变化率")
print("   ∂f/∂y = 2y: 在y方向的变化率")
print("   梯度 ∇f = (∂f/∂x, ∂f/∂y) = (2x, 2y)")
print("   梯度指向函数增长最快的方向！")

3.2 AI实战：线性回归的梯度计算

# 线性回归：损失函数对参数的偏导数
np.random.seed(42)

# 生成数据
n_samples = 100
X = np.random.randn(n_samples, 1)
true_w = 2.5
true_b = 1.0
y = true_w * X.flatten() + true_b + np.random.randn(n_samples) * 0.5

# 损失函数：MSE = (1/n) * Σ(y_pred - y_true)²
# y_pred = w * x + b

def compute_gradients(X, y, w, b):
    """Return the MSE gradients (dL/dw, dL/db) of the model y_pred = w * x + b.

    Args:
        X: feature array; it is flattened to one dimension before use.
        y: target values, one per flattened feature.
        w: current slope.
        b: current intercept.

    Returns:
        Tuple ``(dw, db)`` where ``dw = (2/n) * sum(residual * x)`` and
        ``db = (2/n) * sum(residual)`` with ``residual = y_pred - y``.
    """
    features = X.flatten()
    residual = (w * features + b) - y
    two_over_n = 2 / len(y)

    slope_grad = two_over_n * np.sum(residual * features)
    intercept_grad = two_over_n * np.sum(residual)
    return slope_grad, intercept_grad

def compute_loss(X, y, w, b):
    """Return the mean squared error of y_pred = w * x + b against `y`.

    `X` is flattened to one dimension before the prediction is formed.
    """
    residual = w * X.flatten() + b - y
    return np.mean(residual ** 2)

# 可视化损失曲面
w_range = np.linspace(0, 4, 50)
b_range = np.linspace(-1, 3, 50)
W, B = np.meshgrid(w_range, b_range)
Loss = np.zeros_like(W)

for i in range(len(w_range)):
    for j in range(len(b_range)):
        Loss[j, i] = compute_loss(X, y, W[j, i], B[j, i])

# 梯度下降过程
w, b = 0.0, 0.0  # 初始值
learning_rate = 0.1
history = [(w, b, compute_loss(X, y, w, b))]

for epoch in range(50):
    dw, db = compute_gradients(X, y, w, b)
    w = w - learning_rate * dw
    b = b - learning_rate * db
    history.append((w, b, compute_loss(X, y, w, b)))

# 可视化
fig = plt.figure(figsize=(15, 5))

# 3D损失曲面
ax1 = fig.add_subplot(1, 3, 1, projection='3d')
surf = ax1.plot_surface(W, B, Loss, cmap='viridis', alpha=0.7)
ax1.set_xlabel('权重 w')
ax1.set_ylabel('偏置 b')
ax1.set_zlabel('损失')
ax1.set_title('损失函数曲面')

# 绘制梯度下降路径
w_hist = [h[0] for h in history]
b_hist = [h[1] for h in history]
loss_hist = [h[2] for h in history]
ax1.plot(w_hist, b_hist, loss_hist, 'ro-', markersize=4, linewidth=1)

# 等高线 + 路径
ax2 = fig.add_subplot(1, 3, 2)
contour = ax2.contour(W, B, Loss, levels=30, cmap='viridis')
ax2.plot(w_hist, b_hist, 'ro-', markersize=4, linewidth=1.5)
ax2.plot(true_w, true_b, 'b*', markersize=15, label='真实值')
ax2.set_xlabel('权重 w')
ax2.set_ylabel('偏置 b')
ax2.set_title('梯度下降路径（等高线图）')
ax2.legend()
ax2.grid(True, alpha=0.3)

# 损失下降曲线
ax3 = fig.add_subplot(1, 3, 3)
ax3.plot(range(len(history)), [h[2] for h in history], 'b-', linewidth=2)
ax3.set_xlabel('迭代次数')
ax3.set_ylabel('损失')
ax3.set_title('损失下降曲线')
ax3.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\n📈 线性回归结果:")
print(f"   真实值: w={true_w:.2f}, b={true_b:.2f}")
print(f"   估计值: w={w:.2f}, b={b:.2f}")
print(f"   最终损失: {history[-1][2]:.4f}")

四、链式法则：反向传播的核心

4.1 链式法则的原理

链式法则：复合函数的导数 = 外层导数（在内层函数值处取值）× 内层导数，即 [f(g(x))]' = f'(g(x)) · g'(x)

# 链式法则演示
def demonstrate_chain_rule():
    """Print a worked chain-rule example for f(g(x)) = (2x + 1)^2 at x = 2.

    Outer function: f(u) = u^2, so f'(u) = 2u.
    Inner function: g(x) = 2x + 1, so g'(x) = 2.
    Chain rule:     df/dx = f'(g(x)) * g'(x) = 2(2x + 1) * 2 = 4(2x + 1).

    The derivative is computed twice, once as the product of the two
    factors and once from the closed form, and both values are printed.
    """
    x = 2

    # Forward evaluation: inner function first, then the outer one.
    inner_value = 2*x + 1
    outer_value = inner_value**2

    # Chain rule: multiply the outer derivative by the inner derivative.
    outer_slope = 2*inner_value
    inner_slope = 2
    via_chain_rule = outer_slope * inner_slope

    # Closed-form derivative 4(2x + 1), used as a cross-check.
    via_closed_form = 4 * (2*x + 1)

    print(f"\n🔗 链式法则示例:")
    print(f"   x = {x}")
    print(f"   u = g(x) = 2x + 1 = {inner_value}")
    print(f"   f = u² = {outer_value}")
    print(f"   链式法则: df/dx = {outer_slope} × {inner_slope} = {via_chain_rule}")
    print(f"   直接求导: df/dx = {via_closed_form}")
    print(f"   ✅ 结果一致！")

demonstrate_chain_rule()

# 可视化链式法则
def f_outer(u):
    """Outer function of the composition: square its input."""
    return pow(u, 2)

def g_inner(x):
    """Inner function of the composition: the affine map 2x + 1."""
    doubled = 2*x
    return doubled + 1

x_vals = np.linspace(0, 3, 100)
u_vals = g_inner(x_vals)
f_vals = f_outer(u_vals)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# 内层函数
axes[0].plot(x_vals, u_vals, 'b-', linewidth=2)
axes[0].set_xlabel('x')
axes[0].set_ylabel('u = g(x)')
axes[0].set_title('内层函数: u = 2x + 1')
axes[0].grid(True, alpha=0.3)

# 外层函数
axes[1].plot(u_vals, f_vals, 'r-', linewidth=2)
axes[1].set_xlabel('u')
axes[1].set_ylabel('f = h(u)')
axes[1].set_title('外层函数: f = u²')
axes[1].grid(True, alpha=0.3)

# 复合函数
axes[2].plot(x_vals, f_vals, 'g-', linewidth=2)
axes[2].set_xlabel('x')
axes[2].set_ylabel('f(g(x))')
axes[2].set_title('复合函数: f(g(x)) = (2x+1)²')
axes[2].grid(True, alpha=0.3)

plt.suptitle('链式法则：复合函数的分解', fontsize=14)
plt.tight_layout()
plt.show()

4.2 神经网络中的反向传播

# 实现一个简单的神经网络演示反向传播
class SimpleNeuralNetwork:
    """Two-layer fully connected network (input -> hidden -> output), sigmoid units.

    `forward` caches the pre-activations (`z1`, `z2`) and activations
    (`a1`, `a2`) on the instance; `backward` reads `a1` and `W2`, so it must
    be called after `forward` on the same batch.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # Re-seeding makes every instance start from the same weights. Note
        # that this also resets NumPy's global random state for the caller.
        np.random.seed(42)
        hidden_draw = np.random.randn(input_size, hidden_size)
        self.W1 = hidden_draw * 0.5
        self.b1 = np.zeros(hidden_size)
        output_draw = np.random.randn(hidden_size, output_size)
        self.W2 = output_draw * 0.5
        self.b2 = np.zeros(output_size)

    def sigmoid(self, x):
        """Logistic function 1 / (1 + e^-x), applied element-wise."""
        denominator = 1 + np.exp(-x)
        return 1 / denominator

    def sigmoid_derivative(self, x):
        """Sigmoid slope expressed via the sigmoid OUTPUT: s * (1 - s).

        `x` must already be an activation value (as `backward` passes
        `self.a1`), not a pre-activation.
        """
        complement = 1 - x
        return x * complement

    def forward(self, X):
        """Run the forward pass on batch `X` and return the output activations."""
        hidden_pre = X @ self.W1 + self.b1
        hidden_act = self.sigmoid(hidden_pre)
        output_pre = hidden_act @ self.W2 + self.b2
        output_act = self.sigmoid(output_pre)

        # Cache intermediates for the backward pass.
        self.z1, self.a1 = hidden_pre, hidden_act
        self.z2, self.a2 = output_pre, output_act
        return output_act

    def backward(self, X, y, output):
        """Back-propagate through both layers using the chain rule.

        The output-layer delta is `output - y`. That is the gradient of the
        binary cross-entropy loss with respect to `z2` for a sigmoid output;
        it carries no extra sigmoid-derivative factor.

        Returns:
            Tuple ``(dW1, db1, dW2, db2)`` averaged over the batch.
        """
        inv_batch = 1 / X.shape[0]

        # Output layer.
        delta_out = output - y
        grad_W2 = inv_batch * self.a1.T @ delta_out
        grad_b2 = inv_batch * np.sum(delta_out, axis=0)

        # Hidden layer: push the delta back through W2, then through the
        # hidden sigmoid.
        upstream = delta_out @ self.W2.T
        delta_hidden = upstream * self.sigmoid_derivative(self.a1)
        grad_W1 = inv_batch * X.T @ delta_hidden
        grad_b1 = inv_batch * np.sum(delta_hidden, axis=0)

        return grad_W1, grad_b1, grad_W2, grad_b2

    def update(self, dW1, db1, dW2, db2, learning_rate):
        """Apply one in-place gradient-descent step to all four parameters."""
        pairs = (
            (self.W1, dW1),
            (self.b1, db1),
            (self.W2, dW2),
            (self.b2, db2),
        )
        for param, grad in pairs:
            # ndarray `-=` mutates the parameter array in place.
            param -= learning_rate * grad

# 生成异或数据
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

# 训练网络
nn = SimpleNeuralNetwork(2, 4, 1)
epochs = 5000
learning_rate = 0.5
losses = []

for epoch in range(epochs):
    # 前向传播
    output = nn.forward(X)
    
    # 计算损失
    loss = np.mean((output - y) ** 2)
    losses.append(loss)
    
    # 反向传播
    dW1, db1, dW2, db2 = nn.backward(X, y, output)
    
    # 更新参数
    nn.update(dW1, db1, dW2, db2, learning_rate)
    
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}: Loss = {loss:.6f}")

# 可视化结果
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# 损失曲线
axes[0].plot(losses, 'b-', linewidth=2)
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('训练损失下降曲线')
axes[0].grid(True, alpha=0.3)

# 预测结果
predictions = nn.forward(X)
axes[1].bar(range(4), predictions.flatten(), color=['red', 'green', 'green', 'red'])
axes[1].set_xticks(range(4))
axes[1].set_xticklabels(['00', '01', '10', '11'])
axes[1].set_ylabel('预测值')
axes[1].set_title('异或问题预测结果')
axes[1].axhline(y=0.5, color='black', linestyle='--', label='决策边界')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.suptitle('神经网络学习异或问题（反向传播）', fontsize=14)
plt.tight_layout()
plt.show()

print(f"\n🧠 神经网络训练结果:")
print(f"   输入: 00 → 预测: {predictions[0,0]:.3f} (期望: 0)")
print(f"   输入: 01 → 预测: {predictions[1,0]:.3f} (期望: 1)")
print(f"   输入: 10 → 预测: {predictions[2,0]:.3f} (期望: 1)")
print(f"   输入: 11 → 预测: {predictions[3,0]:.3f} (期望: 0)")

4.3 可视化反向传播的梯度流

# 可视化梯度在神经网络中的流动
def visualize_gradient_flow():
    """Plot illustrative gradient magnitudes flowing from the output layer to the input layer.

    Left panel: a bar chart of hand-picked gradient magnitudes per layer with
    arrows showing the back-propagation direction (output -> input).
    Right panel: three hand-picked profiles (normal / vanishing / exploding)
    plotted in back-propagation order, so the left-most point is the output
    layer and the right-most point is the input layer.

    All numbers are illustrative constants, not measurements from a network.
    The function shows the figure and prints a short summary; it returns None.
    """
    layers = ['输入层', '隐藏层1', '隐藏层2', '输出层']
    gradients = [0.01, 0.05, 0.25, 1.0]  # same order as `layers` (input -> output)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Per-layer gradient magnitude.
    x_pos = np.arange(len(layers))
    ax1.bar(x_pos, gradients, color='steelblue', alpha=0.7)
    ax1.set_xticks(x_pos)
    ax1.set_xticklabels(layers)
    ax1.set_ylabel('梯度大小')
    ax1.set_title('梯度反向传播：从输出层到输入层')
    ax1.grid(True, alpha=0.3, axis='y')

    # annotate() draws the arrow from `xytext` to `xy`, so the tail sits on the
    # layer closer to the output and the head on the layer closer to the input,
    # matching the direction stated in the title.
    for i in range(len(gradients) - 1):
        ax1.annotate('', xy=(i, gradients[i]), xytext=(i + 1, gradients[i + 1]),
                     arrowprops=dict(arrowstyle='->', color='red', lw=2))

    # Vanishing / exploding profiles. Each list starts at the output layer and
    # follows the gradient backwards, so the magnitude shrinks (vanishing) or
    # grows (exploding) as it approaches the input layer.
    grad_flow = {
        '正常': [0.8, 0.6, 0.4, 0.2],
        '梯度消失': [0.8, 0.2, 0.05, 0.01],
        '梯度爆炸': [0.8, 2.0, 5.0, 12.0]
    }
    backprop_order = layers[::-1]  # output layer first, to match grad_flow

    for name, grads in grad_flow.items():
        ax2.plot(backprop_order, grads, 'o-', linewidth=2, markersize=8, label=name)

    ax2.set_xlabel('反向传播方向 →')
    ax2.set_ylabel('梯度大小')
    ax2.set_title('梯度消失 vs 梯度爆炸')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\n⚠️ 梯度问题警示:")
    print("   梯度消失: 深层网络无法学习（使用ReLU、残差连接）")
    print("   梯度爆炸: 参数更新过大（使用梯度裁剪）")

visualize_gradient_flow()

五、梯度：优化的指南针

5.1 梯度的几何意义

梯度 = 所有偏导数组成的向量，指向函数增长最快的方向

# 梯度下降的几何演示
def rosenbrock(x, y):
    """Evaluate the Rosenbrock function (1 - x)^2 + 100 (y - x^2)^2.

    Accepts scalars or arrays (element-wise). The value is 0 at (1, 1).
    """
    valley_offset = y - x**2
    return (1 - x)**2 + 100 * valley_offset**2

def rosenbrock_gradient(x, y):
    """Return the gradient [df/dx, df/dy] of the Rosenbrock function as an array.

    df/dx = -2 (1 - x) - 400 x (y - x^2)
    df/dy = 200 (y - x^2)
    """
    valley_offset = y - x**2
    partial_x = -2 * (1 - x) - 400 * x * valley_offset
    partial_y = 200 * valley_offset
    return np.array([partial_x, partial_y])

# 梯度下降优化
x, y = -1.5, 1.5  # 起始点
learning_rate = 0.0005
path = [(x, y, rosenbrock(x, y))]

for _ in range(100):
    grad = rosenbrock_gradient(x, y)
    x -= learning_rate * grad[0]
    y -= learning_rate * grad[1]
    path.append((x, y, rosenbrock(x, y)))

# 可视化
x_range = np.linspace(-2, 2, 100)
y_range = np.linspace(-1, 3, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = rosenbrock(X, Y)

fig = plt.figure(figsize=(15, 5))

# 3D视图
ax1 = fig.add_subplot(1, 2, 1, projection='3d')
surf = ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)
ax1.plot([p[0] for p in path], [p[1] for p in path], [p[2] for p in path], 
         'ro-', markersize=4, linewidth=1.5)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('梯度下降在Rosenbrock函数上的路径')

# 等高线视图
ax2 = fig.add_subplot(1, 2, 2)
contour = ax2.contour(X, Y, Z, levels=50, cmap='viridis')
ax2.plot([p[0] for p in path], [p[1] for p in path], 'ro-', markersize=4, linewidth=1.5)
ax2.plot(1, 1, 'b*', markersize=15, label='全局最小值 (1,1)')
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('梯度下降路径（等高线）')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\n🎯 梯度下降结果:")
print(f"   起点: (-1.5, 1.5)")
print(f"   终点: ({path[-1][0]:.3f}, {path[-1][1]:.3f})")
print(f"   最小值点: (1, 1)")

5.2 各种优化算法对比

# 对比不同优化算法
class OptimizerComparison:
    """Reference implementations of three gradient-based optimisers.

    Every method takes a gradient callable and a starting NumPy array, runs
    `epochs` update steps, and returns the visited points stacked into one
    array whose first row is the starting point.
    """

    @staticmethod
    def gradient_descent(grad_func, start, lr=0.1, epochs=50):
        """Plain gradient descent: x <- x - lr * grad(x)."""
        position = start.copy()
        trajectory = [position.copy()]
        for _ in range(epochs):
            position = position - lr * grad_func(position)
            trajectory.append(position.copy())
        return np.array(trajectory)

    @staticmethod
    def momentum(grad_func, start, lr=0.1, momentum=0.9, epochs=50):
        """Gradient descent with momentum: v <- momentum * v - lr * grad; x <- x + v."""
        position = start.copy()
        velocity = np.zeros_like(position)
        trajectory = [position.copy()]
        for _ in range(epochs):
            step_grad = grad_func(position)
            velocity = momentum * velocity - lr * step_grad
            position = position + velocity
            trajectory.append(position.copy())
        return np.array(trajectory)

    @staticmethod
    def rmsprop(grad_func, start, lr=0.01, beta=0.9, eps=1e-8, epochs=50):
        """RMSprop: divide each step by the root of a running mean of squared gradients."""
        position = start.copy()
        squared_avg = np.zeros_like(position)
        trajectory = [position.copy()]
        for _ in range(epochs):
            step_grad = grad_func(position)
            squared_avg = beta * squared_avg + (1 - beta) * step_grad**2
            # eps keeps the division finite while the running average is ~0.
            denominator = np.sqrt(squared_avg) + eps
            position = position - lr * step_grad / denominator
            trajectory.append(position.copy())
        return np.array(trajectory)

# 测试函数：有崎岖表面的函数
def test_function(x):
    """Evaluate a bumpy 2-D objective at `x`, read as x[0] and x[1].

    The value is x0^2 + 10*x1^2 (a stretched bowl) plus the ripple term
    2*sin(5*x0)*cos(5*x1). `x` only needs to support indexing with 0 and 1,
    so a 2-element array or a pair of arrays both work.

    NOTE(review): despite the `test_` prefix this is an objective function,
    not a unit test; pytest would try to collect it if it ever appeared in
    a test module.
    """
    return x[0]**2 + 10 * x[1]**2 + 2 * np.sin(5*x[0]) * np.cos(5*x[1])

def test_gradient(x):
    """Return the gradient of x0^2 + 10*x1^2 + 2*sin(5*x0)*cos(5*x1) as an array.

    `x` is read as x[0] and x[1]; the result is [d/dx0, d/dx1].

    NOTE(review): despite the `test_` prefix this is a gradient function,
    not a unit test; pytest would try to collect it if it ever appeared in
    a test module.
    """
    return np.array([
        # d/dx0: 2*x0 from the bowl, 10*cos(5*x0)*cos(5*x1) from the ripple
        2*x[0] + 10 * np.cos(5*x[0]) * np.cos(5*x[1]),
        # d/dx1: 20*x1 from the bowl, -10*sin(5*x0)*sin(5*x1) from the ripple
        20*x[1] - 10 * np.sin(5*x[0]) * np.sin(5*x[1])
    ])

# 运行不同优化器
start = np.array([2.0, 1.5])
optimizers = {
    'SGD': OptimizerComparison.gradient_descent,
    'Momentum': OptimizerComparison.momentum,
    'RMSprop': OptimizerComparison.rmsprop
}

paths = {}
for name, opt_func in optimizers.items():
    paths[name] = opt_func(test_gradient, start, epochs=30)

# 可视化
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

x_range = np.linspace(-3, 3, 100)
y_range = np.linspace(-2, 2, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = test_function([X, Y])

for idx, (name, path) in enumerate(paths.items()):
    ax = axes[idx]
    contour = ax.contour(X, Y, Z, levels=30, cmap='viridis', alpha=0.7)
    ax.plot(path[:, 0], path[:, 1], 'ro-', markersize=3, linewidth=1.5)
    ax.plot(start[0], start[1], 'bs', markersize=8, label='起点')
    ax.plot(0, 0, 'g*', markersize=12, label='最小值')
    ax.set_title(f'{name}')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('不同优化算法对比', fontsize=14)
plt.tight_layout()
plt.show()

print("\n📊 优化算法特点:")
print("   SGD: 简单但可能震荡")
print("   Momentum: 加速收敛，减少震荡")
print("   RMSprop: 自适应学习率，处理不同尺度特征")
print("   Adam: 结合Momentum和RMSprop（最常用）")

六、实战：从零实现梯度下降

# 完整实现：使用梯度下降训练线性模型
class LinearRegressionGD:
    """Linear regression trained from scratch with batch gradient descent on MSE.

    Attributes set by `fit`:
        weights: 1-D array with one coefficient per feature.
        bias: scalar intercept.
        loss_history: list with the MSE recorded at every iteration of the
            most recent `fit` call.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.loss_history = []

    def fit(self, X, y):
        """Fit the model to `X` (n_samples, n_features) and `y` (n_samples values).

        `y` may be a flat vector or a single column; it is flattened so that
        `y_pred - y` stays element-wise instead of broadcasting to an
        (n_samples, n_samples) matrix. Progress is printed every 100
        iterations.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if `y` does not hold exactly one value per row of `X`.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} values"
            )

        # Start every fit from fresh parameters and an empty history, so a
        # refit does not append to the previous run's loss curve.
        self.weights = np.random.randn(n_features) * 0.01
        self.bias = 0.0
        self.loss_history = []

        for i in range(self.n_iterations):
            # Forward pass.
            y_pred = X @ self.weights + self.bias
            residual = y_pred - y

            # Mean squared error at the current parameters.
            loss = np.mean(residual ** 2)
            self.loss_history.append(loss)

            # Gradients of the MSE with respect to weights and bias.
            dw = (2/n_samples) * X.T @ residual
            db = (2/n_samples) * np.sum(residual)

            # Gradient-descent step.
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

            # Progress log (the learning rate itself stays constant).
            if i % 100 == 0:
                print(f"Epoch {i:4d}, Loss: {loss:.6f}")

        return self

    def predict(self, X):
        """Return predictions X @ weights + bias; call `fit` first."""
        return X @ self.weights + self.bias

# 生成带噪声的线性数据（用线性模型拟合）
np.random.seed(42)
X = np.linspace(0, 10, 100).reshape(-1, 1)
y = 2.5 * X.flatten() + 1.5 + np.random.randn(100) * 1.5

# 训练模型
model = LinearRegressionGD(learning_rate=0.01, n_iterations=500)
model.fit(X, y)

# 预测
y_pred = model.predict(X)

# 可视化
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# 拟合结果
axes[0].scatter(X, y, alpha=0.6, label='训练数据')
axes[0].plot(X, y_pred, 'r-', linewidth=2, label='拟合直线')
axes[0].set_xlabel('X')
axes[0].set_ylabel('y')
axes[0].set_title(f'线性回归结果\ny = {model.weights[0]:.3f}x + {model.bias:.3f}')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# 损失下降曲线
axes[1].plot(model.loss_history, 'b-', linewidth=2)
axes[1].set_xlabel('迭代次数')
axes[1].set_ylabel('损失 (MSE)')
axes[1].set_title('梯度下降：损失下降曲线')
axes[1].grid(True, alpha=0.3)

# 标注最终损失
axes[1].annotate(f'最终损失: {model.loss_history[-1]:.4f}',
                 xy=(len(model.loss_history)-1, model.loss_history[-1]),
                 xytext=(len(model.loss_history)-100, model.loss_history[-1] + 0.5),
                 arrowprops=dict(arrowstyle='->'))

plt.tight_layout()
plt.show()

print(f"\n✅ 训练完成!")
print(f"   真实值: w=2.50, b=1.50")
print(f"   学习到的: w={model.weights[0]:.3f}, b={model.bias:.3f}")

七、梯度消失与梯度爆炸

7.1 问题演示

# 演示深层网络中的梯度问题
def simulate_deep_network_gradients(depth=10, activation='sigmoid'):
    """Simulate gradient magnitude shrinking layer by layer in a deep network.

    This is a simplified model: the gradient starts at 1.0 at the output
    layer and is multiplied by one fixed per-layer factor for each layer it
    passes through.

    Per-layer factors:
        'sigmoid' -> 0.25 (the maximum of the sigmoid derivative)
        'tanh'    -> 0.5  (an illustrative attenuation chosen for the demo;
                           the true maximum of the tanh derivative is 1)
        anything else (treated as ReLU) -> 1.0 (ReLU slope on the positive side)

    Returns:
        List of ``depth + 1`` magnitudes, starting with the output layer's 1.0.
    """
    per_layer_factor = {'sigmoid': 0.25, 'tanh': 0.5}.get(activation, 1.0)

    magnitudes = [1.0]  # gradient at the output layer
    for _ in range(depth):
        magnitudes.append(magnitudes[-1] * per_layer_factor)

    return magnitudes

# 对比不同激活函数
depths = list(range(1, 21))
activations = ['sigmoid', 'tanh', 'relu']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# 梯度随深度变化
ax1 = axes[0]
for act in activations:
    grads = simulate_deep_network_gradients(20, act)
    ax1.plot(depths, grads[1:], 'o-', linewidth=2, label=act.upper())

ax1.set_xlabel('网络深度（层数）')
ax1.set_ylabel('梯度大小')
ax1.set_title('不同激活函数的梯度传播')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.set_yscale('log')  # 对数坐标

# 可视化梯度消失的影响
ax2 = axes[1]
gradient_values = {
    '正常梯度': [0.8, 0.6, 0.4, 0.2],
    '梯度消失': [0.8, 0.15, 0.03, 0.006],
    '梯度爆炸': [0.8, 1.6, 3.2, 6.4]
}

layers = ['输出层', '层3', '层2', '输入层']
for name, grads in gradient_values.items():
    ax2.plot(layers, grads, 'o-', linewidth=2, markersize=8, label=name)

ax2.set_ylabel('梯度大小')
ax2.set_title('梯度传播问题示意')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.suptitle('梯度消失与梯度爆炸问题', fontsize=14)
plt.tight_layout()
plt.show()

print("\n⚠️ 梯度问题解决方案:")
print("   1. 梯度消失: 使用ReLU激活函数、残差连接(ResNet)")
print("   2. 梯度爆炸: 梯度裁剪(Gradient Clipping)、权重初始化")
print("   3. Batch Normalization: 同时缓解两个问题")
print("   4. LSTM/GRU: 解决RNN中的梯度问题")

# 演示梯度裁剪
def gradient_clipping_demo():
    """Print a worked example of gradient clipping by L2 norm.

    A fixed 5-element gradient is rescaled so that its L2 norm equals
    `max_norm` (2.0) whenever the original norm exceeds that limit;
    otherwise it is left untouched. The original vector, its norm, the
    clipped vector and the clipped norm are printed.
    """
    max_norm = 2.0
    original_grad = np.array([0.5, 1.2, 2.8, 5.0, 10.0])

    grad_norm = np.linalg.norm(original_grad)

    # Rescaling by max_norm / norm keeps the direction and caps the length.
    exceeds_limit = grad_norm > max_norm
    clipped_grad = original_grad * (max_norm / grad_norm) if exceeds_limit else original_grad

    report = [
        f"\n✂️ 梯度裁剪演示:",
        f"   原始梯度: {original_grad}",
        f"   梯度范数: {grad_norm:.2f}",
        f"   裁剪后: {clipped_grad}",
        f"   裁剪后范数: {np.linalg.norm(clipped_grad):.2f}",
    ]
    for line in report:
        print(line)

gradient_clipping_demo()

八、完整实战：训练一个可解释的神经网络

# 从零实现带可视化的小型神经网络
class VisualizableNN:
    """Single-layer sigmoid classifier (logistic regression) on 2-D inputs.

    The model has one 2x1 weight matrix and one bias, so its decision
    boundary is a straight line. `train` records the loss and the norm of
    the weight gradient at every epoch so they can be plotted afterwards.
    """

    def __init__(self):
        self.W = np.random.randn(2, 1) * 0.5
        self.b = np.zeros(1)
        self.losses = []     # loss value per epoch
        self.gradients = []  # ||dW|| per epoch

    def sigmoid(self, x):
        """Logistic function 1 / (1 + e^-x), applied element-wise."""
        return 1 / (1 + np.exp(-x))

    def forward(self, X):
        """Return predicted probabilities, shape (n_samples, 1), for `X` (n_samples, 2)."""
        return self.sigmoid(X @ self.W + self.b)

    def loss(self, y_pred, y_true):
        """Return the mean binary cross-entropy between `y_pred` and `y_true`.

        `y_true` is reshaped to match `y_pred` first. Without that, a flat
        (n,) label vector multiplied by an (n, 1) prediction column
        broadcasts to an (n, n) matrix and the mean is taken over every
        prediction/label pair instead of over the samples.
        """
        y_pred = np.asarray(y_pred)
        y_true = np.asarray(y_true).reshape(y_pred.shape)
        # 1e-8 keeps log() finite when a prediction saturates at 0 or 1.
        return -np.mean(y_true * np.log(y_pred + 1e-8) +
                        (1 - y_true) * np.log(1 - y_pred + 1e-8))

    def gradient(self, X, y_true, y_pred):
        """Return (dW, db), the batch-averaged cross-entropy gradients.

        For a sigmoid output with cross-entropy loss, the gradient with
        respect to the pre-activation is simply ``y_pred - y_true``.
        """
        m = X.shape[0]
        dz = y_pred - y_true.reshape(-1, 1)
        dW = (1/m) * X.T @ dz
        db = (1/m) * np.sum(dz)
        return dW, db

    def train(self, X, y, epochs=1000, lr=0.5, verbose=True):
        """Run `epochs` steps of batch gradient descent and return self.

        Appends one entry per epoch to `self.losses` and `self.gradients`.
        With `verbose`, prints the loss and gradient norm every 200 epochs.
        """
        for epoch in range(epochs):
            # Forward pass.
            y_pred = self.forward(X)

            # Record the loss at the current parameters.
            loss = self.loss(y_pred, y)
            self.losses.append(loss)

            # Gradients and their magnitude.
            dW, db = self.gradient(X, y, y_pred)
            self.gradients.append(np.linalg.norm(dW))

            # Parameter update.
            self.W -= lr * dW
            self.b -= lr * db

            if verbose and epoch % 200 == 0:
                print(f"Epoch {epoch:4d}, Loss: {loss:.6f}, ||dW||: {np.linalg.norm(dW):.4f}")

        return self

# 生成螺旋形数据
def generate_spiral_data(n_points=200, noise=0.1):
    """Generate a two-class dataset made of two opposite spiral arms.

    Each arm draws `n_points` angles in [0, 2*pi), sets the radius
    proportional to the angle (reaching 2 at a full turn), and adds Gaussian
    noise scaled by `noise` to both coordinates. Class 1 is class 0's
    construction mirrored through the origin, with its own random draws.

    Uses NumPy's global random state.

    Returns:
        Tuple ``(X, y)``: `X` has shape (2 * n_points, 2) with class 0 rows
        first; `y` holds `n_points` zeros followed by `n_points` ones.
    """
    def sample_arm(direction):
        # Draw order (angles, x-noise, y-noise) is significant: it fixes
        # which random numbers each quantity receives.
        angle = np.sqrt(np.random.rand(n_points)) * 2 * np.pi
        radius = 2 * angle / (2 * np.pi)
        xs = direction * radius * np.cos(angle) + np.random.randn(n_points) * noise
        ys = direction * radius * np.sin(angle) + np.random.randn(n_points) * noise
        return np.column_stack([xs, ys])

    arm_class0 = sample_arm(1)
    arm_class1 = sample_arm(-1)  # mirrored spiral

    X = np.vstack([arm_class0, arm_class1])
    y = np.repeat([0.0, 1.0], n_points)

    return X, y

# 生成数据
X, y = generate_spiral_data(150, noise=0.15)

# 创建网格用于可视化决策边界
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
grid = np.c_[xx.ravel(), yy.ravel()]

# 训练网络
nn = VisualizableNN()
nn.train(X, y, epochs=1000, lr=0.5)

# 预测网格点
Z = nn.forward(grid)
Z = Z.reshape(xx.shape)

# 可视化结果
fig = plt.figure(figsize=(16, 10))

# 1. 决策边界
ax1 = fig.add_subplot(2, 2, 1)
contour = ax1.contourf(xx, yy, Z, levels=20, cmap='RdBu', alpha=0.6)
ax1.scatter(X[y==0, 0], X[y==0, 1], c='blue', label='类别0', alpha=0.6, edgecolors='black')
ax1.scatter(X[y==1, 0], X[y==1, 1], c='red', label='类别1', alpha=0.6, edgecolors='black')
ax1.set_xlabel('x1')
ax1.set_ylabel('x2')
ax1.set_title('神经网络学习到的决策边界')
ax1.legend()
plt.colorbar(contour, ax=ax1)

# 2. 损失下降曲线
ax2 = fig.add_subplot(2, 2, 2)
ax2.plot(nn.losses, 'b-', linewidth=2)
ax2.set_xlabel('Epoch')
ax2.set_ylabel('损失')
ax2.set_title('损失下降曲线（梯度下降优化）')
ax2.grid(True, alpha=0.3)
ax2.set_yscale('log')

# 3. 梯度范数变化
ax3 = fig.add_subplot(2, 2, 3)
ax3.plot(nn.gradients, 'g-', linewidth=2)
ax3.set_xlabel('Epoch')
ax3.set_ylabel('梯度范数 ||dW||')
ax3.set_title('梯度变化（初期大，后期小）')
ax3.grid(True, alpha=0.3)

# 4. 参数变化轨迹
ax4 = fig.add_subplot(2, 2, 4)
# 模拟权重空间轨迹（简化）
w1_traj = [0.5]
w2_traj = [0.5]
for i in range(1, len(nn.losses), 50):
    # 这里简化展示，实际可以用nn.W的历史
    w1_traj.append(w1_traj[-1] * 0.98)
    w2_traj.append(w2_traj[-1] * 0.97)
ax4.plot(w1_traj, w2_traj, 'ro-', markersize=4)
ax4.set_xlabel('权重 w1')
ax4.set_ylabel('权重 w2')
ax4.set_title('参数空间中的优化路径')
ax4.grid(True, alpha=0.3)

plt.suptitle('神经网络训练过程全览：梯度下降在行动', fontsize=16)
plt.tight_layout()
plt.show()

# 计算准确率
predictions = (nn.forward(X) > 0.5).flatten()
accuracy = np.mean(predictions == y)
print(f"\n🎯 最终准确率: {accuracy*100:.2f}%")

九、学习检查清单

基础概念（必须掌握）

理解导数的几何意义（切线斜率）
理解梯度下降的基本思想
掌握偏导数的概念
理解链式法则（反向传播的基础）

核心应用（重要）

能用梯度下降优化简单函数
理解学习率的影响
知道什么是梯度消失/爆炸
理解反向传播的基本流程

扩展了解（按需）

各种优化器（Adam、RMSprop）的原理
二阶优化方法（牛顿法）
自动微分机制

十、总结

微积分在AI中的核心价值：

概念	AI应用	解决的问题
导数	梯度下降	找到损失函数的最小值
偏导数	多参数优化	每个参数如何影响损失
链式法则	反向传播	计算深层网络的梯度
梯度	参数更新	告诉我们往哪个方向走

核心公式记忆：

参数更新：θ_new = θ_old - η × ∇L(θ)
        新参数 = 旧参数 - 学习率 × 梯度

梯度下降的三要素：
1. 方向：梯度的反方向（下降最快的方向）
2. 步长：学习率（每一步走多远）
3. 终止条件：梯度接近0或达到最大迭代次数

记住：

导数告诉你函数变化的快慢和方向
梯度告诉你往哪走下降最快
链式法则让你能计算复杂函数的导数
反向传播让深度学习成为可能

下一步：

学习概率论（理解不确定性）
学习信息论（交叉熵、KL散度）

微积分不是障碍，而是理解AI如何学习的钥匙！

AtomGit开源社区

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念，把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起，为开发者提供从开发、训练到部署的一站式体验。

更多推荐

亲测10款降AIGC工具，这款神器让你论文告别机翻感

AtomGit开源社区

亲测10款降AIGC工具，这款神器让你论文告别机翻感

AtomGit开源社区

你可能还不知道，「测试用例」竟是AI时代提升效率的关键

AtomGit开源社区

所有评论(0)

查看更多评论

xiaotao131

@xiaotao131

已为社区贡献16条内容

01-编程基础与数学基石：微积分

xiaotao131

微积分：AI模型训练的优化引擎

一、为什么AI需要微积分？

1.1 微积分在AI中的核心地位

二、导数：变化率的度量

2.1 导数的直观理解

2.2 导数在AI中的应用：学习率

三、偏导数：多变量函数的梯度

3.1 从单变量到多变量

3.2 AI实战：线性回归的梯度计算

四、链式法则：反向传播的核心

4.1 链式法则的原理

4.2 神经网络中的反向传播

4.3 可视化反向传播的梯度流

五、梯度：优化的指南针

5.1 梯度的几何意义

5.2 各种优化算法对比

六、实战：从零实现梯度下降

七、梯度消失与梯度爆炸

7.1 问题演示

八、完整实战：训练一个可解释的神经网络

九、学习检查清单

基础概念（必须掌握）

核心应用（重要）

扩展了解（按需）

十、总结

所有评论(0)

温馨提示：您尚未绑定手机号

xiaotao131