01-编程基础与数学基石:微积分
·

微积分:AI模型训练的优化引擎
一、为什么AI需要微积分?
1.1 微积分在AI中的核心地位
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Overview banner: where calculus shows up when training AI models.
print("=" * 60)
print("微积分在AI中的应用场景")
print("=" * 60)

# Maps each application area to a one-line description of the calculus behind it.
applications = {
    "梯度下降": "用导数找到损失函数的最小值",
    "反向传播": "用链式法则计算神经网络各层的梯度",
    "学习率调度": "理解梯度变化调整步长",
    "优化器设计": "Adam/SGD等基于梯度信息",
    "激活函数": "需要可导才能反向传播",
}

for app, desc in applications.items():
    print(f"\n📌 {app}:")
    print(f" {desc}")
# Intuitive example: use the derivative to locate a function's minimum.
def f(x):
    """Return x**2 + 2*x + 1, i.e. (x + 1)**2, whose minimum is at x = -1."""
    return x**2 + 2*x + 1


def df(x):
    """Return the derivative of ``f`` with respect to x: 2*x + 2."""
    return 2*x + 2
# Left panel: the curve whose minimum we want to find.
x = np.linspace(-4, 2, 100)
y = f(x)
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(x, y, 'b-', linewidth=2, label='f(x) = x² + 2x + 1')
plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('函数曲线:我们要找到最小值')
plt.grid(True, alpha=0.3)
plt.legend()

# Gradient descent: repeatedly step against the derivative, recording
# every visited point so the path can be drawn and printed.
x_current = 3
learning_rate = 0.3
history_x = [x_current]
history_y = [f(x_current)]
for _ in range(10):
    gradient = df(x_current)
    x_current = x_current - learning_rate * gradient
    history_x.append(x_current)
    history_y.append(f(x_current))

# Right panel: the same curve with the descent path overlaid.
plt.subplot(1, 2, 2)
plt.plot(x, y, 'b-', linewidth=2, label='f(x)')
plt.plot(history_x, history_y, 'ro-', markersize=8, label='梯度下降路径')
plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title(f'梯度下降找到最小值 x={history_x[-1]:.3f}')
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

print("\n✨ 梯度下降过程:")
for i, (x_val, y_val) in enumerate(zip(history_x, history_y)):
    print(f" 第{i}步: x={x_val:.3f}, f(x)={y_val:.3f}")
二、导数:变化率的度量
2.1 导数的直观理解
导数 = 函数在某点的瞬时变化率 = 切线的斜率
# Geometric meaning of the derivative: the slope of the tangent line.
def f1(x):
    """Return x squared."""
    return x**2


def f1_derivative(x):
    """Return the derivative of ``f1`` at x, i.e. 2*x."""
    return 2*x
x_point = 1.5
x = np.linspace(-2, 3, 100)
y = f1(x)

# Tangent at x_point: y = f1(x0) + f1'(x0) * (x - x0), drawn one unit either side.
tangent_x = np.linspace(x_point - 1, x_point + 1, 50)
tangent_y = f1(x_point) + f1_derivative(x_point) * (tangent_x - x_point)

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(x, y, 'b-', linewidth=2, label='f(x) = x²')
plt.plot(tangent_x, tangent_y, 'r--', linewidth=2, label=f'切线 (斜率={f1_derivative(x_point):.1f})')
plt.plot(x_point, f1(x_point), 'ro', markersize=10)
plt.axhline(y=0, color='black', alpha=0.3)
plt.axvline(x=0, color='black', alpha=0.3)
plt.grid(True, alpha=0.3)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('导数 = 切线的斜率')
plt.legend()

# Right panel: tangents at several points show how the slope varies with x.
points = [-1.5, 0, 1.5]
plt.subplot(1, 2, 2)
plt.plot(x, y, 'b-', linewidth=2)
for xp in points:
    slope = f1_derivative(xp)
    tangent_x = np.linspace(xp - 0.8, xp + 0.8, 50)
    tangent_y = f1(xp) + slope * (tangent_x - xp)
    plt.plot(tangent_x, tangent_y, '--', linewidth=1.5, label=f'x={xp}, 斜率={slope:.1f}')
    plt.plot(xp, f1(xp), 'ro', markersize=8)
plt.grid(True, alpha=0.3)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('不同点的导数(斜率)不同')
plt.legend()
plt.tight_layout()
plt.show()

print("\n📐 导数的解读:")
print(" 导数为正 → 函数在增加(向右走会变大)")
print(" 导数为负 → 函数在减少(向右走会变小)")
print(" 导数为零 → 可能是极值点(最小或最大)")
2.2 导数在AI中的应用:学习率
# Effect of the learning rate: too large fails to converge, too small is slow.
def loss_function(w):
    """Return the quadratic loss w**2 + 2*w + 5 (minimum value 4 at w = -1)."""
    return w**2 + 2*w + 5


def gradient(w):
    """Return the derivative of ``loss_function`` with respect to w: 2*w + 2."""
    return 2*w + 2
# Run the same 20-step descent with several learning rates, one subplot each.
learning_rates = [0.05, 0.3, 0.8, 1.2]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# The loss curve is identical in every panel, so compute it once.
w_range = np.linspace(-4, 4, 100)
loss_range = loss_function(w_range)

for idx, lr in enumerate(learning_rates):
    ax = axes[idx // 2, idx % 2]
    w = 3.0  # initial weight
    history = [w]
    for _ in range(20):
        grad = gradient(w)
        w = w - lr * grad
        history.append(w)

    ax.plot(w_range, loss_range, 'b-', linewidth=2)
    # A distinct comprehension variable keeps the final weight `w` unshadowed.
    ax.plot(history, [loss_function(w_i) for w_i in history], 'ro-', markersize=6)
    ax.set_xlabel('权重 w')
    ax.set_ylabel('损失')
    ax.set_title(f'学习率 = {lr}')
    ax.grid(True, alpha=0.3)

    # Annotate each panel with a verdict based on simple thresholds.
    if lr > 1:
        ax.text(0.5, 0.8, '⚠️ 太大,发散!', transform=ax.transAxes,
                fontsize=12, color='red', ha='center')
    elif lr < 0.1:
        ax.text(0.5, 0.8, '🐢 太小,收敛慢', transform=ax.transAxes,
                fontsize=12, color='orange', ha='center')
    else:
        ax.text(0.5, 0.8, '✅ 合适,快速收敛', transform=ax.transAxes,
                fontsize=12, color='green', ha='center')

plt.suptitle('学习率对梯度下降的影响', fontsize=14)
plt.tight_layout()
plt.show()

print("\n💡 学习率选择建议:")
print(" 学习率太大 → 震荡/发散(跨过最小值)")
print(" 学习率太小 → 收敛太慢(需要很多步)")
print(" 合适的学习率 → 平稳下降到最小值")
三、偏导数:多变量函数的梯度
3.1 从单变量到多变量
# Two-variable function: f(x, y) = x² + y²
def f_2d(x, y):
    """Return x**2 + y**2."""
    return x**2 + y**2


# Partial derivatives: differentiate in one variable while holding the other constant.
def df_dx(x, y):
    """Return ∂f/∂x of ``f_2d``, which is 2*x (``y`` is accepted but unused)."""
    return 2*x


def df_dy(x, y):
    """Return ∂f/∂y of ``f_2d``, which is 2*y (``x`` is accepted but unused)."""
    return 2*y
# Visualise f_2d as a 3D surface and as contours with gradient arrows.
x = np.linspace(-3, 3, 50)
y = np.linspace(-3, 3, 50)
X, Y = np.meshgrid(x, y)
Z = f_2d(X, Y)
fig = plt.figure(figsize=(14, 5))

# 3D surface
ax1 = fig.add_subplot(1, 2, 1, projection='3d')
surf = ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('f(x,y) = x² + y²')

# Contours plus gradient vectors
ax2 = fig.add_subplot(1, 2, 2)
contour = ax2.contour(X, Y, Z, levels=20, cmap='viridis')
ax2.clabel(contour, inline=True, fontsize=8)

# Draw the gradient at a few sample points; at (0, 0) it is the zero vector.
points = [(-2, -1), (2, -1), (-2, 1), (2, 1), (0, 0)]
for px, py in points:
    gx = df_dx(px, py)
    gy = df_dy(px, py)
    # The gradient points in the direction of steepest increase.
    ax2.quiver(px, py, gx, gy, color='red', angles='xy',
               scale_units='xy', scale=5, width=0.02)
    ax2.plot(px, py, 'bo', markersize=8)
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('等高线 + 梯度向量(指向最陡上升方向)')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n📊 偏导数的解读:")
print(" ∂f/∂x = 2x: 在x方向的变化率")
print(" ∂f/∂y = 2y: 在y方向的变化率")
print(" 梯度 ∇f = (∂f/∂x, ∂f/∂y) = (2x, 2y)")
print(" 梯度指向函数增长最快的方向!")
3.2 AI实战:线性回归的梯度计算
# Linear-regression demo data: y = true_w * x + true_b plus Gaussian noise.
np.random.seed(42)

n_samples = 100
true_w, true_b = 2.5, 1.0

# Draw the inputs first and the noise second so the seeded random stream is
# consumed in the same order on every run.
X = np.random.randn(n_samples, 1)
noise = np.random.randn(n_samples) * 0.5
y = true_w * X.ravel() + true_b + noise
# Loss: MSE = (1/n) * Σ(y_pred - y_true)², with y_pred = w * x + b
def compute_gradients(X, y, w, b):
    """Return (dw, db), the partial derivatives of the MSE loss w.r.t. w and b.

    X is flattened to one feature value per sample and y to one target per
    sample, so a column-vector y cannot broadcast against the predictions.
    """
    x_flat = np.asarray(X).ravel()
    y_flat = np.asarray(y).ravel()
    n = len(y_flat)
    residual = w * x_flat + b - y_flat
    # d/dw mean(r²) = (2/n) Σ r·x ;  d/db mean(r²) = (2/n) Σ r
    dw = (2/n) * np.sum(residual * x_flat)
    db = (2/n) * np.sum(residual)
    return dw, db


def compute_loss(X, y, w, b):
    """Return the mean squared error of the line y = w * x + b on (X, y)."""
    residual = w * np.asarray(X).ravel() + b - np.asarray(y).ravel()
    return np.mean(residual ** 2)
# Evaluate the MSE loss over a (w, b) grid for the surface and contour plots.
w_range = np.linspace(0, 4, 50)
b_range = np.linspace(-1, 3, 50)
W, B = np.meshgrid(w_range, b_range)
# Broadcast every (w, b) grid cell against all samples at once, then average
# the squared residuals over the sample axis.
x_flat = X.flatten()
Loss = np.mean((W[..., np.newaxis] * x_flat + B[..., np.newaxis] - y) ** 2, axis=-1)

# Gradient descent from (0, 0), recording (w, b, loss) after every step.
w, b = 0.0, 0.0
learning_rate = 0.1
history = [(w, b, compute_loss(X, y, w, b))]
for epoch in range(50):
    dw, db = compute_gradients(X, y, w, b)
    w = w - learning_rate * dw
    b = b - learning_rate * db
    history.append((w, b, compute_loss(X, y, w, b)))

w_hist = [h[0] for h in history]
b_hist = [h[1] for h in history]
loss_hist = [h[2] for h in history]

fig = plt.figure(figsize=(15, 5))

# 3D loss surface with the descent path
ax1 = fig.add_subplot(1, 3, 1, projection='3d')
surf = ax1.plot_surface(W, B, Loss, cmap='viridis', alpha=0.7)
ax1.set_xlabel('权重 w')
ax1.set_ylabel('偏置 b')
ax1.set_zlabel('损失')
ax1.set_title('损失函数曲面')
ax1.plot(w_hist, b_hist, loss_hist, 'ro-', markersize=4, linewidth=1)

# Contours with the descent path and the true parameters
ax2 = fig.add_subplot(1, 3, 2)
contour = ax2.contour(W, B, Loss, levels=30, cmap='viridis')
ax2.plot(w_hist, b_hist, 'ro-', markersize=4, linewidth=1.5)
ax2.plot(true_w, true_b, 'b*', markersize=15, label='真实值')
ax2.set_xlabel('权重 w')
ax2.set_ylabel('偏置 b')
ax2.set_title('梯度下降路径(等高线图)')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Loss versus iteration
ax3 = fig.add_subplot(1, 3, 3)
ax3.plot(range(len(history)), loss_hist, 'b-', linewidth=2)
ax3.set_xlabel('迭代次数')
ax3.set_ylabel('损失')
ax3.set_title('损失下降曲线')
ax3.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n📈 线性回归结果:")
print(f" 真实值: w={true_w:.2f}, b={true_b:.2f}")
print(f" 估计值: w={w:.2f}, b={b:.2f}")
print(f" 最终损失: {history[-1][2]:.4f}")
四、链式法则:反向传播的核心
4.1 链式法则的原理
链式法则:复合函数 f(g(x)) 的导数 = 外层导数(在内层输出 g(x) 处取值)× 内层导数,即 d/dx f(g(x)) = f'(g(x)) · g'(x)
# Chain-rule demonstration
def demonstrate_chain_rule(x=2):
    """Print a worked chain-rule example for f(g(x)) = (2x + 1)² and return df/dx.

    Outer: f(u) = u², with f'(u) = 2u
    Inner: g(x) = 2x + 1, with g'(x) = 2
    Chain: df/dx = f'(g(x)) * g'(x) = 2(2x + 1) * 2 = 4(2x + 1)

    Args:
        x: Point at which the derivative is evaluated (defaults to 2).

    Returns:
        The derivative obtained through the chain rule.

    Raises:
        ValueError: If the chain-rule result disagrees with the closed form.
    """
    u = 2*x + 1  # inner output
    f_value = u**2  # final output

    # Chain rule, one factor at a time
    df_du = 2*u  # outer derivative
    du_dx = 2  # inner derivative
    df_dx_chain = df_du * du_dx

    # Closed-form derivative used as the cross-check
    df_dx_direct = 4 * (2*x + 1)

    print("\n🔗 链式法则示例:")
    print(f" x = {x}")
    print(f" u = g(x) = 2x + 1 = {u}")
    print(f" f = u² = {f_value}")
    print(f" 链式法则: df/dx = {df_du} × {du_dx} = {df_dx_chain}")
    print(f" 直接求导: df/dx = {df_dx_direct}")
    # Only claim agreement after actually comparing the two results.
    if df_dx_chain != df_dx_direct:
        raise ValueError(
            f"chain rule gave {df_dx_chain}, direct derivative gave {df_dx_direct}"
        )
    print(" ✅ 结果一致!")
    return df_dx_chain


demonstrate_chain_rule()
# Building blocks for visualising the chain rule
def f_outer(u):
    """Outer function of the composition: return u squared."""
    return u**2


def g_inner(x):
    """Inner function of the composition: return 2*x + 1."""
    return 2*x + 1
x_vals = np.linspace(0, 3, 100)
u_vals = g_inner(x_vals)
f_vals = f_outer(u_vals)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One panel per stage: inner function, outer function, composition.
# Each entry is (x data, y data, line style, x label, y label, title).
# The outer panel's y label names the outer function f; it used to read h(u),
# a name that appears nowhere else in the example.
panels = [
    (x_vals, u_vals, 'b-', 'x', 'u = g(x)', '内层函数: u = 2x + 1'),
    (u_vals, f_vals, 'r-', 'u', 'f(u)', '外层函数: f = u²'),
    (x_vals, f_vals, 'g-', 'x', 'f(g(x))', '复合函数: f(g(x)) = (2x+1)²'),
]
for ax, (xs, ys, fmt, xlabel, ylabel, title) in zip(axes, panels):
    ax.plot(xs, ys, fmt, linewidth=2)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.suptitle('链式法则:复合函数的分解', fontsize=14)
plt.tight_layout()
plt.show()
4.2 神经网络中的反向传播
# A small neural network used to demonstrate back-propagation
class SimpleNeuralNetwork:
    """Two-layer network (input -> hidden -> output) with sigmoid activations."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create weights of the requested sizes.

        NOTE: this reseeds NumPy's *global* random generator with 42, so every
        instance starts from the same weights and later global random draws
        are affected as well.
        """
        np.random.seed(42)
        self.W1 = np.random.randn(input_size, hidden_size) * 0.5
        self.b1 = np.zeros(hidden_size)
        self.W2 = np.random.randn(hidden_size, output_size) * 0.5
        self.b2 = np.zeros(output_size)

    def sigmoid(self, x):
        """Return the logistic function 1 / (1 + exp(-x))."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        """Return the sigmoid derivative given the sigmoid *output*.

        ``x`` must already be an activation a = sigmoid(z); the derivative is
        then a * (1 - a). Passing a pre-activation z gives the wrong result.
        """
        return x * (1 - x)

    def forward(self, X):
        """Run the forward pass, caching z1/a1/z2/a2 for ``backward``."""
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def backward(self, X, y, output):
        """Back-propagate with the chain rule and return (dW1, db1, dW2, db2).

        Relies on ``self.a1`` cached by the most recent ``forward`` call.
        """
        m = X.shape[0]
        # Output-layer error. `output - y` is the gradient w.r.t. z2 of binary
        # cross-entropy with a sigmoid output (the sigmoid' factor cancels).
        # It is not the gradient of MSE, so a caller that monitors MSE tracks
        # a different quantity than the one being minimised.
        dZ2 = output - y
        dW2 = (1/m) * self.a1.T @ dZ2
        db2 = (1/m) * np.sum(dZ2, axis=0)
        # Hidden-layer error: push the error back through W2, then through the
        # hidden sigmoid (a1 is an activation, as sigmoid_derivative expects).
        dA1 = dZ2 @ self.W2.T
        dZ1 = dA1 * self.sigmoid_derivative(self.a1)
        dW1 = (1/m) * X.T @ dZ1
        db1 = (1/m) * np.sum(dZ1, axis=0)
        return dW1, db1, dW2, db2

    def update(self, dW1, db1, dW2, db2, learning_rate):
        """Apply one in-place gradient-descent step to all parameters."""
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
# XOR truth table: inputs and expected outputs
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

# Train the network
nn = SimpleNeuralNetwork(2, 4, 1)
epochs = 5000
learning_rate = 0.5
losses = []
for epoch in range(epochs):
    # Forward pass
    output = nn.forward(X)
    # Mean squared error, recorded for the loss curve
    loss = np.mean((output - y) ** 2)
    losses.append(loss)
    # Backward pass
    dW1, db1, dW2, db2 = nn.backward(X, y, output)
    # Parameter update
    nn.update(dW1, db1, dW2, db2, learning_rate)
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}: Loss = {loss:.6f}")

# Visualise the results
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Loss curve
axes[0].plot(losses, 'b-', linewidth=2)
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('训练损失下降曲线')
axes[0].grid(True, alpha=0.3)

# Predictions: red bars are inputs whose target is 0, green bars target 1
predictions = nn.forward(X)
axes[1].bar(range(4), predictions.flatten(), color=['red', 'green', 'green', 'red'])
axes[1].set_xticks(range(4))
axes[1].set_xticklabels(['00', '01', '10', '11'])
axes[1].set_ylabel('预测值')
axes[1].set_title('异或问题预测结果')
axes[1].axhline(y=0.5, color='black', linestyle='--', label='决策边界')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.suptitle('神经网络学习异或问题(反向传播)', fontsize=14)
plt.tight_layout()
plt.show()

print("\n🧠 神经网络训练结果:")
print(f" 输入: 00 → 预测: {predictions[0,0]:.3f} (期望: 0)")
print(f" 输入: 01 → 预测: {predictions[1,0]:.3f} (期望: 1)")
print(f" 输入: 10 → 预测: {predictions[2,0]:.3f} (期望: 1)")
print(f" 输入: 11 → 预测: {predictions[3,0]:.3f} (期望: 0)")
4.3 可视化反向传播的梯度流
# Visualise how gradients flow through a neural network
def visualize_gradient_flow():
    """Plot illustrative gradient magnitudes per layer and print mitigation hints.

    All numbers are hand-picked for illustration; nothing is measured from a
    real network.
    """
    layers = ['输入层', '隐藏层1', '隐藏层2', '输出层']
    gradients = [0.01, 0.05, 0.25, 1.0]  # simulated magnitudes, input -> output
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: gradient magnitude per layer
    x_pos = np.arange(len(layers))
    ax1.bar(x_pos, gradients, color='steelblue', alpha=0.7)
    ax1.set_xticks(x_pos)
    ax1.set_xticklabels(layers)
    ax1.set_ylabel('梯度大小')
    ax1.set_title('梯度反向传播:从输出层到输入层')
    ax1.grid(True, alpha=0.3, axis='y')
    # Arrows show the propagation direction. annotate() draws from `xytext`
    # (tail) to `xy` (head), so the head sits on the layer nearer the input:
    # back-propagation runs from output to input.
    for i in range(len(gradients) - 1):
        ax1.annotate('', xy=(i, gradients[i]), xytext=(i + 1, gradients[i + 1]),
                     arrowprops=dict(arrowstyle='->', color='red', lw=2))

    # Right panel: vanishing vs exploding gradients. Each series is listed in
    # back-propagation order (output layer first), so it is plotted against
    # the reversed layer labels.
    grad_flow = {
        '正常': [0.8, 0.6, 0.4, 0.2],
        '梯度消失': [0.8, 0.2, 0.05, 0.01],
        '梯度爆炸': [0.8, 2.0, 5.0, 12.0],
    }
    backprop_order = layers[::-1]
    for name, grads in grad_flow.items():
        ax2.plot(backprop_order, grads, 'o-', linewidth=2, markersize=8, label=name)
    ax2.set_ylabel('梯度大小')
    ax2.set_title('梯度消失 vs 梯度爆炸')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("\n⚠️ 梯度问题警示:")
    print(" 梯度消失: 深层网络无法学习(使用ReLU、残差连接)")
    print(" 梯度爆炸: 参数更新过大(使用梯度裁剪)")


visualize_gradient_flow()
五、梯度:优化的指南针
5.1 梯度的几何意义
梯度 = 所有偏导数组成的向量,指向函数增长最快的方向
# Geometric demonstration of gradient descent
def rosenbrock(x, y):
    """Return the Rosenbrock function (1 - x)² + 100 (y - x²)².

    A classic optimisation test function; its value is 0 at (1, 1).
    """
    return (1 - x)**2 + 100 * (y - x**2)**2


def rosenbrock_gradient(x, y):
    """Return the gradient [∂f/∂x, ∂f/∂y] of ``rosenbrock`` as a NumPy array."""
    dx = -2 * (1 - x) - 400 * x * (y - x**2)
    dy = 200 * (y - x**2)
    return np.array([dx, dy])
# Plain gradient descent on the Rosenbrock function
start_x, start_y = -1.5, 1.5  # starting point
x, y = start_x, start_y
learning_rate = 0.0005
path = [(x, y, rosenbrock(x, y))]
for _ in range(100):
    grad = rosenbrock_gradient(x, y)
    x -= learning_rate * grad[0]
    y -= learning_rate * grad[1]
    path.append((x, y, rosenbrock(x, y)))

path_x = [p[0] for p in path]
path_y = [p[1] for p in path]
path_z = [p[2] for p in path]

# Visualisation grid
x_range = np.linspace(-2, 2, 100)
y_range = np.linspace(-1, 3, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = rosenbrock(X, Y)
fig = plt.figure(figsize=(15, 5))

# 3D view
ax1 = fig.add_subplot(1, 2, 1, projection='3d')
surf = ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)
ax1.plot(path_x, path_y, path_z, 'ro-', markersize=4, linewidth=1.5)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('梯度下降在Rosenbrock函数上的路径')

# Contour view
ax2 = fig.add_subplot(1, 2, 2)
contour = ax2.contour(X, Y, Z, levels=50, cmap='viridis')
ax2.plot(path_x, path_y, 'ro-', markersize=4, linewidth=1.5)
ax2.plot(1, 1, 'b*', markersize=15, label='全局最小值 (1,1)')
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('梯度下降路径(等高线)')
ax2.legend()
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n🎯 梯度下降结果:")
# Print the start point from the variables so it cannot drift from the code.
print(f" 起点: ({start_x}, {start_y})")
print(f" 终点: ({path[-1][0]:.3f}, {path[-1][1]:.3f})")
print(" 最小值点: (1, 1)")
5.2 各种优化算法对比
# Side-by-side comparison of optimisation algorithms
class OptimizerComparison:
    """Reference implementations of several gradient-descent variants.

    Every method takes ``grad_func`` (maps a parameter vector to its gradient)
    and ``start`` (any array-like starting point, copied as float), runs
    ``epochs`` updates and returns the visited points as an array of shape
    (epochs + 1, len(start)).
    """

    @staticmethod
    def gradient_descent(grad_func, start, lr=0.1, epochs=50):
        """Plain gradient descent: x <- x - lr * grad."""
        x = np.array(start, dtype=float)
        path = [x.copy()]
        for _ in range(epochs):
            x = x - lr * grad_func(x)
            path.append(x.copy())
        return np.array(path)

    @staticmethod
    def momentum(grad_func, start, lr=0.1, momentum=0.9, epochs=50):
        """Gradient descent with momentum: v <- momentum * v - lr * grad; x <- x + v."""
        x = np.array(start, dtype=float)
        v = np.zeros_like(x)
        path = [x.copy()]
        for _ in range(epochs):
            v = momentum * v - lr * grad_func(x)
            x = x + v
            path.append(x.copy())
        return np.array(path)

    @staticmethod
    def rmsprop(grad_func, start, lr=0.01, beta=0.9, eps=1e-8, epochs=50):
        """RMSprop: scale each step by a running RMS of recent gradients."""
        x = np.array(start, dtype=float)
        s = np.zeros_like(x)
        path = [x.copy()]
        for _ in range(epochs):
            grad = grad_func(x)
            # Exponential moving average of the squared gradient
            s = beta * s + (1 - beta) * grad**2
            # eps keeps the division finite when s is still zero
            x = x - lr * grad / (np.sqrt(s) + eps)
            path.append(x.copy())
        return np.array(path)
# Test objective: an anisotropic quadratic bowl with sinusoidal ripples
def test_function(x):
    """Return x0² + 10·x1² + 2·sin(5·x0)·cos(5·x1) for an indexable ``x``."""
    return x[0]**2 + 10 * x[1]**2 + 2 * np.sin(5*x[0]) * np.cos(5*x[1])


def test_gradient(x):
    """Return the gradient of ``test_function`` at ``x`` as a length-2 array."""
    return np.array([
        2*x[0] + 10 * np.cos(5*x[0]) * np.cos(5*x[1]),
        20*x[1] - 10 * np.sin(5*x[0]) * np.sin(5*x[1]),
    ])


# Run every optimiser from the same starting point
start = np.array([2.0, 1.5])
optimizers = {
    'SGD': OptimizerComparison.gradient_descent,
    'Momentum': OptimizerComparison.momentum,
    'RMSprop': OptimizerComparison.rmsprop,
}
paths = {}
for name, opt_func in optimizers.items():
    paths[name] = opt_func(test_gradient, start, epochs=30)

# Visualisation grid
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
x_range = np.linspace(-3, 3, 100)
y_range = np.linspace(-2, 2, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = test_function([X, Y])

# The ripple term moves the minimum away from the origin (the gradient at
# (0, 0) is (10, 0)), so mark the lowest point of the evaluated grid instead.
min_idx = np.unravel_index(np.argmin(Z), Z.shape)
min_x, min_y = X[min_idx], Y[min_idx]

for idx, (name, path) in enumerate(paths.items()):
    ax = axes[idx]
    contour = ax.contour(X, Y, Z, levels=30, cmap='viridis', alpha=0.7)
    ax.plot(path[:, 0], path[:, 1], 'ro-', markersize=3, linewidth=1.5)
    ax.plot(start[0], start[1], 'bs', markersize=8, label='起点')
    ax.plot(min_x, min_y, 'g*', markersize=12, label='最小值')
    ax.set_title(f'{name}')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.suptitle('不同优化算法对比', fontsize=14)
plt.tight_layout()
plt.show()

print("\n📊 优化算法特点:")
print(" SGD: 简单但可能震荡")
print(" Momentum: 加速收敛,减少震荡")
print(" RMSprop: 自适应学习率,处理不同尺度特征")
print(" Adam: 结合Momentum和RMSprop(最常用)")
六、实战:从零实现梯度下降
# Complete implementation: training a linear model with gradient descent
class LinearRegressionGD:
    """Linear regression fitted from scratch with batch gradient descent on MSE."""

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        """Store hyper-parameters; weights and bias are created by ``fit``."""
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.loss_history = []

    def fit(self, X, y):
        """Fit the model and return ``self``.

        Args:
            X: Feature matrix of shape (n_samples, n_features).
            y: Targets; flattened to (n_samples,) so a column vector cannot
                broadcast against the predictions into an n x n matrix.

        Prints the loss every 100 iterations. ``loss_history`` is reset on
        every call so it always describes the most recent fit.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features = X.shape

        # Fresh parameters and history for this fit
        self.weights = np.random.randn(n_features) * 0.01
        self.bias = 0.0
        self.loss_history = []

        for i in range(self.n_iterations):
            # Forward pass
            residual = X @ self.weights + self.bias - y
            # Mean squared error
            loss = np.mean(residual ** 2)
            self.loss_history.append(loss)
            # Gradients of the MSE with respect to weights and bias
            dw = (2/n_samples) * X.T @ residual
            db = (2/n_samples) * np.sum(residual)
            # Parameter update
            self.weights -= self.lr * dw
            self.bias -= self.lr * db
            # Progress log
            if i % 100 == 0:
                print(f"Epoch {i:4d}, Loss: {loss:.6f}")
        return self

    def predict(self, X):
        """Return predictions X @ weights + bias (requires a prior ``fit``)."""
        return X @ self.weights + self.bias
# Generate noisy *linear* data (y = true_w * x + true_b + Gaussian noise)
np.random.seed(42)
true_w, true_b = 2.5, 1.5
X = np.linspace(0, 10, 100).reshape(-1, 1)
y = true_w * X.flatten() + true_b + np.random.randn(100) * 1.5

# Train the model
model = LinearRegressionGD(learning_rate=0.01, n_iterations=500)
model.fit(X, y)

# Predict on the training inputs
y_pred = model.predict(X)

# Visualise
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Fitted line over the data
axes[0].scatter(X, y, alpha=0.6, label='训练数据')
axes[0].plot(X, y_pred, 'r-', linewidth=2, label='拟合直线')
axes[0].set_xlabel('X')
axes[0].set_ylabel('y')
axes[0].set_title(f'线性回归结果\ny = {model.weights[0]:.3f}x + {model.bias:.3f}')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Loss curve
axes[1].plot(model.loss_history, 'b-', linewidth=2)
axes[1].set_xlabel('迭代次数')
axes[1].set_ylabel('损失 (MSE)')
axes[1].set_title('梯度下降:损失下降曲线')
axes[1].grid(True, alpha=0.3)

# Annotate the final loss
final_step = len(model.loss_history) - 1
final_loss = model.loss_history[-1]
axes[1].annotate(f'最终损失: {final_loss:.4f}',
                 xy=(final_step, final_loss),
                 xytext=(len(model.loss_history) - 100, final_loss + 0.5),
                 arrowprops=dict(arrowstyle='->'))
plt.tight_layout()
plt.show()

print("\n✅ 训练完成!")
# Report the ground truth from the constants used to generate the data.
print(f" 真实值: w={true_w:.2f}, b={true_b:.2f}")
print(f" 学习到的: w={model.weights[0]:.3f}, b={model.bias:.3f}")
七、梯度消失与梯度爆炸
7.1 问题演示
# Demonstrate gradient problems in deep networks
def simulate_deep_network_gradients(depth=10, activation='sigmoid'):
    """Simulate gradient magnitude as it is back-propagated through ``depth`` layers.

    Simplified model: each layer multiplies the gradient by one fixed factor
    that depends only on the activation function.

    Args:
        depth: Number of layers to propagate through.
        activation: 'sigmoid' or 'tanh'; any other value is treated as ReLU.

    Returns:
        A list of ``depth + 1`` magnitudes, starting with 1.0 at the output layer.
    """
    # Per-layer scaling factors:
    #   sigmoid: its derivative peaks at 0.25.
    #   tanh:    its derivative peaks at 1.0; 0.5 is used as an illustrative
    #            attenuation factor, not as the maximum.
    #   ReLU (fallback): the derivative is 1 on the positive side.
    scale_by_activation = {'sigmoid': 0.25, 'tanh': 0.5}
    scale = scale_by_activation.get(activation, 1.0)

    gradients = [1.0]  # gradient at the output layer
    for _ in range(depth):
        gradients.append(gradients[-1] * scale)
    return gradients
# Compare activation functions
max_depth = 20
depths = list(range(1, max_depth + 1))
activations = ['sigmoid', 'tanh', 'relu']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: gradient magnitude versus depth. Index 0 of the simulated list
# is the output-layer gradient, so it is skipped to line up with `depths`.
ax1 = axes[0]
for act in activations:
    grads = simulate_deep_network_gradients(max_depth, act)
    ax1.plot(depths, grads[1:], 'o-', linewidth=2, label=act.upper())
ax1.set_xlabel('网络深度(层数)')
ax1.set_ylabel('梯度大小')
ax1.set_title('不同激活函数的梯度传播')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.set_yscale('log')  # log scale: the magnitudes span many orders

# Right panel: hand-picked illustration of healthy / vanishing / exploding
# gradients, listed from the output layer back to the input layer.
ax2 = axes[1]
gradient_values = {
    '正常梯度': [0.8, 0.6, 0.4, 0.2],
    '梯度消失': [0.8, 0.15, 0.03, 0.006],
    '梯度爆炸': [0.8, 1.6, 3.2, 6.4],
}
layers = ['输出层', '层3', '层2', '输入层']
for name, grads in gradient_values.items():
    ax2.plot(layers, grads, 'o-', linewidth=2, markersize=8, label=name)
ax2.set_ylabel('梯度大小')
ax2.set_title('梯度传播问题示意')
ax2.legend()
ax2.grid(True, alpha=0.3)
plt.suptitle('梯度消失与梯度爆炸问题', fontsize=14)
plt.tight_layout()
plt.show()

print("\n⚠️ 梯度问题解决方案:")
print(" 1. 梯度消失: 使用ReLU激活函数、残差连接(ResNet)")
print(" 2. 梯度爆炸: 梯度裁剪(Gradient Clipping)、权重初始化")
print(" 3. Batch Normalization: 同时缓解两个问题")
print(" 4. LSTM/GRU: 解决RNN中的梯度问题")
# Demonstrate gradient clipping
def gradient_clipping_demo(original_grad=None, max_norm=2.0):
    """Clip a gradient vector to ``max_norm`` by L2 norm, print the steps, return it.

    Args:
        original_grad: Gradient to clip; defaults to the article's example
            vector [0.5, 1.2, 2.8, 5.0, 10.0].
        max_norm: Largest L2 norm allowed after clipping.

    Returns:
        The clipped gradient. When the norm is already within ``max_norm`` the
        input is returned unscaled.
    """
    if original_grad is None:
        original_grad = np.array([0.5, 1.2, 2.8, 5.0, 10.0])
    else:
        original_grad = np.asarray(original_grad, dtype=float)

    # Rescale the whole vector only when its norm exceeds the limit, so the
    # direction is preserved.
    grad_norm = np.linalg.norm(original_grad)
    if grad_norm > max_norm:
        clipped_grad = original_grad * (max_norm / grad_norm)
    else:
        clipped_grad = original_grad

    print("\n✂️ 梯度裁剪演示:")
    print(f" 原始梯度: {original_grad}")
    print(f" 梯度范数: {grad_norm:.2f}")
    print(f" 裁剪后: {clipped_grad}")
    print(f" 裁剪后范数: {np.linalg.norm(clipped_grad):.2f}")
    return clipped_grad


gradient_clipping_demo()
八、完整实战:训练一个可解释的神经网络
# A tiny from-scratch model instrumented for visualisation
class VisualizableNN:
    """Single-layer sigmoid classifier (2 inputs -> 1 output) that logs its training.

    After ``train`` the instance exposes ``losses`` (loss per epoch),
    ``gradients`` (norm of dW per epoch) and ``weight_history`` (the flattened
    weights at initialisation and after every update).
    """

    def __init__(self):
        self.W = np.random.randn(2, 1) * 0.5
        self.b = np.zeros(1)
        self.losses = []
        self.gradients = []
        self.weight_history = [self.W.ravel().copy()]

    def sigmoid(self, x):
        """Return the logistic function 1 / (1 + exp(-x))."""
        return 1 / (1 + np.exp(-x))

    def forward(self, X):
        """Return predicted probabilities of shape (n_samples, 1)."""
        return self.sigmoid(X @ self.W + self.b)

    def loss(self, y_pred, y_true):
        """Return the mean binary cross-entropy between predictions and labels.

        ``y_true`` is reshaped to match ``y_pred``: with labels of shape (n,)
        and predictions of shape (n, 1), NumPy broadcasting would otherwise
        build an n x n matrix and average over mismatched pairs.
        """
        y_true = np.asarray(y_true).reshape(y_pred.shape)
        # 1e-8 keeps log() finite when a prediction saturates at 0 or 1
        return -np.mean(y_true * np.log(y_pred + 1e-8) +
                        (1 - y_true) * np.log(1 - y_pred + 1e-8))

    def gradient(self, X, y_true, y_pred):
        """Return (dW, db), the cross-entropy gradients for the sigmoid output."""
        m = X.shape[0]
        dz = y_pred - y_true.reshape(-1, 1)
        dW = (1/m) * X.T @ dz
        db = (1/m) * np.sum(dz)
        return dW, db

    def train(self, X, y, epochs=1000, lr=0.5, verbose=True):
        """Run ``epochs`` steps of batch gradient descent and return ``self``."""
        for epoch in range(epochs):
            # Forward pass
            y_pred = self.forward(X)
            # Loss
            loss = self.loss(y_pred, y)
            self.losses.append(loss)
            # Backward pass
            dW, db = self.gradient(X, y, y_pred)
            self.gradients.append(np.linalg.norm(dW))
            # Parameter update
            self.W -= lr * dW
            self.b -= lr * db
            self.weight_history.append(self.W.ravel().copy())
            if verbose and epoch % 200 == 0:
                print(f"Epoch {epoch:4d}, Loss: {loss:.6f}, ||dW||: {np.linalg.norm(dW):.4f}")
        return self
# Generate spiral-shaped data
def generate_spiral_data(n_points=200, noise=0.1):
    """Generate a two-class, two-arm spiral data set.

    Args:
        n_points: Number of points per class.
        noise: Standard deviation of the Gaussian jitter added to each coordinate.

    Returns:
        (X, y): X has shape (2 * n_points, 2); y has shape (2 * n_points,),
        with 0.0 for the first arm and 1.0 for the second.
    """
    # Class 0: the radius grows linearly with the angle (r = 2 at theta = 2π)
    theta = np.sqrt(np.random.rand(n_points)) * 2 * np.pi
    r = 2 * theta / (2 * np.pi)
    x0 = r * np.cos(theta) + np.random.randn(n_points) * noise
    y0 = r * np.sin(theta) + np.random.randn(n_points) * noise

    # Class 1: same construction with fresh angles, mirrored through the origin
    theta = np.sqrt(np.random.rand(n_points)) * 2 * np.pi
    r = 2 * theta / (2 * np.pi)
    x1 = -r * np.cos(theta) + np.random.randn(n_points) * noise
    y1 = -r * np.sin(theta) + np.random.randn(n_points) * noise

    X = np.vstack([np.column_stack([x0, y0]),
                   np.column_stack([x1, y1])])
    y = np.hstack([np.zeros(n_points), np.ones(n_points)])
    return X, y
# Generate the data
X, y = generate_spiral_data(150, noise=0.15)

# Grid covering the data, with a margin, for drawing the decision surface
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
grid = np.c_[xx.ravel(), yy.ravel()]

# Train the model
nn = VisualizableNN()
nn.train(X, y, epochs=1000, lr=0.5)

# Predict every grid point
Z = nn.forward(grid)
Z = Z.reshape(xx.shape)

# Visualise the results
fig = plt.figure(figsize=(16, 10))

# 1. Decision boundary
ax1 = fig.add_subplot(2, 2, 1)
contour = ax1.contourf(xx, yy, Z, levels=20, cmap='RdBu', alpha=0.6)
ax1.scatter(X[y==0, 0], X[y==0, 1], c='blue', label='类别0', alpha=0.6, edgecolors='black')
ax1.scatter(X[y==1, 0], X[y==1, 1], c='red', label='类别1', alpha=0.6, edgecolors='black')
ax1.set_xlabel('x1')
ax1.set_ylabel('x2')
ax1.set_title('神经网络学习到的决策边界')
ax1.legend()
plt.colorbar(contour, ax=ax1)

# 2. Loss curve
ax2 = fig.add_subplot(2, 2, 2)
ax2.plot(nn.losses, 'b-', linewidth=2)
ax2.set_xlabel('Epoch')
ax2.set_ylabel('损失')
ax2.set_title('损失下降曲线(梯度下降优化)')
ax2.grid(True, alpha=0.3)
ax2.set_yscale('log')

# 3. Gradient norm over time
ax3 = fig.add_subplot(2, 2, 3)
ax3.plot(nn.gradients, 'g-', linewidth=2)
ax3.set_xlabel('Epoch')
ax3.set_ylabel('梯度范数 ||dW||')
ax3.set_title('梯度变化(初期大,后期小)')
ax3.grid(True, alpha=0.3)

# 4. Parameter trajectory
ax4 = fig.add_subplot(2, 2, 4)
# Plot the real weight trajectory when the model recorded one (every 50th
# epoch). Otherwise fall back to a purely illustrative decay curve that is
# NOT derived from training.
weight_history = getattr(nn, "weight_history", None)
if weight_history:
    trajectory = np.asarray(weight_history)[::50]
    w1_traj, w2_traj = trajectory[:, 0], trajectory[:, 1]
else:
    w1_traj = [0.5]
    w2_traj = [0.5]
    for _ in range(1, len(nn.losses), 50):
        w1_traj.append(w1_traj[-1] * 0.98)
        w2_traj.append(w2_traj[-1] * 0.97)
ax4.plot(w1_traj, w2_traj, 'ro-', markersize=4)
ax4.set_xlabel('权重 w1')
ax4.set_ylabel('权重 w2')
ax4.set_title('参数空间中的优化路径')
ax4.grid(True, alpha=0.3)
plt.suptitle('神经网络训练过程全览:梯度下降在行动', fontsize=16)
plt.tight_layout()
plt.show()

# Accuracy on the training data, thresholding the probability at 0.5
predictions = (nn.forward(X) > 0.5).flatten()
accuracy = np.mean(predictions == y)
print(f"\n🎯 最终准确率: {accuracy*100:.2f}%")
九、学习检查清单
基础概念(必须掌握)
- 理解导数的几何意义(切线斜率)
- 理解梯度下降的基本思想
- 掌握偏导数的概念
- 理解链式法则(反向传播的基础)
核心应用(重要)
- 能用梯度下降优化简单函数
- 理解学习率的影响
- 知道什么是梯度消失/爆炸
- 理解反向传播的基本流程
扩展了解(按需)
- 各种优化器(Adam、RMSprop)的原理
- 二阶优化方法(牛顿法)
- 自动微分机制
十、总结
微积分在AI中的核心价值:
| 概念 | AI应用 | 解决的问题 |
|---|---|---|
| 导数 | 梯度下降 | 找到损失函数的最小值 |
| 偏导数 | 多参数优化 | 每个参数如何影响损失 |
| 链式法则 | 反向传播 | 计算深层网络的梯度 |
| 梯度 | 参数更新 | 告诉我们往哪个方向走 |
核心公式记忆:
参数更新:θ_new = θ_old - η × ∇L(θ)
新参数 = 旧参数 - 学习率 × 梯度
梯度下降的三要素:
1. 方向:梯度的反方向(下降最快的方向)
2. 步长:学习率(每一步走多远)
3. 终止条件:梯度接近0或达到最大迭代次数
记住:
- 导数告诉你函数在某一点变化的快慢和方向(正为增、负为减)
- 梯度告诉你往哪走下降最快
- 链式法则让你能计算复杂函数的导数
- 反向传播让深度学习成为可能
下一步:
- 学习概率论(理解不确定性)
- 学习信息论(交叉熵、KL散度)
微积分不是障碍,而是理解AI如何学习的钥匙!
AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。
更多推荐



所有评论(0)