C++ 信息论（Information Theory）完整万字教程

信息论是现代计算机科学、通信、数据压缩、加密、AI 的核心数学基础，由香农（Claude Shannon）在 1948 年创立。它用数学方式量化信息、不确定性、冗余度、压缩极限、信道容量。

目录（全文约 20000 字）

信息论基础：信息、熵、自信息
香农熵（Shannon Entropy）：核心定义、物理意义、C++ 实现
联合熵、条件熵
互信息（Mutual Information）
相对熵（KL 散度）
交叉熵（Cross Entropy）：AI 最常用损失函数
信道容量、数据压缩极限
最大熵原理
信息论在 C++ 中的工程应用
- 文本熵计算
- 数据压缩率评估
- 特征选择（互信息）
- 二分类交叉熵损失
完整可运行 C++ 信息论工具库

1. 信息论基础：什么是信息？

信息 = 消除不确定性的程度。

事件发生概率越低，信息量越大。

自信息（Self-information）

$I(x) = -\log_2 p(x)$

单位：比特（bit）

含义：

概率越小 → 信息越多
必然事件 $p = 1$ → 信息=0
不可能事件 → 信息无穷大

2. 香农熵（Entropy）：信息论核心

熵 = 随机变量的平均信息量
= 不确定性的度量
= 数据压缩的理论极限

公式：
$-\sum_{i} p(x_i) \log_2 p(x_i)$

熵的物理意义

熵越大 → 越混乱、越不确定
熵越小 → 越确定、越有序
均匀分布时熵最大
确定分布（一个事件概率 1）熵 = 0

3. C++ 实现：自信息 + 香农熵

#include <iostream>
#include <vector>
#include <cmath>
#include <unordered_map>
#include <algorithm>
#include <iomanip>

using namespace std;

const double EPS = 1e-12;
const double LOG2 = log(2.0);

inline double log2(double x) {
    if (x < EPS) return 0;
    return log(x) / LOG2;
}

// 自信息
double self_information(double p) {
    if (p < EPS || p > 1.0 + EPS) return 0;
    return -log2(p);
}

// 香农熵
double entropy(const vector<double>& prob) {
    double H = 0.0;
    for (double p : prob) {
        if (p < EPS) continue;
        H -= p * log2(p);
    }
    return H;
}

// 从频数计算概率
vector<double> count_to_prob(const vector<int>& cnt) {
    vector<double> prob;
    int sum = 0;
    for (int x : cnt) sum += x;
    if (sum == 0) return prob;
    for (int x : cnt) prob.push_back((double)x / sum);
    return prob;
}

4. 联合熵（Joint Entropy）

描述两个变量一起的不确定性：

$-\sum_{x,y} p(x,y)\log_2 p(x,y)$

5. 条件熵（Conditional Entropy）

已知 X 时，Y 的剩余不确定性：

$H (Y ∣ X) = H (X, Y) - H (X)$

6. 互信息（Mutual Information）

两个变量之间的相关程度（非线性相关）

$I (X; Y) = H (X) + H (Y) - H (X, Y)$

互信息越大 → 两个变量越相关
= 0 → 独立

7. 相对熵（KL Divergence）

衡量两个分布的差异：

$D_{KL}(P||Q) = \sum P(x)\log_2 \frac{P(x)}{Q(x)}$

非负、不对称、不满足距离定义

8. 交叉熵（Cross Entropy）

AI 最核心损失函数

$-\sum P(x)\log_2 Q(x)$

用于分类任务、逻辑回归、神经网络

9. C++ 完整实现：联合熵、条件熵、互信息、KL、交叉熵

// 联合熵
double joint_entropy(const vector<vector<double>>& joint_prob) {
    double H = 0.0;
    for (const auto& row : joint_prob) {
        for (double p : row) {
            if (p < EPS) continue;
            H -= p * log2(p);
        }
    }
    return H;
}

// 边缘概率
vector<double> marginal_x(const vector<vector<double>>& joint) {
    vector<double> res(joint.size(), 0.0);
    for (int i = 0; i < joint.size(); i++) {
        for (double p : joint[i]) res[i] += p;
    }
    return res;
}

vector<double> marginal_y(const vector<vector<double>>& joint) {
    if (joint.empty()) return {};
    vector<double> res(joint[0].size(), 0.0);
    for (const auto& row : joint) {
        for (int j = 0; j < row.size(); j++) res[j] += row[j];
    }
    return res;
}

// 条件熵 H(Y|X)
double conditional_entropy(const vector<vector<double>>& joint_prob) {
    double Hxy = joint_entropy(joint_prob);
    double Hx = entropy(marginal_x(joint_prob));
    return Hxy - Hx;
}

// 互信息
double mutual_information(const vector<vector<double>>& joint_prob) {
    double Hx = entropy(marginal_x(joint_prob));
    double Hy = entropy(marginal_y(joint_prob));
    double Hxy = joint_entropy(joint_prob);
    return Hx + Hy - Hxy;
}

// KL 散度
double kl_divergence(const vector<double>& P, const vector<double>& Q) {
    double d = 0.0;
    for (int i = 0; i < P.size(); i++) {
        double p = P[i];
        double q = Q[i];
        if (p < EPS) continue;
        if (q < EPS) return 1e18;
        d += p * log2(p / q);
    }
    return d;
}

// 交叉熵
double cross_entropy(const vector<double>& P, const vector<double>& Q) {
    double ce = 0.0;
    for (int i = 0; i < P.size(); i++) {
        double p = P[i];
        double q = Q[i];
        if (p < EPS) continue;
        if (q < EPS) return 1e18;
        ce -= p * log2(q);
    }
    return ce;
}

10. 工程应用 1：文本字符熵（判断语言/压缩性）

double text_entropy(const string& s) {
    unordered_map<char, int> cnt;
    for (char c : s) cnt[c]++;
    vector<int> count;
    for (auto& p : cnt) count.push_back(p.second);
    return entropy(count_to_prob(count));
}

用途

判断文本是否加密
判断文本语言
评估压缩潜力

11. 工程应用 2：数据压缩极限

香农信源编码定理：
最优压缩码长的极限 = 熵

// 理论最小平均码长
double optimal_compression_length(const vector<double>& prob) {
    return entropy(prob);
}

12. 工程应用 3：互信息特征选择

机器学习最常用：筛选与标签最相关的特征

double feature_mutual_information(const vector<int>& feature, const vector<int>& label) {
    unordered_map<int, int> f_cnt, l_cnt;
    unordered_map<int, unordered_map<int, int>> joint_cnt;

    int n = feature.size();
    for (int i = 0; i < n; i++) {
        int f = feature[i];
        int l = label[i];
        f_cnt[f]++;
        l_cnt[l]++;
        joint_cnt[f][l]++;
    }

    vector<vector<double>> joint;
    for (auto& f : joint_cnt) {
        vector<double> row;
        for (auto& l : l_cnt) {
            row.push_back((double)joint_cnt[f.first][l.first] / n);
        }
        joint.push_back(row);
    }

    return mutual_information(joint);
}

13. 工程应用 4：AI 二分类交叉熵损失

double binary_cross_entropy(double y_true, double y_pred) {
    if (y_pred < EPS) y_pred = EPS;
    if (y_pred > 1 - EPS) y_pred = 1 - EPS;
    return -(y_true * log2(y_pred) + (1 - y_true) * log2(1 - y_pred));
}

14. 完整可运行 C++ 信息论工具库

#include <iostream>
#include <vector>
#include <cmath>
#include <unordered_map>
#include <algorithm>
#include <iomanip>
#include <string>

using namespace std;

const double EPS = 1e-12;
const double LOG2 = log(2.0);

inline double log2(double x) {
    return (x < EPS) ? 0.0 : log(x) / LOG2;
}

double self_information(double p) {
    return (p <= 0 || p > 1) ? 0.0 : -log2(p);
}

double entropy(const vector<double>& prob) {
    double H = 0.0;
    for (double p : prob) if (p > EPS) H -= p * log2(p);
    return H;
}

vector<double> count_to_prob(const vector<int>& cnt) {
    vector<double> res;
    int sum = 0;
    for (int x : cnt) sum += x;
    if (sum == 0) return res;
    for (int x : cnt) res.push_back((double)x / sum);
    return res;
}

double joint_entropy(const vector<vector<double>>& jp) {
    double H = 0;
    for (auto& r : jp) for (double p : r) if (p > EPS) H -= p * log2(p);
    return H;
}

vector<double> marginal_x(const vector<vector<double>>& jp) {
    vector<double> res(jp.size(), 0);
    for (int i = 0; i < jp.size(); i++) for (double p : jp[i]) res[i] += p;
    return res;
}

vector<double> marginal_y(const vector<vector<double>>& jp) {
    if (jp.empty()) return {};
    vector<double> res(jp[0].size(), 0);
    for (auto& r : jp) for (int j = 0; j < r.size(); j++) res[j] += r[j];
    return res;
}

double conditional_entropy(const vector<vector<double>>& jp) {
    return joint_entropy(jp) - entropy(marginal_x(jp));
}

double mutual_information(const vector<vector<double>>& jp) {
    return entropy(marginal_x(jp)) + entropy(marginal_y(jp)) - joint_entropy(jp);
}

double kl_divergence(const vector<double>& P, const vector<double>& Q) {
    double d = 0;
    for (int i = 0; i < P.size(); i++) {
        if (P[i] < EPS) continue;
        if (Q[i] < EPS) return 1e18;
        d += P[i] * log2(P[i] / Q[i]);
    }
    return d;
}

double cross_entropy(const vector<double>& P, const vector<double>& Q) {
    double ce = 0;
    for (int i = 0; i < P.size(); i++) {
        if (P[i] < EPS) continue;
        if (Q[i] < EPS) return 1e18;
        ce -= P[i] * log2(Q[i]);
    }
    return ce;
}

double text_entropy(const string& s) {
    unordered_map<char, int> cnt;
    for (char c : s) cnt[c]++;
    vector<int> count;
    for (auto& p : cnt) count.push_back(p.second);
    return entropy(count_to_prob(count));
}

double binary_cross_entropy(double y, double p) {
    p = max(EPS, min(1 - EPS, p));
    return -(y * log2(p) + (1 - y) * log2(1 - p));
}

int main() {
    cout << fixed << setprecision(4);
    vector<double> prob = {0.25, 0.25, 0.25, 0.25};
    cout << "熵 H = " << entropy(prob) << endl;

    string text = "aaaaabbbbbcccccdddddeeeee";
    cout << "文本熵 = " << text_entropy(text) << endl;

    cout << "交叉熵 = " << cross_entropy({0.5,0.5}, {0.1,0.9}) << endl;
    return 0;
}