silero-vad.cc

Core code

Complete code

//
// Created by lovemefan on 2024/11/24.
//

#include "silero-vad.h"
#define SENSE_VOICE_VAD_MAX_NODES 1024
#define VAD_CHUNK_SIZE 640
/*
 \begin{array}{ll}
    i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
    f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
    g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\
    o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
    c' = f * c + i * g \\
    h' = o * \tanh(c') \\
    \end{array}

 */


ggml_cgraph *silero_vad_build_graph(
        sense_voice_context &ctx, sense_voice_state &state){

    const auto &model = ctx.vad_model.model;

    struct ggml_init_params params = {
            /*.mem_size   =*/state.sched_vad.meta.size(),
            /*.mem_buffer =*/state.sched_vad.meta.data(),
            /*.no_alloc   =*/true,
    };

    struct ggml_context *ctx0 = ggml_init(params);

    ggml_cgraph *gf = ggml_new_graph_custom(ctx0, SENSE_VOICE_VAD_MAX_NODES, false);

    ggml_tensor *chunk = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, VAD_CHUNK_SIZE);
    // the input chunk is 576 samples before being padded to VAD_CHUNK_SIZE (640)
    ggml_set_name(chunk, "audio_chunk");
    ggml_set_input(chunk);


    ggml_tensor *cur;
    // stft
    {
        // convolve with the precomputed Fourier basis; the stride of 128 is the STFT hop
        cur = ggml_conv_1d(ctx0, model->stft.forward_basis_buffer, chunk, 128, 0, 1);
        // chunk operation by ggml view, equals torch.chunk(x, 2) in pytorch
        struct ggml_tensor * real_part = ggml_view_2d(ctx0, cur, cur->ne[0], cur->ne[1] / 2, cur->nb[1], 0);
        ggml_set_name(real_part, "real_part");
        struct ggml_tensor * image_part = ggml_view_2d(ctx0, cur, cur->ne[0], cur->ne[1] / 2, cur->nb[1], cur->nb[0] * cur->ne[0] * cur->ne[1] / 2);
        ggml_set_name(image_part, "image_part");
        // magnitude, equals torch.sqrt(real_part ** 2 + imag_part ** 2)
        cur = ggml_sqrt(ctx0,
                        ggml_add(ctx0,
                                 ggml_mul(ctx0, real_part, real_part),
                                 ggml_mul(ctx0, image_part, image_part)
                                 )
                        );
        ggml_set_name(cur, "magnitude");

    }

    // encoder
    {
        {
            cur = ggml_conv_1d(ctx0, model->encoders_layer[0].reparam_conv_w, cur, 1, 1, 1);
            cur = ggml_add(ctx0, cur, ggml_cont(ctx0, ggml_transpose(ctx0, model->encoders_layer[0].reparam_conv_b)));
            cur = ggml_relu(ctx0, cur);

            cur = ggml_conv_1d(ctx0, model->encoders_layer[1].reparam_conv_w, cur, 2, 1, 1);
            cur = ggml_add(ctx0, cur,  ggml_cont(ctx0, ggml_transpose(ctx0, model->encoders_layer[1].reparam_conv_b)));
            cur = ggml_relu(ctx0, cur);

            cur = ggml_conv_1d(ctx0, model->encoders_layer[2].reparam_conv_w, cur, 2, 1, 1);
            cur = ggml_add(ctx0, cur,  ggml_cont(ctx0, ggml_transpose(ctx0, model->encoders_layer[2].reparam_conv_b)));
            cur = ggml_relu(ctx0, cur);

            cur = ggml_conv_1d(ctx0, model->encoders_layer[3].reparam_conv_w, cur, 1, 1, 1);
            cur = ggml_add(ctx0, cur,  ggml_cont(ctx0, ggml_transpose(ctx0, model->encoders_layer[3].reparam_conv_b)));
            cur = ggml_relu(ctx0, cur);
        }

    }

    //decoder
    {

        struct ggml_tensor* in_lstm_hidden_state = ggml_new_tensor_1d(ctx0, cur->type, cur->ne[1]);
        struct ggml_tensor*  in_lstm_context = ggml_new_tensor_1d(ctx0, cur->type, cur->ne[1]);

        struct ggml_tensor* out_lstm_hidden_state;
        struct ggml_tensor*  out_lstm_context;

        ggml_set_name(in_lstm_context, "in_lstm_context");
        ggml_set_name(in_lstm_hidden_state, "in_lstm_hidden_state");


        // lstm cell
        // ref: https://github.com/pytorch/pytorch/blob/1a93b96815b5c87c92e060a6dca51be93d712d09/aten/src/ATen/native/RNN.cpp#L298-L304
        // gates = x @ self.weight_ih.T + self.bias_ih + hx[0] @ self.weight_hh.T + self.bias_hh
        // chunked_gates = gates.chunk(4, dim=-1)
        // ingate = torch.sigmoid(chunked_gates[0])
        // forgetgate = torch.sigmoid(chunked_gates[1])
        // cellgate = torch.tanh(chunked_gates[2])
        // outgate = torch.sigmoid(chunked_gates[3])
        // cy = forgetgate * hx[1] + ingate * cellgate
        // hy = outgate * torch.tanh(cy)

        struct ggml_tensor *gates = ggml_add(
                ctx0,
                ggml_add(ctx0, ggml_mul_mat(ctx0,
                                            model->decoder.lstm_weight_ih,
                                            ggml_transpose(ctx0, cur)),
                         model->decoder.lstm_bias_ih),

                ggml_add(ctx0, ggml_mul_mat(ctx0,
                                            model->decoder.lstm_weight_hh,
                                            in_lstm_hidden_state),
                         model->decoder.lstm_bias_hh));
        ggml_set_name(gates, "gates");

        struct ggml_tensor * input_gates = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, gates, gates->ne[0] / 4, gates->ne[1] , gates->nb[1], 0));
        struct ggml_tensor * forget_gates = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, gates, gates->ne[0] / 4, gates->ne[1], gates->nb[1], gates->nb[0] / 4 * gates->ne[0]));
        struct ggml_tensor * cell_gate = ggml_tanh(ctx0, ggml_view_2d(ctx0, gates, gates->ne[0] / 4, gates->ne[1], gates->nb[1], 2 * gates->nb[0] / 4 * gates->ne[0]));
        struct ggml_tensor * out_gates = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, gates, gates->ne[0] / 4, gates->ne[1], gates->nb[1], 3 * gates->nb[0] / 4 * gates->ne[0]));

        ggml_set_name(input_gates, "input_gates");
        ggml_set_name(forget_gates, "forget_gates");
        ggml_set_name(cell_gate, "cell_gates");
        ggml_set_name(out_gates, "out_gates");

        out_lstm_context = ggml_add(ctx0,
                                          ggml_mul(ctx0, forget_gates, in_lstm_context),
                                          ggml_mul(ctx0, input_gates, cell_gate)
                                          );

        ggml_set_name(out_lstm_context, "out_lstm_context");
        ggml_set_output(out_lstm_context);
        out_lstm_hidden_state = ggml_mul(ctx0, out_gates, ggml_tanh(ctx0, out_lstm_context));
        ggml_set_name(out_lstm_hidden_state, "out_lstm_hidden_state");
        ggml_set_output(out_lstm_hidden_state);

        cur = ggml_relu(ctx0, out_lstm_hidden_state);
        cur = ggml_conv_1d(ctx0, model->decoder.decoder_conv_w, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), 1, 0, 1);
        cur = ggml_add(ctx0, cur, ggml_transpose(ctx0, model->decoder.decoder_conv_b));
        ggml_set_name(cur, "decoder_out");
        cur = ggml_sigmoid(ctx0, cur);
        ggml_set_name(cur, "logit");

    }

    ggml_set_output(cur);
    ggml_build_forward_expand(gf, cur);
    ggml_free(ctx0);
    return gf;
}


bool silero_vad_encode_internal(sense_voice_context &ctx,
                                sense_voice_state &state,
                                std::vector<float> chunk,
                                const int n_threads,
                                float &speech_prob){
    {
        auto & sched = ctx.state->sched_vad.sched;
        ggml_cgraph *gf = silero_vad_build_graph(ctx, state);

        //          ggml_backend_sched_set_eval_callback(sched,  ctx->params.cb_eval, &ctx->params.cb_eval_user_data);


        if (!ggml_backend_sched_alloc_graph(sched, gf)) {
            // should never happen as we pre-allocate the memory
            return false;
        }
        // set the input
        {

            struct ggml_tensor *data = ggml_graph_get_tensor(gf, "audio_chunk");
            ggml_backend_tensor_set(data, chunk.data(), 0, ggml_nbytes(data));

            struct ggml_tensor *in_lstm_context = ggml_graph_get_tensor(gf, "in_lstm_context");
            struct ggml_tensor *in_lstm_hidden_state = ggml_graph_get_tensor(gf, "in_lstm_hidden_state");

            ggml_backend_tensor_copy(state.vad_lstm_context, in_lstm_context);
            ggml_backend_tensor_copy(state.vad_lstm_hidden_state, in_lstm_hidden_state);

        }

        if (!ggml_graph_compute_helper(sched, gf, n_threads)) {
            return false;
        }

        // save output state
        {
            struct ggml_tensor *lstm_context = ggml_graph_get_tensor(gf, "out_lstm_context");
            ggml_backend_tensor_copy(lstm_context, state.vad_lstm_context);
            struct ggml_tensor *lstm_hidden_state = ggml_graph_get_tensor(gf, "out_lstm_hidden_state");
            ggml_backend_tensor_copy(lstm_hidden_state, state.vad_lstm_hidden_state);

        }
        ggml_backend_tensor_get(ggml_graph_get_tensor(gf, "logit"), &speech_prob, 0, sizeof(speech_prob));
    }
    return true;
}

Code explanation

Purpose: decide whether the current audio chunk contains human speech.

  • Output: a probability between 0 and 1
  • Use: cut away silence and keep only the spoken parts, improving recognition speed and accuracy

What is it for? (an everyday analogy)

  • VAD = Voice Activity Detection
  • It is the "gatekeeper" of speech recognition
  • Only segments that contain speech are let through to the model
  • Silence, noise, and empty audio are all filtered out

Without VAD, speech recognition:

  • gets slower
  • transcribes silence into garbage
  • consumes more power

Overall flow of the code

Fixed-length audio chunk in → STFT to spectrum → convolutional encoder extracts features → LSTM tracks the speech state → decoder outputs a speech probability


Part-by-part quick explanation

1. The formulas in the header comment

i = σ(...)
f = σ(...)
g = tanh(...)
o = σ(...)
c' = f * c + i * g
h' = o * tanh(c')

These are the LSTM gate equations. You do not need to work through them; just remember: the LSTM remembers the preceding audio, so it can tell whether someone is speaking continuously.


2. silero_vad_build_graph

Building the VAD model's compute graph simply means stacking the network layer by layer:

① STFT time-frequency transform

cur = ggml_conv_1d(..., stft_basis, ...);

Time-domain waveform → frequency-domain spectrum. The model can only make sense of a spectrum, not the raw waveform.
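As a minimal sketch in plain C++ (no ggml; the array names are illustrative), this is the computation the "magnitude" node performs on the two halves of the basis-convolution output:

#include <cmath>
#include <vector>

// |X[k]| = sqrt(re[k]^2 + im[k]^2), matching torch.sqrt(real**2 + imag**2)
std::vector<float> magnitude(const std::vector<float> &re,
                             const std::vector<float> &im) {
    std::vector<float> mag(re.size());
    for (size_t i = 0; i < re.size(); ++i) {
        mag[i] = std::sqrt(re[i] * re[i] + im[i] * im[i]);
    }
    return mag;
}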

② 4-layer convolutional encoder

conv1d → add → relu
conv1d → add → relu
conv1d → add → relu
conv1d → add → relu

Extracts acoustic features that let the network decide: is this noise, silence, or speech? Two of the four layers use stride 2, so the time axis shrinks as features are extracted (see the length helper below).
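As a sketch (conv1d_out_len is a hypothetical helper, not part of the file), the output length of each ggml_conv_1d call follows the standard convolution arithmetic:

// standard 1-D convolution output length for (stride, padding, dilation)
// e.g. in=256, kernel=3, stride=2, pad=1, dil=1 -> (256+2-2-1)/2 + 1 = 128,
// so each stride-2 encoder layer roughly halves the time axis
int conv1d_out_len(int in_len, int kernel, int stride, int pad, int dil) {
    return (in_len + 2 * pad - dil * (kernel - 1) - 1) / stride + 1;
}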

③ LSTM decoder (the core)

gates = ...
input_gates = sigmoid(...)
forget_gates = ...
out_gates = ...

The LSTM is the soul of the VAD (see the sketch after this list):

  • it remembers whether the previous frame was speech
  • it prevents choppy on/off decisions
  • it makes the output more stable
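As a minimal sketch, here is the same cell step on plain floats (a hypothetical standalone helper; the gate order i, f, g, o matches the PyTorch reference linked in the code above):

#include <cmath>
#include <vector>

// one LSTM cell step; `gates` holds the pre-activations
// x @ W_ih^T + b_ih + h @ W_hh^T + b_hh, stacked as [i | f | g | o]
void lstm_cell_step(const std::vector<float> &gates,
                    std::vector<float> &h,   // hidden state, updated in place
                    std::vector<float> &c) { // cell state, updated in place
    const size_t n = h.size();               // gates.size() == 4 * n
    auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
    for (size_t j = 0; j < n; ++j) {
        const float i = sigmoid(gates[0 * n + j]);   // input gate
        const float f = sigmoid(gates[1 * n + j]);   // forget gate
        const float g = std::tanh(gates[2 * n + j]); // cell candidate
        const float o = sigmoid(gates[3 * n + j]);   // output gate
        c[j] = f * c[j] + i * g;                     // c' = f*c + i*g
        h[j] = o * std::tanh(c[j]);                  // h' = o*tanh(c')
    }
}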

④ Final output

cur = ggml_sigmoid(ctx0, cur);

Outputs a probability between 0 and 1 (a minimal thresholding sketch follows the list):

  • ≥ 0.5: speech
  • < 0.5: silence / noise
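Purely illustrative (is_speech is a hypothetical helper; real pipelines usually add hysteresis and minimum-duration smoothing on top of the per-chunk decision):

// single-frame decision from the model's sigmoid output
bool is_speech(float speech_prob, float threshold = 0.5f) {
    return speech_prob >= threshold;
}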

3. silero_vad_encode_internal

The function that actually runs the VAD. It does four things:

  1. feed the audio chunk into the graph
  2. run the model
  3. read out speech_prob (the speech probability)
  4. save the LSTM state (the memory carried into the next chunk)

What do the input and output look like? (a driver sketch follows the list)

  • Input: a fixed-length buffer of 640 float samples
  • Output: e.g. speech_prob = 0.98 (speech) / 0.02 (silence)
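A minimal driver sketch, assuming only what this file declares (silero_vad_encode_internal, VAD_CHUNK_SIZE, the context/state types); the pcm buffer and the chunking policy are illustrative, and the 576-samples-before-padding detail noted in the code comment is glossed over here:

#include <cstddef>
#include <vector>
#include "silero-vad.h" // assumed to declare the types and function used below

// stream a PCM buffer through the VAD in fixed 640-sample chunks;
// the LSTM state lives in `state`, so consecutive calls are stateful
std::vector<float> vad_probs(sense_voice_context &ctx,
                             sense_voice_state &state,
                             const std::vector<float> &pcm,
                             int n_threads) {
    std::vector<float> probs;
    for (std::size_t off = 0; off + VAD_CHUNK_SIZE <= pcm.size(); off += VAD_CHUNK_SIZE) {
        std::vector<float> chunk(pcm.begin() + off,
                                 pcm.begin() + off + VAD_CHUNK_SIZE);
        float p = 0.0f;
        if (!silero_vad_encode_internal(ctx, state, chunk, n_threads, p)) {
            break; // graph allocation or compute failed
        }
        probs.push_back(p); // one speech probability per chunk
    }
    return probs;
}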

Where it sits in the whole project

Microphone / WAV file
        ↓
[Silero VAD (this code)]
        ↓
Keep only the segments that contain speech
        ↓
SenseVoice speech recognition
        ↓
Text output

Key takeaways

This code = a pure C++ implementation of Silero VAD (on top of ggml)

Purpose = detect whether there is human speech

Strengths = lightweight, very fast, accurate, works fully offline

It is an essential front-end module for a speech recognition system


Quick summary

  • Silero VAD: the gatekeeper of speech recognition
  • Input: audio
  • Output: speech probability (0 to 1)
  • Role: filter out silence, improving recognition speed and accuracy
  • Structure: STFT → CNN → LSTM → probability output
