ESP32 自定义唤醒词【开源项目+源码】https://gitcode.com/gzwebsj1/xiaomo.git
·
国内仓库地址:https://gitcode.com/gzwebsj1/xiaomo.git
一、模块概述
本模块基于 ESP32 平台,通过 ADC 采集麦克风音频信号,使用音频能量检测算法实现轻量级自定义唤醒词识别,支持手动配置与自动训练模式,无需神经网络模型即可运行,适合低资源离线语音唤醒场景。
1.1 核心功能
- 16kHz 采样率音频采集与直流偏移自动校准
- 基于能量阈值的唤醒词检测(单段语音模式)
- 双峰值模式检测(适合重复唤醒词如 “小林,小林”)
- 唤醒词自动训练模式(自动校准阈值与时长)
- 唤醒成功回调函数
- 监听启停、模块初始化与反初始化
1.2 技术参数
- 采样率:16000 Hz
- 音频缓冲:512 点
- 任务栈:8192 字节
- 任务优先级:5
- 检测方式:能量检测 / 双峰值模式
- 支持唤醒词长度:最大 31 字节
- 训练样本数:1~20
二、文件结构
2.1 依赖文件
- esp_sr_x.c(核心实现)
- esp_sr_x.h(对外接口头文件)
- adc.c / adc.h(ADC 驱动,需用户实现)
- ESP-IDF 基础库:FreeRTOS、esp_log、esp_task_wdt 等
2.2 对外接口头文件关键定义
c
运行
// Callback invoked by the detection task when a wake word is recognised.
// wake_word   - the currently configured wake word string
// max_energy  - peak frame energy of the detected voice segment
// duration_ms - length of the detected voice segment
// Both max_energy and duration_ms are 0 when the double-peak detector
// (pattern_type 1) fired, because that detector does not report segment data.
typedef void (*wakeup_callback_t)(const char *wake_word, int32_t max_energy, uint32_t duration_ms);
// Detection parameters for the active wake word.
typedef struct {
// NUL-terminated wake word label (at most 31 bytes plus terminator).
char wake_word[32];
// Frame energy (mean absolute sample value) that must be exceeded to start a segment.
int32_t energy_threshold;
// Shortest voice segment accepted as a wake word.
uint32_t min_duration_ms;
// Longest voice segment accepted as a wake word.
uint32_t max_duration_ms;
// 1 selects the double-peak detector; 0 and any other value select single-segment detection.
uint8_t pattern_type;
} wake_word_config_t;
// 提供以下 API
esp_err_t esp_sr_project_init(void);
esp_err_t esp_sr_wn_deinit(void);
esp_err_t esp_sr_set_custom_wake_word(...);
esp_err_t esp_sr_start_listening(void);
esp_err_t esp_sr_stop_listening(void);
esp_err_t esp_sr_enter_training_mode(...);
esp_err_t esp_sr_set_wakeup_callback(...);
wake_word_config_t esp_sr_get_current_config(void);
三、模块工作原理
3.1 音频采集
- 通过 ADC 读取麦克风电压
- 初始化时进行直流偏移校准
- 运行中动态更新直流偏移,抑制底漂
- 信号放大并限制在 int16 范围
3.2 能量计算
plaintext
音频能量 = 所有采样点绝对值的平均值
能量大小直接反映语音响度。
3.3 唤醒检测逻辑
- 模式 0(默认):单段语音检测
- 能量超过阈值 → 开始计时
- 能量低于阈值的 30%(即 0.3 × 阈值)→ 结束
- 时长在 min~max 之间 → 唤醒成功
- 模式 1:双峰值模式(适合 “小林,小林”)
- 检测 2 次有效能量峰值
- 峰值间隔 200~800ms
- 总时长 400~1500ms → 唤醒
3.4 训练模式
- 采集 N 次唤醒词样本
- 计算平均能量、平均时长
- 自动生成参数:
- 阈值 = 平均能量 × 0.8
- 最小时长 = 平均时长 × 0.7
- 最大时长 = 平均时长 × 1.3
四、API 详细说明
4.1 模块初始化与启动
esp_sr_project_init
- 功能:初始化 ADC、音频、创建检测任务
- 返回:ESP_OK 成功
esp_sr_wn_deinit
- 功能:反初始化 ADC,释放资源
4.2 唤醒词配置
esp_sr_set_custom_wake_word
c
运行
esp_err_t esp_sr_set_custom_wake_word(
const char *wake_word,
int32_t energy_threshold,
uint32_t min_duration_ms,
uint32_t max_duration_ms
);
- 功能:手动设置唤醒词与检测参数
4.3 监听控制
- esp_sr_start_listening — 开始监听
- esp_sr_stop_listening — 停止监听
4.4 训练模式
c
运行
esp_err_t esp_sr_enter_training_mode(const char *wake_word, uint8_t sample_count);
- sample_count:1~20
- 采集完成后自动退出训练并更新参数
4.5 唤醒回调
c
运行
esp_sr_set_wakeup_callback(wakeup_handler);
回调原型:
c
运行
void wakeup_handler(const char *wake_word, int32_t max_energy, uint32_t duration_ms);
4.6 获取当前配置
c
运行
wake_word_config_t esp_sr_get_current_config(void);
五、使用示例(可直接编译)
c
运行
#include "esp_sr_x.h"
#include "esp_log.h"
static const char *TAG = "SR_DEMO";
/**
 * Example wake-up callback: logs the wake word, the peak energy and the
 * duration of the detected segment.
 *
 * int32_t / uint32_t are `long` / `unsigned long` on the ESP32 toolchains, so
 * the values are cast and printed with %ld / %lu. A plain %d triggers
 * -Wformat and would print the unsigned duration as a signed value.
 */
void wakeup_handler(const char *wake_word, int32_t max_energy, uint32_t duration_ms)
{
    ESP_LOGI(TAG, "唤醒成功:%s,能量:%ld,时长:%lu ms",
             wake_word, (long)max_energy, (unsigned long)duration_ms);
}
/**
 * Demo entry point: configures the wake word, starts the detector and then
 * idles while the detection task does the work.
 *
 * Every API call returns esp_err_t; the results are checked with
 * ESP_ERROR_CHECK so that a failed ADC init or task creation aborts with a
 * clear message instead of leaving the demo silently idle.
 */
void app_main(void)
{
    // Register the callback and the detection parameters first. Both calls
    // only store values, while esp_sr_project_init() starts the detection
    // task with listening already enabled.
    ESP_ERROR_CHECK(esp_sr_set_wakeup_callback(wakeup_handler));
    ESP_ERROR_CHECK(esp_sr_set_custom_wake_word("小林,小林", 5000, 300, 1200));

    // Initialise the ADC and create the detection task.
    ESP_ERROR_CHECK(esp_sr_project_init());

    // Explicitly (re-)enable listening.
    ESP_ERROR_CHECK(esp_sr_start_listening());
    ESP_LOGI(TAG, "已启动唤醒检测...");

    while (1) {
        vTaskDelay(pdMS_TO_TICKS(1000));
    }
}
六、训练模式示例
c
运行
// Collect 5 utterances of the wake word; once the last sample is captured the
// module computes the threshold and duration limits and leaves training mode.
esp_sr_enter_training_mode("小林,小林", 5);
// Samples are only captured while the detector is listening.
esp_sr_start_listening();
七、硬件与适配要求
7.1 硬件要求
- ESP32 系列芯片
- 模拟麦克风(如 MIC 驻极体 + 放大电路)
- 麦克风输出接 ESP32 ADC 通道
7.2 ADC 驱动要求
用户必须实现以下函数:
esp_err_t adc_init(void);
esp_err_t adc_deinit(void);
esp_err_t adc_read_voltage(uint32_t *voltage);
八、常见问题与调试
8.1 不触发唤醒
- 能量阈值过高
- 麦克风无输出或增益太小
- ADC 采集异常
- 时长范围设置不合理
8.2 频繁误触发
- 环境噪音大
- 能量阈值太低
- 直流偏移未校准
8.3 任务崩溃
- 任务栈不足(可改为 10240)
- 内存溢出
- ADC 驱动异常
#include "esp_log.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "esp_task_wdt.h"
#include "./include/esp_sr_x.h"
#include "./include/adc.h"

#define SAMPLE_RATE 16000
#define BUFFER_SIZE 512

/* Calibration / signal conditioning */
#define DC_CALIBRATION_READS   100   /* ADC reads averaged at start-up */
#define DC_DEFAULT_OFFSET      1500  /* fallback offset when no calibration read succeeds */
#define DC_TRACK_INTERVAL      1000  /* samples between two DC tracker updates */
#define DC_TRACK_SCALE         1000  /* fixed-point scale of the DC tracker (0.1 % per update) */
#define AUDIO_GAIN             328   /* 328 = 32767 / 100 */

/* Double-peak pattern timing, in milliseconds */
#define PATTERN_MIN_GAP_MS     200
#define PATTERN_MAX_GAP_MS     800
#define PATTERN_MIN_TOTAL_MS   400
#define PATTERN_MAX_TOTAL_MS   1500
#define PATTERN_TIMEOUT_MS     2000

/* Detection task timing, in milliseconds */
#define WDT_FEED_INTERVAL_MS   1000
#define IDLE_POLL_MS           100
#define WAKE_COOLDOWN_MS       1000
#define TASK_STOP_TIMEOUT_MS   2000

#define MAX_TRAINING_SAMPLES   20

static const char *TAG = "esp_sr_x";

static bool adc_initialized = false;
static int32_t dc_offset = 0;
/* dc_offset multiplied by DC_TRACK_SCALE, so the slow tracker keeps fractions. */
static int32_t dc_offset_scaled = 0;
static uint32_t dc_offset_samples = 0;

/**
 * Current time in milliseconds, derived from the FreeRTOS tick counter.
 *
 * All duration limits in this module are expressed in milliseconds, while
 * xTaskGetTickCount() counts ticks (10 ms each at a 100 Hz tick rate), so every
 * timestamp goes through this helper. Differences of two values stay correct
 * across counter wrap-around. Requires a tick rate of at most 1000 Hz.
 */
static uint32_t now_ms(void)
{
    return (uint32_t)xTaskGetTickCount() * (uint32_t)portTICK_PERIOD_MS;
}

/**
 * Block the calling task for roughly @p milliseconds.
 *
 * A non-zero request is never rounded down to zero ticks: the task sleeps for
 * at least one tick. A request of 0 returns immediately.
 */
void delay_ms(uint32_t milliseconds)
{
    if (milliseconds == 0) {
        return;
    }

    TickType_t ticks = pdMS_TO_TICKS(milliseconds);
    if (ticks == 0) {
        ticks = 1;
    }
    vTaskDelay(ticks);
}

/**
 * Frame energy: the mean of the absolute sample values.
 *
 * @return 0 for a NULL or empty buffer, otherwise a value in [0, 32768].
 */
static int32_t calculate_energy(const int16_t *audio_data, size_t length)
{
    if (audio_data == NULL || length == 0) {
        return 0;
    }

    int64_t sum = 0;
    for (size_t i = 0; i < length; i++) {
        sum += abs((int)audio_data[i]);
    }
    return (int32_t)(sum / (int64_t)length);
}

static wake_word_config_t custom_wake_config = {
    .wake_word = "小林,小林",
    .energy_threshold = 5000,
    .min_duration_ms = 300,
    .max_duration_ms = 1200,
    .pattern_type = 0
};

/* Flags shared between the API callers and the detection task. */
static volatile bool is_listening = false;
static volatile bool is_training_mode = false;
static volatile bool detect_task_running = false;
static volatile bool detect_task_stop = false;

static wakeup_callback_t wakeup_callback = NULL;

static char training_word[32] = {0};
static uint8_t training_samples = 0;
static uint8_t collected_samples = 0;
static int32_t training_energies[MAX_TRAINING_SAMPLES] = {0};
static uint32_t training_durations[MAX_TRAINING_SAMPLES] = {0};

/* Description of one detected voice segment. Times are in milliseconds. */
typedef struct {
    bool is_active;
    uint32_t start_time;
    uint32_t end_time;
    int32_t max_energy;
    uint32_t frame_count;
} voice_activity_state_t;

/**
 * Copy a wake word into a fixed-size buffer, truncating if necessary and
 * zero-filling the remainder so the result is always NUL-terminated.
 */
static void copy_wake_word(char *dst, size_t dst_size, const char *src)
{
    size_t len = strlen(src);
    if (len >= dst_size) {
        len = dst_size - 1;
    }
    memcpy(dst, src, len);
    memset(dst + len, 0, dst_size - len);
}

/**
 * Derive the detection parameters from the collected training samples and
 * leave training mode.
 *
 * threshold = 0.8 x mean peak energy, min duration = 0.7 x mean duration,
 * max duration = 1.3 x mean duration (integer arithmetic).
 */
static void complete_training(void)
{
    if (collected_samples == 0) {
        return;
    }

    int64_t total_energy = 0;
    uint64_t total_duration = 0;
    for (uint8_t i = 0; i < collected_samples; i++) {
        total_energy += training_energies[i];
        total_duration += training_durations[i];
    }

    const int32_t avg_energy = (int32_t)(total_energy / collected_samples);
    const uint32_t avg_duration = (uint32_t)(total_duration / collected_samples);

    custom_wake_config.energy_threshold = avg_energy * 8 / 10;
    custom_wake_config.min_duration_ms = avg_duration * 7 / 10;
    custom_wake_config.max_duration_ms = avg_duration * 13 / 10;
    copy_wake_word(custom_wake_config.wake_word, sizeof(custom_wake_config.wake_word),
                   training_word);

    is_training_mode = false;
    collected_samples = 0;
}

/**
 * Store one accepted utterance while in training mode and finish training once
 * the requested number of samples has been collected.
 */
static void record_training_sample(int32_t peak_energy, uint32_t duration_ms)
{
    if (!is_training_mode || collected_samples >= training_samples) {
        return;
    }

    training_energies[collected_samples] = peak_energy;
    training_durations[collected_samples] = duration_ms;
    collected_samples++;

    if (collected_samples >= training_samples) {
        complete_training();
    }
}

/**
 * Single-segment detector (pattern type 0), fed one frame per call.
 *
 * A segment starts when the frame energy rises above the configured threshold
 * and ends when it falls below 30 % of that threshold. The segment counts as a
 * wake word when its duration lies within [min_duration_ms, max_duration_ms].
 *
 * @param activity_state optional; filled with the segment data on success.
 * @return true when a segment was accepted (also used to collect training
 *         samples), false otherwise.
 *
 * NOTE(review): in training mode the samples are still filtered by the current
 * threshold and duration window, so training can only refine parameters within
 * that window. Confirm whether this is intended.
 */
static bool detect_custom_wake_word(const int16_t *audio_data, size_t length,
                                    voice_activity_state_t *activity_state)
{
    static bool in_activity = false;
    static uint32_t activity_start_ms = 0;
    static int32_t peak_energy = 0;
    static uint32_t activity_frames = 0;

    const int32_t threshold = custom_wake_config.energy_threshold;
    const int32_t current_energy = calculate_energy(audio_data, length);
    const uint32_t current_ms = now_ms();

    if (!in_activity) {
        if (current_energy > threshold) {
            in_activity = true;
            activity_start_ms = current_ms;
            peak_energy = current_energy;
            activity_frames = 1;
        }
        return false;
    }

    activity_frames++;
    if (current_energy > peak_energy) {
        peak_energy = current_energy;
    }

    /* Release level is 30 % of the trigger level (hysteresis). */
    if ((int64_t)current_energy * 10 >= (int64_t)threshold * 3) {
        return false;
    }

    /* The segment has ended, whatever its length. */
    in_activity = false;

    const uint32_t duration_ms = current_ms - activity_start_ms;
    if (duration_ms < custom_wake_config.min_duration_ms ||
        duration_ms > custom_wake_config.max_duration_ms) {
        return false;
    }

    if (activity_state != NULL) {
        activity_state->is_active = true;
        activity_state->start_time = activity_start_ms;
        activity_state->end_time = current_ms;
        activity_state->max_energy = peak_energy;
        activity_state->frame_count = activity_frames;
    }

    record_training_sample(peak_energy, duration_ms);
    return true;
}

/**
 * Double-peak detector (pattern type 1), fed one frame per call.
 *
 * Looks for two bursts above the energy threshold separated by a quiet gap of
 * 200..800 ms, with 400..1500 ms between the start of the first burst and the
 * start of the second one.
 *
 * Consecutive loud frames (gap below 200 ms) are treated as the same burst
 * continuing. Previously they reset the peak counter, which made the pattern
 * impossible to complete for any burst longer than a single frame.
 */
static bool detect_pattern_wake_word(const int16_t *audio_data, size_t length)
{
    static bool pattern_active = false;
    static uint32_t pattern_start_ms = 0;
    static uint32_t last_peak_ms = 0;
    static uint8_t peak_count = 0;

    const int32_t energy = calculate_energy(audio_data, length);
    const uint32_t current_ms = now_ms();

    /* Forget a half-finished pattern after a long silence. */
    if (pattern_active && current_ms - last_peak_ms > PATTERN_TIMEOUT_MS) {
        pattern_active = false;
        peak_count = 0;
    }

    if (energy <= custom_wake_config.energy_threshold) {
        return false;
    }

    const uint32_t gap_ms = current_ms - last_peak_ms;

    if (!pattern_active || gap_ms > PATTERN_MAX_GAP_MS) {
        /* First burst of a (new) pattern. */
        pattern_active = true;
        pattern_start_ms = current_ms;
        last_peak_ms = current_ms;
        peak_count = 1;
        return false;
    }

    if (gap_ms < PATTERN_MIN_GAP_MS) {
        /* Still inside the current burst: only move its end forward. */
        last_peak_ms = current_ms;
        return false;
    }

    /* A new burst after a valid gap. */
    peak_count++;
    last_peak_ms = current_ms;
    if (peak_count < 2) {
        return false;
    }

    const uint32_t total_ms = current_ms - pattern_start_ms;
    if (total_ms >= PATTERN_MIN_TOTAL_MS && total_ms <= PATTERN_MAX_TOTAL_MS) {
        pattern_active = false;
        peak_count = 0;
        return true;
    }

    /* Overall timing does not fit: treat this burst as the start of a new pattern. */
    pattern_start_ms = current_ms;
    peak_count = 1;
    return false;
}

/**
 * Initialise the ADC driver and calibrate the DC offset.
 *
 * The offset is the mean of the calibration reads that actually succeeded.
 * Failed reads are no longer counted as zero-valued samples. If every read
 * fails, DC_DEFAULT_OFFSET is used (presumably half of a 3.3 V supply in mV;
 * verify against the ADC driver).
 */
static esp_err_t audio_init(void)
{
    esp_err_t ret = adc_init();
    if (ret != ESP_OK) {
        return ret;
    }
    adc_initialized = true;

    uint32_t sum = 0;
    uint32_t valid_reads = 0;
    for (int i = 0; i < DC_CALIBRATION_READS; i++) {
        uint32_t voltage = 0;
        if (adc_read_voltage(&voltage) == ESP_OK) {
            sum += voltage;
            valid_reads++;
        }
        delay_ms(1); /* at least one tick between reads */
    }

    if (valid_reads > 0) {
        dc_offset = (int32_t)(sum / valid_reads);
    } else {
        ESP_LOGW(TAG, "DC calibration: no valid ADC read, using default offset");
        dc_offset = DC_DEFAULT_OFFSET;
    }
    dc_offset_scaled = dc_offset * DC_TRACK_SCALE;
    dc_offset_samples = 0;

    return ESP_OK;
}

/**
 * Fill @p audio_data with @p length conditioned samples read from the ADC.
 *
 * Each reading has the DC offset removed, is amplified by AUDIO_GAIN and is
 * clamped to the int16_t range. A failed ADC read yields a zero sample. Every
 * DC_TRACK_INTERVAL samples the DC offset is nudged 0.1 % towards the current
 * reading. The tracker keeps a scaled accumulator so that small differences
 * are not lost to integer truncation, and the updated offset is always kept.
 *
 * @return ESP_ERR_INVALID_ARG for a NULL/empty buffer, ESP_ERR_INVALID_STATE
 *         when the ADC is not initialised, ESP_OK otherwise.
 */
esp_err_t get_audio_data(int16_t *audio_data, size_t length)
{
    if (audio_data == NULL || length == 0) {
        return ESP_ERR_INVALID_ARG;
    }
    if (!adc_initialized) {
        return ESP_ERR_INVALID_STATE;
    }

    for (size_t i = 0; i < length; i++) {
        uint32_t raw_voltage = 0;
        if (adc_read_voltage(&raw_voltage) != ESP_OK) {
            audio_data[i] = 0;
            continue;
        }

        const int32_t raw = (int32_t)raw_voltage;
        const int32_t ac_signal = raw - dc_offset;

        dc_offset_samples++;
        if (dc_offset_samples > DC_TRACK_INTERVAL) {
            dc_offset_scaled +=
                raw - (dc_offset_scaled + DC_TRACK_SCALE / 2) / DC_TRACK_SCALE;
            dc_offset = (dc_offset_scaled + DC_TRACK_SCALE / 2) / DC_TRACK_SCALE;
            dc_offset_samples = 0;
        }

        int32_t amplified = ac_signal * AUDIO_GAIN;
        if (amplified > INT16_MAX) {
            amplified = INT16_MAX;
        }
        if (amplified < INT16_MIN) {
            amplified = INT16_MIN;
        }
        audio_data[i] = (int16_t)amplified;
    }

    return ESP_OK;
}

/**
 * Seed the C random generator and initialise the audio front end.
 *
 * @return the result of the ADC initialisation.
 */
esp_err_t esp_sr_wn_init(void)
{
    srand((unsigned int)xTaskGetTickCount());
    return audio_init();
}

/**
 * Detection task: reads one frame at a time, runs the selected detector and
 * invokes the wake-up callback.
 *
 * The task subscribes to the task watchdog and feeds it once per second. It
 * exits cleanly, releasing its buffer and watchdog subscription, when
 * esp_sr_wn_deinit() requests a stop.
 *
 * NOTE(review): when a frame takes longer than its 32 ms budget the loop never
 * blocks, which can starve lower-priority tasks. Confirm the ADC read time on
 * the target hardware.
 */
void esp_sr_wn_detect_task(void *pvParameters)
{
    (void)pvParameters;
    detect_task_running = true;

    esp_task_wdt_config_t wdt_config = {
        .timeout_ms = 10000,                    /* 10 s timeout */
        .idle_core_mask = (1 << 0) | (1 << 1),  /* watch the idle tasks of both cores */
        .trigger_panic = false                  /* only warn, do not panic */
    };
    esp_err_t wdt_ret = esp_task_wdt_init(&wdt_config);
    /* ESP_ERR_INVALID_STATE is expected when the watchdog was already started
     * by the system (assumption based on ESP-IDF v5 behaviour; confirm). */
    if (wdt_ret != ESP_OK && wdt_ret != ESP_ERR_INVALID_STATE) {
        ESP_LOGW(TAG, "task watchdog init failed: %s", esp_err_to_name(wdt_ret));
    }

    wdt_ret = esp_task_wdt_add(NULL);
    const bool wdt_subscribed = (wdt_ret == ESP_OK);
    if (!wdt_subscribed) {
        ESP_LOGW(TAG, "task watchdog subscribe failed: %s", esp_err_to_name(wdt_ret));
    }

    int16_t *audio_buffer = malloc(BUFFER_SIZE * sizeof *audio_buffer);
    if (audio_buffer == NULL) {
        ESP_LOGE(TAG, "out of memory for the audio buffer");
        if (wdt_subscribed) {
            esp_task_wdt_delete(NULL);
        }
        detect_task_running = false;
        vTaskDelete(NULL);
        return;
    }

    const uint32_t frame_period_ms = BUFFER_SIZE * 1000 / SAMPLE_RATE;
    uint32_t last_wdt_feed_ms = now_ms();
    uint32_t loop_count = 0;

    while (!detect_task_stop) {
        loop_count++;
        const uint32_t frame_start_ms = now_ms();

        if (wdt_subscribed && frame_start_ms - last_wdt_feed_ms >= WDT_FEED_INTERVAL_MS) {
            esp_task_wdt_reset();
            last_wdt_feed_ms = frame_start_ms;
        }

        /* Do not sample at all while detection is switched off. */
        if (!is_listening) {
            vTaskDelay(pdMS_TO_TICKS(IDLE_POLL_MS));
            continue;
        }

        if (get_audio_data(audio_buffer, BUFFER_SIZE) != ESP_OK) {
            memset(audio_buffer, 0, BUFFER_SIZE * sizeof *audio_buffer);
        }

        /* Snapshot the mode: the detector clears is_training_mode when it
         * stores the last training sample, and that sample must not be
         * reported as a wake-up. */
        const bool training = is_training_mode;
        voice_activity_state_t activity_state = {0};
        bool wake_detected;

        if (!training && custom_wake_config.pattern_type == 1) {
            wake_detected = detect_pattern_wake_word(audio_buffer, BUFFER_SIZE);
        } else {
            wake_detected = detect_custom_wake_word(audio_buffer, BUFFER_SIZE, &activity_state);
        }

        if (wake_detected && !training) {
            uint32_t duration_ms = 0;
            if (activity_state.is_active) {
                duration_ms = activity_state.end_time - activity_state.start_time;
            }
            if (wakeup_callback != NULL) {
                wakeup_callback(custom_wake_config.wake_word, activity_state.max_energy,
                                duration_ms);
            }
            /* Cool-down so one utterance is not reported twice. */
            vTaskDelay(pdMS_TO_TICKS(WAKE_COOLDOWN_MS));
        }

        const uint32_t elapsed_ms = now_ms() - frame_start_ms;
        if (elapsed_ms < frame_period_ms) {
            vTaskDelay(pdMS_TO_TICKS(frame_period_ms - elapsed_ms));
        }

        if (loop_count % 10 == 0) {
            taskYIELD();
        }
    }

    free(audio_buffer);
    if (wdt_subscribed) {
        esp_task_wdt_delete(NULL);
    }
    detect_task_running = false;
    vTaskDelete(NULL);
}

/**
 * Initialise the module, create the detection task and start listening.
 *
 * @return ESP_OK on success, ESP_ERR_INVALID_STATE when the detection task is
 *         already running, the ADC error when initialisation fails, ESP_FAIL
 *         when the task cannot be created (the ADC is released again).
 */
esp_err_t esp_sr_project_init(void)
{
    if (detect_task_running) {
        return ESP_ERR_INVALID_STATE;
    }

    esp_err_t ret = esp_sr_wn_init();
    if (ret != ESP_OK) {
        return ret;
    }

    detect_task_stop = false;
    detect_task_running = true;
    is_listening = true;

    BaseType_t task_ret = xTaskCreate(
        esp_sr_wn_detect_task,
        "sr_detect_task",
        8192,
        NULL,
        5,
        NULL
    );
    if (task_ret != pdPASS) {
        is_listening = false;
        detect_task_running = false;
        adc_deinit();
        adc_initialized = false;
        return ESP_FAIL;
    }

    return ESP_OK;
}

/**
 * Stop the detection task and de-initialise the ADC.
 *
 * Waits up to TASK_STOP_TIMEOUT_MS for the task to exit so that it is not
 * reading the ADC while the driver is torn down. If the task does not exit in
 * time (for instance when called from the wake-up callback, which runs inside
 * the task itself), the ADC is released anyway and the task exits on its next
 * loop iteration.
 *
 * @return ESP_OK, or the error reported by adc_deinit().
 */
esp_err_t esp_sr_wn_deinit(void)
{
    is_listening = false;

    if (detect_task_running) {
        detect_task_stop = true;
        for (uint32_t waited_ms = 0;
             detect_task_running && waited_ms < TASK_STOP_TIMEOUT_MS;
             waited_ms += 10) {
            delay_ms(10);
        }
        if (detect_task_running) {
            ESP_LOGW(TAG, "detection task did not stop in time");
        }
    }

    esp_err_t ret = ESP_OK;
    if (adc_initialized) {
        ret = adc_deinit();
        adc_initialized = false;
    }
    return ret;
}

/**
 * Manually set the wake word label and its detection parameters.
 *
 * @return ESP_ERR_INVALID_ARG when the word is NULL or longer than 31 bytes,
 *         when the threshold is not positive, or when the minimum duration
 *         exceeds the maximum; such a configuration could never produce a
 *         valid detection. ESP_OK otherwise.
 */
esp_err_t esp_sr_set_custom_wake_word(const char *wake_word, int32_t energy_threshold,
                                      uint32_t min_duration_ms, uint32_t max_duration_ms)
{
    if (wake_word == NULL ||
        strlen(wake_word) > sizeof(custom_wake_config.wake_word) - 1) {
        return ESP_ERR_INVALID_ARG;
    }
    if (energy_threshold <= 0 || min_duration_ms > max_duration_ms) {
        return ESP_ERR_INVALID_ARG;
    }

    copy_wake_word(custom_wake_config.wake_word, sizeof(custom_wake_config.wake_word),
                   wake_word);
    custom_wake_config.energy_threshold = energy_threshold;
    custom_wake_config.min_duration_ms = min_duration_ms;
    custom_wake_config.max_duration_ms = max_duration_ms;

    return ESP_OK;
}

/**
 * Enable detection.
 *
 * @return ESP_ERR_INVALID_STATE when the ADC has not been initialised.
 */
esp_err_t esp_sr_start_listening(void)
{
    if (!adc_initialized) {
        return ESP_ERR_INVALID_STATE;
    }

    is_listening = true;
    return ESP_OK;
}

/** Disable detection. Always returns ESP_OK. */
esp_err_t esp_sr_stop_listening(void)
{
    is_listening = false;
    return ESP_OK;
}

/**
 * Enter training mode: the next @p sample_count accepted utterances are used
 * to derive the threshold and the duration limits, after which training mode
 * ends automatically.
 *
 * @return ESP_ERR_INVALID_ARG when the word is NULL or longer than 31 bytes,
 *         or when sample_count is outside 1..20. ESP_OK otherwise.
 */
esp_err_t esp_sr_enter_training_mode(const char *wake_word, uint8_t sample_count)
{
    if (wake_word == NULL || strlen(wake_word) > sizeof(training_word) - 1) {
        return ESP_ERR_INVALID_ARG;
    }
    if (sample_count == 0 || sample_count > MAX_TRAINING_SAMPLES) {
        return ESP_ERR_INVALID_ARG;
    }

    copy_wake_word(training_word, sizeof(training_word), wake_word);
    training_samples = sample_count;
    collected_samples = 0;
    is_training_mode = true;

    return ESP_OK;
}

/** Return a copy of the active wake word configuration. */
wake_word_config_t esp_sr_get_current_config(void)
{
    return custom_wake_config;
}

/**
 * Register the function called on every successful wake-up.
 * Passing NULL disables the notification. Always returns ESP_OK.
 */
esp_err_t esp_sr_set_wakeup_callback(wakeup_callback_t callback)
{
    wakeup_callback = callback;
    return ESP_OK;
}
AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。
更多推荐


所有评论(0)