1 AI-MuleRun

咨询:给出运用 ESP32 进行本地特定人语音识别和人脸图像识别的最简软硬件实现方案,以及 MicroPython 的具体编码实现,并说明所使用 AI 模型的获取与嵌入方法。

2 硬件方案

组件

型号推荐

作用

主控

ESP32-S3-WROOM-1 (N16R8)

16MB Flash + 8MB PSRAM,支持向量指令加速

摄像头

OV2640(DVP 接口)

人脸图像采集

麦克风

INMP441(I2S 数字麦)

语音采集

喇叭(可选)

MAX98357A + 小喇叭

语音反馈

开发板

ESP32-S3-EYE 或 Freenove ESP32-S3-WROOM

集成摄像头和麦克风

2.1 核心选型

2.2 接线概要

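接线(分立元件时,引脚与下文 ai_face.c、ai_voice.c、main.py 中的配置一致):

- OV2640:XCLK→GPIO10,SIOD(SDA)→GPIO14,SIOC(SCL)→GPIO15,D0–D7→GPIO2–GPIO9,VSYNC→GPIO12,HREF→GPIO13,PCLK→GPIO11,PWDN/RESET 不接

- INMP441:SCK(BCLK)→GPIO17,WS→GPIO16,SD→GPIO18

- 指示与控制:绿色 LED→GPIO48,红色 LED→GPIO47,继电器→GPIO46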

3 AI 模型获取与嵌入

3.1 人脸识别模型

阶段

模型

来源

大小

人脸检测

MSR01 + MNP01(ESP-DL 的 HumanFaceDetect;旧版 ESP-WHO 中为 MTMN,检测模型并非 MobileFaceNet)

ESP-WHO 内置

~500KB

人脸特征提取

MobileFaceNet

ESP-WHO 内置

~900KB

获取方式:

# Clone the ESP-WHO repository (ships with pre-trained models)

git clone --recursive https://github.com/espressif/esp-who.git

# The models are located at:

# esp-who/components/modules/ai/who_human_face_detection/

# esp-who/components/modules/ai/who_human_face_recognition/

模型以 C 数组形式嵌入固件,编译时链接到 Flash,运行时按需加载到 PSRAM。

3.2 语音识别模型

功能

模型

来源

说明

唤醒词

WakeNet

ESP-SR 内置

支持自定义唤醒词

命令词识别

MultiNet

ESP-SR 内置

支持约200条中英文命令

声纹识别(说话人)

自定义 MFCC + DTW

自行实现

特定人识别

获取方式:

# Fetch ESP-SR through the ESP-IDF component manager

idf.py add-dependency "espressif/esp-sr"

# Or clone it directly:

git clone https://github.com/espressif/esp-sr.git

ESP-SR 的模型文件在编译时打包,并烧录到分区表中名为 model 的 Flash 分区;运行时由 esp_srmodel_init("model") 读取模型列表,再按需加载到 PSRAM(与下文 ai_voice.c 的调用一致)。

3.3 模型嵌入流程

训练好的模型 (.tflite / .onnx)

        │

        ▼

量化为 INT8 (TFLite Micro 格式)

        │

        ▼

xxd -i model.tflite > model_data.h   ← 转为 C 数组

        │

        ▼

编译进固件 Flash,运行时加载到 PSRAM

4 软件架构

┌─────────────────────────────────────┐

│         MicroPython 业务层          │

│  (主循环、状态机、事件分发、GPIO)     │

├─────────────────────────────────────┤

│      C User Module (桥接层)         │

│  face_recognizer.c / voice_cmd.c   │

├──────────────┬──────────────────────┤

│  ESP-WHO     │    ESP-SR            │

│  人脸检测     │    唤醒词+命令词      │

│  人脸识别     │    声纹特征提取       │

├──────────────┴──────────────────────┤

│  ESP-IDF (FreeRTOS, I2S, DVP驱动)  │

└─────────────────────────────────────┘

5 具体编码实现

5.1 C桥接(模块编译进MicroPython固件)

需要自定义编译 MicroPython 固件,加入 C 扩展模块。

文件结构:

micropython/

├── ports/esp32/

│   ├── boards/ESP32_S3_AI/

│   │   ├── mpconfigboard.h

│   │   └── mpconfigboard.cmake

│   └── usermods/

│       ├── ai_face/

│       │   ├── micropython.cmake

│       │   └── ai_face.c

│       └── ai_voice/

│           ├── micropython.cmake

│           └── ai_voice.c

#include "py/runtime.h"

#include "py/obj.h"

#include "esp_camera.h"

#include "who_human_face_detection.h"

#include "who_human_face_recognition.h"

#include "face_recognition_tool.h"

ai_face.c(核心桥接):

// Module state: the enrolled-face list handed to the face_id_* helpers, and a
// flag recording whether ai_face_init() has already completed once.
static face_id_list id_list = {0};
static bool camera_inited = false;

// ai_face.init(): configure the camera and the face-ID list.
// Returns True. Calling it again after a successful init is a no-op.
// Raises RuntimeError("camera init fail") when esp_camera_init() does not
// return ESP_OK.
STATIC mp_obj_t ai_face_init(void) {
    if (!camera_inited) {
        camera_config_t cam_cfg = {
            // Power-down and reset lines are not wired.
            .pin_pwdn = -1,
            .pin_reset = -1,
            // Clock and SCCB control bus.
            .pin_xclk = 10,
            .pin_sccb_sda = 14,
            .pin_sccb_scl = 15,
            // Parallel data bus D0..D7 on GPIO2..GPIO9.
            .pin_d0 = 2,
            .pin_d1 = 3,
            .pin_d2 = 4,
            .pin_d3 = 5,
            .pin_d4 = 6,
            .pin_d5 = 7,
            .pin_d6 = 8,
            .pin_d7 = 9,
            // Sync signals.
            .pin_vsync = 12,
            .pin_href = 13,
            .pin_pclk = 11,
            // 20 MHz XCLK, 240x240 RGB565 frames, two frame buffers,
            // always grabbing the latest frame.
            .xclk_freq_hz = 20000000,
            .pixel_format = PIXFORMAT_RGB565,
            .frame_size = FRAMESIZE_240X240,
            .fb_count = 2,
            .grab_mode = CAMERA_GRAB_LATEST,
        };

        if (esp_camera_init(&cam_cfg) != ESP_OK) {
            mp_raise_msg(&mp_type_RuntimeError, MP_ERROR_TEXT("camera init fail"));
        }

        // 10 is presumably the list capacity (the original comment says
        // "stores at most 10") -- confirm against face_id_init().
        face_id_init(&id_list, 10, ENROLL_CONFIRM_TIMES);
        camera_inited = true;
    }
    return mp_const_true;
}

STATIC MP_DEFINE_CONST_FUN_OBJ_0(ai_face_init_obj, ai_face_init);

// ai_face.enroll(name): grab one frame, detect a face and enroll it under
// `name`.
//
// Returns an int:
//   value of face_id_enroll()  when a face was found (the Python caller
//                              treats >= 0 as success),
//   -1                         when no frame buffer could be obtained,
//   -2                         when no face was detected in the frame.
// Raises TypeError (via mp_obj_str_get_str) when `name` is not a str.
STATIC mp_obj_t ai_face_enroll(mp_obj_t name_obj) {
    // Validate the argument first so a bad call never holds a frame buffer.
    const char *name = mp_obj_str_get_str(name_obj);

    camera_fb_t *fb = esp_camera_fb_get();
    if (fb == NULL) {
        return mp_obj_new_int(-1);
    }

    // Detect faces; only the first box is used for enrollment.
    box_array_t *boxes = face_detect(fb->buf, fb->width, fb->height);

    int id = -2;  // "no face detected" unless enrollment runs below
    if (boxes != NULL && boxes->len > 0) {
        id = face_id_enroll(&id_list, fb->buf, fb->width, fb->height,
                            &boxes->box[0], name);
    }

    // Single release path: the frame buffer and the box array are freed on
    // every outcome, including a non-NULL array with zero faces.
    esp_camera_fb_return(fb);
    free(boxes);

    return mp_obj_new_int(id);
}

STATIC MP_DEFINE_CONST_FUN_OBJ_1(ai_face_enroll_obj, ai_face_enroll);

// ai_face.recognize(): grab one frame and try to match the first detected
// face against the enrolled list.
//
// Returns the matched name as a str when face_id_recognize() returns >= 0.
// Returns None when no frame is available, no face is detected, or nothing
// matches.
STATIC mp_obj_t ai_face_recognize(void) {
    camera_fb_t *fb = esp_camera_fb_get();
    if (fb == NULL) {
        return mp_const_none;
    }

    box_array_t *boxes = face_detect(fb->buf, fb->width, fb->height);

    // Zero-initialised so info.name is an empty string, never stack garbage,
    // if the recognizer leaves it untouched.
    face_id_info_t info = {0};
    int matched = -1;
    if (boxes != NULL && boxes->len > 0) {
        matched = face_id_recognize(&id_list, fb->buf, fb->width, fb->height,
                                    &boxes->box[0], &info);
    }

    // Release native resources before creating any Python object, so an
    // allocation failure in mp_obj_new_str() cannot leak them. This path also
    // frees a non-NULL box array that contains zero faces.
    esp_camera_fb_return(fb);
    free(boxes);

    if (matched < 0) {
        return mp_const_none;
    }
    return mp_obj_new_str(info.name, strlen(info.name));
}

STATIC MP_DEFINE_CONST_FUN_OBJ_0(ai_face_recognize_obj, ai_face_recognize);

// Module definition: globals table, module object and registration for the
// Python-visible `ai_face` module.

// Maps the Python attribute names (__name__, init, enroll, recognize) to the
// function objects defined above.
STATIC const mp_rom_map_elem_t ai_face_globals_table[] = {

    { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ai_face) },

    { MP_ROM_QSTR(MP_QSTR_init),      MP_ROM_PTR(&ai_face_init_obj) },

    { MP_ROM_QSTR(MP_QSTR_enroll),    MP_ROM_PTR(&ai_face_enroll_obj) },

    { MP_ROM_QSTR(MP_QSTR_recognize), MP_ROM_PTR(&ai_face_recognize_obj) },

};

STATIC MP_DEFINE_CONST_DICT(ai_face_globals, ai_face_globals_table);

// Module object whose globals are the dictionary built from the table above.
const mp_obj_module_t ai_face_module = {

    .base = { &mp_type_module },

    .globals = (mp_obj_dict_t *)&ai_face_globals,};

// Registers the module under the name `ai_face`.
MP_REGISTER_MODULE(MP_QSTR_ai_face, ai_face_module);

ai_voice.c(语音桥接,类似结构):

#include "py/runtime.h"

#include "esp_sr.h"

#include "esp_mn_speech_commands.h"

#include "driver/i2s_std.h"

#include "esp_wn_iface.h"

#include "esp_mn_iface.h"

static esp_mn_iface_t *multinet = NULL;

static model_iface_data_t *mn_data = NULL;

static srmodel_list_t *sr_models = NULL;

// 初始化语音识别引擎

STATIC mp_obj_t ai_voice_init(void) {

    // I2S 配置

    i2s_chan_handle_t rx_handle;

    i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_0, I2S_ROLE_MASTER);

    i2s_new_channel(&chan_cfg, NULL, &rx_handle);

    i2s_std_config_t std_cfg = {

        .clk_cfg  = I2S_STD_CLK_DEFAULT_CONFIG(16000),

        .slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT,

                                                         I2S_SLOT_MODE_MONO),

        .gpio_cfg = {

            .bclk = GPIO_NUM_17,

            .ws   = GPIO_NUM_16,

            .din  = GPIO_NUM_18,

        },

    };

    i2s_channel_init_std_mode(rx_handle, &std_cfg);

    i2s_channel_enable(rx_handle);

    // 加载模型

    sr_models = esp_srmodel_init("model");

    char *mn_name = esp_srmodel_filter(sr_models, ESP_MN_PREFIX, ESP_MN_CHINESE);

    multinet = esp_mn_handle_from_name(mn_name);

    mn_data = multinet->create(mn_name, 6000);

    // 添加命令词

    esp_mn_commands_update_from_sdkconfig(multinet, mn_data);

    return mp_const_true;}

STATIC MP_DEFINE_CONST_FUN_OBJ_0(ai_voice_init_obj, ai_voice_init);

// ai_voice.listen(): block until MultiNet reports one command or its
// detection timeout expires.
//
// Audio is read in chunks of the size the model asks for
// (get_samp_chunksize) and fed to detect() until the state leaves
// ESP_MN_STATE_DETECTING.
//
// Returns the first command id on ESP_MN_STATE_DETECTED, otherwise -1
// (timeout, or an I2S read error / short read).
// Raises RuntimeError if ai_voice.init() has not succeeded, MemoryError if
// the audio buffer cannot be allocated.
STATIC mp_obj_t ai_voice_listen(void) {
    if (multinet == NULL || mn_data == NULL) {
        mp_raise_msg(&mp_type_RuntimeError, MP_ERROR_TEXT("voice not initialized"));
    }

    const int chunk_samples = multinet->get_samp_chunksize(mn_data);
    if (chunk_samples <= 0) {
        return mp_obj_new_int(-1);
    }
    const size_t chunk_bytes = (size_t)chunk_samples * sizeof(int16_t);

    int16_t *buffer = heap_caps_malloc(chunk_bytes, MALLOC_CAP_SPIRAM);
    if (buffer == NULL) {
        mp_raise_msg(&mp_type_MemoryError, MP_ERROR_TEXT("audio buffer alloc fail"));
    }

    int command_id = -1;
    for (;;) {
        size_t bytes_read = 0;
        esp_err_t err = i2s_channel_read(rx_handle, buffer, chunk_bytes,
                                         &bytes_read, portMAX_DELAY);
        if (err != ESP_OK || bytes_read != chunk_bytes) {
            break;  // never feed a partial or failed read to the model
        }

        esp_mn_state_t state = multinet->detect(mn_data, buffer);
        if (state == ESP_MN_STATE_DETECTING) {
            continue;  // the model needs more audio
        }
        if (state == ESP_MN_STATE_DETECTED) {
            esp_mn_results_t *result = multinet->get_results(mn_data);
            if (result != NULL && result->num > 0) {
                command_id = result->command_id[0];
            }
        }
        break;  // detected or timed out
    }

    // Reset the detector so the next call starts from a clean state.
    multinet->clean(mn_data);
    heap_caps_free(buffer);

    return mp_obj_new_int(command_id);
}

STATIC MP_DEFINE_CONST_FUN_OBJ_0(ai_voice_listen_obj, ai_voice_listen);

// Globals table for the Python-visible `ai_voice` module: maps __name__,
// init and listen to the function objects defined above.
STATIC const mp_rom_map_elem_t ai_voice_globals_table[] = {

    { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ai_voice) },

    { MP_ROM_QSTR(MP_QSTR_init),   MP_ROM_PTR(&ai_voice_init_obj) },

    { MP_ROM_QSTR(MP_QSTR_listen), MP_ROM_PTR(&ai_voice_listen_obj) },};

STATIC MP_DEFINE_CONST_DICT(ai_voice_globals, ai_voice_globals_table);

// Module object whose globals are the dictionary built from the table above.
const mp_obj_module_t ai_voice_module = {

    .base = { &mp_type_module },

    .globals = (mp_obj_dict_t *)&ai_voice_globals,};

// Registers the module under the name `ai_voice`.
MP_REGISTER_MODULE(MP_QSTR_ai_voice, ai_voice_module);

5.2 固件编译

# 1. Get the MicroPython sources
git clone https://github.com/micropython/micropython.git
cd micropython
git submodule update --init

# 2. Set up the ESP-IDF environment (v5.1+)
source ~/esp/esp-idf/export.sh

# 3. Place the user modules under ports/esp32/usermods/
#    NOTE(review): USER_C_MODULES below points at usermods/micropython.cmake,
#    a top-level file expected to include ai_face/micropython.cmake and
#    ai_voice/micropython.cmake; the file tree above does not list it, so
#    create it if it is missing.

# 4. Build, selecting the user modules and the board definition.
#    BOARD must match the directory name under boards/ (ESP32_S3_AI above).
#    An absolute module path avoids relative-path resolution surprises.
#    NOTE(review): FROZEN_MANIFEST=manifest.py is kept from the original;
#    confirm that the file exists at the path the build resolves it against.
cd ports/esp32
make USER_C_MODULES="$(pwd)/usermods/micropython.cmake" \
     BOARD=ESP32_S3_AI \
     FROZEN_MANIFEST=manifest.py

# 5. Flash (the build directory is named build-<BOARD>)
esptool.py --port /dev/ttyUSB0 write_flash -z 0 build-ESP32_S3_AI/firmware.bin

5.3 MicroPython业务层代码

"""main.py — ESP32-S3 本地人脸+语音识别主程序

烧录到 ESP32-S3 MicroPython 文件系统根目录"""

import ai_face
import ai_voice
import machine
import time

# -- Hardware --
led_green = machine.Pin(48, machine.Pin.OUT)  # recognition succeeded
led_red = machine.Pin(47, machine.Pin.OUT)    # recognition failed
relay = machine.Pin(46, machine.Pin.OUT)      # door lock / device control

# -- Authorized users --
AUTHORIZED_USERS = {}  # {face_id: name}

# -- Voice command mapping: command id -> spoken phrase --
VOICE_COMMANDS = {
    0: "开灯",
    1: "关灯",
    2: "开门",
    3: "关门",
    4: "注册人脸",
}

class SmartGate:
    """Local access controller combining face recognition and voice commands.

    Attributes:
        state: one of the STATE_* constants; process_voice() switches it to
            STATE_ENROLL when the "enroll face" command is heard.
        current_user: name returned by the last successful verify_face(),
            or None if nobody has been recognised yet.
    """

    STATE_IDLE = 0
    STATE_ENROLL = 1
    STATE_VERIFY = 2

    def __init__(self):
        self.state = self.STATE_IDLE
        self.current_user = None
        self._init_hardware()

    def _init_hardware(self):
        """Initialise camera and voice engine, then switch all outputs off."""
        print("[SYS] 初始化摄像头...")
        ai_face.init()
        print("[SYS] 初始化语音引擎...")
        ai_voice.init()
        led_green.off()
        led_red.off()
        relay.off()
        print("[SYS] 系统就绪")

    # ------- Face enrollment -------

    def enroll_face(self, name):
        """Enroll a new user's face from three capture attempts.

        Enrollment succeeds when at least two of the three captures succeed.
        The user is then stored in AUTHORIZED_USERS under the id returned by
        the last *successful* capture.

        Returns:
            True if enrollment succeeded, False otherwise.
        """
        print(f"[ENROLL] 开始注册: {name}")
        success_count = 0
        last_id = None  # id of the most recent successful capture
        for attempt in range(3):
            print(f"[ENROLL] 第 {attempt + 1}/3 次采集,请正对摄像头...")
            self._blink(led_green, times=2)
            time.sleep_ms(500)
            result = ai_face.enroll(name)
            if result >= 0:
                success_count += 1
                last_id = result
                print(f"[ENROLL] 采集成功 (ID={result})")
            elif result == -2:
                print("[ENROLL] 未检测到人脸,请调整位置")
            else:
                print("[ENROLL] 采集失败")
            time.sleep(1)

        if success_count >= 2:
            # Keyed by the last successful id: the final attempt may have
            # failed, and its negative code must not become a dictionary key.
            AUTHORIZED_USERS[last_id] = name
            print(f"[ENROLL] {name} 注册完成")
            self._blink(led_green, times=5)
            return True

        print(f"[ENROLL] {name} 注册失败")
        self._blink(led_red, times=5)
        return False

    # ------- Face recognition -------

    def verify_face(self):
        """Recognise the face currently in front of the camera.

        On success, stores the name in self.current_user and lights the green
        LED. On failure, flashes the red LED for 200 ms and leaves
        self.current_user unchanged.

        Returns:
            The recognised user name, or None.
        """
        name = ai_face.recognize()
        if name is None:
            led_green.off()
            led_red.on()
            time.sleep_ms(200)
            led_red.off()
            return None

        print(f"[FACE] 识别到: {name}")
        self.current_user = name
        led_green.on()
        led_red.off()
        return name

    # ------- Voice command handling -------

    def process_voice(self):
        """Listen for one voice command and act on it.

        Returns:
            The command's display name when it was accepted, or None when
            nothing was recognised or the caller is not authenticated.
        """
        cmd_id = ai_voice.listen()
        if cmd_id < 0:
            return None

        cmd_name = VOICE_COMMANDS.get(cmd_id, f"未知命令({cmd_id})")
        print(f"[VOICE] 命令: {cmd_name}")

        # The enroll command deliberately skips face verification.
        # NOTE(review): this lets anyone enroll a new face -- confirm that
        # this is intended beyond first-time setup.
        if cmd_id == 4:
            self.state = self.STATE_ENROLL
            return cmd_name

        # Every other command requires a verified identity first.
        if self.current_user is None:
            print("[AUTH] 请先进行人脸验证")
            self._blink(led_red, times=3)
            return None

        self._execute_command(cmd_id)
        return cmd_name

    def _execute_command(self, cmd_id):
        """Drive the outputs for command ids 0-3; other ids are ignored."""
        if cmd_id == 0:    # light on
            led_green.on()
        elif cmd_id == 1:  # light off
            led_green.off()
        elif cmd_id == 2:  # open door, auto-close after 5 s (blocking)
            relay.on()
            print("[ACT] 门已开启,5秒后自动关闭")
            time.sleep(5)
            relay.off()
        elif cmd_id == 3:  # close door
            relay.off()

    # ------- Helpers -------

    def _blink(self, led, times=3, interval_ms=150):
        """Blink `led` `times` times with `interval_ms` on and off periods."""
        for _ in range(times):
            led.on()
            time.sleep_ms(interval_ms)
            led.off()
            time.sleep_ms(interval_ms)

# ══════════════════════════════════
# Main loop
# ══════════════════════════════════

def main():
    """Run the gate controller until interrupted with Ctrl-C.

    Each iteration tries face recognition, listens for one voice command,
    handles a pending enrollment request, and drops the verified identity
    once no face has been recognised for 10 seconds.
    """
    gate = SmartGate()

    # First use: enroll the administrator.
    print("=== 首次使用请注册管理员 ===")
    print("对准摄像头,说 '注册人脸'")

    auth_timeout_ms = 10000
    last_seen_ms = time.ticks_ms()

    while True:
        try:
            # Each round: try face recognition first.
            if gate.verify_face() is not None:
                last_seen_ms = time.ticks_ms()
            elif (gate.current_user is not None and
                  time.ticks_diff(time.ticks_ms(), last_seen_ms) > auth_timeout_ms):
                # Clear the identity after 10 s without a recognised face, so
                # a past verification does not authorise commands forever.
                gate.current_user = None

            # Then listen for a voice command.
            gate.process_voice()

            # Handle a pending enrollment request.
            if gate.state == gate.STATE_ENROLL:
                gate.enroll_face("user_" + str(time.ticks_ms()))
                gate.state = gate.STATE_IDLE

            time.sleep_ms(100)

        except KeyboardInterrupt:
            print("\n[SYS] 系统关闭")
            break

        except Exception as e:
            # Keep the controller alive on unexpected errors; log and retry.
            print(f"[ERR] {e}")
            time.sleep(1)


if __name__ == "__main__":
    main()

5.4 声纹识别的纯MicroPython简化实现

如果不想依赖 ESP-SR 做说话人验证,可以用 MFCC + 模板匹配做极简声纹:

"""voiceprint.py - minimal MFCC-based speaker identification.

Suitable for telling apart 2-5 enrolled speakers."""

import math, array

class VoicePrint:
    """Speaker identification using MFCC features and DTW template matching.

    Attributes:
        sr: sample rate in Hz used to build the Mel filter bank.
        n_mfcc: number of cepstral coefficients kept per frame.
        templates: {name: list of MFCC frames} for enrolled speakers.
    """

    def __init__(self, sr=16000, n_mfcc=13):
        self.sr = sr
        self.n_mfcc = n_mfcc
        self.templates = {}  # {name: [mfcc_frames]}
        self._mel_filters = self._build_mel_filters(26, 512, sr)
        # Cached cos/sin tables for _power_spectrum, rebuilt when the frame
        # length changes.
        self._twiddle_n = 0
        self._cos_tab = None
        self._sin_tab = None

    def _build_mel_filters(self, n_filters, nfft, sr):
        """Build `n_filters` triangular Mel filters over nfft // 2 + 1 bins."""
        low_mel = 0
        high_mel = 2595 * math.log10(1 + (sr / 2) / 700)
        mel_points = [low_mel + i * (high_mel - low_mel) / (n_filters + 1)
                      for i in range(n_filters + 2)]
        hz_points = [700 * (10 ** (m / 2595) - 1) for m in mel_points]
        bins = [int((nfft + 1) * h / sr) for h in hz_points]

        filters = []
        for i in range(n_filters):
            left, centre, right = bins[i], bins[i + 1], bins[i + 2]
            f = array.array('f', [0.0] * (nfft // 2 + 1))
            # Rising slope. The range is empty when left == centre, so the
            # division can never be by zero.
            for j in range(left, centre):
                f[j] = (j - left) / (centre - left)
            # Falling slope, same reasoning for centre == right.
            for j in range(centre, right):
                f[j] = (right - j) / (right - centre)
            filters.append(f)
        return filters

    def extract_mfcc(self, audio_samples):
        """Extract an MFCC sequence from 16-bit PCM samples.

        Uses 512-sample frames with a hop of 256 and a Hamming window.

        Returns:
            A list with one entry per frame, each a list of up to `n_mfcc`
            coefficients. Empty when fewer than 512 samples are given.
        """
        frame_len = 512
        hop = 256
        if len(audio_samples) < frame_len:
            return []
        # +1 so the last complete frame is included.
        n_frames = (len(audio_samples) - frame_len) // hop + 1

        # Hamming window, computed once instead of once per frame.
        window = [0.54 - 0.46 * math.cos(2 * math.pi * j / (frame_len - 1))
                  for j in range(frame_len)]

        mfcc_seq = []
        for i in range(n_frames):
            start = i * hop
            frame = audio_samples[start:start + frame_len]
            windowed = [frame[j] * window[j] for j in range(frame_len)]

            # Simplified power spectrum (use ulab.numpy.fft in real use).
            power = self._power_spectrum(windowed)

            # Mel filtering followed by log compression.
            mel_energy = []
            for filt in self._mel_filters:
                e = sum(power[j] * filt[j] for j in range(len(power)))
                mel_energy.append(math.log(e + 1e-10))

            # DCT, keeping the first n_mfcc coefficients.
            mfcc_seq.append(self._dct(mel_energy)[:self.n_mfcc])
        return mfcc_seq

    def _power_spectrum(self, frame):
        """Return the power spectrum (n // 2 + 1 bins) of `frame` by direct DFT."""
        n = len(frame)
        if n == 0:
            return []
        if self._twiddle_n != n:
            step = 2 * math.pi / n
            self._cos_tab = [math.cos(step * t) for t in range(n)]
            self._sin_tab = [math.sin(step * t) for t in range(n)]
            self._twiddle_n = n
        cos_tab = self._cos_tab
        sin_tab = self._sin_tab

        power = []
        for k in range(n // 2 + 1):
            re = 0.0
            im = 0.0
            idx = 0  # (k * j) mod n, updated incrementally
            for x in frame:
                re += x * cos_tab[idx]
                im += x * sin_tab[idx]
                idx += k
                if idx >= n:  # k <= n // 2, so one subtraction suffices
                    idx -= n
            power.append((re * re + im * im) / n)
        return power

    def _dct(self, x):
        """Return the unnormalised DCT-II of the sequence `x`."""
        n = len(x)
        return [sum(x[j] * math.cos(math.pi * (j + 0.5) * k / n) for j in range(n))
                for k in range(n)]

    def enroll(self, name, audio_samples):
        """Store the MFCC sequence of `audio_samples` as the template for `name`."""
        mfcc = self.extract_mfcc(audio_samples)
        self.templates[name] = mfcc
        print(f"[VP] 已注册 {name}, {len(mfcc)} 帧特征")

    def identify(self, audio_samples, threshold=50.0):
        """Identify the speaker of `audio_samples`.

        Returns:
            (name, distance) for the closest template when its distance is
            below `threshold`; otherwise (None, best_distance), where
            best_distance is inf if no template could be compared.
        """
        mfcc = self.extract_mfcc(audio_samples)
        best_name = None
        best_dist = float('inf')
        for name, template in self.templates.items():
            dist = self._dtw_distance(mfcc, template)
            if dist < best_dist:
                best_dist = dist
                best_name = name
        if best_dist < threshold:
            return best_name, best_dist
        return None, best_dist

    def _dtw_distance(self, s1, s2):
        """Dynamic-time-warping distance between two MFCC sequences.

        The accumulated squared-Euclidean cost is square-rooted and divided
        by the longer sequence length. Returns inf if either is empty.
        """
        n, m = len(s1), len(s2)
        if n == 0 or m == 0:
            return float('inf')

        # Memory optimisation: keep only two rows of the cost matrix.
        prev = [float('inf')] * (m + 1)
        curr = [float('inf')] * (m + 1)
        prev[0] = 0
        for i in range(1, n + 1):
            curr[0] = float('inf')
            for j in range(1, m + 1):
                cost = sum((a - b) ** 2 for a, b in zip(s1[i - 1], s2[j - 1]))
                curr[j] = cost + min(prev[j], curr[j - 1], prev[j - 1])
            prev, curr = curr, prev
        return math.sqrt(prev[m]) / max(n, m)

6 纯 MicroPython 替代方案(无需编译 C 模块)

如果不想编译自定义固件,可以用 MicroPython + HTTP 本地服务 的折中方案:

ESP32-S3 (MicroPython)  ←──WiFi──→  本地树莓派/PC

  采集图像+音频                       运行 face_recognition

  发送到局域网                        + whisper 推理

  接收识别结果                        返回结果

但这不满足"本地"的要求。真正的全本地方案必须走 C 模块路径。

7 总结

维度

方案

硬件

ESP32-S3 (N16R8) + OV2640 + INMP441

人脸模型

ESP-WHO 内置 MobileFaceNet,~1.4MB

语音模型

ESP-SR 内置 WakeNet + MultiNet,~3MB

声纹识别

MFCC + DTW(纯 Python 可实现)

开发语言

C(AI推理模块)+ MicroPython(业务逻辑)

编译工具

ESP-IDF v5.1+ 编译自定义 MicroPython 固件

识别延迟

人脸 ~300ms,语音命令 ~500ms(ESP32-S3)

成本

硬件合计约 ¥80-120

核心要点:在 ESP32-S3 上做本地 AI 推理,性能瓶颈在 C 层而非 Python 层。MicroPython 的角色是胶水语言——负责流程控制、状态管理和外设交互,AI 推理必须下沉到 C 模块。目前没有能在 ESP32 上运行、可用于本地神经网络推理的纯 MicroPython 库。

Logo

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。

更多推荐