AI龙虾聊天具身智能机器人最简方案规划二

1 AI-MuleRun
咨询:给出运用ESP32进行本地特定人语音和人脸图像识别的硬软件实现最简方案和MICROPYTHON的具体编码实现,说明所使用AI模型的获取与嵌入应用。

2 硬件方案
|
组件 |
型号推荐 |
作用 |
|
主控 |
ESP32-S3-WROOM-1 (N16R8) |
16MB Flash + 8MB PSRAM,支持向量指令加速 |
|
摄像头 |
OV2640(DVP 接口) |
人脸图像采集 |
|
麦克风 |
INMP441(I2S 数字麦) |
语音采集 |
|
喇叭(可选) |
MAX98357A + 小喇叭 |
语音反馈 |
|
开发板 |
ESP32-S3-EYE 或 Freenove ESP32-S3-WROOM |
集成摄像头和麦克风 |
2.1 核心选型

2.2 接线概要

接线(分立元件时):

3 AI 模型获取与嵌入
3.1 人脸识别模型
|
阶段 |
模型 |
来源 |
大小 |
|
人脸检测 |
MFN (Mobile FaceNet) |
ESP-WHO 内置 |
~500KB |
|
人脸特征提取 |
MobileFaceNet |
ESP-WHO 内置 |
~900KB |
获取方式:
# 克隆 ESP-WHO 仓库(含预训练模型)
git clone --recursive https://github.com/espressif/esp-who.git
# 模型位于:
# esp-who/components/modules/ai/who_human_face_detection/
# esp-who/components/modules/ai/who_human_face_recognition/
模型以 C 数组形式嵌入固件,编译时链接到 Flash,运行时按需加载到 PSRAM。
3.2 语音识别模型
|
功能 |
模型 |
来源 |
说明 |
|
唤醒词 |
WakeNet |
ESP-SR 内置 |
支持自定义唤醒词 |
|
命令词识别 |
MultiNet |
ESP-SR 内置 |
支持约200条中英文命令 |
|
声纹识别(说话人) |
自定义 MFCC + DTW |
自行实现 |
特定人识别 |
获取方式:
# ESP-SR 通过 ESP-IDF 组件管理器获取
idf.py add-dependency "espressif/esp-sr"
# 或克隆:
git clone https://github.com/espressif/esp-sr.git
模型以 C 数组形式嵌入固件,编译时链接到 Flash,运行时按需加载到 PSRAM。
3.3 模型嵌入流程
训练好的模型 (.tflite / .onnx)
│
▼
量化为 INT8 (TFLite Micro 格式)
│
▼
xxd -i model.tflite > model_data.h ← 转为 C 数组
│
▼
编译进固件 Flash,运行时加载到 PSRAM
4 软件架构
┌─────────────────────────────────────┐
│ MicroPython 业务层 │
│ (主循环、状态机、事件分发、GPIO) │
├─────────────────────────────────────┤
│ C User Module (桥接层) │
│ face_recognizer.c / voice_cmd.c │
├──────────────┬──────────────────────┤
│ ESP-WHO │ ESP-SR │
│ 人脸检测 │ 唤醒词+命令词 │
│ 人脸识别 │ 声纹特征提取 │
├──────────────┴──────────────────────┤
│ ESP-IDF (FreeRTOS, I2S, DVP驱动) │
└─────────────────────────────────────┘
5 具体编码实现
5.1 C桥接(模块编译进MicroPython固件)
需要自定义编译 MicroPython 固件,加入 C 扩展模块。
文件结构:
micropython/
├── ports/esp32/
│ ├── boards/ESP32_S3_AI/
│ │ ├── mpconfigboard.h
│ │ └── mpconfigboard.cmake
│ └── usermods/
│ ├── ai_face/
│ │ ├── micropython.cmake
│ │ └── ai_face.c
│ └── ai_voice/
│ ├── micropython.cmake
│ └── ai_voice.c
#include "py/runtime.h"
#include "py/obj.h"
#include "esp_camera.h"
#include "who_human_face_detection.h"
#include "who_human_face_recognition.h"
#include "face_recognition_tool.h"
ai_face.c(核心桥接):
// 已注册人脸特征存储 (最多存10人)
static face_id_list id_list = {0};
static bool camera_inited = false;
// 初始化摄像头
STATIC mp_obj_t ai_face_init(void) {
if (camera_inited) return mp_const_true;
camera_config_t config = {
.pin_pwdn = -1,
.pin_reset = -1,
.pin_xclk = 10,
.pin_sccb_sda = 14,
.pin_sccb_scl = 15,
.pin_d7 = 9, .pin_d6 = 8, .pin_d5 = 7, .pin_d4 = 6,
.pin_d3 = 5, .pin_d2 = 4, .pin_d1 = 3, .pin_d0 = 2,
.pin_vsync = 12,
.pin_href = 13,
.pin_pclk = 11,
.xclk_freq_hz = 20000000,
.pixel_format = PIXFORMAT_RGB565,
.frame_size = FRAMESIZE_240X240,
.fb_count = 2,
.grab_mode = CAMERA_GRAB_LATEST, };
esp_err_t err = esp_camera_init(&config);
if (err != ESP_OK) {
mp_raise_msg(&mp_type_RuntimeError, MP_ERROR_TEXT("camera init fail"));
}
face_id_init(&id_list, 10, ENROLL_CONFIRM_TIMES);
camera_inited = true;
return mp_const_true;}
STATIC MP_DEFINE_CONST_FUN_OBJ_0(ai_face_init_obj, ai_face_init);
// 注册人脸 — 传入人名,拍照提取特征存储
STATIC mp_obj_t ai_face_enroll(mp_obj_t name_obj) {
const char *name = mp_obj_str_get_str(name_obj);
camera_fb_t *fb = esp_camera_fb_get();
if (!fb) return mp_obj_new_int(-1);
// 检测人脸
box_array_t *boxes = face_detect(fb->buf, fb->width, fb->height);
if (!boxes || boxes->len == 0) {
esp_camera_fb_return(fb);
return mp_obj_new_int(-2); // 未检测到人脸 }
// 提取特征并注册
int id = face_id_enroll(&id_list, fb->buf, fb->width, fb->height,
&boxes->box[0], name);
esp_camera_fb_return(fb);
free(boxes);
return mp_obj_new_int(id);}
STATIC MP_DEFINE_CONST_FUN_OBJ_1(ai_face_enroll_obj, ai_face_enroll);
// 识别人脸 — 返回人名或 None
STATIC mp_obj_t ai_face_recognize(void) {
camera_fb_t *fb = esp_camera_fb_get();
if (!fb) return mp_const_none;
box_array_t *boxes = face_detect(fb->buf, fb->width, fb->height);
if (!boxes || boxes->len == 0) {
esp_camera_fb_return(fb);
return mp_const_none; }
face_id_info_t info;
int matched = face_id_recognize(&id_list, fb->buf, fb->width, fb->height, &boxes->box[0], &info);
esp_camera_fb_return(fb);
free(boxes);
if (matched >= 0) {
return mp_obj_new_str(info.name, strlen(info.name)); }
return mp_const_none;}
STATIC MP_DEFINE_CONST_FUN_OBJ_0(ai_face_recognize_obj, ai_face_recognize);
// 模块定义
STATIC const mp_rom_map_elem_t ai_face_globals_table[] = {
{ MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ai_face) },
{ MP_ROM_QSTR(MP_QSTR_init), MP_ROM_PTR(&ai_face_init_obj) },
{ MP_ROM_QSTR(MP_QSTR_enroll), MP_ROM_PTR(&ai_face_enroll_obj) },
{ MP_ROM_QSTR(MP_QSTR_recognize), MP_ROM_PTR(&ai_face_recognize_obj) },
};
STATIC MP_DEFINE_CONST_DICT(ai_face_globals, ai_face_globals_table);
const mp_obj_module_t ai_face_module = {
.base = { &mp_type_module },
.globals = (mp_obj_dict_t *)&ai_face_globals,};
MP_REGISTER_MODULE(MP_QSTR_ai_face, ai_face_module);
ai_voice.c(语音桥接,类似结构):
#include "py/runtime.h"
#include "esp_sr.h"
#include "esp_mn_speech_commands.h"
#include "driver/i2s_std.h"
#include "esp_wn_iface.h"
#include "esp_mn_iface.h"
static esp_mn_iface_t *multinet = NULL;
static model_iface_data_t *mn_data = NULL;
static srmodel_list_t *sr_models = NULL;
// 初始化语音识别引擎
STATIC mp_obj_t ai_voice_init(void) {
// I2S 配置
i2s_chan_handle_t rx_handle;
i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_0, I2S_ROLE_MASTER);
i2s_new_channel(&chan_cfg, NULL, &rx_handle);
i2s_std_config_t std_cfg = {
.clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000),
.slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT,
I2S_SLOT_MODE_MONO),
.gpio_cfg = {
.bclk = GPIO_NUM_17,
.ws = GPIO_NUM_16,
.din = GPIO_NUM_18,
},
};
i2s_channel_init_std_mode(rx_handle, &std_cfg);
i2s_channel_enable(rx_handle);
// 加载模型
sr_models = esp_srmodel_init("model");
char *mn_name = esp_srmodel_filter(sr_models, ESP_MN_PREFIX, ESP_MN_CHINESE);
multinet = esp_mn_handle_from_name(mn_name);
mn_data = multinet->create(mn_name, 6000);
// 添加命令词
esp_mn_commands_update_from_sdkconfig(multinet, mn_data);
return mp_const_true;}
STATIC MP_DEFINE_CONST_FUN_OBJ_0(ai_voice_init_obj, ai_voice_init);
// 监听并识别一次命令 (阻塞)
STATIC mp_obj_t ai_voice_listen(void) {
// 读取音频数据并送入模型
int16_t *buffer = heap_caps_malloc(1024 * sizeof(int16_t), MALLOC_CAP_SPIRAM);
size_t bytes_read;
i2s_channel_read(rx_handle, buffer, 1024 * 2, &bytes_read, portMAX_DELAY);
esp_mn_state_t state = multinet->detect(mn_data, buffer);
free(buffer);
if (state == ESP_MN_STATE_DETECTED) {
esp_mn_results_t *result = multinet->get_results(mn_data);
return mp_obj_new_int(result->command_id[0]); }
return mp_obj_new_int(-1);}
STATIC MP_DEFINE_CONST_FUN_OBJ_0(ai_voice_listen_obj, ai_voice_listen);
STATIC const mp_rom_map_elem_t ai_voice_globals_table[] = {
{ MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ai_voice) },
{ MP_ROM_QSTR(MP_QSTR_init), MP_ROM_PTR(&ai_voice_init_obj) },
{ MP_ROM_QSTR(MP_QSTR_listen), MP_ROM_PTR(&ai_voice_listen_obj) },};
STATIC MP_DEFINE_CONST_DICT(ai_voice_globals, ai_voice_globals_table);
const mp_obj_module_t ai_voice_module = {
.base = { &mp_type_module },
.globals = (mp_obj_dict_t *)&ai_voice_globals,};
MP_REGISTER_MODULE(MP_QSTR_ai_voice, ai_voice_module);
5.2 固件编译
# 1. 环境准备
git clone https://github.com/micropython/micropython.git
cd micropython
git submodule update --init
source ~/esp/esp-idf/export.sh# 2. 配置 ESP-IDF (需 v5.1+)
# 3. 将 usermods 放入 ports/esp32/usermods/
# 4. 编译(指定用户模块和板级配置)
cd ports/esp32
make USER_C_MODULES=usermods/micropython.cmake \
BOARD=ESP32_S3_N16R8 \
FROZEN_MANIFEST=manifest.py
esptool.py --port /dev/ttyUSB0 write_flash -z 0 build-ESP32_S3/firmware.bin# 5. 烧录
5.3 MicroPython业务层代码
"""main.py — ESP32-S3 本地人脸+语音识别主程序
烧录到 ESP32-S3 的 MicroPython 文件系统根目录"""
import ai_face;import ai_voice;import machine;import time# ── 硬件 ──
led_green = machine.Pin(48, machine.Pin.OUT) # 识别成功
led_red = machine.Pin(47, machine.Pin.OUT) # 识别失败
relay = machine.Pin(46, machine.Pin.OUT) # 门锁/设备控制
AUTHORIZED_USERS = {} # {face_id: name}# ── 授权用户 ──
# ── 语音命令映射 ──
VOICE_COMMANDS = { 0: "开灯", 1: "关灯", 2: "开门", 3: "关门",4: "注册人脸",}
class SmartGate: """融合人脸+语音的本地识别控制器"""
STATE_IDLE = 0
STATE_ENROLL = 1
STATE_VERIFY = 2
def __init__(self):
self.state = self.STATE_IDLE
self.current_user = None
self._init_hardware()
def _init_hardware(self):
print("[SYS] 初始化摄像头..."); ai_face.init()
print("[SYS] 初始化语音引擎..."); ai_voice.init()
led_green.off(); led_red.off(); relay.off()
print("[SYS] 系统就绪")
# ─────── 人脸注册 ───────
def enroll_face(self, name):
"""注册新用户人脸,需正对摄像头拍3次"""
print(f"[ENROLL] 开始注册: {name}"); success_count = 0
for attempt in range(3): print(f" 第 {attempt+1}/3 次采集,请正对摄像头...")
self._blink(led_green, times=2); e.sleep_ms(500)
result = ai_face.enroll(name)
if result >= 0:
success_count += 1; print(f" 采集成功 (ID={result})")
elif result == -2: print(" 未检测到人脸,请调整位置")
else: print(" 采集失败")
time.sleep(1) if success_count >= 2:
AUTHORIZED_USERS[result] = name
print(f"[ENROLL] {name} 注册完成")
self._blink(led_green, times=5)
return True
else:
print(f"[ENROLL] {name} 注册失败")
self._blink(led_red, times=5)
return False
# ─────── 人脸识别 ───────
def verify_face(self):
"""识别当前摄像头中的人脸,返回用户名或 None"""
name = ai_face.recognize()
if name is not None:
print(f"[FACE] 识别到: {name}")
self.current_user = name
led_green.on()
led_red.off()
return name
else:
led_green.off()
led_red.on()
time.sleep_ms(200)
led_red.off()
return None
# ─────── 语音命令处理 ───────
def process_voice(self):
"""监听一次语音命令并执行"""
cmd_id = ai_voice.listen()
if cmd_id < 0:
return None
cmd_name = VOICE_COMMANDS.get(cmd_id, f"未知命令({cmd_id})")
print(f"[VOICE] 命令: {cmd_name}")
# 注册命令不需要人脸验证
if cmd_id == 4:
self.state = self.STATE_ENROLL
return cmd_name
# 其他命令需先验证身份
if self.current_user is None:
print("[AUTH] 请先进行人脸验证")
self._blink(led_red, times=3)
return None
self._execute_command(cmd_id)
return cmd_name
def _execute_command(self, cmd_id):
if cmd_id == 0: # 开灯
led_green.on()
elif cmd_id == 1: # 关灯
led_green.off()
elif cmd_id == 2: # 开门
relay.on()
print("[ACT] 门已开启,5秒后自动关闭")
time.sleep(5)
relay.off()
elif cmd_id == 3: # 关门
relay.off()
# ─────── 工具方法 ───────
def _blink(self, led, times=3, interval_ms=150):
for _ in range(times):
led.on()
time.sleep_ms(interval_ms)
led.off()
time.sleep_ms(interval_ms)
# ══════════════════════════════════
# 主循环
# ══════════════════════════════════
def main():
gate = SmartGate()
# 首次使用:注册管理员
print("=== 首次使用请注册管理员 ===")
print("对准摄像头,说 '注册人脸'")
while True:
try:
# 每轮:先尝试人脸识别
user = gate.verify_face()
# 再监听语音命令
cmd = gate.process_voice()
# 处理注册状态
if gate.state == gate.STATE_ENROLL:
gate.enroll_face("user_" + str(time.ticks_ms()))
gate.state = gate.STATE_IDLE
# 10秒无操作清除身份
time.sleep_ms(100)
except KeyboardInterrupt:
print("\n[SYS] 系统关闭")
break
except Exception as e:
print(f"[ERR] {e}")
time.sleep(1)
if __name__ == "__main__":
main()
5.4 声纹识别的纯MicroPython简化实现
如果不想依赖 ESP-SR 做说话人验证,可以用 MFCC + 模板匹配做极简声纹:
""voiceprint.py — 基于MFCC的极简声纹识别
适用于区分2-5个已注册说话人"""
import math, array
class VoicePrint:
"""MFCC特征提取 + DTW模板匹配的声纹识别"""
def __init__(self, sr=16000, n_mfcc=13):
self.sr = sr
self.n_mfcc = n_mfcc
self.templates = {} # {name: [mfcc_frames]}
self._mel_filters = self._build_mel_filters(26, 512, sr)
def _build_mel_filters(self, n_filters, nfft, sr):
"""构建Mel滤波器组"""
low_mel = 0; high_mel = 2595 * math.log10(1 + (sr / 2) / 700)
mel_points = [low_mel + i * (high_mel - low_mel) / (n_filters + 1)
for i in range(n_filters + 2)]
hz_points = [700 * (10 ** (m / 2595) - 1) for m in mel_points]
bins = [int((nfft + 1) * h / sr) for h in hz_points]
filters = []
for i in range(n_filters):
f = array.array('f', [0.0] * (nfft // 2 + 1))
for j in range(bins[i], bins[i+1]):
if bins[i+1] != bins[i]: f[j] = (j - bins[i]) / (bins[i+1] - bins[i])
for j in range(bins[i+1], bins[i+2]): if bins[i+2] != bins[i+1]: bins[i+2] - j) / (bins[i+2] - bins[i+1])
filters.append(f)
return filters
def extract_mfcc(self, audio_samples):
"""从16bit PCM提取MFCC特征序列"""
frame_len = 512
hop = 256
n_frames = (len(audio_samples) - frame_len) // hop
mfcc_seq = []
for i in range(n_frames):
frame = audio_samples[i*hop : i*hop+frame_len] # 汉明窗
windowed = [frame[j] * (0.54 - 0.46 * math.cos(2*math.pi*j/(frame_len-1)))
for j in range(frame_len)] # 简化FFT功率谱 (实际应用中用 ulab.numpy.fft)
power = self._power_spectrum(windowed) # Mel滤波
mel_energy = []
for filt in self._mel_filters:
e = sum(power[j] * filt[j] for j in range(len(power)))
mel_energy.append(math.log(e + 1e-10)) # DCT 取前 n_mfcc 个系数
mfcc = self._dct(mel_energy)[:self.n_mfcc]; mfcc_seq.append(mfcc); return mfcc_seq
def _power_spectrum(self, frame):
"""简化功率谱估计"""
n = len(frame)
power = []
for k in range(n // 2 + 1):
re = sum(frame[j] * math.cos(2*math.pi*k*j/n) for j in range(n))
im = sum(frame[j] * math.sin(2*math.pi*k*j/n) for j in range(n))
power.append((re*re + im*im) / n)
return power
def _dct(self, x):
n = len(x)
return [sum(x[j] * math.cos(math.pi*(j+0.5)*k/n); for j in range(n)); for k in range(n)]
def enroll(self, name, audio_samples):
"""注册说话人声纹模板"""
mfcc = self.extract_mfcc(audio_samples)
self.templates[name] = mfcc
print(f"[VP] 已注册 {name}, {len(mfcc)} 帧特征")
def identify(self, audio_samples, threshold=50.0):
"""识别说话人,返回 (name, distance) 或 (None, -1)"""
mfcc = self.extract_mfcc(audio_samples)
best_name = None
best_dist = float('inf')
for name, template in self.templates.items():
dist = self._dtw_distance(mfcc, template)
if dist < best_dist: best_dist = dist; best_name = name
if best_dist < threshold: return best_name, best_dist; return None, best_dist
def _dtw_distance(self, s1, s2):
"""动态时间规整距离"""
n, m = len(s1), len(s2) # 内存优化:只保留两行
prev = [float('inf')] * (m + 1)
curr = [float('inf')] * (m + 1)
prev[0] = 0
for i in range(1, n + 1):
curr[0] = float('inf')
for j in range(1, m + 1):
cost = sum((a - b) ** 2 for a, b in zip(s1[i-1], s2[j-1]))
curr[j] = cost + min(prev[j], curr[j-1], prev[j-1])
prev, curr = curr, prev
return math.sqrt(prev[m]) / max(n, m)
6 纯MicroPython替代方案无需编译C模块
如果不想编译自定义固件,可以用 MicroPython + HTTP 本地服务 的折中方案:
ESP32-S3 (MicroPython) ←──WiFi──→ 本地树莓派/PC
采集图像+音频 运行 face_recognition
发送到局域网 + whisper 推理
接收识别结果 返回结果
但这不满足"本地"的要求。真正的全本地方案必须走 C 模块路径。
7 总结
|
维度 |
方案 |
|
硬件 |
ESP32-S3 (N16R8) + OV2640 + INMP441 |
|
人脸模型 |
ESP-WHO 内置 MobileFaceNet,~1.4MB |
|
语音模型 |
ESP-SR 内置 WakeNet + MultiNet,~3MB |
|
声纹识别 |
MFCC + DTW(纯 Python 可实现) |
|
开发语言 |
C(AI推理模块)+ MicroPython(业务逻辑) |
|
编译工具 |
ESP-IDF v5.1+ 编译自定义 MicroPython 固件 |
|
识别延迟 |
人脸 ~300ms,语音命令 ~500ms(ESP32-S3) |
|
成本 |
硬件合计约 ¥80-120 |
核心要点:ESP32-S3 上做本地 AI 推理,性能瓶颈在 C 层而非 Python 层。MicroPython 的角色是胶水语言——负责流程控制、状态管理和外设交互,AI 推理必须下沉到 C 模块。不存在纯 MicroPython 可用的本地神经网络推理库能跑在 ESP32 上。
AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。
更多推荐

所有评论(0)