add independent modules (not integrated into framework)
- modules/antaf/ — Antaf LLM provider, voice passthrough, bridge scripts - modules/tts/ — sherpa-onnx local TTS provider - modules/docs/ — integration plan These are standalone files, NOT patched into xiaozhi-server framework. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
ae260da3eb
commit
a88e7072b3
|
|
@ -0,0 +1,143 @@
|
|||
import json
|
||||
import requests
|
||||
from config.logger import setup_logging
|
||||
from core.providers.llm.base import LLMProviderBase
|
||||
|
||||
TAG = __name__
|
||||
logger = setup_logging()
|
||||
|
||||
|
||||
class LLMProvider(LLMProviderBase):
|
||||
"""
|
||||
蚂蚁阿福 LLM Provider
|
||||
通过 Frida HTTP Bridge (port 18900) 对接蚂蚁阿福 App 的文字对话 API。
|
||||
Bridge 运行在手机上,通过 adb forward 或网络暴露 SSE 流式接口。
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
self.bridge_url = config.get("bridge_url", "http://127.0.0.1:18900")
|
||||
self.timeout = config.get("timeout", 60)
|
||||
self.should_idle = False # signal to send system idle after TTS
|
||||
logger.bind(tag=TAG).info(
|
||||
f"AntafLLM 初始化: bridge={self.bridge_url}, timeout={self.timeout}s"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _is_thinking(text):
|
||||
"""检测蚂蚁阿福的内心思考/推理过程,这些不应该发给用户"""
|
||||
thinking_patterns = [
|
||||
"用户问", "用户说", "用户的", "用户可能", "用户真正",
|
||||
"我得", "我会", "我在想", "我决定", "我要",
|
||||
"语气比较", "感觉他", "让他知道", "让他觉得",
|
||||
"先安抚", "得先", "不想表现",
|
||||
"整体语气", "这样能", "这样他",
|
||||
"所以我", "还带了个",
|
||||
]
|
||||
for p in thinking_patterns:
|
||||
if p in text:
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _clean_text(text):
|
||||
"""清理阿福返回文本中的脏数据"""
|
||||
# 去掉阿福内部状态文本
|
||||
junk = [
|
||||
"完成资料引用", "内容生成", "正在思考", "正在搜索",
|
||||
]
|
||||
for j in junk:
|
||||
text = text.replace(j, "")
|
||||
return text.strip()
|
||||
|
||||
@staticmethod
|
||||
def _is_system_injected(content):
|
||||
"""检测是否为系统注入的消息(非用户真实输入)"""
|
||||
if not content:
|
||||
return True
|
||||
markers = [
|
||||
"[系统提示]", "tool_call", "<tool_call>", "TOOL USE",
|
||||
"系统提示", "工具调用", "function_call",
|
||||
"handle_exit_intent", "你有以下工具", "You have access",
|
||||
]
|
||||
for m in markers:
|
||||
if m in content:
|
||||
return True
|
||||
# 超过200字的 user 消息大概率是系统注入的
|
||||
if len(content) > 200:
|
||||
return True
|
||||
return False
|
||||
|
||||
def response(self, session_id, dialogue, **kwargs):
|
||||
# 从 dialogue 中提取真正的用户消息(跳过系统注入的 user 消息)
|
||||
query = ""
|
||||
for msg in reversed(dialogue):
|
||||
if msg.get("role") == "user":
|
||||
content = msg.get("content", "")
|
||||
if not self._is_system_injected(content):
|
||||
# ASR 结果可能是 JSON: {"content":"...", "language":"zh", "emotion":"..."}
|
||||
try:
|
||||
parsed = json.loads(content)
|
||||
if isinstance(parsed, dict) and "content" in parsed:
|
||||
query = parsed["content"]
|
||||
else:
|
||||
query = content
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
query = content
|
||||
break
|
||||
|
||||
if not query:
|
||||
logger.bind(tag=TAG).warning("对话中没有用户消息")
|
||||
yield "抱歉,我没有收到您的问题。"
|
||||
return
|
||||
|
||||
# 追加简短回答提示,避免阿福回复过长导致TTS排队卡顿
|
||||
query = query + "(请用2-3句话简短回答)"
|
||||
self.should_idle = False
|
||||
logger.bind(tag=TAG).info(f"AntafLLM 请求: {query[:50]}...")
|
||||
|
||||
try:
|
||||
url = f"{self.bridge_url}/chat"
|
||||
resp = requests.get(
|
||||
url,
|
||||
params={"q": query},
|
||||
stream=True,
|
||||
timeout=self.timeout,
|
||||
)
|
||||
resp.encoding = "utf-8"
|
||||
|
||||
seen_texts = set()
|
||||
for line in resp.iter_lines(decode_unicode=True):
|
||||
if not line:
|
||||
continue
|
||||
if line.startswith("data: "):
|
||||
data = line[6:]
|
||||
if data == "[DONE]":
|
||||
break
|
||||
if not data or len(data.strip()) == 0:
|
||||
continue
|
||||
# 去重:跳过完全相同的文本块
|
||||
if data in seen_texts:
|
||||
continue
|
||||
seen_texts.add(data)
|
||||
# 过滤思考过程
|
||||
if self._is_thinking(data):
|
||||
logger.bind(tag=TAG).debug(f"过滤思考内容: {data[:50]}...")
|
||||
continue
|
||||
# 清理脏数据
|
||||
data = self._clean_text(data)
|
||||
if not data:
|
||||
continue
|
||||
yield data
|
||||
|
||||
except requests.exceptions.ConnectionError:
|
||||
logger.bind(tag=TAG).error("无法连接蚂蚁阿福 Bridge,请检查手机和 Frida 状态")
|
||||
self.should_idle = True
|
||||
yield "抱歉,蚂蚁阿福服务暂时不可用。"
|
||||
except requests.exceptions.Timeout:
|
||||
logger.bind(tag=TAG).error(f"蚂蚁阿福 Bridge 超时 ({self.timeout}s)")
|
||||
self.should_idle = True
|
||||
yield "抱歉,回答超时了。"
|
||||
except Exception as e:
|
||||
logger.bind(tag=TAG).error(f"AntafLLM 异常: {e}")
|
||||
self.should_idle = True
|
||||
yield "抱歉,发生了错误。"
|
||||
|
|
@ -0,0 +1,251 @@
|
|||
"""
|
||||
Antaf Voice Passthrough ASR Provider
|
||||
|
||||
Replaces ASR→LLM→TTS pipeline with direct audio forwarding to Antaf voice_bridge.
|
||||
ESP32 audio → decode Opus → resample 16kHz→48kHz → inject to voice_bridge (type=3)
|
||||
voice_bridge speaker (type=0) → resample 48kHz→16kHz → encode Opus → send to ESP32
|
||||
|
||||
Runs within xiaozhi-server, keeping all protocol handling (hello, OTA, wake word) intact.
|
||||
"""
|
||||
|
||||
import json
|
||||
import struct
|
||||
import asyncio
|
||||
import threading
|
||||
import numpy as np
|
||||
import opuslib_next
|
||||
from scipy.signal import resample_poly
|
||||
from math import gcd
|
||||
from config.logger import setup_logging
|
||||
from core.providers.asr.base import ASRProviderBase
|
||||
from core.handle.sendAudioHandle import send_tts_message
|
||||
|
||||
TAG = __name__
|
||||
logger = setup_logging()
|
||||
|
||||
# Audio parameters
|
||||
ESP_SR = 16000
|
||||
ESP_FRAME_SAMPLES = 960 # 60ms at 16kHz
|
||||
BRIDGE_SR = 48000
|
||||
BRIDGE_FRAME_SAMPLES = 480 # 960 bytes / 2 = 480 samples
|
||||
|
||||
# Resample ratios
|
||||
UP = (BRIDGE_SR // gcd(BRIDGE_SR, ESP_SR), ESP_SR // gcd(BRIDGE_SR, ESP_SR)) # (3,1)
|
||||
DOWN = (ESP_SR // gcd(ESP_SR, BRIDGE_SR), BRIDGE_SR // gcd(ESP_SR, BRIDGE_SR)) # (1,3)
|
||||
|
||||
|
||||
class ASRProvider(ASRProviderBase):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.bridge_host = config.get("bridge_host", "127.0.0.1")
|
||||
self.bridge_port = int(config.get("bridge_port", 18901))
|
||||
self.interface_type = "NON_STREAM"
|
||||
self.conn = None
|
||||
self.bridge_reader = None
|
||||
self.bridge_writer = None
|
||||
self.opus_decoder = None
|
||||
self.opus_encoder = None
|
||||
self._inject_buf = np.array([], dtype=np.int16)
|
||||
self._speaker_buf = np.array([], dtype=np.int16)
|
||||
self._tts_started = False
|
||||
self._recv_task = None
|
||||
self._connected = False
|
||||
logger.bind(tag=TAG).info(
|
||||
f"AntafPassthrough 初始化: bridge={self.bridge_host}:{self.bridge_port}"
|
||||
)
|
||||
|
||||
async def open_audio_channels(self, conn):
|
||||
"""Override: connect to bridge, start passthrough instead of normal ASR."""
|
||||
# Clean up previous connection if any
|
||||
await self.close()
|
||||
|
||||
self.conn = conn
|
||||
self.opus_decoder = opuslib_next.Decoder(ESP_SR, 1)
|
||||
self.opus_encoder = opuslib_next.Encoder(ESP_SR, 1, opuslib_next.APPLICATION_AUDIO)
|
||||
self._tts_started = False
|
||||
self._silence_count = 0
|
||||
self._inject_buf = np.array([], dtype=np.int16)
|
||||
self._speaker_buf = np.array([], dtype=np.int16)
|
||||
self._write_lock = threading.Lock()
|
||||
|
||||
# Connect to voice_bridge
|
||||
try:
|
||||
self.bridge_reader, self.bridge_writer = await asyncio.open_connection(
|
||||
self.bridge_host, self.bridge_port
|
||||
)
|
||||
# Read connected event
|
||||
ftype, data = await self._bridge_recv()
|
||||
if ftype == 1:
|
||||
msg = json.loads(data.decode())
|
||||
logger.bind(tag=TAG).info(f"Bridge connected: {msg.get('protocol')}")
|
||||
|
||||
# Send start + inject_on
|
||||
self._bridge_send_cmd({"cmd": "start"})
|
||||
ftype, data = await self._bridge_recv()
|
||||
if ftype == 1:
|
||||
logger.bind(tag=TAG).info(f"Bridge: {json.loads(data.decode())}")
|
||||
|
||||
self._bridge_send_cmd({"cmd": "inject_on"})
|
||||
ftype, data = await self._bridge_recv()
|
||||
if ftype == 1:
|
||||
logger.bind(tag=TAG).info(f"Bridge: {json.loads(data.decode())}")
|
||||
|
||||
self._connected = True
|
||||
logger.bind(tag=TAG).info("Voice bridge ready (inject mode)")
|
||||
|
||||
# Start speaker receive loop
|
||||
self._recv_task = asyncio.create_task(self._speaker_recv_loop())
|
||||
|
||||
except Exception as e:
|
||||
logger.bind(tag=TAG).error(f"Bridge connection failed: {e}")
|
||||
self._connected = False
|
||||
|
||||
# Start normal audio processing thread (reads from asr_audio_queue)
|
||||
conn.asr_priority_thread = threading.Thread(
|
||||
target=self._audio_thread, args=(conn,), daemon=True
|
||||
)
|
||||
conn.asr_priority_thread.start()
|
||||
|
||||
def _audio_thread(self, conn):
|
||||
"""Read Opus frames from queue, decode, resample, inject to bridge."""
|
||||
import queue as queue_module
|
||||
frame_count = 0
|
||||
while not conn.stop_event.is_set():
|
||||
try:
|
||||
opus_data = conn.asr_audio_queue.get(timeout=1)
|
||||
if not self._connected:
|
||||
continue
|
||||
|
||||
frame_count += 1
|
||||
if frame_count <= 3 or frame_count % 200 == 0:
|
||||
logger.bind(tag=TAG).debug(f"Audio frame #{frame_count}")
|
||||
|
||||
# Decode Opus → PCM 16kHz
|
||||
pcm = self.opus_decoder.decode(opus_data, ESP_FRAME_SAMPLES)
|
||||
samples = np.frombuffer(pcm, dtype=np.int16)
|
||||
|
||||
# Resample 16kHz → 48kHz
|
||||
upsampled = resample_poly(samples, UP[0], UP[1]).astype(np.int16)
|
||||
|
||||
# Split into bridge frames and inject
|
||||
self._inject_buf = np.concatenate([self._inject_buf, upsampled])
|
||||
while len(self._inject_buf) >= BRIDGE_FRAME_SAMPLES:
|
||||
frame = self._inject_buf[:BRIDGE_FRAME_SAMPLES]
|
||||
self._inject_buf = self._inject_buf[BRIDGE_FRAME_SAMPLES:]
|
||||
self._bridge_send_inject(frame.tobytes())
|
||||
|
||||
except queue_module.Empty:
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.bind(tag=TAG).error(f"Audio thread error: {e}")
|
||||
|
||||
async def _speaker_recv_loop(self):
|
||||
"""Receive speaker PCM from bridge, resample, encode Opus, send to ESP32."""
|
||||
try:
|
||||
while self._connected:
|
||||
ftype, data = await self._bridge_recv()
|
||||
if ftype == 0:
|
||||
# Speaker audio
|
||||
await self._handle_speaker(data)
|
||||
elif ftype == 1:
|
||||
msg = json.loads(data.decode())
|
||||
logger.bind(tag=TAG).debug(f"Bridge event: {msg}")
|
||||
except asyncio.IncompleteReadError:
|
||||
logger.bind(tag=TAG).warning("Bridge connection closed")
|
||||
except Exception as e:
|
||||
logger.bind(tag=TAG).error(f"Speaker recv error: {e}")
|
||||
finally:
|
||||
self._connected = False
|
||||
|
||||
async def _handle_speaker(self, pcm_bytes):
|
||||
"""Process speaker frame and send to ESP32."""
|
||||
if not self.conn or not self.conn.websocket:
|
||||
return
|
||||
|
||||
samples = np.frombuffer(pcm_bytes, dtype=np.int16)
|
||||
max_amp = int(np.max(np.abs(samples)))
|
||||
|
||||
# Track silence for tts stop
|
||||
if max_amp < 10:
|
||||
if self._tts_started:
|
||||
self._silence_count += 1
|
||||
# 50 frames of silence (~1 second) → send tts stop
|
||||
if self._silence_count > 50:
|
||||
try:
|
||||
await send_tts_message(self.conn, "stop")
|
||||
self.conn.client_is_speaking = False
|
||||
self._tts_started = False
|
||||
self._silence_count = 0
|
||||
logger.bind(tag=TAG).info("Sent tts stop to ESP32")
|
||||
except Exception as e:
|
||||
logger.bind(tag=TAG).error(f"Send tts stop error: {e}")
|
||||
return
|
||||
|
||||
# Reset silence counter on non-silent frame
|
||||
self._silence_count = 0
|
||||
|
||||
# Send tts start before first audio
|
||||
if not self._tts_started:
|
||||
try:
|
||||
await send_tts_message(self.conn, "start")
|
||||
self._tts_started = True
|
||||
self.conn.client_is_speaking = True
|
||||
logger.bind(tag=TAG).info("Sent tts start to ESP32")
|
||||
except Exception as e:
|
||||
logger.bind(tag=TAG).error(f"Send tts start error: {e}")
|
||||
return
|
||||
|
||||
# Resample 48kHz → 16kHz
|
||||
downsampled = resample_poly(samples, DOWN[0], DOWN[1]).astype(np.int16)
|
||||
|
||||
# Accumulate and encode
|
||||
self._speaker_buf = np.concatenate([self._speaker_buf, downsampled])
|
||||
while len(self._speaker_buf) >= ESP_FRAME_SAMPLES:
|
||||
frame = self._speaker_buf[:ESP_FRAME_SAMPLES]
|
||||
self._speaker_buf = self._speaker_buf[ESP_FRAME_SAMPLES:]
|
||||
opus_data = self.opus_encoder.encode(frame.tobytes(), ESP_FRAME_SAMPLES)
|
||||
try:
|
||||
await self.conn.websocket.send(opus_data)
|
||||
except Exception as e:
|
||||
logger.bind(tag=TAG).error(f"Send opus to ESP32 error: {e}")
|
||||
return
|
||||
|
||||
# Bridge TCP helpers
|
||||
async def _bridge_recv(self):
|
||||
header = await self.bridge_reader.readexactly(5)
|
||||
length = struct.unpack(">I", header[:4])[0]
|
||||
ftype = header[4]
|
||||
data = await self.bridge_reader.readexactly(length)
|
||||
return ftype, data
|
||||
|
||||
def _bridge_send_cmd(self, cmd):
|
||||
data = json.dumps(cmd).encode()
|
||||
header = struct.pack(">IB", len(data), 1)
|
||||
with self._write_lock:
|
||||
self.bridge_writer.write(header + data)
|
||||
|
||||
def _bridge_send_inject(self, pcm_bytes):
|
||||
header = struct.pack(">IB", len(pcm_bytes), 3)
|
||||
with self._write_lock:
|
||||
self.bridge_writer.write(header + pcm_bytes)
|
||||
|
||||
# ASR interface — never returns text, LLM/TTS never triggered
|
||||
async def receive_audio(self, conn, audio, audio_have_voice):
|
||||
"""No-op: audio is handled by _audio_thread directly from queue."""
|
||||
pass
|
||||
|
||||
async def speech_to_text(self, opus_data, session_id, audio_format="opus", artifacts=None):
|
||||
"""Never called in passthrough mode."""
|
||||
return "", None
|
||||
|
||||
async def close(self):
|
||||
self._connected = False
|
||||
if self._recv_task:
|
||||
self._recv_task.cancel()
|
||||
if self.bridge_writer:
|
||||
try:
|
||||
self._bridge_send_cmd({"cmd": "inject_off"})
|
||||
self._bridge_send_cmd({"cmd": "stop"})
|
||||
self.bridge_writer.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
|
@ -0,0 +1,307 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
ESP32 ↔ Antaf Voice Relay
|
||||
Bridges ESP32 (WebSocket/Opus) with Antaf voice_bridge (TCP/PCM).
|
||||
|
||||
ESP32 → Opus decode → resample 16kHz→48kHz → voice_bridge inject (type=3)
|
||||
ESP32 ← Opus encode ← resample 48kHz→16kHz ← voice_bridge speaker (type=0)
|
||||
|
||||
Usage: python relay.py [--ws-port 8010] [--bridge-host 127.0.0.1] [--bridge-port 18901]
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import struct
|
||||
import argparse
|
||||
import logging
|
||||
import numpy as np
|
||||
from scipy.signal import resample_poly
|
||||
from math import gcd
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
log = logging.getLogger("relay")
|
||||
|
||||
try:
|
||||
import opuslib_next as opuslib
|
||||
except ImportError:
|
||||
import opuslib
|
||||
|
||||
import websockets
|
||||
|
||||
# Audio parameters
|
||||
ESP_SAMPLE_RATE = 16000 # ESP32 Opus sample rate
|
||||
ESP_FRAME_MS = 60 # ESP32 frame duration
|
||||
ESP_FRAME_SIZE = ESP_SAMPLE_RATE * ESP_FRAME_MS // 1000 # 960 samples
|
||||
|
||||
BRIDGE_SAMPLE_RATE = 48000 # voice_bridge micIn sample rate
|
||||
BRIDGE_FRAME_BYTES = 960 # 480 samples * 2 bytes
|
||||
BRIDGE_FRAME_SAMPLES = 480
|
||||
|
||||
# Resampling ratios
|
||||
UP_GCD = gcd(BRIDGE_SAMPLE_RATE, ESP_SAMPLE_RATE) # 16000 → 48000
|
||||
UP_RATIO = (BRIDGE_SAMPLE_RATE // UP_GCD, ESP_SAMPLE_RATE // UP_GCD) # (3, 1)
|
||||
DOWN_GCD = gcd(ESP_SAMPLE_RATE, BRIDGE_SAMPLE_RATE) # 48000 → 16000
|
||||
DOWN_RATIO = (ESP_SAMPLE_RATE // DOWN_GCD, BRIDGE_SAMPLE_RATE // DOWN_GCD) # (1, 3)
|
||||
|
||||
|
||||
class BridgeClient:
|
||||
"""TCP client for voice_bridge_v7."""
|
||||
|
||||
def __init__(self, host, port):
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.reader = None
|
||||
self.writer = None
|
||||
self.on_speaker_frame = None # callback(pcm_bytes)
|
||||
self._recv_task = None
|
||||
|
||||
async def connect(self):
|
||||
self.reader, self.writer = await asyncio.open_connection(self.host, self.port)
|
||||
log.info(f"Connected to voice_bridge {self.host}:{self.port}")
|
||||
|
||||
# Read connected event
|
||||
ftype, data = await self._recv_frame()
|
||||
if ftype == 1:
|
||||
msg = json.loads(data.decode())
|
||||
log.info(f"Bridge: {msg.get('protocol')}")
|
||||
|
||||
async def _recv_frame(self):
|
||||
header = await self.reader.readexactly(5)
|
||||
length = struct.unpack(">I", header[:4])[0]
|
||||
ftype = header[4]
|
||||
data = await self.reader.readexactly(length)
|
||||
return ftype, data
|
||||
|
||||
def _send_frame(self, ftype, data):
|
||||
header = struct.pack(">IB", len(data), ftype)
|
||||
self.writer.write(header + data)
|
||||
# Note: no await drain() here — voice frames are time-sensitive,
|
||||
# TCP buffer handles backpressure
|
||||
|
||||
def send_cmd(self, cmd):
|
||||
self._send_frame(1, json.dumps(cmd).encode())
|
||||
|
||||
def send_inject(self, pcm_bytes):
|
||||
self._send_frame(3, pcm_bytes)
|
||||
|
||||
async def start_recv_loop(self):
|
||||
"""Background task: receive frames from bridge."""
|
||||
try:
|
||||
while True:
|
||||
ftype, data = await self._recv_frame()
|
||||
if ftype == 0 and self.on_speaker_frame:
|
||||
# Speaker audio
|
||||
await self.on_speaker_frame(data)
|
||||
elif ftype == 1:
|
||||
msg = json.loads(data.decode())
|
||||
log.info(f"Bridge event: {msg}")
|
||||
except asyncio.IncompleteReadError:
|
||||
log.warning("Bridge connection closed")
|
||||
except Exception as e:
|
||||
log.error(f"Bridge recv error: {e}")
|
||||
|
||||
async def setup_voice(self):
|
||||
"""Start capture and enable inject. Voice chat must already be open."""
|
||||
self.send_cmd({"cmd": "start"})
|
||||
ftype, data = await self._recv_frame()
|
||||
if ftype == 1:
|
||||
msg = json.loads(data.decode())
|
||||
log.info(f"Bridge: {msg}")
|
||||
self.send_cmd({"cmd": "inject_on"})
|
||||
ftype, data = await self._recv_frame()
|
||||
if ftype == 1:
|
||||
msg = json.loads(data.decode())
|
||||
log.info(f"Bridge: {msg}")
|
||||
log.info("Voice bridge ready (inject mode)")
|
||||
|
||||
async def close(self):
|
||||
try:
|
||||
self.send_cmd({"cmd": "inject_off"})
|
||||
self.send_cmd({"cmd": "stop"})
|
||||
except Exception:
|
||||
pass
|
||||
if self.writer:
|
||||
self.writer.close()
|
||||
|
||||
|
||||
class Relay:
|
||||
"""Main relay: ESP32 WebSocket ↔ Antaf voice_bridge TCP."""
|
||||
|
||||
def __init__(self, ws_port, bridge_host, bridge_port):
|
||||
self.ws_port = ws_port
|
||||
self.bridge_host = bridge_host
|
||||
self.bridge_port = bridge_port
|
||||
self.bridge = None
|
||||
self.ws = None
|
||||
self.opus_decoder = None
|
||||
self.opus_encoder = None
|
||||
# Buffer for resampled PCM to split into bridge frames
|
||||
self._inject_buf = np.array([], dtype=np.int16)
|
||||
# Buffer for speaker PCM to accumulate before encoding
|
||||
self._speaker_buf = np.array([], dtype=np.int16)
|
||||
self._audio_in_count = 0
|
||||
self._audio_out_count = 0
|
||||
self._tts_started = False # track if we sent tts start to ESP32
|
||||
|
||||
async def handle_esp32(self, websocket):
|
||||
"""Handle one ESP32 WebSocket connection."""
|
||||
log.info(f"ESP32 connected from {websocket.remote_address}")
|
||||
self.ws = websocket
|
||||
|
||||
# Init Opus codec
|
||||
self.opus_decoder = opuslib.Decoder(ESP_SAMPLE_RATE, 1)
|
||||
self.opus_encoder = opuslib.Encoder(ESP_SAMPLE_RATE, 1, opuslib.APPLICATION_AUDIO)
|
||||
|
||||
# Connect to voice bridge and setup voice chat first
|
||||
self.bridge = BridgeClient(self.bridge_host, self.bridge_port)
|
||||
await self.bridge.connect()
|
||||
await self.bridge.setup_voice()
|
||||
|
||||
# Now start receiving speaker audio
|
||||
self.bridge.on_speaker_frame = self._on_speaker_frame
|
||||
recv_task = asyncio.create_task(self.bridge.start_recv_loop())
|
||||
|
||||
try:
|
||||
async for message in websocket:
|
||||
if isinstance(message, str):
|
||||
# Text message from ESP32 (hello, listen, etc.)
|
||||
await self._handle_text(message)
|
||||
elif isinstance(message, bytes):
|
||||
# Opus audio from ESP32
|
||||
await self._handle_audio(message)
|
||||
except websockets.exceptions.ConnectionClosed:
|
||||
log.info("ESP32 disconnected")
|
||||
finally:
|
||||
recv_task.cancel()
|
||||
await self.bridge.close()
|
||||
self.ws = None
|
||||
log.info("Session ended")
|
||||
|
||||
async def _handle_text(self, message):
|
||||
"""Handle text messages from ESP32."""
|
||||
try:
|
||||
msg = json.loads(message)
|
||||
msg_type = msg.get("type")
|
||||
|
||||
if msg_type == "hello":
|
||||
# Respond with proper hello — must match xiaozhi protocol
|
||||
resp = {
|
||||
"type": "hello",
|
||||
"version": 1,
|
||||
"transport": "websocket",
|
||||
"session_id": "relay-session",
|
||||
"audio_params": {
|
||||
"format": "opus",
|
||||
"sample_rate": ESP_SAMPLE_RATE,
|
||||
"channels": 1,
|
||||
"frame_duration": ESP_FRAME_MS,
|
||||
},
|
||||
}
|
||||
await self.ws.send(json.dumps(resp))
|
||||
log.info(f"ESP32 hello: {msg.get('audio_params')}")
|
||||
|
||||
elif msg_type == "listen":
|
||||
state = msg.get("state")
|
||||
log.info(f"ESP32 listen: {state}")
|
||||
if state == "detect":
|
||||
text = msg.get("text", "")
|
||||
log.info(f"Wake word: {text}")
|
||||
# Don't send tts start — let ESP32 continue recording
|
||||
|
||||
elif msg_type == "abort":
|
||||
log.info("ESP32 abort")
|
||||
|
||||
except json.JSONDecodeError:
|
||||
log.warning(f"Invalid JSON from ESP32: {message[:100]}")
|
||||
|
||||
async def _handle_audio(self, opus_data):
|
||||
"""Decode Opus from ESP32, resample, inject into voice_bridge."""
|
||||
try:
|
||||
self._audio_in_count += 1
|
||||
if self._audio_in_count <= 3 or self._audio_in_count % 100 == 0:
|
||||
log.info(f"ESP32 audio frame #{self._audio_in_count}, size={len(opus_data)}")
|
||||
|
||||
# Decode Opus → PCM 16kHz mono
|
||||
pcm = self.opus_decoder.decode(opus_data, ESP_FRAME_SIZE)
|
||||
samples = np.frombuffer(pcm, dtype=np.int16)
|
||||
|
||||
# Resample 16kHz → 48kHz
|
||||
upsampled = resample_poly(samples, UP_RATIO[0], UP_RATIO[1]).astype(np.int16)
|
||||
|
||||
# Append to inject buffer and send in bridge frame sizes
|
||||
self._inject_buf = np.concatenate([self._inject_buf, upsampled])
|
||||
while len(self._inject_buf) >= BRIDGE_FRAME_SAMPLES:
|
||||
frame = self._inject_buf[:BRIDGE_FRAME_SAMPLES]
|
||||
self._inject_buf = self._inject_buf[BRIDGE_FRAME_SAMPLES:]
|
||||
self.bridge.send_inject(frame.tobytes())
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Audio inject error: {e}")
|
||||
|
||||
async def _on_speaker_frame(self, pcm_bytes):
|
||||
"""Receive speaker PCM from bridge, resample, encode Opus, send to ESP32."""
|
||||
if not self.ws or getattr(self.ws, 'closed', False):
|
||||
return
|
||||
try:
|
||||
self._audio_out_count += 1
|
||||
|
||||
samples = np.frombuffer(pcm_bytes, dtype=np.int16)
|
||||
max_amp = int(np.max(np.abs(samples)))
|
||||
|
||||
if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
|
||||
log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
|
||||
|
||||
# Only send non-silent frames to ESP32
|
||||
if max_amp < 10:
|
||||
# If we were playing and now silent for a while, send tts stop
|
||||
if self._tts_started and self._audio_out_count % 50 == 0:
|
||||
# Check later — don't stop immediately, silence gaps are normal
|
||||
pass
|
||||
return
|
||||
|
||||
# Send tts start before first audio frame
|
||||
if not self._tts_started:
|
||||
await self.ws.send(json.dumps({
|
||||
"type": "tts", "state": "start",
|
||||
"session_id": "relay-session"
|
||||
}))
|
||||
self._tts_started = True
|
||||
log.info("Sent tts start to ESP32")
|
||||
|
||||
# Resample 48kHz → 16kHz
|
||||
downsampled = resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]).astype(np.int16)
|
||||
|
||||
# Accumulate into speaker buffer, encode when we have enough
|
||||
self._speaker_buf = np.concatenate([self._speaker_buf, downsampled])
|
||||
while len(self._speaker_buf) >= ESP_FRAME_SIZE:
|
||||
frame = self._speaker_buf[:ESP_FRAME_SIZE]
|
||||
self._speaker_buf = self._speaker_buf[ESP_FRAME_SIZE:]
|
||||
# Encode PCM → Opus
|
||||
opus_data = self.opus_encoder.encode(frame.tobytes(), ESP_FRAME_SIZE)
|
||||
await self.ws.send(opus_data)
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Speaker send error: {e}")
|
||||
|
||||
async def run(self):
|
||||
log.info(f"Relay starting on ws://0.0.0.0:{self.ws_port}/xiaozhi/v1/")
|
||||
async with websockets.serve(
|
||||
self.handle_esp32, "0.0.0.0", self.ws_port,
|
||||
ping_interval=30, ping_timeout=10,
|
||||
):
|
||||
await asyncio.Future() # run forever
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="ESP32-Antaf Voice Relay")
|
||||
parser.add_argument("--ws-port", type=int, default=8010, help="WebSocket port for ESP32")
|
||||
parser.add_argument("--bridge-host", default="127.0.0.1", help="voice_bridge host")
|
||||
parser.add_argument("--bridge-port", type=int, default=18901, help="voice_bridge port")
|
||||
args = parser.parse_args()
|
||||
|
||||
relay = Relay(args.ws_port, args.bridge_host, args.bridge_port)
|
||||
asyncio.run(relay.run())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Test voice_bridge_v7 audio injection.
|
||||
Connect to voice_bridge, open voice chat, enable inject mode,
|
||||
send silence frames, and print any speaker output received.
|
||||
|
||||
Usage: python test_inject.py [host] [port]
|
||||
"""
|
||||
import socket
|
||||
import struct
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
import threading
|
||||
|
||||
HOST = sys.argv[1] if len(sys.argv) > 1 else "127.0.0.1"
|
||||
PORT = int(sys.argv[2]) if len(sys.argv) > 2 else 18901
|
||||
|
||||
FRAME_SIZE = 960 # 960 bytes per frame (480 samples * 16bit)
|
||||
|
||||
|
||||
def send_cmd(sock, cmd):
|
||||
data = json.dumps(cmd).encode("utf-8")
|
||||
header = struct.pack(">IB", len(data), 1) # type=1 text
|
||||
sock.sendall(header + data)
|
||||
|
||||
|
||||
def send_inject(sock, pcm_frame):
|
||||
header = struct.pack(">IB", len(pcm_frame), 3) # type=3 inject
|
||||
sock.sendall(header + pcm_frame)
|
||||
|
||||
|
||||
def recv_exact(sock, n):
|
||||
buf = b""
|
||||
while len(buf) < n:
|
||||
chunk = sock.recv(n - len(buf))
|
||||
if not chunk:
|
||||
return None
|
||||
buf += chunk
|
||||
return buf
|
||||
|
||||
|
||||
def recv_frame(sock):
|
||||
header = recv_exact(sock, 5)
|
||||
if header is None:
|
||||
return None, None
|
||||
length = struct.unpack(">I", header[:4])[0]
|
||||
ftype = header[4]
|
||||
if length > 1048576:
|
||||
return None, None
|
||||
data = recv_exact(sock, length)
|
||||
if data is None:
|
||||
return None, None
|
||||
return ftype, data
|
||||
|
||||
|
||||
def receiver(sock):
|
||||
"""Background thread to print received frames."""
|
||||
spk_count = 0
|
||||
while True:
|
||||
try:
|
||||
ftype, data = recv_frame(sock)
|
||||
if ftype is None:
|
||||
print("[RECV] Connection closed")
|
||||
break
|
||||
if ftype == 1: # text/json
|
||||
msg = json.loads(data.decode("utf-8"))
|
||||
print(f"[RECV] {msg}")
|
||||
elif ftype == 0: # speaker audio
|
||||
spk_count += 1
|
||||
# Check if audio is non-silent
|
||||
samples = struct.unpack(f"<{len(data)//2}h", data)
|
||||
max_amp = max(abs(s) for s in samples)
|
||||
if spk_count <= 5 or spk_count % 100 == 0 or max_amp > 500:
|
||||
print(f"[SPK] frame={spk_count} size={len(data)} max_amp={max_amp}")
|
||||
elif ftype == 2: # mic audio
|
||||
pass # ignore mic echo
|
||||
except Exception as e:
|
||||
print(f"[RECV] Error: {e}")
|
||||
break
|
||||
|
||||
|
||||
def main():
|
||||
print(f"Connecting to {HOST}:{PORT}...")
|
||||
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
sock.connect((HOST, PORT))
|
||||
print("Connected")
|
||||
|
||||
# Start receiver thread
|
||||
t = threading.Thread(target=receiver, args=(sock,), daemon=True)
|
||||
t.start()
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
# Open voice chat
|
||||
print("Opening voice chat...")
|
||||
send_cmd(sock, {"cmd": "open_voice"})
|
||||
time.sleep(3)
|
||||
|
||||
# Start capture
|
||||
print("Starting capture...")
|
||||
send_cmd(sock, {"cmd": "start"})
|
||||
time.sleep(1)
|
||||
|
||||
# Enable inject mode
|
||||
print("Enabling inject mode...")
|
||||
send_cmd(sock, {"cmd": "inject_on"})
|
||||
time.sleep(0.5)
|
||||
|
||||
# Send silence frames for 3 seconds (48kHz, 960 bytes/frame = 20ms)
|
||||
# 3 seconds = 150 frames
|
||||
print("Sending 150 silence frames (3 seconds)...")
|
||||
silence = b"\x00" * FRAME_SIZE
|
||||
for i in range(150):
|
||||
send_inject(sock, silence)
|
||||
time.sleep(0.02) # 20ms per frame
|
||||
|
||||
print("Done sending. Waiting for speaker output...")
|
||||
time.sleep(10)
|
||||
|
||||
# Stop
|
||||
send_cmd(sock, {"cmd": "inject_off"})
|
||||
send_cmd(sock, {"cmd": "stop"})
|
||||
time.sleep(1)
|
||||
|
||||
print("Test complete")
|
||||
sock.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -0,0 +1,209 @@
|
|||
// voice_bridge_v7.js — Voice Bridge with Audio Injection
|
||||
// Hook point: libantaudio.so MFAntAudio3AV2Filter::process(micIn, spkRef, out, size, &result)
|
||||
// TCP :18901
|
||||
// Frame: 4-byte len + 1-byte type + payload
|
||||
// type 0: speaker/AI audio (spkRef, downstream to client)
|
||||
// type 1: text/JSON command
|
||||
// type 2: mic audio (micIn, downstream to client)
|
||||
// type 3: inject audio (upstream from client, replaces micIn)
|
||||
|
||||
var voiceActive = false;
|
||||
var clientOS = null;
|
||||
var capturedSpk = 0, capturedMic = 0, spkBytes = 0, micBytes = 0;
|
||||
var injectMode = false; // true = replace mic with injected audio
|
||||
var injectQueue = []; // queue of PCM frames to inject
|
||||
|
||||
function wf(os, type, jArr) {
|
||||
try {
|
||||
var len = jArr.length;
|
||||
var h = Java.array("byte", [(len>>24)&0xFF,(len>>16)&0xFF,(len>>8)&0xFF,len&0xFF, type]);
|
||||
os.write(h); os.write(jArr); os.flush();
|
||||
} catch(e) {}
|
||||
}
|
||||
function wt(os, text) {
|
||||
wf(os, 1, Java.use("java.lang.String").$new(text).getBytes("UTF-8"));
|
||||
}
|
||||
|
||||
// === Hook libantaudio.so ===
|
||||
var hooked = false;
|
||||
function tryHook() {
|
||||
if (hooked) return;
|
||||
var m = Process.findModuleByName("libantaudio.so");
|
||||
if (!m) return;
|
||||
var addr = m.findExportByName("_ZN8antaudio20MFAntAudio3AV2Filter7processEPhS1_S1_iRi");
|
||||
if (!addr) return;
|
||||
hooked = true;
|
||||
|
||||
Interceptor.attach(addr, {
|
||||
onEnter: function(args) {
|
||||
if (!voiceActive || !clientOS) return;
|
||||
var size = args[4].toInt32();
|
||||
if (size <= 0) return;
|
||||
|
||||
try {
|
||||
// If inject mode, replace micIn with queued or silence
|
||||
if (injectMode) {
|
||||
if (injectQueue.length > 0) {
|
||||
var frame = injectQueue.shift();
|
||||
// Only write if frame size matches expected size
|
||||
if (frame.byteLength === size) {
|
||||
args[1].writeByteArray(frame);
|
||||
} else if (frame.byteLength > 0) {
|
||||
// Size mismatch — pad or truncate
|
||||
var buf = new ArrayBuffer(size);
|
||||
var dst = new Uint8Array(buf);
|
||||
var src = new Uint8Array(frame);
|
||||
var copyLen = Math.min(size, frame.byteLength);
|
||||
for (var k = 0; k < copyLen; k++) dst[k] = src[k];
|
||||
args[1].writeByteArray(buf);
|
||||
}
|
||||
} else {
|
||||
// No data queued — inject silence to avoid mic leak
|
||||
var silence = new ArrayBuffer(size);
|
||||
args[1].writeByteArray(silence);
|
||||
}
|
||||
}
|
||||
|
||||
// Always capture speaker/AI output (type 0)
|
||||
var spkPcm = args[2].readByteArray(size);
|
||||
var spkArr = Java.array("byte", Array.from(new Uint8Array(spkPcm)));
|
||||
wf(clientOS, 0, spkArr);
|
||||
capturedSpk++; spkBytes += size;
|
||||
|
||||
// Capture mic (type 2) only when not injecting
|
||||
if (!injectMode) {
|
||||
var micPcm = args[1].readByteArray(size);
|
||||
var micArr = Java.array("byte", Array.from(new Uint8Array(micPcm)));
|
||||
wf(clientOS, 2, micArr);
|
||||
}
|
||||
capturedMic++; micBytes += size;
|
||||
|
||||
if (capturedMic <= 3 || capturedMic % 500 === 0)
|
||||
console.log("[VOICE] mic=" + capturedMic + " spk=" + capturedSpk + " inject=" + injectQueue.length);
|
||||
} catch(e) {}
|
||||
}
|
||||
});
|
||||
console.log("[VOICE] 3AV2Filter.process hooked @ " + addr);
|
||||
}
|
||||
|
||||
[0, 1000, 3000, 5000, 10000, 15000, 20000].forEach(function(ms) { setTimeout(tryHook, ms); });
|
||||
try {
|
||||
new ApiResolver("module").enumerateMatches("exports:linker*!*dlopen*").forEach(function(d) {
|
||||
Interceptor.attach(d.address, { onLeave: function() { setTimeout(tryHook, 500); } });
|
||||
});
|
||||
} catch(e) {}
|
||||
|
||||
// === TCP Server ===
|
||||
Java.perform(function() {
|
||||
var SS = Java.use("java.net.ServerSocket");
|
||||
var JS = Java.use("java.lang.String");
|
||||
var server = SS.$new(18901);
|
||||
console.log("[VOICE] Listening :18901");
|
||||
|
||||
function openVoice(os) {
|
||||
Java.scheduleOnMainThread(function() {
|
||||
try {
|
||||
Java.choose("com.antgroup.aijk.android.ijklauncher.biz.activity.IJKActivity", {
|
||||
onMatch: function(a) {
|
||||
var fm = a.getSupportFragmentManager();
|
||||
var f = Java.use("com.antgroup.aijk.android.ijkchat.biz.voicechat.IjkVoiceChatFragment").$new();
|
||||
f.show(fm, "v");
|
||||
console.log("[VOICE] Opened");
|
||||
}, onComplete: function() {}
|
||||
});
|
||||
setTimeout(function() { wt(os, JSON.stringify({event:"voice_opened"})); }, 2000);
|
||||
} catch(e) { wt(os, JSON.stringify({event:"error",msg:""+e})); }
|
||||
});
|
||||
}
|
||||
|
||||
function closeVoice(os) {
|
||||
Java.scheduleOnMainThread(function() {
|
||||
try {
|
||||
Java.choose("com.antgroup.aijk.android.ijkchat.biz.voicechat.IjkVoiceChatFragment", {
|
||||
onMatch: function(f) { f.dismiss(); console.log("[VOICE] Closed"); },
|
||||
onComplete: function() {}
|
||||
});
|
||||
setTimeout(function() { wt(os, JSON.stringify({event:"voice_closed"})); }, 1000);
|
||||
} catch(e) { wt(os, JSON.stringify({event:"error",msg:""+e})); }
|
||||
});
|
||||
}
|
||||
|
||||
var Srv = Java.registerClass({
|
||||
name: "com.antaf.voice.S7",
|
||||
implements: [Java.use("java.lang.Runnable")],
|
||||
methods: {
|
||||
run: function() {
|
||||
while (true) {
|
||||
try {
|
||||
console.log("[VOICE] Waiting...");
|
||||
var c = server.accept();
|
||||
var is = c.getInputStream();
|
||||
var os = c.getOutputStream();
|
||||
clientOS = os;
|
||||
console.log("[VOICE] Connected");
|
||||
wt(os, JSON.stringify({
|
||||
event:"connected", protocol:"antaf-voice-v8",
|
||||
commands:["open_voice","close_voice","start","stop","status","inject_on","inject_off"],
|
||||
audio:"pcm-16bit-960b-frames",
|
||||
frameTypes:{0:"spk_ai",1:"text",2:"mic",3:"inject"}
|
||||
}));
|
||||
|
||||
while (true) {
|
||||
var hb = [];
|
||||
for (var i=0;i<5;i++) { var b=is.read(); if(b<0) throw "EOF"; hb.push(b); }
|
||||
var fl=(hb[0]<<24)|(hb[1]<<16)|(hb[2]<<8)|hb[3], ft=hb[4];
|
||||
if (fl>1048576) break;
|
||||
var pb = [];
|
||||
for (var i=0;i<fl;i++) { var b=is.read(); if(b<0) throw "EOF"; pb.push(b&0xFF); }
|
||||
|
||||
if (ft === 3) {
|
||||
// type 3: inject audio frame into micIn
|
||||
var arr = new ArrayBuffer(pb.length);
|
||||
var view = new Uint8Array(arr);
|
||||
for (var j=0;j<pb.length;j++) view[j] = pb[j];
|
||||
injectQueue.push(arr);
|
||||
}
|
||||
else if (ft === 1) {
|
||||
var pl = Java.array("byte", pb);
|
||||
var cmd = JSON.parse(JS.$new(pl,"UTF-8").toString());
|
||||
console.log("[VOICE] Cmd: " + JSON.stringify(cmd));
|
||||
if (cmd.cmd === "open_voice") openVoice(os);
|
||||
else if (cmd.cmd === "close_voice") closeVoice(os);
|
||||
else if (cmd.cmd === "start") {
|
||||
voiceActive = true;
|
||||
capturedSpk=0;capturedMic=0;spkBytes=0;micBytes=0;
|
||||
injectQueue = [];
|
||||
wt(os, JSON.stringify({event:"started",hooked:hooked}));
|
||||
}
|
||||
else if (cmd.cmd === "stop") {
|
||||
voiceActive = false;
|
||||
injectMode = false;
|
||||
injectQueue = [];
|
||||
wt(os, JSON.stringify({event:"stopped",spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
|
||||
}
|
||||
else if (cmd.cmd === "inject_on") {
|
||||
injectMode = true;
|
||||
injectQueue = [];
|
||||
wt(os, JSON.stringify({event:"inject_on"}));
|
||||
console.log("[VOICE] Inject mode ON");
|
||||
}
|
||||
else if (cmd.cmd === "inject_off") {
|
||||
injectMode = false;
|
||||
injectQueue = [];
|
||||
wt(os, JSON.stringify({event:"inject_off"}));
|
||||
console.log("[VOICE] Inject mode OFF");
|
||||
}
|
||||
else if (cmd.cmd === "status") {
|
||||
wt(os, JSON.stringify({event:"status",active:voiceActive,hooked:hooked,inject:injectMode,queue:injectQueue.length,spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch(e) { console.log("[VOICE] Ended: "+e); }
|
||||
finally { voiceActive=false; clientOS=null; injectMode=false; injectQueue=[]; }
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
Java.use("java.lang.Thread").$new(Srv.$new()).start();
|
||||
console.log("[VOICE] Ready (v7 + inject)");
|
||||
});
|
||||
|
|
@ -0,0 +1,181 @@
|
|||
// voice_bridge_v8.js — Voice Bridge with Audio Injection (attach after voice chat opened)
|
||||
//
|
||||
// STARTUP ORDER:
|
||||
// 1. Launch app: adb shell monkey -p com.antgroup.aijk.android ...
|
||||
// 2. Open voice chat manually or via adb tap
|
||||
// 3. Wait for libantaudio.so to load
|
||||
// 4. Attach frida with this script
|
||||
//
|
||||
// Hook point: libantaudio.so MFAntAudio3AV2Filter::process(micIn, spkRef, out, size, &result)
|
||||
// TCP :18901
|
||||
// Frame: 4-byte len + 1-byte type + payload
|
||||
// type 0: speaker/AI audio (spkRef, downstream to client)
|
||||
// type 1: text/JSON command
|
||||
// type 2: mic audio (micIn, downstream to client)
|
||||
// type 3: inject audio (upstream from client, replaces micIn)
|
||||
|
||||
var voiceActive = false;
|
||||
var clientOS = null;
|
||||
var capturedSpk = 0, capturedMic = 0, spkBytes = 0, micBytes = 0;
|
||||
var injectMode = false;
|
||||
var injectQueue = [];
|
||||
|
||||
function wf(os, type, jArr) {
|
||||
try {
|
||||
var len = jArr.length;
|
||||
var h = Java.array("byte", [(len>>24)&0xFF,(len>>16)&0xFF,(len>>8)&0xFF,len&0xFF, type]);
|
||||
os.write(h); os.write(jArr); os.flush();
|
||||
} catch(e) {}
|
||||
}
|
||||
function wt(os, text) {
|
||||
wf(os, 1, Java.use("java.lang.String").$new(text).getBytes("UTF-8"));
|
||||
}
|
||||
|
||||
// === Hook libantaudio.so (should already be loaded) ===
|
||||
var hooked = false;
|
||||
function tryHook() {
|
||||
if (hooked) return;
|
||||
var m = Process.findModuleByName("libantaudio.so");
|
||||
if (!m) return;
|
||||
var addr = m.findExportByName("_ZN8antaudio20MFAntAudio3AV2Filter7processEPhS1_S1_iRi");
|
||||
if (!addr) return;
|
||||
hooked = true;
|
||||
|
||||
Interceptor.attach(addr, {
|
||||
onEnter: function(args) {
|
||||
if (!voiceActive || !clientOS) return;
|
||||
var size = args[4].toInt32();
|
||||
if (size <= 0) return;
|
||||
|
||||
try {
|
||||
if (injectMode) {
|
||||
if (injectQueue.length > 0) {
|
||||
var frame = injectQueue.shift();
|
||||
if (frame.byteLength === size) {
|
||||
args[1].writeByteArray(frame);
|
||||
} else {
|
||||
var buf = new ArrayBuffer(size);
|
||||
var dst = new Uint8Array(buf);
|
||||
var src = new Uint8Array(frame);
|
||||
var copyLen = Math.min(size, frame.byteLength);
|
||||
for (var k = 0; k < copyLen; k++) dst[k] = src[k];
|
||||
args[1].writeByteArray(buf);
|
||||
}
|
||||
} else {
|
||||
var silence = new ArrayBuffer(size);
|
||||
args[1].writeByteArray(silence);
|
||||
}
|
||||
}
|
||||
|
||||
// Always capture speaker/AI output (type 0)
|
||||
var spkPcm = args[2].readByteArray(size);
|
||||
var spkArr = Java.array("byte", Array.from(new Uint8Array(spkPcm)));
|
||||
wf(clientOS, 0, spkArr);
|
||||
capturedSpk++; spkBytes += size;
|
||||
|
||||
if (!injectMode) {
|
||||
var micPcm = args[1].readByteArray(size);
|
||||
var micArr = Java.array("byte", Array.from(new Uint8Array(micPcm)));
|
||||
wf(clientOS, 2, micArr);
|
||||
}
|
||||
capturedMic++; micBytes += size;
|
||||
|
||||
if (capturedMic <= 3 || capturedMic % 500 === 0)
|
||||
console.log("[VOICE] mic=" + capturedMic + " spk=" + capturedSpk + " inject=" + injectQueue.length);
|
||||
} catch(e) {}
|
||||
}
|
||||
});
|
||||
console.log("[VOICE] process hooked @ " + addr);
|
||||
}
|
||||
|
||||
// Hook immediately — lib should already be loaded since voice chat is open
|
||||
tryHook();
|
||||
if (!hooked) {
|
||||
// Retry a few times in case of timing
|
||||
[500, 1000, 2000, 5000].forEach(function(ms) { setTimeout(tryHook, ms); });
|
||||
}
|
||||
|
||||
// === TCP Server ===
|
||||
Java.perform(function() {
|
||||
var SS = Java.use("java.net.ServerSocket");
|
||||
var JS = Java.use("java.lang.String");
|
||||
var server = SS.$new(18901);
|
||||
console.log("[VOICE] Listening :18901");
|
||||
|
||||
var Srv = Java.registerClass({
|
||||
name: "com.antaf.voice.S8",
|
||||
implements: [Java.use("java.lang.Runnable")],
|
||||
methods: {
|
||||
run: function() {
|
||||
while (true) {
|
||||
try {
|
||||
console.log("[VOICE] Waiting for client...");
|
||||
var c = server.accept();
|
||||
var is = c.getInputStream();
|
||||
var os = c.getOutputStream();
|
||||
clientOS = os;
|
||||
console.log("[VOICE] Client connected");
|
||||
wt(os, JSON.stringify({
|
||||
event:"connected", protocol:"antaf-voice-v8",
|
||||
hooked: hooked,
|
||||
commands:["start","stop","status","inject_on","inject_off"],
|
||||
audio:"pcm-16bit-960b-frames",
|
||||
frameTypes:{0:"spk_ai",1:"text",2:"mic",3:"inject"}
|
||||
}));
|
||||
|
||||
while (true) {
|
||||
var hb = [];
|
||||
for (var i=0;i<5;i++) { var b=is.read(); if(b<0) throw "EOF"; hb.push(b); }
|
||||
var fl=(hb[0]<<24)|(hb[1]<<16)|(hb[2]<<8)|hb[3], ft=hb[4];
|
||||
if (fl>1048576) break;
|
||||
var pb = [];
|
||||
for (var i=0;i<fl;i++) { var b=is.read(); if(b<0) throw "EOF"; pb.push(b&0xFF); }
|
||||
|
||||
if (ft === 3) {
|
||||
var arr = new ArrayBuffer(pb.length);
|
||||
var view = new Uint8Array(arr);
|
||||
for (var j=0;j<pb.length;j++) view[j] = pb[j];
|
||||
injectQueue.push(arr);
|
||||
}
|
||||
else if (ft === 1) {
|
||||
var pl = Java.array("byte", pb);
|
||||
var cmd = JSON.parse(JS.$new(pl,"UTF-8").toString());
|
||||
console.log("[VOICE] Cmd: " + JSON.stringify(cmd));
|
||||
if (cmd.cmd === "start") {
|
||||
voiceActive = true;
|
||||
capturedSpk=0;capturedMic=0;spkBytes=0;micBytes=0;
|
||||
injectQueue = [];
|
||||
wt(os, JSON.stringify({event:"started",hooked:hooked}));
|
||||
}
|
||||
else if (cmd.cmd === "stop") {
|
||||
voiceActive = false;
|
||||
injectMode = false;
|
||||
injectQueue = [];
|
||||
wt(os, JSON.stringify({event:"stopped",spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
|
||||
}
|
||||
else if (cmd.cmd === "inject_on") {
|
||||
injectMode = true;
|
||||
injectQueue = [];
|
||||
wt(os, JSON.stringify({event:"inject_on"}));
|
||||
console.log("[VOICE] Inject ON");
|
||||
}
|
||||
else if (cmd.cmd === "inject_off") {
|
||||
injectMode = false;
|
||||
injectQueue = [];
|
||||
wt(os, JSON.stringify({event:"inject_off"}));
|
||||
console.log("[VOICE] Inject OFF");
|
||||
}
|
||||
else if (cmd.cmd === "status") {
|
||||
wt(os, JSON.stringify({event:"status",active:voiceActive,hooked:hooked,inject:injectMode,queue:injectQueue.length,spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch(e) { console.log("[VOICE] Client disconnected: "+e); }
|
||||
finally { voiceActive=false; clientOS=null; injectMode=false; injectQueue=[]; }
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
Java.use("java.lang.Thread").$new(Srv.$new()).start();
|
||||
console.log("[VOICE] Ready (v8, no open_voice)");
|
||||
});
|
||||
|
|
@ -0,0 +1,335 @@
|
|||
# 蚂蚁阿福接入小智 ESP32 — 实施方案
|
||||
|
||||
## 项目目标
|
||||
|
||||
将蚂蚁阿福 App 的 AI 能力接入小智 ESP32 硬件终端,用户通过 ESP32 设备语音对话,
|
||||
后端对接蚂蚁阿福代替自建 LLM,省去 GPU 资源(两张 RTX 3090 + Qwen3-32B)。
|
||||
|
||||
---
|
||||
|
||||
## 系统架构
|
||||
|
||||
### 方案A:文字接入(自定义 LLM Provider)
|
||||
|
||||
```
|
||||
ESP32 设备 PlugAI 服务端 手机
|
||||
┌──────────┐ WebSocket ┌──────────────────┐ HTTP/SSE ┌─<E2948C><E29480><EFBFBD>────────────┐
|
||||
│ 麦克风 │ ──────────────→│ ASR (FunASR) │ │ 蚂蚁阿福 App │
|
||||
│ 唤<><E594A4><EFBFBD>词 │ │ 语音→文字 │ │ + Frida 注入 │
|
||||
│ AEC/NS │ │ │ GET /chat?q= │ │
|
||||
│ │ │ AntafLLM Provider│──────────────→│ HTTP Bridge │
|
||||
│ │ │ (新增) │←──────────────│ (port 18900) │
|
||||
│ │ │ │ SSE 流式回答 │ │
|
||||
│ 喇叭 │←───────────────│ TTS (EdgeTTS) │ │ │
|
||||
│ │ WebSocket │ 文字→语音 │ │ │
|
||||
└──────────┘ └──────────────────┘ └──────────────┘
|
||||
```
|
||||
|
||||
**数据流**: ESP32 音频 → FunASR(语音转文字) → AntafLLM(文<><E69687><EFBFBD>发给阿福) → EdgeTTS(回答转语音) → ESP32 播放
|
||||
|
||||
### 方案B:语音直通(替代整个 ASR+LLM+TTS)
|
||||
|
||||
```
|
||||
ESP32 设备 PlugAI 服务端 手机
|
||||
┌──────────┐ WebSocket ┌─────<E29480><E29480><EFBFBD>────────────┐ TCP 二进制 ┌──<E29480><E29480><EFBFBD>───────────┐
|
||||
│ 麦克风 │ ──────────────→│ 音频转发模块(新增) │ │ 蚂蚁<E89A82><E89A81>福 App │
|
||||
│ │ │ Opus解码 │ PCM注入mic │ + Frida <20><>入 │
|
||||
│ │ │ 重采样 24k→48k │──────────────→│ Voice Bridge │
|
||||
│ │ │ │ PCM speaker │ (port 18901) │
|
||||
│ 喇叭 │←───────────────│ 重采样 24k→24k │←──────────────│ libantaudio │
|
||||
│ │ WebSocket │ Opus编码 │ │ │
|
||||
└──────────┘ └──────────────────┘ └──────────────┘
|
||||
```
|
||||
|
||||
**数据流**: ESP32 音频 → 解码+重采样 → 注入阿福麦克风 → 阿福完整处理(ASR+LLM+TTS) → 捕获音频 → 编码 → ESP32 播放
|
||||
|
||||
---
|
||||
|
||||
## 可行性评估
|
||||
|
||||
| 维度 | 方案A (文字接入) | 方案B (语音直通) |
|
||||
|------|-----------------|-----------------|
|
||||
| 可行性 | **高** | **中低** |
|
||||
| 实现难度 | 低 (1个Python文件) | 高 (改JS+写转发模块) |
|
||||
| 改动范围 | 新增 LLM Provider + 改配置 | 改 voice_bridge.js + 新增转发模块 |
|
||||
| 延迟 | 中 (ASR+网络+TTS 各一轮) | 低 (音频直通) |
|
||||
| 音质 | EdgeTTS (微软高质量) | 阿福原生 TTS |
|
||||
| GPU 依赖 | 无 (省掉 Qwen3-32B) | 无 |
|
||||
| 手机依赖 | 需要 (App+Frida+adb) | 需要 (App+Frida+adb) |
|
||||
| 核心风险 | 低 | **voice_bridge 当前不支持音频注入** |
|
||||
|
||||
**结论**: 先实施方案A,验证通过后再做方案B。
|
||||
|
||||
---
|
||||
|
||||
## 方案A 详细实施
|
||||
|
||||
### 前置条件
|
||||
|
||||
| 组件 | 状态 | 说明 |
|
||||
|------|------|------|
|
||||
| ESP32 设备 | 已就绪 | 固件已烧录,WiFi+服务端已配置 |
|
||||
| 小智服务端 | 已就绪 | ws://14.18.247.51:8010 运行中 |
|
||||
| ASR (FunASR) | 已就绪 | CPU 模式 |
|
||||
| TTS (EdgeTTS) | 已就绪 | 微软免费 |
|
||||
| 蚂蚁阿福 HTTP Bridge | 已就绪 | http_bridge_stream.js (port 18900) |
|
||||
| Frida + 手机 | 需部署 | 手机需连到服务端可达的网络 |
|
||||
|
||||
### 第1步:创建 AntafLLM Provider
|
||||
|
||||
文件路径:`backend/main/xiaozhi-server/core/providers/llm/antaf/antaf.py`
|
||||
|
||||
```python
|
||||
import requests
|
||||
from config.logger import setup_logging
|
||||
from core.providers.llm.base import LLMProviderBase
|
||||
|
||||
TAG = __name__
|
||||
logger = setup_logging()
|
||||
|
||||
|
||||
class LLMProvider(LLMProviderBase):
|
||||
"""
|
||||
蚂蚁阿福 LLM Provider
|
||||
通过 Frida HTTP Bridge (port 18900) 对接蚂蚁阿福 App 的文字对话 API。
|
||||
Bridge 运行在手机上,通过 adb forward 或网络暴露 SSE 流式接口。
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
self.bridge_url = config.get("bridge_url", "http://127.0.0.1:18900")
|
||||
self.timeout = config.get("timeout", 60)
|
||||
logger.bind(tag=TAG).info(
|
||||
f"AntafLLM 初始化: bridge={self.bridge_url}, timeout={self.timeout}s"
|
||||
)
|
||||
|
||||
def response(self, session_id, dialogue, **kwargs):
|
||||
"""
|
||||
流式返回蚂蚁阿福的回答。
|
||||
1. 从 dialogue 取最后一条用户消息
|
||||
2. GET {bridge_url}/chat?q={query}
|
||||
3. 解析 SSE 流,yield 每个 delta 文本
|
||||
"""
|
||||
# 提取最后一条用户消息
|
||||
query = ""
|
||||
for msg in reversed(dialogue):
|
||||
if msg.get("role") == "user":
|
||||
query = msg.get("content", "")
|
||||
break
|
||||
|
||||
if not query:
|
||||
logger.bind(tag=TAG).warning("对话中没有用户消息")
|
||||
yield "抱歉,我没有收到您的问题。"
|
||||
return
|
||||
|
||||
logger.bind(tag=TAG).info(f"AntafLLM 请求: {query[:50]}...")
|
||||
|
||||
try:
|
||||
url = f"{self.bridge_url}/chat"
|
||||
resp = requests.get(
|
||||
url,
|
||||
params={"q": query},
|
||||
stream=True,
|
||||
timeout=self.timeout
|
||||
)
|
||||
resp.encoding = "utf-8"
|
||||
|
||||
for line in resp.iter_lines(decode_unicode=True):
|
||||
if not line:
|
||||
continue
|
||||
if line.startswith("data: "):
|
||||
data = line[6:] # 去掉 "data: " 前缀
|
||||
if data == "[DONE]":
|
||||
break
|
||||
if data and len(data.strip()) > 0:
|
||||
yield data
|
||||
|
||||
except requests.exceptions.ConnectionError:
|
||||
logger.bind(tag=TAG).error("无法连接蚂<E68EA5><E89A82>阿福 Bridge,请检查手机和 Frida 状态")
|
||||
yield "抱歉,蚂蚁阿福服务暂时不可用。"
|
||||
except requests.exceptions.Timeout:
|
||||
logger.bind(tag=TAG).error(f"蚂蚁阿福 Bridge 超时 ({self.timeout}s)")
|
||||
yield "抱歉,回答超时了。"
|
||||
except Exception as e:
|
||||
logger.bind(tag=TAG).error(f"AntafLLM 异常: {e}")
|
||||
yield "抱歉,发生了错误。"
|
||||
```
|
||||
|
||||
### 第2步:修改服务端配置
|
||||
|
||||
编辑 `backend/main/xiaozhi-server/data/.config.yaml`:
|
||||
|
||||
```yaml
|
||||
selected_module:
|
||||
LLM: antaf # 改为<E694B9><E4B8BA><EFBFBD>蚁阿福
|
||||
|
||||
LLM:
|
||||
antaf:
|
||||
type: antaf
|
||||
bridge_url: http://<手机IP>:18900 # 手机的 HTTP Bridge 地址
|
||||
timeout: 60 # SSE 流超时时间
|
||||
```
|
||||
|
||||
也可以保留原来的 Qwen3 配置,方便切换:
|
||||
|
||||
```yaml
|
||||
LLM:
|
||||
antaf:
|
||||
type: antaf
|
||||
bridge_url: http://<手机IP>:18900
|
||||
timeout: 60
|
||||
Qwen3:
|
||||
type: openai
|
||||
model_name: Qwen3-32B
|
||||
url: http://127.0.0.1:30000/v1
|
||||
api_key: EMPTY
|
||||
```
|
||||
|
||||
### 第3步:网络打通
|
||||
|
||||
手机的 Frida Bridge 端口需要让 PlugAI 服<><E69C8D><EFBFBD>器能访问到。有两种方式:
|
||||
|
||||
#### 方式1:手机直连局域网(推荐)
|
||||
|
||||
如果手机和 PlugAI 服务器在同一网络(或手机有公网可达 IP):
|
||||
```bash
|
||||
# 手机上启动 bridge 后,服务端直接访问
|
||||
# bridge_url: http://<手机内网IP>:18900
|
||||
curl http://<手机IP>:18900/chat?q=hello
|
||||
```
|
||||
|
||||
#### 方式2:adb forward + SSH 隧道
|
||||
|
||||
手机通过 USB 连接一台中间机器,再通过 SSH 隧道暴露<E69AB4><E99CB2><EFBFBD>
|
||||
```bash
|
||||
# 中间机器上
|
||||
adb forward tcp:18900 tcp:18900
|
||||
|
||||
# PlugAI 上建 SSH 隧道
|
||||
ssh -L 18900:127.0.0.1:18900 user@中间机器IP
|
||||
# bridge_url: http://127.0.0.1:18900
|
||||
```
|
||||
|
||||
### 第4步:启动与测试
|
||||
|
||||
```bash
|
||||
# 1. 手机端:启动 Frida + HTTP Bridge
|
||||
frida -U -p <PID> -l http_bridge_stream.js
|
||||
|
||||
# 2. 先测 bridge 连通性
|
||||
curl -N 'http://<手机IP>:18900/chat?q=你好'
|
||||
|
||||
# 3. PlugAI 服务端:重启小智服务
|
||||
cd /home/ZeroStack/xiaozhi/xiaozhi-esp32-server/main/xiaozhi-server
|
||||
source /home/ZeroStack/xiaozhi/venv/bin/activate
|
||||
python app.py
|
||||
|
||||
# 4. ESP32 设备:唤醒测试
|
||||
# 说 "你好小智" → 提问 → 应该听到蚂蚁阿福的回答(EdgeTTS 合成的语音)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 方案B 详细实施(后续)
|
||||
|
||||
### 核心改造:voice_bridge.js 支持音频注入
|
||||
|
||||
当前 voice_bridge.js 的 `MFAntAudio3AV2Filter::process` hook 只**读取** micIn 缓冲区。
|
||||
需要改造为可以从外部**写入** micIn 缓冲区,替换真实麦克风输入。
|
||||
|
||||
#### 改造要点
|
||||
|
||||
```javascript
|
||||
// voice_bridge.js 新增功能
|
||||
var injectBuffer = null; // 外部注入的 PCM 数据
|
||||
|
||||
// 新增 inject 命令:接收外部 PCM 音频帧
|
||||
// 客户端发送: [4字节长度][type=3][960字节PCM数据]
|
||||
// type 3 = inject audio
|
||||
|
||||
Interceptor.attach(processAddr, {
|
||||
onEnter: function(args) {
|
||||
var micIn = args[1]; // 麦克风输入缓冲区 (960 bytes)
|
||||
var frameSize = args[4]; // 960
|
||||
|
||||
if (injectBuffer !== null) {
|
||||
// 用注入数据覆盖真实麦克风输入
|
||||
micIn.writeByteArray(injectBuffer);
|
||||
injectBuffer = null;
|
||||
}
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
#### 采样率转换
|
||||
|
||||
| 来源 | 格式 | 需转换为 |
|
||||
|------|------|---------|
|
||||
| ESP32 → 服务端 | Opus 24kHz mono | PCM 48kHz mono (阿福 mic) |
|
||||
| 阿福 speaker 输出 | PCM 24kHz stereo | Opus 24kHz mono (ESP32) |
|
||||
|
||||
服务端需要:
|
||||
- libopus 解码/编码
|
||||
- resampy 或 scipy 做采样率转换
|
||||
- 960字节帧对齐(20ms @ 48kHz)
|
||||
|
||||
#### 新增音频转发模块
|
||||
|
||||
文件<EFBFBD><EFBFBD>径:`backend/main/xiaozhi-server/core/providers/asr/antaf_voice/antaf_voice.py`
|
||||
|
||||
这是一个特殊的 ASR Provider,它不做语音识别,而是:
|
||||
1. 接收 ESP32 的 Opus 音频流
|
||||
2. 解码为 PCM,重采样 24k→48k
|
||||
3. 通过 TCP 发送到 voice_bridge (port 18901) 的 inject 命令
|
||||
4. 接收 voice_bridge 的 speaker 输出
|
||||
5. 重采样 24k stereo → 24k mono,Opus 编码
|
||||
6. 直接发回 ESP32(跳过 LLM 和 TTS)
|
||||
|
||||
#### 方案B 风险点
|
||||
|
||||
1. **帧时序同步**: ESP32 音频帧和阿福 process() 调用频率可能不一致
|
||||
2. **延迟累积**: 网络传输 + 两次重采样 + 注入延迟
|
||||
3. **VAD 冲突**: 阿福自带 VAD 可能与注入音频不匹配
|
||||
4. **回声消除失效**: 注入 mic 数据后,阿<EFBC8C><E998BF>的 AEC 参考信号(spkRef)对不上
|
||||
5. **对话控制**: 何时 open_voice / close_voice 需要与 ESP32 唤醒状态同步
|
||||
|
||||
---
|
||||
|
||||
## 依赖清单
|
||||
|
||||
### 方案A(新增依赖)
|
||||
- `requests` — Python HTTP 库(服务端 venv 中应已有)
|
||||
|
||||
### 方案B(新增<E696B0><E5A29E>赖)
|
||||
- `opuslib` 或 `pyogg` — Opus 编解码
|
||||
- `resampy` 或 `scipy.signal` — 采样率转换
|
||||
- `numpy` — 音频数据处理
|
||||
|
||||
---
|
||||
|
||||
## 文件清单
|
||||
|
||||
### 方案A
|
||||
| 操<><E6938D> | 文件 |
|
||||
|------|------|
|
||||
| 新增 | `backend/main/xiaozhi-server/core/providers/llm/antaf/__init__.py` |
|
||||
| 新增 | `backend/main/xiaozhi-server/core/providers/llm/antaf/antaf.py` |
|
||||
| 修改 | `backend/main/xiaozhi-server/data/.config.yaml` |
|
||||
|
||||
### 方案B(额外)
|
||||
| 操作 | 文件 |
|
||||
|------|------|
|
||||
| 修改 | `antaf/voice_bridge.js` (新增 inject 命令) |
|
||||
| 新增 | `backend/main/xiaozhi-server/core/providers/asr/antaf_voice/antaf_voice.py` |
|
||||
| 新增 | `backend/main/xiaozhi-server/core/utils/audio_resample.py` |
|
||||
|
||||
---
|
||||
|
||||
## 里程碑
|
||||
|
||||
| 阶段 | 目标 | 预期产出 |
|
||||
|------|------|---------|
|
||||
| M1 | 方案A 代码实现 | AntafLLM Provider + 配置 |
|
||||
| M2 | 网络打通 | PlugAI ↔ 手机 Bridge 连通 |
|
||||
| M3 | 端到端测试 | ESP32 唤醒→阿福回答→语音播报 |
|
||||
| M4 | 方案B 原型 | voice_bridge 音频注入验证 |
|
||||
| M5 | 方案B 集成 | 全语音直通链路 |
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
import io
|
||||
import os
|
||||
import wave
|
||||
import asyncio
|
||||
import numpy as np
|
||||
import sherpa_onnx
|
||||
from config.logger import setup_logging
|
||||
from core.providers.tts.base import TTSProviderBase
|
||||
|
||||
TAG = __name__
|
||||
logger = setup_logging()
|
||||
|
||||
|
||||
class TTSProvider(TTSProviderBase):
|
||||
def __init__(self, config, delete_audio_file):
|
||||
super().__init__(config, delete_audio_file)
|
||||
model_dir = config.get("model_dir", "models/vits-melo-tts-zh_en")
|
||||
speed = config.get("speed", 1.0)
|
||||
self.speed = float(speed) if speed else 1.0
|
||||
self.sid = int(config.get("sid", 0))
|
||||
|
||||
# 优先使用 int8 量化模型(更快)
|
||||
model_file = f"{model_dir}/model.int8.onnx"
|
||||
if not os.path.exists(model_file) or os.path.getsize(model_file) < 1024:
|
||||
model_file = f"{model_dir}/model.onnx"
|
||||
|
||||
num_threads = int(config.get("num_threads", 8))
|
||||
|
||||
tts_config = sherpa_onnx.OfflineTtsConfig(
|
||||
model=sherpa_onnx.OfflineTtsModelConfig(
|
||||
vits=sherpa_onnx.OfflineTtsVitsModelConfig(
|
||||
model=model_file,
|
||||
lexicon=f"{model_dir}/lexicon.txt",
|
||||
tokens=f"{model_dir}/tokens.txt",
|
||||
dict_dir=f"{model_dir}/dict",
|
||||
),
|
||||
num_threads=num_threads,
|
||||
),
|
||||
rule_fsts=f"{model_dir}/date.fst,{model_dir}/phone.fst,{model_dir}/number.fst,{model_dir}/new_heteronym.fst",
|
||||
max_num_sentences=1,
|
||||
)
|
||||
self.tts = sherpa_onnx.OfflineTts(tts_config)
|
||||
self.sample_rate = self.tts.sample_rate
|
||||
logger.bind(tag=TAG).info(
|
||||
f"SherpaOnnxTTS 初始化完成: model_dir={model_dir}, sample_rate={self.sample_rate}, sid={self.sid}"
|
||||
)
|
||||
|
||||
def _generate_wav(self, text):
|
||||
"""同步合成,在线程池中调用"""
|
||||
from scipy.signal import resample_poly
|
||||
from math import gcd
|
||||
|
||||
audio = self.tts.generate(text, sid=self.sid, speed=self.speed)
|
||||
samples = np.array(audio.samples, dtype=np.float32)
|
||||
|
||||
# 重采样到目标采样率(设备要求 24000Hz,模型输出 44100Hz)
|
||||
target_sr = 24000
|
||||
if self.sample_rate != target_sr:
|
||||
g = gcd(self.sample_rate, target_sr)
|
||||
samples = resample_poly(samples, target_sr // g, self.sample_rate // g)
|
||||
|
||||
pcm = (samples * 32767).astype(np.int16)
|
||||
|
||||
wav_io = io.BytesIO()
|
||||
with wave.open(wav_io, "wb") as wf:
|
||||
wf.setnchannels(1)
|
||||
wf.setsampwidth(2)
|
||||
wf.setframerate(target_sr)
|
||||
wf.writeframes(pcm.tobytes())
|
||||
return wav_io.getvalue()
|
||||
|
||||
async def text_to_speak(self, text, output_file):
|
||||
wav_data = self._generate_wav(text)
|
||||
|
||||
if output_file:
|
||||
with open(output_file, "wb") as f:
|
||||
f.write(wav_data)
|
||||
else:
|
||||
return wav_data
|
||||
Loading…
Reference in New Issue