add independent modules (not integrated into framework)

- modules/antaf/ — Antaf LLM provider, voice passthrough, bridge scripts
- modules/tts/ — sherpa-onnx local TTS provider
- modules/docs/ — integration plan

These are standalone files, NOT patched into xiaozhi-server framework.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
hailin 2026-04-06 11:40:31 -07:00
parent ae260da3eb
commit a88e7072b3
8 changed files with 1635 additions and 0 deletions

143
modules/antaf/antaf_llm.py Normal file
View File

@ -0,0 +1,143 @@
import json
import requests
from config.logger import setup_logging
from core.providers.llm.base import LLMProviderBase
# Tag bound onto every log record emitted by this module (see logger.bind(tag=TAG)).
TAG = __name__
# Logger obtained from the project's logging setup helper.
logger = setup_logging()
class LLMProvider(LLMProviderBase):
    """Ant Afu (蚂蚁阿福) LLM provider.

    Connects to the Ant Afu app's text-chat API through a Frida HTTP bridge
    (port 18900 by default). The bridge runs on the phone and exposes an SSE
    streaming endpoint via ``adb forward`` or the network.
    """

    # Substrings that mark a streamed chunk as the model's internal reasoning.
    _THINKING_PATTERNS = (
        "用户问", "用户说", "用户的", "用户可能", "用户真正",
        "我得", "我会", "我在想", "我决定", "我要",
        "语气比较", "感觉他", "让他知道", "让他觉得",
        "先安抚", "得先", "不想表现",
        "整体语气", "这样能", "这样他",
        "所以我", "还带了个",
    )
    # Internal status fragments that are stripped out of returned text.
    _JUNK_FRAGMENTS = (
        "完成资料引用", "内容生成", "正在思考", "正在搜索",
    )
    # Substrings that mark a "user" message as injected by the system.
    _INJECTION_MARKERS = (
        "[系统提示]", "tool_call", "<tool_call>", "TOOL USE",
        "系统提示", "工具调用", "function_call",
        "handle_exit_intent", "你有以下工具", "You have access",
    )

    def __init__(self, config):
        """Read ``bridge_url`` and ``timeout`` from *config* (both optional)."""
        self.bridge_url = config.get("bridge_url", "http://127.0.0.1:18900")
        self.timeout = config.get("timeout", 60)
        self.should_idle = False  # signal to send system idle after TTS
        logger.bind(tag=TAG).info(
            f"AntafLLM 初始化: bridge={self.bridge_url}, timeout={self.timeout}s"
        )

    @staticmethod
    def _is_thinking(text):
        """Return True if *text* looks like internal reasoning that must not reach the user."""
        return any(p in text for p in LLMProvider._THINKING_PATTERNS)

    @staticmethod
    def _clean_text(text):
        """Strip known internal-status fragments from *text* and trim whitespace."""
        for fragment in LLMProvider._JUNK_FRAGMENTS:
            text = text.replace(fragment, "")
        return text.strip()

    @staticmethod
    def _is_system_injected(content):
        """Return True if *content* is a system-injected message, not real user input."""
        if not content:
            return True
        if any(m in content for m in LLMProvider._INJECTION_MARKERS):
            return True
        # A "user" message longer than 200 characters is most likely injected.
        return len(content) > 200

    def _extract_query(self, dialogue):
        """Return the text of the most recent genuine user message, or ""."""
        for msg in reversed(dialogue):
            if msg.get("role") != "user":
                continue
            content = msg.get("content", "")
            if self._is_system_injected(content):
                continue
            # The ASR result may be JSON:
            # {"content": "...", "language": "zh", "emotion": "..."}
            try:
                parsed = json.loads(content)
            except (json.JSONDecodeError, TypeError):
                return content
            # Only a string "content" field is usable as the query; anything
            # else falls back to the raw message instead of crashing later.
            if isinstance(parsed, dict) and isinstance(parsed.get("content"), str):
                return parsed["content"]
            return content
        return ""

    def response(self, session_id, dialogue, **kwargs):
        """Stream the bridge's answer to the latest user message.

        Yields text chunks. On failure a single apology sentence is yielded
        and ``self.should_idle`` is set to True.
        """
        query = self._extract_query(dialogue)
        if not query:
            logger.bind(tag=TAG).warning("对话中没有用户消息")
            yield "抱歉,我没有收到您的问题。"
            return
        # Ask for a short answer so long replies do not pile up in the TTS queue.
        query = query + "请用2-3句话简短回答"
        self.should_idle = False
        logger.bind(tag=TAG).info(f"AntafLLM 请求: {query[:50]}...")
        try:
            url = f"{self.bridge_url}/chat"
            # The context manager releases the streamed connection on every
            # exit path: [DONE], errors, or the consumer dropping the generator.
            with requests.get(
                url,
                params={"q": query},
                stream=True,
                timeout=self.timeout,
            ) as resp:
                # Surface HTTP errors instead of treating them as an empty answer.
                resp.raise_for_status()
                resp.encoding = "utf-8"
                seen_texts = set()
                for line in resp.iter_lines(decode_unicode=True):
                    if not line or not line.startswith("data: "):
                        continue
                    data = line[6:]
                    if data == "[DONE]":
                        break
                    if not data.strip():
                        continue
                    # De-duplicate: skip chunks identical to one already seen.
                    if data in seen_texts:
                        continue
                    seen_texts.add(data)
                    # Drop internal reasoning.
                    if self._is_thinking(data):
                        logger.bind(tag=TAG).debug(f"过滤思考内容: {data[:50]}...")
                        continue
                    # Remove status fragments; skip chunks that end up empty.
                    data = self._clean_text(data)
                    if not data:
                        continue
                    yield data
        except requests.exceptions.ConnectionError:
            logger.bind(tag=TAG).error("无法连接蚂蚁阿福 Bridge请检查手机和 Frida 状态")
            self.should_idle = True
            yield "抱歉,蚂蚁阿福服务暂时不可用。"
        except requests.exceptions.Timeout:
            logger.bind(tag=TAG).error(f"蚂蚁阿福 Bridge 超时 ({self.timeout}s)")
            self.should_idle = True
            yield "抱歉,回答超时了。"
        except Exception as e:
            logger.bind(tag=TAG).error(f"AntafLLM 异常: {e}")
            self.should_idle = True
            yield "抱歉,发生了错误。"

View File

@ -0,0 +1,251 @@
"""
Antaf Voice Passthrough ASR Provider
Replaces ASR→LLM→TTS pipeline with direct audio forwarding to Antaf voice_bridge.
ESP32 audio → decode Opus → resample 16kHz→48kHz → inject to voice_bridge (type=3)
voice_bridge speaker (type=0) → resample 48kHz→16kHz → encode Opus → send to ESP32
Runs within xiaozhi-server, keeping all protocol handling (hello, OTA, wake word) intact.
"""
import json
import struct
import asyncio
import threading
import numpy as np
import opuslib_next
from scipy.signal import resample_poly
from math import gcd
from config.logger import setup_logging
from core.providers.asr.base import ASRProviderBase
from core.handle.sendAudioHandle import send_tts_message
# Tag bound onto every log record emitted by this module.
TAG = __name__
logger = setup_logging()
# Audio parameters
# ESP32 side: Opus decoded/encoded at this sample rate, one frame = 960 samples.
ESP_SR = 16000
ESP_FRAME_SAMPLES = 960 # 60ms at 16kHz
# Bridge side: 16-bit PCM, sliced into frames of 480 samples before injection.
BRIDGE_SR = 48000
BRIDGE_FRAME_SAMPLES = 480 # 960 bytes / 2 = 480 samples
# Resample ratios
# Each pair is (up, down) as passed to scipy.signal.resample_poly, reduced by the GCD.
UP = (BRIDGE_SR // gcd(BRIDGE_SR, ESP_SR), ESP_SR // gcd(BRIDGE_SR, ESP_SR)) # (3,1)
DOWN = (ESP_SR // gcd(ESP_SR, BRIDGE_SR), BRIDGE_SR // gcd(ESP_SR, BRIDGE_SR)) # (1,3)
class ASRProvider(ASRProviderBase):
    """Voice passthrough provider that forwards audio to the Antaf voice bridge.

    Opus frames taken from ``conn.asr_audio_queue`` are decoded, upsampled and
    injected into the bridge (frame type 3). Speaker PCM received from the
    bridge (frame type 0) is downsampled, Opus-encoded and sent to the device
    over ``conn.websocket``. ``speech_to_text`` always returns empty text.
    """

    def __init__(self, config):
        """Read ``bridge_host`` / ``bridge_port`` from *config* and reset all state."""
        super().__init__()
        self.bridge_host = config.get("bridge_host", "127.0.0.1")
        self.bridge_port = int(config.get("bridge_port", 18901))
        self.interface_type = "NON_STREAM"
        self.conn = None
        self.bridge_reader = None
        self.bridge_writer = None
        self.opus_decoder = None
        self.opus_encoder = None
        self._inject_buf = np.array([], dtype=np.int16)
        self._speaker_buf = np.array([], dtype=np.int16)
        self._tts_started = False
        self._silence_count = 0
        self._recv_task = None
        self._connected = False
        # Event loop that owns the bridge streams; set in open_audio_channels.
        self._loop = None
        self._write_lock = threading.Lock()
        logger.bind(tag=TAG).info(
            f"AntafPassthrough 初始化: bridge={self.bridge_host}:{self.bridge_port}"
        )

    @staticmethod
    def _resample_int16(samples, up, down):
        """Resample *samples* by up/down and return int16 with saturation."""
        resampled = resample_poly(samples, up, down)
        # Clip before the cast: filter overshoot beyond the int16 range would
        # otherwise wrap around and produce loud clicks.
        return np.clip(resampled, -32768, 32767).astype(np.int16)

    async def open_audio_channels(self, conn):
        """Override: connect to bridge, start passthrough instead of normal ASR."""
        # Clean up previous connection if any
        await self.close()
        self.conn = conn
        # The bridge StreamWriter belongs to this loop; the audio thread must
        # hand its writes over to it (asyncio transports are not thread-safe).
        self._loop = asyncio.get_running_loop()
        self.opus_decoder = opuslib_next.Decoder(ESP_SR, 1)
        self.opus_encoder = opuslib_next.Encoder(ESP_SR, 1, opuslib_next.APPLICATION_AUDIO)
        self._tts_started = False
        self._silence_count = 0
        self._inject_buf = np.array([], dtype=np.int16)
        self._speaker_buf = np.array([], dtype=np.int16)
        # Connect to voice_bridge
        try:
            self.bridge_reader, self.bridge_writer = await asyncio.open_connection(
                self.bridge_host, self.bridge_port
            )
            # Read connected event
            ftype, data = await self._bridge_recv()
            if ftype == 1:
                msg = json.loads(data.decode())
                logger.bind(tag=TAG).info(f"Bridge connected: {msg.get('protocol')}")
            # Send start + inject_on, logging each text reply
            self._bridge_send_cmd({"cmd": "start"})
            ftype, data = await self._bridge_recv()
            if ftype == 1:
                logger.bind(tag=TAG).info(f"Bridge: {json.loads(data.decode())}")
            self._bridge_send_cmd({"cmd": "inject_on"})
            ftype, data = await self._bridge_recv()
            if ftype == 1:
                logger.bind(tag=TAG).info(f"Bridge: {json.loads(data.decode())}")
            self._connected = True
            logger.bind(tag=TAG).info("Voice bridge ready (inject mode)")
            # Start speaker receive loop
            self._recv_task = asyncio.create_task(self._speaker_recv_loop())
        except Exception as e:
            logger.bind(tag=TAG).error(f"Bridge connection failed: {e}")
            self._connected = False
        # Start normal audio processing thread (reads from asr_audio_queue)
        conn.asr_priority_thread = threading.Thread(
            target=self._audio_thread, args=(conn,), daemon=True
        )
        conn.asr_priority_thread.start()

    def _audio_thread(self, conn):
        """Read Opus frames from queue, decode, resample, inject to bridge.

        Runs in a worker thread until ``conn.stop_event`` is set.
        """
        import queue as queue_module
        frame_count = 0
        while not conn.stop_event.is_set():
            try:
                opus_data = conn.asr_audio_queue.get(timeout=1)
            except queue_module.Empty:
                continue
            # Frames are still drained while disconnected so the queue does not grow.
            if not self._connected:
                continue
            try:
                frame_count += 1
                if frame_count <= 3 or frame_count % 200 == 0:
                    logger.bind(tag=TAG).debug(f"Audio frame #{frame_count}")
                # Decode Opus → PCM 16kHz
                pcm = self.opus_decoder.decode(opus_data, ESP_FRAME_SAMPLES)
                samples = np.frombuffer(pcm, dtype=np.int16)
                # Resample 16kHz → 48kHz
                upsampled = self._resample_int16(samples, UP[0], UP[1])
                # Split into bridge frames and inject
                self._inject_buf = np.concatenate([self._inject_buf, upsampled])
                while len(self._inject_buf) >= BRIDGE_FRAME_SAMPLES:
                    frame = self._inject_buf[:BRIDGE_FRAME_SAMPLES]
                    self._inject_buf = self._inject_buf[BRIDGE_FRAME_SAMPLES:]
                    # Schedule the write on the loop thread instead of touching
                    # the StreamWriter from this worker thread.
                    self._loop.call_soon_threadsafe(
                        self._bridge_send_inject, frame.tobytes()
                    )
            except Exception as e:
                logger.bind(tag=TAG).error(f"Audio thread error: {e}")

    async def _speaker_recv_loop(self):
        """Receive speaker PCM from bridge, resample, encode Opus, send to ESP32."""
        try:
            while self._connected:
                ftype, data = await self._bridge_recv()
                if ftype == 0:
                    # Speaker audio
                    await self._handle_speaker(data)
                elif ftype == 1:
                    msg = json.loads(data.decode())
                    logger.bind(tag=TAG).debug(f"Bridge event: {msg}")
        except asyncio.IncompleteReadError:
            logger.bind(tag=TAG).warning("Bridge connection closed")
        except Exception as e:
            logger.bind(tag=TAG).error(f"Speaker recv error: {e}")
        finally:
            self._connected = False

    async def _handle_speaker(self, pcm_bytes):
        """Process speaker frame and send to ESP32."""
        if not self.conn or not self.conn.websocket:
            return
        # Ignore a trailing odd byte and empty frames; either would otherwise
        # raise and terminate the receive loop.
        sample_count = len(pcm_bytes) // 2
        if sample_count == 0:
            return
        samples = np.frombuffer(pcm_bytes, dtype=np.int16, count=sample_count)
        # Widen before abs(): abs(-32768) overflows in int16.
        max_amp = int(np.max(np.abs(samples.astype(np.int32))))
        # Track silence for tts stop
        if max_amp < 10:
            if self._tts_started:
                self._silence_count += 1
                # More than 50 consecutive silent frames → send tts stop.
                # NOTE(review): originally described as "~1 second"; with
                # 480-sample frames at 48 kHz it would be ~0.5 s — confirm
                # the bridge's actual frame size.
                if self._silence_count > 50:
                    try:
                        await send_tts_message(self.conn, "stop")
                        self.conn.client_is_speaking = False
                        self._tts_started = False
                        self._silence_count = 0
                        logger.bind(tag=TAG).info("Sent tts stop to ESP32")
                    except Exception as e:
                        logger.bind(tag=TAG).error(f"Send tts stop error: {e}")
            return
        # Reset silence counter on non-silent frame
        self._silence_count = 0
        # Send tts start before first audio
        if not self._tts_started:
            try:
                await send_tts_message(self.conn, "start")
                self._tts_started = True
                self.conn.client_is_speaking = True
                logger.bind(tag=TAG).info("Sent tts start to ESP32")
            except Exception as e:
                logger.bind(tag=TAG).error(f"Send tts start error: {e}")
                return
        # Resample 48kHz → 16kHz
        downsampled = self._resample_int16(samples, DOWN[0], DOWN[1])
        # Accumulate and encode
        self._speaker_buf = np.concatenate([self._speaker_buf, downsampled])
        while len(self._speaker_buf) >= ESP_FRAME_SAMPLES:
            frame = self._speaker_buf[:ESP_FRAME_SAMPLES]
            self._speaker_buf = self._speaker_buf[ESP_FRAME_SAMPLES:]
            opus_data = self.opus_encoder.encode(frame.tobytes(), ESP_FRAME_SAMPLES)
            try:
                await self.conn.websocket.send(opus_data)
            except Exception as e:
                logger.bind(tag=TAG).error(f"Send opus to ESP32 error: {e}")
                return

    # Bridge TCP helpers
    async def _bridge_recv(self):
        """Read one bridge frame: 4-byte big-endian length, 1-byte type, payload."""
        header = await self.bridge_reader.readexactly(5)
        length = struct.unpack(">I", header[:4])[0]
        ftype = header[4]
        data = await self.bridge_reader.readexactly(length)
        return ftype, data

    def _bridge_send_cmd(self, cmd):
        """Send *cmd* as a JSON text frame (type 1). Call from the loop thread."""
        data = json.dumps(cmd).encode()
        header = struct.pack(">IB", len(data), 1)
        with self._write_lock:
            self.bridge_writer.write(header + data)

    def _bridge_send_inject(self, pcm_bytes):
        """Send *pcm_bytes* as an inject frame (type 3). Call from the loop thread."""
        writer = self.bridge_writer
        if writer is None:
            # Bridge was closed between scheduling and running this callback.
            return
        header = struct.pack(">IB", len(pcm_bytes), 3)
        with self._write_lock:
            writer.write(header + pcm_bytes)

    # ASR interface — never returns text, LLM/TTS never triggered
    async def receive_audio(self, conn, audio, audio_have_voice):
        """No-op: audio is handled by _audio_thread directly from queue."""
        pass

    async def speech_to_text(self, opus_data, session_id, audio_format="opus", artifacts=None):
        """Never called in passthrough mode."""
        return "", None

    async def close(self):
        """Stop the receive loop and shut the bridge connection down (best effort)."""
        self._connected = False
        if self._recv_task:
            self._recv_task.cancel()
            self._recv_task = None
        if self.bridge_writer:
            writer = self.bridge_writer
            try:
                self._bridge_send_cmd({"cmd": "inject_off"})
                self._bridge_send_cmd({"cmd": "stop"})
                writer.close()
            except Exception as e:
                # Shutdown is best effort; the bridge may already be gone.
                logger.bind(tag=TAG).debug(f"Bridge close error (ignored): {e}")
            finally:
                # Drop the stale streams so a repeated close() is a no-op.
                self.bridge_writer = None
                self.bridge_reader = None

307
modules/antaf/relay.py Normal file
View File

@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""
ESP32 ↔ Antaf Voice Relay
Bridges ESP32 (WebSocket/Opus) with Antaf voice_bridge (TCP/PCM).
ESP32 → Opus decode → resample 16kHz→48kHz → voice_bridge inject (type=3)
ESP32 ← Opus encode ← resample 48kHz→16kHz ← voice_bridge speaker (type=0)
Usage: python relay.py [--ws-port 8010] [--bridge-host 127.0.0.1] [--bridge-port 18901]
"""
import asyncio
import json
import struct
import argparse
import logging
import numpy as np
from scipy.signal import resample_poly
from math import gcd
# Configure root logging at import time: INFO level, timestamped "[LEVEL] message" lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Logger shared by BridgeClient and Relay.
log = logging.getLogger("relay")
try:
import opuslib_next as opuslib
except ImportError:
import opuslib
import websockets
# Audio parameters
# ESP32 side (Opus over WebSocket).
ESP_SAMPLE_RATE = 16000 # ESP32 Opus sample rate
ESP_FRAME_MS = 60 # ESP32 frame duration
ESP_FRAME_SIZE = ESP_SAMPLE_RATE * ESP_FRAME_MS // 1000 # 960 samples
# voice_bridge side (16-bit PCM over TCP).
BRIDGE_SAMPLE_RATE = 48000 # voice_bridge micIn sample rate
BRIDGE_FRAME_BYTES = 960 # 480 samples * 2 bytes
BRIDGE_FRAME_SAMPLES = 480
# Resampling ratios
# Each *_RATIO is an (up, down) pair for scipy.signal.resample_poly, reduced by the GCD.
UP_GCD = gcd(BRIDGE_SAMPLE_RATE, ESP_SAMPLE_RATE) # 16000 → 48000
UP_RATIO = (BRIDGE_SAMPLE_RATE // UP_GCD, ESP_SAMPLE_RATE // UP_GCD) # (3, 1)
DOWN_GCD = gcd(ESP_SAMPLE_RATE, BRIDGE_SAMPLE_RATE) # 48000 → 16000
DOWN_RATIO = (ESP_SAMPLE_RATE // DOWN_GCD, BRIDGE_SAMPLE_RATE // DOWN_GCD) # (1, 3)
class BridgeClient:
    """TCP client for voice_bridge_v7.

    Wire format: 4-byte big-endian payload length, 1-byte frame type, payload.
    Frame types handled here: 0 = speaker audio (received), 1 = JSON text
    (both directions), 3 = inject audio (sent).
    """

    def __init__(self, host, port):
        self.host = host
        self.port = port
        self.reader = None
        self.writer = None
        self.on_speaker_frame = None  # async callback(pcm_bytes)
        self._recv_task = None

    async def connect(self):
        """Open the TCP connection and consume the bridge's greeting frame."""
        self.reader, self.writer = await asyncio.open_connection(self.host, self.port)
        log.info(f"Connected to voice_bridge {self.host}:{self.port}")
        # Read connected event
        ftype, data = await self._recv_frame()
        if ftype == 1:
            msg = json.loads(data.decode())
            log.info(f"Bridge: {msg.get('protocol')}")

    async def _recv_frame(self):
        """Read one frame and return ``(frame_type, payload_bytes)``."""
        header = await self.reader.readexactly(5)
        length = struct.unpack(">I", header[:4])[0]
        ftype = header[4]
        data = await self.reader.readexactly(length)
        return ftype, data

    def _send_frame(self, ftype, data):
        """Queue one frame of type *ftype* on the writer without draining."""
        header = struct.pack(">IB", len(data), ftype)
        self.writer.write(header + data)
        # Note: no await drain() here — voice frames are time-sensitive,
        # TCP buffer handles backpressure

    def send_cmd(self, cmd):
        """Send *cmd* (a JSON-serialisable dict) as a type-1 text frame."""
        self._send_frame(1, json.dumps(cmd).encode())

    def send_inject(self, pcm_bytes):
        """Send raw PCM as a type-3 inject frame."""
        self._send_frame(3, pcm_bytes)

    async def start_recv_loop(self):
        """Background task: receive frames from bridge."""
        try:
            while True:
                ftype, data = await self._recv_frame()
                if ftype == 0 and self.on_speaker_frame:
                    # Speaker audio
                    await self.on_speaker_frame(data)
                elif ftype == 1:
                    msg = json.loads(data.decode())
                    log.info(f"Bridge event: {msg}")
        except asyncio.IncompleteReadError:
            log.warning("Bridge connection closed")
        except Exception as e:
            log.error(f"Bridge recv error: {e}")

    async def setup_voice(self):
        """Start capture and enable inject. Voice chat must already be open."""
        for name in ("start", "inject_on"):
            self.send_cmd({"cmd": name})
            ftype, data = await self._recv_frame()
            if ftype == 1:
                msg = json.loads(data.decode())
                log.info(f"Bridge: {msg}")
        log.info("Voice bridge ready (inject mode)")

    async def close(self):
        """Tell the bridge to stop, then close the connection (best effort)."""
        writer = self.writer
        if writer is None:
            # Never connected: nothing to send or close.
            return
        try:
            self.send_cmd({"cmd": "inject_off"})
            self.send_cmd({"cmd": "stop"})
        except Exception as e:
            log.debug(f"Bridge shutdown commands not sent: {e}")
        writer.close()
        try:
            # Give the transport a bounded amount of time to flush the two
            # commands above before the session is considered finished.
            await asyncio.wait_for(writer.wait_closed(), timeout=2)
        except Exception as e:
            log.debug(f"Bridge close did not complete cleanly: {e}")
class Relay:
    """Main relay: ESP32 WebSocket ↔ Antaf voice_bridge TCP.

    One ESP32 session is served at a time; session state lives on the instance
    and is reset whenever a new WebSocket connection is accepted.
    """

    # Peak amplitude below which a speaker frame is treated as silence.
    SILENCE_AMPLITUDE = 10
    # Consecutive silent frames, while playing, after which "tts stop" is sent.
    SILENCE_FRAMES_BEFORE_STOP = 50

    def __init__(self, ws_port, bridge_host, bridge_port):
        self.ws_port = ws_port
        self.bridge_host = bridge_host
        self.bridge_port = bridge_port
        self.bridge = None
        self.ws = None
        self.opus_decoder = None
        self.opus_encoder = None
        self._reset_session_state()

    def _reset_session_state(self):
        """Clear buffers, counters and the TTS flag for a fresh ESP32 session."""
        # Buffer for resampled PCM to split into bridge frames
        self._inject_buf = np.array([], dtype=np.int16)
        # Buffer for speaker PCM to accumulate before encoding
        self._speaker_buf = np.array([], dtype=np.int16)
        self._audio_in_count = 0
        self._audio_out_count = 0
        self._silence_count = 0
        self._tts_started = False  # track if we sent tts start to ESP32

    @staticmethod
    def _resample_int16(samples, up, down):
        """Resample *samples* by up/down and return int16 with saturation."""
        resampled = resample_poly(samples, up, down)
        # Clip before the cast so filter overshoot cannot wrap around.
        return np.clip(resampled, -32768, 32767).astype(np.int16)

    async def handle_esp32(self, websocket):
        """Handle one ESP32 WebSocket connection."""
        log.info(f"ESP32 connected from {websocket.remote_address}")
        self.ws = websocket
        # State left over from a previous session (notably _tts_started) would
        # otherwise suppress "tts start" for this one.
        self._reset_session_state()
        # Init Opus codec
        self.opus_decoder = opuslib.Decoder(ESP_SAMPLE_RATE, 1)
        self.opus_encoder = opuslib.Encoder(ESP_SAMPLE_RATE, 1, opuslib.APPLICATION_AUDIO)
        self.bridge = BridgeClient(self.bridge_host, self.bridge_port)
        recv_task = None
        try:
            # Connect to voice bridge and setup voice chat first. Done inside
            # the try block so a failed connect still runs the cleanup below.
            await self.bridge.connect()
            await self.bridge.setup_voice()
            # Now start receiving speaker audio
            self.bridge.on_speaker_frame = self._on_speaker_frame
            recv_task = asyncio.create_task(self.bridge.start_recv_loop())
            async for message in websocket:
                if isinstance(message, str):
                    # Text message from ESP32 (hello, listen, etc.)
                    await self._handle_text(message)
                elif isinstance(message, bytes):
                    # Opus audio from ESP32
                    await self._handle_audio(message)
        except websockets.exceptions.ConnectionClosed:
            log.info("ESP32 disconnected")
        finally:
            if recv_task is not None:
                recv_task.cancel()
            await self.bridge.close()
            self.ws = None
            log.info("Session ended")

    async def _handle_text(self, message):
        """Handle text messages from ESP32."""
        try:
            msg = json.loads(message)
            msg_type = msg.get("type")
            if msg_type == "hello":
                # Respond with proper hello — must match xiaozhi protocol
                resp = {
                    "type": "hello",
                    "version": 1,
                    "transport": "websocket",
                    "session_id": "relay-session",
                    "audio_params": {
                        "format": "opus",
                        "sample_rate": ESP_SAMPLE_RATE,
                        "channels": 1,
                        "frame_duration": ESP_FRAME_MS,
                    },
                }
                await self.ws.send(json.dumps(resp))
                log.info(f"ESP32 hello: {msg.get('audio_params')}")
            elif msg_type == "listen":
                state = msg.get("state")
                log.info(f"ESP32 listen: {state}")
                if state == "detect":
                    text = msg.get("text", "")
                    log.info(f"Wake word: {text}")
                    # Don't send tts start — let ESP32 continue recording
            elif msg_type == "abort":
                log.info("ESP32 abort")
        except json.JSONDecodeError:
            log.warning(f"Invalid JSON from ESP32: {message[:100]}")

    async def _handle_audio(self, opus_data):
        """Decode Opus from ESP32, resample, inject into voice_bridge."""
        try:
            self._audio_in_count += 1
            if self._audio_in_count <= 3 or self._audio_in_count % 100 == 0:
                log.info(f"ESP32 audio frame #{self._audio_in_count}, size={len(opus_data)}")
            # Decode Opus → PCM 16kHz mono
            pcm = self.opus_decoder.decode(opus_data, ESP_FRAME_SIZE)
            samples = np.frombuffer(pcm, dtype=np.int16)
            # Resample 16kHz → 48kHz
            upsampled = self._resample_int16(samples, UP_RATIO[0], UP_RATIO[1])
            # Append to inject buffer and send in bridge frame sizes
            self._inject_buf = np.concatenate([self._inject_buf, upsampled])
            while len(self._inject_buf) >= BRIDGE_FRAME_SAMPLES:
                frame = self._inject_buf[:BRIDGE_FRAME_SAMPLES]
                self._inject_buf = self._inject_buf[BRIDGE_FRAME_SAMPLES:]
                self.bridge.send_inject(frame.tobytes())
        except Exception as e:
            log.error(f"Audio inject error: {e}")

    async def _on_speaker_frame(self, pcm_bytes):
        """Receive speaker PCM from bridge, resample, encode Opus, send to ESP32."""
        if not self.ws or getattr(self.ws, 'closed', False):
            return
        try:
            # Ignore a trailing odd byte; skip frames with no complete sample.
            sample_count = len(pcm_bytes) // 2
            if sample_count == 0:
                return
            self._audio_out_count += 1
            samples = np.frombuffer(pcm_bytes, dtype=np.int16, count=sample_count)
            # Widen before abs(): abs(-32768) overflows in int16.
            max_amp = int(np.max(np.abs(samples.astype(np.int32))))
            if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
                log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
            # Only send non-silent frames to ESP32
            if max_amp < self.SILENCE_AMPLITUDE:
                # Short silence gaps are normal, so only a sustained run of
                # silent frames while playing ends the TTS turn.
                if self._tts_started:
                    self._silence_count += 1
                    if self._silence_count > self.SILENCE_FRAMES_BEFORE_STOP:
                        await self.ws.send(json.dumps({
                            "type": "tts", "state": "stop",
                            "session_id": "relay-session"
                        }))
                        self._tts_started = False
                        self._silence_count = 0
                        log.info("Sent tts stop to ESP32")
                return
            self._silence_count = 0
            # Send tts start before first audio frame
            if not self._tts_started:
                await self.ws.send(json.dumps({
                    "type": "tts", "state": "start",
                    "session_id": "relay-session"
                }))
                self._tts_started = True
                log.info("Sent tts start to ESP32")
            # Resample 48kHz → 16kHz
            downsampled = self._resample_int16(samples, DOWN_RATIO[0], DOWN_RATIO[1])
            # Accumulate into speaker buffer, encode when we have enough
            self._speaker_buf = np.concatenate([self._speaker_buf, downsampled])
            while len(self._speaker_buf) >= ESP_FRAME_SIZE:
                frame = self._speaker_buf[:ESP_FRAME_SIZE]
                self._speaker_buf = self._speaker_buf[ESP_FRAME_SIZE:]
                # Encode PCM → Opus
                opus_data = self.opus_encoder.encode(frame.tobytes(), ESP_FRAME_SIZE)
                await self.ws.send(opus_data)
        except Exception as e:
            log.error(f"Speaker send error: {e}")

    async def run(self):
        """Serve ESP32 WebSocket connections on all interfaces until cancelled."""
        log.info(f"Relay starting on ws://0.0.0.0:{self.ws_port}/xiaozhi/v1/")
        async with websockets.serve(
            self.handle_esp32, "0.0.0.0", self.ws_port,
            ping_interval=30, ping_timeout=10,
        ):
            await asyncio.Future()  # run forever
def main():
    """Parse the command-line options and run the relay until interrupted."""
    cli = argparse.ArgumentParser(description="ESP32-Antaf Voice Relay")
    option_specs = (
        ("--ws-port", {"type": int, "default": 8010, "help": "WebSocket port for ESP32"}),
        ("--bridge-host", {"default": "127.0.0.1", "help": "voice_bridge host"}),
        ("--bridge-port", {"type": int, "default": 18901, "help": "voice_bridge port"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()
    server = Relay(opts.ws_port, opts.bridge_host, opts.bridge_port)
    asyncio.run(server.run())


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""Test voice_bridge_v7 audio injection.
Connect to voice_bridge, open voice chat, enable inject mode,
send silence frames, and print any speaker output received.
Usage: python test_inject.py [host] [port]
"""
import socket
import struct
import json
import time
import sys
import threading
# Bridge address: first and second positional command-line arguments, with defaults.
HOST = sys.argv[1] if len(sys.argv) > 1 else "127.0.0.1"
PORT = int(sys.argv[2]) if len(sys.argv) > 2 else 18901
# Size in bytes of each silence frame sent by main().
FRAME_SIZE = 960 # 960 bytes per frame (480 samples * 16bit)
def send_cmd(sock, cmd):
    """Serialise *cmd* as JSON and send it over *sock* as a type-1 (text) frame."""
    payload = json.dumps(cmd).encode("utf-8")
    # Frame = 4-byte big-endian length + 1-byte type + payload.
    sock.sendall(struct.pack(">IB", len(payload), 1) + payload)
def send_inject(sock, pcm_frame):
    """Send *pcm_frame* over *sock* as a type-3 (inject) frame."""
    prefix = struct.pack(">IB", len(pcm_frame), 3)
    sock.sendall(prefix + pcm_frame)
def recv_exact(sock, n):
    """Read exactly *n* bytes from *sock*.

    Returns the bytes read, or None if the peer closes the connection before
    *n* bytes have arrived.
    """
    pieces = []
    remaining = n
    while remaining > 0:
        piece = sock.recv(remaining)
        if not piece:
            return None
        pieces.append(piece)
        remaining -= len(piece)
    return b"".join(pieces)
def recv_frame(sock):
    """Read one frame from *sock* and return ``(frame_type, payload)``.

    Returns ``(None, None)`` if the connection closes mid-frame or the
    announced payload length exceeds 1 MiB.
    """
    head = recv_exact(sock, 5)
    if head is None:
        return None, None
    # 4-byte big-endian length followed by a 1-byte frame type.
    size, kind = struct.unpack(">IB", head)
    if size > 1048576:
        return None, None
    body = recv_exact(sock, size)
    return (None, None) if body is None else (kind, body)
def receiver(sock):
    """Background thread to print received frames.

    Text frames (type 1) are printed in full. Speaker frames (type 0) are
    summarised: the first five, every hundredth, and any with a peak amplitude
    above 500. Mic frames (type 2) are ignored. The loop ends when the
    connection closes or an error occurs.
    """
    spk_count = 0
    while True:
        try:
            ftype, data = recv_frame(sock)
            if ftype is None:
                print("[RECV] Connection closed")
                break
            if ftype == 1:  # text/json
                msg = json.loads(data.decode("utf-8"))
                print(f"[RECV] {msg}")
            elif ftype == 0:  # speaker audio
                spk_count += 1
                # Check if audio is non-silent. Drop a trailing odd byte so the
                # unpack format always matches the buffer, and default to 0 so
                # an empty frame does not terminate the receiver.
                usable = len(data) - (len(data) % 2)
                samples = struct.unpack(f"<{usable // 2}h", data[:usable])
                max_amp = max((abs(s) for s in samples), default=0)
                if spk_count <= 5 or spk_count % 100 == 0 or max_amp > 500:
                    print(f"[SPK] frame={spk_count} size={len(data)} max_amp={max_amp}")
            elif ftype == 2:  # mic audio
                pass  # ignore mic echo
        except Exception as e:
            print(f"[RECV] Error: {e}")
            break
def main():
    """Run the injection test against the bridge at HOST:PORT.

    Opens voice chat, starts capture, enables inject mode, sends three seconds
    of silence frames, waits for speaker output, then stops. The socket is
    closed on every exit path, including errors part-way through.
    """
    print(f"Connecting to {HOST}:{PORT}...")
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.connect((HOST, PORT))
        print("Connected")
        # Start receiver thread
        t = threading.Thread(target=receiver, args=(sock,), daemon=True)
        t.start()
        time.sleep(1)
        # Open voice chat
        print("Opening voice chat...")
        send_cmd(sock, {"cmd": "open_voice"})
        time.sleep(3)
        # Start capture
        print("Starting capture...")
        send_cmd(sock, {"cmd": "start"})
        time.sleep(1)
        # Enable inject mode
        print("Enabling inject mode...")
        send_cmd(sock, {"cmd": "inject_on"})
        time.sleep(0.5)
        # Send 150 silence frames, one every 20 ms (3 seconds in total)
        print("Sending 150 silence frames (3 seconds)...")
        silence = b"\x00" * FRAME_SIZE
        for _ in range(150):
            send_inject(sock, silence)
            time.sleep(0.02)  # 20ms per frame
        print("Done sending. Waiting for speaker output...")
        time.sleep(10)
        # Stop
        send_cmd(sock, {"cmd": "inject_off"})
        send_cmd(sock, {"cmd": "stop"})
        time.sleep(1)
        print("Test complete")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,209 @@
// voice_bridge_v7.js — Voice Bridge with Audio Injection
// Hook point: libantaudio.so MFAntAudio3AV2Filter::process(micIn, spkRef, out, size, &result)
// TCP :18901
// Frame: 4-byte len + 1-byte type + payload
// type 0: speaker/AI audio (spkRef, downstream to client)
// type 1: text/JSON command
// type 2: mic audio (micIn, downstream to client)
// type 3: inject audio (upstream from client, replaces micIn)
// Set by the "start" command and cleared by "stop" or client disconnect;
// the process() hook does nothing while this is false.
var voiceActive = false;
// Java OutputStream of the currently connected TCP client (null when none).
var clientOS = null;
// Frame and byte counters, incremented by the hook and reported by "stop"/"status".
var capturedSpk = 0, capturedMic = 0, spkBytes = 0, micBytes = 0;
var injectMode = false; // true = replace mic with injected audio
var injectQueue = []; // queue of PCM frames to inject
// Write one frame to the Java OutputStream `os`: a 5-byte header
// (4-byte big-endian payload length + 1-byte type) followed by the Java byte
// array `jArr`. Write errors are deliberately ignored.
function wf(os, type, jArr) {
    try {
        var n = jArr.length;
        var header = Java.array("byte", [
            (n >> 24) & 0xFF,
            (n >> 16) & 0xFF,
            (n >> 8) & 0xFF,
            n & 0xFF,
            type
        ]);
        os.write(header);
        os.write(jArr);
        os.flush();
    } catch (e) {}
}
// Send `text` to `os` as a type-1 (text/JSON) frame, encoded as UTF-8.
function wt(os, text) {
    var JString = Java.use("java.lang.String");
    var utf8 = JString.$new(text).getBytes("UTF-8");
    wf(os, 1, utf8);
}
// === Hook libantaudio.so ===
// True once the process() export has been found and the interceptor attached.
var hooked = false;

// Attach to MFAntAudio3AV2Filter::process in libantaudio.so if the library is
// loaded. Safe to call repeatedly: it returns immediately once hooked or while
// the module/export is not available yet.
function tryHook() {
    if (hooked) return;
    var m = Process.findModuleByName("libantaudio.so");
    if (!m) return;
    var addr = m.findExportByName("_ZN8antaudio20MFAntAudio3AV2Filter7processEPhS1_S1_iRi");
    if (!addr) return;
    hooked = true;
    Interceptor.attach(addr, {
        onEnter: function(args) {
            if (!voiceActive || !clientOS) return;
            // args[1] = micIn, args[2] = spkRef, args[4] = size (see file header).
            var size = args[4].toInt32();
            if (size <= 0) return;
            try {
                // If inject mode, always overwrite micIn so real mic audio
                // never leaks through.
                if (injectMode) {
                    var replacement;
                    if (injectQueue.length > 0) {
                        var frame = injectQueue.shift();
                        if (frame.byteLength === size) {
                            replacement = frame;
                        } else {
                            // Size mismatch — zero-pad or truncate. This also
                            // covers a zero-length frame, which becomes silence
                            // instead of leaving micIn untouched.
                            replacement = new ArrayBuffer(size);
                            var copyLen = Math.min(size, frame.byteLength);
                            new Uint8Array(replacement).set(new Uint8Array(frame, 0, copyLen));
                        }
                    } else {
                        // No data queued — inject silence to avoid mic leak
                        replacement = new ArrayBuffer(size);
                    }
                    args[1].writeByteArray(replacement);
                }
                // Always capture speaker/AI output (type 0)
                var spkPcm = args[2].readByteArray(size);
                var spkArr = Java.array("byte", Array.from(new Uint8Array(spkPcm)));
                wf(clientOS, 0, spkArr);
                capturedSpk++; spkBytes += size;
                // Capture mic (type 2) only when not injecting
                if (!injectMode) {
                    var micPcm = args[1].readByteArray(size);
                    var micArr = Java.array("byte", Array.from(new Uint8Array(micPcm)));
                    wf(clientOS, 2, micArr);
                }
                capturedMic++; micBytes += size;
                if (capturedMic <= 3 || capturedMic % 500 === 0)
                    console.log("[VOICE] mic=" + capturedMic + " spk=" + capturedSpk + " inject=" + injectQueue.length);
            } catch(e) {}
        }
    });
    console.log("[VOICE] 3AV2Filter.process hooked @ " + addr);
}
// Try to hook on a fixed schedule; tryHook() is a no-op once it has succeeded.
[0, 1000, 3000, 5000, 10000, 15000, 20000].forEach(function(ms) { setTimeout(tryHook, ms); });
// Additionally retry 500 ms after any matching linker dlopen-style export returns.
// Failure to resolve or attach is ignored; the timed retries above still run.
try {
new ApiResolver("module").enumerateMatches("exports:linker*!*dlopen*").forEach(function(d) {
Interceptor.attach(d.address, { onLeave: function() { setTimeout(tryHook, 500); } });
});
} catch(e) {}
// === TCP Server ===
// TCP control/audio server on port 18901. Serves one client at a time on a
// dedicated Java thread; each client speaks the framed protocol described in
// the file header.
Java.perform(function() {
    var SS = Java.use("java.net.ServerSocket");
    var JS = Java.use("java.lang.String");
    // Largest payload accepted from a client, in bytes.
    var MAX_FRAME = 1048576;
    var server = SS.$new(18901);
    console.log("[VOICE] Listening :18901");

    // Show the voice-chat fragment on every live IJKActivity, then report
    // "voice_opened" to the client two seconds later.
    function openVoice(os) {
        Java.scheduleOnMainThread(function() {
            try {
                Java.choose("com.antgroup.aijk.android.ijklauncher.biz.activity.IJKActivity", {
                    onMatch: function(a) {
                        var fm = a.getSupportFragmentManager();
                        var f = Java.use("com.antgroup.aijk.android.ijkchat.biz.voicechat.IjkVoiceChatFragment").$new();
                        f.show(fm, "v");
                        console.log("[VOICE] Opened");
                    }, onComplete: function() {}
                });
                setTimeout(function() { wt(os, JSON.stringify({event:"voice_opened"})); }, 2000);
            } catch(e) { wt(os, JSON.stringify({event:"error",msg:""+e})); }
        });
    }

    // Dismiss every live voice-chat fragment, then report "voice_closed" to
    // the client one second later.
    function closeVoice(os) {
        Java.scheduleOnMainThread(function() {
            try {
                Java.choose("com.antgroup.aijk.android.ijkchat.biz.voicechat.IjkVoiceChatFragment", {
                    onMatch: function(f) { f.dismiss(); console.log("[VOICE] Closed"); },
                    onComplete: function() {}
                });
                setTimeout(function() { wt(os, JSON.stringify({event:"voice_closed"})); }, 1000);
            } catch(e) { wt(os, JSON.stringify({event:"error",msg:""+e})); }
        });
    }

    var Srv = Java.registerClass({
        name: "com.antaf.voice.S7",
        implements: [Java.use("java.lang.Runnable")],
        methods: {
            run: function() {
                while (true) {
                    // Declared per iteration so the finally block closes
                    // exactly the socket accepted in this iteration.
                    var c = null;
                    try {
                        console.log("[VOICE] Waiting...");
                        c = server.accept();
                        var is = c.getInputStream();
                        var os = c.getOutputStream();
                        clientOS = os;
                        console.log("[VOICE] Connected");
                        wt(os, JSON.stringify({
                            event:"connected", protocol:"antaf-voice-v8",
                            commands:["open_voice","close_voice","start","stop","status","inject_on","inject_off"],
                            audio:"pcm-16bit-960b-frames",
                            frameTypes:{0:"spk_ai",1:"text",2:"mic",3:"inject"}
                        }));
                        while (true) {
                            // Header: 4-byte big-endian length + 1-byte type.
                            var hb = [];
                            for (var i=0;i<5;i++) { var b=is.read(); if(b<0) throw "EOF"; hb.push(b); }
                            var fl=(hb[0]<<24)|(hb[1]<<16)|(hb[2]<<8)|hb[3], ft=hb[4];
                            // A set top bit makes the 32-bit value negative;
                            // treat it like an oversize frame and drop the
                            // client instead of desynchronising the stream.
                            if (fl<0 || fl>MAX_FRAME) break;
                            var pb = [];
                            for (var i=0;i<fl;i++) { var b=is.read(); if(b<0) throw "EOF"; pb.push(b&0xFF); }
                            if (ft === 3) {
                                // type 3: inject audio frame into micIn
                                var arr = new ArrayBuffer(pb.length);
                                var view = new Uint8Array(arr);
                                for (var j=0;j<pb.length;j++) view[j] = pb[j];
                                injectQueue.push(arr);
                            }
                            else if (ft === 1) {
                                var pl = Java.array("byte", pb);
                                var cmd = JSON.parse(JS.$new(pl,"UTF-8").toString());
                                console.log("[VOICE] Cmd: " + JSON.stringify(cmd));
                                if (cmd.cmd === "open_voice") openVoice(os);
                                else if (cmd.cmd === "close_voice") closeVoice(os);
                                else if (cmd.cmd === "start") {
                                    voiceActive = true;
                                    capturedSpk=0;capturedMic=0;spkBytes=0;micBytes=0;
                                    injectQueue = [];
                                    wt(os, JSON.stringify({event:"started",hooked:hooked}));
                                }
                                else if (cmd.cmd === "stop") {
                                    voiceActive = false;
                                    injectMode = false;
                                    injectQueue = [];
                                    wt(os, JSON.stringify({event:"stopped",spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
                                }
                                else if (cmd.cmd === "inject_on") {
                                    injectMode = true;
                                    injectQueue = [];
                                    wt(os, JSON.stringify({event:"inject_on"}));
                                    console.log("[VOICE] Inject mode ON");
                                }
                                else if (cmd.cmd === "inject_off") {
                                    injectMode = false;
                                    injectQueue = [];
                                    wt(os, JSON.stringify({event:"inject_off"}));
                                    console.log("[VOICE] Inject mode OFF");
                                }
                                else if (cmd.cmd === "status") {
                                    wt(os, JSON.stringify({event:"status",active:voiceActive,hooked:hooked,inject:injectMode,queue:injectQueue.length,spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
                                }
                            }
                        }
                    } catch(e) { console.log("[VOICE] Ended: "+e); }
                    finally {
                        voiceActive=false; clientOS=null; injectMode=false; injectQueue=[];
                        // Release the client socket; it was previously leaked
                        // on every disconnect and on the oversize-frame break.
                        if (c !== null) { try { c.close(); } catch(closeErr) {} }
                    }
                }
            }
        }
    });
    Java.use("java.lang.Thread").$new(Srv.$new()).start();
    console.log("[VOICE] Ready (v7 + inject)");
});

View File

@ -0,0 +1,181 @@
// voice_bridge_v8.js — Voice Bridge with Audio Injection (attach after voice chat opened)
//
// STARTUP ORDER:
// 1. Launch app: adb shell monkey -p com.antgroup.aijk.android ...
// 2. Open voice chat manually or via adb tap
// 3. Wait for libantaudio.so to load
// 4. Attach frida with this script
//
// Hook point: libantaudio.so MFAntAudio3AV2Filter::process(micIn, spkRef, out, size, &result)
// TCP :18901
// Frame: 4-byte len + 1-byte type + payload
// type 0: speaker/AI audio (spkRef, downstream to client)
// type 1: text/JSON command
// type 2: mic audio (micIn, downstream to client)
// type 3: inject audio (upstream from client, replaces micIn)
// Gate checked by the process() hook: nothing is captured or injected while false.
var voiceActive = false;
// Java OutputStream of the connected TCP client; the hook writes captured audio to it.
var clientOS = null;
// Frame and byte counters incremented by the hook.
var capturedSpk = 0, capturedMic = 0, spkBytes = 0, micBytes = 0;
// When true, the hook overwrites micIn with queued frames (or silence).
var injectMode = false;
// FIFO of ArrayBuffer PCM frames consumed by the hook, one per process() call.
var injectQueue = [];
// Write one frame to the Java OutputStream `os`: a 5-byte header
// (4-byte big-endian payload length + 1-byte type) followed by the Java byte
// array `jArr`. Write errors are deliberately ignored.
function wf(os, type, jArr) {
    try {
        var n = jArr.length;
        var header = Java.array("byte", [
            (n >> 24) & 0xFF,
            (n >> 16) & 0xFF,
            (n >> 8) & 0xFF,
            n & 0xFF,
            type
        ]);
        os.write(header);
        os.write(jArr);
        os.flush();
    } catch (e) {}
}
// Send `text` to `os` as a type-1 (text/JSON) frame, encoded as UTF-8.
function wt(os, text) {
    var JString = Java.use("java.lang.String");
    var utf8 = JString.$new(text).getBytes("UTF-8");
    wf(os, 1, utf8);
}
// === Hook libantaudio.so (should already be loaded) ===
// True once the process() export has been found and the interceptor attached.
var hooked = false;

// Attach to MFAntAudio3AV2Filter::process in libantaudio.so if the library is
// loaded. Returns immediately once hooked or while the module/export is not
// available.
function tryHook() {
    if (hooked) return;
    var lib = Process.findModuleByName("libantaudio.so");
    if (!lib) return;
    var target = lib.findExportByName("_ZN8antaudio20MFAntAudio3AV2Filter7processEPhS1_S1_iRi");
    if (!target) return;
    hooked = true;
    Interceptor.attach(target, {
        onEnter: function(args) {
            if (!voiceActive || !clientOS) return;
            // args[1] = micIn, args[2] = spkRef, args[4] = size (see file header).
            var size = args[4].toInt32();
            if (size <= 0) return;
            try {
                if (injectMode) {
                    // Choose what overwrites micIn: the queued frame as-is, a
                    // zero-padded/truncated copy of it, or pure silence.
                    var replacement;
                    if (injectQueue.length === 0) {
                        replacement = new ArrayBuffer(size);
                    } else {
                        var queued = injectQueue.shift();
                        if (queued.byteLength === size) {
                            replacement = queued;
                        } else {
                            replacement = new ArrayBuffer(size);
                            var n = Math.min(size, queued.byteLength);
                            new Uint8Array(replacement).set(new Uint8Array(queued, 0, n));
                        }
                    }
                    args[1].writeByteArray(replacement);
                }
                // Always capture speaker/AI output (type 0)
                var spkBytesArr = Array.from(new Uint8Array(args[2].readByteArray(size)));
                wf(clientOS, 0, Java.array("byte", spkBytesArr));
                capturedSpk++; spkBytes += size;
                // Forward mic audio (type 2) only when it is not being replaced
                if (!injectMode) {
                    var micBytesArr = Array.from(new Uint8Array(args[1].readByteArray(size)));
                    wf(clientOS, 2, Java.array("byte", micBytesArr));
                }
                capturedMic++; micBytes += size;
                if (capturedMic <= 3 || capturedMic % 500 === 0)
                    console.log("[VOICE] mic=" + capturedMic + " spk=" + capturedSpk + " inject=" + injectQueue.length);
            } catch(e) {}
        }
    });
    console.log("[VOICE] process hooked @ " + target);
}
// Hook immediately — lib should already be loaded since voice chat is open
tryHook();
if (!hooked) {
// Retry a few times in case of timing
[500, 1000, 2000, 5000].forEach(function(ms) { setTimeout(tryHook, ms); });
}
// === TCP Server ===
// TCP server on port 18901, run on a dedicated Java thread.
//
// Serves one client at a time. Each incoming frame is a 4-byte big-endian payload
// length, a 1-byte type and the payload:
//   type 3 -> payload is queued in injectQueue for the process() hook;
//   type 1 -> payload is UTF-8 JSON {"cmd": ...} with cmd one of
//             start / stop / inject_on / inject_off / status;
//   other  -> payload is read and discarded.
// When the client goes away (EOF, malformed frame, any exception) all bridge state
// is reset, the client socket is closed, and the server waits for the next client.
Java.perform(function() {
    var SS = Java.use("java.net.ServerSocket");
    var JS = Java.use("java.lang.String");
    // Upper bound on a single frame payload (1 MiB).
    var MAX_FRAME_LEN = 1048576;
    var server = SS.$new(18901);
    console.log("[VOICE] Listening :18901");
    var Srv = Java.registerClass({
        name: "com.antaf.voice.S8",
        implements: [Java.use("java.lang.Runnable")],
        methods: {
            run: function() {
                while (true) {
                    // Declared outside the try and reset per iteration so the finally
                    // block closes exactly the socket accepted in this iteration.
                    var c = null;
                    try {
                        console.log("[VOICE] Waiting for client...");
                        c = server.accept();
                        var is = c.getInputStream();
                        var os = c.getOutputStream();
                        clientOS = os;
                        console.log("[VOICE] Client connected");
                        wt(os, JSON.stringify({
                            event:"connected", protocol:"antaf-voice-v8",
                            hooked: hooked,
                            commands:["start","stop","status","inject_on","inject_off"],
                            audio:"pcm-16bit-960b-frames",
                            frameTypes:{0:"spk_ai",1:"text",2:"mic",3:"inject"}
                        }));
                        while (true) {
                            // 5-byte header: 4-byte big-endian length + 1-byte type.
                            var hb = [];
                            for (var i=0;i<5;i++) { var b=is.read(); if(b<0) throw "EOF"; hb.push(b); }
                            var fl=(hb[0]<<24)|(hb[1]<<16)|(hb[2]<<8)|hb[3], ft=hb[4];
                            // A set top bit makes the 32-bit length negative in JS; treat it
                            // like any other oversized frame instead of as an empty payload.
                            if (fl < 0 || fl > MAX_FRAME_LEN) {
                                console.log("[VOICE] Invalid frame length " + fl + ", dropping client");
                                break;
                            }
                            var pb = [];
                            for (var i=0;i<fl;i++) { var b=is.read(); if(b<0) throw "EOF"; pb.push(b&0xFF); }
                            if (ft === 3) {
                                // Inject audio: hand the payload to the hook as an ArrayBuffer.
                                var arr = new ArrayBuffer(pb.length);
                                var view = new Uint8Array(arr);
                                for (var j=0;j<pb.length;j++) view[j] = pb[j];
                                injectQueue.push(arr);
                            }
                            else if (ft === 1) {
                                var pl = Java.array("byte", pb);
                                var cmd = JSON.parse(JS.$new(pl,"UTF-8").toString());
                                console.log("[VOICE] Cmd: " + JSON.stringify(cmd));
                                if (cmd.cmd === "start") {
                                    voiceActive = true;
                                    capturedSpk=0;capturedMic=0;spkBytes=0;micBytes=0;
                                    injectQueue = [];
                                    wt(os, JSON.stringify({event:"started",hooked:hooked}));
                                }
                                else if (cmd.cmd === "stop") {
                                    voiceActive = false;
                                    injectMode = false;
                                    injectQueue = [];
                                    wt(os, JSON.stringify({event:"stopped",spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
                                }
                                else if (cmd.cmd === "inject_on") {
                                    injectMode = true;
                                    injectQueue = [];
                                    wt(os, JSON.stringify({event:"inject_on"}));
                                    console.log("[VOICE] Inject ON");
                                }
                                else if (cmd.cmd === "inject_off") {
                                    injectMode = false;
                                    injectQueue = [];
                                    wt(os, JSON.stringify({event:"inject_off"}));
                                    console.log("[VOICE] Inject OFF");
                                }
                                else if (cmd.cmd === "status") {
                                    wt(os, JSON.stringify({event:"status",active:voiceActive,hooked:hooked,inject:injectMode,queue:injectQueue.length,spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
                                }
                            }
                        }
                    } catch(e) { console.log("[VOICE] Client disconnected: "+e); }
                    finally {
                        // Stop the hook from using this client before the socket goes away.
                        voiceActive=false; clientOS=null; injectMode=false; injectQueue=[];
                        // Release the client socket; previously it was never closed, leaking
                        // one socket per connection.
                        if (c !== null) {
                            try { c.close(); } catch (closeErr) {}
                        }
                    }
                }
            }
        }
    });
    Java.use("java.lang.Thread").$new(Srv.$new()).start();
    console.log("[VOICE] Ready (v8, no open_voice)");
});

View File

@ -0,0 +1,335 @@
# 蚂蚁阿福接入小智 ESP32 — 实施方案
## 项目目标
将蚂蚁阿福 App 的 AI 能力接入小智 ESP32 硬件终端,用户通过 ESP32 设备语音对话,
后端对接蚂蚁阿福代替自建 LLM省去 GPU 资源(两张 RTX 3090 + Qwen3-32B
---
## 系统架构
### 方案A文字接入自定义 LLM Provider
```
ESP32 设备 PlugAI 服务端 手机
┌──────────┐ WebSocket ┌──────────────────┐ HTTP/SSE ┌──────────────┐
│ 麦克风 │ ──────────────→│ ASR (FunASR) │ │ 蚂蚁阿福 App │
│ 唤醒词 │ │ 语音→文字 │ │ + Frida 注入 │
│ AEC/NS │ │ │ GET /chat?q= │ │
│ │ │ AntafLLM Provider│──────────────→│ HTTP Bridge │
│ │ │ (新增) │←──────────────│ (port 18900) │
│ │ │ │ SSE 流式回答 │ │
│ 喇叭 │←───────────────│ TTS (EdgeTTS) │ │ │
│ │ WebSocket │ 文字→语音 │ │ │
└──────────┘ └──────────────────┘ └──────────────┘
```
**数据流**: ESP32 音频 → FunASR(语音转文字) → AntafLLM(文字发给阿福) → EdgeTTS(回答转语音) → ESP32 播放
### 方案B语音直通替代整个 ASR+LLM+TTS
```
ESP32 设备 PlugAI 服务端 手机
┌──────────┐ WebSocket ┌──────────────────┐ TCP 二进制 ┌──────────────┐
│ 麦克风 │ ──────────────→│ 音频转发模块(新增) │ │ 蚂蚁阿福 App │
│ │ │ Opus解码 │ PCM注入mic │ + Frida 注入 │
│ │ │ 重采样 24k→48k │──────────────→│ Voice Bridge │
│ │ │ │ PCM speaker │ (port 18901) │
│ 喇叭 │←───────────────│ 重采样 24k→24k │←──────────────│ libantaudio │
│ │ WebSocket │ Opus编码 │ │ │
└──────────┘ └──────────────────┘ └──────────────┘
```
**数据流**: ESP32 音频 → 解码+重采样 → 注入阿福麦克风 → 阿福完整处理(ASR+LLM+TTS) → 捕获音频 → 编码 → ESP32 播放
---
## 可行性评估
| 维度 | 方案A (文字接入) | 方案B (语音直通) |
|------|-----------------|-----------------|
| 可行性 | **高** | **中低** |
| 实现难度 | 低 (1个Python文件) | 高 (改JS+写转发模块) |
| 改动范围 | 新增 LLM Provider + 改配置 | 改 voice_bridge.js + 新增转发模块 |
| 延迟 | 中 (ASR+网络+TTS 各一轮) | 低 (音频直通) |
| 音质 | EdgeTTS (微软高质量) | 阿福原生 TTS |
| GPU 依赖 | 无 (省掉 Qwen3-32B) | 无 |
| 手机依赖 | 需要 (App+Frida+adb) | 需要 (App+Frida+adb) |
| 核心风险 | 低 | **voice_bridge 当前不支持音频注入** |
**结论**: 先实施方案A验证通过后再做方案B。
---
## 方案A 详细实施
### 前置条件
| 组件 | 状态 | 说明 |
|------|------|------|
| ESP32 设备 | 已就绪 | 固件已烧录WiFi+服务端已配置 |
| 小智服务端 | 已就绪 | ws://14.18.247.51:8010 运行中 |
| ASR (FunASR) | 已就绪 | CPU 模式 |
| TTS (EdgeTTS) | 已就绪 | 微软免费 |
| 蚂蚁阿福 HTTP Bridge | 已就绪 | http_bridge_stream.js (port 18900) |
| Frida + 手机 | 需部署 | 手机需连到服务端可达的网络 |
### 第1步创建 AntafLLM Provider
文件路径:`backend/main/xiaozhi-server/core/providers/llm/antaf/antaf.py`
```python
import requests
from config.logger import setup_logging
from core.providers.llm.base import LLMProviderBase
TAG = __name__
logger = setup_logging()
class LLMProvider(LLMProviderBase):
"""
蚂蚁阿福 LLM Provider
通过 Frida HTTP Bridge (port 18900) 对接蚂蚁阿福 App 的文字对话 API。
Bridge 运行在手机上,通过 adb forward 或网络暴露 SSE 流式接口。
"""
def __init__(self, config):
self.bridge_url = config.get("bridge_url", "http://127.0.0.1:18900")
self.timeout = config.get("timeout", 60)
logger.bind(tag=TAG).info(
f"AntafLLM 初始化: bridge={self.bridge_url}, timeout={self.timeout}s"
)
def response(self, session_id, dialogue, **kwargs):
"""
流式返回蚂蚁阿福的回答。
1. 从 dialogue 取最后一条用户消息
2. GET {bridge_url}/chat?q={query}
3. 解析 SSE 流yield 每个 delta 文本
"""
# 提取最后一条用户消息
query = ""
for msg in reversed(dialogue):
if msg.get("role") == "user":
query = msg.get("content", "")
break
if not query:
logger.bind(tag=TAG).warning("对话中没有用户消息")
yield "抱歉,我没有收到您的问题。"
return
logger.bind(tag=TAG).info(f"AntafLLM 请求: {query[:50]}...")
try:
url = f"{self.bridge_url}/chat"
resp = requests.get(
url,
params={"q": query},
stream=True,
timeout=self.timeout
)
resp.encoding = "utf-8"
for line in resp.iter_lines(decode_unicode=True):
if not line:
continue
if line.startswith("data: "):
data = line[6:] # 去掉 "data: " 前缀
if data == "[DONE]":
break
if data and len(data.strip()) > 0:
yield data
except requests.exceptions.ConnectionError:
logger.bind(tag=TAG).error("无法连接蚂蚁阿福 Bridge，请检查手机和 Frida 状态")
yield "抱歉,蚂蚁阿福服务暂时不可用。"
except requests.exceptions.Timeout:
logger.bind(tag=TAG).error(f"蚂蚁阿福 Bridge 超时 ({self.timeout}s)")
yield "抱歉,回答超时了。"
except Exception as e:
logger.bind(tag=TAG).error(f"AntafLLM 异常: {e}")
yield "抱歉,发生了错误。"
```
### 第2步修改服务端配置
编辑 `backend/main/xiaozhi-server/data/.config.yaml`
```yaml
selected_module:
LLM: antaf # 改为蚂蚁阿福
LLM:
antaf:
type: antaf
bridge_url: http://<手机IP>:18900 # 手机的 HTTP Bridge 地址
timeout: 60 # SSE 流超时时间
```
也可以保留原来的 Qwen3 配置,方便切换:
```yaml
LLM:
antaf:
type: antaf
bridge_url: http://<手机IP>:18900
timeout: 60
Qwen3:
type: openai
model_name: Qwen3-32B
url: http://127.0.0.1:30000/v1
api_key: EMPTY
```
### 第3步网络打通
手机的 Frida Bridge 端口需要让 PlugAI 服务器能访问到。有两种方式：
#### 方式1手机直连局域网推荐
如果手机和 PlugAI 服务器在同一网络(或手机有公网可达 IP
```bash
# 手机上启动 bridge 后,服务端直接访问
# bridge_url: http://<手机内网IP>:18900
curl http://<手机IP>:18900/chat?q=hello
```
#### 方式2adb forward + SSH 隧道
手机通过 USB 连接一台中间机器，再通过 SSH 隧道暴露：
```bash
# 中间机器上
adb forward tcp:18900 tcp:18900
# PlugAI 上建 SSH 隧道
ssh -L 18900:127.0.0.1:18900 user@中间机器IP
# bridge_url: http://127.0.0.1:18900
```
### 第4步启动与测试
```bash
# 1. 手机端:启动 Frida + HTTP Bridge
frida -U -p <PID> -l http_bridge_stream.js
# 2. 先测 bridge 连通性
curl -N 'http://<手机IP>:18900/chat?q=你好'
# 3. PlugAI 服务端:重启小智服务
cd /home/ZeroStack/xiaozhi/xiaozhi-esp32-server/main/xiaozhi-server
source /home/ZeroStack/xiaozhi/venv/bin/activate
python app.py
# 4. ESP32 设备:唤醒测试
# 说 "你好小智" → 提问 → 应该听到蚂蚁阿福的回答EdgeTTS 合成的语音)
```
---
## 方案B 详细实施(后续)
### 核心改造voice_bridge.js 支持音频注入
当前 voice_bridge.js 的 `MFAntAudio3AV2Filter::process` hook 只**读取** micIn 缓冲区。
需要改造为可以从外部**写入** micIn 缓冲区,替换真实麦克风输入。
#### 改造要点
```javascript
// voice_bridge.js 新增功能
var injectBuffer = null; // 外部注入的 PCM 数据
// 新增 inject 命令:接收外部 PCM 音频帧
// 客户端发送: [4字节长度][type=3][960字节PCM数据]
// type 3 = inject audio
Interceptor.attach(processAddr, {
onEnter: function(args) {
var micIn = args[1]; // 麦克风输入缓冲区 (960 bytes)
var frameSize = args[4]; // 960
if (injectBuffer !== null) {
// 用注入数据覆盖真实麦克风输入
micIn.writeByteArray(injectBuffer);
injectBuffer = null;
}
}
});
```
#### 采样率转换
| 来源 | 格式 | 需转换为 |
|------|------|---------|
| ESP32 → 服务端 | Opus 24kHz mono | PCM 48kHz mono (阿福 mic) |
| 阿福 speaker 输出 | PCM 24kHz stereo | Opus 24kHz mono (ESP32) |
服务端需要:
- libopus 解码/编码
- resampy 或 scipy 做采样率转换
- 960字节帧对齐20ms @ 48kHz
#### 新增音频转发模块
文件：`backend/main/xiaozhi-server/core/providers/asr/antaf_voice/antaf_voice.py`
这是一个特殊的 ASR Provider它不做语音识别而是
1. 接收 ESP32 的 Opus 音频流
2. 解码为 PCM重采样 24k→48k
3. 通过 TCP 发送到 voice_bridge (port 18901) 的 inject 命令
4. 接收 voice_bridge 的 speaker 输出
5. 重采样 24k stereo → 24k monoOpus 编码
6. 直接发回 ESP32跳过 LLM 和 TTS
#### 方案B 风险点
1. **帧时序同步**: ESP32 音频帧和阿福 process() 调用频率可能不一致
2. **延迟累积**: 网络传输 + 两次重采样 + 注入延迟
3. **VAD 冲突**: 阿福自带 VAD 可能与注入音频不匹配
4. **回声消除失效**: 注入 mic 数据后，阿福的 AEC 参考信号（spkRef）对不上
5. **对话控制**: 何时 open_voice / close_voice 需要与 ESP32 唤醒状态同步
---
## 依赖清单
### 方案A新增依赖
- `requests` — Python HTTP 库(服务端 venv 中应已有)
### 方案B新增依赖
- `opuslib` 或 `pyogg` — Opus 编解码
- `resampy` 或 `scipy.signal` — 采样率转换
- `numpy` — 音频数据处理
---
## 文件清单
### 方案A
| 操作 | 文件 |
|------|------|
| 新增 | `backend/main/xiaozhi-server/core/providers/llm/antaf/__init__.py` |
| 新增 | `backend/main/xiaozhi-server/core/providers/llm/antaf/antaf.py` |
| 修改 | `backend/main/xiaozhi-server/data/.config.yaml` |
### 方案B额外
| 操作 | 文件 |
|------|------|
| 修改 | `antaf/voice_bridge.js` (新增 inject 命令) |
| 新增 | `backend/main/xiaozhi-server/core/providers/asr/antaf_voice/antaf_voice.py` |
| 新增 | `backend/main/xiaozhi-server/core/utils/audio_resample.py` |
---
## 里程碑
| 阶段 | 目标 | 预期产出 |
|------|------|---------|
| M1 | 方案A 代码实现 | AntafLLM Provider + 配置 |
| M2 | 网络打通 | PlugAI ↔ 手机 Bridge 连通 |
| M3 | 端到端测试 | ESP32 唤醒→阿福回答→语音播报 |
| M4 | 方案B 原型 | voice_bridge 音频注入验证 |
| M5 | 方案B 集成 | 全语音直通链路 |

79
modules/tts/sherpa_tts.py Normal file
View File

@ -0,0 +1,79 @@
import io
import os
import wave
import asyncio
import numpy as np
import sherpa_onnx
from config.logger import setup_logging
from core.providers.tts.base import TTSProviderBase
TAG = __name__
logger = setup_logging()
class TTSProvider(TTSProviderBase):
    """Offline text-to-speech provider backed by a local sherpa-onnx VITS model.

    Synthesis output is always delivered as 24 kHz mono 16-bit WAV data.
    """

    def __init__(self, config, delete_audio_file):
        """Load the VITS model found under ``config["model_dir"]``.

        Args:
            config: Mapping with optional keys ``model_dir`` (default
                ``"models/vits-melo-tts-zh_en"``), ``speed`` (default 1.0; a
                falsy value also falls back to 1.0), ``sid`` (default 0) and
                ``num_threads`` (default 8).
            delete_audio_file: Forwarded unchanged to the base-class constructor.
        """
        import threading

        super().__init__(config, delete_audio_file)
        model_dir = config.get("model_dir", "models/vits-melo-tts-zh_en")
        speed = config.get("speed", 1.0)
        self.speed = float(speed) if speed else 1.0
        self.sid = int(config.get("sid", 0))

        # Prefer the int8-quantized model (faster); fall back to the full model
        # when the quantized file is missing or smaller than 1 KiB.
        model_file = f"{model_dir}/model.int8.onnx"
        if not os.path.exists(model_file) or os.path.getsize(model_file) < 1024:
            model_file = f"{model_dir}/model.onnx"

        num_threads = int(config.get("num_threads", 8))
        tts_config = sherpa_onnx.OfflineTtsConfig(
            model=sherpa_onnx.OfflineTtsModelConfig(
                vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                    model=model_file,
                    lexicon=f"{model_dir}/lexicon.txt",
                    tokens=f"{model_dir}/tokens.txt",
                    dict_dir=f"{model_dir}/dict",
                ),
                num_threads=num_threads,
            ),
            rule_fsts=f"{model_dir}/date.fst,{model_dir}/phone.fst,{model_dir}/number.fst,{model_dir}/new_heteronym.fst",
            max_num_sentences=1,
        )
        self.tts = sherpa_onnx.OfflineTts(tts_config)
        self.sample_rate = self.tts.sample_rate
        # Synthesis now runs in executor threads (see text_to_speak). This lock
        # keeps calls into the shared model one-at-a-time, as they were when
        # synthesis ran inline on the event loop.
        self._synth_lock = threading.Lock()
        logger.bind(tag=TAG).info(
            f"SherpaOnnxTTS 初始化完成: model_dir={model_dir}, sample_rate={self.sample_rate}, sid={self.sid}"
        )

    def _generate_wav(self, text):
        """Synthesize ``text`` and return it as 24 kHz mono 16-bit WAV bytes.

        Synchronous and CPU-bound; ``text_to_speak`` calls it from an executor
        thread so the event loop stays responsive.
        """
        from scipy.signal import resample_poly
        from math import gcd

        with self._synth_lock:
            audio = self.tts.generate(text, sid=self.sid, speed=self.speed)
        samples = np.array(audio.samples, dtype=np.float32)

        # Resample to the target rate (per the original note: the device
        # requires 24000 Hz, the model outputs 44100 Hz). Skipped for empty
        # output, where there is nothing to resample.
        target_sr = 24000
        if self.sample_rate != target_sr and samples.size:
            g = gcd(self.sample_rate, target_sr)
            samples = resample_poly(samples, target_sr // g, self.sample_rate // g)

        # Clamp to [-1, 1] before scaling: out-of-range samples would otherwise
        # wrap around when cast to int16.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

        wav_io = io.BytesIO()
        with wave.open(wav_io, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(target_sr)
            wf.writeframes(pcm.tobytes())
        return wav_io.getvalue()

    async def text_to_speak(self, text, output_file):
        """Synthesize ``text`` without blocking the event loop.

        Args:
            text: Text to synthesize.
            output_file: Path to write the WAV data to; when falsy the data is
                returned instead.

        Returns:
            The WAV bytes when ``output_file`` is falsy, otherwise ``None``.
        """
        loop = asyncio.get_running_loop()
        wav_data = await loop.run_in_executor(None, self._generate_wav, text)
        if output_file:
            with open(output_file, "wb") as f:
                f.write(wav_data)
            return None
        return wav_data