add independent modules (not integrated into framework)
- modules/antaf/ — Antaf LLM provider, voice passthrough, bridge scripts
- modules/tts/ — sherpa-onnx local TTS provider
- modules/docs/ — integration plan

These are standalone files, NOT patched into the xiaozhi-server framework.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
ae260da3eb
commit
a88e7072b3
|
|
@ -0,0 +1,143 @@
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
from config.logger import setup_logging
|
||||||
|
from core.providers.llm.base import LLMProviderBase
|
||||||
|
|
||||||
|
TAG = __name__
|
||||||
|
logger = setup_logging()
|
||||||
|
|
||||||
|
|
||||||
|
class LLMProvider(LLMProviderBase):
    """Antaf (蚂蚁阿福) LLM provider.

    Sends the latest genuine user utterance to the Antaf app's text-chat API
    through a Frida HTTP bridge (default ``http://127.0.0.1:18900``) and
    streams the reply back chunk by chunk. The bridge runs on the phone and
    exposes an SSE endpoint, reached via ``adb forward`` or the network.
    """

    # Substrings that mark a chunk as Antaf's internal reasoning; such chunks
    # must not be forwarded to the user.
    _THINKING_PATTERNS = (
        "用户问", "用户说", "用户的", "用户可能", "用户真正",
        "我得", "我会", "我在想", "我决定", "我要",
        "语气比较", "感觉他", "让他知道", "让他觉得",
        "先安抚", "得先", "不想表现",
        "整体语气", "这样能", "这样他",
        "所以我", "还带了个",
    )

    # Internal status text that Antaf mixes into its replies.
    _JUNK_FRAGMENTS = ("完成资料引用", "内容生成", "正在思考", "正在搜索")

    # Substrings that identify a "user" message as injected by the system.
    _SYSTEM_MARKERS = (
        "[系统提示]", "tool_call", "<tool_call>", "TOOL USE",
        "系统提示", "工具调用", "function_call",
        "handle_exit_intent", "你有以下工具", "You have access",
    )

    # "user" messages longer than this are treated as system-injected.
    _MAX_USER_CHARS = 200

    def __init__(self, config):
        """Read ``bridge_url`` and ``timeout`` (seconds) from *config*."""
        self.bridge_url = config.get("bridge_url", "http://127.0.0.1:18900")
        self.timeout = config.get("timeout", 60)
        self.should_idle = False  # signal to send system idle after TTS
        logger.bind(tag=TAG).info(
            f"AntafLLM 初始化: bridge={self.bridge_url}, timeout={self.timeout}s"
        )

    @staticmethod
    def _is_thinking(text):
        """Return True if *text* contains any internal-reasoning marker."""
        return any(p in text for p in LLMProvider._THINKING_PATTERNS)

    @staticmethod
    def _clean_text(text):
        """Remove Antaf status fragments from *text* and strip whitespace."""
        for fragment in LLMProvider._JUNK_FRAGMENTS:
            text = text.replace(fragment, "")
        return text.strip()

    @staticmethod
    def _is_system_injected(content):
        """Return True if *content* looks system-injected rather than user input.

        Empty content, content containing a known marker, and content longer
        than ``_MAX_USER_CHARS`` all count as injected.
        """
        if not content:
            return True
        if any(m in content for m in LLMProvider._SYSTEM_MARKERS):
            return True
        return len(content) > LLMProvider._MAX_USER_CHARS

    def _extract_query(self, dialogue):
        """Return the newest genuine user message in *dialogue*, or ``""``.

        The ASR result may be JSON such as
        ``{"content": "...", "language": "zh", "emotion": "..."}``; in that
        case only the ``content`` field is used.
        """
        for msg in reversed(dialogue):
            if msg.get("role") != "user":
                continue
            content = msg.get("content", "")
            if self._is_system_injected(content):
                continue
            try:
                parsed = json.loads(content)
            except (json.JSONDecodeError, TypeError):
                return content
            if isinstance(parsed, dict) and "content" in parsed:
                return parsed["content"]
            return content
        return ""

    def response(self, session_id, dialogue, **kwargs):
        """Yield cleaned reply chunks for the latest user message.

        This is a generator. On failure it yields one apology sentence and
        sets ``should_idle`` to True; it does not raise for connection
        errors, timeouts, or HTTP error statuses.
        """
        query = self._extract_query(dialogue)
        if not query:
            logger.bind(tag=TAG).warning("对话中没有用户消息")
            yield "抱歉,我没有收到您的问题。"
            return

        # Ask for a short answer so long replies do not stall the TTS queue.
        query = query + "(请用2-3句话简短回答)"
        self.should_idle = False
        logger.bind(tag=TAG).info(f"AntafLLM 请求: {query[:50]}...")

        try:
            url = f"{self.bridge_url}/chat"
            # The context manager releases the streamed connection even when
            # the consumer stops iterating early or the stream ends on [DONE].
            with requests.get(
                url,
                params={"q": query},
                stream=True,
                timeout=self.timeout,
            ) as resp:
                # Surface 4xx/5xx instead of silently yielding nothing.
                resp.raise_for_status()
                resp.encoding = "utf-8"

                seen_texts = set()
                for line in resp.iter_lines(decode_unicode=True):
                    if not line or not line.startswith("data: "):
                        continue
                    data = line[6:]
                    if data == "[DONE]":
                        break
                    if not data.strip():
                        continue
                    # De-duplicate: skip chunks identical to an earlier one.
                    if data in seen_texts:
                        continue
                    seen_texts.add(data)
                    if self._is_thinking(data):
                        logger.bind(tag=TAG).debug(f"过滤思考内容: {data[:50]}...")
                        continue
                    data = self._clean_text(data)
                    if not data:
                        continue
                    yield data

        except requests.exceptions.ConnectionError:
            logger.bind(tag=TAG).error("无法连接蚂蚁阿福 Bridge,请检查手机和 Frida 状态")
            self.should_idle = True
            yield "抱歉,蚂蚁阿福服务暂时不可用。"
        except requests.exceptions.Timeout:
            logger.bind(tag=TAG).error(f"蚂蚁阿福 Bridge 超时 ({self.timeout}s)")
            self.should_idle = True
            yield "抱歉,回答超时了。"
        except Exception as e:
            logger.bind(tag=TAG).error(f"AntafLLM 异常: {e}")
            self.should_idle = True
            yield "抱歉,发生了错误。"
|
||||||
|
|
@ -0,0 +1,251 @@
|
||||||
|
"""
|
||||||
|
Antaf Voice Passthrough ASR Provider
|
||||||
|
|
||||||
|
Replaces ASR→LLM→TTS pipeline with direct audio forwarding to Antaf voice_bridge.
|
||||||
|
ESP32 audio → decode Opus → resample 16kHz→48kHz → inject to voice_bridge (type=3)
|
||||||
|
voice_bridge speaker (type=0) → resample 48kHz→16kHz → encode Opus → send to ESP32
|
||||||
|
|
||||||
|
Runs within xiaozhi-server, keeping all protocol handling (hello, OTA, wake word) intact.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import struct
|
||||||
|
import asyncio
|
||||||
|
import threading
|
||||||
|
import numpy as np
|
||||||
|
import opuslib_next
|
||||||
|
from scipy.signal import resample_poly
|
||||||
|
from math import gcd
|
||||||
|
from config.logger import setup_logging
|
||||||
|
from core.providers.asr.base import ASRProviderBase
|
||||||
|
from core.handle.sendAudioHandle import send_tts_message
|
||||||
|
|
||||||
|
TAG = __name__
|
||||||
|
logger = setup_logging()
|
||||||
|
|
||||||
|
# Audio parameters
ESP_SR = 16000  # sample rate used for the ESP32-side Opus codec
ESP_FRAME_SAMPLES = 960  # 60ms at 16kHz
BRIDGE_SR = 48000  # sample rate used on the voice_bridge side
BRIDGE_FRAME_SAMPLES = 480  # 960 bytes / 2 = 480 samples

# Resample ratios as (up, down) pairs reduced by the rates' common divisor
UP = tuple(rate // gcd(ESP_SR, BRIDGE_SR) for rate in (BRIDGE_SR, ESP_SR))  # (3, 1)
DOWN = UP[::-1]  # (1, 3)
|
||||||
|
|
||||||
|
|
||||||
|
class ASRProvider(ASRProviderBase):
    """Voice-passthrough provider that forwards audio to the Antaf voice bridge.

    Instead of producing text, this provider decodes Opus frames taken from
    ``conn.asr_audio_queue``, resamples them from ``ESP_SR`` to ``BRIDGE_SR``
    and injects them into the bridge as type-3 frames. Speaker audio coming
    back from the bridge (type-0 frames) is resampled to ``ESP_SR``,
    Opus-encoded and sent over ``conn.websocket``.

    Bridge wire format: 4-byte big-endian payload length, 1-byte frame type,
    then the payload.
    """

    # Peak absolute sample value below which a speaker frame counts as silent.
    _SILENCE_AMPLITUDE = 10
    # Consecutive silent speaker frames tolerated before "tts stop" is sent.
    # NOTE(review): the wall-clock duration depends on the bridge's frame
    # size and sample rate, which this file does not establish.
    _SILENCE_FRAMES_BEFORE_STOP = 50

    def __init__(self, config):
        """Read ``bridge_host`` / ``bridge_port`` from *config* and reset state."""
        super().__init__()
        self.bridge_host = config.get("bridge_host", "127.0.0.1")
        self.bridge_port = int(config.get("bridge_port", 18901))
        self.interface_type = "NON_STREAM"
        self.conn = None
        self.bridge_reader = None
        self.bridge_writer = None
        self.opus_decoder = None
        self.opus_encoder = None
        self._inject_buf = np.array([], dtype=np.int16)
        self._speaker_buf = np.array([], dtype=np.int16)
        self._tts_started = False
        self._silence_count = 0
        # Event loop that owns the bridge streams; set in open_audio_channels.
        self._loop = None
        self._recv_task = None
        self._connected = False
        logger.bind(tag=TAG).info(
            f"AntafPassthrough 初始化: bridge={self.bridge_host}:{self.bridge_port}"
        )

    async def open_audio_channels(self, conn):
        """Connect to the bridge and start passthrough instead of normal ASR.

        Any previous bridge connection is closed first. A failed connection
        or handshake is logged and leaves the provider disconnected; the
        audio thread is started either way and then discards frames.
        """
        # Clean up previous connection if any
        await self.close()

        self.conn = conn
        # Remembered so the audio thread can hand writes back to this loop.
        self._loop = asyncio.get_running_loop()
        self.opus_decoder = opuslib_next.Decoder(ESP_SR, 1)
        self.opus_encoder = opuslib_next.Encoder(ESP_SR, 1, opuslib_next.APPLICATION_AUDIO)
        self._tts_started = False
        self._silence_count = 0
        self._inject_buf = np.array([], dtype=np.int16)
        self._speaker_buf = np.array([], dtype=np.int16)

        # Connect to voice_bridge
        try:
            self.bridge_reader, self.bridge_writer = await asyncio.open_connection(
                self.bridge_host, self.bridge_port
            )
            # Read connected event
            ftype, data = await self._bridge_recv()
            if ftype == 1:
                msg = json.loads(data.decode())
                logger.bind(tag=TAG).info(f"Bridge connected: {msg.get('protocol')}")

            # Send start + inject_on, reading one reply frame after each
            for command in ("start", "inject_on"):
                self._bridge_send_cmd({"cmd": command})
                ftype, data = await self._bridge_recv()
                if ftype == 1:
                    logger.bind(tag=TAG).info(f"Bridge: {json.loads(data.decode())}")

            self._connected = True
            logger.bind(tag=TAG).info("Voice bridge ready (inject mode)")

            # Start speaker receive loop
            self._recv_task = asyncio.create_task(self._speaker_recv_loop())

        except Exception as e:
            logger.bind(tag=TAG).error(f"Bridge connection failed: {e}")
            self._connected = False
            # Do not leave a half-initialised socket open.
            self._drop_bridge_connection()

        # Start normal audio processing thread (reads from asr_audio_queue)
        conn.asr_priority_thread = threading.Thread(
            target=self._audio_thread, args=(conn,), daemon=True
        )
        conn.asr_priority_thread.start()

    def _audio_thread(self, conn):
        """Read Opus frames from the queue, decode, resample, inject to bridge.

        Runs in a worker thread until ``conn.stop_event`` is set. Frames that
        arrive while the bridge is not connected are dropped.
        """
        import queue as queue_module

        frame_count = 0
        while not conn.stop_event.is_set():
            try:
                opus_data = conn.asr_audio_queue.get(timeout=1)
                if not self._connected:
                    continue

                frame_count += 1
                if frame_count <= 3 or frame_count % 200 == 0:
                    logger.bind(tag=TAG).debug(f"Audio frame #{frame_count}")

                # Decode Opus → PCM at ESP_SR
                pcm = self.opus_decoder.decode(opus_data, ESP_FRAME_SAMPLES)
                samples = np.frombuffer(pcm, dtype=np.int16)

                # Resample ESP_SR → BRIDGE_SR
                upsampled = self._resample_int16(samples, UP[0], UP[1])

                # Split into bridge frames and inject
                self._inject_buf = np.concatenate([self._inject_buf, upsampled])
                while len(self._inject_buf) >= BRIDGE_FRAME_SAMPLES:
                    frame = self._inject_buf[:BRIDGE_FRAME_SAMPLES]
                    self._inject_buf = self._inject_buf[BRIDGE_FRAME_SAMPLES:]
                    self._bridge_send_inject(frame.tobytes())

            except queue_module.Empty:
                continue
            except Exception as e:
                logger.bind(tag=TAG).error(f"Audio thread error: {e}")

    async def _speaker_recv_loop(self):
        """Receive frames from the bridge until disconnected.

        Type-0 frames are speaker audio and go to ``_handle_speaker``;
        type-1 frames are JSON events and are only logged.
        """
        try:
            while self._connected:
                ftype, data = await self._bridge_recv()
                if ftype == 0:
                    # Speaker audio
                    await self._handle_speaker(data)
                elif ftype == 1:
                    msg = json.loads(data.decode())
                    logger.bind(tag=TAG).debug(f"Bridge event: {msg}")
        except asyncio.IncompleteReadError:
            logger.bind(tag=TAG).warning("Bridge connection closed")
        except Exception as e:
            logger.bind(tag=TAG).error(f"Speaker recv error: {e}")
        finally:
            self._connected = False

    async def _handle_speaker(self, pcm_bytes):
        """Process one speaker frame and send it to the ESP32.

        Silent frames are not forwarded; after more than
        ``_SILENCE_FRAMES_BEFORE_STOP`` of them in a row a "tts stop" message
        is sent. The first audible frame triggers "tts start".
        """
        if not self.conn or not self.conn.websocket:
            return

        # Drop a trailing odd byte so a malformed frame cannot raise in
        # frombuffer and end the receive loop.
        usable = len(pcm_bytes) - (len(pcm_bytes) % 2)
        samples = np.frombuffer(pcm_bytes[:usable], dtype=np.int16)
        if samples.size == 0:
            return
        # Widen first: abs(int16(-32768)) overflows in int16.
        max_amp = int(np.abs(samples.astype(np.int32)).max())

        # Track silence for tts stop
        if max_amp < self._SILENCE_AMPLITUDE:
            if self._tts_started:
                self._silence_count += 1
                if self._silence_count > self._SILENCE_FRAMES_BEFORE_STOP:
                    try:
                        await send_tts_message(self.conn, "stop")
                        self.conn.client_is_speaking = False
                        self._tts_started = False
                        self._silence_count = 0
                        logger.bind(tag=TAG).info("Sent tts stop to ESP32")
                    except Exception as e:
                        logger.bind(tag=TAG).error(f"Send tts stop error: {e}")
            return

        # Reset silence counter on non-silent frame
        self._silence_count = 0

        # Send tts start before first audio
        if not self._tts_started:
            try:
                await send_tts_message(self.conn, "start")
                self._tts_started = True
                self.conn.client_is_speaking = True
                logger.bind(tag=TAG).info("Sent tts start to ESP32")
            except Exception as e:
                logger.bind(tag=TAG).error(f"Send tts start error: {e}")
                return

        # Resample BRIDGE_SR → ESP_SR
        downsampled = self._resample_int16(samples, DOWN[0], DOWN[1])

        # Accumulate and encode one full ESP frame at a time
        self._speaker_buf = np.concatenate([self._speaker_buf, downsampled])
        while len(self._speaker_buf) >= ESP_FRAME_SAMPLES:
            frame = self._speaker_buf[:ESP_FRAME_SAMPLES]
            self._speaker_buf = self._speaker_buf[ESP_FRAME_SAMPLES:]
            opus_data = self.opus_encoder.encode(frame.tobytes(), ESP_FRAME_SAMPLES)
            try:
                await self.conn.websocket.send(opus_data)
            except Exception as e:
                logger.bind(tag=TAG).error(f"Send opus to ESP32 error: {e}")
                return

    @staticmethod
    def _resample_int16(samples, up, down):
        """Resample *samples* by ``up/down`` and return saturated int16.

        Values are clipped to the int16 range before the cast so filter
        overshoot saturates instead of wrapping around.
        """
        resampled = resample_poly(samples, up, down)
        return np.clip(resampled, -32768, 32767).astype(np.int16)

    # Bridge TCP helpers
    async def _bridge_recv(self):
        """Read one frame from the bridge and return ``(frame_type, payload)``."""
        header = await self.bridge_reader.readexactly(5)
        length = struct.unpack(">I", header[:4])[0]
        ftype = header[4]
        data = await self.bridge_reader.readexactly(length)
        return ftype, data

    @staticmethod
    def _write_if_open(writer, frame):
        """Write *frame* unless *writer* is already closing."""
        if not writer.is_closing():
            writer.write(frame)

    def _bridge_write(self, frame):
        """Queue *frame* on the bridge writer from any thread.

        asyncio streams are not thread-safe, so a call made outside the
        owning event loop is handed over with ``call_soon_threadsafe``.
        Nothing is written when there is no bridge connection.
        """
        writer = self.bridge_writer
        if writer is None:
            return
        try:
            running = asyncio.get_running_loop()
        except RuntimeError:
            running = None  # called from a plain thread
        if self._loop is None or running is self._loop:
            self._write_if_open(writer, frame)
        else:
            self._loop.call_soon_threadsafe(self._write_if_open, writer, frame)

    def _bridge_send_cmd(self, cmd):
        """Send *cmd* (a JSON-serialisable object) as a type-1 frame."""
        data = json.dumps(cmd).encode()
        self._bridge_write(struct.pack(">IB", len(data), 1) + data)

    def _bridge_send_inject(self, pcm_bytes):
        """Send raw PCM bytes as a type-3 (inject) frame."""
        self._bridge_write(struct.pack(">IB", len(pcm_bytes), 3) + pcm_bytes)

    def _drop_bridge_connection(self):
        """Close the bridge writer, if any, and forget both stream objects."""
        writer, self.bridge_writer = self.bridge_writer, None
        self.bridge_reader = None
        if writer is None:
            return
        try:
            writer.close()
        except Exception as e:
            logger.bind(tag=TAG).debug(f"Bridge writer close failed: {e}")

    # ASR interface — never returns text, LLM/TTS never triggered
    async def receive_audio(self, conn, audio, audio_have_voice):
        """No-op: audio is handled by _audio_thread directly from queue."""
        pass

    async def speech_to_text(self, opus_data, session_id, audio_format="opus", artifacts=None):
        """Never called in passthrough mode; returns an empty result."""
        return "", None

    async def close(self):
        """Stop the receive loop and shut the bridge connection down.

        Safe to call repeatedly and before any connection was opened.
        """
        self._connected = False
        task, self._recv_task = self._recv_task, None
        if task:
            task.cancel()
        if self.bridge_writer:
            try:
                self._bridge_send_cmd({"cmd": "inject_off"})
                self._bridge_send_cmd({"cmd": "stop"})
            except Exception as e:
                # Best effort: the peer may already be gone.
                logger.bind(tag=TAG).debug(f"Bridge shutdown commands failed: {e}")
            self._drop_bridge_connection()
|
||||||
|
|
@ -0,0 +1,307 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
ESP32 ↔ Antaf Voice Relay
|
||||||
|
Bridges ESP32 (WebSocket/Opus) with Antaf voice_bridge (TCP/PCM).
|
||||||
|
|
||||||
|
ESP32 → Opus decode → resample 16kHz→48kHz → voice_bridge inject (type=3)
|
||||||
|
ESP32 ← Opus encode ← resample 48kHz→16kHz ← voice_bridge speaker (type=0)
|
||||||
|
|
||||||
|
Usage: python relay.py [--ws-port 8010] [--bridge-host 127.0.0.1] [--bridge-port 18901]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import struct
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import numpy as np
|
||||||
|
from scipy.signal import resample_poly
|
||||||
|
from math import gcd
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||||
|
log = logging.getLogger("relay")
|
||||||
|
|
||||||
|
try:
|
||||||
|
import opuslib_next as opuslib
|
||||||
|
except ImportError:
|
||||||
|
import opuslib
|
||||||
|
|
||||||
|
import websockets
|
||||||
|
|
||||||
|
# Audio parameters
ESP_SAMPLE_RATE = 16000  # ESP32 Opus sample rate
ESP_FRAME_MS = 60  # ESP32 frame duration
ESP_FRAME_SIZE = (ESP_SAMPLE_RATE * ESP_FRAME_MS) // 1000  # 960 samples

BRIDGE_SAMPLE_RATE = 48000  # voice_bridge micIn sample rate
BRIDGE_FRAME_BYTES = 960  # 480 samples * 2 bytes
BRIDGE_FRAME_SAMPLES = 480

# Resampling ratios, each an (up, down) pair reduced by the common divisor
UP_GCD = gcd(BRIDGE_SAMPLE_RATE, ESP_SAMPLE_RATE)  # 16000 → 48000
UP_RATIO = tuple(rate // UP_GCD for rate in (BRIDGE_SAMPLE_RATE, ESP_SAMPLE_RATE))  # (3, 1)
DOWN_GCD = gcd(ESP_SAMPLE_RATE, BRIDGE_SAMPLE_RATE)  # 48000 → 16000
DOWN_RATIO = tuple(rate // DOWN_GCD for rate in (ESP_SAMPLE_RATE, BRIDGE_SAMPLE_RATE))  # (1, 3)
|
||||||
|
|
||||||
|
|
||||||
|
class BridgeClient:
    """TCP client for voice_bridge_v7.

    Frames are a 4-byte big-endian payload length, a 1-byte frame type and
    the payload. This client sends type 1 (JSON command) and type 3 (inject
    audio) and reacts to received type 0 (speaker audio) and type 1 (JSON
    event) frames.
    """

    def __init__(self, host, port):
        """Remember the bridge address; no connection is made yet."""
        self.host = host
        self.port = port
        self.reader = None
        self.writer = None
        self.on_speaker_frame = None  # async callback(pcm_bytes)
        self._recv_task = None

    async def connect(self):
        """Open the TCP connection and read the bridge's first frame."""
        self.reader, self.writer = await asyncio.open_connection(self.host, self.port)
        log.info(f"Connected to voice_bridge {self.host}:{self.port}")

        # Read connected event
        ftype, data = await self._recv_frame()
        if ftype == 1:
            msg = json.loads(data.decode())
            log.info(f"Bridge: {msg.get('protocol')}")

    async def _recv_frame(self):
        """Read one frame and return ``(frame_type, payload)``.

        Raises ``asyncio.IncompleteReadError`` if the stream ends mid-frame.
        """
        header = await self.reader.readexactly(5)
        length = struct.unpack(">I", header[:4])[0]
        ftype = header[4]
        data = await self.reader.readexactly(length)
        return ftype, data

    def _send_frame(self, ftype, data):
        """Queue one frame of type *ftype* carrying *data* for sending."""
        header = struct.pack(">IB", len(data), ftype)
        self.writer.write(header + data)
        # Note: no await drain() here — voice frames are time-sensitive,
        # TCP buffer handles backpressure

    def send_cmd(self, cmd):
        """Send *cmd* (a JSON-serialisable object) as a type-1 frame."""
        self._send_frame(1, json.dumps(cmd).encode())

    def send_inject(self, pcm_bytes):
        """Send raw PCM bytes as a type-3 (inject) frame."""
        self._send_frame(3, pcm_bytes)

    async def start_recv_loop(self):
        """Background task: receive frames from the bridge until it closes.

        Speaker frames are passed to ``on_speaker_frame`` when it is set;
        JSON events are logged. Errors end the loop and are logged.
        """
        try:
            while True:
                ftype, data = await self._recv_frame()
                if ftype == 0 and self.on_speaker_frame:
                    # Speaker audio
                    await self.on_speaker_frame(data)
                elif ftype == 1:
                    msg = json.loads(data.decode())
                    log.info(f"Bridge event: {msg}")
        except asyncio.IncompleteReadError:
            log.warning("Bridge connection closed")
        except Exception as e:
            log.error(f"Bridge recv error: {e}")

    async def setup_voice(self):
        """Start capture and enable inject. Voice chat must already be open.

        One reply frame is read after each command and logged if it is JSON.
        """
        for command in ("start", "inject_on"):
            self.send_cmd({"cmd": command})
            ftype, data = await self._recv_frame()
            if ftype == 1:
                msg = json.loads(data.decode())
                log.info(f"Bridge: {msg}")
        log.info("Voice bridge ready (inject mode)")

    async def close(self):
        """Turn injection off, stop capture and close the connection.

        Safe to call when never connected and safe to call more than once.
        Failures while shutting down are logged, not raised.
        """
        writer = self.writer
        if writer is None:
            return
        try:
            if not writer.is_closing():
                self.send_cmd({"cmd": "inject_off"})
                self.send_cmd({"cmd": "stop"})
        except Exception as e:
            # Best effort: the peer may already have gone away.
            log.debug(f"Bridge shutdown commands failed: {e}")
        finally:
            self.writer = None

        writer.close()
        try:
            # Bounded wait so an unresponsive peer cannot stall session teardown.
            await asyncio.wait_for(writer.wait_closed(), timeout=2)
        except (OSError, asyncio.TimeoutError) as e:
            log.debug(f"Bridge close did not complete cleanly: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
class Relay:
    """Main relay: ESP32 WebSocket ↔ Antaf voice_bridge TCP.

    Opus audio from the ESP32 is decoded, resampled and injected into the
    bridge; speaker PCM from the bridge is resampled, Opus-encoded and sent
    back. Per-session state lives on the instance, so one instance serves
    one ESP32 connection at a time.
    """

    SESSION_ID = "relay-session"
    # Peak absolute sample value below which a speaker frame counts as silent.
    SILENCE_AMPLITUDE = 10
    # Consecutive silent speaker frames tolerated before "tts stop" is sent
    # (same threshold as the passthrough ASR provider in this commit).
    SILENCE_FRAMES_BEFORE_STOP = 50

    def __init__(self, ws_port, bridge_host, bridge_port):
        """Store the listening port and bridge address; nothing is opened."""
        self.ws_port = ws_port
        self.bridge_host = bridge_host
        self.bridge_port = bridge_port
        self.bridge = None
        self.ws = None
        self.opus_decoder = None
        self.opus_encoder = None
        self._reset_session_state()

    def _reset_session_state(self):
        """Clear buffers, counters and TTS flags so sessions do not leak state."""
        # Buffer for resampled PCM to split into bridge frames
        self._inject_buf = np.array([], dtype=np.int16)
        # Buffer for speaker PCM to accumulate before encoding
        self._speaker_buf = np.array([], dtype=np.int16)
        self._audio_in_count = 0
        self._audio_out_count = 0
        self._tts_started = False  # track if we sent tts start to ESP32
        self._silence_count = 0

    @staticmethod
    def _resample_int16(samples, up, down):
        """Resample *samples* by ``up/down`` and return saturated int16.

        Values are clipped to the int16 range before the cast so filter
        overshoot saturates instead of wrapping around.
        """
        resampled = resample_poly(samples, up, down)
        return np.clip(resampled, -32768, 32767).astype(np.int16)

    async def handle_esp32(self, websocket):
        """Handle one ESP32 WebSocket connection from connect to disconnect.

        A failure to reach the bridge still propagates to the caller, but
        the session is cleaned up first.
        """
        log.info(f"ESP32 connected from {websocket.remote_address}")
        self.ws = websocket
        # Without this a second session would inherit _tts_started=True and
        # never be sent "tts start".
        self._reset_session_state()

        # Init Opus codec
        self.opus_decoder = opuslib.Decoder(ESP_SAMPLE_RATE, 1)
        self.opus_encoder = opuslib.Encoder(ESP_SAMPLE_RATE, 1, opuslib.APPLICATION_AUDIO)

        self.bridge = BridgeClient(self.bridge_host, self.bridge_port)
        recv_task = None
        try:
            # Connect to voice bridge and setup voice chat first
            await self.bridge.connect()
            await self.bridge.setup_voice()

            # Now start receiving speaker audio
            self.bridge.on_speaker_frame = self._on_speaker_frame
            recv_task = asyncio.create_task(self.bridge.start_recv_loop())

            async for message in websocket:
                if isinstance(message, str):
                    # Text message from ESP32 (hello, listen, etc.)
                    await self._handle_text(message)
                elif isinstance(message, bytes):
                    # Opus audio from ESP32
                    await self._handle_audio(message)
        except websockets.exceptions.ConnectionClosed:
            log.info("ESP32 disconnected")
        finally:
            if recv_task is not None:
                recv_task.cancel()
            await self.bridge.close()
            self.ws = None
            log.info("Session ended")

    async def _handle_text(self, message):
        """Handle a JSON text message from the ESP32.

        ``hello`` is answered with the relay's own hello; ``listen`` and
        ``abort`` are only logged. Invalid JSON is logged and ignored.
        """
        try:
            msg = json.loads(message)
            msg_type = msg.get("type")

            if msg_type == "hello":
                # Respond with proper hello — must match xiaozhi protocol
                resp = {
                    "type": "hello",
                    "version": 1,
                    "transport": "websocket",
                    "session_id": self.SESSION_ID,
                    "audio_params": {
                        "format": "opus",
                        "sample_rate": ESP_SAMPLE_RATE,
                        "channels": 1,
                        "frame_duration": ESP_FRAME_MS,
                    },
                }
                await self.ws.send(json.dumps(resp))
                log.info(f"ESP32 hello: {msg.get('audio_params')}")

            elif msg_type == "listen":
                state = msg.get("state")
                log.info(f"ESP32 listen: {state}")
                if state == "detect":
                    text = msg.get("text", "")
                    log.info(f"Wake word: {text}")
                    # Don't send tts start — let ESP32 continue recording

            elif msg_type == "abort":
                log.info("ESP32 abort")

        except json.JSONDecodeError:
            log.warning(f"Invalid JSON from ESP32: {message[:100]}")

    async def _handle_audio(self, opus_data):
        """Decode Opus from ESP32, resample, inject into voice_bridge."""
        try:
            self._audio_in_count += 1
            if self._audio_in_count <= 3 or self._audio_in_count % 100 == 0:
                log.info(f"ESP32 audio frame #{self._audio_in_count}, size={len(opus_data)}")

            # Decode Opus → PCM at ESP_SAMPLE_RATE, mono
            pcm = self.opus_decoder.decode(opus_data, ESP_FRAME_SIZE)
            samples = np.frombuffer(pcm, dtype=np.int16)

            # Resample ESP_SAMPLE_RATE → BRIDGE_SAMPLE_RATE
            upsampled = self._resample_int16(samples, UP_RATIO[0], UP_RATIO[1])

            # Append to inject buffer and send in bridge frame sizes
            self._inject_buf = np.concatenate([self._inject_buf, upsampled])
            while len(self._inject_buf) >= BRIDGE_FRAME_SAMPLES:
                frame = self._inject_buf[:BRIDGE_FRAME_SAMPLES]
                self._inject_buf = self._inject_buf[BRIDGE_FRAME_SAMPLES:]
                self.bridge.send_inject(frame.tobytes())

        except Exception as e:
            log.error(f"Audio inject error: {e}")

    async def _on_speaker_frame(self, pcm_bytes):
        """Receive speaker PCM from bridge, resample, encode Opus, send to ESP32.

        Silent frames are not forwarded; after more than
        ``SILENCE_FRAMES_BEFORE_STOP`` of them in a row a "tts stop" message
        is sent. The first audible frame triggers "tts start".
        """
        if not self.ws or getattr(self.ws, 'closed', False):
            return
        try:
            self._audio_out_count += 1

            # Drop a trailing odd byte so a malformed frame cannot raise.
            usable = len(pcm_bytes) - (len(pcm_bytes) % 2)
            samples = np.frombuffer(pcm_bytes[:usable], dtype=np.int16)
            if samples.size == 0:
                return
            # Widen first: abs(int16(-32768)) overflows in int16.
            max_amp = int(np.abs(samples.astype(np.int32)).max())

            if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
                log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")

            # Only send non-silent frames to ESP32
            if max_amp < self.SILENCE_AMPLITUDE:
                # Short gaps are normal; stop only after a sustained run.
                if self._tts_started:
                    self._silence_count += 1
                    if self._silence_count > self.SILENCE_FRAMES_BEFORE_STOP:
                        await self.ws.send(json.dumps({
                            "type": "tts", "state": "stop",
                            "session_id": self.SESSION_ID
                        }))
                        self._tts_started = False
                        self._silence_count = 0
                        log.info("Sent tts stop to ESP32")
                return

            self._silence_count = 0

            # Send tts start before first audio frame
            if not self._tts_started:
                await self.ws.send(json.dumps({
                    "type": "tts", "state": "start",
                    "session_id": self.SESSION_ID
                }))
                self._tts_started = True
                log.info("Sent tts start to ESP32")

            # Resample BRIDGE_SAMPLE_RATE → ESP_SAMPLE_RATE
            downsampled = self._resample_int16(samples, DOWN_RATIO[0], DOWN_RATIO[1])

            # Accumulate into speaker buffer, encode when we have enough
            self._speaker_buf = np.concatenate([self._speaker_buf, downsampled])
            while len(self._speaker_buf) >= ESP_FRAME_SIZE:
                frame = self._speaker_buf[:ESP_FRAME_SIZE]
                self._speaker_buf = self._speaker_buf[ESP_FRAME_SIZE:]
                # Encode PCM → Opus
                opus_data = self.opus_encoder.encode(frame.tobytes(), ESP_FRAME_SIZE)
                await self.ws.send(opus_data)

        except Exception as e:
            log.error(f"Speaker send error: {e}")

    async def run(self):
        """Serve ESP32 WebSocket connections on all interfaces, forever."""
        log.info(f"Relay starting on ws://0.0.0.0:{self.ws_port}/xiaozhi/v1/")
        async with websockets.serve(
            self.handle_esp32, "0.0.0.0", self.ws_port,
            ping_interval=30, ping_timeout=10,
        ):
            await asyncio.Future()  # run forever
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse the command-line options and run the relay until interrupted."""
    cli = argparse.ArgumentParser(description="ESP32-Antaf Voice Relay")
    cli.add_argument(
        "--ws-port", type=int, default=8010, help="WebSocket port for ESP32"
    )
    cli.add_argument(
        "--bridge-host", default="127.0.0.1", help="voice_bridge host"
    )
    cli.add_argument(
        "--bridge-port", type=int, default=18901, help="voice_bridge port"
    )
    options = cli.parse_args()

    asyncio.run(
        Relay(options.ws_port, options.bridge_host, options.bridge_port).run()
    )


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -0,0 +1,130 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Test voice_bridge_v7 audio injection.
|
||||||
|
Connect to voice_bridge, open voice chat, enable inject mode,
|
||||||
|
send silence frames, and print any speaker output received.
|
||||||
|
|
||||||
|
Usage: python test_inject.py [host] [port]
|
||||||
|
"""
|
||||||
|
import socket
|
||||||
|
import struct
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
|
||||||
|
HOST = sys.argv[1] if len(sys.argv) > 1 else "127.0.0.1"
|
||||||
|
PORT = int(sys.argv[2]) if len(sys.argv) > 2 else 18901
|
||||||
|
|
||||||
|
FRAME_SIZE = 960 # 960 bytes per frame (480 samples * 16bit)
|
||||||
|
|
||||||
|
|
||||||
|
def send_cmd(sock, cmd):
    """Send *cmd* as one type-1 (text/JSON) frame on *sock*.

    The frame is a 4-byte big-endian payload length, the type byte 1 and
    the UTF-8 encoded JSON payload.
    """
    payload = json.dumps(cmd).encode("utf-8")
    frame = b"".join((struct.pack(">IB", len(payload), 1), payload))
    sock.sendall(frame)
|
||||||
|
|
||||||
|
|
||||||
|
def send_inject(sock, pcm_frame):
    """Send *pcm_frame* (raw bytes) as one type-3 (inject) frame on *sock*."""
    prefix = struct.pack(">IB", len(pcm_frame), 3)
    sock.sendall(b"".join((prefix, pcm_frame)))
|
||||||
|
|
||||||
|
|
||||||
|
def recv_exact(sock, n):
    """Read exactly *n* bytes from *sock*.

    Returns the bytes read, or ``None`` if the peer closes the connection
    before *n* bytes have arrived. ``n <= 0`` returns ``b""`` without reading.
    """
    pieces = []
    remaining = n
    while remaining > 0:
        piece = sock.recv(remaining)
        if not piece:
            return None
        pieces.append(piece)
        remaining -= len(piece)
    return b"".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def recv_frame(sock):
    """Read one frame from *sock* and return ``(frame_type, payload)``.

    Returns ``(None, None)`` when the connection closes before a full frame
    arrives, or when the declared payload length exceeds 1 MiB.
    """
    head = recv_exact(sock, 5)
    if head is None:
        return None, None

    (size,) = struct.unpack(">I", head[:4])
    kind = head[4]
    if size > 1048576:
        return None, None

    payload = recv_exact(sock, size)
    return (None, None) if payload is None else (kind, payload)
|
||||||
|
|
||||||
|
|
||||||
|
def receiver(sock):
    """Background thread: print frames received from *sock* until it closes.

    Type-1 frames are decoded as JSON and printed. Type-0 (speaker) frames
    are counted and their peak amplitude is printed for the first five
    frames, every hundredth frame, and any frame louder than 500. Type-2
    (mic) frames are ignored. Any exception ends the loop.
    """
    spk_count = 0
    while True:
        try:
            ftype, data = recv_frame(sock)
            if ftype is None:
                print("[RECV] Connection closed")
                break
            if ftype == 1:  # text/json
                msg = json.loads(data.decode("utf-8"))
                print(f"[RECV] {msg}")
            elif ftype == 0:  # speaker audio
                spk_count += 1
                # Interpret the payload as little-endian 16-bit samples.
                # unpack_from ignores a trailing odd byte, and default=0
                # covers an empty frame, so neither ends the receiver.
                n_samples = len(data) // 2
                samples = struct.unpack_from(f"<{n_samples}h", data)
                max_amp = max((abs(s) for s in samples), default=0)
                if spk_count <= 5 or spk_count % 100 == 0 or max_amp > 500:
                    print(f"[SPK] frame={spk_count} size={len(data)} max_amp={max_amp}")
            elif ftype == 2:  # mic audio
                pass  # ignore mic echo
        except Exception as e:
            print(f"[RECV] Error: {e}")
            break
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the injection smoke test against the bridge at ``HOST:PORT``.

    Opens voice chat, starts capture, enables inject mode, sends 150 frames
    of silence and then waits for speaker output printed by the receiver
    thread. Inject mode is switched off and the socket closed even if the
    test fails or is interrupted partway through.
    """
    print(f"Connecting to {HOST}:{PORT}...")
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.connect((HOST, PORT))
        print("Connected")

        # Start receiver thread
        t = threading.Thread(target=receiver, args=(sock,), daemon=True)
        t.start()

        time.sleep(1)

        try:
            # Open voice chat
            print("Opening voice chat...")
            send_cmd(sock, {"cmd": "open_voice"})
            time.sleep(3)

            # Start capture
            print("Starting capture...")
            send_cmd(sock, {"cmd": "start"})
            time.sleep(1)

            # Enable inject mode
            print("Enabling inject mode...")
            send_cmd(sock, {"cmd": "inject_on"})
            time.sleep(0.5)

            # Send 150 silence frames paced 20 ms apart (3 seconds of sending).
            # NOTE(review): 960 bytes of 16-bit mono at 48 kHz would be 10 ms
            # of audio, not 20 ms — confirm the bridge's actual frame format.
            print("Sending 150 silence frames (3 seconds)...")
            silence = b"\x00" * FRAME_SIZE
            for _ in range(150):
                send_inject(sock, silence)
                time.sleep(0.02)  # 20ms per frame

            print("Done sending. Waiting for speaker output...")
            time.sleep(10)
        finally:
            # Stop: always try to leave the bridge out of inject mode.
            try:
                send_cmd(sock, {"cmd": "inject_off"})
                send_cmd(sock, {"cmd": "stop"})
                time.sleep(1)
            except OSError as e:
                print(f"Could not send shutdown commands: {e}")

        print("Test complete")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -0,0 +1,209 @@
|
||||||
|
// voice_bridge_v7.js — Voice Bridge with Audio Injection
|
||||||
|
// Hook point: libantaudio.so MFAntAudio3AV2Filter::process(micIn, spkRef, out, size, &result)
|
||||||
|
// TCP :18901
|
||||||
|
// Frame: 4-byte len + 1-byte type + payload
|
||||||
|
// type 0: speaker/AI audio (spkRef, downstream to client)
|
||||||
|
// type 1: text/JSON command
|
||||||
|
// type 2: mic audio (micIn, downstream to client)
|
||||||
|
// type 3: inject audio (upstream from client, replaces micIn)
|
||||||
|
|
||||||
|
var voiceActive = false;
|
||||||
|
var clientOS = null;
|
||||||
|
var capturedSpk = 0, capturedMic = 0, spkBytes = 0, micBytes = 0;
|
||||||
|
var injectMode = false; // true = replace mic with injected audio
|
||||||
|
var injectQueue = []; // queue of PCM frames to inject
|
||||||
|
|
||||||
|
// Write one frame to the client stream: 4-byte big-endian payload length,
// 1-byte frame type, then the payload (a Java byte[]). Failures are ignored
// on purpose: the stream may already be closed by the time a frame is sent.
function wf(os, type, jArr) {
    try {
        var n = jArr.length;
        var header = Java.array("byte", [
            (n >> 24) & 0xFF,
            (n >> 16) & 0xFF,
            (n >> 8) & 0xFF,
            n & 0xFF,
            type
        ]);
        os.write(header);
        os.write(jArr);
        os.flush();
    } catch (err) {
        // best-effort write; nothing to do if the client is gone
    }
}
|
||||||
|
// Send a text frame (type 1): the string is encoded as UTF-8 through
// java.lang.String and handed to wf().
function wt(os, text) {
    var JString = Java.use("java.lang.String");
    var utf8 = JString.$new(text).getBytes("UTF-8");
    wf(os, 1, utf8);
}
|
||||||
|
|
||||||
|
// === Hook libantaudio.so ===
|
||||||
|
var hooked = false;
|
||||||
|
// Install the hook on MFAntAudio3AV2Filter::process once libantaudio.so is
// loaded. Safe to call repeatedly: it returns immediately when the hook is
// already installed or the module/export is not available yet.
//
// Per the signature in the file header, process(micIn, spkRef, out, size,
// &result) is a member function, so args[0] is `this`, args[1] is micIn,
// args[2] is spkRef and args[4] is the frame size in bytes.
function tryHook() {
    if (hooked) return;
    var m = Process.findModuleByName("libantaudio.so");
    if (!m) return;
    var addr = m.findExportByName("_ZN8antaudio20MFAntAudio3AV2Filter7processEPhS1_S1_iRi");
    if (!addr) return;
    hooked = true;

    // Failures inside the hook are counted and logged at a limited rate
    // instead of being dropped silently.
    var hookErrors = 0;

    Interceptor.attach(addr, {
        onEnter: function(args) {
            // Snapshot the stream: the server thread may null clientOS at any time.
            var out = clientOS;
            if (!voiceActive || !out) return;
            var size = args[4].toInt32();
            if (size <= 0) return;

            try {
                // In inject mode micIn is always overwritten, so the real
                // microphone never reaches the app.
                if (injectMode) {
                    var frame = injectQueue.length > 0 ? injectQueue.shift() : null;
                    if (frame !== null && frame.byteLength === size) {
                        args[1].writeByteArray(frame);
                    } else {
                        // Nothing queued, an empty frame, or a size mismatch:
                        // start from silence (ArrayBuffer is zero-filled) and
                        // copy as much of the frame as fits (pad or truncate).
                        var buf = new ArrayBuffer(size);
                        if (frame !== null && frame.byteLength > 0) {
                            var copyLen = Math.min(size, frame.byteLength);
                            new Uint8Array(buf).set(new Uint8Array(frame, 0, copyLen));
                        }
                        args[1].writeByteArray(buf);
                    }
                }

                // Always capture speaker/AI output (type 0)
                var spkPcm = args[2].readByteArray(size);
                var spkArr = Java.array("byte", Array.from(new Uint8Array(spkPcm)));
                wf(out, 0, spkArr);
                capturedSpk++; spkBytes += size;

                // Capture mic (type 2) only when not injecting
                if (!injectMode) {
                    var micPcm = args[1].readByteArray(size);
                    var micArr = Java.array("byte", Array.from(new Uint8Array(micPcm)));
                    wf(out, 2, micArr);
                }
                // Counts every processed frame, including injected ones.
                capturedMic++; micBytes += size;

                if (capturedMic <= 3 || capturedMic % 500 === 0)
                    console.log("[VOICE] mic=" + capturedMic + " spk=" + capturedSpk + " inject=" + injectQueue.length);
            } catch(e) {
                hookErrors++;
                if (hookErrors <= 3 || hookErrors % 500 === 0)
                    console.log("[VOICE] hook error #" + hookErrors + ": " + e);
            }
        }
    });
    console.log("[VOICE] 3AV2Filter.process hooked @ " + addr);
}
|
||||||
|
|
||||||
|
// Try to install the hook on a fixed schedule after script load.
[0, 1000, 3000, 5000, 10000, 15000, 20000].forEach(function(ms) { setTimeout(tryHook, ms); });

// Also retry shortly after every linker dlopen* call, in case libantaudio.so
// is loaded later than the fixed schedule covers.
try {
    new ApiResolver("module").enumerateMatches("exports:linker*!*dlopen*").forEach(function(d) {
        Interceptor.attach(d.address, {
            onLeave: function() {
                // Once hooked there is nothing left to do; avoid arming a
                // timer on every dlopen for the rest of the process lifetime.
                if (!hooked) setTimeout(tryHook, 500);
            }
        });
    });
} catch(e) {
    // The timed attempts above still run; report why the dlopen watch failed.
    console.log("[VOICE] dlopen watch unavailable: " + e);
}
|
||||||
|
|
||||||
|
// === TCP Server ===
|
||||||
|
// TCP server on :18901. Accepts one client at a time, reads frames
// (4-byte big-endian length + 1-byte type + payload) and dispatches them:
// type 3 payloads are queued for injection, type 1 payloads are JSON commands.
Java.perform(function() {
    var SS = Java.use("java.net.ServerSocket");
    var JS = Java.use("java.lang.String");
    var server = SS.$new(18901);
    console.log("[VOICE] Listening :18901");

    var MAX_FRAME = 1048576; // upper bound on a single frame payload, in bytes

    // Show the voice chat fragment on the first live IJKActivity and report
    // the outcome to the client.
    function openVoice(os) {
        Java.scheduleOnMainThread(function() {
            try {
                var opened = false;
                Java.choose("com.antgroup.aijk.android.ijklauncher.biz.activity.IJKActivity", {
                    onMatch: function(a) {
                        var fm = a.getSupportFragmentManager();
                        var f = Java.use("com.antgroup.aijk.android.ijkchat.biz.voicechat.IjkVoiceChatFragment").$new();
                        f.show(fm, "v");
                        opened = true;
                        console.log("[VOICE] Opened");
                        // One fragment is enough; do not open one per activity instance.
                        return "stop";
                    }, onComplete: function() {}
                });
                if (!opened) {
                    // Previously "voice_opened" was sent even when no activity existed.
                    wt(os, JSON.stringify({event:"error",msg:"IJKActivity instance not found"}));
                    return;
                }
                setTimeout(function() { wt(os, JSON.stringify({event:"voice_opened"})); }, 2000);
            } catch(e) { wt(os, JSON.stringify({event:"error",msg:""+e})); }
        });
    }

    // Dismiss every live voice chat fragment and notify the client.
    function closeVoice(os) {
        Java.scheduleOnMainThread(function() {
            try {
                Java.choose("com.antgroup.aijk.android.ijkchat.biz.voicechat.IjkVoiceChatFragment", {
                    onMatch: function(f) { f.dismiss(); console.log("[VOICE] Closed"); },
                    onComplete: function() {}
                });
                setTimeout(function() { wt(os, JSON.stringify({event:"voice_closed"})); }, 1000);
            } catch(e) { wt(os, JSON.stringify({event:"error",msg:""+e})); }
        });
    }

    // Read exactly `count` bytes (values 0..255) from the stream; throws "EOF"
    // when the peer closes the connection.
    function readBytes(is, count) {
        var bytes = [];
        for (var i = 0; i < count; i++) {
            var b = is.read();
            if (b < 0) throw "EOF";
            bytes.push(b & 0xFF);
        }
        return bytes;
    }

    // Apply one parsed JSON command and answer on `os`.
    function handleCommand(os, cmd) {
        if (cmd.cmd === "open_voice") openVoice(os);
        else if (cmd.cmd === "close_voice") closeVoice(os);
        else if (cmd.cmd === "start") {
            voiceActive = true;
            capturedSpk=0;capturedMic=0;spkBytes=0;micBytes=0;
            injectQueue = [];
            wt(os, JSON.stringify({event:"started",hooked:hooked}));
        }
        else if (cmd.cmd === "stop") {
            voiceActive = false;
            injectMode = false;
            injectQueue = [];
            wt(os, JSON.stringify({event:"stopped",spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
        }
        else if (cmd.cmd === "inject_on") {
            injectMode = true;
            injectQueue = [];
            wt(os, JSON.stringify({event:"inject_on"}));
            console.log("[VOICE] Inject mode ON");
        }
        else if (cmd.cmd === "inject_off") {
            injectMode = false;
            injectQueue = [];
            wt(os, JSON.stringify({event:"inject_off"}));
            console.log("[VOICE] Inject mode OFF");
        }
        else if (cmd.cmd === "status") {
            wt(os, JSON.stringify({event:"status",active:voiceActive,hooked:hooked,inject:injectMode,queue:injectQueue.length,spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
        }
    }

    var Srv = Java.registerClass({
        name: "com.antaf.voice.S7",
        implements: [Java.use("java.lang.Runnable")],
        methods: {
            run: function() {
                // Stop looping once the listening socket is closed.
                while (!server.isClosed()) {
                    var c = null;
                    try {
                        console.log("[VOICE] Waiting...");
                        c = server.accept();
                        var is = c.getInputStream();
                        var os = c.getOutputStream();
                        clientOS = os;
                        console.log("[VOICE] Connected");
                        wt(os, JSON.stringify({
                            event:"connected", protocol:"antaf-voice-v8",
                            commands:["open_voice","close_voice","start","stop","status","inject_on","inject_off"],
                            audio:"pcm-16bit-960b-frames",
                            frameTypes:{0:"spk_ai",1:"text",2:"mic",3:"inject"}
                        }));

                        while (true) {
                            var hb = readBytes(is, 5);
                            var fl=(hb[0]<<24)|(hb[1]<<16)|(hb[2]<<8)|hb[3], ft=hb[4];
                            // A header with the top bit set yields a negative
                            // length; treat it like an oversize frame and drop
                            // the client, since the stream is out of sync.
                            if (fl < 0 || fl > MAX_FRAME) {
                                console.log("[VOICE] Bad frame length " + fl + ", dropping client");
                                break;
                            }
                            var pb = readBytes(is, fl);

                            if (ft === 3) {
                                // type 3: inject audio frame into micIn
                                // NOTE(review): the queue is unbounded; a client
                                // that outpaces process() grows it without limit.
                                var arr = new ArrayBuffer(pb.length);
                                new Uint8Array(arr).set(pb);
                                injectQueue.push(arr);
                            }
                            else if (ft === 1) {
                                var cmd = null;
                                try {
                                    var pl = Java.array("byte", pb);
                                    cmd = JSON.parse(JS.$new(pl,"UTF-8").toString());
                                } catch (parseErr) {
                                    // One malformed command should not end the session.
                                    wt(os, JSON.stringify({event:"error",msg:"bad command: "+parseErr}));
                                }
                                if (cmd !== null && typeof cmd === "object") {
                                    console.log("[VOICE] Cmd: " + JSON.stringify(cmd));
                                    handleCommand(os, cmd);
                                }
                            }
                        }
                    } catch(e) { console.log("[VOICE] Ended: "+e); }
                    finally {
                        voiceActive=false; clientOS=null; injectMode=false; injectQueue=[];
                        // Close the client socket so the peer sees the
                        // disconnect and the descriptor is released.
                        if (c !== null) { try { c.close(); } catch (closeErr) {} }
                    }
                }
            }
        }
    });
    Java.use("java.lang.Thread").$new(Srv.$new()).start();
    console.log("[VOICE] Ready (v7 + inject)");
});
|
||||||
|
|
@ -0,0 +1,181 @@
|
||||||
|
// voice_bridge_v8.js — Voice Bridge with Audio Injection (attach after voice chat opened)
|
||||||
|
//
|
||||||
|
// STARTUP ORDER:
|
||||||
|
// 1. Launch app: adb shell monkey -p com.antgroup.aijk.android ...
|
||||||
|
// 2. Open voice chat manually or via adb tap
|
||||||
|
// 3. Wait for libantaudio.so to load
|
||||||
|
// 4. Attach frida with this script
|
||||||
|
//
|
||||||
|
// Hook point: libantaudio.so MFAntAudio3AV2Filter::process(micIn, spkRef, out, size, &result)
|
||||||
|
// TCP :18901
|
||||||
|
// Frame: 4-byte len + 1-byte type + payload
|
||||||
|
// type 0: speaker/AI audio (spkRef, downstream to client)
|
||||||
|
// type 1: text/JSON command
|
||||||
|
// type 2: mic audio (micIn, downstream to client)
|
||||||
|
// type 3: inject audio (upstream from client, replaces micIn)
|
||||||
|
|
||||||
|
var voiceActive = false;
|
||||||
|
var clientOS = null;
|
||||||
|
var capturedSpk = 0, capturedMic = 0, spkBytes = 0, micBytes = 0;
|
||||||
|
var injectMode = false;
|
||||||
|
var injectQueue = [];
|
||||||
|
|
||||||
|
// Write one frame to the client stream: 4-byte big-endian payload length,
// 1-byte frame type, then the payload (a Java byte[]). Failures are ignored
// on purpose: the stream may already be closed by the time a frame is sent.
function wf(os, type, jArr) {
    try {
        var n = jArr.length;
        var header = Java.array("byte", [
            (n >> 24) & 0xFF,
            (n >> 16) & 0xFF,
            (n >> 8) & 0xFF,
            n & 0xFF,
            type
        ]);
        os.write(header);
        os.write(jArr);
        os.flush();
    } catch (err) {
        // best-effort write; nothing to do if the client is gone
    }
}
|
||||||
|
// Send a text frame (type 1): the string is encoded as UTF-8 through
// java.lang.String and handed to wf().
function wt(os, text) {
    var JString = Java.use("java.lang.String");
    var utf8 = JString.$new(text).getBytes("UTF-8");
    wf(os, 1, utf8);
}
|
||||||
|
|
||||||
|
// === Hook libantaudio.so (should already be loaded) ===
|
||||||
|
var hooked = false;
|
||||||
|
// Install the hook on MFAntAudio3AV2Filter::process. Safe to call repeatedly:
// it returns immediately when the hook is already installed or the
// module/export cannot be found.
//
// Per the signature in the file header, process(micIn, spkRef, out, size,
// &result) is a member function, so args[0] is `this`, args[1] is micIn,
// args[2] is spkRef and args[4] is the frame size in bytes.
function tryHook() {
    if (hooked) return;
    var m = Process.findModuleByName("libantaudio.so");
    if (!m) return;
    var addr = m.findExportByName("_ZN8antaudio20MFAntAudio3AV2Filter7processEPhS1_S1_iRi");
    if (!addr) return;
    hooked = true;

    // Failures inside the hook are counted and logged at a limited rate
    // instead of being dropped silently.
    var hookErrors = 0;

    Interceptor.attach(addr, {
        onEnter: function(args) {
            // Snapshot the stream: the server thread may null clientOS at any time.
            var out = clientOS;
            if (!voiceActive || !out) return;
            var size = args[4].toInt32();
            if (size <= 0) return;

            try {
                // In inject mode micIn is always overwritten, so the real
                // microphone never reaches the app.
                if (injectMode) {
                    var frame = injectQueue.length > 0 ? injectQueue.shift() : null;
                    if (frame !== null && frame.byteLength === size) {
                        args[1].writeByteArray(frame);
                    } else {
                        // Nothing queued or a size mismatch: start from silence
                        // (ArrayBuffer is zero-filled) and copy as much of the
                        // frame as fits (pad or truncate).
                        var buf = new ArrayBuffer(size);
                        if (frame !== null && frame.byteLength > 0) {
                            var copyLen = Math.min(size, frame.byteLength);
                            new Uint8Array(buf).set(new Uint8Array(frame, 0, copyLen));
                        }
                        args[1].writeByteArray(buf);
                    }
                }

                // Always capture speaker/AI output (type 0)
                var spkPcm = args[2].readByteArray(size);
                var spkArr = Java.array("byte", Array.from(new Uint8Array(spkPcm)));
                wf(out, 0, spkArr);
                capturedSpk++; spkBytes += size;

                // Capture mic (type 2) only when not injecting
                if (!injectMode) {
                    var micPcm = args[1].readByteArray(size);
                    var micArr = Java.array("byte", Array.from(new Uint8Array(micPcm)));
                    wf(out, 2, micArr);
                }
                // Counts every processed frame, including injected ones.
                capturedMic++; micBytes += size;

                if (capturedMic <= 3 || capturedMic % 500 === 0)
                    console.log("[VOICE] mic=" + capturedMic + " spk=" + capturedSpk + " inject=" + injectQueue.length);
            } catch(e) {
                hookErrors++;
                if (hookErrors <= 3 || hookErrors % 500 === 0)
                    console.log("[VOICE] hook error #" + hookErrors + ": " + e);
            }
        }
    });
    console.log("[VOICE] process hooked @ " + addr);
}
|
||||||
|
|
||||||
|
// Hook immediately — lib should already be loaded since voice chat is open
|
||||||
|
tryHook();
if (!hooked) {
    // First attempt did not install the hook: re-attempt on a short schedule.
    var retryDelaysMs = [500, 1000, 2000, 5000];
    for (var r = 0; r < retryDelaysMs.length; r++) {
        setTimeout(tryHook, retryDelaysMs[r]);
    }
}
|
||||||
|
|
||||||
|
// === TCP Server ===
|
||||||
|
// TCP server on :18901. Accepts one client at a time, reads frames
// (4-byte big-endian length + 1-byte type + payload) and dispatches them:
// type 3 payloads are queued for injection, type 1 payloads are JSON commands.
Java.perform(function() {
    var SS = Java.use("java.net.ServerSocket");
    var JS = Java.use("java.lang.String");
    var server = SS.$new(18901);
    console.log("[VOICE] Listening :18901");

    var MAX_FRAME = 1048576; // upper bound on a single frame payload, in bytes

    // Read exactly `count` bytes (values 0..255) from the stream; throws "EOF"
    // when the peer closes the connection.
    function readBytes(is, count) {
        var bytes = [];
        for (var i = 0; i < count; i++) {
            var b = is.read();
            if (b < 0) throw "EOF";
            bytes.push(b & 0xFF);
        }
        return bytes;
    }

    // Apply one parsed JSON command and answer on `os`.
    function handleCommand(os, cmd) {
        if (cmd.cmd === "start") {
            voiceActive = true;
            capturedSpk=0;capturedMic=0;spkBytes=0;micBytes=0;
            injectQueue = [];
            wt(os, JSON.stringify({event:"started",hooked:hooked}));
        }
        else if (cmd.cmd === "stop") {
            voiceActive = false;
            injectMode = false;
            injectQueue = [];
            wt(os, JSON.stringify({event:"stopped",spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
        }
        else if (cmd.cmd === "inject_on") {
            injectMode = true;
            injectQueue = [];
            wt(os, JSON.stringify({event:"inject_on"}));
            console.log("[VOICE] Inject ON");
        }
        else if (cmd.cmd === "inject_off") {
            injectMode = false;
            injectQueue = [];
            wt(os, JSON.stringify({event:"inject_off"}));
            console.log("[VOICE] Inject OFF");
        }
        else if (cmd.cmd === "status") {
            wt(os, JSON.stringify({event:"status",active:voiceActive,hooked:hooked,inject:injectMode,queue:injectQueue.length,spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
        }
    }

    var Srv = Java.registerClass({
        name: "com.antaf.voice.S8",
        implements: [Java.use("java.lang.Runnable")],
        methods: {
            run: function() {
                // Stop looping once the listening socket is closed.
                while (!server.isClosed()) {
                    var c = null;
                    try {
                        console.log("[VOICE] Waiting for client...");
                        c = server.accept();
                        var is = c.getInputStream();
                        var os = c.getOutputStream();
                        clientOS = os;
                        console.log("[VOICE] Client connected");
                        wt(os, JSON.stringify({
                            event:"connected", protocol:"antaf-voice-v8",
                            hooked: hooked,
                            commands:["start","stop","status","inject_on","inject_off"],
                            audio:"pcm-16bit-960b-frames",
                            frameTypes:{0:"spk_ai",1:"text",2:"mic",3:"inject"}
                        }));

                        while (true) {
                            var hb = readBytes(is, 5);
                            var fl=(hb[0]<<24)|(hb[1]<<16)|(hb[2]<<8)|hb[3], ft=hb[4];
                            // A header with the top bit set yields a negative
                            // length; treat it like an oversize frame and drop
                            // the client, since the stream is out of sync.
                            if (fl < 0 || fl > MAX_FRAME) {
                                console.log("[VOICE] Bad frame length " + fl + ", dropping client");
                                break;
                            }
                            var pb = readBytes(is, fl);

                            if (ft === 3) {
                                // type 3: inject audio frame into micIn
                                // NOTE(review): the queue is unbounded; a client
                                // that outpaces process() grows it without limit.
                                var arr = new ArrayBuffer(pb.length);
                                new Uint8Array(arr).set(pb);
                                injectQueue.push(arr);
                            }
                            else if (ft === 1) {
                                var cmd = null;
                                try {
                                    var pl = Java.array("byte", pb);
                                    cmd = JSON.parse(JS.$new(pl,"UTF-8").toString());
                                } catch (parseErr) {
                                    // One malformed command should not end the session.
                                    wt(os, JSON.stringify({event:"error",msg:"bad command: "+parseErr}));
                                }
                                if (cmd !== null && typeof cmd === "object") {
                                    console.log("[VOICE] Cmd: " + JSON.stringify(cmd));
                                    handleCommand(os, cmd);
                                }
                            }
                        }
                    } catch(e) { console.log("[VOICE] Client disconnected: "+e); }
                    finally {
                        voiceActive=false; clientOS=null; injectMode=false; injectQueue=[];
                        // Close the client socket so the peer sees the
                        // disconnect and the descriptor is released.
                        if (c !== null) { try { c.close(); } catch (closeErr) {} }
                    }
                }
            }
        }
    });
    Java.use("java.lang.Thread").$new(Srv.$new()).start();
    console.log("[VOICE] Ready (v8, no open_voice)");
});
|
||||||
|
|
@ -0,0 +1,335 @@
|
||||||
|
# 蚂蚁阿福接入小智 ESP32 — 实施方案
|
||||||
|
|
||||||
|
## 项目目标
|
||||||
|
|
||||||
|
将蚂蚁阿福 App 的 AI 能力接入小智 ESP32 硬件终端,用户通过 ESP32 设备语音对话,
|
||||||
|
后端对接蚂蚁阿福代替自建 LLM,省去 GPU 资源(两张 RTX 3090 + Qwen3-32B)。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 系统架构
|
||||||
|
|
||||||
|
### 方案A:文字接入(自定义 LLM Provider)
|
||||||
|
|
||||||
|
```
|
||||||
|
ESP32 设备 PlugAI 服务端 手机
|
||||||
|
┌──────────┐ WebSocket ┌──────────────────┐ HTTP/SSE ┌──────────────┐
|
||||||
|
│ 麦克风 │ ──────────────→│ ASR (FunASR) │ │ 蚂蚁阿福 App │
|
||||||
|
│ 唤醒词 │ │ 语音→文字 │ │ + Frida 注入 │
|
||||||
|
│ AEC/NS │ │ │ GET /chat?q= │ │
|
||||||
|
│ │ │ AntafLLM Provider│──────────────→│ HTTP Bridge │
|
||||||
|
│ │ │ (新增) │←──────────────│ (port 18900) │
|
||||||
|
│ │ │ │ SSE 流式回答 │ │
|
||||||
|
│ 喇叭 │←───────────────│ TTS (EdgeTTS) │ │ │
|
||||||
|
│ │ WebSocket │ 文字→语音 │ │ │
|
||||||
|
└──────────┘ └──────────────────┘ └──────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**数据流**: ESP32 音频 → FunASR(语音转文字) → AntafLLM(文字发给阿福) → EdgeTTS(回答转语音) → ESP32 播放
|
||||||
|
|
||||||
|
### 方案B:语音直通(替代整个 ASR+LLM+TTS)
|
||||||
|
|
||||||
|
```
|
||||||
|
ESP32 设备 PlugAI 服务端 手机
|
||||||
|
┌──────────┐ WebSocket ┌──────────────────┐ TCP 二进制 ┌──────────────┐
|
||||||
|
│ 麦克风 │ ──────────────→│ 音频转发模块(新增) │ │ 蚂蚁阿福 App │
|
||||||
|
│ │ │ Opus解码 │ PCM注入mic │ + Frida 注入 │
|
||||||
|
│ │ │ 重采样 24k→48k │──────────────→│ Voice Bridge │
|
||||||
|
│ │ │ │ PCM speaker │ (port 18901) │
|
||||||
|
│ 喇叭 │←───────────────│ 重采样 24k→24k │←──────────────│ libantaudio │
|
||||||
|
│ │ WebSocket │ Opus编码 │ │ │
|
||||||
|
└──────────┘ └──────────────────┘ └──────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**数据流**: ESP32 音频 → 解码+重采样 → 注入阿福麦克风 → 阿福完整处理(ASR+LLM+TTS) → 捕获音频 → 编码 → ESP32 播放
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 可行性评估
|
||||||
|
|
||||||
|
| 维度 | 方案A (文字接入) | 方案B (语音直通) |
|
||||||
|
|------|-----------------|-----------------|
|
||||||
|
| 可行性 | **高** | **中低** |
|
||||||
|
| 实现难度 | 低 (1个Python文件) | 高 (改JS+写转发模块) |
|
||||||
|
| 改动范围 | 新增 LLM Provider + 改配置 | 改 voice_bridge.js + 新增转发模块 |
|
||||||
|
| 延迟 | 中 (ASR+网络+TTS 各一轮) | 低 (音频直通) |
|
||||||
|
| 音质 | EdgeTTS (微软高质量) | 阿福原生 TTS |
|
||||||
|
| GPU 依赖 | 无 (省掉 Qwen3-32B) | 无 |
|
||||||
|
| 手机依赖 | 需要 (App+Frida+adb) | 需要 (App+Frida+adb) |
|
||||||
|
| 核心风险 | 低 | **voice_bridge 当前不支持音频注入** |
|
||||||
|
|
||||||
|
**结论**: 先实施方案A,验证通过后再做方案B。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 方案A 详细实施
|
||||||
|
|
||||||
|
### 前置条件
|
||||||
|
|
||||||
|
| 组件 | 状态 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| ESP32 设备 | 已就绪 | 固件已烧录,WiFi+服务端已配置 |
|
||||||
|
| 小智服务端 | 已就绪 | ws://14.18.247.51:8010 运行中 |
|
||||||
|
| ASR (FunASR) | 已就绪 | CPU 模式 |
|
||||||
|
| TTS (EdgeTTS) | 已就绪 | 微软免费 |
|
||||||
|
| 蚂蚁阿福 HTTP Bridge | 已就绪 | http_bridge_stream.js (port 18900) |
|
||||||
|
| Frida + 手机 | 需部署 | 手机需连到服务端可达的网络 |
|
||||||
|
|
||||||
|
### 第1步:创建 AntafLLM Provider
|
||||||
|
|
||||||
|
文件路径:`backend/main/xiaozhi-server/core/providers/llm/antaf/antaf.py`
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
from config.logger import setup_logging
|
||||||
|
from core.providers.llm.base import LLMProviderBase
|
||||||
|
|
||||||
|
TAG = __name__
|
||||||
|
logger = setup_logging()
|
||||||
|
|
||||||
|
|
||||||
|
class LLMProvider(LLMProviderBase):
|
||||||
|
"""
|
||||||
|
蚂蚁阿福 LLM Provider
|
||||||
|
通过 Frida HTTP Bridge (port 18900) 对接蚂蚁阿福 App 的文字对话 API。
|
||||||
|
Bridge 运行在手机上,通过 adb forward 或网络暴露 SSE 流式接口。
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config):
|
||||||
|
self.bridge_url = config.get("bridge_url", "http://127.0.0.1:18900")
|
||||||
|
self.timeout = config.get("timeout", 60)
|
||||||
|
logger.bind(tag=TAG).info(
|
||||||
|
f"AntafLLM 初始化: bridge={self.bridge_url}, timeout={self.timeout}s"
|
||||||
|
)
|
||||||
|
|
||||||
|
def response(self, session_id, dialogue, **kwargs):
|
||||||
|
"""
|
||||||
|
流式返回蚂蚁阿福的回答。
|
||||||
|
1. 从 dialogue 取最后一条用户消息
|
||||||
|
2. GET {bridge_url}/chat?q={query}
|
||||||
|
3. 解析 SSE 流,yield 每个 delta 文本
|
||||||
|
"""
|
||||||
|
# 提取最后一条用户消息
|
||||||
|
query = ""
|
||||||
|
for msg in reversed(dialogue):
|
||||||
|
if msg.get("role") == "user":
|
||||||
|
query = msg.get("content", "")
|
||||||
|
break
|
||||||
|
|
||||||
|
if not query:
|
||||||
|
logger.bind(tag=TAG).warning("对话中没有用户消息")
|
||||||
|
yield "抱歉,我没有收到您的问题。"
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.bind(tag=TAG).info(f"AntafLLM 请求: {query[:50]}...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
url = f"{self.bridge_url}/chat"
|
||||||
|
resp = requests.get(
|
||||||
|
url,
|
||||||
|
params={"q": query},
|
||||||
|
stream=True,
|
||||||
|
timeout=self.timeout
|
||||||
|
)
|
||||||
|
resp.encoding = "utf-8"
|
||||||
|
|
||||||
|
for line in resp.iter_lines(decode_unicode=True):
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
if line.startswith("data: "):
|
||||||
|
data = line[6:] # 去掉 "data: " 前缀
|
||||||
|
if data == "[DONE]":
|
||||||
|
break
|
||||||
|
if data and len(data.strip()) > 0:
|
||||||
|
yield data
|
||||||
|
|
||||||
|
except requests.exceptions.ConnectionError:
|
||||||
|
logger.bind(tag=TAG).error("无法连接蚂蚁阿福 Bridge,请检查手机和 Frida 状态")
|
||||||
|
yield "抱歉,蚂蚁阿福服务暂时不可用。"
|
||||||
|
except requests.exceptions.Timeout:
|
||||||
|
logger.bind(tag=TAG).error(f"蚂蚁阿福 Bridge 超时 ({self.timeout}s)")
|
||||||
|
yield "抱歉,回答超时了。"
|
||||||
|
except Exception as e:
|
||||||
|
logger.bind(tag=TAG).error(f"AntafLLM 异常: {e}")
|
||||||
|
yield "抱歉,发生了错误。"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 第2步:修改服务端配置
|
||||||
|
|
||||||
|
编辑 `backend/main/xiaozhi-server/data/.config.yaml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
selected_module:
|
||||||
|
LLM: antaf # 改为蚂蚁阿福
|
||||||
|
|
||||||
|
LLM:
|
||||||
|
antaf:
|
||||||
|
type: antaf
|
||||||
|
bridge_url: http://<手机IP>:18900 # 手机的 HTTP Bridge 地址
|
||||||
|
timeout: 60 # SSE 流超时时间
|
||||||
|
```
|
||||||
|
|
||||||
|
也可以保留原来的 Qwen3 配置,方便切换:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
LLM:
|
||||||
|
antaf:
|
||||||
|
type: antaf
|
||||||
|
bridge_url: http://<手机IP>:18900
|
||||||
|
timeout: 60
|
||||||
|
Qwen3:
|
||||||
|
type: openai
|
||||||
|
model_name: Qwen3-32B
|
||||||
|
url: http://127.0.0.1:30000/v1
|
||||||
|
api_key: EMPTY
|
||||||
|
```
|
||||||
|
|
||||||
|
### 第3步:网络打通
|
||||||
|
|
||||||
|
手机的 Frida Bridge 端口需要让 PlugAI 服务器能访问到。有两种方式:
|
||||||
|
|
||||||
|
#### 方式1:手机直连局域网(推荐)
|
||||||
|
|
||||||
|
如果手机和 PlugAI 服务器在同一网络(或手机有公网可达 IP):
|
||||||
|
```bash
|
||||||
|
# 手机上启动 bridge 后,服务端直接访问
|
||||||
|
# bridge_url: http://<手机内网IP>:18900
|
||||||
|
curl http://<手机IP>:18900/chat?q=hello
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 方式2:adb forward + SSH 隧道
|
||||||
|
|
||||||
|
手机通过 USB 连接一台中间机器,再通过 SSH 隧道暴露:
|
||||||
|
```bash
|
||||||
|
# 中间机器上
|
||||||
|
adb forward tcp:18900 tcp:18900
|
||||||
|
|
||||||
|
# PlugAI 上建 SSH 隧道
|
||||||
|
ssh -L 18900:127.0.0.1:18900 user@中间机器IP
|
||||||
|
# bridge_url: http://127.0.0.1:18900
|
||||||
|
```
|
||||||
|
|
||||||
|
### 第4步:启动与测试
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. 手机端:启动 Frida + HTTP Bridge
|
||||||
|
frida -U -p <PID> -l http_bridge_stream.js
|
||||||
|
|
||||||
|
# 2. 先测 bridge 连通性
|
||||||
|
curl -N 'http://<手机IP>:18900/chat?q=你好'
|
||||||
|
|
||||||
|
# 3. PlugAI 服务端:重启小智服务
|
||||||
|
cd /home/ZeroStack/xiaozhi/xiaozhi-esp32-server/main/xiaozhi-server
|
||||||
|
source /home/ZeroStack/xiaozhi/venv/bin/activate
|
||||||
|
python app.py
|
||||||
|
|
||||||
|
# 4. ESP32 设备:唤醒测试
|
||||||
|
# 说 "你好小智" → 提问 → 应该听到蚂蚁阿福的回答(EdgeTTS 合成的语音)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 方案B 详细实施(后续)
|
||||||
|
|
||||||
|
### 核心改造:voice_bridge.js 支持音频注入
|
||||||
|
|
||||||
|
当前 voice_bridge.js 的 `MFAntAudio3AV2Filter::process` hook 只**读取** micIn 缓冲区。
|
||||||
|
需要改造为可以从外部**写入** micIn 缓冲区,替换真实麦克风输入。
|
||||||
|
|
||||||
|
#### 改造要点
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// voice_bridge.js 新增功能
|
||||||
|
var injectBuffer = null; // 外部注入的 PCM 数据
|
||||||
|
|
||||||
|
// 新增 inject 命令:接收外部 PCM 音频帧
|
||||||
|
// 客户端发送: [4字节长度][type=3][960字节PCM数据]
|
||||||
|
// type 3 = inject audio
|
||||||
|
|
||||||
|
Interceptor.attach(processAddr, {
|
||||||
|
onEnter: function(args) {
|
||||||
|
var micIn = args[1]; // 麦克风输入缓冲区 (960 bytes)
|
||||||
|
var frameSize = args[4]; // 960
|
||||||
|
|
||||||
|
if (injectBuffer !== null) {
|
||||||
|
// 用注入数据覆盖真实麦克风输入
|
||||||
|
micIn.writeByteArray(injectBuffer);
|
||||||
|
injectBuffer = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 采样率转换
|
||||||
|
|
||||||
|
| 来源 | 格式 | 需转换为 |
|
||||||
|
|------|------|---------|
|
||||||
|
| ESP32 → 服务端 | Opus 24kHz mono | PCM 48kHz mono (阿福 mic) |
|
||||||
|
| 阿福 speaker 输出 | PCM 24kHz stereo | Opus 24kHz mono (ESP32) |
|
||||||
|
|
||||||
|
服务端需要:
|
||||||
|
- libopus 解码/编码
|
||||||
|
- resampy 或 scipy 做采样率转换
|
||||||
|
- 960字节帧对齐(20ms @ 48kHz)
|
||||||
|
|
||||||
|
#### 新增音频转发模块
|
||||||
|
|
||||||
|
文件路径:`backend/main/xiaozhi-server/core/providers/asr/antaf_voice/antaf_voice.py`
|
||||||
|
|
||||||
|
这是一个特殊的 ASR Provider,它不做语音识别,而是:
|
||||||
|
1. 接收 ESP32 的 Opus 音频流
|
||||||
|
2. 解码为 PCM,重采样 24k→48k
|
||||||
|
3. 通过 TCP 发送到 voice_bridge (port 18901) 的 inject 命令
|
||||||
|
4. 接收 voice_bridge 的 speaker 输出
|
||||||
|
5. 重采样 24k stereo → 24k mono,Opus 编码
|
||||||
|
6. 直接发回 ESP32(跳过 LLM 和 TTS)
|
||||||
|
|
||||||
|
#### 方案B 风险点
|
||||||
|
|
||||||
|
1. **帧时序同步**: ESP32 音频帧和阿福 process() 调用频率可能不一致
|
||||||
|
2. **延迟累积**: 网络传输 + 两次重采样 + 注入延迟
|
||||||
|
3. **VAD 冲突**: 阿福自带 VAD 可能与注入音频不匹配
|
||||||
|
4. **回声消除失效**: 注入 mic 数据后,阿福的 AEC 参考信号(spkRef)对不上
|
||||||
|
5. **对话控制**: 何时 open_voice / close_voice 需要与 ESP32 唤醒状态同步
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 依赖清单
|
||||||
|
|
||||||
|
### 方案A(新增依赖)
|
||||||
|
- `requests` — Python HTTP 库(服务端 venv 中应已有)
|
||||||
|
|
||||||
|
### 方案B(新增依赖)
|
||||||
|
- `opuslib` 或 `pyogg` — Opus 编解码
|
||||||
|
- `resampy` 或 `scipy.signal` — 采样率转换
|
||||||
|
- `numpy` — 音频数据处理
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 文件清单
|
||||||
|
|
||||||
|
### 方案A
|
||||||
|
| 操作 | 文件 |
|
||||||
|
|------|------|
|
||||||
|
| 新增 | `backend/main/xiaozhi-server/core/providers/llm/antaf/__init__.py` |
|
||||||
|
| 新增 | `backend/main/xiaozhi-server/core/providers/llm/antaf/antaf.py` |
|
||||||
|
| 修改 | `backend/main/xiaozhi-server/data/.config.yaml` |
|
||||||
|
|
||||||
|
### 方案B(额外)
|
||||||
|
| 操作 | 文件 |
|
||||||
|
|------|------|
|
||||||
|
| 修改 | `antaf/voice_bridge.js` (新增 inject 命令) |
|
||||||
|
| 新增 | `backend/main/xiaozhi-server/core/providers/asr/antaf_voice/antaf_voice.py` |
|
||||||
|
| 新增 | `backend/main/xiaozhi-server/core/utils/audio_resample.py` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 里程碑
|
||||||
|
|
||||||
|
| 阶段 | 目标 | 预期产出 |
|
||||||
|
|------|------|---------|
|
||||||
|
| M1 | 方案A 代码实现 | AntafLLM Provider + 配置 |
|
||||||
|
| M2 | 网络打通 | PlugAI ↔ 手机 Bridge 连通 |
|
||||||
|
| M3 | 端到端测试 | ESP32 唤醒→阿福回答→语音播报 |
|
||||||
|
| M4 | 方案B 原型 | voice_bridge 音频注入验证 |
|
||||||
|
| M5 | 方案B 集成 | 全语音直通链路 |
|
||||||
|
|
@ -0,0 +1,79 @@
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import wave
|
||||||
|
import asyncio
|
||||||
|
import numpy as np
|
||||||
|
import sherpa_onnx
|
||||||
|
from config.logger import setup_logging
|
||||||
|
from core.providers.tts.base import TTSProviderBase
|
||||||
|
|
||||||
|
TAG = __name__
|
||||||
|
logger = setup_logging()
|
||||||
|
|
||||||
|
|
||||||
|
class TTSProvider(TTSProviderBase):
    """Local TTS provider backed by a sherpa-onnx VITS model.

    Synthesized audio is resampled to 24 kHz and returned (or written to a
    file) as 16-bit mono WAV.
    """

    def __init__(self, config, delete_audio_file):
        """Load the VITS model described by ``config``.

        Config keys read here: ``model_dir``, ``speed``, ``sid`` and
        ``num_threads``.
        """
        import threading

        super().__init__(config, delete_audio_file)
        model_dir = config.get("model_dir", "models/vits-melo-tts-zh_en")
        speed = config.get("speed", 1.0)
        # A missing, empty or zero speed falls back to 1.0.
        self.speed = float(speed) if speed else 1.0
        self.sid = int(config.get("sid", 0))

        # Prefer the int8-quantized model (faster); fall back to the full
        # model when the int8 file is missing or smaller than 1 KiB.
        model_file = f"{model_dir}/model.int8.onnx"
        if not os.path.exists(model_file) or os.path.getsize(model_file) < 1024:
            model_file = f"{model_dir}/model.onnx"

        num_threads = int(config.get("num_threads", 8))

        tts_config = sherpa_onnx.OfflineTtsConfig(
            model=sherpa_onnx.OfflineTtsModelConfig(
                vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                    model=model_file,
                    lexicon=f"{model_dir}/lexicon.txt",
                    tokens=f"{model_dir}/tokens.txt",
                    dict_dir=f"{model_dir}/dict",
                ),
                num_threads=num_threads,
            ),
            rule_fsts=f"{model_dir}/date.fst,{model_dir}/phone.fst,{model_dir}/number.fst,{model_dir}/new_heteronym.fst",
            max_num_sentences=1,
        )
        self.tts = sherpa_onnx.OfflineTts(tts_config)
        self.sample_rate = self.tts.sample_rate
        # Synthesis now runs on worker threads (see text_to_speak).
        # NOTE(review): thread-safety of OfflineTts.generate is not established
        # here, so calls are serialized to keep one-at-a-time behavior.
        self._synth_lock = threading.Lock()
        logger.bind(tag=TAG).info(
            f"SherpaOnnxTTS 初始化完成: model_dir={model_dir}, sample_rate={self.sample_rate}, sid={self.sid}"
        )

    def _generate_wav(self, text):
        """Synthesize ``text`` synchronously and return WAV bytes.

        Blocking; intended to be called from a worker thread. The result is
        16-bit mono PCM at 24 kHz in a WAV container.
        """
        from scipy.signal import resample_poly
        from math import gcd

        with self._synth_lock:
            audio = self.tts.generate(text, sid=self.sid, speed=self.speed)
        samples = np.array(audio.samples, dtype=np.float32)

        # Resample to the rate the device expects (24000 Hz). Skipped when
        # the model already produces that rate or produced no samples.
        target_sr = 24000
        if samples.size and self.sample_rate != target_sr:
            g = gcd(self.sample_rate, target_sr)
            samples = resample_poly(samples, target_sr // g, self.sample_rate // g)

        # Clip before the int16 conversion: resampling can overshoot
        # [-1, 1], and out-of-range values would wrap around.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

        wav_io = io.BytesIO()
        with wave.open(wav_io, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(target_sr)
            wf.writeframes(pcm.tobytes())
        return wav_io.getvalue()

    async def text_to_speak(self, text, output_file):
        """Synthesize ``text`` without blocking the event loop.

        Returns the WAV bytes when ``output_file`` is falsy; otherwise writes
        them to ``output_file`` and returns ``None``.
        """
        # Run the CPU-bound synthesis in the default executor so other
        # coroutines keep running while audio is generated.
        loop = asyncio.get_running_loop()
        wav_data = await loop.run_in_executor(None, self._generate_wav, text)

        if output_file:
            with open(output_file, "wb") as f:
                f.write(wav_data)
        else:
            return wav_data
|
||||||
Loading…
Reference in New Issue