add independent modules (not integrated into framework)

- modules/antaf/ — Antaf LLM provider, voice passthrough, bridge scripts
- modules/tts/ — sherpa-onnx local TTS provider
- modules/docs/ — integration plan

These are standalone files, NOT patched into the xiaozhi-server framework.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 11:40:31 -07:00 · 2026-04-06 11:40:31 -07:00 · a88e7072b3
parent ae260da3eb
commit a88e7072b3
8 changed files with 1635 additions and 0 deletions
--- a/modules/antaf/antaf_llm.py
+++ b/modules/antaf/antaf_llm.py
@ -0,0 +1,143 @@
+import json
+import requests
+from config.logger import setup_logging
+from core.providers.llm.base import LLMProviderBase
+
+TAG = __name__
+logger = setup_logging()
+
+
+class LLMProvider(LLMProviderBase):
+    """
+    蚂蚁阿福 LLM Provider
+    通过 Frida HTTP Bridge (port 18900) 对接蚂蚁阿福 App 的文字对话 API。
+    Bridge 运行在手机上，通过 adb forward 或网络暴露 SSE 流式接口。
+    """
+
+    def __init__(self, config):
+        """Store the bridge endpoint settings and log them.
+
+        Args:
+            config: Object exposing ``get``. ``bridge_url`` defaults to
+                ``http://127.0.0.1:18900`` and ``timeout`` (handed to
+                ``requests`` as the request timeout) defaults to ``60``.
+        """
+        bridge_url = config.get("bridge_url", "http://127.0.0.1:18900")
+        timeout = config.get("timeout", 60)
+        self.bridge_url, self.timeout = bridge_url, timeout
+        # Signal to send system idle after TTS; response() clears it at the
+        # start of each request and sets it when the request fails.
+        self.should_idle = False
+        logger.bind(tag=TAG).info(
+            f"AntafLLM 初始化: bridge={bridge_url}, timeout={timeout}s"
+        )
+
+    @staticmethod
+    def _is_thinking(text):
+        """Return True when ``text`` looks like Antaf's internal reasoning.
+
+        A chunk counts as reasoning (and must not be shown to the user) when
+        it contains any of the marker phrases below as a substring.
+        """
+        reasoning_markers = (
+            "用户问", "用户说", "用户的", "用户可能", "用户真正",
+            "我得", "我会", "我在想", "我决定", "我要",
+            "语气比较", "感觉他", "让他知道", "让他觉得",
+            "先安抚", "得先", "不想表现",
+            "整体语气", "这样能", "这样他",
+            "所以我", "还带了个",
+        )
+        return any(marker in text for marker in reasoning_markers)
+
+    @staticmethod
+    def _clean_text(text):
+        """Strip Antaf's internal status phrases from ``text``.
+
+        The phrases are removed one after another in the listed order, then
+        leading and trailing whitespace is trimmed.
+        """
+        cleaned = text
+        for status_phrase in ("完成资料引用", "内容生成", "正在思考", "正在搜索"):
+            # split + join drops every occurrence, same as replace(phrase, "")
+            cleaned = "".join(cleaned.split(status_phrase))
+        return cleaned.strip()
+
+    @staticmethod
+    def _is_system_injected(content):
+        """Return True when a ``user`` message was not really typed/spoken by the user.
+
+        Empty content, content containing any known system/tool marker, and
+        content longer than 200 characters are all treated as injected.
+        """
+        if not content:
+            return True
+        injected_markers = (
+            "[系统提示]", "tool_call", "<tool_call>", "TOOL USE",
+            "系统提示", "工具调用", "function_call",
+            "handle_exit_intent", "你有以下工具", "You have access",
+        )
+        has_marker = any(marker in content for marker in injected_markers)
+        # A user message above 200 characters is most likely system-injected.
+        return has_marker or len(content) > 200
+
+    def response(self, session_id, dialogue, **kwargs):
+        """Stream Antaf's reply to the newest genuine user message.
+
+        Walks ``dialogue`` backwards to find the latest ``user`` message that
+        is not system-injected, sends it to ``{bridge_url}/chat`` and yields
+        the text of each SSE ``data:`` line after de-duplication, reasoning
+        filtering and junk removal.
+
+        Args:
+            session_id: Not used by this implementation.
+            dialogue: Sequence of message dicts with ``role`` / ``content``.
+            **kwargs: Ignored.
+
+        Yields:
+            str: Cleaned reply chunks, or one apology sentence when no usable
+            query exists or the bridge request fails. On failure
+            ``self.should_idle`` is set to True.
+        """
+        # Pick the real user message, skipping system-injected "user" entries.
+        query = ""
+        for msg in reversed(dialogue):
+            if msg.get("role") != "user":
+                continue
+            content = msg.get("content", "")
+            # Non-string content cannot be concatenated with the brevity hint
+            # below, so skip it instead of crashing the generator.
+            if not isinstance(content, str) or self._is_system_injected(content):
+                continue
+            query = content
+            # ASR output may be JSON: {"content":"...", "language":"zh", "emotion":"..."}
+            try:
+                parsed = json.loads(content)
+            except (json.JSONDecodeError, TypeError):
+                parsed = None
+            if isinstance(parsed, dict) and isinstance(parsed.get("content"), str):
+                query = parsed["content"]
+            break
+
+        if not query:
+            logger.bind(tag=TAG).warning("对话中没有用户消息")
+            yield "抱歉，我没有收到您的问题。"
+            return
+
+        # Ask for a short answer so long replies do not stall the TTS queue.
+        query = query + "（请用2-3句话简短回答）"
+        self.should_idle = False
+        logger.bind(tag=TAG).info(f"AntafLLM 请求: {query[:50]}...")
+
+        try:
+            # The context manager releases the connection even when the
+            # consumer stops iterating early or an exception is raised.
+            with requests.get(
+                f"{self.bridge_url}/chat",
+                params={"q": query},
+                stream=True,
+                timeout=self.timeout,
+            ) as resp:
+                # Surface HTTP errors instead of parsing an error page as SSE.
+                resp.raise_for_status()
+                resp.encoding = "utf-8"
+
+                seen_texts = set()
+                for line in resp.iter_lines(decode_unicode=True):
+                    if not line or not line.startswith("data: "):
+                        continue
+                    data = line[6:]
+                    if data == "[DONE]":
+                        break
+                    if not data.strip():
+                        continue
+                    # De-duplicate: skip chunks identical to an earlier one.
+                    if data in seen_texts:
+                        continue
+                    seen_texts.add(data)
+                    # Drop reasoning that is not meant for the user.
+                    if self._is_thinking(data):
+                        logger.bind(tag=TAG).debug(f"过滤思考内容: {data[:50]}...")
+                        continue
+                    # Remove internal status phrases.
+                    data = self._clean_text(data)
+                    if not data:
+                        continue
+                    yield data
+
+        except requests.exceptions.ConnectionError:
+            logger.bind(tag=TAG).error("无法连接蚂蚁阿福 Bridge，请检查手机和 Frida 状态")
+            self.should_idle = True
+            yield "抱歉，蚂蚁阿福服务暂时不可用。"
+        except requests.exceptions.Timeout:
+            logger.bind(tag=TAG).error(f"蚂蚁阿福 Bridge 超时 ({self.timeout}s)")
+            self.should_idle = True
+            yield "抱歉，回答超时了。"
+        except Exception as e:
+            logger.bind(tag=TAG).error(f"AntafLLM 异常: {e}")
+            self.should_idle = True
+            yield "抱歉，发生了错误。"
--- a/modules/antaf/antaf_passthrough.py
+++ b/modules/antaf/antaf_passthrough.py
@ -0,0 +1,251 @@
+"""
+Antaf Voice Passthrough ASR Provider
+
+Replaces ASR→LLM→TTS pipeline with direct audio forwarding to Antaf voice_bridge.
+ESP32 audio → decode Opus → resample 16kHz→48kHz → inject to voice_bridge (type=3)
+voice_bridge speaker (type=0) → resample 48kHz→16kHz → encode Opus → send to ESP32
+
+Runs within xiaozhi-server, keeping all protocol handling (hello, OTA, wake word) intact.
+"""
+
+import json
+import struct
+import asyncio
+import threading
+import numpy as np
+import opuslib_next
+from scipy.signal import resample_poly
+from math import gcd
+from config.logger import setup_logging
+from core.providers.asr.base import ASRProviderBase
+from core.handle.sendAudioHandle import send_tts_message
+
+TAG = __name__
+logger = setup_logging()
+
+# Audio parameters
+# ESP_SR is the rate passed to the Opus decoder/encoder below.
+ESP_SR = 16000
+ESP_FRAME_SAMPLES = 960  # 60ms at 16kHz
+# BRIDGE_SR is the rate of PCM exchanged with voice_bridge (see module docstring).
+BRIDGE_SR = 48000
+BRIDGE_FRAME_SAMPLES = 480  # 960 bytes / 2 = 480 samples
+
+# Resample ratios
+# Each pair is (up, down) for scipy.signal.resample_poly, reduced by the gcd.
+UP = (BRIDGE_SR // gcd(BRIDGE_SR, ESP_SR), ESP_SR // gcd(BRIDGE_SR, ESP_SR))  # (3,1)
+DOWN = (ESP_SR // gcd(ESP_SR, BRIDGE_SR), BRIDGE_SR // gcd(ESP_SR, BRIDGE_SR))  # (1,3)
+
+
+class ASRProvider(ASRProviderBase):
+    def __init__(self, config):
+        """Read the bridge address from ``config`` and create empty session state.
+
+        Args:
+            config: Object exposing ``get``. ``bridge_host`` defaults to
+                ``127.0.0.1`` and ``bridge_port`` to ``18901``.
+        """
+        super().__init__()
+        self.bridge_host = config.get("bridge_host", "127.0.0.1")
+        self.bridge_port = int(config.get("bridge_port", 18901))
+        self.interface_type = "NON_STREAM"
+        self.conn = None
+        self.bridge_reader = None
+        self.bridge_writer = None
+        self.opus_decoder = None
+        self.opus_encoder = None
+        self._inject_buf = np.array([], dtype=np.int16)
+        self._speaker_buf = np.array([], dtype=np.int16)
+        self._tts_started = False
+        # Created here as well as in open_audio_channels() so that every
+        # attribute the other methods read exists from construction onwards.
+        self._silence_count = 0
+        self._write_lock = threading.Lock()
+        self._recv_task = None
+        self._connected = False
+        logger.bind(tag=TAG).info(
+            f"AntafPassthrough 初始化: bridge={self.bridge_host}:{self.bridge_port}"
+        )
+
+    async def open_audio_channels(self, conn):
+        """Connect to voice_bridge and start passthrough instead of normal ASR.
+
+        Any previous bridge connection is closed first. On success the bridge
+        is switched to capture + inject mode and a background task starts
+        forwarding speaker audio. Whether or not the bridge is reachable, a
+        daemon thread is started that consumes ``conn.asr_audio_queue``.
+
+        Args:
+            conn: Connection object. ``asr_priority_thread`` is assigned here;
+                the audio thread reads ``stop_event`` and ``asr_audio_queue``.
+        """
+        # Clean up previous connection if any
+        await self.close()
+
+        self.conn = conn
+        self.opus_decoder = opuslib_next.Decoder(ESP_SR, 1)
+        self.opus_encoder = opuslib_next.Encoder(ESP_SR, 1, opuslib_next.APPLICATION_AUDIO)
+        self._tts_started = False
+        self._silence_count = 0
+        self._inject_buf = np.array([], dtype=np.int16)
+        self._speaker_buf = np.array([], dtype=np.int16)
+        self._write_lock = threading.Lock()
+
+        # Connect to voice_bridge
+        try:
+            self.bridge_reader, self.bridge_writer = await asyncio.open_connection(
+                self.bridge_host, self.bridge_port
+            )
+            # Read connected event
+            ftype, data = await self._bridge_recv()
+            if ftype == 1:
+                msg = json.loads(data.decode())
+                logger.bind(tag=TAG).info(f"Bridge connected: {msg.get('protocol')}")
+
+            # Send start + inject_on, logging the frame that follows each one
+            for command in ("start", "inject_on"):
+                self._bridge_send_cmd({"cmd": command})
+                ftype, data = await self._bridge_recv()
+                if ftype == 1:
+                    logger.bind(tag=TAG).info(f"Bridge: {json.loads(data.decode())}")
+
+            self._connected = True
+            logger.bind(tag=TAG).info("Voice bridge ready (inject mode)")
+
+            # Start speaker receive loop
+            self._recv_task = asyncio.create_task(self._speaker_recv_loop())
+
+        except Exception as e:
+            logger.bind(tag=TAG).error(f"Bridge connection failed: {e}")
+            self._connected = False
+            # Do not leak a TCP connection whose handshake failed half-way.
+            if self.bridge_writer is not None:
+                self.bridge_writer.close()
+                self.bridge_writer = None
+                self.bridge_reader = None
+
+        # Start normal audio processing thread (reads from asr_audio_queue)
+        conn.asr_priority_thread = threading.Thread(
+            target=self._audio_thread, args=(conn,), daemon=True
+        )
+        conn.asr_priority_thread.start()
+
+    def _audio_thread(self, conn):
+        """Read Opus frames from the queue, decode, resample and inject to the bridge.
+
+        Runs until ``conn.stop_event`` is set. Frames that arrive while the
+        bridge is not connected are discarded. Per-frame errors are logged
+        and the loop continues.
+        """
+        import queue as queue_module
+        frame_count = 0
+        while not conn.stop_event.is_set():
+            try:
+                opus_data = conn.asr_audio_queue.get(timeout=1)
+                if not self._connected:
+                    continue
+
+                frame_count += 1
+                if frame_count <= 3 or frame_count % 200 == 0:
+                    logger.bind(tag=TAG).debug(f"Audio frame #{frame_count}")
+
+                # Decode Opus → PCM 16kHz
+                pcm = self.opus_decoder.decode(opus_data, ESP_FRAME_SAMPLES)
+                samples = np.frombuffer(pcm, dtype=np.int16)
+
+                # Resample 16kHz → 48kHz. The filter output can overshoot the
+                # int16 range, so clip before casting to avoid wrap-around.
+                upsampled = np.clip(
+                    resample_poly(samples, UP[0], UP[1]), -32768, 32767
+                ).astype(np.int16)
+
+                # Split into bridge frames and inject
+                self._inject_buf = np.concatenate([self._inject_buf, upsampled])
+                while len(self._inject_buf) >= BRIDGE_FRAME_SAMPLES:
+                    frame = self._inject_buf[:BRIDGE_FRAME_SAMPLES]
+                    self._inject_buf = self._inject_buf[BRIDGE_FRAME_SAMPLES:]
+                    self._bridge_send_inject(frame.tobytes())
+
+            except queue_module.Empty:
+                continue
+            except Exception as e:
+                logger.bind(tag=TAG).error(f"Audio thread error: {e}")
+
+    async def _speaker_recv_loop(self):
+        """Read frames from the bridge until disconnected.
+
+        Type-0 frames are handed to ``_handle_speaker``; type-1 frames are
+        decoded as JSON and logged. Other types are ignored. ``_connected``
+        is cleared when the loop ends for any reason.
+        """
+        try:
+            while self._connected:
+                frame_type, payload = await self._bridge_recv()
+                if frame_type == 1:
+                    event = json.loads(payload.decode())
+                    logger.bind(tag=TAG).debug(f"Bridge event: {event}")
+                elif frame_type == 0:
+                    # Speaker audio
+                    await self._handle_speaker(payload)
+        except asyncio.IncompleteReadError:
+            logger.bind(tag=TAG).warning("Bridge connection closed")
+        except Exception as e:
+            logger.bind(tag=TAG).error(f"Speaker recv error: {e}")
+        finally:
+            self._connected = False
+
+    async def _handle_speaker(self, pcm_bytes):
+        """Process one speaker PCM frame from the bridge and forward it to the ESP32.
+
+        Near-silent frames (peak amplitude below 10) are not forwarded; after
+        more than 50 of them in a row while playing, a ``tts stop`` is sent.
+        Audible frames trigger ``tts start`` once, are resampled 48kHz → 16kHz,
+        buffered, and sent as Opus frames of ``ESP_FRAME_SAMPLES`` samples.
+
+        Args:
+            pcm_bytes: Raw little/native-endian 16-bit PCM payload of a
+                type-0 bridge frame.
+        """
+        if not self.conn or not self.conn.websocket:
+            return
+
+        # Ignore empty frames and a trailing odd byte: np.frombuffer / np.max
+        # would otherwise raise and terminate the receive loop.
+        sample_count = len(pcm_bytes) // 2
+        if sample_count == 0:
+            return
+        samples = np.frombuffer(pcm_bytes, dtype=np.int16, count=sample_count)
+        # Widen before abs(): abs(int16(-32768)) overflows back to -32768.
+        max_amp = int(np.max(np.abs(samples.astype(np.int32))))
+
+        # Track silence for tts stop
+        if max_amp < 10:
+            if self._tts_started:
+                self._silence_count += 1
+                # More than 50 consecutive silent frames → send tts stop.
+                # NOTE(review): the original comment says "~1 second"; the
+                # real duration depends on the bridge's speaker frame size.
+                if self._silence_count > 50:
+                    try:
+                        await send_tts_message(self.conn, "stop")
+                        self.conn.client_is_speaking = False
+                        self._tts_started = False
+                        self._silence_count = 0
+                        logger.bind(tag=TAG).info("Sent tts stop to ESP32")
+                    except Exception as e:
+                        logger.bind(tag=TAG).error(f"Send tts stop error: {e}")
+            return
+
+        # Reset silence counter on non-silent frame
+        self._silence_count = 0
+
+        # Send tts start before first audio
+        if not self._tts_started:
+            try:
+                await send_tts_message(self.conn, "start")
+                self._tts_started = True
+                self.conn.client_is_speaking = True
+                logger.bind(tag=TAG).info("Sent tts start to ESP32")
+            except Exception as e:
+                logger.bind(tag=TAG).error(f"Send tts start error: {e}")
+                return
+
+        # Resample 48kHz → 16kHz; clip so filter overshoot cannot wrap around
+        # when cast back to int16.
+        downsampled = np.clip(
+            resample_poly(samples, DOWN[0], DOWN[1]), -32768, 32767
+        ).astype(np.int16)
+
+        # Accumulate and encode
+        self._speaker_buf = np.concatenate([self._speaker_buf, downsampled])
+        while len(self._speaker_buf) >= ESP_FRAME_SAMPLES:
+            frame = self._speaker_buf[:ESP_FRAME_SAMPLES]
+            self._speaker_buf = self._speaker_buf[ESP_FRAME_SAMPLES:]
+            opus_data = self.opus_encoder.encode(frame.tobytes(), ESP_FRAME_SAMPLES)
+            try:
+                await self.conn.websocket.send(opus_data)
+            except Exception as e:
+                logger.bind(tag=TAG).error(f"Send opus to ESP32 error: {e}")
+                return
+
+    # Bridge TCP helpers
+    async def _bridge_recv(self):
+        """Read one frame from the bridge.
+
+        Wire format: 4-byte big-endian payload length, 1-byte frame type,
+        then the payload.
+
+        Returns:
+            tuple[int, bytes]: ``(frame_type, payload)``.
+
+        Raises:
+            asyncio.IncompleteReadError: The stream ended mid-frame.
+            ValueError: The announced payload length exceeds 1 MiB.
+        """
+        # Same 1 MiB cap as recv_frame() in test_inject.py: a corrupt header
+        # must not make us wait for (or buffer) gigabytes of data.
+        max_frame_bytes = 1048576
+        header = await self.bridge_reader.readexactly(5)
+        length = struct.unpack(">I", header[:4])[0]
+        ftype = header[4]
+        if length > max_frame_bytes:
+            raise ValueError(f"Bridge frame too large: {length} bytes")
+        data = await self.bridge_reader.readexactly(length)
+        return ftype, data
+
+    def _bridge_send_cmd(self, cmd):
+        """Serialize ``cmd`` as JSON and queue it to the bridge as a type-1 frame."""
+        payload = json.dumps(cmd).encode()
+        frame = struct.pack(">IB", len(payload), 1) + payload
+        with self._write_lock:
+            self.bridge_writer.write(frame)
+
+    def _bridge_send_inject(self, pcm_bytes):
+        """Queue ``pcm_bytes`` to the bridge as a type-3 (inject) frame.
+
+        This is called from the audio worker thread, but asyncio stream
+        writers may only be used from the event-loop thread. When the
+        receive task exists, the write is therefore scheduled on that task's
+        loop with ``call_soon_threadsafe``; otherwise it falls back to a
+        direct write.
+        """
+        frame = struct.pack(">IB", len(pcm_bytes), 3) + pcm_bytes
+        recv_task = self._recv_task
+        loop = recv_task.get_loop() if recv_task is not None else None
+        with self._write_lock:
+            writer = self.bridge_writer
+            if loop is not None and loop.is_running():
+                # Scheduled callbacks run in submission order, so frame
+                # ordering on the wire is preserved.
+                loop.call_soon_threadsafe(writer.write, frame)
+            else:
+                writer.write(frame)
+
+    # ASR interface — never returns text, LLM/TTS never triggered
+    async def receive_audio(self, conn, audio, audio_have_voice):
+        """Do nothing: _audio_thread consumes the audio queue directly."""
+        return None
+
+    async def speech_to_text(self, opus_data, session_id, audio_format="opus", artifacts=None):
+        """Return an empty result; not expected to be called in passthrough mode."""
+        empty_result = ("", None)
+        return empty_result
+
+    async def close(self):
+        """Stop passthrough and release the bridge connection.
+
+        Cancels the receive task, asks the bridge to leave inject mode and
+        stop capture (best effort), then closes the TCP writer. Safe to call
+        when nothing is connected and safe to call repeatedly.
+        """
+        self._connected = False
+        if self._recv_task:
+            self._recv_task.cancel()
+        writer = self.bridge_writer
+        if writer is None:
+            return
+        try:
+            self._bridge_send_cmd({"cmd": "inject_off"})
+            self._bridge_send_cmd({"cmd": "stop"})
+        except Exception as e:
+            # Best effort only: the peer may already be gone.
+            logger.bind(tag=TAG).debug(f"Bridge shutdown commands failed: {e}")
+        finally:
+            # Always close the socket, even if the commands above failed, and
+            # drop the references so a later close() does not reuse them.
+            try:
+                writer.close()
+            except Exception as e:
+                logger.bind(tag=TAG).debug(f"Bridge writer close failed: {e}")
+            self.bridge_writer = None
+            self.bridge_reader = None
--- a/modules/antaf/relay.py
+++ b/modules/antaf/relay.py
@ -0,0 +1,307 @@
+#!/usr/bin/env python3
+"""
+ESP32 ↔ Antaf Voice Relay
+Bridges ESP32 (WebSocket/Opus) with Antaf voice_bridge (TCP/PCM).
+
+ESP32 → Opus decode → resample 16kHz→48kHz → voice_bridge inject (type=3)
+ESP32 ← Opus encode ← resample 48kHz→16kHz ← voice_bridge speaker (type=0)
+
+Usage: python relay.py [--ws-port 8010] [--bridge-host 127.0.0.1] [--bridge-port 18901]
+"""
+
+import asyncio
+import json
+import struct
+import argparse
+import logging
+import numpy as np
+from scipy.signal import resample_poly
+from math import gcd
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+log = logging.getLogger("relay")
+
+try:
+    import opuslib_next as opuslib
+except ImportError:
+    import opuslib
+
+import websockets
+
+# Audio parameters
+ESP_SAMPLE_RATE = 16000      # ESP32 Opus sample rate
+ESP_FRAME_MS = 60            # ESP32 frame duration
+ESP_FRAME_SIZE = ESP_SAMPLE_RATE * ESP_FRAME_MS // 1000  # 960 samples
+
+BRIDGE_SAMPLE_RATE = 48000   # voice_bridge micIn sample rate
+BRIDGE_FRAME_BYTES = 960     # 480 samples * 2 bytes
+BRIDGE_FRAME_SAMPLES = 480
+
+# Resampling ratios
+# Each *_RATIO is an (up, down) pair for scipy.signal.resample_poly,
+# reduced by the gcd of the two sample rates.
+UP_GCD = gcd(BRIDGE_SAMPLE_RATE, ESP_SAMPLE_RATE)    # 16000 → 48000
+UP_RATIO = (BRIDGE_SAMPLE_RATE // UP_GCD, ESP_SAMPLE_RATE // UP_GCD)  # (3, 1)
+DOWN_GCD = gcd(ESP_SAMPLE_RATE, BRIDGE_SAMPLE_RATE)  # 48000 → 16000
+DOWN_RATIO = (ESP_SAMPLE_RATE // DOWN_GCD, BRIDGE_SAMPLE_RATE // DOWN_GCD)  # (1, 3)
+
+
+class BridgeClient:
+    """TCP client for voice_bridge_v7.
+
+    Frames on the wire are: 4-byte big-endian payload length, 1-byte type,
+    payload. This client sends type 1 (JSON command) and type 3 (inject PCM)
+    and dispatches received type 0 (speaker PCM) and type 1 (JSON event).
+    """
+
+    # Upper bound for a received payload; matches recv_frame() in
+    # test_inject.py so a corrupt header cannot request gigabytes.
+    MAX_FRAME_BYTES = 1048576
+
+    def __init__(self, host, port):
+        self.host = host
+        self.port = port
+        self.reader = None
+        self.writer = None
+        self.on_speaker_frame = None  # async callback(pcm_bytes)
+        self._recv_task = None
+
+    async def connect(self):
+        """Open the TCP connection and consume the bridge's first frame."""
+        self.reader, self.writer = await asyncio.open_connection(self.host, self.port)
+        log.info(f"Connected to voice_bridge {self.host}:{self.port}")
+
+        # Read connected event
+        ftype, data = await self._recv_frame()
+        if ftype == 1:
+            msg = json.loads(data.decode())
+            log.info(f"Bridge: {msg.get('protocol')}")
+
+    async def _recv_frame(self):
+        """Read one frame and return ``(frame_type, payload)``.
+
+        Raises:
+            asyncio.IncompleteReadError: The stream ended mid-frame.
+            ValueError: The announced length exceeds ``MAX_FRAME_BYTES``.
+        """
+        header = await self.reader.readexactly(5)
+        length = struct.unpack(">I", header[:4])[0]
+        ftype = header[4]
+        if length > self.MAX_FRAME_BYTES:
+            raise ValueError(f"Bridge frame too large: {length} bytes")
+        data = await self.reader.readexactly(length)
+        return ftype, data
+
+    def _send_frame(self, ftype, data):
+        """Queue one frame on the writer without waiting for it to drain."""
+        header = struct.pack(">IB", len(data), ftype)
+        self.writer.write(header + data)
+        # Note: no await drain() here — voice frames are time-sensitive,
+        # TCP buffer handles backpressure
+
+    def send_cmd(self, cmd):
+        """Send ``cmd`` as a JSON type-1 frame."""
+        self._send_frame(1, json.dumps(cmd).encode())
+
+    def send_inject(self, pcm_bytes):
+        """Send PCM as a type-3 (inject) frame."""
+        self._send_frame(3, pcm_bytes)
+
+    async def start_recv_loop(self):
+        """Background task: receive frames from bridge until it closes or errors."""
+        try:
+            while True:
+                ftype, data = await self._recv_frame()
+                if ftype == 0 and self.on_speaker_frame:
+                    # Speaker audio
+                    await self.on_speaker_frame(data)
+                elif ftype == 1:
+                    msg = json.loads(data.decode())
+                    log.info(f"Bridge event: {msg}")
+        except asyncio.IncompleteReadError:
+            log.warning("Bridge connection closed")
+        except Exception as e:
+            log.error(f"Bridge recv error: {e}")
+
+    async def setup_voice(self):
+        """Start capture and enable inject. Voice chat must already be open."""
+        for command in ("start", "inject_on"):
+            self.send_cmd({"cmd": command})
+            # Log the frame that follows each command when it is a JSON event.
+            ftype, data = await self._recv_frame()
+            if ftype == 1:
+                msg = json.loads(data.decode())
+                log.info(f"Bridge: {msg}")
+        log.info("Voice bridge ready (inject mode)")
+
+    async def close(self):
+        """Leave inject mode, stop capture (best effort) and close the socket."""
+        writer = self.writer
+        if writer is None:
+            # Never connected (or already closed): nothing to send or close.
+            return
+        try:
+            self.send_cmd({"cmd": "inject_off"})
+            self.send_cmd({"cmd": "stop"})
+        except Exception as e:
+            # Best effort only: the peer may already be gone.
+            log.debug(f"Bridge shutdown commands failed: {e}")
+        finally:
+            writer.close()
+            self.writer = None
+
+
+class Relay:
+    """Main relay: ESP32 WebSocket ↔ Antaf voice_bridge TCP.
+
+    Session state (socket, codecs, buffers, TTS flags) lives on the instance,
+    so the relay serves one ESP32 at a time; each new connection resets it.
+    """
+
+    # Consecutive silent speaker frames after which "tts stop" is sent; same
+    # threshold as the passthrough provider in antaf_passthrough.py.
+    SILENCE_FRAMES_BEFORE_STOP = 50
+
+    def __init__(self, ws_port, bridge_host, bridge_port):
+        self.ws_port = ws_port
+        self.bridge_host = bridge_host
+        self.bridge_port = bridge_port
+        self.bridge = None
+        self.ws = None
+        self.opus_decoder = None
+        self.opus_encoder = None
+        self._reset_session_state()
+
+    def _reset_session_state(self):
+        """Clear per-connection buffers, counters and TTS flags."""
+        # Buffer for resampled PCM to split into bridge frames
+        self._inject_buf = np.array([], dtype=np.int16)
+        # Buffer for speaker PCM to accumulate before encoding
+        self._speaker_buf = np.array([], dtype=np.int16)
+        self._audio_in_count = 0
+        self._audio_out_count = 0
+        self._tts_started = False  # track if we sent tts start to ESP32
+        self._silence_count = 0    # consecutive silent speaker frames
+
+    async def handle_esp32(self, websocket):
+        """Handle one ESP32 WebSocket connection from connect to disconnect."""
+        log.info(f"ESP32 connected from {websocket.remote_address}")
+        self.ws = websocket
+        # Without this a second session inherits _tts_started=True and stale
+        # audio buffers from the previous one and never receives "tts start".
+        self._reset_session_state()
+
+        # Init Opus codec
+        self.opus_decoder = opuslib.Decoder(ESP_SAMPLE_RATE, 1)
+        self.opus_encoder = opuslib.Encoder(ESP_SAMPLE_RATE, 1, opuslib.APPLICATION_AUDIO)
+
+        self.bridge = BridgeClient(self.bridge_host, self.bridge_port)
+        recv_task = None
+        try:
+            # Connect to voice bridge and setup voice chat first. This is
+            # inside the try so the bridge socket is closed if setup fails.
+            await self.bridge.connect()
+            await self.bridge.setup_voice()
+
+            # Now start receiving speaker audio
+            self.bridge.on_speaker_frame = self._on_speaker_frame
+            recv_task = asyncio.create_task(self.bridge.start_recv_loop())
+
+            async for message in websocket:
+                if isinstance(message, str):
+                    # Text message from ESP32 (hello, listen, etc.)
+                    await self._handle_text(message)
+                elif isinstance(message, bytes):
+                    # Opus audio from ESP32
+                    await self._handle_audio(message)
+        except websockets.exceptions.ConnectionClosed:
+            log.info("ESP32 disconnected")
+        finally:
+            if recv_task is not None:
+                recv_task.cancel()
+            await self.bridge.close()
+            self.ws = None
+            log.info("Session ended")
+
+    async def _handle_text(self, message):
+        """Handle text messages from ESP32."""
+        try:
+            msg = json.loads(message)
+            msg_type = msg.get("type")
+
+            if msg_type == "hello":
+                # Respond with proper hello — must match xiaozhi protocol
+                resp = {
+                    "type": "hello",
+                    "version": 1,
+                    "transport": "websocket",
+                    "session_id": "relay-session",
+                    "audio_params": {
+                        "format": "opus",
+                        "sample_rate": ESP_SAMPLE_RATE,
+                        "channels": 1,
+                        "frame_duration": ESP_FRAME_MS,
+                    },
+                }
+                await self.ws.send(json.dumps(resp))
+                log.info(f"ESP32 hello: {msg.get('audio_params')}")
+
+            elif msg_type == "listen":
+                state = msg.get("state")
+                log.info(f"ESP32 listen: {state}")
+                if state == "detect":
+                    text = msg.get("text", "")
+                    log.info(f"Wake word: {text}")
+                    # Don't send tts start — let ESP32 continue recording
+
+            elif msg_type == "abort":
+                log.info("ESP32 abort")
+
+        except json.JSONDecodeError:
+            log.warning(f"Invalid JSON from ESP32: {message[:100]}")
+
+    async def _handle_audio(self, opus_data):
+        """Decode Opus from ESP32, resample, inject into voice_bridge."""
+        try:
+            self._audio_in_count += 1
+            if self._audio_in_count <= 3 or self._audio_in_count % 100 == 0:
+                log.info(f"ESP32 audio frame #{self._audio_in_count}, size={len(opus_data)}")
+
+            # Decode Opus → PCM 16kHz mono
+            pcm = self.opus_decoder.decode(opus_data, ESP_FRAME_SIZE)
+            samples = np.frombuffer(pcm, dtype=np.int16)
+
+            # Resample 16kHz → 48kHz; clip because the filter can overshoot
+            # the int16 range and a bare cast would wrap around.
+            upsampled = np.clip(
+                resample_poly(samples, UP_RATIO[0], UP_RATIO[1]), -32768, 32767
+            ).astype(np.int16)
+
+            # Append to inject buffer and send in bridge frame sizes
+            self._inject_buf = np.concatenate([self._inject_buf, upsampled])
+            while len(self._inject_buf) >= BRIDGE_FRAME_SAMPLES:
+                frame = self._inject_buf[:BRIDGE_FRAME_SAMPLES]
+                self._inject_buf = self._inject_buf[BRIDGE_FRAME_SAMPLES:]
+                self.bridge.send_inject(frame.tobytes())
+
+        except Exception as e:
+            log.error(f"Audio inject error: {e}")
+
+    async def _on_speaker_frame(self, pcm_bytes):
+        """Receive speaker PCM from bridge, resample, encode Opus, send to ESP32.
+
+        Near-silent frames (peak below 10) are not forwarded. After more than
+        ``SILENCE_FRAMES_BEFORE_STOP`` of them in a row while playing, a
+        ``tts stop`` message is sent so the ESP32 leaves the speaking state.
+        """
+        if not self.ws or getattr(self.ws, 'closed', False):
+            return
+        try:
+            self._audio_out_count += 1
+
+            # Ignore empty frames / a trailing odd byte instead of raising.
+            sample_count = len(pcm_bytes) // 2
+            if sample_count == 0:
+                return
+            samples = np.frombuffer(pcm_bytes, dtype=np.int16, count=sample_count)
+            # Widen before abs(): abs(int16(-32768)) overflows to -32768.
+            max_amp = int(np.max(np.abs(samples.astype(np.int32))))
+
+            if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
+                log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
+
+            # Only send non-silent frames to ESP32
+            if max_amp < 10:
+                # Short gaps are normal; only a long run of silence ends TTS.
+                if self._tts_started:
+                    self._silence_count += 1
+                    if self._silence_count > self.SILENCE_FRAMES_BEFORE_STOP:
+                        await self.ws.send(json.dumps({
+                            "type": "tts", "state": "stop",
+                            "session_id": "relay-session"
+                        }))
+                        self._tts_started = False
+                        self._silence_count = 0
+                        log.info("Sent tts stop to ESP32")
+                return
+
+            self._silence_count = 0
+
+            # Send tts start before first audio frame
+            if not self._tts_started:
+                await self.ws.send(json.dumps({
+                    "type": "tts", "state": "start",
+                    "session_id": "relay-session"
+                }))
+                self._tts_started = True
+                log.info("Sent tts start to ESP32")
+
+            # Resample 48kHz → 16kHz, clipped for the same reason as above
+            downsampled = np.clip(
+                resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]), -32768, 32767
+            ).astype(np.int16)
+
+            # Accumulate into speaker buffer, encode when we have enough
+            self._speaker_buf = np.concatenate([self._speaker_buf, downsampled])
+            while len(self._speaker_buf) >= ESP_FRAME_SIZE:
+                frame = self._speaker_buf[:ESP_FRAME_SIZE]
+                self._speaker_buf = self._speaker_buf[ESP_FRAME_SIZE:]
+                # Encode PCM → Opus
+                opus_data = self.opus_encoder.encode(frame.tobytes(), ESP_FRAME_SIZE)
+                await self.ws.send(opus_data)
+
+        except Exception as e:
+            log.error(f"Speaker send error: {e}")
+
+    async def run(self):
+        """Serve the WebSocket endpoint on all interfaces until cancelled."""
+        log.info(f"Relay starting on ws://0.0.0.0:{self.ws_port}/xiaozhi/v1/")
+        async with websockets.serve(
+            self.handle_esp32, "0.0.0.0", self.ws_port,
+            ping_interval=30, ping_timeout=10,
+        ):
+            await asyncio.Future()  # run forever
+
+
+def main():
+    """Parse command-line options and run the relay until interrupted."""
+    cli = argparse.ArgumentParser(description="ESP32-Antaf Voice Relay")
+    cli.add_argument("--ws-port", type=int, default=8010, help="WebSocket port for ESP32")
+    cli.add_argument("--bridge-host", default="127.0.0.1", help="voice_bridge host")
+    cli.add_argument("--bridge-port", type=int, default=18901, help="voice_bridge port")
+    opts = cli.parse_args()
+
+    asyncio.run(Relay(opts.ws_port, opts.bridge_host, opts.bridge_port).run())
+
+
+if __name__ == "__main__":
+    main()
--- a/modules/antaf/test_inject.py
+++ b/modules/antaf/test_inject.py
@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""Test voice_bridge_v7 audio injection.
+Connect to voice_bridge, open voice chat, enable inject mode,
+send silence frames, and print any speaker output received.
+
+Usage: python test_inject.py [host] [port]
+"""
+import socket
+import struct
+import json
+import time
+import sys
+import threading
+
+# Bridge address: first and second positional CLI arguments, with defaults.
+HOST = sys.argv[1] if len(sys.argv) > 1 else "127.0.0.1"
+PORT = int(sys.argv[2]) if len(sys.argv) > 2 else 18901
+
+FRAME_SIZE = 960  # 960 bytes per frame (480 samples * 16bit)
+
+
+def send_cmd(sock, cmd):
+    """Send ``cmd`` as a JSON text frame (type=1) over ``sock``."""
+    payload = json.dumps(cmd).encode("utf-8")
+    # 4-byte big-endian length + 1-byte type, then the payload
+    sock.sendall(struct.pack(">IB", len(payload), 1) + payload)
+
+
+def send_inject(sock, pcm_frame):
+    """Send ``pcm_frame`` as an inject frame (type=3) over ``sock``."""
+    frame = struct.pack(">IB", len(pcm_frame), 3) + pcm_frame
+    sock.sendall(frame)
+
+
+def recv_exact(sock, n):
+    """Read exactly ``n`` bytes from ``sock``.
+
+    Returns the bytes read, or ``None`` if the peer closes the connection
+    before ``n`` bytes have arrived.
+    """
+    collected = bytearray()
+    remaining = n
+    while remaining > 0:
+        piece = sock.recv(remaining)
+        if not piece:
+            return None
+        collected.extend(piece)
+        remaining = n - len(collected)
+    return bytes(collected)
+
+
+def recv_frame(sock):
+    """Read one frame and return ``(frame_type, payload)``.
+
+    Returns ``(None, None)`` when the connection closes mid-frame or the
+    announced payload length exceeds 1048576 bytes.
+    """
+    failed = (None, None)
+    header = recv_exact(sock, 5)
+    if header is None:
+        return failed
+    # 4-byte big-endian payload length followed by a 1-byte frame type
+    (length,) = struct.unpack(">I", header[:4])
+    ftype = header[4]
+    if length > 1048576:
+        return failed
+    payload = recv_exact(sock, length)
+    return failed if payload is None else (ftype, payload)
+
+
+def receiver(sock):
+    """Background thread: print frames received from ``sock`` until it closes.
+
+    JSON frames (type 1) are printed in full. Speaker frames (type 0) are
+    summarised for the first five, every hundredth, and any frame whose peak
+    amplitude exceeds 500. Mic frames (type 2) are ignored.
+    """
+    spk_count = 0
+    while True:
+        try:
+            ftype, data = recv_frame(sock)
+            if ftype is None:
+                print("[RECV] Connection closed")
+                break
+            if ftype == 1:  # text/json
+                msg = json.loads(data.decode("utf-8"))
+                print(f"[RECV] {msg}")
+            elif ftype == 0:  # speaker audio
+                spk_count += 1
+                # Check if audio is non-silent. unpack_from tolerates a
+                # trailing odd byte and default=0 covers an empty frame, so
+                # a malformed frame no longer ends the receiver thread.
+                samples = struct.unpack_from(f"<{len(data) // 2}h", data)
+                max_amp = max((abs(s) for s in samples), default=0)
+                if spk_count <= 5 or spk_count % 100 == 0 or max_amp > 500:
+                    print(f"[SPK] frame={spk_count} size={len(data)} max_amp={max_amp}")
+            elif ftype == 2:  # mic audio
+                pass  # ignore mic echo
+        except Exception as e:
+            print(f"[RECV] Error: {e}")
+            break
+
+
+def main():
+    """Run the injection smoke test against the bridge at ``HOST:PORT``.
+
+    Opens voice chat, starts capture, enables inject mode, sends 150 silent
+    frames, waits for speaker output, then switches everything off again.
+    """
+    print(f"Connecting to {HOST}:{PORT}...")
+    # The context manager closes the socket even if a step below raises.
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        sock.connect((HOST, PORT))
+        print("Connected")
+
+        # Start receiver thread
+        t = threading.Thread(target=receiver, args=(sock,), daemon=True)
+        t.start()
+
+        time.sleep(1)
+
+        # Open voice chat
+        print("Opening voice chat...")
+        send_cmd(sock, {"cmd": "open_voice"})
+        time.sleep(3)
+
+        # Start capture
+        print("Starting capture...")
+        send_cmd(sock, {"cmd": "start"})
+        time.sleep(1)
+
+        # Enable inject mode
+        print("Enabling inject mode...")
+        send_cmd(sock, {"cmd": "inject_on"})
+        time.sleep(0.5)
+
+        # Send 150 silence frames paced at 20ms each (3 seconds in total).
+        # NOTE(review): 960 bytes of 16-bit mono at 48kHz would be 10ms of
+        # audio, not 20ms — confirm the bridge's frame format and pacing.
+        print("Sending 150 silence frames (3 seconds)...")
+        silence = b"\x00" * FRAME_SIZE
+        for _ in range(150):
+            send_inject(sock, silence)
+            time.sleep(0.02)  # 20ms per frame
+
+        print("Done sending. Waiting for speaker output...")
+        time.sleep(10)
+
+        # Stop
+        send_cmd(sock, {"cmd": "inject_off"})
+        send_cmd(sock, {"cmd": "stop"})
+        time.sleep(1)
+
+        print("Test complete")
+
+
+if __name__ == "__main__":
+    main()
--- a/modules/antaf/voice_bridge_v7.js
+++ b/modules/antaf/voice_bridge_v7.js
@ -0,0 +1,209 @@
+// voice_bridge_v7.js — Voice Bridge with Audio Injection
+// Hook point: libantaudio.so MFAntAudio3AV2Filter::process(micIn, spkRef, out, size, &result)
+// TCP :18901
+// Frame: 4-byte len + 1-byte type + payload
+//   type 0: speaker/AI audio (spkRef, downstream to client)
+//   type 1: text/JSON command
+//   type 2: mic audio (micIn, downstream to client)
+//   type 3: inject audio (upstream from client, replaces micIn)
+
+var voiceActive = false;
+var clientOS = null;
+var capturedSpk = 0, capturedMic = 0, spkBytes = 0, micBytes = 0;
+var injectMode = false;  // true = replace mic with injected audio
+var injectQueue = [];    // queue of PCM frames to inject
+
+function wf(os, type, jArr) {
+    try {
+        var len = jArr.length;
+        var h = Java.array("byte", [(len>>24)&0xFF,(len>>16)&0xFF,(len>>8)&0xFF,len&0xFF, type]);
+        os.write(h); os.write(jArr); os.flush();
+    } catch(e) {}
+}
+function wt(os, text) {
+    wf(os, 1, Java.use("java.lang.String").$new(text).getBytes("UTF-8"));
+}
+
+// === Hook libantaudio.so ===
+var hooked = false;
+function tryHook() {
+    if (hooked) return;
+    var m = Process.findModuleByName("libantaudio.so");
+    if (!m) return;
+    var addr = m.findExportByName("_ZN8antaudio20MFAntAudio3AV2Filter7processEPhS1_S1_iRi");
+    if (!addr) return;
+    hooked = true;
+
+    Interceptor.attach(addr, {
+        onEnter: function(args) {
+            if (!voiceActive || !clientOS) return;
+            var size = args[4].toInt32();
+            if (size <= 0) return;
+
+            try {
+                // If inject mode, replace micIn with queued or silence
+                if (injectMode) {
+                    if (injectQueue.length > 0) {
+                        var frame = injectQueue.shift();
+                        // Only write if frame size matches expected size
+                        if (frame.byteLength === size) {
+                            args[1].writeByteArray(frame);
+                        } else if (frame.byteLength > 0) {
+                            // Size mismatch — pad or truncate
+                            var buf = new ArrayBuffer(size);
+                            var dst = new Uint8Array(buf);
+                            var src = new Uint8Array(frame);
+                            var copyLen = Math.min(size, frame.byteLength);
+                            for (var k = 0; k < copyLen; k++) dst[k] = src[k];
+                            args[1].writeByteArray(buf);
+                        }
+                    } else {
+                        // No data queued — inject silence to avoid mic leak
+                        var silence = new ArrayBuffer(size);
+                        args[1].writeByteArray(silence);
+                    }
+                }
+
+                // Always capture speaker/AI output (type 0)
+                var spkPcm = args[2].readByteArray(size);
+                var spkArr = Java.array("byte", Array.from(new Uint8Array(spkPcm)));
+                wf(clientOS, 0, spkArr);
+                capturedSpk++; spkBytes += size;
+
+                // Capture mic (type 2) only when not injecting
+                if (!injectMode) {
+                    var micPcm = args[1].readByteArray(size);
+                    var micArr = Java.array("byte", Array.from(new Uint8Array(micPcm)));
+                    wf(clientOS, 2, micArr);
+                }
+                capturedMic++; micBytes += size;
+
+                if (capturedMic <= 3 || capturedMic % 500 === 0)
+                    console.log("[VOICE] mic=" + capturedMic + " spk=" + capturedSpk + " inject=" + injectQueue.length);
+            } catch(e) {}
+        }
+    });
+    console.log("[VOICE] 3AV2Filter.process hooked @ " + addr);
+}
+
+// Try to install the hook on a fixed schedule (immediately, then up to 20 s
+// after script load); tryHook() is a no-op once the hook is in place.
+[0, 1000, 3000, 5000, 10000, 15000, 20000].forEach(function(ms) { setTimeout(tryHook, ms); });
+// Additionally retry 500 ms after any linker export matching *dlopen*
+// returns, to pick up the library when it is loaded later. Failures while
+// resolving or attaching are ignored and the timers above remain the fallback.
+try {
+    new ApiResolver("module").enumerateMatches("exports:linker*!*dlopen*").forEach(function(d) {
+        Interceptor.attach(d.address, { onLeave: function() { setTimeout(tryHook, 500); } });
+    });
+} catch(e) {}
+
+// === TCP Server ===
+Java.perform(function() {
+    var SS = Java.use("java.net.ServerSocket");
+    var JS = Java.use("java.lang.String");
+    var server = SS.$new(18901);
+    console.log("[VOICE] Listening :18901");
+
+    function openVoice(os) {
+        Java.scheduleOnMainThread(function() {
+            try {
+                Java.choose("com.antgroup.aijk.android.ijklauncher.biz.activity.IJKActivity", {
+                    onMatch: function(a) {
+                        var fm = a.getSupportFragmentManager();
+                        var f = Java.use("com.antgroup.aijk.android.ijkchat.biz.voicechat.IjkVoiceChatFragment").$new();
+                        f.show(fm, "v");
+                        console.log("[VOICE] Opened");
+                    }, onComplete: function() {}
+                });
+                setTimeout(function() { wt(os, JSON.stringify({event:"voice_opened"})); }, 2000);
+            } catch(e) { wt(os, JSON.stringify({event:"error",msg:""+e})); }
+        });
+    }
+
+    function closeVoice(os) {
+        Java.scheduleOnMainThread(function() {
+            try {
+                Java.choose("com.antgroup.aijk.android.ijkchat.biz.voicechat.IjkVoiceChatFragment", {
+                    onMatch: function(f) { f.dismiss(); console.log("[VOICE] Closed"); },
+                    onComplete: function() {}
+                });
+                setTimeout(function() { wt(os, JSON.stringify({event:"voice_closed"})); }, 1000);
+            } catch(e) { wt(os, JSON.stringify({event:"error",msg:""+e})); }
+        });
+    }
+
+    var Srv = Java.registerClass({
+        name: "com.antaf.voice.S7",
+        implements: [Java.use("java.lang.Runnable")],
+        methods: {
+            run: function() {
+                while (true) {
+                    try {
+                        console.log("[VOICE] Waiting...");
+                        var c = server.accept();
+                        var is = c.getInputStream();
+                        var os = c.getOutputStream();
+                        clientOS = os;
+                        console.log("[VOICE] Connected");
+                        wt(os, JSON.stringify({
+                            event:"connected", protocol:"antaf-voice-v8",
+                            commands:["open_voice","close_voice","start","stop","status","inject_on","inject_off"],
+                            audio:"pcm-16bit-960b-frames",
+                            frameTypes:{0:"spk_ai",1:"text",2:"mic",3:"inject"}
+                        }));
+
+                        while (true) {
+                            var hb = [];
+                            for (var i=0;i<5;i++) { var b=is.read(); if(b<0) throw "EOF"; hb.push(b); }
+                            var fl=(hb[0]<<24)|(hb[1]<<16)|(hb[2]<<8)|hb[3], ft=hb[4];
+                            if (fl>1048576) break;
+                            var pb = [];
+                            for (var i=0;i<fl;i++) { var b=is.read(); if(b<0) throw "EOF"; pb.push(b&0xFF); }
+
+                            if (ft === 3) {
+                                // type 3: inject audio frame into micIn
+                                var arr = new ArrayBuffer(pb.length);
+                                var view = new Uint8Array(arr);
+                                for (var j=0;j<pb.length;j++) view[j] = pb[j];
+                                injectQueue.push(arr);
+                            }
+                            else if (ft === 1) {
+                                var pl = Java.array("byte", pb);
+                                var cmd = JSON.parse(JS.$new(pl,"UTF-8").toString());
+                                console.log("[VOICE] Cmd: " + JSON.stringify(cmd));
+                                if (cmd.cmd === "open_voice") openVoice(os);
+                                else if (cmd.cmd === "close_voice") closeVoice(os);
+                                else if (cmd.cmd === "start") {
+                                    voiceActive = true;
+                                    capturedSpk=0;capturedMic=0;spkBytes=0;micBytes=0;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"started",hooked:hooked}));
+                                }
+                                else if (cmd.cmd === "stop") {
+                                    voiceActive = false;
+                                    injectMode = false;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"stopped",spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
+                                }
+                                else if (cmd.cmd === "inject_on") {
+                                    injectMode = true;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"inject_on"}));
+                                    console.log("[VOICE] Inject mode ON");
+                                }
+                                else if (cmd.cmd === "inject_off") {
+                                    injectMode = false;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"inject_off"}));
+                                    console.log("[VOICE] Inject mode OFF");
+                                }
+                                else if (cmd.cmd === "status") {
+                                    wt(os, JSON.stringify({event:"status",active:voiceActive,hooked:hooked,inject:injectMode,queue:injectQueue.length,spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
+                                }
+                            }
+                        }
+                    } catch(e) { console.log("[VOICE] Ended: "+e); }
+                    finally { voiceActive=false; clientOS=null; injectMode=false; injectQueue=[]; }
+                }
+            }
+        }
+    });
+    Java.use("java.lang.Thread").$new(Srv.$new()).start();
+    console.log("[VOICE] Ready (v7 + inject)");
+});
--- a/modules/antaf/voice_bridge_v8.js
+++ b/modules/antaf/voice_bridge_v8.js
@ -0,0 +1,181 @@
+// voice_bridge_v8.js — Voice Bridge with Audio Injection (attach after voice chat opened)
+//
+// STARTUP ORDER:
+// 1. Launch app: adb shell monkey -p com.antgroup.aijk.android ...
+// 2. Open voice chat manually or via adb tap
+// 3. Wait for libantaudio.so to load
+// 4. Attach frida with this script
+//
+// Hook point: libantaudio.so MFAntAudio3AV2Filter::process(micIn, spkRef, out, size, &result)
+// TCP :18901
+// Frame: 4-byte len + 1-byte type + payload
+//   type 0: speaker/AI audio (spkRef, downstream to client)
+//   type 1: text/JSON command
+//   type 2: mic audio (micIn, downstream to client)
+//   type 3: inject audio (upstream from client, replaces micIn)
+
+var voiceActive = false;
+var clientOS = null;
+var capturedSpk = 0, capturedMic = 0, spkBytes = 0, micBytes = 0;
+var injectMode = false;
+var injectQueue = [];
+
+function wf(os, type, jArr) {
+    try {
+        var len = jArr.length;
+        var h = Java.array("byte", [(len>>24)&0xFF,(len>>16)&0xFF,(len>>8)&0xFF,len&0xFF, type]);
+        os.write(h); os.write(jArr); os.flush();
+    } catch(e) {}
+}
+function wt(os, text) {
+    wf(os, 1, Java.use("java.lang.String").$new(text).getBytes("UTF-8"));
+}
+
+// === Hook libantaudio.so (should already be loaded) ===
+var hooked = false;
+function tryHook() {
+    if (hooked) return;
+    var m = Process.findModuleByName("libantaudio.so");
+    if (!m) return;
+    var addr = m.findExportByName("_ZN8antaudio20MFAntAudio3AV2Filter7processEPhS1_S1_iRi");
+    if (!addr) return;
+    hooked = true;
+
+    Interceptor.attach(addr, {
+        onEnter: function(args) {
+            if (!voiceActive || !clientOS) return;
+            var size = args[4].toInt32();
+            if (size <= 0) return;
+
+            try {
+                if (injectMode) {
+                    if (injectQueue.length > 0) {
+                        var frame = injectQueue.shift();
+                        if (frame.byteLength === size) {
+                            args[1].writeByteArray(frame);
+                        } else {
+                            var buf = new ArrayBuffer(size);
+                            var dst = new Uint8Array(buf);
+                            var src = new Uint8Array(frame);
+                            var copyLen = Math.min(size, frame.byteLength);
+                            for (var k = 0; k < copyLen; k++) dst[k] = src[k];
+                            args[1].writeByteArray(buf);
+                        }
+                    } else {
+                        var silence = new ArrayBuffer(size);
+                        args[1].writeByteArray(silence);
+                    }
+                }
+
+                // Always capture speaker/AI output (type 0)
+                var spkPcm = args[2].readByteArray(size);
+                var spkArr = Java.array("byte", Array.from(new Uint8Array(spkPcm)));
+                wf(clientOS, 0, spkArr);
+                capturedSpk++; spkBytes += size;
+
+                if (!injectMode) {
+                    var micPcm = args[1].readByteArray(size);
+                    var micArr = Java.array("byte", Array.from(new Uint8Array(micPcm)));
+                    wf(clientOS, 2, micArr);
+                }
+                capturedMic++; micBytes += size;
+
+                if (capturedMic <= 3 || capturedMic % 500 === 0)
+                    console.log("[VOICE] mic=" + capturedMic + " spk=" + capturedSpk + " inject=" + injectQueue.length);
+            } catch(e) {}
+        }
+    });
+    console.log("[VOICE] process hooked @ " + addr);
+}
+
+// Hook immediately — the library should already be loaded because this
+// script is meant to be attached after the voice chat has been opened.
+tryHook();
+if (!hooked) {
+    // The first attempt did not install the hook: retry at 0.5 s, 1 s, 2 s
+    // and 5 s. tryHook() is a no-op once the hook is in place.
+    [500, 1000, 2000, 5000].forEach(function(ms) { setTimeout(tryHook, ms); });
+}
+
+// === TCP Server ===
+Java.perform(function() {
+    var SS = Java.use("java.net.ServerSocket");
+    var JS = Java.use("java.lang.String");
+    var server = SS.$new(18901);
+    console.log("[VOICE] Listening :18901");
+
+    var Srv = Java.registerClass({
+        name: "com.antaf.voice.S8",
+        implements: [Java.use("java.lang.Runnable")],
+        methods: {
+            run: function() {
+                while (true) {
+                    try {
+                        console.log("[VOICE] Waiting for client...");
+                        var c = server.accept();
+                        var is = c.getInputStream();
+                        var os = c.getOutputStream();
+                        clientOS = os;
+                        console.log("[VOICE] Client connected");
+                        wt(os, JSON.stringify({
+                            event:"connected", protocol:"antaf-voice-v8",
+                            hooked: hooked,
+                            commands:["start","stop","status","inject_on","inject_off"],
+                            audio:"pcm-16bit-960b-frames",
+                            frameTypes:{0:"spk_ai",1:"text",2:"mic",3:"inject"}
+                        }));
+
+                        while (true) {
+                            var hb = [];
+                            for (var i=0;i<5;i++) { var b=is.read(); if(b<0) throw "EOF"; hb.push(b); }
+                            var fl=(hb[0]<<24)|(hb[1]<<16)|(hb[2]<<8)|hb[3], ft=hb[4];
+                            if (fl>1048576) break;
+                            var pb = [];
+                            for (var i=0;i<fl;i++) { var b=is.read(); if(b<0) throw "EOF"; pb.push(b&0xFF); }
+
+                            if (ft === 3) {
+                                var arr = new ArrayBuffer(pb.length);
+                                var view = new Uint8Array(arr);
+                                for (var j=0;j<pb.length;j++) view[j] = pb[j];
+                                injectQueue.push(arr);
+                            }
+                            else if (ft === 1) {
+                                var pl = Java.array("byte", pb);
+                                var cmd = JSON.parse(JS.$new(pl,"UTF-8").toString());
+                                console.log("[VOICE] Cmd: " + JSON.stringify(cmd));
+                                if (cmd.cmd === "start") {
+                                    voiceActive = true;
+                                    capturedSpk=0;capturedMic=0;spkBytes=0;micBytes=0;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"started",hooked:hooked}));
+                                }
+                                else if (cmd.cmd === "stop") {
+                                    voiceActive = false;
+                                    injectMode = false;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"stopped",spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
+                                }
+                                else if (cmd.cmd === "inject_on") {
+                                    injectMode = true;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"inject_on"}));
+                                    console.log("[VOICE] Inject ON");
+                                }
+                                else if (cmd.cmd === "inject_off") {
+                                    injectMode = false;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"inject_off"}));
+                                    console.log("[VOICE] Inject OFF");
+                                }
+                                else if (cmd.cmd === "status") {
+                                    wt(os, JSON.stringify({event:"status",active:voiceActive,hooked:hooked,inject:injectMode,queue:injectQueue.length,spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
+                                }
+                            }
+                        }
+                    } catch(e) { console.log("[VOICE] Client disconnected: "+e); }
+                    finally { voiceActive=false; clientOS=null; injectMode=false; injectQueue=[]; }
+                }
+            }
+        }
+    });
+    Java.use("java.lang.Thread").$new(Srv.$new()).start();
+    console.log("[VOICE] Ready (v8, no open_voice)");
+});
--- a/modules/docs/antaf-integration-plan.md
+++ b/modules/docs/antaf-integration-plan.md
@ -0,0 +1,335 @@
+# 蚂蚁阿福接入小智 ESP32 — 实施方案
+
+## 项目目标
+
+将蚂蚁阿福 App 的 AI 能力接入小智 ESP32 硬件终端，用户通过 ESP32 设备语音对话，
+后端对接蚂蚁阿福代替自建 LLM，省去 GPU 资源（两张 RTX 3090 + Qwen3-32B）。
+
+---
+
+## 系统架构
+
+### 方案A：文字接入（自定义 LLM Provider）
+
+```
+ESP32 设备                    PlugAI 服务端                       手机
+┌──────────┐    WebSocket    ┌──────────────────┐    HTTP/SSE    ┌──────────────┐
+│ 麦克风    │ ──────────────→│ ASR (FunASR)     │               │ 蚂蚁阿福 App  │
+│ 唤醒词    │                │   语音→文字       │               │ + Frida 注入  │
+│ AEC/NS   │                │                  │  GET /chat?q= │              │
+│          │                │ AntafLLM Provider│──────────────→│ HTTP Bridge  │
+│          │                │   (新增)          │←──────────────│ (port 18900) │
+│          │                │                  │  SSE 流式回答  │              │
+│ 喇叭     │←───────────────│ TTS (EdgeTTS)    │               │              │
+│          │    WebSocket    │   文字→语音       │               │              │
+└──────────┘                └──────────────────┘               └──────────────┘
+```
+
+**数据流**: ESP32 音频 → FunASR(语音转文字) → AntafLLM(文字发给阿福) → EdgeTTS(回答转语音) → ESP32 播放
+
+### 方案B：语音直通（替代整个 ASR+LLM+TTS）
+
+```
+ESP32 设备                    PlugAI 服务端                       手机
+┌──────────┐    WebSocket    ┌──────────────────┐    TCP 二进制  ┌──────────────┐
+│ 麦克风    │ ──────────────→│ 音频转发模块(新增) │               │ 蚂蚁阿福 App  │
+│          │                │   Opus解码        │  PCM注入mic   │ + Frida 注入  │
+│          │                │   重采样 24k→48k  │──────────────→│ Voice Bridge │
+│          │                │                  │  PCM speaker  │ (port 18901) │
+│ 喇叭     │←───────────────│   重采样 24k→24k  │←──────────────│ libantaudio  │
+│          │    WebSocket    │   Opus编码        │               │              │
+└──────────┘                └──────────────────┘               └──────────────┘
+```
+
+**数据流**: ESP32 音频 → 解码+重采样 → 注入阿福麦克风 → 阿福完整处理(ASR+LLM+TTS) → 捕获音频 → 编码 → ESP32 播放
+
+---
+
+## 可行性评估
+
+| 维度 | 方案A (文字接入) | 方案B (语音直通) |
+|------|-----------------|-----------------|
+| 可行性 | **高** | **中低** |
+| 实现难度 | 低 (1个Python文件) | 高 (改JS+写转发模块) |
+| 改动范围 | 新增 LLM Provider + 改配置 | 改 voice_bridge.js + 新增转发模块 |
+| 延迟 | 中 (ASR+网络+TTS 各一轮) | 低 (音频直通) |
+| 音质 | EdgeTTS (微软高质量) | 阿福原生 TTS |
+| GPU 依赖 | 无 (省掉 Qwen3-32B) | 无 |
+| 手机依赖 | 需要 (App+Frida+adb) | 需要 (App+Frida+adb) |
+| 核心风险 | 低 | **voice_bridge 当前不支持音频注入** |
+
+**结论**: 先实施方案A，验证通过后再做方案B。
+
+---
+
+## 方案A 详细实施
+
+### 前置条件
+
+| 组件 | 状态 | 说明 |
+|------|------|------|
+| ESP32 设备 | 已就绪 | 固件已烧录，WiFi+服务端已配置 |
+| 小智服务端 | 已就绪 | ws://14.18.247.51:8010 运行中 |
+| ASR (FunASR) | 已就绪 | CPU 模式 |
+| TTS (EdgeTTS) | 已就绪 | 微软免费 |
+| 蚂蚁阿福 HTTP Bridge | 已就绪 | http_bridge_stream.js (port 18900) |
+| Frida + 手机 | 需部署 | 手机需连到服务端可达的网络 |
+
+### 第1步：创建 AntafLLM Provider
+
+文件路径：`backend/main/xiaozhi-server/core/providers/llm/antaf/antaf.py`
+
+```python
+import requests
+from config.logger import setup_logging
+from core.providers.llm.base import LLMProviderBase
+
+TAG = __name__
+logger = setup_logging()
+
+
+class LLMProvider(LLMProviderBase):
+    """
+    蚂蚁阿福 LLM Provider
+    通过 Frida HTTP Bridge (port 18900) 对接蚂蚁阿福 App 的文字对话 API。
+    Bridge 运行在手机上，通过 adb forward 或网络暴露 SSE 流式接口。
+    """
+
+    def __init__(self, config):
+        self.bridge_url = config.get("bridge_url", "http://127.0.0.1:18900")
+        self.timeout = config.get("timeout", 60)
+        logger.bind(tag=TAG).info(
+            f"AntafLLM 初始化: bridge={self.bridge_url}, timeout={self.timeout}s"
+        )
+
+    def response(self, session_id, dialogue, **kwargs):
+        """
+        流式返回蚂蚁阿福的回答。
+        1. 从 dialogue 取最后一条用户消息
+        2. GET {bridge_url}/chat?q={query}
+        3. 解析 SSE 流，yield 每个 delta 文本
+        """
+        # 提取最后一条用户消息
+        query = ""
+        for msg in reversed(dialogue):
+            if msg.get("role") == "user":
+                query = msg.get("content", "")
+                break
+
+        if not query:
+            logger.bind(tag=TAG).warning("对话中没有用户消息")
+            yield "抱歉，我没有收到您的问题。"
+            return
+
+        logger.bind(tag=TAG).info(f"AntafLLM 请求: {query[:50]}...")
+
+        try:
+            url = f"{self.bridge_url}/chat"
+            resp = requests.get(
+                url,
+                params={"q": query},
+                stream=True,
+                timeout=self.timeout
+            )
+            resp.encoding = "utf-8"
+
+            for line in resp.iter_lines(decode_unicode=True):
+                if not line:
+                    continue
+                if line.startswith("data: "):
+                    data = line[6:]  # 去掉 "data: " 前缀
+                    if data == "[DONE]":
+                        break
+                    if data and len(data.strip()) > 0:
+                        yield data
+
+        except requests.exceptions.ConnectionError:
+            logger.bind(tag=TAG).error("无法连接蚂蚁阿福 Bridge，请检查手机和 Frida 状态")
+            yield "抱歉，蚂蚁阿福服务暂时不可用。"
+        except requests.exceptions.Timeout:
+            logger.bind(tag=TAG).error(f"蚂蚁阿福 Bridge 超时 ({self.timeout}s)")
+            yield "抱歉，回答超时了。"
+        except Exception as e:
+            logger.bind(tag=TAG).error(f"AntafLLM 异常: {e}")
+            yield "抱歉，发生了错误。"
+```
+
+### 第2步：修改服务端配置
+
+编辑 `backend/main/xiaozhi-server/data/.config.yaml`：
+
+```yaml
+selected_module:
+  LLM: antaf       # 改为蚂蚁阿福
+
+LLM:
+  antaf:
+    type: antaf
+    bridge_url: http://<手机IP>:18900    # 手机的 HTTP Bridge 地址
+    timeout: 60                          # SSE 流超时时间
+```
+
+也可以保留原来的 Qwen3 配置，方便切换：
+
+```yaml
+LLM:
+  antaf:
+    type: antaf
+    bridge_url: http://<手机IP>:18900
+    timeout: 60
+  Qwen3:
+    type: openai
+    model_name: Qwen3-32B
+    url: http://127.0.0.1:30000/v1
+    api_key: EMPTY
+```
+
+### 第3步：网络打通
+
+手机的 Frida Bridge 端口需要让 PlugAI 服务器能访问到。有两种方式：
+
+#### 方式1：手机直连局域网（推荐）
+
+如果手机和 PlugAI 服务器在同一网络（或手机有公网可达 IP）：
+```bash
+# 手机上启动 bridge 后，服务端直接访问
+# bridge_url: http://<手机内网IP>:18900
+curl http://<手机IP>:18900/chat?q=hello
+```
+
+#### 方式2：adb forward + SSH 隧道
+
+手机通过 USB 连接一台中间机器，再通过 SSH 隧道暴露：
+```bash
+# 中间机器上
+adb forward tcp:18900 tcp:18900
+
+# PlugAI 上建 SSH 隧道
+ssh -L 18900:127.0.0.1:18900 user@中间机器IP
+# bridge_url: http://127.0.0.1:18900
+```
+
+### 第4步：启动与测试
+
+```bash
+# 1. 手机端：启动 Frida + HTTP Bridge
+frida -U -p <PID> -l http_bridge_stream.js
+
+# 2. 先测 bridge 连通性
+curl -N 'http://<手机IP>:18900/chat?q=你好'
+
+# 3. PlugAI 服务端：重启小智服务
+cd /home/ZeroStack/xiaozhi/xiaozhi-esp32-server/main/xiaozhi-server
+source /home/ZeroStack/xiaozhi/venv/bin/activate
+python app.py
+
+# 4. ESP32 设备：唤醒测试
+# 说 "你好小智" → 提问 → 应该听到蚂蚁阿福的回答（EdgeTTS 合成的语音）
+```
+
+---
+
+## 方案B 详细实施（后续）
+
+### 核心改造：voice_bridge.js 支持音频注入
+
+当前 voice_bridge.js 的 `MFAntAudio3AV2Filter::process` hook 只**读取** micIn 缓冲区。
+需要改造为可以从外部**写入** micIn 缓冲区，替换真实麦克风输入。
+
+#### 改造要点
+
+```javascript
+// voice_bridge.js 新增功能
+var injectBuffer = null;  // 外部注入的 PCM 数据
+
+// 新增 inject 命令：接收外部 PCM 音频帧
+// 客户端发送: [4字节长度][type=3][960字节PCM数据]
+// type 3 = inject audio
+
+Interceptor.attach(processAddr, {
+    onEnter: function(args) {
+        var micIn = args[1];       // 麦克风输入缓冲区 (960 bytes)
+        var frameSize = args[4];   // 960
+
+        if (injectBuffer !== null) {
+            // 用注入数据覆盖真实麦克风输入
+            micIn.writeByteArray(injectBuffer);
+            injectBuffer = null;
+        }
+    }
+});
+```
+
+#### 采样率转换
+
+| 来源 | 格式 | 需转换为 |
+|------|------|---------|
+| ESP32 → 服务端 | Opus 24kHz mono | PCM 48kHz mono (阿福 mic) |
+| 阿福 speaker 输出 | PCM 24kHz stereo | Opus 24kHz mono (ESP32) |
+
+服务端需要：
+- libopus 解码/编码
+- resampy 或 scipy 做采样率转换
+- 960字节帧对齐（20ms @ 48kHz）
+
+#### 新增音频转发模块
+
+文件路径：`backend/main/xiaozhi-server/core/providers/asr/antaf_voice/antaf_voice.py`
+
+这是一个特殊的 ASR Provider，它不做语音识别，而是：
+1. 接收 ESP32 的 Opus 音频流
+2. 解码为 PCM，重采样 24k→48k
+3. 通过 TCP 发送到 voice_bridge (port 18901) 的 inject 命令
+4. 接收 voice_bridge 的 speaker 输出
+5. 重采样 24k stereo → 24k mono，Opus 编码
+6. 直接发回 ESP32（跳过 LLM 和 TTS）
+
+#### 方案B 风险点
+
+1. **帧时序同步**: ESP32 音频帧和阿福 process() 调用频率可能不一致
+2. **延迟累积**: 网络传输 + 两次重采样 + 注入延迟
+3. **VAD 冲突**: 阿福自带 VAD 可能与注入音频不匹配
+4. **回声消除失效**: 注入 mic 数据后，阿福的 AEC 参考信号（spkRef）对不上
+5. **对话控制**: 何时 open_voice / close_voice 需要与 ESP32 唤醒状态同步
+
+---
+
+## 依赖清单
+
+### 方案A（新增依赖）
+- `requests` — Python HTTP 库（服务端 venv 中应已有）
+
+### 方案B（新增依赖）
+- `opuslib` 或 `pyogg` — Opus 编解码
+- `resampy` 或 `scipy.signal` — 采样率转换
+- `numpy` — 音频数据处理
+
+---
+
+## 文件清单
+
+### 方案A
+| 操作 | 文件 |
+|------|------|
+| 新增 | `backend/main/xiaozhi-server/core/providers/llm/antaf/__init__.py` |
+| 新增 | `backend/main/xiaozhi-server/core/providers/llm/antaf/antaf.py` |
+| 修改 | `backend/main/xiaozhi-server/data/.config.yaml` |
+
+### 方案B（额外）
+| 操作 | 文件 |
+|------|------|
+| 修改 | `antaf/voice_bridge.js` (新增 inject 命令) |
+| 新增 | `backend/main/xiaozhi-server/core/providers/asr/antaf_voice/antaf_voice.py` |
+| 新增 | `backend/main/xiaozhi-server/core/utils/audio_resample.py` |
+
+---
+
+## 里程碑
+
+| 阶段 | 目标 | 预期产出 |
+|------|------|---------|
+| M1 | 方案A 代码实现 | AntafLLM Provider + 配置 |
+| M2 | 网络打通 | PlugAI ↔ 手机 Bridge 连通 |
+| M3 | 端到端测试 | ESP32 唤醒→阿福回答→语音播报 |
+| M4 | 方案B 原型 | voice_bridge 音频注入验证 |
+| M5 | 方案B 集成 | 全语音直通链路 |
--- a/modules/tts/sherpa_tts.py
+++ b/modules/tts/sherpa_tts.py
@ -0,0 +1,79 @@
+import io
+import os
+import wave
+import asyncio
+import numpy as np
+import sherpa_onnx
+from config.logger import setup_logging
+from core.providers.tts.base import TTSProviderBase
+
+TAG = __name__
+logger = setup_logging()
+
+
+class TTSProvider(TTSProviderBase):
+    def __init__(self, config, delete_audio_file):
+        super().__init__(config, delete_audio_file)
+        model_dir = config.get("model_dir", "models/vits-melo-tts-zh_en")
+        speed = config.get("speed", 1.0)
+        self.speed = float(speed) if speed else 1.0
+        self.sid = int(config.get("sid", 0))
+
+        # 优先使用 int8 量化模型（更快）
+        model_file = f"{model_dir}/model.int8.onnx"
+        if not os.path.exists(model_file) or os.path.getsize(model_file) < 1024:
+            model_file = f"{model_dir}/model.onnx"
+
+        num_threads = int(config.get("num_threads", 8))
+
+        tts_config = sherpa_onnx.OfflineTtsConfig(
+            model=sherpa_onnx.OfflineTtsModelConfig(
+                vits=sherpa_onnx.OfflineTtsVitsModelConfig(
+                    model=model_file,
+                    lexicon=f"{model_dir}/lexicon.txt",
+                    tokens=f"{model_dir}/tokens.txt",
+                    dict_dir=f"{model_dir}/dict",
+                ),
+                num_threads=num_threads,
+            ),
+            rule_fsts=f"{model_dir}/date.fst,{model_dir}/phone.fst,{model_dir}/number.fst,{model_dir}/new_heteronym.fst",
+            max_num_sentences=1,
+        )
+        self.tts = sherpa_onnx.OfflineTts(tts_config)
+        self.sample_rate = self.tts.sample_rate
+        logger.bind(tag=TAG).info(
+            f"SherpaOnnxTTS 初始化完成: model_dir={model_dir}, sample_rate={self.sample_rate}, sid={self.sid}"
+        )
+
+    def _generate_wav(self, text):
+        """同步合成，在线程池中调用"""
+        from scipy.signal import resample_poly
+        from math import gcd
+
+        audio = self.tts.generate(text, sid=self.sid, speed=self.speed)
+        samples = np.array(audio.samples, dtype=np.float32)
+
+        # 重采样到目标采样率（设备要求 24000Hz，模型输出 44100Hz）
+        target_sr = 24000
+        if self.sample_rate != target_sr:
+            g = gcd(self.sample_rate, target_sr)
+            samples = resample_poly(samples, target_sr // g, self.sample_rate // g)
+
+        pcm = (samples * 32767).astype(np.int16)
+
+        wav_io = io.BytesIO()
+        with wave.open(wav_io, "wb") as wf:
+            wf.setnchannels(1)
+            wf.setsampwidth(2)
+            wf.setframerate(target_sr)
+            wf.writeframes(pcm.tobytes())
+        return wav_io.getvalue()
+
+    async def text_to_speak(self, text, output_file):
+        wav_data = self._generate_wav(text)
+
+        if output_file:
+            with open(output_file, "wb") as f:
+                f.write(wav_data)
+        else:
+            return wav_data