From 216f2fe6a025528fafc2066c651aa104a4bc995a Mon Sep 17 00:00:00 2001
From: hailin <hailin.zhao@gdzx.xyz>
Date: Mon, 6 Apr 2026 05:05:24 -0700
Subject: [PATCH] feat: add voice relay (Plan B) - ESP32 audio passthrough to
 Antaf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- voice_bridge_v7.js: audio injection support (type=3 frames)
- relay.py: WebSocket↔TCP bridge with Opus↔PCM + resampling
- test_inject.py: injection verification script
- Injection verified: 1454 frames stable, no crash

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 antaf/test_inject.py     | 130 +++++++++++++++++++
 antaf/voice_bridge_v7.js | 209 ++++++++++++++++++++++++++++++
 relay.py                 | 270 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 609 insertions(+)
 create mode 100644 antaf/test_inject.py
 create mode 100644 antaf/voice_bridge_v7.js
 create mode 100644 relay.py

diff --git a/antaf/test_inject.py b/antaf/test_inject.py
new file mode 100644
index 0000000..c56a031
--- /dev/null
+++ b/antaf/test_inject.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""Test voice_bridge_v7 audio injection.
+Connect to voice_bridge, open voice chat, enable inject mode,
+send silence frames, and print any speaker output received.
+
+Usage: python test_inject.py [host] [port]
+"""
+import socket
+import struct
+import json
+import time
+import sys
+import threading
+
+HOST = sys.argv[1] if len(sys.argv) > 1 else "127.0.0.1"
+PORT = int(sys.argv[2]) if len(sys.argv) > 2 else 18901
+
+FRAME_SIZE = 960  # 960 bytes per frame (480 samples * 16bit)
+
+
+def send_cmd(sock, cmd):
+    data = json.dumps(cmd).encode("utf-8")
+    header = struct.pack(">IB", len(data), 1)  # type=1 text
+    sock.sendall(header + data)
+
+
+def send_inject(sock, pcm_frame):
+    header = struct.pack(">IB", len(pcm_frame), 3)  # type=3 inject
+    sock.sendall(header + pcm_frame)
+
+
+def recv_exact(sock, n):
+    buf = b""
+    while len(buf) < n:
+        chunk = sock.recv(n - len(buf))
+        if not chunk:
+            return None
+        buf += chunk
+    return buf
+
+
+def recv_frame(sock):
+    header = recv_exact(sock, 5)
+    if header is None:
+        return None, None
+    length = struct.unpack(">I", header[:4])[0]
+    ftype = header[4]
+    if length > 1048576:
+        return None, None
+    data = recv_exact(sock, length)
+    if data is None:
+        return None, None
+    return ftype, data
+
+
+def receiver(sock):
+    """Background thread to print received frames."""
+    spk_count = 0
+    while True:
+        try:
+            ftype, data = recv_frame(sock)
+            if ftype is None:
+                print("[RECV] Connection closed")
+                break
+            if ftype == 1:  # text/json
+                msg = json.loads(data.decode("utf-8"))
+                print(f"[RECV] {msg}")
+            elif ftype == 0:  # speaker audio
+                spk_count += 1
+                # Check if audio is non-silent
+                samples = struct.unpack(f"<{len(data)//2}h", data)
+                max_amp = max(abs(s) for s in samples)
+                if spk_count <= 5 or spk_count % 100 == 0 or max_amp > 500:
+                    print(f"[SPK] frame={spk_count} size={len(data)} max_amp={max_amp}")
+            elif ftype == 2:  # mic audio
+                pass  # ignore mic echo
+        except Exception as e:
+            print(f"[RECV] Error: {e}")
+            break
+
+
+def main():
+    print(f"Connecting to {HOST}:{PORT}...")
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.connect((HOST, PORT))
+    print("Connected")
+
+    # Start receiver thread
+    t = threading.Thread(target=receiver, args=(sock,), daemon=True)
+    t.start()
+
+    time.sleep(1)
+
+    # Open voice chat
+    print("Opening voice chat...")
+    send_cmd(sock, {"cmd": "open_voice"})
+    time.sleep(3)
+
+    # Start capture
+    print("Starting capture...")
+    send_cmd(sock, {"cmd": "start"})
+    time.sleep(1)
+
+    # Enable inject mode
+    print("Enabling inject mode...")
+    send_cmd(sock, {"cmd": "inject_on"})
+    time.sleep(0.5)
+
+    # Send silence frames for 3 seconds (48kHz, 960 bytes/frame = 20ms)
+    # 3 seconds = 150 frames
+    print("Sending 150 silence frames (3 seconds)...")
+    silence = b"\x00" * FRAME_SIZE
+    for i in range(150):
+        send_inject(sock, silence)
+        time.sleep(0.02)  # 20ms per frame
+
+    print("Done sending. Waiting for speaker output...")
+    time.sleep(10)
+
+    # Stop
+    send_cmd(sock, {"cmd": "inject_off"})
+    send_cmd(sock, {"cmd": "stop"})
+    time.sleep(1)
+
+    print("Test complete")
+    sock.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/antaf/voice_bridge_v7.js b/antaf/voice_bridge_v7.js
new file mode 100644
index 0000000..5cf95a5
--- /dev/null
+++ b/antaf/voice_bridge_v7.js
@@ -0,0 +1,209 @@
+// voice_bridge_v7.js — Voice Bridge with Audio Injection
+// Hook point: libantaudio.so MFAntAudio3AV2Filter::process(micIn, spkRef, out, size, &result)
+// TCP :18901
+// Frame: 4-byte len + 1-byte type + payload
+//   type 0: speaker/AI audio (spkRef, downstream to client)
+//   type 1: text/JSON command
+//   type 2: mic audio (micIn, downstream to client)
+//   type 3: inject audio (upstream from client, replaces micIn)
+
+var voiceActive = false;
+var clientOS = null;
+var capturedSpk = 0, capturedMic = 0, spkBytes = 0, micBytes = 0;
+var injectMode = false;  // true = replace mic with injected audio
+var injectQueue = [];    // queue of PCM frames to inject
+
+function wf(os, type, jArr) {
+    try {
+        var len = jArr.length;
+        var h = Java.array("byte", [(len>>24)&0xFF,(len>>16)&0xFF,(len>>8)&0xFF,len&0xFF, type]);
+        os.write(h); os.write(jArr); os.flush();
+    } catch(e) {}
+}
+function wt(os, text) {
+    wf(os, 1, Java.use("java.lang.String").$new(text).getBytes("UTF-8"));
+}
+
+// === Hook libantaudio.so ===
+var hooked = false;
+function tryHook() {
+    if (hooked) return;
+    var m = Process.findModuleByName("libantaudio.so");
+    if (!m) return;
+    var addr = m.findExportByName("_ZN8antaudio20MFAntAudio3AV2Filter7processEPhS1_S1_iRi");
+    if (!addr) return;
+    hooked = true;
+
+    Interceptor.attach(addr, {
+        onEnter: function(args) {
+            if (!voiceActive || !clientOS) return;
+            var size = args[4].toInt32();
+            if (size <= 0) return;
+
+            try {
+                // If inject mode, replace micIn with queued or silence
+                if (injectMode) {
+                    if (injectQueue.length > 0) {
+                        var frame = injectQueue.shift();
+                        // Only write if frame size matches expected size
+                        if (frame.byteLength === size) {
+                            args[1].writeByteArray(frame);
+                        } else if (frame.byteLength > 0) {
+                            // Size mismatch — pad or truncate
+                            var buf = new ArrayBuffer(size);
+                            var dst = new Uint8Array(buf);
+                            var src = new Uint8Array(frame);
+                            var copyLen = Math.min(size, frame.byteLength);
+                            for (var k = 0; k < copyLen; k++) dst[k] = src[k];
+                            args[1].writeByteArray(buf);
+                        }
+                    } else {
+                        // No data queued — inject silence to avoid mic leak
+                        var silence = new ArrayBuffer(size);
+                        args[1].writeByteArray(silence);
+                    }
+                }
+
+                // Always capture speaker/AI output (type 0)
+                var spkPcm = args[2].readByteArray(size);
+                var spkArr = Java.array("byte", Array.from(new Uint8Array(spkPcm)));
+                wf(clientOS, 0, spkArr);
+                capturedSpk++; spkBytes += size;
+
+                // Capture mic (type 2) only when not injecting
+                if (!injectMode) {
+                    var micPcm = args[1].readByteArray(size);
+                    var micArr = Java.array("byte", Array.from(new Uint8Array(micPcm)));
+                    wf(clientOS, 2, micArr);
+                }
+                capturedMic++; micBytes += size;
+
+                if (capturedMic <= 3 || capturedMic % 500 === 0)
+                    console.log("[VOICE] mic=" + capturedMic + " spk=" + capturedSpk + " inject=" + injectQueue.length);
+            } catch(e) {}
+        }
+    });
+    console.log("[VOICE] 3AV2Filter.process hooked @ " + addr);
+}
+
+[0, 1000, 3000, 5000, 10000, 15000, 20000].forEach(function(ms) { setTimeout(tryHook, ms); });
+try {
+    new ApiResolver("module").enumerateMatches("exports:linker*!*dlopen*").forEach(function(d) {
+        Interceptor.attach(d.address, { onLeave: function() { setTimeout(tryHook, 500); } });
+    });
+} catch(e) {}
+
+// === TCP Server ===
+Java.perform(function() {
+    var SS = Java.use("java.net.ServerSocket");
+    var JS = Java.use("java.lang.String");
+    var server = SS.$new(18901);
+    console.log("[VOICE] Listening :18901");
+
+    function openVoice(os) {
+        Java.scheduleOnMainThread(function() {
+            try {
+                Java.choose("com.antgroup.aijk.android.ijklauncher.biz.activity.IJKActivity", {
+                    onMatch: function(a) {
+                        var fm = a.getSupportFragmentManager();
+                        var f = Java.use("com.antgroup.aijk.android.ijkchat.biz.voicechat.IjkVoiceChatFragment").$new();
+                        f.show(fm, "v");
+                        console.log("[VOICE] Opened");
+                    }, onComplete: function() {}
+                });
+                setTimeout(function() { wt(os, JSON.stringify({event:"voice_opened"})); }, 2000);
+            } catch(e) { wt(os, JSON.stringify({event:"error",msg:""+e})); }
+        });
+    }
+
+    function closeVoice(os) {
+        Java.scheduleOnMainThread(function() {
+            try {
+                Java.choose("com.antgroup.aijk.android.ijkchat.biz.voicechat.IjkVoiceChatFragment", {
+                    onMatch: function(f) { f.dismiss(); console.log("[VOICE] Closed"); },
+                    onComplete: function() {}
+                });
+                setTimeout(function() { wt(os, JSON.stringify({event:"voice_closed"})); }, 1000);
+            } catch(e) { wt(os, JSON.stringify({event:"error",msg:""+e})); }
+        });
+    }
+
+    var Srv = Java.registerClass({
+        name: "com.antaf.voice.S7",
+        implements: [Java.use("java.lang.Runnable")],
+        methods: {
+            run: function() {
+                while (true) {
+                    try {
+                        console.log("[VOICE] Waiting...");
+                        var c = server.accept();
+                        var is = c.getInputStream();
+                        var os = c.getOutputStream();
+                        clientOS = os;
+                        console.log("[VOICE] Connected");
+                        wt(os, JSON.stringify({
+                            event:"connected", protocol:"antaf-voice-v8",
+                            commands:["open_voice","close_voice","start","stop","status","inject_on","inject_off"],
+                            audio:"pcm-16bit-960b-frames",
+                            frameTypes:{0:"spk_ai",1:"text",2:"mic",3:"inject"}
+                        }));
+
+                        while (true) {
+                            var hb = [];
+                            for (var i=0;i<5;i++) { var b=is.read(); if(b<0) throw "EOF"; hb.push(b); }
+                            var fl=(hb[0]<<24)|(hb[1]<<16)|(hb[2]<<8)|hb[3], ft=hb[4];
+                            if (fl>1048576) break;
+                            var pb = [];
+                            for (var i=0;i<fl;i++) { var b=is.read(); if(b<0) throw "EOF"; pb.push(b&0xFF); }
+
+                            if (ft === 3) {
+                                // type 3: inject audio frame into micIn
+                                var arr = new ArrayBuffer(pb.length);
+                                var view = new Uint8Array(arr);
+                                for (var j=0;j<pb.length;j++) view[j] = pb[j];
+                                injectQueue.push(arr);
+                            }
+                            else if (ft === 1) {
+                                var pl = Java.array("byte", pb);
+                                var cmd = JSON.parse(JS.$new(pl,"UTF-8").toString());
+                                console.log("[VOICE] Cmd: " + JSON.stringify(cmd));
+                                if (cmd.cmd === "open_voice") openVoice(os);
+                                else if (cmd.cmd === "close_voice") closeVoice(os);
+                                else if (cmd.cmd === "start") {
+                                    voiceActive = true;
+                                    capturedSpk=0;capturedMic=0;spkBytes=0;micBytes=0;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"started",hooked:hooked}));
+                                }
+                                else if (cmd.cmd === "stop") {
+                                    voiceActive = false;
+                                    injectMode = false;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"stopped",spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
+                                }
+                                else if (cmd.cmd === "inject_on") {
+                                    injectMode = true;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"inject_on"}));
+                                    console.log("[VOICE] Inject mode ON");
+                                }
+                                else if (cmd.cmd === "inject_off") {
+                                    injectMode = false;
+                                    injectQueue = [];
+                                    wt(os, JSON.stringify({event:"inject_off"}));
+                                    console.log("[VOICE] Inject mode OFF");
+                                }
+                                else if (cmd.cmd === "status") {
+                                    wt(os, JSON.stringify({event:"status",active:voiceActive,hooked:hooked,inject:injectMode,queue:injectQueue.length,spk:{frames:capturedSpk,bytes:spkBytes},mic:{frames:capturedMic,bytes:micBytes}}));
+                                }
+                            }
+                        }
+                    } catch(e) { console.log("[VOICE] Ended: "+e); }
+                    finally { voiceActive=false; clientOS=null; injectMode=false; injectQueue=[]; }
+                }
+            }
+        }
+    });
+    Java.use("java.lang.Thread").$new(Srv.$new()).start();
+    console.log("[VOICE] Ready (v7 + inject)");
+});
diff --git a/relay.py b/relay.py
new file mode 100644
index 0000000..b821c0a
--- /dev/null
+++ b/relay.py
@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+"""
+ESP32 ↔ Antaf Voice Relay
+Bridges ESP32 (WebSocket/Opus) with Antaf voice_bridge (TCP/PCM).
+
+ESP32 → Opus decode → resample 16kHz→48kHz → voice_bridge inject (type=3)
+ESP32 ← Opus encode ← resample 48kHz→16kHz ← voice_bridge speaker (type=0)
+
+Usage: python relay.py [--ws-port 8010] [--bridge-host 127.0.0.1] [--bridge-port 18901]
+"""
+
+import asyncio
+import json
+import struct
+import argparse
+import logging
+import numpy as np
+from scipy.signal import resample_poly
+from math import gcd
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+log = logging.getLogger("relay")
+
+try:
+    import opuslib_next as opuslib
+except ImportError:
+    import opuslib
+
+import websockets
+
+# Audio parameters
+ESP_SAMPLE_RATE = 16000      # ESP32 Opus sample rate
+ESP_FRAME_MS = 60            # ESP32 frame duration
+ESP_FRAME_SIZE = ESP_SAMPLE_RATE * ESP_FRAME_MS // 1000  # 960 samples
+
+BRIDGE_SAMPLE_RATE = 48000   # voice_bridge micIn sample rate
+BRIDGE_FRAME_BYTES = 960     # 480 samples * 2 bytes
+BRIDGE_FRAME_SAMPLES = 480
+
+# Resampling ratios
+UP_GCD = gcd(BRIDGE_SAMPLE_RATE, ESP_SAMPLE_RATE)    # 16000 → 48000
+UP_RATIO = (BRIDGE_SAMPLE_RATE // UP_GCD, ESP_SAMPLE_RATE // UP_GCD)  # (3, 1)
+DOWN_GCD = gcd(ESP_SAMPLE_RATE, BRIDGE_SAMPLE_RATE)  # 48000 → 16000
+DOWN_RATIO = (ESP_SAMPLE_RATE // DOWN_GCD, BRIDGE_SAMPLE_RATE // DOWN_GCD)  # (1, 3)
+
+
+class BridgeClient:
+    """TCP client for voice_bridge_v7."""
+
+    def __init__(self, host, port):
+        self.host = host
+        self.port = port
+        self.reader = None
+        self.writer = None
+        self.on_speaker_frame = None  # callback(pcm_bytes)
+        self._recv_task = None
+
+    async def connect(self):
+        self.reader, self.writer = await asyncio.open_connection(self.host, self.port)
+        log.info(f"Connected to voice_bridge {self.host}:{self.port}")
+
+        # Read connected event
+        ftype, data = await self._recv_frame()
+        if ftype == 1:
+            msg = json.loads(data.decode())
+            log.info(f"Bridge: {msg.get('protocol')}")
+
+    async def _recv_frame(self):
+        header = await self.reader.readexactly(5)
+        length = struct.unpack(">I", header[:4])[0]
+        ftype = header[4]
+        data = await self.reader.readexactly(length)
+        return ftype, data
+
+    def _send_frame(self, ftype, data):
+        header = struct.pack(">IB", len(data), ftype)
+        self.writer.write(header + data)
+        # Note: no await drain() here — voice frames are time-sensitive,
+        # TCP buffer handles backpressure
+
+    def send_cmd(self, cmd):
+        self._send_frame(1, json.dumps(cmd).encode())
+
+    def send_inject(self, pcm_bytes):
+        self._send_frame(3, pcm_bytes)
+
+    async def start_recv_loop(self):
+        """Background task: receive frames from bridge."""
+        try:
+            while True:
+                ftype, data = await self._recv_frame()
+                if ftype == 0 and self.on_speaker_frame:
+                    # Speaker audio
+                    await self.on_speaker_frame(data)
+                elif ftype == 1:
+                    msg = json.loads(data.decode())
+                    log.info(f"Bridge event: {msg}")
+        except asyncio.IncompleteReadError:
+            log.warning("Bridge connection closed")
+        except Exception as e:
+            log.error(f"Bridge recv error: {e}")
+
+    async def setup_voice(self):
+        """Open voice chat, start capture, enable inject."""
+        self.send_cmd({"cmd": "open_voice"})
+        await asyncio.sleep(3)
+        self.send_cmd({"cmd": "start"})
+        await asyncio.sleep(1)
+        self.send_cmd({"cmd": "inject_on"})
+        await asyncio.sleep(0.5)
+        log.info("Voice bridge ready (inject mode)")
+
+    async def close(self):
+        self.send_cmd({"cmd": "inject_off"})
+        self.send_cmd({"cmd": "stop"})
+        self.send_cmd({"cmd": "close_voice"})
+        await asyncio.sleep(1)
+        if self.writer:
+            self.writer.close()
+
+
+class Relay:
+    """Main relay: ESP32 WebSocket ↔ Antaf voice_bridge TCP."""
+
+    def __init__(self, ws_port, bridge_host, bridge_port):
+        self.ws_port = ws_port
+        self.bridge_host = bridge_host
+        self.bridge_port = bridge_port
+        self.bridge = None
+        self.ws = None
+        self.opus_decoder = None
+        self.opus_encoder = None
+        # Buffer for resampled PCM to split into bridge frames
+        self._inject_buf = np.array([], dtype=np.int16)
+        # Buffer for speaker PCM to accumulate before encoding
+        self._speaker_buf = np.array([], dtype=np.int16)
+
+    async def handle_esp32(self, websocket):
+        """Handle one ESP32 WebSocket connection."""
+        log.info(f"ESP32 connected from {websocket.remote_address}")
+        self.ws = websocket
+
+        # Init Opus codec
+        self.opus_decoder = opuslib.Decoder(ESP_SAMPLE_RATE, 1)
+        self.opus_encoder = opuslib.Encoder(ESP_SAMPLE_RATE, 1, opuslib.APPLICATION_AUDIO)
+
+        # Connect to voice bridge
+        self.bridge = BridgeClient(self.bridge_host, self.bridge_port)
+        await self.bridge.connect()
+        self.bridge.on_speaker_frame = self._on_speaker_frame
+        recv_task = asyncio.create_task(self.bridge.start_recv_loop())
+
+        # Setup voice chat
+        await self.bridge.setup_voice()
+
+        try:
+            async for message in websocket:
+                if isinstance(message, str):
+                    # Text message from ESP32 (hello, listen, etc.)
+                    await self._handle_text(message)
+                elif isinstance(message, bytes):
+                    # Opus audio from ESP32
+                    await self._handle_audio(message)
+        except websockets.exceptions.ConnectionClosed:
+            log.info("ESP32 disconnected")
+        finally:
+            recv_task.cancel()
+            await self.bridge.close()
+            self.ws = None
+            log.info("Session ended")
+
+    async def _handle_text(self, message):
+        """Handle text messages from ESP32."""
+        try:
+            msg = json.loads(message)
+            msg_type = msg.get("type")
+
+            if msg_type == "hello":
+                # Respond with hello ack
+                resp = {
+                    "type": "hello",
+                    "session_id": "relay-session",
+                    "transport": "websocket",
+                }
+                await self.ws.send(json.dumps(resp))
+                log.info(f"ESP32 hello: {msg.get('audio_params')}")
+
+            elif msg_type == "listen":
+                state = msg.get("state")
+                log.debug(f"ESP32 listen: {state}")
+                if state == "detect":
+                    # Wake word detected — acknowledge
+                    text = msg.get("text", "")
+                    log.info(f"Wake word: {text}")
+                    # Send TTS start to keep ESP32 happy
+                    await self.ws.send(json.dumps({
+                        "type": "tts", "state": "start",
+                        "session_id": msg.get("session_id", "")
+                    }))
+
+            elif msg_type == "abort":
+                log.info("ESP32 abort")
+
+        except json.JSONDecodeError:
+            log.warning(f"Invalid JSON from ESP32: {message[:100]}")
+
+    async def _handle_audio(self, opus_data):
+        """Decode Opus from ESP32, resample, inject into voice_bridge."""
+        try:
+            # Decode Opus → PCM 16kHz mono
+            pcm = self.opus_decoder.decode(opus_data, ESP_FRAME_SIZE)
+            samples = np.frombuffer(pcm, dtype=np.int16)
+
+            # Resample 16kHz → 48kHz
+            upsampled = resample_poly(samples, UP_RATIO[0], UP_RATIO[1]).astype(np.int16)
+
+            # Append to inject buffer and send in bridge frame sizes
+            self._inject_buf = np.concatenate([self._inject_buf, upsampled])
+            while len(self._inject_buf) >= BRIDGE_FRAME_SAMPLES:
+                frame = self._inject_buf[:BRIDGE_FRAME_SAMPLES]
+                self._inject_buf = self._inject_buf[BRIDGE_FRAME_SAMPLES:]
+                self.bridge.send_inject(frame.tobytes())
+
+        except Exception as e:
+            log.error(f"Audio inject error: {e}")
+
+    async def _on_speaker_frame(self, pcm_bytes):
+        """Receive speaker PCM from bridge, resample, encode Opus, send to ESP32."""
+        if not self.ws:
+            return
+        try:
+            samples = np.frombuffer(pcm_bytes, dtype=np.int16)
+
+            # Resample 48kHz → 16kHz
+            downsampled = resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]).astype(np.int16)
+
+            # Accumulate into speaker buffer, encode when we have enough
+            self._speaker_buf = np.concatenate([self._speaker_buf, downsampled])
+            while len(self._speaker_buf) >= ESP_FRAME_SIZE:
+                frame = self._speaker_buf[:ESP_FRAME_SIZE]
+                self._speaker_buf = self._speaker_buf[ESP_FRAME_SIZE:]
+                # Encode PCM → Opus
+                opus_data = self.opus_encoder.encode(frame.tobytes(), ESP_FRAME_SIZE)
+                await self.ws.send(opus_data)
+
+        except Exception as e:
+            log.error(f"Speaker send error: {e}")
+
+    async def run(self):
+        log.info(f"Relay starting on ws://0.0.0.0:{self.ws_port}/xiaozhi/v1/")
+        async with websockets.serve(
+            self.handle_esp32, "0.0.0.0", self.ws_port,
+            ping_interval=30, ping_timeout=10,
+        ):
+            await asyncio.Future()  # run forever
+
+
+def main():
+    parser = argparse.ArgumentParser(description="ESP32-Antaf Voice Relay")
+    parser.add_argument("--ws-port", type=int, default=8010, help="WebSocket port for ESP32")
+    parser.add_argument("--bridge-host", default="127.0.0.1", help="voice_bridge host")
+    parser.add_argument("--bridge-port", type=int, default=18901, help="voice_bridge port")
+    args = parser.parse_args()
+
+    relay = Relay(args.ws_port, args.bridge_host, args.bridge_port)
+    asyncio.run(relay.run())
+
+
+if __name__ == "__main__":
+    main()