From fc3c99d09ccd6504ad67bc0b964221ab210ddb81 Mon Sep 17 00:00:00 2001
From: hailin <hailin.zhao@gdzx.xyz>
Date: Mon, 6 Apr 2026 07:10:10 -0700
Subject: [PATCH] fix: send tts start before speaker audio so ESP32 plays it

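The ESP32 ignores binary audio frames unless it has first received a
"tts" message with state "start", so send one before the first
non-silent speaker frame.

Also skip silent frames (peak amplitude < 10) to reduce bandwidth.
Note: a matching "tts stop" is not sent yet; the silence branch is a
placeholder.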

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 relay.py | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/relay.py b/relay.py
index 9fb1db0..9c8e201 100644
--- a/relay.py
+++ b/relay.py
@@ -141,6 +141,7 @@ class Relay:
         self._speaker_buf = np.array([], dtype=np.int16)
         self._audio_in_count = 0
         self._audio_out_count = 0
+        self._tts_started = False  # track if we sent tts start to ESP32
 
     async def handle_esp32(self, websocket):
         """Handle one ESP32 WebSocket connection."""
@@ -236,12 +237,29 @@ class Relay:
             return
         try:
             self._audio_out_count += 1
-            if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
-                samples_peek = np.frombuffer(pcm_bytes, dtype=np.int16)
-                max_amp = int(np.max(np.abs(samples_peek)))
-                log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
 
             samples = np.frombuffer(pcm_bytes, dtype=np.int16)
+            max_amp = int(np.max(np.abs(samples)))
+
+            if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
+                log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
+
+            # Only send non-silent frames to ESP32
+            if max_amp < 10:
+                # TODO: send tts stop once silence has persisted for a while (not implemented; branch below is a no-op)
+                if self._tts_started and self._audio_out_count % 50 == 0:
+                    # Check later — don't stop immediately, silence gaps are normal
+                    pass
+                return
+
+            # Send tts start before first audio frame
+            if not self._tts_started:
+                await self.ws.send(json.dumps({
+                    "type": "tts", "state": "start",
+                    "session_id": "relay-session"
+                }))
+                self._tts_started = True
+                log.info("Sent tts start to ESP32")
 
             # Resample 48kHz → 16kHz
             downsampled = resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]).astype(np.int16)