From fc3c99d09ccd6504ad67bc0b964221ab210ddb81 Mon Sep 17 00:00:00 2001 From: hailin Date: Mon, 6 Apr 2026 07:10:10 -0700 Subject: [PATCH] fix: send tts start before speaker audio so ESP32 plays it ESP32 ignores binary audio unless it receives tts start first. Also skip silent frames to reduce bandwidth. Co-Authored-By: Claude Opus 4.6 (1M context) --- relay.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/relay.py b/relay.py index 9fb1db0..9c8e201 100644 --- a/relay.py +++ b/relay.py @@ -141,6 +141,7 @@ class Relay: self._speaker_buf = np.array([], dtype=np.int16) self._audio_in_count = 0 self._audio_out_count = 0 + self._tts_started = False # track if we sent tts start to ESP32 async def handle_esp32(self, websocket): """Handle one ESP32 WebSocket connection.""" @@ -236,12 +237,29 @@ class Relay: return try: self._audio_out_count += 1 - if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0: - samples_peek = np.frombuffer(pcm_bytes, dtype=np.int16) - max_amp = int(np.max(np.abs(samples_peek))) - log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}") samples = np.frombuffer(pcm_bytes, dtype=np.int16) + max_amp = int(np.max(np.abs(samples))) + + if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0: + log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}") + + # Only send non-silent frames to ESP32 + if max_amp < 10: + # If we were playing and now silent for a while, send tts stop + if self._tts_started and self._audio_out_count % 50 == 0: + # Check later — don't stop immediately, silence gaps are normal + pass + return + + # Send tts start before first audio frame + if not self._tts_started: + await self.ws.send(json.dumps({ + "type": "tts", "state": "start", + "session_id": "relay-session" + })) + self._tts_started = True + log.info("Sent tts start to ESP32") # Resample 48kHz → 16kHz downsampled = resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]).astype(np.int16)