diff --git a/relay.py b/relay.py index 9fb1db0..9c8e201 100644 --- a/relay.py +++ b/relay.py @@ -141,6 +141,7 @@ class Relay: self._speaker_buf = np.array([], dtype=np.int16) self._audio_in_count = 0 self._audio_out_count = 0 + self._tts_started = False # track if we sent tts start to ESP32 async def handle_esp32(self, websocket): """Handle one ESP32 WebSocket connection.""" @@ -236,12 +237,29 @@ class Relay: return try: self._audio_out_count += 1 - if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0: - samples_peek = np.frombuffer(pcm_bytes, dtype=np.int16) - max_amp = int(np.max(np.abs(samples_peek))) - log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}") samples = np.frombuffer(pcm_bytes, dtype=np.int16) + max_amp = int(np.max(np.abs(samples))) + + if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0: + log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}") + + # Only send non-silent frames to ESP32 + if max_amp < 10: + # If we were playing and now silent for a while, send tts stop + if self._tts_started and self._audio_out_count % 50 == 0: + # Check later — don't stop immediately, silence gaps are normal + pass + return + + # Send tts start before first audio frame + if not self._tts_started: + await self.ws.send(json.dumps({ + "type": "tts", "state": "start", + "session_id": "relay-session" + })) + self._tts_started = True + log.info("Sent tts start to ESP32") # Resample 48kHz → 16kHz downsampled = resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]).astype(np.int16)