fix: send tts start before speaker audio so ESP32 plays it

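The ESP32 ignores binary audio frames unless it has first received a
`tts` message with state `start`, so send that message before the first
speaker frame. Also skip silent frames (peak amplitude below 10) to
reduce bandwidth.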

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
hailin 2026-04-06 07:10:10 -07:00
parent b39dab2536
commit fc3c99d09c
1 changed file with 22 additions and 4 deletions

View File

@@ -141,6 +141,7 @@ class Relay:
self._speaker_buf = np.array([], dtype=np.int16) self._speaker_buf = np.array([], dtype=np.int16)
self._audio_in_count = 0 self._audio_in_count = 0
self._audio_out_count = 0 self._audio_out_count = 0
self._tts_started = False # track if we sent tts start to ESP32
async def handle_esp32(self, websocket): async def handle_esp32(self, websocket):
"""Handle one ESP32 WebSocket connection.""" """Handle one ESP32 WebSocket connection."""
@@ -236,12 +237,29 @@ class Relay:
return return
try: try:
self._audio_out_count += 1 self._audio_out_count += 1
if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
samples_peek = np.frombuffer(pcm_bytes, dtype=np.int16)
max_amp = int(np.max(np.abs(samples_peek)))
log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
samples = np.frombuffer(pcm_bytes, dtype=np.int16) samples = np.frombuffer(pcm_bytes, dtype=np.int16)
max_amp = int(np.max(np.abs(samples)))
if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
# Only send non-silent frames to ESP32
if max_amp < 10:
# If we were playing and now silent for a while, send tts stop
if self._tts_started and self._audio_out_count % 50 == 0:
# Check later — don't stop immediately, silence gaps are normal
pass
return
# Send tts start before first audio frame
if not self._tts_started:
await self.ws.send(json.dumps({
"type": "tts", "state": "start",
"session_id": "relay-session"
}))
self._tts_started = True
log.info("Sent tts start to ESP32")
# Resample 48kHz → 16kHz # Resample 48kHz → 16kHz
downsampled = resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]).astype(np.int16) downsampled = resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]).astype(np.int16)