fix: send tts start before speaker audio so ESP32 plays it
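The ESP32 ignores binary audio frames unless it has first received a JSON `tts` message with `"state": "start"`, so the relay now sends that message before the first speaker frame. Also skip silent frames (peak amplitude below 10) to reduce bandwidth.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>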
This commit is contained in:
parent
b39dab2536
commit
fc3c99d09c
26
relay.py
26
relay.py
|
|
@ -141,6 +141,7 @@ class Relay:
|
||||||
self._speaker_buf = np.array([], dtype=np.int16)
|
self._speaker_buf = np.array([], dtype=np.int16)
|
||||||
self._audio_in_count = 0
|
self._audio_in_count = 0
|
||||||
self._audio_out_count = 0
|
self._audio_out_count = 0
|
||||||
|
self._tts_started = False # track if we sent tts start to ESP32
|
||||||
|
|
||||||
async def handle_esp32(self, websocket):
|
async def handle_esp32(self, websocket):
|
||||||
"""Handle one ESP32 WebSocket connection."""
|
"""Handle one ESP32 WebSocket connection."""
|
||||||
|
|
@ -236,12 +237,29 @@ class Relay:
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
self._audio_out_count += 1
|
self._audio_out_count += 1
|
||||||
if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
|
|
||||||
samples_peek = np.frombuffer(pcm_bytes, dtype=np.int16)
|
|
||||||
max_amp = int(np.max(np.abs(samples_peek)))
|
|
||||||
log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
|
|
||||||
|
|
||||||
samples = np.frombuffer(pcm_bytes, dtype=np.int16)
|
samples = np.frombuffer(pcm_bytes, dtype=np.int16)
|
||||||
|
max_amp = int(np.max(np.abs(samples)))
|
||||||
|
|
||||||
|
if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
|
||||||
|
log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
|
||||||
|
|
||||||
|
# Only send non-silent frames to ESP32
|
||||||
|
if max_amp < 10:
|
||||||
|
# If we were playing and now silent for a while, send tts stop
|
||||||
|
if self._tts_started and self._audio_out_count % 50 == 0:
|
||||||
|
# Check later — don't stop immediately, silence gaps are normal
|
||||||
|
pass
|
||||||
|
return
|
||||||
|
|
||||||
|
# Send tts start before first audio frame
|
||||||
|
if not self._tts_started:
|
||||||
|
await self.ws.send(json.dumps({
|
||||||
|
"type": "tts", "state": "start",
|
||||||
|
"session_id": "relay-session"
|
||||||
|
}))
|
||||||
|
self._tts_started = True
|
||||||
|
log.info("Sent tts start to ESP32")
|
||||||
|
|
||||||
# Resample 48kHz → 16kHz
|
# Resample 48kHz → 16kHz
|
||||||
downsampled = resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]).astype(np.int16)
|
downsampled = resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]).astype(np.int16)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue