# Method of ``TTSProvider`` (backend/main/xiaozhi-server/core/providers/tts/
# sherpa_tts.py), shown in its post-patch form from the commit
# "fix: resample TTS audio from 44100Hz to 24000Hz for device compatibility".
def _generate_wav(self, text, target_sr=24000):
    """Synthesize *text* and return it as mono 16-bit PCM WAV bytes.

    Synchronous; meant to be called from a thread pool.

    Per the originating commit message, the model outputs 44100 Hz while the
    device expects 24000 Hz (via Opus); without resampling the audio plays at
    the wrong speed. The synthesized samples are therefore resampled from
    ``self.sample_rate`` to ``target_sr`` before being packed into the WAV.

    Args:
        text: Text to synthesize.
        target_sr: Sample rate (Hz) of the returned WAV. Defaults to 24000.

    Returns:
        bytes: A complete WAV file (1 channel, 16-bit little-endian PCM,
        ``target_sr`` frames per second).
    """
    audio = self.tts.generate(text, sid=self.sid, speed=self.speed)
    samples = np.array(audio.samples, dtype=np.float32)

    source_sr = int(self.sample_rate)
    target_sr = int(target_sr)

    # Resample only when there is audio and the rates actually differ; the
    # scipy import is kept lazy so the no-resample path does not pay for it.
    if samples.size and source_sr != target_sr:
        from math import gcd

        from scipy.signal import resample_poly

        divisor = gcd(source_sr, target_sr)
        samples = resample_poly(
            samples, target_sr // divisor, source_sr // divisor
        )

    # The polyphase filter (and the model itself) can overshoot [-1.0, 1.0].
    # Clip first, otherwise the int16 cast wraps around and produces clicks.
    # "<i2" pins little-endian byte order, which the WAV format requires.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype("<i2")

    wav_io = io.BytesIO()
    with wave.open(wav_io, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(target_sr)
        wf.writeframes(pcm.tobytes())
    return wav_io.getvalue()