fix: resample TTS audio from 44100Hz to 24000Hz for device compatibility

The model outputs 44100 Hz audio, but the device expects 24000 Hz via Opus. Without resampling, audio plays at the wrong speed, causing 29 s delays between segments. Verified: synthesis plus resampling takes 0.38 s for 1.6 s of audio.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 23:11:48 -07:00 · 2026-04-05 23:11:48 -07:00 · 5679622996
parent 9b2b875c2b
commit 5679622996
1 changed file with 11 additions and 1 deletion
--- a/backend/main/xiaozhi-server/core/providers/tts/sherpa_tts.py
+++ b/backend/main/xiaozhi-server/core/providers/tts/sherpa_tts.py
@@ -47,15 +47,25 @@ class TTSProvider(TTSProviderBase):

    def _generate_wav(self, text):
        """同步合成，在线程池中调用"""
+        from scipy.signal import resample_poly
+        from math import gcd
+
        audio = self.tts.generate(text, sid=self.sid, speed=self.speed)
        samples = np.array(audio.samples, dtype=np.float32)
+
+        # 重采样到目标采样率（设备要求 24000Hz，模型输出 44100Hz）
+        target_sr = 24000
+        if self.sample_rate != target_sr:
+            g = gcd(self.sample_rate, target_sr)
+            samples = resample_poly(samples, target_sr // g, self.sample_rate // g)
+
        pcm = (samples * 32767).astype(np.int16)

        wav_io = io.BytesIO()
        with wave.open(wav_io, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
-            wf.setframerate(self.sample_rate)
+            wf.setframerate(target_sr)
            wf.writeframes(pcm.tobytes())
        return wav_io.getvalue()