From 5679622996e4ff23ed27b27bf218cfad322f1ce4 Mon Sep 17 00:00:00 2001
From: hailin <hailin.zhao@gdzx.xyz>
Date: Sun, 5 Apr 2026 23:11:48 -0700
Subject: [PATCH] fix: resample TTS audio from 44100Hz to 24000Hz for device
 compatibility

The model outputs 44100 Hz audio, but the device expects 24000 Hz via
Opus. Without resampling, audio plays at the wrong speed, causing 29 s
delays between segments. Verified: synthesis + resampling takes 0.38 s
for 1.6 s of audio.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../xiaozhi-server/core/providers/tts/sherpa_tts.py  | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/backend/main/xiaozhi-server/core/providers/tts/sherpa_tts.py b/backend/main/xiaozhi-server/core/providers/tts/sherpa_tts.py
index a76cdc4..a997ac6 100644
--- a/backend/main/xiaozhi-server/core/providers/tts/sherpa_tts.py
+++ b/backend/main/xiaozhi-server/core/providers/tts/sherpa_tts.py
@@ -47,15 +47,25 @@ class TTSProvider(TTSProviderBase):
 
     def _generate_wav(self, text):
         """同步合成，在线程池中调用"""
+        from scipy.signal import resample_poly
+        from math import gcd
+
         audio = self.tts.generate(text, sid=self.sid, speed=self.speed)
         samples = np.array(audio.samples, dtype=np.float32)
+
+        # Resample to the target rate (device requires 24000 Hz; model outputs 44100 Hz)
+        target_sr = 24000
+        if self.sample_rate != target_sr:
+            g = gcd(self.sample_rate, target_sr)  # lowest-terms up/down ratio; assumes int sample_rate - TODO confirm
+            samples = resample_poly(samples, target_sr // g, self.sample_rate // g)
+        # NOTE(review): resampled values may overshoot [-1, 1]; the int16 cast below does not clip - confirm
         pcm = (samples * 32767).astype(np.int16)
 
         wav_io = io.BytesIO()
         with wave.open(wav_io, "wb") as wf:
             wf.setnchannels(1)
             wf.setsampwidth(2)
-            wf.setframerate(self.sample_rate)
+            wf.setframerate(target_sr)  # header rate must match the (possibly resampled) PCM data
             wf.writeframes(pcm.tobytes())
         return wav_io.getvalue()