fix: resample TTS audio from 44100Hz to 24000Hz for device compatibility

Model outputs 44100Hz but device expects 24000Hz via Opus. Without
resampling, audio plays at wrong speed causing 29s delays between
segments. Verified: synthesis+resample takes 0.38s for 1.6s audio.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
hailin 2026-04-05 23:11:48 -07:00
parent 9b2b875c2b
commit 5679622996
1 changed file with 11 additions and 1 deletion

View File

@ -47,15 +47,25 @@ class TTSProvider(TTSProviderBase):
def _generate_wav(self, text):
"""同步合成,在线程池中调用"""
from scipy.signal import resample_poly
from math import gcd
audio = self.tts.generate(text, sid=self.sid, speed=self.speed)
samples = np.array(audio.samples, dtype=np.float32)
# 重采样到目标采样率(设备要求 24000Hz模型输出 44100Hz
target_sr = 24000
if self.sample_rate != target_sr:
g = gcd(self.sample_rate, target_sr)
samples = resample_poly(samples, target_sr // g, self.sample_rate // g)
pcm = (samples * 32767).astype(np.int16)
wav_io = io.BytesIO()
with wave.open(wav_io, "wb") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(self.sample_rate)
wf.setframerate(target_sr)
wf.writeframes(pcm.tobytes())
return wav_io.getvalue()