From e5599d4f436ad232074bee0dbe9bdab4bffbad71 Mon Sep 17 00:00:00 2001 From: hailin Date: Sun, 5 Apr 2026 22:43:51 -0700 Subject: [PATCH] feat: add sherpa-onnx local TTS provider Offline VITS TTS using sherpa-onnx, no network dependency. Uses vits-melo-tts-zh_en model for Chinese/English. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../core/providers/tts/sherpa_tts.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 backend/main/xiaozhi-server/core/providers/tts/sherpa_tts.py diff --git a/backend/main/xiaozhi-server/core/providers/tts/sherpa_tts.py b/backend/main/xiaozhi-server/core/providers/tts/sherpa_tts.py new file mode 100644 index 0000000..3137f8e --- /dev/null +++ b/backend/main/xiaozhi-server/core/providers/tts/sherpa_tts.py @@ -0,0 +1,55 @@ +import io +import wave +import numpy as np +import sherpa_onnx +from config.logger import setup_logging +from core.providers.tts.base import TTSProviderBase + +TAG = __name__ +logger = setup_logging() + + +class TTSProvider(TTSProviderBase): + def __init__(self, config, delete_audio_file): + super().__init__(config, delete_audio_file) + model_dir = config.get("model_dir", "models/vits-melo-tts-zh_en") + speed = config.get("speed", 1.0) + self.speed = float(speed) if speed else 1.0 + self.sid = int(config.get("sid", 0)) + + tts_config = sherpa_onnx.OfflineTtsConfig( + model=sherpa_onnx.OfflineTtsModelConfig( + vits=sherpa_onnx.OfflineTtsVitsModelConfig( + model=f"{model_dir}/model.onnx", + lexicon=f"{model_dir}/lexicon.txt", + tokens=f"{model_dir}/tokens.txt", + dict_dir=f"{model_dir}/dict", + ), + ), + rule_fsts=f"{model_dir}/date.fst,{model_dir}/phone.fst,{model_dir}/number.fst,{model_dir}/new_heteronym.fst", + max_num_sentences=1, + ) + self.tts = sherpa_onnx.OfflineTts(tts_config) + self.sample_rate = self.tts.sample_rate + logger.bind(tag=TAG).info( + f"SherpaOnnxTTS 初始化完成: model_dir={model_dir}, sample_rate={self.sample_rate}, sid={self.sid}" + ) + + async def text_to_speak(self, text, output_file): + audio = self.tts.generate(text, sid=self.sid, speed=self.speed) + samples = np.array(audio.samples, dtype=np.float32) + pcm = (samples * 32767).astype(np.int16) + + wav_io = io.BytesIO() + with wave.open(wav_io, "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(self.sample_rate) + wf.writeframes(pcm.tobytes()) + wav_data = wav_io.getvalue() + + if output_file: + with open(output_file, "wb") as f: + f.write(wav_data) + else: + return wav_data