feat: add Qwen3-TTS CustomVoice GPU provider, switch selected TTS from sherpa_tts to qwen3_tts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 02:21:46 -07:00 · 2026-04-07 02:21:46 -07:00 · d3fd9cc391
parent 51fa106d7d
commit d3fd9cc391
2 changed files with 98 additions and 3 deletions
--- a/config/.config.yaml
+++ b/config/.config.yaml
@ -25,7 +25,7 @@ wakeup_words:

 selected_module:
  LLM: antaf
-  TTS: sherpa_tts
+  TTS: qwen3_tts
  ASR: qwen3_asr_local

 LLM:
@ -40,14 +40,22 @@ LLM:
    api_key: token-abc123

 TTS:
-  EdgeTTS:
-    voice: zh-CN-YunxiNeural
+  qwen3_tts:
+    type: qwen3_tts
+    model_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-12Hz-1.7B-CustomVoice
+    tokenizer_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-Tokenizer-12Hz
+    device: cuda:0
+    dtype: bfloat16
+    speaker: Chelsie
+    language: Chinese
  sherpa_tts:
    type: sherpa_tts
    model_dir: models/vits-melo-tts-zh_en
    speed: 1.0
    sid: 0
    num_threads: 8
+  EdgeTTS:
+    voice: zh-CN-YunxiNeural

 ASR:
  FunASR:
--- a/modules/tts/qwen3_tts.py
+++ b/modules/tts/qwen3_tts.py
@ -0,0 +1,87 @@
+"""
+Qwen3-TTS CustomVoice Provider for xiaozhi-server
+Based on sherpa_tts.py structure.
+GPU inference using qwen-tts package.
+"""
+
+import io
+import os
+import time
+import wave
+import asyncio
+import numpy as np
+
+from config.logger import setup_logging
+from core.providers.tts.base import TTSProviderBase
+
+# Module name, passed as the `tag` field on every log record emitted below.
+TAG = __name__
+# Project logger obtained from config.logger.setup_logging(); used via logger.bind(tag=TAG).
+logger = setup_logging()
+
+
+class TTSProvider(TTSProviderBase):
+    """TTS provider backed by a locally loaded Qwen3-TTS CustomVoice model.
+
+    Config keys read (each has a default): ``model_path``, ``tokenizer_path``,
+    ``device``, ``dtype``, ``speaker`` and ``language``.
+    """
+
+    def __init__(self, config, delete_audio_file):
+        """Load the model and tokenizer described by *config*.
+
+        Args:
+            config: Mapping with the provider settings listed on the class.
+            delete_audio_file: Forwarded unchanged to ``TTSProviderBase``.
+        """
+        super().__init__(config, delete_audio_file)
+        # Imported inside __init__, so torch/qwen_tts are only needed once a
+        # provider instance is actually created.
+        import torch
+        from qwen_tts import Qwen3TTSModel, Qwen3TTSTokenizer
+
+        model_path = config.get("model_path", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice")
+        tokenizer_path = config.get("tokenizer_path", "Qwen/Qwen3-TTS-Tokenizer-12Hz")
+        device = config.get("device", "cuda:0")
+        dtype_str = config.get("dtype", "bfloat16")
+
+        # Resolve the dtype name explicitly: a typo (or a name such as "nn"
+        # that exists on torch but is not a dtype) must not be passed on
+        # silently. The fallback stays bfloat16, as before.
+        dtype = getattr(torch, str(dtype_str), None)
+        if not isinstance(dtype, torch.dtype):
+            logger.bind(tag=TAG).warning(
+                "Qwen3TTS: unknown dtype %r, falling back to bfloat16" % (dtype_str,)
+            )
+            dtype = torch.bfloat16
+
+        self.speaker = config.get("speaker", "Chelsie")
+        self.language = config.get("language", "Chinese")
+
+        logger.bind(tag=TAG).info(
+            "Qwen3TTS loading: model=%s device=%s speaker=%s" % (model_path, device, self.speaker)
+        )
+        t0 = time.time()
+
+        self.model = Qwen3TTSModel.from_pretrained(
+            model_path,
+            device_map=device,
+            dtype=dtype,
+        )
+        # NOTE(review): the tokenizer is loaded and stored but not used by any
+        # method of this class; kept in case external code reads the attribute.
+        self.tokenizer = Qwen3TTSTokenizer.from_pretrained(tokenizer_path)
+
+        # Get supported speakers
+        speakers = self.model.get_supported_speakers()
+        logger.bind(tag=TAG).info(
+            "Qwen3TTS loaded in %.1fs, speakers=%s" % (time.time() - t0, speakers)
+        )
+
+        # Warn early when the configured speaker is not among the reported ones.
+        # NOTE(review): the return type of get_supported_speakers() is not
+        # visible here, so this only runs for plain non-empty collections and
+        # compares case-insensitively; it never blocks start-up.
+        if isinstance(speakers, (list, tuple, set, frozenset, dict)) and speakers:
+            known = {str(name).lower() for name in speakers}
+            if str(self.speaker).lower() not in known:
+                logger.bind(tag=TAG).warning(
+                    "Qwen3TTS: speaker %r is not in the supported speakers list" % (self.speaker,)
+                )
+
+    async def text_to_speak(self, text, output_file):
+        """Synthesize *text* and deliver it as 16-bit mono WAV data.
+
+        Args:
+            text: Text to synthesize.
+            output_file: Path to write the WAV data to; when falsy the WAV
+                bytes are returned instead.
+
+        Returns:
+            The WAV bytes when ``output_file`` is falsy, otherwise ``None``.
+        """
+        t0 = time.time()
+
+        def _synthesize():
+            # Blocking model call; executed in the default thread pool below.
+            return self.model.generate_custom_voice(
+                text=text,
+                speaker=self.speaker,
+                language=self.language,
+            )
+
+        # Run in thread pool to avoid blocking the event loop. Inside a
+        # coroutine the running loop is always available.
+        loop = asyncio.get_running_loop()
+        wavs, sr = await loop.run_in_executor(None, _synthesize)
+
+        audio = wavs[0]
+        duration = len(audio) / sr
+        logger.bind(tag=TAG).info(
+            "TTS: %.2fs合成 %.1fs音频 sr=%d [%s]" % (time.time() - t0, duration, sr, text[:30])
+        )
+
+        # Convert to 16-bit PCM. Clip first: a sample outside [-1, 1] would
+        # otherwise wrap around when cast to int16 and produce loud clicks.
+        # assumes audio is a float waveform nominally in [-1, 1] — TODO confirm
+        samples = np.clip(np.asarray(audio), -1.0, 1.0)
+        pcm = (samples * 32767).astype(np.int16)
+
+        # Wrap the PCM frames in a mono, 2-bytes-per-sample WAV container.
+        wav_io = io.BytesIO()
+        with wave.open(wav_io, "wb") as wf:
+            wf.setnchannels(1)
+            wf.setsampwidth(2)
+            wf.setframerate(sr)
+            wf.writeframes(pcm.tobytes())
+        wav_data = wav_io.getvalue()
+
+        if output_file:
+            with open(output_file, "wb") as f:
+                f.write(wav_data)
+        else:
+            return wav_data