feat: add Qwen3-TTS CustomVoice GPU provider and switch the selected TTS module to it

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 02:21:46 -07:00 · 2026-04-07 02:21:46 -07:00 · d3fd9cc391
parent 51fa106d7d
commit d3fd9cc391
2 changed files with 98 additions and 3 deletions
--- a/config/.config.yaml
+++ b/config/.config.yaml
@ -25,7 +25,7 @@ wakeup_words:
 selected_module:
  LLM: antaf
-  TTS: sherpa_tts
+  TTS: qwen3_tts
  ASR: qwen3_asr_local
 LLM:
@ -40,14 +40,22 @@ LLM:
    api_key: token-abc123
 TTS:
-  EdgeTTS:
+  qwen3_tts:
-    voice: zh-CN-YunxiNeural
+    type: qwen3_tts
    model_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-12Hz-1.7B-CustomVoice
    tokenizer_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-Tokenizer-12Hz
    device: cuda:0
    dtype: bfloat16
    speaker: Chelsie
    language: Chinese
  sherpa_tts:
    type: sherpa_tts
    model_dir: models/vits-melo-tts-zh_en
    speed: 1.0
    sid: 0
    num_threads: 8
  EdgeTTS:
    voice: zh-CN-YunxiNeural
 ASR:
  FunASR:
--- a/modules/tts/qwen3_tts.py
+++ b/modules/tts/qwen3_tts.py
@ -0,0 +1,87 @@
 """
 Qwen3-TTS CustomVoice Provider for xiaozhi-server
 Based on sherpa_tts.py structure.
 GPU inference using qwen-tts package.
 """
 import io
 import os
 import time
 import wave
 import asyncio
 import numpy as np
 from config.logger import setup_logging
 from core.providers.tts.base import TTSProviderBase
 # Module name, passed as ``tag`` to ``logger.bind`` on every log call below.
 TAG = __name__
 # Logger obtained from ``setup_logging()`` (imported from ``config.logger``).
 logger = setup_logging()
 class TTSProvider(TTSProviderBase):
    """Text-to-speech provider backed by a Qwen3-TTS CustomVoice model.

    The model and tokenizer are loaded once in ``__init__``. Each
    ``text_to_speak`` call synthesizes one utterance with the configured
    speaker and language and encodes it as a mono 16-bit PCM WAV.
    """

    def __init__(self, config, delete_audio_file):
        """Load the model and tokenizer described by ``config``.

        Args:
            config: Mapping of provider settings. Keys read here are
                ``model_path``, ``tokenizer_path``, ``device``, ``dtype``,
                ``speaker`` and ``language``; each has a default.
            delete_audio_file: Forwarded unchanged to the base class.
        """
        super().__init__(config, delete_audio_file)
        # Imported here rather than at module level, so importing this module
        # does not by itself require torch or qwen_tts.
        import torch
        from qwen_tts import Qwen3TTSModel, Qwen3TTSTokenizer

        model_path = config.get("model_path", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice")
        tokenizer_path = config.get("tokenizer_path", "Qwen/Qwen3-TTS-Tokenizer-12Hz")
        device = config.get("device", "cuda:0")

        # Resolve the dtype name and make sure it really is a torch dtype:
        # a bare getattr would accept any torch attribute (e.g. "nn") and
        # would raise on a non-string value such as a YAML null.
        dtype_str = config.get("dtype", "bfloat16")
        dtype = getattr(torch, str(dtype_str), None)
        if not isinstance(dtype, torch.dtype):
            logger.bind(tag=TAG).warning(
                "Qwen3TTS: unknown dtype %r, falling back to bfloat16" % (dtype_str,)
            )
            dtype = torch.bfloat16

        self.speaker = config.get("speaker", "Chelsie")
        self.language = config.get("language", "Chinese")

        logger.bind(tag=TAG).info(
            "Qwen3TTS loading: model=%s device=%s speaker=%s" % (model_path, device, self.speaker)
        )
        t0 = time.time()
        self.model = Qwen3TTSModel.from_pretrained(
            model_path,
            device_map=device,
            dtype=dtype,
        )
        # NOTE(review): the tokenizer is not used anywhere in this class; it is
        # kept because the attribute may be read elsewhere. Confirm and drop it
        # if it is not needed.
        self.tokenizer = Qwen3TTSTokenizer.from_pretrained(tokenizer_path)

        # Log the speakers the loaded model reports, to help spot a bad
        # ``speaker`` setting.
        speakers = self.model.get_supported_speakers()
        logger.bind(tag=TAG).info(
            "Qwen3TTS loaded in %.1fs, speakers=%s" % (time.time() - t0, speakers)
        )

    @staticmethod
    def _encode_wav(audio, sample_rate):
        """Encode float samples as a mono 16-bit PCM WAV and return the bytes.

        Samples are clipped to [-1.0, 1.0] before scaling so that overshoot
        saturates instead of wrapping around in int16.
        """
        samples = np.clip(np.asarray(audio, dtype=np.float32), -1.0, 1.0)
        # WAV PCM data is little-endian; make that explicit instead of relying
        # on the platform's native byte order.
        pcm = (samples * 32767).astype("<i2")
        buffer = io.BytesIO()
        with wave.open(buffer, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(pcm.tobytes())
        return buffer.getvalue()

    async def text_to_speak(self, text, output_file):
        """Synthesize ``text`` and write or return the resulting WAV.

        Args:
            text: Text to synthesize.
            output_file: Path to write the WAV to. When falsy, nothing is
                written and the WAV bytes are returned instead.

        Returns:
            The WAV bytes when ``output_file`` is falsy, otherwise ``None``.
        """
        t0 = time.time()
        # Generation is a blocking call; run it in the default executor so the
        # event loop is not blocked while it runs.
        loop = asyncio.get_running_loop()
        wavs, sr = await loop.run_in_executor(
            None,
            lambda: self.model.generate_custom_voice(
                text=text,
                speaker=self.speaker,
                language=self.language,
            ),
        )
        # Only the first returned waveform is used.
        audio = wavs[0]
        duration = len(audio) / sr
        logger.bind(tag=TAG).info(
            "TTS: %.2fs合成 %.1fs音频 sr=%d [%s]" % (time.time() - t0, duration, sr, text[:30])
        )

        wav_data = self._encode_wav(audio, sr)
        if output_file:
            with open(output_file, "wb") as f:
                f.write(wav_data)
        else:
            return wav_data