feat: add Qwen3-TTS CustomVoice GPU provider, switch TTS

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
hailin 2026-04-07 02:21:46 -07:00
parent 51fa106d7d
commit d3fd9cc391
2 changed files with 98 additions and 3 deletions

View File

@ -25,7 +25,7 @@ wakeup_words:
selected_module: selected_module:
LLM: antaf LLM: antaf
TTS: sherpa_tts TTS: qwen3_tts
ASR: qwen3_asr_local ASR: qwen3_asr_local
LLM: LLM:
@ -40,14 +40,22 @@ LLM:
api_key: token-abc123 api_key: token-abc123
TTS: TTS:
EdgeTTS: qwen3_tts:
voice: zh-CN-YunxiNeural type: qwen3_tts
model_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-12Hz-1.7B-CustomVoice
tokenizer_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-Tokenizer-12Hz
device: cuda:0
dtype: bfloat16
speaker: Chelsie
language: Chinese
sherpa_tts: sherpa_tts:
type: sherpa_tts type: sherpa_tts
model_dir: models/vits-melo-tts-zh_en model_dir: models/vits-melo-tts-zh_en
speed: 1.0 speed: 1.0
sid: 0 sid: 0
num_threads: 8 num_threads: 8
EdgeTTS:
voice: zh-CN-YunxiNeural
ASR: ASR:
FunASR: FunASR:

87
modules/tts/qwen3_tts.py Normal file
View File

@ -0,0 +1,87 @@
"""
Qwen3-TTS CustomVoice Provider for xiaozhi-server
Based on sherpa_tts.py structure.
GPU inference using qwen-tts package.
"""
import io
import os
import time
import wave
import asyncio
import numpy as np
from config.logger import setup_logging
from core.providers.tts.base import TTSProviderBase
# Module name, passed as ``tag=`` to ``logger.bind`` on every log call in this file.
TAG = __name__
# Shared logger instance obtained from the project's ``config.logger`` setup.
logger = setup_logging()
class TTSProvider(TTSProviderBase):
    """TTS provider that synthesizes speech with a Qwen3-TTS CustomVoice model.

    The model is loaded once in ``__init__`` and reused for every call to
    ``text_to_speak``. Output is 16-bit mono PCM wrapped in a WAV container.
    """

    def __init__(self, config, delete_audio_file):
        """Load the model and tokenizer described by *config*.

        Recognized config keys (all optional):
            model_path: model directory or hub id.
            tokenizer_path: tokenizer directory or hub id.
            device: value forwarded as ``device_map`` (default ``cuda:0``).
            dtype: name of a ``torch`` dtype (default ``bfloat16``).
            speaker: speaker name passed to the model on every request.
            language: language name passed to the model on every request.

        Args:
            config: mapping holding the keys above.
            delete_audio_file: forwarded unchanged to the base class.
        """
        super().__init__(config, delete_audio_file)

        # Imported here rather than at module level, so importing this module
        # does not by itself import torch or qwen_tts.
        import torch
        from qwen_tts import Qwen3TTSModel, Qwen3TTSTokenizer

        model_path = config.get("model_path", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice")
        tokenizer_path = config.get("tokenizer_path", "Qwen/Qwen3-TTS-Tokenizer-12Hz")
        device = config.get("device", "cuda:0")
        dtype_str = config.get("dtype", "bfloat16")

        # Only accept names that resolve to a real torch dtype; anything else
        # (typo, empty value, unrelated torch attribute) falls back to bfloat16
        # with a visible warning instead of being passed to the model loader.
        dtype = getattr(torch, str(dtype_str), None)
        if not isinstance(dtype, torch.dtype):
            logger.bind(tag=TAG).warning(
                "Qwen3TTS: invalid dtype %r, falling back to bfloat16" % (dtype_str,)
            )
            dtype = torch.bfloat16

        self.speaker = config.get("speaker", "Chelsie")
        self.language = config.get("language", "Chinese")

        logger.bind(tag=TAG).info(
            "Qwen3TTS loading: model=%s device=%s speaker=%s" % (model_path, device, self.speaker)
        )
        t0 = time.time()
        self.model = Qwen3TTSModel.from_pretrained(
            model_path,
            device_map=device,
            dtype=dtype,
        )
        # NOTE(review): the tokenizer is stored but not used by text_to_speak
        # in this file - confirm whether it is needed before removing it.
        self.tokenizer = Qwen3TTSTokenizer.from_pretrained(tokenizer_path)

        # Logged so a misconfigured ``speaker`` can be spotted at startup.
        speakers = self.model.get_supported_speakers()
        logger.bind(tag=TAG).info(
            "Qwen3TTS loaded in %.1fs, speakers=%s" % (time.time() - t0, speakers)
        )

    @staticmethod
    def _encode_pcm16_wav(samples, sample_rate):
        """Return *samples* (floats nominally in [-1, 1]) as mono 16-bit WAV bytes."""
        # Clip before scaling: an out-of-range float would otherwise wrap
        # around when cast to int16 and produce loud clicks.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
        wav_io = io.BytesIO()
        with wave.open(wav_io, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(pcm.tobytes())
        return wav_io.getvalue()

    async def text_to_speak(self, text, output_file):
        """Synthesize *text* and deliver the result as WAV data.

        Args:
            text: text to synthesize.
            output_file: path to write the WAV file to; when falsy the WAV
                bytes are returned instead.

        Returns:
            The WAV bytes when ``output_file`` is falsy, otherwise ``None``.
        """
        t0 = time.time()

        # Model inference is blocking; run it on the default executor so the
        # event loop keeps servicing other tasks meanwhile.
        loop = asyncio.get_running_loop()
        wavs, sr = await loop.run_in_executor(
            None,
            lambda: self.model.generate_custom_voice(
                text=text,
                speaker=self.speaker,
                language=self.language,
            ),
        )

        # Flatten so the duration is right even if the waveform arrives with a
        # leading batch/channel axis (assumes mono output - TODO confirm).
        samples = np.asarray(wavs[0], dtype=np.float32).reshape(-1)
        duration = samples.size / sr
        logger.bind(tag=TAG).info(
            "TTS: %.2fs合成 %.1fs音频 sr=%d [%s]" % (time.time() - t0, duration, sr, text[:30])
        )

        wav_data = self._encode_pcm16_wav(samples, sr)

        if output_file:
            with open(output_file, "wb") as f:
                f.write(wav_data)
            return None
        return wav_data