feat: add Qwen3-TTS CustomVoice GPU provider, switch selected TTS module from sherpa_tts to qwen3_tts
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
51fa106d7d
commit
d3fd9cc391
|
|
@ -25,7 +25,7 @@ wakeup_words:
|
|||
|
||||
selected_module:
|
||||
LLM: antaf
|
||||
TTS: sherpa_tts
|
||||
TTS: qwen3_tts
|
||||
ASR: qwen3_asr_local
|
||||
|
||||
LLM:
|
||||
|
|
@ -40,14 +40,22 @@ LLM:
|
|||
api_key: token-abc123
|
||||
|
||||
TTS:
|
||||
EdgeTTS:
|
||||
voice: zh-CN-YunxiNeural
|
||||
qwen3_tts:
|
||||
type: qwen3_tts
|
||||
model_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-12Hz-1.7B-CustomVoice
|
||||
tokenizer_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-Tokenizer-12Hz
|
||||
device: cuda:0
|
||||
dtype: bfloat16
|
||||
speaker: Chelsie
|
||||
language: Chinese
|
||||
sherpa_tts:
|
||||
type: sherpa_tts
|
||||
model_dir: models/vits-melo-tts-zh_en
|
||||
speed: 1.0
|
||||
sid: 0
|
||||
num_threads: 8
|
||||
EdgeTTS:
|
||||
voice: zh-CN-YunxiNeural
|
||||
|
||||
ASR:
|
||||
FunASR:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,87 @@
|
|||
"""
|
||||
Qwen3-TTS CustomVoice Provider for xiaozhi-server
|
||||
Based on sherpa_tts.py structure.
|
||||
GPU inference using qwen-tts package.
|
||||
"""
|
||||
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
import wave
|
||||
import asyncio
|
||||
import numpy as np
|
||||
|
||||
from config.logger import setup_logging
|
||||
from core.providers.tts.base import TTSProviderBase
|
||||
|
||||
# Module name; passed as the ``tag`` field on every ``logger.bind(tag=TAG)`` call in this file.
TAG = __name__
|
||||
# Module-level logger obtained from the project's ``config.logger.setup_logging`` helper.
logger = setup_logging()
|
||||
|
||||
|
||||
class TTSProvider(TTSProviderBase):
    """TTS provider that synthesizes speech with a Qwen3-TTS CustomVoice model.

    Config keys read by ``__init__`` (default in parentheses):
        model_path ("Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice")
        tokenizer_path ("Qwen/Qwen3-TTS-Tokenizer-12Hz")
        device ("cuda:0")
        dtype ("bfloat16"), the name of a ``torch`` dtype attribute
        speaker ("Chelsie")
        language ("Chinese")
    """

    def __init__(self, config, delete_audio_file):
        """Load the model and tokenizer described by ``config``.

        Args:
            config: Mapping holding the keys listed in the class docstring.
            delete_audio_file: Forwarded unchanged to ``TTSProviderBase``.
        """
        super().__init__(config, delete_audio_file)
        # Imported inside the constructor rather than at module level;
        # presumably so torch/qwen_tts are only required when this provider
        # is actually instantiated (confirm against the provider loader).
        import torch
        from qwen_tts import Qwen3TTSModel, Qwen3TTSTokenizer

        model_path = config.get("model_path", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice")
        tokenizer_path = config.get("tokenizer_path", "Qwen/Qwen3-TTS-Tokenizer-12Hz")
        device = config.get("device", "cuda:0")
        dtype_str = config.get("dtype", "bfloat16")

        # Resolve the dtype by name, but only accept real torch dtypes: a bare
        # getattr would also return unrelated attributes (e.g. "nn") and would
        # raise TypeError for a non-string config value.
        dtype = getattr(torch, str(dtype_str), None)
        if not isinstance(dtype, torch.dtype):
            logger.bind(tag=TAG).warning(
                "Qwen3TTS unknown dtype %r, falling back to bfloat16" % (dtype_str,)
            )
            dtype = torch.bfloat16

        self.speaker = config.get("speaker", "Chelsie")
        self.language = config.get("language", "Chinese")

        logger.bind(tag=TAG).info(
            "Qwen3TTS loading: model=%s device=%s speaker=%s" % (model_path, device, self.speaker)
        )
        t0 = time.time()

        self.model = Qwen3TTSModel.from_pretrained(
            model_path,
            device_map=device,
            dtype=dtype,
        )
        # Kept on the instance; no method visible in this class reads it.
        self.tokenizer = Qwen3TTSTokenizer.from_pretrained(tokenizer_path)

        # Log the speakers the loaded model reports, to help pick a valid
        # ``speaker`` value in the config.
        speakers = self.model.get_supported_speakers()
        logger.bind(tag=TAG).info(
            "Qwen3TTS loaded in %.1fs, speakers=%s" % (time.time() - t0, speakers)
        )

    @staticmethod
    def _to_wav_bytes(audio, sample_rate):
        """Encode a waveform as a mono 16-bit PCM WAV byte string.

        Assumes ``audio`` is a float array nominally in [-1, 1], as implied
        by the 32767 scaling (confirm against the qwen_tts return contract).
        """
        # Clip before scaling: samples outside [-1, 1] would otherwise
        # overflow int16 and wrap around, producing loud clicks.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        buffer = io.BytesIO()
        with wave.open(buffer, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(pcm.tobytes())
        return buffer.getvalue()

    async def text_to_speak(self, text, output_file):
        """Synthesize ``text`` and deliver it as WAV data.

        Args:
            text: Text to synthesize with the configured speaker and language.
            output_file: Path to write the WAV file to. When falsy, nothing
                is written and the WAV bytes are returned instead.

        Returns:
            The WAV bytes when ``output_file`` is falsy, otherwise ``None``.
        """
        t0 = time.time()

        def _generate():
            return self.model.generate_custom_voice(
                text=text,
                speaker=self.speaker,
                language=self.language,
            )

        # Run the synchronous model call in the default executor so the event
        # loop is not blocked. get_running_loop() is the correct call inside a
        # coroutine; get_event_loop() is deprecated in this context.
        loop = asyncio.get_running_loop()
        wavs, sr = await loop.run_in_executor(None, _generate)

        # Only the first returned waveform is used.
        audio = np.asarray(wavs[0])
        duration = len(audio) / sr
        logger.bind(tag=TAG).info(
            "TTS: %.2fs合成 %.1fs音频 sr=%d [%s]" % (time.time() - t0, duration, sr, text[:30])
        )

        wav_data = self._to_wav_bytes(audio, sr)

        if not output_file:
            return wav_data

        with open(output_file, "wb") as f:
            f.write(wav_data)
        return None
|
||||
Loading…
Reference in New Issue