feat: add Qwen3-TTS CustomVoice GPU provider, switch TTS
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
51fa106d7d
commit
d3fd9cc391
|
|
@ -25,7 +25,7 @@ wakeup_words:
|
||||||
|
|
||||||
selected_module:
|
selected_module:
|
||||||
LLM: antaf
|
LLM: antaf
|
||||||
TTS: sherpa_tts
|
TTS: qwen3_tts
|
||||||
ASR: qwen3_asr_local
|
ASR: qwen3_asr_local
|
||||||
|
|
||||||
LLM:
|
LLM:
|
||||||
|
|
@ -40,14 +40,22 @@ LLM:
|
||||||
api_key: token-abc123
|
api_key: token-abc123
|
||||||
|
|
||||||
TTS:
|
TTS:
|
||||||
EdgeTTS:
|
qwen3_tts:
|
||||||
voice: zh-CN-YunxiNeural
|
type: qwen3_tts
|
||||||
|
model_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-12Hz-1.7B-CustomVoice
|
||||||
|
tokenizer_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-Tokenizer-12Hz
|
||||||
|
device: cuda:0
|
||||||
|
dtype: bfloat16
|
||||||
|
speaker: Chelsie
|
||||||
|
language: Chinese
|
||||||
sherpa_tts:
|
sherpa_tts:
|
||||||
type: sherpa_tts
|
type: sherpa_tts
|
||||||
model_dir: models/vits-melo-tts-zh_en
|
model_dir: models/vits-melo-tts-zh_en
|
||||||
speed: 1.0
|
speed: 1.0
|
||||||
sid: 0
|
sid: 0
|
||||||
num_threads: 8
|
num_threads: 8
|
||||||
|
EdgeTTS:
|
||||||
|
voice: zh-CN-YunxiNeural
|
||||||
|
|
||||||
ASR:
|
ASR:
|
||||||
FunASR:
|
FunASR:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,87 @@
|
||||||
|
"""
|
||||||
|
Qwen3-TTS CustomVoice Provider for xiaozhi-server
|
||||||
|
Based on sherpa_tts.py structure.
|
||||||
|
GPU inference using qwen-tts package.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import wave
|
||||||
|
import asyncio
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from config.logger import setup_logging
|
||||||
|
from core.providers.tts.base import TTSProviderBase
|
||||||
|
|
||||||
|
TAG = __name__
|
||||||
|
logger = setup_logging()
|
||||||
|
|
||||||
|
|
||||||
|
class TTSProvider(TTSProviderBase):
    """Text-to-speech provider backed by a Qwen3-TTS CustomVoice model.

    The model is loaded once in ``__init__`` and reused for every request.
    Synthesis runs in the event loop's default executor so the blocking
    ``generate_custom_voice`` call does not stall other coroutines.

    Recognised ``config`` keys (all optional):
        model_path:     model directory or hub id.
        tokenizer_path: speech tokenizer directory or hub id.
        device:         value passed as ``device_map`` (default ``cuda:0``).
        dtype:          name of a ``torch`` dtype (default ``bfloat16``).
        speaker:        speaker name passed to the model.
        language:       language name passed to the model.
    """

    def __init__(self, config, delete_audio_file):
        """Load the model and tokenizer described by ``config``.

        Args:
            config: mapping with the keys listed in the class docstring.
            delete_audio_file: forwarded unchanged to ``TTSProviderBase``.
        """
        super().__init__(config, delete_audio_file)
        # Imported lazily so the heavy GPU stack is only required when this
        # provider is actually selected.
        import torch
        from qwen_tts import Qwen3TTSModel, Qwen3TTSTokenizer

        model_path = config.get("model_path", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice")
        tokenizer_path = config.get("tokenizer_path", "Qwen/Qwen3-TTS-Tokenizer-12Hz")
        device = config.get("device", "cuda:0")
        dtype_str = config.get("dtype", "bfloat16")

        # Only accept names that resolve to a real torch dtype. A plain
        # getattr() would silently hide typos and would hand back unrelated
        # attributes (e.g. "nn") as if they were dtypes.
        dtype = getattr(torch, str(dtype_str), None)
        if not isinstance(dtype, torch.dtype):
            logger.bind(tag=TAG).warning(
                "Qwen3TTS: unknown dtype %r, falling back to bfloat16" % (dtype_str,)
            )
            dtype = torch.bfloat16

        self.speaker = config.get("speaker", "Chelsie")
        self.language = config.get("language", "Chinese")

        logger.bind(tag=TAG).info(
            "Qwen3TTS loading: model=%s device=%s speaker=%s" % (model_path, device, self.speaker)
        )
        t0 = time.time()

        self.model = Qwen3TTSModel.from_pretrained(
            model_path,
            device_map=device,
            dtype=dtype,
        )
        # NOTE(review): nothing in this class reads self.tokenizer afterwards;
        # it is kept in case other code relies on the attribute. Confirm
        # whether this load (and its memory) is actually needed.
        self.tokenizer = Qwen3TTSTokenizer.from_pretrained(tokenizer_path)

        # Get supported speakers
        speakers = self.model.get_supported_speakers()
        logger.bind(tag=TAG).info(
            "Qwen3TTS loaded in %.1fs, speakers=%s" % (time.time() - t0, speakers)
        )

        # Surface a misconfigured speaker at startup rather than at the first
        # synthesis request. Comparison is case-insensitive and only warns,
        # because the exact matching rules of the model are not visible here.
        if speakers:
            known = {str(name).lower() for name in speakers}
            if str(self.speaker).lower() not in known:
                logger.bind(tag=TAG).warning(
                    "Qwen3TTS: speaker %r is not in the supported speaker list %s"
                    % (self.speaker, speakers)
                )

    async def text_to_speak(self, text, output_file):
        """Synthesize ``text`` to 16-bit mono WAV audio.

        Args:
            text: text to synthesize.
            output_file: destination path. When truthy the WAV data is
                written there; otherwise the WAV bytes are returned.

        Returns:
            ``None`` when ``output_file`` is given, else the WAV file content
            as ``bytes``.
        """
        t0 = time.time()

        def _generate():
            return self.model.generate_custom_voice(
                text=text,
                speaker=self.speaker,
                language=self.language,
            )

        # Run in thread pool to avoid blocking. get_running_loop() is the
        # correct call inside a coroutine; get_event_loop() is deprecated
        # for this use.
        loop = asyncio.get_running_loop()
        wavs, sr = await loop.run_in_executor(None, _generate)

        # assumes wavs[0] is a 1-D float waveform in [-1, 1] — TODO confirm
        audio = np.asarray(wavs[0])
        duration = len(audio) / sr
        logger.bind(tag=TAG).info(
            "TTS: %.2fs合成 %.1fs音频 sr=%d [%s]" % (time.time() - t0, duration, sr, text[:30])
        )

        # Convert to WAV bytes. Clip first: without it any sample outside
        # [-1, 1] overflows int16 and wraps around, which is audible as
        # loud clicks.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        wav_io = io.BytesIO()
        with wave.open(wav_io, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sr)
            wf.writeframes(pcm.tobytes())
        wav_data = wav_io.getvalue()

        if output_file:
            with open(output_file, "wb") as f:
                f.write(wav_data)
            return None
        return wav_data
|
||||||
Loading…
Reference in New Issue