feat: TTS on 2 GPUs (cuda:2,cuda:3) for faster inference

This commit is contained in:
hailin 2026-04-07 03:28:16 -07:00
parent b75e813c03
commit 17923f3bdc
2 changed files with 19 additions and 7 deletions

View File

@@ -42,7 +42,7 @@ TTS:
type: qwen3_tts
model_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-12Hz-1.7B-CustomVoice
tokenizer_path: /home/ZeroStack/xiaozhi/Qwen3-TTS-Tokenizer-12Hz
device: cuda:2
device: cuda:2,cuda:3
dtype: bfloat16
speaker: uncle_fu
language: Chinese

View File

@@ -37,12 +37,24 @@ class TTSProvider(TTSProviderBase):
)
t0 = time.time()
self.model = Qwen3TTSModel.from_pretrained(
model_path,
device_map=device,
dtype=dtype,
attn_implementation="flash_attention_2",
)
# Use multiple GPUs if specified (e.g. "cuda:2,cuda:3")
if "," in device:
gpu_ids = [d.strip().replace("cuda:", "") for d in device.split(",")]
max_memory = {int(g): "22GiB" for g in gpu_ids}
self.model = Qwen3TTSModel.from_pretrained(
model_path,
device_map="auto",
max_memory=max_memory,
dtype=dtype,
attn_implementation="flash_attention_2",
)
else:
self.model = Qwen3TTSModel.from_pretrained(
model_path,
device_map=device,
dtype=dtype,
attn_implementation="flash_attention_2",
)
self.tokenizer = Qwen3TTSTokenizer.from_pretrained(tokenizer_path)
# Get supported speakers