CoquiTTS/coquitts.py

59 lines
2.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import torch
from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.config.shared_configs import BaseDatasetConfig

# Classes that must be allow-listed so torch's restricted ("weights only")
# unpickler accepts them when they appear inside a checkpoint.
_XTTS_SAFE_GLOBALS = [XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs]

# `add_safe_globals` is not available on older PyTorch releases; skip the
# registration there instead of crashing with AttributeError.
if hasattr(torch.serialization, "add_safe_globals"):
    torch.serialization.add_safe_globals(_XTTS_SAFE_GLOBALS)
# Checkpoint loader that always disables torch's weights-only restriction.
def load_fsspec_fixed(*args, **kwargs):
    """Load a checkpoint with ``torch.load``, forcing ``weights_only=False``.

    Intended as a drop-in replacement for Coqui TTS's ``load_fsspec``.

    Args:
        *args: Positional arguments forwarded unchanged to ``torch.load``
            (typically the checkpoint path or file object, then
            ``map_location``).
        **kwargs: Keyword arguments forwarded to ``torch.load``. Any
            ``weights_only`` value supplied by the caller is overridden, and
            a ``cache`` keyword is discarded because ``torch.load`` does not
            accept it.

    Returns:
        Whatever ``torch.load`` returns for the given checkpoint.
    """
    # `cache` belongs to the loader being replaced, not to torch.load;
    # forwarding it would raise TypeError.
    kwargs.pop("cache", None)
    # SECURITY: weights_only=False allows arbitrary pickled objects to be
    # executed on load. Only use this with checkpoints from a trusted source.
    kwargs["weights_only"] = False
    return torch.load(*args, **kwargs)
# Replace Coqui TTS's `load_fsspec` with the patched loader defined above.
# NOTE(review): modules that already ran `from TTS.utils.io import load_fsspec`
# before this line keep their own reference to the original function, so this
# patch may not reach every loading path - confirm against the TTS sources.
import TTS.utils.io as _tts_io

_tts_io.load_fsspec = load_fsspec_fixed

# The public API is imported only after the patch is in place.
from TTS.api import TTS

import os

from pydub import AudioSegment  # used to concatenate the generated clips
# Identifier of the XTTS v2 multilingual model.
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# Load the TTS model; a progress message is printed first.
print("正在加载 TTS 预训练模型,请稍候...")
tts = TTS(model_name=MODEL_NAME)
# Reference recordings passed as `speaker_wav` for the Chinese and English
# fragments. Both files must be supplied by the user.
speaker_wav_zh = "example_speaker_zh.wav"
speaker_wav_en = "example_speaker_en.wav"

# Fail fast when either reference recording is missing: both are used for
# synthesis, so checking only the Chinese one would let a missing English
# file surface later as a less obvious error.
for _speaker_wav in (speaker_wav_zh, speaker_wav_en):
    if not os.path.exists(_speaker_wav):
        raise FileNotFoundError(f"错误:找不到 {_speaker_wav},XTTS v2 需要一个参考音频!")
# Text fragments, split so each one is synthesized with its own language tag.
text_cn1 = "你好,欢迎使用"
text_en = "Coqui TTS"
text_cn2 = "进行中文语音合成!"
test_all = "Not long ago, my colleague Carlos Fenollosa made a bold claim in his book La singularidad: ChatGPT is AGI. My initial reaction was immediate dismissal. But knowing his careful reasoning, I kept reading. As I followed his argument, something unexpected happened — I found myself agreeing, though from an entirely different perspective. What I discovered challenges both those waiting for AGIs arrival and those claiming its nowhere near: we might be asking the wrong question altogether."

# Intermediate clips to generate, one row per fragment:
# (text, output file, reference speaker recording, language code).
_synthesis_jobs = (
    (text_cn1, "part1.wav", speaker_wav_zh, "zh-cn"),
    (text_en, "part2.wav", speaker_wav_en, "en"),
    (text_cn2, "part3.wav", speaker_wav_zh, "zh-cn"),
)
for _text, _file_path, _speaker_wav, _language in _synthesis_jobs:
    tts.tts_to_file(text=_text, file_path=_file_path, speaker_wav=_speaker_wav, language=_language)

# Alternative (disabled): synthesize the long English sample in a single pass.
# tts.tts_to_file(text=test_all, file_path="all.wav", speaker_wav=speaker_wav_en, language="en")
# Merge the audio: read the three intermediate clips back, join them in
# order, and write the combined track to output.wav.
audio1, audio2, audio3 = (
    AudioSegment.from_wav(part) for part in ("part1.wav", "part2.wav", "part3.wav")
)
final_audio = audio1 + audio2 + audio3
final_audio.export("output.wav", format="wav")
print("✅ 语音合成完成!已保存到 output.wav")