import torch
from TTS.tts.models.xtts import XttsAudioConfig  # allow XttsAudioConfig
from TTS.tts.configs.xtts_config import XttsConfig  # allow XttsConfig
from TTS.config.shared_configs import BaseDatasetConfig  # allow BaseDatasetConfig
from TTS.tts.models.xtts import XttsArgs  # allow XttsArgs

# Add XttsConfig, XttsAudioConfig, BaseDatasetConfig, and XttsArgs to PyTorch's safe globals
torch.serialization.add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
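# Note: PyTorch 2.6 switched torch.load's default to weights_only=True, so unpickling
# the XTTS checkpoint (which contains these config objects) fails unless the classes
# above are registered as safe globals or weights_only is forced back to False, as below.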

# Force weights_only=False
def load_fsspec_fixed(*args, **kwargs):
    kwargs["weights_only"] = False  # the key fix
    return torch.load(*args, **kwargs)

# Override Coqui TTS's load_fsspec with the fixed loader
import TTS.utils.io
TTS.utils.io.load_fsspec = load_fsspec_fixed
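# The patch is applied before importing TTS.api below, so the model-loading code
# imported afterwards picks up load_fsspec_fixed instead of the original loader.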

from TTS.api import TTS
import os
from pydub import AudioSegment  # used to merge the audio clips

# Select the multilingual XTTS v2 model
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
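# Note: the first run downloads the XTTS v2 checkpoint (several GB) into the local
# TTS model cache, and recent Coqui TTS releases also prompt you to accept the CPML
# license (setting COQUI_TOS_AGREED=1 should pre-accept it for non-interactive runs).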

# Load the TTS model
print("Loading the pretrained TTS model, please wait...")
tts = TTS(MODEL_NAME)
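# Optional: synthesis is much faster on a GPU; depending on the Coqui TTS version,
# either TTS(MODEL_NAME, gpu=True) or tts.to("cuda") enables it.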

# Reference audio clips (XTTS v2 clones the speaker's voice from these)
speaker_wav_zh = "example_speaker_zh.wav"  # you must provide this audio file
speaker_wav_en = "example_speaker_en.wav"  # you must provide this audio file

# Make sure the reference audio files exist
for wav in (speaker_wav_zh, speaker_wav_en):
    if not os.path.exists(wav):
        raise FileNotFoundError(f"Error: {wav} not found; XTTS v2 needs a reference audio clip!")

# Split the text
text_cn1 = "你好,欢迎使用"
text_en = "Coqui TTS"
text_cn2 = "进行中文语音合成!"
test_all = "Not long ago, my colleague Carlos Fenollosa made a bold claim in his book ‘La singularidad’: ChatGPT is AGI. My initial reaction was immediate dismissal. But knowing his careful reasoning, I kept reading. As I followed his argument, something unexpected happened — I found myself agreeing, though from an entirely different perspective. What I discovered challenges both those waiting for AGI’s arrival and those claiming it’s nowhere near: we might be asking the wrong question altogether."
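# tts_to_file takes a single language code per call, so the mixed-language sentence
# is synthesized piece by piece below and the clips are concatenated afterwards.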

# Generate temporary audio files, one per text segment
tts.tts_to_file(text=text_cn1, file_path="part1.wav", speaker_wav=speaker_wav_zh, language="zh-cn")
tts.tts_to_file(text=text_en, file_path="part2.wav", speaker_wav=speaker_wav_en, language="en")
tts.tts_to_file(text=text_cn2, file_path="part3.wav", speaker_wav=speaker_wav_zh, language="zh-cn")
# tts.tts_to_file(text=test_all, file_path="all.wav", speaker_wav=speaker_wav_en, language="en")

# Merge the audio clips
audio1 = AudioSegment.from_wav("part1.wav")
audio2 = AudioSegment.from_wav("part2.wav")
audio3 = AudioSegment.from_wav("part3.wav")

final_audio = audio1 + audio2 + audio3
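# Optional tweak: a short crossfade can smooth the joins between clips, e.g.
# final_audio = audio1.append(audio2, crossfade=50).append(audio3, crossfade=50)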
final_audio.export("output.wav", format="wav")

print("✅ Speech synthesis finished! Saved to output.wav")