Initial commit
This commit is contained in:
commit
54cb82f1b9
|
|
@ -0,0 +1,3 @@
|
|||
import torch
|
||||
# Report whether PyTorch can use CUDA and, if so, the name of device 0.
cuda_ready = torch.cuda.is_available()
print("Torch 是否使用 GPU:", cuda_ready)
print("当前使用的 GPU:", torch.cuda.get_device_name(0) if cuda_ready else "CPU")
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
import torch
|
||||
from TTS.tts.models.xtts import XttsAudioConfig # 允许 XttsAudioConfig
|
||||
from TTS.tts.configs.xtts_config import XttsConfig # 允许 XttsConfig
|
||||
from TTS.config.shared_configs import BaseDatasetConfig # 允许 BaseDatasetConfig
|
||||
from TTS.tts.models.xtts import XttsArgs # 允许 XttsArgs
|
||||
|
||||
# Register the XTTS configuration classes as safe globals for PyTorch
# deserialization (XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs).
_XTTS_CHECKPOINT_CLASSES = [XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs]
torch.serialization.add_safe_globals(_XTTS_CHECKPOINT_CLASSES)
|
||||
|
||||
def load_fsspec_fixed(*args, **kwargs):
    """Load a checkpoint via ``torch.load`` with ``weights_only=False``.

    This function is installed in place of ``TTS.utils.io.load_fsspec`` later
    in this script, so it has to tolerate the keywords that callers of that
    function pass.

    Args:
        *args: Positional arguments forwarded unchanged to ``torch.load``.
        **kwargs: Keyword arguments forwarded to ``torch.load``. A ``cache``
            keyword is discarded (``torch.load`` does not accept it) and any
            ``weights_only`` value is overridden with ``False``.

    Returns:
        Whatever ``torch.load`` returns for the given checkpoint.
    """
    # SECURITY: weights_only=False lets the checkpoint unpickle arbitrary
    # objects, which can execute code. Only load checkpoints you trust.
    kwargs.pop("cache", None)  # not a torch.load argument; forwarding it fails
    kwargs["weights_only"] = False
    return torch.load(*args, **kwargs)
|
||||
|
||||
# Override Coqui TTS's `load_fsspec` with the patched loader.
# NOTE(review): any module that already executed
# `from TTS.utils.io import load_fsspec` before this assignment keeps its own
# reference to the original function — confirm that the patch actually
# reaches the XTTS checkpoint-loading path.
import TTS.utils.io
TTS.utils.io.load_fsspec = load_fsspec_fixed
|
||||
|
||||
from TTS.api import TTS
|
||||
import os
|
||||
from pydub import AudioSegment # 用于合并音频
|
||||
|
||||
# XTTS v2 multilingual model identifier.
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# Load the TTS model.
print("正在加载 TTS 预训练模型,请稍候...")
tts = TTS(MODEL_NAME)

# Reference audio for each language; both files must be supplied by the user.
speaker_wav_zh = "example_speaker_zh.wav"
speaker_wav_en = "example_speaker_en.wav"

# Both reference files are passed to tts_to_file() below, so verify both
# up front instead of failing halfway through synthesis.
for speaker_wav in (speaker_wav_zh, speaker_wav_en):
    if not os.path.exists(speaker_wav):
        raise FileNotFoundError(f"错误:找不到 {speaker_wav},XTTS v2 需要一个参考音频!")

# The sentence is split by language so each part is synthesized with the
# matching language code.
text_cn1 = "你好,欢迎使用"
text_en = "Coqui TTS"
text_cn2 = "进行中文语音合成!"
test_all = "Not long ago, my colleague Carlos Fenollosa made a bold claim in his book ‘La singularidad’: ChatGPT is AGI. My initial reaction was immediate dismissal. But knowing his careful reasoning, I kept reading. As I followed his argument, something unexpected happened — I found myself agreeing, though from an entirely different perspective. What I discovered challenges both those waiting for AGI’s arrival and those claiming it’s nowhere near: we might be asking the wrong question altogether."

# Synthesize each part into its own temporary WAV file.
tts.tts_to_file(text=text_cn1, file_path="part1.wav", speaker_wav=speaker_wav_zh, language="zh-cn")
tts.tts_to_file(text=text_en, file_path="part2.wav", speaker_wav=speaker_wav_en, language="en")
tts.tts_to_file(text=text_cn2, file_path="part3.wav", speaker_wav=speaker_wav_zh, language="zh-cn")
# Optional long-form English sample (disabled):
# tts.tts_to_file(text=test_all, file_path="all.wav", speaker_wav=speaker_wav_en, language="en")

# Concatenate the three parts, in order, into a single output file.
audio1 = AudioSegment.from_wav("part1.wav")
audio2 = AudioSegment.from_wav("part2.wav")
audio3 = AudioSegment.from_wav("part3.wav")

final_audio = audio1 + audio2 + audio3
final_audio.export("output.wav", format="wav")

print("✅ 语音合成完成!已保存到 output.wav")
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
import sounddevice as sd
|
||||
|
||||
# List every device reported by sounddevice, then highlight the ones whose
# name mentions "loopback".
print("🎤 正在查询可用的音频设备...\n")
devices = sd.query_devices()

for index, info in enumerate(devices):
    device_name = info['name']
    input_channels = info['max_input_channels']
    print(f"设备 ID {index}: {device_name} - 输入通道: {input_channels}")

# Case-insensitive match on the device name only.
wasapi_devices = []
for info in devices:
    if "loopback" in info["name"].lower():
        wasapi_devices.append(info)

if not wasapi_devices:
    print("\n❌ 没有找到 'WASAPI 环回录音' 设备,请尝试手动启用或安装 Virtual Cable。")
else:
    print("\n✅ 发现 WASAPI '环回录音' 设备:")
    for info in wasapi_devices:
        print(f" - {info['name']}")
|
||||
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,181 @@
|
|||
import pyaudio
|
||||
import wave
|
||||
import numpy as np
|
||||
import time
|
||||
from pydub import AudioSegment
|
||||
|
||||
# Recording parameters
FORMAT = pyaudio.paFloat32  # 32-bit float sample format
CHANNELS = 1  # number of input channels opened on the stream
RATE = 44100  # sample rate passed to the stream and written to the WAV header
CHUNK = 1024  # frames per buffer / per stream.read() call
OUTPUT_FILENAME = "output.wav"  # path of the raw recording
THRESHOLD = 0.008  # peak-amplitude threshold, scaled for paFloat32 (range [-1.0, 1.0])
SILENCE_DURATION = 2  # seconds of continuous silence that end the recording
|
||||
|
||||
def list_audio_devices():
    """Print a table of input-capable audio devices and return them.

    Every PyAudio device that reports at least one input channel is printed
    with a type label guessed from substrings of its lower-cased name.

    Returns:
        list[tuple[int, str, str]]: ``(device_index, device_name,
        device_type_label)`` for each input-capable device, in index order.
    """
    audio = pyaudio.PyAudio()
    device_info = []

    try:
        print("\n🎤 **可用的录音设备列表**\n")
        print(f"{'ID':<5}{'设备名称':<35}{'输入通道数':<15}{'设备类型'}")
        print("=" * 80)

        for i in range(audio.get_device_count()):
            dev = audio.get_device_info_by_index(i)
            channels = dev['maxInputChannels']
            if channels <= 0:
                continue  # no input channels: cannot be recorded from

            # Classify by name; the first matching rule wins.
            name = dev['name'].lower()
            if "stereo mix" in name or "what you hear" in name:
                device_type = "✅ 立体声混音(推荐)"
            elif "loopback" in name:
                device_type = "🔄 环回录音(可选)"
            elif "mic" in name:  # also covers names containing "microphone"
                device_type = "🎤 麦克风(不推荐)"
            else:
                device_type = "🎧 其他音频设备"

            print(f"{i:<5}{dev['name']:<35}{channels:<15}{device_type}")
            device_info.append((i, dev['name'], device_type))
    finally:
        # Release PortAudio even if device enumeration raised.
        audio.terminate()

    print("\n✅ **如果 '立体声混音' 存在,优先使用它**,否则尝试 '环回' 或手动选择。\n")
    return device_info
|
||||
|
||||
def get_best_device():
    """Choose the input device to record from, preferring system-audio sources.

    Selection order among devices with at least one input channel:

    1. The first device whose name contains "stereo mix" or "what you hear"
       (the scan stops as soon as one is found).
    2. Otherwise the first device whose name contains "loopback".
    3. Otherwise the first device whose name does not contain "mic".
    4. Otherwise the device table is printed and the user is asked to type
       a device ID.

    Returns:
        int: The chosen PyAudio device index. A manually entered ID is
        returned as typed, without checking that such a device exists.

    Raises:
        RuntimeError: If the manually entered device ID is not an integer.
    """
    audio = pyaudio.PyAudio()
    best_device = None
    fallback_device = None

    print("\n🔍 **正在尝试自动选择最佳录音设备...**")

    try:
        for i in range(audio.get_device_count()):
            dev = audio.get_device_info_by_index(i)
            if dev['maxInputChannels'] <= 0:
                continue  # no input channels: cannot be recorded from

            name = dev['name'].lower()

            # First choice: stereo mix. Nothing can beat it, so stop scanning.
            if "stereo mix" in name or "what you hear" in name:
                print(f"✅ 选择设备: {dev['name']} (ID: {i}) - 立体声混音(最佳)")
                best_device = i
                break

            # Second choice: the first loopback device seen so far.
            if "loopback" in name and best_device is None:
                print(f"🔄 选择设备: {dev['name']} (ID: {i}) - 环回录音")
                best_device = i

            # Last resort: the first device that does not look like a
            # microphone ("mic" also covers names containing "microphone").
            if "mic" not in name and fallback_device is None:
                fallback_device = i
    finally:
        # Release PortAudio even if device enumeration raised.
        audio.terminate()

    if best_device is not None:
        return best_device

    if fallback_device is not None:
        print(f"⚠️ 没有找到 '立体声混音',使用默认设备 (ID: {fallback_device})")
        return fallback_device

    print("\n❌ 没有检测到合适的设备,请手动选择一个设备 ID:")
    list_audio_devices()  # called only for the table it prints
    selected_device = input("🔹 请输入你要使用的设备 ID(数字): ")
    try:
        return int(selected_device)
    except ValueError as err:
        raise RuntimeError("❌ 设备 ID 无效,程序终止!") from err
|
||||
|
||||
# Print the device table, then pick the device to record from.
device_list = list_audio_devices()
device_index = get_best_device()

# Gain applied to every captured chunk before it is stored.
VOLUME_BOOST = 5.0


def _boost(raw_chunk):
    """Return the float32 samples in ``raw_chunk`` amplified and clipped to [-1, 1]."""
    samples = np.frombuffer(raw_chunk, dtype=np.float32)
    return np.clip(samples * VOLUME_BOOST, -1.0, 1.0)


# The PyAudio handle and the pydub segment use separate names so that
# clean-up always acts on the PyAudio object.
audio = pyaudio.PyAudio()
stream = None
try:
    try:
        open_kwargs = dict(format=FORMAT, channels=CHANNELS, rate=RATE,
                           input=True, frames_per_buffer=CHUNK)
        if device_index is not None:
            open_kwargs["input_device_index"] = device_index
        stream = audio.open(**open_kwargs)

        print("🎤 只录制系统声音,麦克风已禁用!")
        print("🎤 等待声音触发录音...")

        # Wait until a chunk's peak amplitude exceeds the trigger threshold.
        while True:
            data = stream.read(CHUNK, exception_on_overflow=False)
            audio_data = np.frombuffer(data, dtype=np.float32)
            volume = np.max(np.abs(audio_data))
            if volume > THRESHOLD:
                print("🎙 检测到声音,开始录音...")
                break

        # Keep the triggering chunk, boosted like every later chunk so the
        # recording has no gain jump after the first buffer.
        frames = [_boost(data)]
        silent_start = None

        # Record until the signal stays below the threshold long enough.
        while True:
            data = stream.read(CHUNK, exception_on_overflow=False)
            audio_data = _boost(data)
            frames.append(audio_data)

            # NOTE(review): silence is measured on the boosted samples, while
            # the trigger above uses raw samples — confirm this is intended.
            volume = np.max(np.abs(audio_data))
            if volume < THRESHOLD:
                if silent_start is None:
                    silent_start = time.time()
                elif time.time() - silent_start >= SILENCE_DURATION:
                    print("🤫 检测到静音超过 2 秒,停止录音...")
                    break
            else:
                silent_start = None

        print("🎼 录音结束,正在保存文件...")
    finally:
        # Always release the stream and PortAudio, on success or failure.
        try:
            if stream is not None:
                stream.stop_stream()
                stream.close()
        finally:
            audio.terminate()

    # Convert the float32 samples (already clipped to [-1, 1]) to 16-bit PCM.
    pcm = (np.concatenate(frames) * 32767).astype(np.int16)

    # Save as WAV; the context manager closes the file even on error.
    with wave.open(OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(2)  # 16-bit PCM
        wf.setframerate(RATE)
        wf.writeframes(pcm.tobytes())

    print(f"✅ 录音已保存为 {OUTPUT_FILENAME}")

    # Raise the average loudness to 0 dBFS.
    # NOTE(review): dBFS is an average level, so this gain can clip peaks;
    # peak-based normalization would avoid that — confirm which is wanted.
    recorded = AudioSegment.from_wav(OUTPUT_FILENAME)
    normalized_audio = recorded.apply_gain(-recorded.dBFS)
    normalized_audio.export("output_loud.wav", format="wav")

    print("✅ 录音音量已调整,保存为 output_loud.wav")

except Exception as e:
    # Top-level boundary of the script: report the failure. The audio
    # resources were already released by the inner ``finally``.
    print(f"❌ 录音失败: {e}")
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
import torch
|
||||
from TTS.tts.models.xtts import XttsAudioConfig
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
from TTS.config.shared_configs import BaseDatasetConfig
|
||||
from TTS.tts.models.xtts import XttsArgs
|
||||
|
||||
# Register the XTTS configuration classes as safe globals for PyTorch
# deserialization (XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs).
_XTTS_CHECKPOINT_CLASSES = [XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs]
torch.serialization.add_safe_globals(_XTTS_CHECKPOINT_CLASSES)
|
||||
|
||||
def load_fsspec_fixed(*args, **kwargs):
    """Call ``torch.load`` with ``weights_only`` forced to ``False``.

    Written to work around an ``UnpicklingError`` when loading checkpoints.

    Args:
        *args: Positional arguments passed straight through to ``torch.load``.
        **kwargs: Keyword arguments for ``torch.load``. ``cache`` is dropped
            to avoid an error, and ``weights_only`` is always set to ``False``.

    Returns:
        The object returned by ``torch.load``.
    """
    # SECURITY: weights_only=False lets the checkpoint unpickle arbitrary
    # objects, which can execute code. Only load checkpoints you trust.
    forwarded = {key: value for key, value in kwargs.items() if key != "cache"}
    forwarded["weights_only"] = False
    return torch.load(*args, **forwarded)
|
||||
|
||||
|
||||
# Override Coqui TTS's `load_fsspec` with the patched loader.
# NOTE(review): any module that already executed
# `from TTS.utils.io import load_fsspec` before this assignment keeps its own
# reference to the original function — confirm that the patch actually
# reaches the XTTS checkpoint-loading path.
import TTS.utils.io
TTS.utils.io.load_fsspec = load_fsspec_fixed
|
||||
|
||||
from TTS.api import TTS
|
||||
|
||||
from pathlib import Path

# XTTS v2: multilingual model.
XTTS_MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"

# VITS: single-language (English, LJSpeech) model.
VITS_MODEL = "tts_models/en/ljspeech/vits"

# Reference audio for XTTS; must be supplied by the user.
speaker_wav_en = "example_speaker_en.wav"

# Fail fast, before the slow model load, if the reference audio is missing.
if not Path(speaker_wav_en).exists():
    raise FileNotFoundError(f"错误:找不到 {speaker_wav_en},XTTS v2 需要一个参考音频!")

# Text to synthesize: English for VITS, Chinese for XTTS.
text_en = "This is a high-quality text-to-speech conversion using XTTS v2 and VITS."
text = "记者从越秀区了解到,广州博物馆隆重推出“吉祥有年——广州博物馆藏吉祥文物展”及“吉祥有年·潮派趁墟”主题新春市集,让市民群众在探秘文物珍宝的同时,也能解锁非遗新体验、品尝地道广府味,一起喜迎吉祥乙巳蛇年。"

# Load XTTS v2.
print("🚀 正在加载 XTTS v2 模型,请稍候...")
tts_xtts = TTS(XTTS_MODEL)

# Synthesize the Chinese text with XTTS v2.
tts_xtts.tts_to_file(
    text=text,
    file_path="output_xtts.wav",
    speaker_wav=speaker_wav_en,  # reference audio used to clone the voice
    language="zh-cn",  # language code; must match the language of `text`
    split_sentences=True,  # let the library split long text into sentences
)
print("✅ XTTS v2 语音合成完成!已保存到 output_xtts.wav 🎵")

# Load VITS.
print("🚀 正在加载 VITS 模型,请稍候...")
tts_vits = TTS(VITS_MODEL)

# Synthesize the English text with VITS.
tts_vits.tts_to_file(
    text=text_en,
    split_sentences=True,
    file_path="output_vits.wav",
)
print("✅ VITS 语音合成完成!已保存到 output_vits.wav 🎵")
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
from TTS.utils.manage import ModelManager
|
||||
|
||||
# Fetch the list of models known to Coqui TTS's ModelManager, then print it
# one entry per line under a header.
models = ModelManager().list_models()

print("✅ 可用的 TTS 预训练模型列表:")
for entry in models:
    print(entry)
|
||||
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,16 @@
|
|||
import sounddevice as sd
|
||||
import wavio
|
||||
|
||||
# Recording parameters
duration = 5  # seconds to record
samplerate = 22050  # sample rate (noted by the author as XTTS v2-compatible)
filename = "example_speaker.wav"

# Capture mono 16-bit audio and block until the recording has finished.
print("开始录音,说话吧...")
frame_count = int(duration * samplerate)
audio = sd.rec(frame_count, samplerate=samplerate, channels=1, dtype='int16')
sd.wait()
print("录音完成,保存中...")

# Write the samples to a WAV file with 2 bytes per sample.
wavio.write(filename, audio, samplerate, sampwidth=2)
print(f"录音已保存为 {filename}")
|
||||
Loading…
Reference in New Issue