Initial commit

This commit is contained in:
hailin 2025-02-03 14:31:26 +08:00
commit 54cb82f1b9
13 changed files with 347 additions and 0 deletions

BIN
Nova.wav Normal file

Binary file not shown.

BIN
Onyx.wav Normal file

Binary file not shown.

3
checktorchgpu.py Normal file
View File

@ -0,0 +1,3 @@
import torch
# Ask PyTorch once whether CUDA is usable and reuse the answer for both lines.
gpu_available = torch.cuda.is_available()
print("Torch 是否使用 GPU:", gpu_available)

# Report the name of CUDA device 0 when available, otherwise the literal "CPU".
if gpu_available:
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "CPU"
print("当前使用的 GPU:", device_label)

58
coquitts.py Normal file
View File

@ -0,0 +1,58 @@
import torch
from TTS.tts.models.xtts import XttsAudioConfig # 允许 XttsAudioConfig
from TTS.tts.configs.xtts_config import XttsConfig # 允许 XttsConfig
from TTS.config.shared_configs import BaseDatasetConfig # 允许 BaseDatasetConfig
from TTS.tts.models.xtts import XttsArgs # 允许 XttsArgs
# Register `XttsConfig`, `XttsAudioConfig`, `BaseDatasetConfig` and `XttsArgs`
# as PyTorch "safe globals", i.e. classes the restricted (weights-only)
# unpickler is allowed to reconstruct.
# NOTE(review): presumably needed because the XTTS checkpoint pickles these
# config classes -- confirm against the checkpoint being loaded.
torch.serialization.add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
# Force `weights_only=False` for every checkpoint load.
def load_fsspec_fixed(*args, **kwargs):
    """Load a checkpoint with ``torch.load``, always using ``weights_only=False``.

    Intended as a drop-in replacement for ``TTS.utils.io.load_fsspec``.

    Args:
        *args: Positional arguments forwarded unchanged to ``torch.load``
            (typically the checkpoint path or file object).
        **kwargs: Keyword arguments forwarded to ``torch.load``. A ``cache``
            keyword is discarded because ``torch.load`` does not accept it;
            any caller-supplied ``weights_only`` value is overridden.

    Returns:
        Whatever ``torch.load`` returns for the given checkpoint.
    """
    # `cache` belongs to the function being replaced, not to torch.load;
    # forwarding it would make the load fail.
    kwargs.pop("cache", None)
    # SECURITY: weights_only=False enables full pickle deserialization, which
    # can execute arbitrary code. Only load checkpoints from trusted sources.
    kwargs["weights_only"] = False
    return torch.load(*args, **kwargs)
# Replace Coqui TTS's `load_fsspec` with the patched loader defined above.
# NOTE(review): modules that ran `from TTS.utils.io import load_fsspec` before
# this assignment keep their own reference -- confirm the patch reaches the
# XTTS loading path.
import TTS.utils.io
TTS.utils.io.load_fsspec = load_fsspec_fixed
# This import rebinds the name `TTS` from the package to the API class, so it
# has to stay after the patch above.
from TTS.api import TTS
import os
from pydub import AudioSegment  # used to concatenate the generated clips

# XTTS v2 multilingual model.
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# Load the pretrained model.
print("正在加载 TTS 预训练模型,请稍候...")
tts = TTS(MODEL_NAME)

# Reference recordings passed as `speaker_wav`; both must be supplied by the user.
speaker_wav_zh = "example_speaker_zh.wav"
speaker_wav_en = "example_speaker_en.wav"

# Both reference files are used below, so verify both (the original only
# checked the Chinese one and failed later inside synthesis for the other).
for required_wav in (speaker_wav_zh, speaker_wav_en):
    if not os.path.exists(required_wav):
        raise FileNotFoundError(f"错误:找不到 {required_wav}XTTS v2 需要一个参考音频!")

# The sentence is split by language so each part is synthesized with the
# matching language code and reference speaker.
text_cn1 = "你好,欢迎使用"
text_en = "Coqui TTS"
text_cn2 = "进行中文语音合成!"
test_all = "Not long ago, my colleague Carlos Fenollosa made a bold claim in his book La singularidad: ChatGPT is AGI. My initial reaction was immediate dismissal. But knowing his careful reasoning, I kept reading. As I followed his argument, something unexpected happened — I found myself agreeing, though from an entirely different perspective. What I discovered challenges both those waiting for AGIs arrival and those claiming its nowhere near: we might be asking the wrong question altogether."

# (text, output file, reference speaker, language code) for each clip, in
# playback order.
segments = [
    (text_cn1, "part1.wav", speaker_wav_zh, "zh-cn"),
    (text_en, "part2.wav", speaker_wav_en, "en"),
    (text_cn2, "part3.wav", speaker_wav_zh, "zh-cn"),
]

# Synthesize each clip to its own temporary wav file.
for segment_text, segment_path, segment_speaker, segment_language in segments:
    tts.tts_to_file(text=segment_text, file_path=segment_path,
                    speaker_wav=segment_speaker, language=segment_language)

# Optional long-form English test, disabled by default:
# tts.tts_to_file(text=test_all, file_path="all.wav", speaker_wav=speaker_wav_en, language="en")

# Concatenate the clips in order and write the combined result.
clips = [AudioSegment.from_wav(segment_path) for _, segment_path, _, _ in segments]
final_audio = clips[0]
for clip in clips[1:]:
    final_audio = final_audio + clip
final_audio.export("output.wav", format="wav")

print("✅ 语音合成完成!已保存到 output.wav")

16
detectdevice.py Normal file
View File

@ -0,0 +1,16 @@
import sounddevice as sd
print("🎤 正在查询可用的音频设备...\n")

devices = sd.query_devices()

# Print every device with its index and number of input channels.
for i, device in enumerate(devices):
    print(f"设备 ID {i}: {device['name']} - 输入通道: {device['max_input_channels']}")

# Collect the devices whose name contains "loopback" (case-insensitive); the
# script treats those as WASAPI loopback-capture devices.
wasapi_devices = []
for candidate in devices:
    if "loopback" in candidate["name"].lower():
        wasapi_devices.append(candidate)

if not wasapi_devices:
    print("\n❌ 没有找到 'WASAPI 环回录音' 设备,请尝试手动启用或安装 Virtual Cable。")
else:
    print("\n✅ 发现 WASAPI '环回录音' 设备:")
    for d in wasapi_devices:
        print(f" - {d['name']}")

BIN
example_speaker_en.wav Normal file

Binary file not shown.

BIN
example_speaker_zh.wav Normal file

Binary file not shown.

181
fromsoundcardpf32.py Normal file
View File

@ -0,0 +1,181 @@
import pyaudio
import wave
import numpy as np
import time
from pydub import AudioSegment
# Recording parameters shared by the capture loop and the WAV writer below.
FORMAT = pyaudio.paFloat32 # capture samples as 32-bit floats
CHANNELS = 1 # channel count used for both capture and the saved WAV
RATE = 44100 # sample rate passed to the stream and written to the WAV header
CHUNK = 1024 # frames requested per stream.read() call
OUTPUT_FILENAME = "output.wav" # destination of the raw (pre-normalization) recording
THRESHOLD = 0.008 # peak-amplitude threshold; tuned for paFloat32, whose range is [-1.0, 1.0]
SILENCE_DURATION = 2 # seconds of continuous silence that end the recording
# List every capture-capable device and label its likely purpose.
def list_audio_devices():
    """Print a table of all devices that expose input channels.

    Each device is labelled by substring matching on its lower-cased name:
    "stereo mix" / "what you hear", "loopback", "mic" / "microphone", or
    "other".

    Returns:
        list[tuple[int, str, str]]: ``(device index, device name, label)``
        for every device with at least one input channel.
    """
    audio = pyaudio.PyAudio()
    device_info = []
    print("\n🎤 **可用的录音设备列表**\n")
    print(f"{'ID':<5}{'设备名称':<35}{'输入通道数':<15}{'设备类型'}")
    print("="*80)
    try:
        for i in range(audio.get_device_count()):
            dev = audio.get_device_info_by_index(i)
            name = dev['name'].lower()
            channels = dev['maxInputChannels']
            # Output-only devices cannot be recorded from; skip them.
            if channels <= 0:
                continue
            # Classify the device by its name.
            if "stereo mix" in name or "what you hear" in name:
                device_type = "✅ 立体声混音(推荐)"
            elif "loopback" in name:
                device_type = "🔄 环回录音(可选)"
            elif "mic" in name or "microphone" in name:
                device_type = "🎤 麦克风(不推荐)"
            else:
                device_type = "🎧 其他音频设备"
            print(f"{i:<5}{dev['name']:<35}{channels:<15}{device_type}")
            device_info.append((i, dev['name'], device_type))
    finally:
        # Release the PyAudio instance even if enumeration fails part-way.
        audio.terminate()
    print("\n✅ **如果 '立体声混音' 存在,优先使用它**,否则尝试 '环回' 或手动选择。\n")
    return device_info
# Automatically choose the most suitable capture device.
def get_best_device():
    """Pick the input device index to record from.

    Selection order among devices with input channels, matched on the
    lower-cased device name:

    1. The first device containing "stereo mix" or "what you hear".
    2. Otherwise the first device containing "loopback".
    3. Otherwise the first device whose name contains neither "mic" nor
       "microphone".
    4. Otherwise the device table is printed and the user is asked to type
       a device ID.

    Returns:
        int: The chosen device index.

    Raises:
        RuntimeError: If the manually entered device ID is not an integer.
    """
    audio = pyaudio.PyAudio()
    best_device = None
    fallback_device = None
    print("\n🔍 **正在尝试自动选择最佳录音设备...**")
    try:
        for i in range(audio.get_device_count()):
            dev = audio.get_device_info_by_index(i)
            name = dev['name'].lower()
            channels = dev['maxInputChannels']
            if channels <= 0:
                continue
            # First choice: a "stereo mix" style device; stop searching.
            if "stereo mix" in name or "what you hear" in name:
                print(f"✅ 选择设备: {dev['name']} (ID: {i}) - 立体声混音(最佳)")
                best_device = i
                break
            # Second choice: the first loopback device. Keep scanning in case
            # a stereo-mix device appears later in the list.
            if "loopback" in name and best_device is None:
                print(f"🔄 选择设备: {dev['name']} (ID: {i}) - 环回录音")
                best_device = i
            # Last resort: remember the first device that is not a microphone.
            if "mic" not in name and "microphone" not in name and fallback_device is None:
                fallback_device = i
    finally:
        # Release the PyAudio instance even if enumeration fails part-way.
        audio.terminate()

    if best_device is not None:
        return best_device
    if fallback_device is not None:
        print(f"⚠️ 没有找到 '立体声混音',使用默认设备 (ID: {fallback_device})")
        return fallback_device

    # Nothing suitable was detected: show the table and ask the user.
    print("\n❌ 没有检测到合适的设备,请手动选择一个设备 ID")
    list_audio_devices()
    selected_device = input("🔹 请输入你要使用的设备 ID数字: ")
    try:
        return int(selected_device)
    except ValueError as err:
        raise RuntimeError("❌ 设备 ID 无效,程序终止!") from err
# Print the table of capture-capable devices.
device_list = list_audio_devices()

# Choose the device to record from.
device_index = get_best_device()

# Initialize PyAudio for the recording session.
audio = pyaudio.PyAudio()
stream = None
try:
    try:
        # `input_device_index=None` selects PyAudio's default input device, so
        # a single call covers both the explicit and the default case.
        stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                            input_device_index=device_index,
                            frames_per_buffer=CHUNK)
        print("🎤 只录制系统声音,麦克风已禁用!")
        print("🎤 等待声音触发录音...")

        # Wait until one chunk's peak amplitude exceeds THRESHOLD.
        while True:
            data = stream.read(CHUNK, exception_on_overflow=False)
            audio_data = np.frombuffer(data, dtype=np.float32)  # 32-bit float samples
            volume = np.max(np.abs(audio_data))  # peak amplitude of this chunk
            if volume > THRESHOLD:
                print("🎙 检测到声音,开始录音...")
                break

        # NOTE(review): the triggering chunk is stored without the gain that
        # every later chunk receives -- confirm whether that is intended.
        frames = [data]
        silent_start = None
        volume_boost = 5.0  # gain applied to every chunk recorded from here on

        # Record until the signal stays below THRESHOLD for SILENCE_DURATION seconds.
        while True:
            data = stream.read(CHUNK, exception_on_overflow=False)
            audio_data = np.frombuffer(data, dtype=np.float32)
            # Amplify, then clamp to the valid float range to avoid overflow.
            audio_data = np.clip(audio_data * volume_boost, -1.0, 1.0)
            frames.append(audio_data.tobytes())
            # NOTE(review): silence is measured on the amplified signal, so
            # the effective stop threshold is THRESHOLD / volume_boost.
            volume = np.max(np.abs(audio_data))
            if volume < THRESHOLD:
                if silent_start is None:
                    silent_start = time.time()
                elif time.time() - silent_start >= SILENCE_DURATION:
                    print("🤫 检测到静音超过 2 秒,停止录音...")
                    break
            else:
                silent_start = None

        print("🎼 录音结束,正在保存文件...")
    finally:
        # Release the stream and PortAudio exactly once, on success and on error.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()

    # Convert the float32 recording to 16-bit PCM for a standard WAV file.
    float_samples = np.frombuffer(b''.join(frames), dtype=np.float32)
    pcm_samples = np.int16(float_samples * 32767)

    # Save as WAV; the context manager closes the file even if a write fails.
    with wave.open(OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(2)  # 16-bit PCM
        wf.setframerate(RATE)
        wf.writeframes(pcm_samples.tobytes())
    print(f"✅ 录音已保存为 {OUTPUT_FILENAME}")

    # Loudness adjustment. A separate name is used so the PyAudio handle in
    # `audio` is not shadowed by the AudioSegment.
    # NOTE(review): this applies the negated `dBFS` of the recording as gain;
    # if that is an average level, peaks may clip -- confirm against pydub docs.
    recorded_segment = AudioSegment.from_wav(OUTPUT_FILENAME)
    normalized_audio = recorded_segment.apply_gain(-recorded_segment.dBFS)
    normalized_audio.export("output_loud.wav", format="wav")
    print("✅ 录音音量已调整,保存为 output_loud.wav")
except Exception as e:
    # Top-level boundary of the script: report the failure instead of a traceback.
    print(f"❌ 录音失败: {e}")

64
hifi.py Normal file
View File

@ -0,0 +1,64 @@
import torch
from TTS.tts.models.xtts import XttsAudioConfig
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.models.xtts import XttsArgs
# Register the XTTS config/argument classes as PyTorch "safe globals", i.e.
# classes the restricted (weights-only) unpickler is allowed to reconstruct.
# NOTE(review): presumably needed because the XTTS checkpoint pickles these
# classes -- confirm against the checkpoint being loaded.
torch.serialization.add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
# Force `weights_only=False` to work around the UnpicklingError.
def load_fsspec_fixed(*args, **kwargs):
    """Call ``torch.load`` with ``cache`` stripped and ``weights_only=False``.

    All positional arguments and the remaining keyword arguments are passed
    through unchanged; the return value is whatever ``torch.load`` returns.
    """
    # `cache` is dropped because passing it on causes an error.
    forwarded = {key: value for key, value in kwargs.items() if key != "cache"}
    # Always override the caller's choice: full unpickling is required here.
    forwarded["weights_only"] = False
    return torch.load(*args, **forwarded)
# Replace Coqui TTS's `load_fsspec` with the patched loader defined above.
import TTS.utils.io
TTS.utils.io.load_fsspec = load_fsspec_fixed
from TTS.api import TTS

# Model identifiers: XTTS v2 (multilingual) and VITS (single-language English).
XTTS_MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"
VITS_MODEL = "tts_models/en/ljspeech/vits"

# Reference recording handed to XTTS as `speaker_wav`; must be supplied by the user.
speaker_wav_en = "example_speaker_en.wav"

# Texts to synthesize.
text_en = "This is a high-quality text-to-speech conversion using XTTS v2 and VITS."
text = "记者从越秀区了解到,广州博物馆隆重推出“吉祥有年——广州博物馆藏吉祥文物展”及“吉祥有年·潮派趁墟”主题新春市集,让市民群众在探秘文物珍宝的同时,也能解锁非遗新体验、品尝地道广府味,一起喜迎吉祥乙巳蛇年。"


def _synthesize_to_file(model_name, loading_banner, done_banner, **synthesis_kwargs):
    """Announce, load *model_name*, run ``tts_to_file``, announce completion.

    Returns the loaded TTS object so the caller can keep a reference to it.
    """
    print(loading_banner)
    engine = TTS(model_name)
    engine.tts_to_file(**synthesis_kwargs)
    print(done_banner)
    return engine


# XTTS v2: synthesize the Chinese text using the reference speaker recording.
tts_xtts = _synthesize_to_file(
    XTTS_MODEL,
    "🚀 正在加载 XTTS v2 模型,请稍候...",
    "✅ XTTS v2 语音合成完成!已保存到 output_xtts.wav 🎵",
    text=text,
    file_path="output_xtts.wav",
    speaker_wav=speaker_wav_en,  # reference audio for the voice
    language="zh-cn",  # language code; must match the language of `text`
    split_sentences=True,  # let the library split long text into sentences
)

# VITS: synthesize the English text with the single-speaker model.
tts_vits = _synthesize_to_file(
    VITS_MODEL,
    "🚀 正在加载 VITS 模型,请稍候...",
    "✅ VITS 语音合成完成!已保存到 output_vits.wav 🎵",
    text=text_en,
    split_sentences=True,
    file_path="output_vits.wav",
)

9
list.py Normal file
View File

@ -0,0 +1,9 @@
from TTS.utils.manage import ModelManager
# Ask the Coqui TTS model manager for its list of models.
manager = ModelManager()
models = manager.list_models()

# Print a header followed by one entry per line.
print("✅ 可用的 TTS 预训练模型列表:")
for model in models:
    print(model)

BIN
output_vits.wav Normal file

Binary file not shown.

BIN
output_xtts.wav Normal file

Binary file not shown.

16
record.py Normal file
View File

@ -0,0 +1,16 @@
import sounddevice as sd
import wavio
# Recording settings.
filename = "example_speaker.wav"
samplerate = 22050  # sample rate in Hz (chosen by the author as XTTS v2 compatible)
duration = 5  # length of the recording in seconds

# Total number of frames to capture for the requested duration.
frame_count = int(duration * samplerate)

print("开始录音,说话吧...")
audio = sd.rec(frame_count, samplerate=samplerate, channels=1, dtype='int16')
sd.wait()  # block until the capture has finished

print("录音完成,保存中...")

# Write the captured samples as a 16-bit (2 bytes per sample) WAV file.
wavio.write(filename, audio, samplerate, sampwidth=2)

print(f"录音已保存为 {filename}")