commit 54cb82f1b947446045046bcd28bca3e8f7dc5db4
Author: hailin
Date:   Mon Feb 3 14:31:26 2025 +0800

    Initial commit

diff --git a/Nova.wav b/Nova.wav
new file mode 100644
index 0000000..337e39b
Binary files /dev/null and b/Nova.wav differ
diff --git a/Onyx.wav b/Onyx.wav
new file mode 100644
index 0000000..d522942
Binary files /dev/null and b/Onyx.wav differ
diff --git a/checktorchgpu.py b/checktorchgpu.py
new file mode 100644
index 0000000..d6f9519
--- /dev/null
+++ b/checktorchgpu.py
@@ -0,0 +1,3 @@
+import torch
+print("Is Torch using the GPU:", torch.cuda.is_available())
+print("Currently active GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
diff --git a/coquitts.py b/coquitts.py
new file mode 100644
index 0000000..c7d4b6b
--- /dev/null
+++ b/coquitts.py
@@ -0,0 +1,58 @@
+import torch
+from TTS.tts.models.xtts import XttsAudioConfig  # allowlist XttsAudioConfig
+from TTS.tts.configs.xtts_config import XttsConfig  # allowlist XttsConfig
+from TTS.config.shared_configs import BaseDatasetConfig  # allowlist BaseDatasetConfig
+from TTS.tts.models.xtts import XttsArgs  # allowlist XttsArgs
+
+# **Add `XttsConfig`, `XttsAudioConfig`, `BaseDatasetConfig`, and `XttsArgs` to PyTorch's safe globals**
+torch.serialization.add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
+
+# **Force `weights_only=False`**
+def load_fsspec_fixed(*args, **kwargs):
+    kwargs["weights_only"] = False  # the key fix
+    return torch.load(*args, **kwargs)
+
+# **Override Coqui TTS's `load_fsspec`**
+import TTS.utils.io
+TTS.utils.io.load_fsspec = load_fsspec_fixed
+
+from TTS.api import TTS
+import os
+from pydub import AudioSegment  # used to concatenate the audio clips
+
+# Pick the XTTS v2 multilingual model
+MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
+
+# Load the TTS model
+print("Loading the pretrained TTS model, please wait...")
+tts = TTS(MODEL_NAME)
+
+# **Specify the reference audio**
+speaker_wav_zh = "example_speaker_zh.wav"  # you must provide this audio
+speaker_wav_en = "example_speaker_en.wav"  # you must provide this audio
+
+# **Make sure the `speaker_wav` exists**
+if not os.path.exists(speaker_wav_zh):
+    raise FileNotFoundError(f"Error: {speaker_wav_zh} not found; XTTS v2 needs a reference audio!")
+
+# **Split the text** (the Chinese strings are the zh-cn synthesis input and stay in Chinese)
+text_cn1 = "你好,欢迎使用"
+text_en = "Coqui TTS"
+text_cn2 = "进行中文语音合成!"
+test_all = "Not long ago, my colleague Carlos Fenollosa made a bold claim in his book ‘La singularidad’: ChatGPT is AGI. My initial reaction was immediate dismissal. But knowing his careful reasoning, I kept reading. As I followed his argument, something unexpected happened — I found myself agreeing, though from an entirely different perspective. What I discovered challenges both those waiting for AGI’s arrival and those claiming it’s nowhere near: we might be asking the wrong question altogether."
+
+# Generate the temporary audio files
+tts.tts_to_file(text=text_cn1, file_path="part1.wav", speaker_wav=speaker_wav_zh, language="zh-cn")
+tts.tts_to_file(text=text_en, file_path="part2.wav", speaker_wav=speaker_wav_en, language="en")
+tts.tts_to_file(text=text_cn2, file_path="part3.wav", speaker_wav=speaker_wav_zh, language="zh-cn")
+# tts.tts_to_file(text=test_all, file_path="all.wav", speaker_wav=speaker_wav_en, language="en")
+
+# **Concatenate the clips**
+audio1 = AudioSegment.from_wav("part1.wav")
+audio2 = AudioSegment.from_wav("part2.wav")
+audio3 = AudioSegment.from_wav("part3.wav")
+
+final_audio = audio1 + audio2 + audio3
+final_audio.export("output.wav", format="wav")
+
+print("✅ Speech synthesis complete! Saved to output.wav")
diff --git a/detectdevice.py b/detectdevice.py
new file mode 100644
index 0000000..76b62d3
--- /dev/null
+++ b/detectdevice.py
@@ -0,0 +1,16 @@
+import sounddevice as sd
+
+print("🎤 Querying available audio devices...\n")
+devices = sd.query_devices()
+
+for i, device in enumerate(devices):
+    print(f"Device ID {i}: {device['name']} - input channels: {device['max_input_channels']}")
+
+# Only show devices that support WASAPI loopback recording
+wasapi_devices = [d for d in devices if "loopback" in d["name"].lower()]
+if wasapi_devices:
+    print("\n✅ Found WASAPI loopback devices:")
+    for d in wasapi_devices:
+        print(f" - {d['name']}")
+else:
+    print("\n❌ No WASAPI loopback device found; try enabling one manually or installing Virtual Cable.")
\ No newline at end of file
diff --git a/example_speaker_en.wav b/example_speaker_en.wav
new file mode 100644
index 0000000..337e39b
Binary files /dev/null and b/example_speaker_en.wav differ
diff --git a/example_speaker_zh.wav b/example_speaker_zh.wav
new file mode 100644
index 0000000..337e39b
Binary files /dev/null and b/example_speaker_zh.wav differ
diff --git a/fromsoundcardpf32.py b/fromsoundcardpf32.py
new file mode 100644
index 0000000..b383f64
--- /dev/null
+++ b/fromsoundcardpf32.py
@@ -0,0 +1,181 @@
+import pyaudio
+import wave
+import numpy as np
+import time
+from pydub import AudioSegment
+
+# Recording parameters
+FORMAT = pyaudio.paFloat32  # 32-bit float format
+CHANNELS = 1
+RATE = 44100
+CHUNK = 1024
+OUTPUT_FILENAME = "output.wav"
+THRESHOLD = 0.008  # tuned for paFloat32 (sample range is [-1.0, 1.0])
+SILENCE_DURATION = 2  # silence duration before stopping (seconds)
+
+# **🔍 List all available devices and explain what each is for**
+def list_audio_devices():
+    audio = pyaudio.PyAudio()
+    device_info = []
+
+    print("\n🎤 **Available recording devices**\n")
+    print(f"{'ID':<5}{'Device name':<35}{'Input channels':<15}{'Device type'}")
+    print("=" * 80)
+
+    for i in range(audio.get_device_count()):
+        dev = audio.get_device_info_by_index(i)
+        name = dev['name'].lower()
+        channels = dev['maxInputChannels']
+
+        if channels > 0:
+            # Classify the device
+            if "stereo mix" in name or "what you hear" in name:
+                device_type = "✅ Stereo Mix (recommended)"
+            elif "loopback" in name:
+                device_type = "🔄 Loopback (optional)"
+            elif "mic" in name or "microphone" in name:
+                device_type = "🎤 Microphone (not recommended)"
+            else:
+                device_type = "🎧 Other audio device"
+
+            print(f"{i:<5}{dev['name']:<35}{channels:<15}{device_type}")
+            device_info.append((i, dev['name'], device_type))
+
+    audio.terminate()
+    print("\n✅ **If 'Stereo Mix' exists, prefer it**; otherwise try a loopback device or pick one manually.\n")
+    return device_info
+
+# **🔍 Automatically pick the best recording device**
+def get_best_device():
+    audio = pyaudio.PyAudio()
+    best_device = None
+    fallback_device = None
+
+    print("\n🔍 **Trying to pick the best recording device automatically...**")
+
+    for i in range(audio.get_device_count()):
+        dev = audio.get_device_info_by_index(i)
+        name = dev['name'].lower()
+        channels = dev['maxInputChannels']
+
+        if channels > 0:
+            # **First choice: "Stereo Mix"**
+            if "stereo mix" in name or "what you hear" in name:
+                print(f"✅ Selected device: {dev['name']} (ID: {i}) - Stereo Mix (best)")
+                best_device = i
+                break
+
+            # **Second choice: a loopback device**
+            if "loopback" in name and best_device is None:
+                print(f"🔄 Selected device: {dev['name']} (ID: {i}) - loopback")
+                best_device = i
+
+            # **If there is no Stereo Mix or loopback, remember a usable non-microphone device**
+            if "mic" not in name and "microphone" not in name and fallback_device is None:
+                fallback_device = i
+
+    audio.terminate()
+
+    if best_device is not None:
+        return best_device
+    elif fallback_device is not None:
+        print(f"⚠️ No 'Stereo Mix' found; using fallback device (ID: {fallback_device})")
+        return fallback_device
+    else:
+        print("\n❌ No suitable device detected; please pick a device ID manually:")
+        device_list = list_audio_devices()
+        selected_device = input("🔹 Enter the device ID to use (a number): ")
+        try:
+            selected_device = int(selected_device)
+            return selected_device
+        except ValueError:
+            raise RuntimeError("❌ Invalid device ID, aborting!")

+# **🎤 Print the device list**
+device_list = list_audio_devices()
+
+# **🔧 Pick the best device**
+device_index = get_best_device()
+
+# Initialize PyAudio
+audio = pyaudio.PyAudio()
+try:
+    if device_index is not None:
+        stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
+                            input_device_index=device_index,
+                            frames_per_buffer=CHUNK)
+    else:
+        stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
+                            frames_per_buffer=CHUNK)
+
+    print("🎤 Recording system audio only; the microphone is not used!")
+    print("🎤 Waiting for sound to trigger recording...")
+
+    # Wait for sound to trigger recording
+    while True:
+        data = stream.read(CHUNK, exception_on_overflow=False)
+        audio_data = np.frombuffer(data, dtype=np.float32)  # read 32-bit float samples
+        volume = np.max(np.abs(audio_data))  # peak volume of this chunk
+
+        if volume > THRESHOLD:
+            print("🎙 Sound detected, recording...")
+            break
+
+    frames = [data]  # note: this triggering chunk is stored without the volume boost below
+    silent_start = None
+
+    # Record until more than 2 seconds of silence is detected
+    while True:
+        data = stream.read(CHUNK, exception_on_overflow=False)
+        audio_data = np.frombuffer(data, dtype=np.float32)
+
+        # **Boost the volume**
+        volume_boost = 5.0  # amplify 5x
+        audio_data = np.clip(audio_data * volume_boost, -1.0, 1.0)  # clamp to avoid overflow
+        frames.append(audio_data.tobytes())
+
+        volume = np.max(np.abs(audio_data))  # peak volume of this chunk
+
+        if volume < THRESHOLD:
+            if silent_start is None:
+                silent_start = time.time()
+            elif time.time() - silent_start >= SILENCE_DURATION:
+                print("🤫 More than 2 seconds of silence detected, stopping...")
+                break
+        else:
+            silent_start = None
+
+    print("🎼 Recording finished, saving the file...")
+
+    # Close the stream
+    stream.stop_stream()
+    stream.close()
+    audio.terminate()
+
+    # **Convert the `paFloat32` samples to `paInt16` (standard WAV format)**
+    int_frames = []
+    for frame in frames:
+        float_data = np.frombuffer(frame, dtype=np.float32)  # read float samples
+        int_data = np.int16(float_data * 32767)  # convert to 16-bit PCM
+        int_frames.append(int_data.tobytes())
+
+    # Save as WAV
+    wf = wave.open(OUTPUT_FILENAME, 'wb')
+    wf.setnchannels(CHANNELS)
+    wf.setsampwidth(2)  # 16-bit PCM
+    wf.setframerate(RATE)
+    wf.writeframes(b''.join(int_frames))
+    wf.close()
+
+    print(f"✅ Recording saved as {OUTPUT_FILENAME}")
+
+    # **Automatically normalize the volume** (new name, so the except branch can still terminate PyAudio)
+    recording = AudioSegment.from_wav(OUTPUT_FILENAME)
+    normalized_audio = recording.apply_gain(-recording.dBFS)
+    normalized_audio.export("output_loud.wav", format="wav")
+
+    print("✅ Volume adjusted; saved as output_loud.wav")
+
+except Exception as e:
+    print(f"❌ Recording failed: {e}")
+    audio.terminate()
diff --git a/hifi.py b/hifi.py
new file mode 100644
index 0000000..1b1ef28
--- /dev/null
+++ b/hifi.py
@@ -0,0 +1,64 @@
+import torch
+from TTS.tts.models.xtts import XttsAudioConfig
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.models.xtts import XttsArgs
+
+# **Add `XttsConfig` and the other pickled classes to PyTorch's safe globals**
+torch.serialization.add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
+
+# **Force `weights_only=False` to fix the UnpicklingError**
+# def load_fsspec_fixed(*args, **kwargs):
+#     kwargs["weights_only"] = False  # the key fix
+#     return torch.load(*args, **kwargs)
+
+def load_fsspec_fixed(*args, **kwargs):
+    kwargs.pop("cache", None)  # drop the `cache` argument that torch.load does not accept
+    kwargs["weights_only"] = False  # the key fix
+    return torch.load(*args, **kwargs)
+
+
+# **Override Coqui TTS's `load_fsspec`**
+import TTS.utils.io
+TTS.utils.io.load_fsspec = load_fsspec_fixed
+
+from TTS.api import TTS
+
+# **XTTS v2: high-quality multilingual model**
+XTTS_MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"
+
+# **VITS: high-quality single-language model**
+VITS_MODEL = "tts_models/en/ljspeech/vits"
+
+# **High-quality reference audio**
+speaker_wav_en = "example_speaker_en.wav"  # you must provide this audio, and it should be clean
+
+# **Text to synthesize** (the Chinese string is the zh-cn synthesis input and stays in Chinese)
+text_en = "This is a high-quality text-to-speech conversion using XTTS v2 and VITS."
+text = "记者从越秀区了解到,广州博物馆隆重推出“吉祥有年——广州博物馆藏吉祥文物展”及“吉祥有年·潮派趁墟”主题新春市集,让市民群众在探秘文物珍宝的同时,也能解锁非遗新体验、品尝地道广府味,一起喜迎吉祥乙巳蛇年。"
+
+# **Load XTTS v2**
+print("🚀 Loading the XTTS v2 model, please wait...")
+tts_xtts = TTS(XTTS_MODEL)
+
+# **Synthesize with XTTS v2**
+tts_xtts.tts_to_file(
+    text=text,
+    file_path="output_xtts.wav",
+    speaker_wav=speaker_wav_en,  # reference audio (used to clone the voice)
+    language="zh-cn",  # language code; must match the text language
+    split_sentences=True  # let the model split long text automatically
+)
+print("✅ XTTS v2 synthesis complete! Saved to output_xtts.wav 🎵")
+
+# **Load VITS**
+print("🚀 Loading the VITS model, please wait...")
+tts_vits = TTS(VITS_MODEL)
+
+# **Synthesize with VITS**
+tts_vits.tts_to_file(
+    text=text_en,
+    split_sentences=True,
+    file_path="output_vits.wav"
+)
+print("✅ VITS synthesis complete! Saved to output_vits.wav 🎵")
diff --git a/list.py b/list.py
new file mode 100644
index 0000000..53814d5
--- /dev/null
+++ b/list.py
@@ -0,0 +1,9 @@
+from TTS.utils.manage import ModelManager
+
+# Fetch the official Coqui TTS model list
+manager = ModelManager()
+models = manager.list_models()
+
+print("✅ Available pretrained TTS models:")
+for model in models:
+    print(model)
\ No newline at end of file
diff --git a/output_vits.wav b/output_vits.wav
new file mode 100644
index 0000000..bed42df
Binary files /dev/null and b/output_vits.wav differ
diff --git a/output_xtts.wav b/output_xtts.wav
new file mode 100644
index 0000000..373c7c5
Binary files /dev/null and b/output_xtts.wav differ
diff --git a/record.py b/record.py
new file mode 100644
index 0000000..6f45d99
--- /dev/null
+++ b/record.py
@@ -0,0 +1,16 @@
+import sounddevice as sd
+import wavio
+
+# Recording parameters
+duration = 5  # record for 5 seconds
+samplerate = 22050  # sample rate (compatible with XTTS v2)
+filename = "example_speaker.wav"
+
+print("Recording started, start speaking...")
+audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype='int16')
+sd.wait()
+print("Recording finished, saving...")
+
+# Save the WAV file
+wavio.write(filename, audio, samplerate, sampwidth=2)
+print(f"Recording saved as {filename}")
\ No newline at end of file
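
Note on the `load_fsspec` monkey patch applied in both coquitts.py and hifi.py: PyTorch changed the default of torch.load to weights_only=True (as of PyTorch 2.6), which refuses to unpickle arbitrary classes such as the XTTS config objects, so checkpoint loading fails until those classes are allowlisted or the flag is turned off. The following is a minimal standalone sketch of the two workarounds used above, assuming PyTorch >= 2.4 (where torch.serialization.add_safe_globals exists) with Coqui TTS installed; "model.pth" is a hypothetical checkpoint path, not a file from this commit.

    import torch
    from TTS.tts.configs.xtts_config import XttsConfig

    # Option A: keep the safe default (weights_only=True) but allowlist the
    # classes the checkpoint is known to pickle.
    torch.serialization.add_safe_globals([XttsConfig])
    ckpt = torch.load("model.pth")  # hypothetical checkpoint path

    # Option B: trust the file outright; this is what the patched load_fsspec
    # does by forcing weights_only=False.
    ckpt = torch.load("model.pth", weights_only=False)

Option A is the narrower fix; Option B restores the old unpickling behavior wholesale and should only be used with checkpoints from a trusted source.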