Initial commit

2025-02-03 14:31:26 +08:00 · 2025-02-03 14:31:26 +08:00 · 54cb82f1b9
commit 54cb82f1b9
13 changed files with 347 additions and 0 deletions
--- a/Nova.wav
+++ b/Nova.wav
--- a/Onyx.wav
+++ b/Onyx.wav
--- a/checktorchgpu.py
+++ b/checktorchgpu.py
@ -0,0 +1,3 @@
+import torch
+# Ask PyTorch once whether CUDA is usable and reuse the answer for both lines.
+cuda_ready = torch.cuda.is_available()
+print("Torch 是否使用 GPU:", cuda_ready)
+
+# Report the name of GPU 0 when CUDA is available, otherwise the literal "CPU".
+if cuda_ready:
+    device_label = torch.cuda.get_device_name(0)
+else:
+    device_label = "CPU"
+print("当前使用的 GPU:", device_label)
--- a/coquitts.py
+++ b/coquitts.py
@ -0,0 +1,58 @@
+import torch
+from TTS.tts.models.xtts import XttsAudioConfig  # 允许 XttsAudioConfig
+from TTS.tts.configs.xtts_config import XttsConfig  # 允许 XttsConfig
+from TTS.config.shared_configs import BaseDatasetConfig  # 允许 BaseDatasetConfig
+from TTS.tts.models.xtts import XttsArgs  # 允许 XttsArgs
+
+# Register the XTTS config classes as safe globals for PyTorch deserialization.
+torch.serialization.add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
+
+
+def load_fsspec_fixed(*args, **kwargs):
+    """Load a checkpoint through ``torch.load`` with ``weights_only`` forced off.
+
+    Installed below as a replacement for ``TTS.utils.io.load_fsspec``.
+
+    Args:
+        *args: Positional arguments forwarded unchanged to ``torch.load``.
+        **kwargs: Keyword arguments forwarded to ``torch.load``. Any ``cache``
+            entry is discarded and ``weights_only`` is always set to ``False``.
+
+    Returns:
+        Whatever ``torch.load`` returns for the given arguments.
+    """
+    # Drop ``cache`` instead of forwarding it to torch.load; the hifi.py copy
+    # of this helper removes it for the same reason (to avoid an error).
+    kwargs.pop("cache", None)
+    # SECURITY: weights_only=False allows arbitrary pickled objects to be
+    # loaded. Only use this with checkpoints from a trusted source.
+    kwargs["weights_only"] = False
+    return torch.load(*args, **kwargs)
+
+
+# Replace Coqui TTS's ``load_fsspec`` with the wrapper above.
+# NOTE(review): modules that already ran ``from TTS.utils.io import load_fsspec``
+# before this assignment keep their own reference to the original function;
+# confirm the override actually reaches the XTTS loader.
+import TTS.utils.io
+TTS.utils.io.load_fsspec = load_fsspec_fixed
+
+from TTS.api import TTS
+import os
+from pydub import AudioSegment  # 用于合并音频
+
+# XTTS v2 multilingual model identifier
+MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
+
+# Load the pretrained TTS model
+print("正在加载 TTS 预训练模型，请稍候...")
+tts = TTS(MODEL_NAME)
+
+# Reference recordings passed to the model as ``speaker_wav``; the user must
+# supply both files.
+speaker_wav_zh = "example_speaker_zh.wav"
+speaker_wav_en = "example_speaker_en.wav"
+
+# Fail early when either reference recording is missing: the English file is
+# needed for part2.wav just as the Chinese one is for part1.wav / part3.wav.
+for required_wav in (speaker_wav_zh, speaker_wav_en):
+    if not os.path.exists(required_wav):
+        raise FileNotFoundError(f"错误：找不到 {required_wav}，XTTS v2 需要一个参考音频！")
+
+# Text split into per-language segments
+text_cn1 = "你好，欢迎使用"
+text_en = "Coqui TTS"
+text_cn2 = "进行中文语音合成！"
+# Longer English sample; only used by the commented-out call below
+test_all = "Not long ago, my colleague Carlos Fenollosa made a bold claim in his book ‘La singularidad’: ChatGPT is AGI. My initial reaction was immediate dismissal. But knowing his careful reasoning, I kept reading. As I followed his argument, something unexpected happened — I found myself agreeing, though from an entirely different perspective. What I discovered challenges both those waiting for AGI’s arrival and those claiming it’s nowhere near: we might be asking the wrong question altogether."
+
+# Synthesize each segment into its own intermediate WAV file
+tts.tts_to_file(text=text_cn1, file_path="part1.wav", speaker_wav=speaker_wav_zh, language="zh-cn")
+tts.tts_to_file(text=text_en, file_path="part2.wav", speaker_wav=speaker_wav_en, language="en")
+tts.tts_to_file(text=text_cn2, file_path="part3.wav", speaker_wav=speaker_wav_zh, language="zh-cn")
+# tts.tts_to_file(text=test_all, file_path="all.wav", speaker_wav=speaker_wav_en, language="en")
+
+# Concatenate the three segments in order and write the combined result
+audio1 = AudioSegment.from_wav("part1.wav")
+audio2 = AudioSegment.from_wav("part2.wav")
+audio3 = AudioSegment.from_wav("part3.wav")
+
+final_audio = audio1 + audio2 + audio3
+final_audio.export("output.wav", format="wav")
+
+print("✅ 语音合成完成！已保存到 output.wav")
--- a/detectdevice.py
+++ b/detectdevice.py
@ -0,0 +1,16 @@
+import sounddevice as sd
+
+print("🎤 正在查询可用的音频设备...\n")
+devices = sd.query_devices()
+
+# Print every device with its index and its number of input channels.
+for device_id, info in enumerate(devices):
+    print(f"设备 ID {device_id}: {info['name']} - 输入通道: {info['max_input_channels']}")
+
+# Collect the names of devices whose name contains "loopback"
+# (case-insensitive) and report them, or print a hint when there are none.
+loopback_names = [info["name"] for info in devices if "loopback" in info["name"].lower()]
+if not loopback_names:
+    print("\n❌ 没有找到 'WASAPI 环回录音' 设备，请尝试手动启用或安装 Virtual Cable。")
+else:
+    print("\n✅ 发现 WASAPI '环回录音' 设备：")
+    for loopback_name in loopback_names:
+        print(f" - {loopback_name}")
--- a/example_speaker_en.wav
+++ b/example_speaker_en.wav
--- a/example_speaker_zh.wav
+++ b/example_speaker_zh.wav
--- a/fromsoundcardpf32.py
+++ b/fromsoundcardpf32.py
@ -0,0 +1,181 @@
+import pyaudio
+import wave
+import numpy as np
+import time
+from pydub import AudioSegment
+
+# 录音参数
+FORMAT = pyaudio.paFloat32  # 32-bit 浮点格式
+CHANNELS = 1
+RATE = 44100
+CHUNK = 1024
+OUTPUT_FILENAME = "output.wav"
+THRESHOLD = 0.008  # 适配 paFloat32（范围是 [-1.0, 1.0]）
+SILENCE_DURATION = 2  # 静音时间（秒）
+
+# **🔍 列出所有可用设备并解释用途**
+def list_audio_devices():
+    """Print a table of input-capable audio devices and return them.
+
+    Every device reported by PyAudio that has at least one input channel is
+    classified by keywords found in its lower-cased name and printed as one
+    table row.
+
+    Returns:
+        list[tuple[int, str, str]]: ``(device index, device name, type label)``
+        for each listed device, in enumeration order.
+    """
+    audio = pyaudio.PyAudio()
+    device_info = []
+
+    try:
+        print("\n🎤 **可用的录音设备列表**\n")
+        print(f"{'ID':<5}{'设备名称':<35}{'输入通道数':<15}{'设备类型'}")
+        print("="*80)
+
+        for i in range(audio.get_device_count()):
+            dev = audio.get_device_info_by_index(i)
+            name = dev['name'].lower()
+            channels = dev['maxInputChannels']
+
+            # Devices without input channels cannot be recorded from
+            if channels <= 0:
+                continue
+
+            # Classify by keyword; the first matching rule wins.
+            # NOTE(review): "mic" is a plain substring test, so names such as
+            # "Microsoft ..." are labelled as microphones too - confirm intent.
+            if "stereo mix" in name or "what you hear" in name:
+                device_type = "✅ 立体声混音（推荐）"
+            elif "loopback" in name:
+                device_type = "🔄 环回录音（可选）"
+            elif "mic" in name or "microphone" in name:
+                device_type = "🎤 麦克风（不推荐）"
+            else:
+                device_type = "🎧 其他音频设备"
+
+            print(f"{i:<5}{dev['name']:<35}{channels:<15}{device_type}")
+            device_info.append((i, dev['name'], device_type))
+    finally:
+        # Release the PyAudio instance even when a device query raises
+        audio.terminate()
+
+    print("\n✅ **如果 '立体声混音' 存在，优先使用它**，否则尝试 '环回' 或手动选择。\n")
+    return device_info
+
+# **🔍 自动选择最佳录音设备**
+def get_best_device():
+    """Pick the index of the input device to record from.
+
+    Only devices with at least one input channel are considered. Preference,
+    based on keywords in the lower-cased device name:
+
+    1. A "stereo mix" / "what you hear" device; the scan stops at the first.
+    2. The first device whose name contains "loopback".
+    3. The first device whose name does not contain "mic".
+
+    When nothing qualifies, the device table is printed and the user is
+    prompted to type a device ID.
+
+    Returns:
+        int: Index of the chosen device.
+
+    Raises:
+        RuntimeError: If the ID typed by the user is not an integer.
+    """
+    audio = pyaudio.PyAudio()
+    best_device = None
+    fallback_device = None
+
+    print("\n🔍 **正在尝试自动选择最佳录音设备...**")
+
+    try:
+        for i in range(audio.get_device_count()):
+            dev = audio.get_device_info_by_index(i)
+            name = dev['name'].lower()
+
+            # Devices without input channels cannot be recorded from
+            if dev['maxInputChannels'] <= 0:
+                continue
+
+            # First choice: stereo mix; stop scanning as soon as one is found
+            if "stereo mix" in name or "what you hear" in name:
+                print(f"✅ 选择设备: {dev['name']} (ID: {i}) - 立体声混音（最佳）")
+                best_device = i
+                break
+
+            # Second choice: the first loopback device (a later stereo mix
+            # device still overrides it)
+            if "loopback" in name and best_device is None:
+                print(f"🔄 选择设备: {dev['name']} (ID: {i}) - 环回录音")
+                best_device = i
+
+            # Last resort: remember the first input device not named like a mic
+            if "mic" not in name and "microphone" not in name and fallback_device is None:
+                fallback_device = i
+    finally:
+        # Release the PyAudio instance even when a device query raises
+        audio.terminate()
+
+    if best_device is not None:
+        return best_device
+
+    if fallback_device is not None:
+        print(f"⚠️ 没有找到 '立体声混音'，使用默认设备 (ID: {fallback_device})")
+        return fallback_device
+
+    # Nothing suitable was found: show the device table and ask the user
+    print("\n❌ 没有检测到合适的设备，请手动选择一个设备 ID：")
+    list_audio_devices()
+    selected_device = input("🔹 请输入你要使用的设备 ID（数字）: ")
+    try:
+        return int(selected_device)
+    except ValueError as err:
+        raise RuntimeError("❌ 设备 ID 无效，程序终止！") from err
+
+# Print the table of available input devices
+device_list = list_audio_devices()
+
+# Choose the device to record from
+device_index = get_best_device()
+
+# Gain applied to every captured chunk before it is stored
+VOLUME_BOOST = 5.0
+
+
+def _boost(samples):
+    """Return float32 samples scaled by VOLUME_BOOST, clipped to [-1.0, 1.0]."""
+    return np.clip(samples * VOLUME_BOOST, -1.0, 1.0)
+
+
+# Initialize PyAudio; the stream and the PyAudio instance are released in the
+# ``finally`` clause so that an error anywhere below cannot leak them.
+audio = pyaudio.PyAudio()
+stream = None
+try:
+    # A ``None`` input_device_index makes PyAudio use the default input device
+    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
+                        input_device_index=device_index,
+                        frames_per_buffer=CHUNK)
+
+    print("🎤 只录制系统声音，麦克风已禁用！")
+    print("🎤 等待声音触发录音...")
+
+    # Wait until one chunk's peak amplitude exceeds THRESHOLD
+    while True:
+        data = stream.read(CHUNK, exception_on_overflow=False)
+        audio_data = np.frombuffer(data, dtype=np.float32)  # 32-bit float samples
+        volume = np.max(np.abs(audio_data))  # peak amplitude of this chunk
+
+        if volume > THRESHOLD:
+            print("🎙 检测到声音，开始录音...")
+            break
+
+    # Keep the triggering chunk, boosted like every chunk that follows it
+    frames = [_boost(audio_data).tobytes()]
+    silent_start = None
+
+    # Record until the signal stays below THRESHOLD for SILENCE_DURATION seconds
+    while True:
+        data = stream.read(CHUNK, exception_on_overflow=False)
+        audio_data = _boost(np.frombuffer(data, dtype=np.float32))
+        frames.append(audio_data.tobytes())
+
+        # The peak is measured on the boosted samples, so THRESHOLD is compared
+        # with the amplified signal here (unlike in the trigger loop above).
+        volume = np.max(np.abs(audio_data))
+
+        if volume < THRESHOLD:
+            if silent_start is None:
+                silent_start = time.time()
+            elif time.time() - silent_start >= SILENCE_DURATION:
+                print("🤫 检测到静音超过 2 秒，停止录音...")
+                break
+        else:
+            silent_start = None
+
+    print("🎼 录音结束，正在保存文件...")
+
+    # Capturing is finished; the stream itself is closed in ``finally``
+    stream.stop_stream()
+
+    # Convert the float32 samples to 16-bit PCM
+    float_data = np.frombuffer(b''.join(frames), dtype=np.float32)
+    int_data = (float_data * 32767).astype(np.int16)
+
+    # Save as WAV; the context manager closes the file even on error
+    with wave.open(OUTPUT_FILENAME, 'wb') as wf:
+        wf.setnchannels(CHANNELS)
+        wf.setsampwidth(2)  # 16-bit PCM
+        wf.setframerate(RATE)
+        wf.writeframes(int_data.tobytes())
+
+    print(f"✅ 录音已保存为 {OUTPUT_FILENAME}")
+
+    # Apply a gain of -dBFS to the saved file. A separate name is used so the
+    # PyAudio instance in ``audio`` stays available for cleanup.
+    recorded = AudioSegment.from_wav(OUTPUT_FILENAME)
+    normalized_audio = recorded.apply_gain(-recorded.dBFS)
+    normalized_audio.export("output_loud.wav", format="wav")
+
+    print("✅ 录音音量已调整，保存为 output_loud.wav")
+
+except Exception as e:
+    print(f"❌ 录音失败: {e}")
+finally:
+    if stream is not None:
+        stream.close()
+    audio.terminate()
--- a/hifi.py
+++ b/hifi.py
@ -0,0 +1,64 @@
+import torch
+from TTS.tts.models.xtts import XttsAudioConfig  
+from TTS.tts.configs.xtts_config import XttsConfig  
+from TTS.config.shared_configs import BaseDatasetConfig  
+from TTS.tts.models.xtts import XttsArgs  
+
+# Register the XTTS config classes as safe globals for PyTorch deserialization.
+torch.serialization.add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
+
+
+def load_fsspec_fixed(*args, **kwargs):
+    """Load a checkpoint through ``torch.load`` with ``weights_only`` forced off.
+
+    Per the original author's note, forcing ``weights_only=False`` works
+    around an UnpicklingError, and the ``cache`` keyword is removed to avoid
+    an error.
+
+    Args:
+        *args: Positional arguments forwarded unchanged to ``torch.load``.
+        **kwargs: Keyword arguments forwarded to ``torch.load`` without any
+            ``cache`` entry and with ``weights_only`` set to ``False``.
+
+    Returns:
+        Whatever ``torch.load`` returns for the given arguments.
+    """
+    forwarded = {key: value for key, value in kwargs.items() if key != "cache"}
+    forwarded["weights_only"] = False
+    return torch.load(*args, **forwarded)
+
+
+# Replace Coqui TTS's ``load_fsspec`` with the wrapper above.
+import TTS.utils.io
+TTS.utils.io.load_fsspec = load_fsspec_fixed
+
+from TTS.api import TTS
+
+# Model identifiers: XTTS v2 (multilingual) and VITS (English, LJSpeech)
+XTTS_MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"
+VITS_MODEL = "tts_models/en/ljspeech/vits"
+
+# Reference recording handed to XTTS as ``speaker_wav``; must be supplied by the user
+speaker_wav_en = "example_speaker_en.wav"
+
+# Input texts: ``text_en`` goes to VITS, ``text`` (Chinese) goes to XTTS v2
+text_en = "This is a high-quality text-to-speech conversion using XTTS v2 and VITS."
+text = "记者从越秀区了解到，广州博物馆隆重推出“吉祥有年——广州博物馆藏吉祥文物展”及“吉祥有年·潮派趁墟”主题新春市集，让市民群众在探秘文物珍宝的同时，也能解锁非遗新体验、品尝地道广府味，一起喜迎吉祥乙巳蛇年。"
+
+# --- XTTS v2: load the model, then synthesize the Chinese text ---
+print("🚀 正在加载 XTTS v2 模型，请稍候...")
+tts_xtts = TTS(XTTS_MODEL)
+
+# ``language`` is set to match the language of ``text``; ``speaker_wav`` is the
+# reference recording used for the voice.
+tts_xtts.tts_to_file(text=text, file_path="output_xtts.wav", speaker_wav=speaker_wav_en,
+                     language="zh-cn", split_sentences=True)
+print("✅ XTTS v2 语音合成完成！已保存到 output_xtts.wav 🎵")
+
+# --- VITS: load the model, then synthesize the English text ---
+print("🚀 正在加载 VITS 模型，请稍候...")
+tts_vits = TTS(VITS_MODEL)
+
+tts_vits.tts_to_file(text=text_en, file_path="output_vits.wav", split_sentences=True)
+print("✅ VITS 语音合成完成！已保存到 output_vits.wav 🎵")
--- a/list.py
+++ b/list.py
@ -0,0 +1,9 @@
+from TTS.utils.manage import ModelManager
+
+# Ask Coqui TTS's ModelManager for its model list
+manager = ModelManager()
+models = manager.list_models()
+
+# Print a header followed by one model entry per line
+print("✅ 可用的 TTS 预训练模型列表：")
+for model_name in models:
+    print(model_name)
--- a/output_vits.wav
+++ b/output_vits.wav
--- a/output_xtts.wav
+++ b/output_xtts.wav
--- a/record.py
+++ b/record.py
@ -0,0 +1,16 @@
+import sounddevice as sd
+import wavio
+
+# Recording parameters
+duration = 5  # length of the recording in seconds
+samplerate = 22050  # sample rate; the original author notes it is XTTS v2 compatible
+# NOTE(review): the synthesis scripts in this commit read example_speaker_zh.wav /
+# example_speaker_en.wav - presumably this output is renamed by hand; confirm.
+filename = "example_speaker.wav"
+
+# Number of frames to capture for the requested duration
+frame_count = int(duration * samplerate)
+
+print("开始录音，说话吧...")
+recording = sd.rec(frame_count, samplerate=samplerate, channels=1, dtype='int16')
+sd.wait()  # wait for the recording to finish before saving
+print("录音完成，保存中...")
+
+# Write the captured samples to a WAV file with 2-byte (16-bit) samples
+wavio.write(filename, recording, samplerate, sampwidth=2)
+print(f"录音已保存为 {filename}")