Voice Test

"""Temporary test endpoints for TTS and STT — browser-accessible.""" import asyncio import io import os import struct import tempfile import threading import numpy as np from fastapi import APIRouter, Request, Query, UploadFile, File from fastapi.responses import HTMLResponse, Response router = APIRouter() _loading_lock = threading.Lock() def _ensure_local_tts(app) -> bool: """Lazy-load Kokoro TTS if not yet loaded. Returns True if available.""" tts = getattr(app.state, "tts", None) if tts is not None and tts._pipeline is not None: return True with _loading_lock: # Double-check after acquiring lock tts = getattr(app.state, "tts", None) if tts is not None and tts._pipeline is not None: return True try: from ..tts.kokoro_service import KokoroTTSService, _patch_misaki_compat from ..config.settings import settings _patch_misaki_compat() from kokoro import KPipeline svc = KokoroTTSService(model=settings.kokoro_model, voice=settings.kokoro_voice) svc._pipeline = KPipeline(lang_code='z') app.state.tts = svc print(f"[lazy] Local TTS loaded: {settings.kokoro_model} voice={settings.kokoro_voice}", flush=True) return True except Exception as e: print(f"[lazy] Failed to load local TTS: {e}", flush=True) return False def _ensure_local_stt(app) -> bool: """Lazy-load faster-whisper STT if not yet loaded. Returns True if available.""" stt = getattr(app.state, "stt", None) if stt is not None and stt._model is not None: return True with _loading_lock: # Double-check after acquiring lock stt = getattr(app.state, "stt", None) if stt is not None and stt._model is not None: return True try: from ..stt.whisper_service import WhisperSTTService from faster_whisper import WhisperModel from ..config.settings import settings svc = WhisperSTTService( model=settings.whisper_model, device=settings.device, language=settings.whisper_language, ) compute_type = "float16" if settings.device == "cuda" else "int8" try: svc._model = WhisperModel(svc.model_name, device=svc.device, compute_type=compute_type) except Exception: if svc.device != "cpu": svc._model = WhisperModel(svc.model_name, device="cpu", compute_type="int8") app.state.stt = svc print(f"[lazy] Local STT loaded: {settings.whisper_model}", flush=True) return True except Exception as e: print(f"[lazy] Failed to load local STT: {e}", flush=True) return False _SAMPLE_RATE = 24000 # Kokoro native output rate def _make_wav(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes: """Wrap raw 16-bit PCM into a WAV container.""" buf = io.BytesIO() num_samples = len(pcm_bytes) // 2 data_size = num_samples * 2 # WAV header buf.write(b"RIFF") buf.write(struct.pack(" Voice Test

Voice I/O Test

TTS (Text to Speech)

STT (Speech to Text)

点击录音按钮说话，松开后自动识别。或上传音频文件。

上传音频

Round-trip (STT + TTS)

录音 → 识别文本 → 再合成语音播放。测试全链路。

""" @router.get("/tts/synthesize") async def tts_synthesize(request: Request, text: str = Query(..., min_length=1, max_length=500)): """Synthesize text to WAV audio (lazy-loads local Kokoro model on first call).""" loaded = await asyncio.get_event_loop().run_in_executor(None, _ensure_local_tts, request.app) if not loaded: return Response(content="Failed to load local TTS model", status_code=503) tts = request.app.state.tts loop = asyncio.get_event_loop() def _synth(): samples = [] for _, _, audio in tts._pipeline(text, voice=tts.voice): if hasattr(audio, "numpy"): samples.append(audio.numpy()) else: samples.append(audio) if not samples: return b"" audio_np = np.concatenate(samples) # Resample 24kHz → 16kHz target_len = int(len(audio_np) / _SAMPLE_RATE * 16000) indices = np.linspace(0, len(audio_np) - 1, target_len) resampled = np.interp(indices, np.arange(len(audio_np)), audio_np) pcm = (resampled * 32768).clip(-32768, 32767).astype(np.int16).tobytes() return _make_wav(pcm, 16000) wav_bytes = await loop.run_in_executor(None, _synth) if not wav_bytes: return Response(content="TTS produced no audio", status_code=500) return Response(content=wav_bytes, media_type="audio/wav") @router.post("/stt/transcribe") async def stt_transcribe(request: Request, audio: UploadFile = File(...)): """Transcribe uploaded audio to text via faster-whisper (lazy-loads on first call).""" loaded = await asyncio.get_event_loop().run_in_executor(None, _ensure_local_stt, request.app) if not loaded: return {"error": "Failed to load local STT model", "text": ""} stt = request.app.state.stt # Save uploaded file to temp raw = await audio.read() suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm" with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f: f.write(raw) tmp_path = f.name try: # faster-whisper can handle webm/mp3/wav etc. directly loop = asyncio.get_event_loop() def _transcribe(): segments, info = stt._model.transcribe( tmp_path, language=stt.language if hasattr(stt, 'language') and stt.language else None, beam_size=5, vad_filter=True, ) text = "".join(seg.text for seg in segments).strip() return text, info text, info = await loop.run_in_executor(None, _transcribe) return { "text": text, "language": getattr(info, "language", ""), "duration": round(getattr(info, "duration", 0), 2), } finally: os.unlink(tmp_path) # ===================================================================== # OpenAI Voice API endpoints # ===================================================================== def _get_openai_client(): """Lazy-init OpenAI client with proxy support.""" from openai import OpenAI import httpx api_key = os.environ.get("OPENAI_API_KEY") base_url = os.environ.get("OPENAI_BASE_URL") if not api_key: return None kwargs = {"api_key": api_key} if base_url: kwargs["base_url"] = base_url # Disable SSL verification for self-signed proxy certs kwargs["http_client"] = httpx.Client(verify=False) return OpenAI(**kwargs) @router.get("/tts/synthesize-openai") async def tts_synthesize_openai( text: str = Query(..., min_length=1, max_length=500), model: str = Query("tts-1", regex="^(tts-1|tts-1-hd|gpt-4o-mini-tts)$"), voice: str = Query("alloy", regex="^(alloy|ash|ballad|coral|echo|fable|nova|onyx|sage|shimmer)$"), ): """Synthesize text to audio via OpenAI TTS API, resampled to 16kHz WAV.""" client = _get_openai_client() if client is None: return Response(content="OPENAI_API_KEY not configured", status_code=503) loop = asyncio.get_event_loop() def _synth(): response = client.audio.speech.create( model=model, voice=voice, input=text, response_format="pcm", # raw 24kHz 16-bit mono PCM (no header) ) raw_pcm = response.content # Resample 24kHz → 16kHz to match Flutter player expectations audio_np = np.frombuffer(raw_pcm, dtype=np.int16).astype(np.float32) target_len = int(len(audio_np) / 24000 * 16000) indices = np.linspace(0, len(audio_np) - 1, target_len) resampled = np.interp(indices, np.arange(len(audio_np)), audio_np) pcm_16k = resampled.clip(-32768, 32767).astype(np.int16).tobytes() return _make_wav(pcm_16k, 16000) try: wav_bytes = await loop.run_in_executor(None, _synth) return Response(content=wav_bytes, media_type="audio/wav") except Exception as e: return Response(content=f"OpenAI TTS error: {e}", status_code=500) @router.post("/stt/transcribe-openai") async def stt_transcribe_openai( audio: UploadFile = File(...), model: str = Query("gpt-4o-transcribe", regex="^(whisper-1|gpt-4o-transcribe|gpt-4o-mini-transcribe)$"), ): """Transcribe uploaded audio via OpenAI STT API.""" client = _get_openai_client() if client is None: return {"error": "OPENAI_API_KEY not configured", "text": ""} raw = await audio.read() suffix = os.path.splitext(audio.filename or "audio.wav")[1] or ".wav" with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f: f.write(raw) tmp_path = f.name try: loop = asyncio.get_event_loop() def _transcribe(): with open(tmp_path, "rb") as af: result = client.audio.transcriptions.create( model=model, file=af, language="zh", ) return result.text text = await loop.run_in_executor(None, _transcribe) return {"text": text, "language": "zh", "model": model} except Exception as e: return {"error": f"OpenAI STT error: {e}", "text": ""} finally: os.unlink(tmp_path)