From 0bd050c80fb6f90146593c93c33a911e0231a189 Mon Sep 17 00:00:00 2001 From: hailin Date: Tue, 24 Feb 2026 05:08:00 -0800 Subject: [PATCH] feat: add STT test and round-trip test to voice test page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - STT: record from mic or upload audio file → faster-whisper transcription - Round-trip: record → STT → TTS → playback (full pipeline test) Co-Authored-By: Claude Opus 4.6 --- .../voice-service/src/api/test_tts.py | 224 +++++++++++++++--- 1 file changed, 196 insertions(+), 28 deletions(-) diff --git a/packages/services/voice-service/src/api/test_tts.py b/packages/services/voice-service/src/api/test_tts.py index 448520f..3a105e1 100644 --- a/packages/services/voice-service/src/api/test_tts.py +++ b/packages/services/voice-service/src/api/test_tts.py @@ -1,10 +1,10 @@ -"""Temporary TTS test endpoint — hit /api/v1/test/tts in a browser to verify.""" +"""Temporary test endpoints for TTS and STT — browser-accessible.""" import asyncio import io import struct import numpy as np -from fastapi import APIRouter, Request, Query +from fastapi import APIRouter, Request, Query, UploadFile, File from fastapi.responses import HTMLResponse, Response router = APIRouter() @@ -37,41 +37,170 @@ def _make_wav(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes: @router.get("/tts", response_class=HTMLResponse) async def tts_test_page(): - """Simple HTML page to test TTS.""" + """Combined TTS + STT test page.""" return """ -TTS Test +Voice Test -

TTS Test

- +

Voice I/O Test

+ +
+

TTS (Text to Speech)

+
-
- +
+ +
+ +
+

STT (Speech to Text)

+

点击录音按钮说话,松开后自动识别。或上传音频文件。

+ + +
+
+ +
+ +
+

Round-trip (STT + TTS)

+

录音 → 识别文本 → 再合成语音播放。测试全链路。

+ +
+
+ +
+ """


@@ -84,7 +213,6 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
     if tts is None or tts._pipeline is None:
         return Response(content="TTS model not loaded", status_code=503)
 
-    # Run TTS in thread pool (CPU-bound)
     loop = asyncio.get_event_loop()
     def _synth():
         samples = []
@@ -108,3 +236,62 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
         return Response(content="TTS produced no audio", status_code=500)
 
     return Response(content=wav_bytes, media_type="audio/wav")
+
+
+@router.post("/stt/transcribe")
+async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
+    """Transcribe uploaded audio to text via faster-whisper.
+
+    Args:
+        request: Incoming request; the STT service is read from
+            ``request.app.state.stt``.
+        audio: Uploaded audio file to transcribe.
+
+    Returns:
+        A dict with ``text``, ``language`` and ``duration`` on success, or
+        ``{"error": ..., "text": ""}`` when the model is not loaded or the
+        upload is empty.
+    """
+    stt = getattr(request.app.state, "stt", None)
+    if stt is None or stt._model is None:
+        return {"error": "STT model not loaded", "text": ""}
+
+    import os
+    import tempfile
+
+    raw = await audio.read()
+    if not raw:
+        # Nothing to decode; report it in the same shape as the error above.
+        return {"error": "Empty audio upload", "text": ""}
+
+    # Keep the upload's extension on the temp file, defaulting to .webm.
+    suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm"
+    language = getattr(stt, "language", None) or None
+
+    def _transcribe():
+        # Runs in the executor so neither the file write nor the model call
+        # blocks the event loop.
+        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+        try:
+            with tmp:
+                tmp.write(raw)
+            # faster-whisper can handle webm/mp3/wav etc. directly
+            segments, info = stt._model.transcribe(
+                tmp.name,
+                language=language,
+                beam_size=5,
+                vad_filter=True,
+            )
+            text = "".join(seg.text for seg in segments).strip()
+            return text, info
+        finally:
+            # Remove the temp file even if writing or transcription fails.
+            os.unlink(tmp.name)
+
+    loop = asyncio.get_running_loop()
+    text, info = await loop.run_in_executor(None, _transcribe)
+    return {
+        "text": text,
+        "language": getattr(info, "language", ""),
+        "duration": round(getattr(info, "duration", 0), 2),
+    }