diff --git a/packages/services/voice-service/src/api/test_tts.py b/packages/services/voice-service/src/api/test_tts.py
index 448520f..3a105e1 100644
--- a/packages/services/voice-service/src/api/test_tts.py
+++ b/packages/services/voice-service/src/api/test_tts.py
@@ -1,10 +1,10 @@
-"""Temporary TTS test endpoint — hit /api/v1/test/tts in a browser to verify."""
+"""Temporary test endpoints for TTS and STT — browser-accessible."""
import asyncio
import io
import struct
import numpy as np
-from fastapi import APIRouter, Request, Query
+from fastapi import APIRouter, Request, Query, UploadFile, File
from fastapi.responses import HTMLResponse, Response
router = APIRouter()
@@ -37,41 +37,170 @@ def _make_wav(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes:
@router.get("/tts", response_class=HTMLResponse)
async def tts_test_page():
-    """Simple HTML page to test TTS."""
+    """Combined TTS + STT test page."""
+    # NOTE(review): the HTML payload below appears truncated/garbled in this
+    # patch (tags and markup are missing) — verify against the real file
+    # before applying; do not trust this hunk as the authoritative markup.
    return """
-
TTS Test
+Voice Test
-TTS Test
-
+Voice I/O Test
+
+
+
TTS (Text to Speech)
+
-
-
+
+
+
+
+
+
STT (Speech to Text)
+
点击录音按钮说话,松开后自动识别。或上传音频文件。
+
+
+
+
+
+
+
+
+
Round-trip (STT + TTS)
+
录音 → 识别文本 → 再合成语音播放。测试全链路。
+
+
+
+
+
+
"""
@@ -84,7 +213,6 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
if tts is None or tts._pipeline is None:
return Response(content="TTS model not loaded", status_code=503)
- # Run TTS in thread pool (CPU-bound)
loop = asyncio.get_event_loop()
def _synth():
samples = []
@@ -108,3 +236,43 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
return Response(content="TTS produced no audio", status_code=500)
return Response(content=wav_bytes, media_type="audio/wav")
+
+
+@router.post("/stt/transcribe")
+async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
+    """Transcribe an uploaded audio file to text via faster-whisper.
+
+    Args:
+        request: FastAPI request; the STT engine is read from ``app.state.stt``.
+        audio: Uploaded audio file (webm/mp3/wav etc. — decoded by
+            faster-whisper's ffmpeg backend directly from the temp file).
+
+    Returns:
+        dict with ``text``, detected ``language`` and ``duration`` on success,
+        or ``{"error": ..., "text": ""}`` when no STT model is loaded.
+    """
+    stt = getattr(request.app.state, "stt", None)
+    if stt is None or stt._model is None:
+        return {"error": "STT model not loaded", "text": ""}
+
+    import contextlib
+    import os
+    import tempfile
+
+    # faster-whisper wants a path (or file-like); spool the upload to disk,
+    # keeping the original extension so ffmpeg picks the right demuxer.
+    raw = await audio.read()
+    suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm"
+    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
+        f.write(raw)
+        tmp_path = f.name
+
+    try:
+        # get_running_loop() is the correct call inside a coroutine;
+        # get_event_loop() here is deprecated since Python 3.10.
+        loop = asyncio.get_running_loop()
+
+        def _transcribe():
+            # Transcription is CPU/GPU-bound — run it off the event loop.
+            segments, info = stt._model.transcribe(
+                tmp_path,
+                language=getattr(stt, "language", None) or None,
+                beam_size=5,
+                vad_filter=True,
+            )
+            text = "".join(seg.text for seg in segments).strip()
+            return text, info
+
+        text, info = await loop.run_in_executor(None, _transcribe)
+        return {
+            "text": text,
+            "language": getattr(info, "language", ""),
+            "duration": round(getattr(info, "duration", 0), 2),
+        }
+    finally:
+        # Best-effort cleanup; don't let an unlink failure (e.g. Windows
+        # file lock) mask the real exception or error the request.
+        with contextlib.suppress(OSError):
+            os.unlink(tmp_path)