feat: add STT test and round-trip test to voice test page

- STT: record from mic or upload audio file → faster-whisper transcription
- Round-trip: record → STT → TTS → playback (full pipeline test)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-24 05:08:00 -08:00
parent 0aa20cbc73
commit 0bd050c80f
1 changed file with 196 additions and 28 deletions

View File

@ -1,10 +1,10 @@
"""Temporary TTS test endpoint — hit /api/v1/test/tts in a browser to verify."""
"""Temporary test endpoints for TTS and STT — browser-accessible."""
import asyncio
import io
import struct
import numpy as np
from fastapi import APIRouter, Request, Query
from fastapi import APIRouter, Request, Query, UploadFile, File
from fastapi.responses import HTMLResponse, Response
router = APIRouter()
@ -37,41 +37,170 @@ def _make_wav(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes:
@router.get("/tts", response_class=HTMLResponse)
async def tts_test_page():
"""Simple HTML page to test TTS."""
"""Combined TTS + STT test page."""
return """<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>TTS Test</title>
<html><head><meta charset="utf-8"><title>Voice Test</title>
<style>
body { font-family: sans-serif; max-width: 600px; margin: 40px auto; padding: 0 20px; }
textarea { width: 100%; height: 80px; font-size: 16px; }
button { font-size: 18px; padding: 10px 30px; margin-top: 10px; cursor: pointer; }
#status { margin-top: 15px; color: #666; }
audio { margin-top: 15px; width: 100%; }
body { font-family: sans-serif; max-width: 700px; margin: 30px auto; padding: 0 20px; }
h2 { border-bottom: 2px solid #333; padding-bottom: 8px; }
textarea { width: 100%; height: 70px; font-size: 15px; }
button { font-size: 16px; padding: 8px 24px; margin: 8px 4px 8px 0; cursor: pointer; border-radius: 4px; border: 1px solid #999; }
button:hover { background: #e0e0e0; }
.recording { background: #ff4444 !important; color: white !important; }
.status { margin-top: 10px; color: #666; font-size: 14px; }
audio { margin-top: 10px; width: 100%; }
.section { background: #f8f8f8; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
#stt-result { font-size: 18px; color: #333; margin-top: 10px; padding: 10px; background: white; border: 1px solid #ddd; border-radius: 4px; min-height: 40px; }
</style></head>
<body>
<h2>TTS Test</h2>
<textarea id="text" placeholder="输入要合成的文本...">你好我是IT0运维助手很高兴为您服务</textarea>
<h2>Voice I/O Test</h2>
<div class="section">
<h3>TTS (Text to Speech)</h3>
<textarea id="tts-text" placeholder="输入要合成的文本...">你好我是IT0运维助手很高兴为您服务</textarea>
<br><button onclick="doTTS()">合成语音</button>
<div id="status"></div>
<audio id="player" controls style="display:none"></audio>
<div class="status" id="tts-status"></div>
<audio id="tts-player" controls style="display:none"></audio>
</div>
<div class="section">
<h3>STT (Speech to Text)</h3>
<p style="font-size:14px;color:#888;">点击录音按钮说话松开后自动识别或上传音频文件</p>
<button id="rec-btn" onmousedown="startRec()" onmouseup="stopRec()" ontouchstart="startRec()" ontouchend="stopRec()">按住录音</button>
<label style="cursor:pointer;border:1px solid #999;padding:8px 24px;border-radius:4px;font-size:16px;">
上传音频 <input type="file" id="audio-file" accept="audio/*" style="display:none" onchange="uploadAudio(this)">
</label>
<div class="status" id="stt-status"></div>
<div id="stt-result"></div>
<audio id="stt-player" controls style="display:none"></audio>
</div>
<div class="section">
<h3>Round-trip (STT + TTS)</h3>
<p style="font-size:14px;color:#888;">录音 识别文本 再合成语音播放测试全链路</p>
<button id="rt-btn" onmousedown="startRoundTrip()" onmouseup="stopRoundTrip()" ontouchstart="startRoundTrip()" ontouchend="stopRoundTrip()">按住说话 (Round-trip)</button>
<div class="status" id="rt-status"></div>
<div id="rt-result"></div>
<audio id="rt-player" controls style="display:none"></audio>
</div>
<script>
let mediaRec, audioChunks, recMode;
async function doTTS() {
const text = document.getElementById('text').value.trim();
const text = document.getElementById('tts-text').value.trim();
if (!text) return;
const status = document.getElementById('status');
const player = document.getElementById('player');
status.textContent = '合成中...';
player.style.display = 'none';
const st = document.getElementById('tts-status');
const pl = document.getElementById('tts-player');
st.textContent = '合成中...'; pl.style.display = 'none';
const t0 = Date.now();
try {
const resp = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
if (!resp.ok) { status.textContent = 'Error: ' + resp.status; return; }
const blob = await resp.blob();
const ms = Date.now() - t0;
status.textContent = '完成!耗时 ' + ms + 'ms大小 ' + (blob.size/1024).toFixed(1) + 'KB';
player.src = URL.createObjectURL(blob);
player.style.display = 'block';
player.play();
} catch(e) { status.textContent = 'Error: ' + e.message; }
const r = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
if (!r.ok) { st.textContent = 'Error: ' + r.status + ' ' + await r.text(); return; }
const blob = await r.blob();
st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms大小 ' + (blob.size/1024).toFixed(1) + 'KB';
pl.src = URL.createObjectURL(blob); pl.style.display = 'block'; pl.play();
} catch(e) { st.textContent = 'Error: ' + e.message; }
}
function startRec() { _startRec('stt'); }
function stopRec() { _stopRec('stt'); }
function startRoundTrip() { _startRec('rt'); }
function stopRoundTrip() { _stopRec('rt'); }
async function _startRec(mode) {
recMode = mode;
const btn = document.getElementById(mode === 'rt' ? 'rt-btn' : 'rec-btn');
btn.classList.add('recording');
btn.textContent = '录音中...';
audioChunks = [];
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: { sampleRate: 16000, channelCount: 1 } });
mediaRec = new MediaRecorder(stream, { mimeType: 'audio/webm;codecs=opus' });
mediaRec.ondataavailable = e => { if (e.data.size > 0) audioChunks.push(e.data); };
mediaRec.onstop = () => {
stream.getTracks().forEach(t => t.stop());
const blob = new Blob(audioChunks, { type: 'audio/webm' });
if (mode === 'rt') doRoundTrip(blob);
else doSTT(blob);
};
mediaRec.start();
} catch(e) {
btn.classList.remove('recording');
btn.textContent = mode === 'rt' ? '按住说话 (Round-trip)' : '按住录音';
alert('麦克风权限被拒绝: ' + e.message);
}
}
function _stopRec(mode) {
const btn = document.getElementById(mode === 'rt' ? 'rt-btn' : 'rec-btn');
btn.classList.remove('recording');
btn.textContent = mode === 'rt' ? '按住说话 (Round-trip)' : '按住录音';
if (mediaRec && mediaRec.state === 'recording') mediaRec.stop();
}
async function doSTT(blob) {
const st = document.getElementById('stt-status');
const res = document.getElementById('stt-result');
const pl = document.getElementById('stt-player');
st.textContent = '识别中...'; res.textContent = '';
pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
const t0 = Date.now();
try {
const fd = new FormData(); fd.append('audio', blob, 'recording.webm');
const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
const data = await r.json();
st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
res.textContent = data.text || '(empty)';
} catch(e) { st.textContent = 'Error: ' + e.message; }
}
async function uploadAudio(input) {
if (!input.files[0]) return;
const blob = input.files[0];
const st = document.getElementById('stt-status');
const res = document.getElementById('stt-result');
const pl = document.getElementById('stt-player');
st.textContent = '识别中...'; res.textContent = '';
pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
const t0 = Date.now();
try {
const fd = new FormData(); fd.append('audio', blob, blob.name);
const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
const data = await r.json();
st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
res.textContent = data.text || '(empty)';
} catch(e) { st.textContent = 'Error: ' + e.message; }
input.value = '';
}
async function doRoundTrip(blob) {
const st = document.getElementById('rt-status');
const res = document.getElementById('rt-result');
const pl = document.getElementById('rt-player');
st.textContent = 'STT识别中...'; res.textContent = ''; pl.style.display = 'none';
const t0 = Date.now();
try {
// 1. STT
const fd = new FormData(); fd.append('audio', blob, 'recording.webm');
const r1 = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
const sttData = await r1.json();
const text = sttData.text || '';
const sttMs = Date.now() - t0;
res.textContent = 'STT (' + sttMs + 'ms): ' + (text || '(empty)');
if (!text) { st.textContent = '识别为空'; return; }
// 2. TTS
st.textContent = 'TTS合成中...';
const t1 = Date.now();
const r2 = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
if (!r2.ok) { st.textContent = 'TTS Error: ' + r2.status; return; }
const audioBlob = await r2.blob();
const ttsMs = Date.now() - t1;
const totalMs = Date.now() - t0;
st.textContent = '完成STT=' + sttMs + 'ms + TTS=' + ttsMs + 'ms = 总计' + totalMs + 'ms';
res.textContent += '\\nTTS (' + ttsMs + 'ms): ' + (audioBlob.size/1024).toFixed(1) + 'KB';
pl.src = URL.createObjectURL(audioBlob); pl.style.display = 'block'; pl.play();
} catch(e) { st.textContent = 'Error: ' + e.message; }
}
</script>
</body></html>"""
@ -84,7 +213,6 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
if tts is None or tts._pipeline is None:
return Response(content="TTS model not loaded", status_code=503)
# Run TTS in thread pool (CPU-bound)
loop = asyncio.get_event_loop()
def _synth():
samples = []
@ -108,3 +236,43 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
return Response(content="TTS produced no audio", status_code=500)
return Response(content=wav_bytes, media_type="audio/wav")
@router.post("/stt/transcribe")
async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
"""Transcribe uploaded audio to text via faster-whisper."""
stt = getattr(request.app.state, "stt", None)
if stt is None or stt._model is None:
return {"error": "STT model not loaded", "text": ""}
import tempfile
import os
# Save uploaded file to temp
raw = await audio.read()
suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm"
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
f.write(raw)
tmp_path = f.name
try:
# faster-whisper can handle webm/mp3/wav etc. directly
loop = asyncio.get_event_loop()
def _transcribe():
segments, info = stt._model.transcribe(
tmp_path,
language=stt.language if hasattr(stt, 'language') and stt.language else None,
beam_size=5,
vad_filter=True,
)
text = "".join(seg.text for seg in segments).strip()
return text, info
text, info = await loop.run_in_executor(None, _transcribe)
return {
"text": text,
"language": getattr(info, "language", ""),
"duration": round(getattr(info, "duration", 0), 2),
}
finally:
os.unlink(tmp_path)