feat: add STT test and round-trip test to voice test page

- STT: record from mic or upload audio file → faster-whisper transcription
- Round-trip: record → STT → TTS → playback (full pipeline test)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-24 05:08:00 -08:00
parent 0aa20cbc73
commit 0bd050c80f
1 changed file with 196 additions and 28 deletions

View File

@ -1,10 +1,10 @@
"""Temporary TTS test endpoint — hit /api/v1/test/tts in a browser to verify.""" """Temporary test endpoints for TTS and STT — browser-accessible."""
import asyncio import asyncio
import io import io
import struct import struct
import numpy as np import numpy as np
from fastapi import APIRouter, Request, Query from fastapi import APIRouter, Request, Query, UploadFile, File
from fastapi.responses import HTMLResponse, Response from fastapi.responses import HTMLResponse, Response
router = APIRouter() router = APIRouter()
@ -37,41 +37,170 @@ def _make_wav(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes:
@router.get("/tts", response_class=HTMLResponse) @router.get("/tts", response_class=HTMLResponse)
async def tts_test_page(): async def tts_test_page():
"""Simple HTML page to test TTS.""" """Combined TTS + STT test page."""
return """<!DOCTYPE html> return """<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>TTS Test</title> <html><head><meta charset="utf-8"><title>Voice Test</title>
<style> <style>
body { font-family: sans-serif; max-width: 600px; margin: 40px auto; padding: 0 20px; } body { font-family: sans-serif; max-width: 700px; margin: 30px auto; padding: 0 20px; }
textarea { width: 100%; height: 80px; font-size: 16px; } h2 { border-bottom: 2px solid #333; padding-bottom: 8px; }
button { font-size: 18px; padding: 10px 30px; margin-top: 10px; cursor: pointer; } textarea { width: 100%; height: 70px; font-size: 15px; }
#status { margin-top: 15px; color: #666; } button { font-size: 16px; padding: 8px 24px; margin: 8px 4px 8px 0; cursor: pointer; border-radius: 4px; border: 1px solid #999; }
audio { margin-top: 15px; width: 100%; } button:hover { background: #e0e0e0; }
.recording { background: #ff4444 !important; color: white !important; }
.status { margin-top: 10px; color: #666; font-size: 14px; }
audio { margin-top: 10px; width: 100%; }
.section { background: #f8f8f8; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
#stt-result { font-size: 18px; color: #333; margin-top: 10px; padding: 10px; background: white; border: 1px solid #ddd; border-radius: 4px; min-height: 40px; }
</style></head> </style></head>
<body> <body>
<h2>TTS Test</h2> <h2>Voice I/O Test</h2>
<textarea id="text" placeholder="输入要合成的文本...">你好我是IT0运维助手很高兴为您服务</textarea>
<div class="section">
<h3>TTS (Text to Speech)</h3>
<textarea id="tts-text" placeholder="输入要合成的文本...">你好我是IT0运维助手很高兴为您服务</textarea>
<br><button onclick="doTTS()">合成语音</button> <br><button onclick="doTTS()">合成语音</button>
<div id="status"></div> <div class="status" id="tts-status"></div>
<audio id="player" controls style="display:none"></audio> <audio id="tts-player" controls style="display:none"></audio>
</div>
<div class="section">
<h3>STT (Speech to Text)</h3>
<p style="font-size:14px;color:#888;">点击录音按钮说话松开后自动识别或上传音频文件</p>
<button id="rec-btn" onmousedown="startRec()" onmouseup="stopRec()" ontouchstart="startRec()" ontouchend="stopRec()">按住录音</button>
<label style="cursor:pointer;border:1px solid #999;padding:8px 24px;border-radius:4px;font-size:16px;">
上传音频 <input type="file" id="audio-file" accept="audio/*" style="display:none" onchange="uploadAudio(this)">
</label>
<div class="status" id="stt-status"></div>
<div id="stt-result"></div>
<audio id="stt-player" controls style="display:none"></audio>
</div>
<div class="section">
<h3>Round-trip (STT + TTS)</h3>
<p style="font-size:14px;color:#888;">录音 识别文本 再合成语音播放测试全链路</p>
<button id="rt-btn" onmousedown="startRoundTrip()" onmouseup="stopRoundTrip()" ontouchstart="startRoundTrip()" ontouchend="stopRoundTrip()">按住说话 (Round-trip)</button>
<div class="status" id="rt-status"></div>
<div id="rt-result"></div>
<audio id="rt-player" controls style="display:none"></audio>
</div>
<script> <script>
let mediaRec, audioChunks, recMode;
async function doTTS() { async function doTTS() {
const text = document.getElementById('text').value.trim(); const text = document.getElementById('tts-text').value.trim();
if (!text) return; if (!text) return;
const status = document.getElementById('status'); const st = document.getElementById('tts-status');
const player = document.getElementById('player'); const pl = document.getElementById('tts-player');
status.textContent = '合成中...'; st.textContent = '合成中...'; pl.style.display = 'none';
player.style.display = 'none';
const t0 = Date.now(); const t0 = Date.now();
try { try {
const resp = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text)); const r = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
if (!resp.ok) { status.textContent = 'Error: ' + resp.status; return; } if (!r.ok) { st.textContent = 'Error: ' + r.status + ' ' + await r.text(); return; }
const blob = await resp.blob(); const blob = await r.blob();
const ms = Date.now() - t0; st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms大小 ' + (blob.size/1024).toFixed(1) + 'KB';
status.textContent = '完成!耗时 ' + ms + 'ms大小 ' + (blob.size/1024).toFixed(1) + 'KB'; pl.src = URL.createObjectURL(blob); pl.style.display = 'block'; pl.play();
player.src = URL.createObjectURL(blob); } catch(e) { st.textContent = 'Error: ' + e.message; }
player.style.display = 'block'; }
player.play();
} catch(e) { status.textContent = 'Error: ' + e.message; } function startRec() { _startRec('stt'); }
// Thin adapters: the HTML press-and-hold handlers bind each button to the
// shared _startRec/_stopRec implementation with its section id ('stt'/'rt').
function stopRec() { _stopRec('stt'); }
function startRoundTrip() { _startRec('rt'); }
function stopRoundTrip() { _stopRec('rt'); }
// Start microphone capture for the given test section ('stt' or 'rt').
// Invoked from the press-and-hold handlers on the record buttons; on stop,
// hands the recorded blob to doSTT or doRoundTrip.
async function _startRec(mode) {
    // Touch devices fire a synthesized mousedown after touchstart, so this
    // handler can run twice per press; without the guard the second call
    // would replace mediaRec and leak the first MediaStream (its tracks
    // would never be stopped).
    if (mediaRec && mediaRec.state === 'recording') return;
    recMode = mode;
    const btn = document.getElementById(mode === 'rt' ? 'rt-btn' : 'rec-btn');
    btn.classList.add('recording');
    btn.textContent = '录音中...';
    audioChunks = [];
    try {
        // 16 kHz mono presumably matches the server-side STT model's input;
        // browsers treat these constraints as hints, not guarantees.
        const stream = await navigator.mediaDevices.getUserMedia({ audio: { sampleRate: 16000, channelCount: 1 } });
        mediaRec = new MediaRecorder(stream, { mimeType: 'audio/webm;codecs=opus' });
        mediaRec.ondataavailable = e => { if (e.data.size > 0) audioChunks.push(e.data); };
        mediaRec.onstop = () => {
            // Release the microphone as soon as recording ends.
            stream.getTracks().forEach(t => t.stop());
            const blob = new Blob(audioChunks, { type: 'audio/webm' });
            if (mode === 'rt') doRoundTrip(blob);
            else doSTT(blob);
        };
        mediaRec.start();
    } catch(e) {
        // Permission denied or no capture device: restore the idle button.
        btn.classList.remove('recording');
        btn.textContent = mode === 'rt' ? '按住说话 (Round-trip)' : '按住录音';
        alert('麦克风权限被拒绝: ' + e.message);
    }
}
// End a press-and-hold recording: restore the button's idle appearance and
// stop the MediaRecorder (which fires its onstop pipeline) if one is active.
function _stopRec(mode) {
    const isRoundTrip = mode === 'rt';
    const btn = document.getElementById(isRoundTrip ? 'rt-btn' : 'rec-btn');
    btn.classList.remove('recording');
    if (isRoundTrip) {
        btn.textContent = '按住说话 (Round-trip)';
    } else {
        btn.textContent = '按住录音';
    }
    if (mediaRec && mediaRec.state === 'recording') {
        mediaRec.stop();
    }
}
// POST an audio blob to the STT endpoint and display the transcription
// plus end-to-end latency in the STT section.
async function doSTT(blob) {
    const st = document.getElementById('stt-status');
    const res = document.getElementById('stt-result');
    const pl = document.getElementById('stt-player');
    st.textContent = '识别中...'; res.textContent = '';
    // Let the user replay exactly what was sent to the server.
    pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
    const t0 = Date.now();
    try {
        const fd = new FormData(); fd.append('audio', blob, 'recording.webm');
        const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
        // Surface HTTP failures explicitly instead of letting r.json() throw
        // an opaque parse error (mirrors doTTS's handling of !resp.ok).
        if (!r.ok) { st.textContent = 'Error: ' + r.status + ' ' + await r.text(); return; }
        const data = await r.json();
        // The endpoint reports "model not loaded" as 200 + {error, text:""};
        // show that message rather than a misleading "(empty)".
        if (data.error) { st.textContent = 'Error: ' + data.error; return; }
        st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
        res.textContent = data.text || '(empty)';
    } catch(e) { st.textContent = 'Error: ' + e.message; }
}
// Transcribe a user-selected audio file via the STT endpoint. Same flow as
// doSTT but keeps the original filename so the server picks the right
// container suffix.
async function uploadAudio(input) {
    if (!input.files[0]) return;
    const blob = input.files[0];
    const st = document.getElementById('stt-status');
    const res = document.getElementById('stt-result');
    const pl = document.getElementById('stt-player');
    st.textContent = '识别中...'; res.textContent = '';
    pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
    const t0 = Date.now();
    try {
        const fd = new FormData(); fd.append('audio', blob, blob.name);
        const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
        // Fail fast on HTTP errors instead of choking inside r.json().
        if (!r.ok) { st.textContent = 'Error: ' + r.status + ' ' + await r.text(); return; }
        const data = await r.json();
        // Server-side "model not loaded" arrives as 200 + {error, text:""}.
        if (data.error) { st.textContent = 'Error: ' + data.error; return; }
        st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
        res.textContent = data.text || '(empty)';
    } catch(e) { st.textContent = 'Error: ' + e.message; }
    // Reset so re-selecting the same file fires onchange again.
    input.value = '';
}
// Full-pipeline test: POST the recording to STT, feed the recognized text
// back through TTS, play the synthesized audio, and report per-leg timings.
// (Closing brace of this function is on the following source line.)
async function doRoundTrip(blob) {
    const st = document.getElementById('rt-status');
    const res = document.getElementById('rt-result');
    const pl = document.getElementById('rt-player');
    st.textContent = 'STT识别中...'; res.textContent = ''; pl.style.display = 'none';
    const t0 = Date.now();
    try {
        // 1. STT
        const fd = new FormData(); fd.append('audio', blob, 'recording.webm');
        const r1 = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
        // Fail fast on HTTP errors instead of letting r1.json() raise an
        // opaque parse error (consistent with the TTS leg's r2.ok check).
        if (!r1.ok) { st.textContent = 'STT Error: ' + r1.status + ' ' + await r1.text(); return; }
        const sttData = await r1.json();
        const text = sttData.text || '';
        const sttMs = Date.now() - t0;
        res.textContent = 'STT (' + sttMs + 'ms): ' + (text || '(empty)');
        if (!text) { st.textContent = '识别为空'; return; }
        // 2. TTS
        st.textContent = 'TTS合成中...';
        const t1 = Date.now();
        const r2 = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
        if (!r2.ok) { st.textContent = 'TTS Error: ' + r2.status; return; }
        const audioBlob = await r2.blob();
        const ttsMs = Date.now() - t1;
        const totalMs = Date.now() - t0;
        st.textContent = '完成STT=' + sttMs + 'ms + TTS=' + ttsMs + 'ms = 总计' + totalMs + 'ms';
        res.textContent += '\\nTTS (' + ttsMs + 'ms): ' + (audioBlob.size/1024).toFixed(1) + 'KB';
        pl.src = URL.createObjectURL(audioBlob); pl.style.display = 'block'; pl.play();
    } catch(e) { st.textContent = 'Error: ' + e.message; }
} }
</script> </script>
</body></html>""" </body></html>"""
@ -84,7 +213,6 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
if tts is None or tts._pipeline is None: if tts is None or tts._pipeline is None:
return Response(content="TTS model not loaded", status_code=503) return Response(content="TTS model not loaded", status_code=503)
# Run TTS in thread pool (CPU-bound)
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
def _synth(): def _synth():
samples = [] samples = []
@ -108,3 +236,43 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
return Response(content="TTS produced no audio", status_code=500) return Response(content="TTS produced no audio", status_code=500)
return Response(content=wav_bytes, media_type="audio/wav") return Response(content=wav_bytes, media_type="audio/wav")
@router.post("/stt/transcribe")
async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
    """Transcribe an uploaded audio file to text via faster-whisper.

    Accepts any container the decoder can handle (webm/mp3/wav, ...).
    Returns ``{"text", "language", "duration"}`` on success, or
    ``{"error", "text": ""}`` with HTTP 200 when the model is not loaded,
    so the test page can render the message directly.
    """
    stt = getattr(request.app.state, "stt", None)
    if stt is None or stt._model is None:
        return {"error": "STT model not loaded", "text": ""}

    import os
    import tempfile

    # faster-whisper takes a file path; persist the upload to a temp file and
    # keep the original extension so the decoder picks the right demuxer.
    raw = await audio.read()
    suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
        f.write(raw)
        tmp_path = f.name
    try:
        # Transcription is CPU-bound; run it off the event loop in the default
        # executor. get_running_loop() is the recommended API inside
        # coroutines (get_event_loop() is discouraged in modern asyncio).
        loop = asyncio.get_running_loop()

        def _transcribe():
            segments, info = stt._model.transcribe(
                tmp_path,
                # Force the configured language when set; None = autodetect.
                language=getattr(stt, "language", None) or None,
                beam_size=5,
                vad_filter=True,
            )
            # ``segments`` is lazy; joining here forces the full decode while
            # still inside the worker thread.
            return "".join(seg.text for seg in segments).strip(), info

        text, info = await loop.run_in_executor(None, _transcribe)
        return {
            "text": text,
            "language": getattr(info, "language", ""),
            "duration": round(getattr(info, "duration", 0), 2),
        }
    finally:
        # Always remove the temp upload, even if transcription fails.
        os.unlink(tmp_path)