feat: add STT test and round-trip test to voice test page

- STT: record from mic or upload audio file → faster-whisper transcription
- Round-trip: record → STT → TTS → playback (full pipeline test)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-24 05:08:00 -08:00
parent 0aa20cbc73
commit 0bd050c80f
1 changed file with 196 additions and 28 deletions

View File

@ -1,10 +1,10 @@
"""Temporary TTS test endpoint — hit /api/v1/test/tts in a browser to verify.""" """Temporary test endpoints for TTS and STT — browser-accessible."""
import asyncio import asyncio
import io import io
import struct import struct
import numpy as np import numpy as np
from fastapi import APIRouter, Request, Query from fastapi import APIRouter, Request, Query, UploadFile, File
from fastapi.responses import HTMLResponse, Response from fastapi.responses import HTMLResponse, Response
router = APIRouter() router = APIRouter()
@ -37,41 +37,170 @@ def _make_wav(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes:
@router.get("/tts", response_class=HTMLResponse) @router.get("/tts", response_class=HTMLResponse)
async def tts_test_page(): async def tts_test_page():
"""Simple HTML page to test TTS.""" """Combined TTS + STT test page."""
return """<!DOCTYPE html> return """<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>TTS Test</title> <html><head><meta charset="utf-8"><title>Voice Test</title>
<style> <style>
body { font-family: sans-serif; max-width: 600px; margin: 40px auto; padding: 0 20px; } body { font-family: sans-serif; max-width: 700px; margin: 30px auto; padding: 0 20px; }
textarea { width: 100%; height: 80px; font-size: 16px; } h2 { border-bottom: 2px solid #333; padding-bottom: 8px; }
button { font-size: 18px; padding: 10px 30px; margin-top: 10px; cursor: pointer; } textarea { width: 100%; height: 70px; font-size: 15px; }
#status { margin-top: 15px; color: #666; } button { font-size: 16px; padding: 8px 24px; margin: 8px 4px 8px 0; cursor: pointer; border-radius: 4px; border: 1px solid #999; }
audio { margin-top: 15px; width: 100%; } button:hover { background: #e0e0e0; }
.recording { background: #ff4444 !important; color: white !important; }
.status { margin-top: 10px; color: #666; font-size: 14px; }
audio { margin-top: 10px; width: 100%; }
.section { background: #f8f8f8; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
#stt-result { font-size: 18px; color: #333; margin-top: 10px; padding: 10px; background: white; border: 1px solid #ddd; border-radius: 4px; min-height: 40px; }
</style></head> </style></head>
<body> <body>
<h2>TTS Test</h2> <h2>Voice I/O Test</h2>
<textarea id="text" placeholder="输入要合成的文本...">你好我是IT0运维助手很高兴为您服务</textarea>
<div class="section">
<h3>TTS (Text to Speech)</h3>
<textarea id="tts-text" placeholder="输入要合成的文本...">你好我是IT0运维助手很高兴为您服务</textarea>
<br><button onclick="doTTS()">合成语音</button> <br><button onclick="doTTS()">合成语音</button>
<div id="status"></div> <div class="status" id="tts-status"></div>
<audio id="player" controls style="display:none"></audio> <audio id="tts-player" controls style="display:none"></audio>
</div>
<div class="section">
<h3>STT (Speech to Text)</h3>
<p style="font-size:14px;color:#888;">点击录音按钮说话松开后自动识别或上传音频文件</p>
<button id="rec-btn" onmousedown="startRec()" onmouseup="stopRec()" ontouchstart="startRec()" ontouchend="stopRec()">按住录音</button>
<label style="cursor:pointer;border:1px solid #999;padding:8px 24px;border-radius:4px;font-size:16px;">
上传音频 <input type="file" id="audio-file" accept="audio/*" style="display:none" onchange="uploadAudio(this)">
</label>
<div class="status" id="stt-status"></div>
<div id="stt-result"></div>
<audio id="stt-player" controls style="display:none"></audio>
</div>
<div class="section">
<h3>Round-trip (STT + TTS)</h3>
<p style="font-size:14px;color:#888;">录音 识别文本 再合成语音播放测试全链路</p>
<button id="rt-btn" onmousedown="startRoundTrip()" onmouseup="stopRoundTrip()" ontouchstart="startRoundTrip()" ontouchend="stopRoundTrip()">按住说话 (Round-trip)</button>
<div class="status" id="rt-status"></div>
<div id="rt-result"></div>
<audio id="rt-player" controls style="display:none"></audio>
</div>
<script> <script>
let mediaRec, audioChunks, recMode;
async function doTTS() { async function doTTS() {
const text = document.getElementById('text').value.trim(); const text = document.getElementById('tts-text').value.trim();
if (!text) return; if (!text) return;
const status = document.getElementById('status'); const st = document.getElementById('tts-status');
const player = document.getElementById('player'); const pl = document.getElementById('tts-player');
status.textContent = '合成中...'; st.textContent = '合成中...'; pl.style.display = 'none';
player.style.display = 'none';
const t0 = Date.now(); const t0 = Date.now();
try { try {
const resp = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text)); const r = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
if (!resp.ok) { status.textContent = 'Error: ' + resp.status; return; } if (!r.ok) { st.textContent = 'Error: ' + r.status + ' ' + await r.text(); return; }
const blob = await resp.blob(); const blob = await r.blob();
const ms = Date.now() - t0; st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms大小 ' + (blob.size/1024).toFixed(1) + 'KB';
status.textContent = '完成!耗时 ' + ms + 'ms大小 ' + (blob.size/1024).toFixed(1) + 'KB'; pl.src = URL.createObjectURL(blob); pl.style.display = 'block'; pl.play();
player.src = URL.createObjectURL(blob); } catch(e) { st.textContent = 'Error: ' + e.message; }
player.style.display = 'block'; }
player.play();
} catch(e) { status.textContent = 'Error: ' + e.message; } function startRec() { _startRec('stt'); }
// Thin adapters: the HTML press-and-hold handlers bind each button to the
// shared _startRec/_stopRec implementation with its section id ('stt'/'rt').
function stopRec() { _stopRec('stt'); }
function startRoundTrip() { _startRec('rt'); }
function stopRoundTrip() { _stopRec('rt'); }
// Start microphone capture for the given test section ('stt' or 'rt').
// Invoked from the press-and-hold handlers on the record buttons; on stop,
// hands the recorded blob to doSTT or doRoundTrip.
async function _startRec(mode) {
    // Touch devices fire a synthesized mousedown after touchstart, so this
    // handler can run twice per press; without the guard the second call
    // would replace mediaRec and leak the first MediaStream (its tracks
    // would never be stopped).
    if (mediaRec && mediaRec.state === 'recording') return;
    recMode = mode;
    const btn = document.getElementById(mode === 'rt' ? 'rt-btn' : 'rec-btn');
    btn.classList.add('recording');
    btn.textContent = '录音中...';
    audioChunks = [];
    try {
        // 16 kHz mono presumably matches the server-side STT model's input;
        // browsers treat these constraints as hints, not guarantees.
        const stream = await navigator.mediaDevices.getUserMedia({ audio: { sampleRate: 16000, channelCount: 1 } });
        mediaRec = new MediaRecorder(stream, { mimeType: 'audio/webm;codecs=opus' });
        mediaRec.ondataavailable = e => { if (e.data.size > 0) audioChunks.push(e.data); };
        mediaRec.onstop = () => {
            // Release the microphone as soon as recording ends.
            stream.getTracks().forEach(t => t.stop());
            const blob = new Blob(audioChunks, { type: 'audio/webm' });
            if (mode === 'rt') doRoundTrip(blob);
            else doSTT(blob);
        };
        mediaRec.start();
    } catch(e) {
        // Permission denied or no capture device: restore the idle button.
        btn.classList.remove('recording');
        btn.textContent = mode === 'rt' ? '按住说话 (Round-trip)' : '按住录音';
        alert('麦克风权限被拒绝: ' + e.message);
    }
}
// End a press-and-hold recording: restore the button's idle appearance and
// stop the MediaRecorder (which fires its onstop pipeline) if one is active.
function _stopRec(mode) {
    const isRoundTrip = mode === 'rt';
    const btn = document.getElementById(isRoundTrip ? 'rt-btn' : 'rec-btn');
    btn.classList.remove('recording');
    if (isRoundTrip) {
        btn.textContent = '按住说话 (Round-trip)';
    } else {
        btn.textContent = '按住录音';
    }
    if (mediaRec && mediaRec.state === 'recording') {
        mediaRec.stop();
    }
}
// POST an audio blob to the STT endpoint and display the transcription
// plus end-to-end latency in the STT section.
async function doSTT(blob) {
    const st = document.getElementById('stt-status');
    const res = document.getElementById('stt-result');
    const pl = document.getElementById('stt-player');
    st.textContent = '识别中...'; res.textContent = '';
    // Let the user replay exactly what was sent to the server.
    pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
    const t0 = Date.now();
    try {
        const fd = new FormData(); fd.append('audio', blob, 'recording.webm');
        const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
        // Surface HTTP failures explicitly instead of letting r.json() throw
        // an opaque parse error (mirrors doTTS's handling of !resp.ok).
        if (!r.ok) { st.textContent = 'Error: ' + r.status + ' ' + await r.text(); return; }
        const data = await r.json();
        // The endpoint reports "model not loaded" as 200 + {error, text:""};
        // show that message rather than a misleading "(empty)".
        if (data.error) { st.textContent = 'Error: ' + data.error; return; }
        st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
        res.textContent = data.text || '(empty)';
    } catch(e) { st.textContent = 'Error: ' + e.message; }
}
// Transcribe a user-selected audio file via the STT endpoint. Same flow as
// doSTT but keeps the original filename so the server picks the right
// container suffix.
async function uploadAudio(input) {
    if (!input.files[0]) return;
    const blob = input.files[0];
    const st = document.getElementById('stt-status');
    const res = document.getElementById('stt-result');
    const pl = document.getElementById('stt-player');
    st.textContent = '识别中...'; res.textContent = '';
    pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
    const t0 = Date.now();
    try {
        const fd = new FormData(); fd.append('audio', blob, blob.name);
        const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
        // Fail fast on HTTP errors instead of choking inside r.json().
        if (!r.ok) { st.textContent = 'Error: ' + r.status + ' ' + await r.text(); return; }
        const data = await r.json();
        // Server-side "model not loaded" arrives as 200 + {error, text:""}.
        if (data.error) { st.textContent = 'Error: ' + data.error; return; }
        st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
        res.textContent = data.text || '(empty)';
    } catch(e) { st.textContent = 'Error: ' + e.message; }
    // Reset so re-selecting the same file fires onchange again.
    input.value = '';
}
// Full-pipeline test: POST the recording to STT, feed the recognized text
// back through TTS, play the synthesized audio, and report per-leg timings.
// (Closing brace of this function is on the following source line.)
async function doRoundTrip(blob) {
    const st = document.getElementById('rt-status');
    const res = document.getElementById('rt-result');
    const pl = document.getElementById('rt-player');
    st.textContent = 'STT识别中...'; res.textContent = ''; pl.style.display = 'none';
    const t0 = Date.now();
    try {
        // 1. STT
        const fd = new FormData(); fd.append('audio', blob, 'recording.webm');
        const r1 = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
        // Fail fast on HTTP errors instead of letting r1.json() raise an
        // opaque parse error (consistent with the TTS leg's r2.ok check).
        if (!r1.ok) { st.textContent = 'STT Error: ' + r1.status + ' ' + await r1.text(); return; }
        const sttData = await r1.json();
        const text = sttData.text || '';
        const sttMs = Date.now() - t0;
        res.textContent = 'STT (' + sttMs + 'ms): ' + (text || '(empty)');
        if (!text) { st.textContent = '识别为空'; return; }
        // 2. TTS
        st.textContent = 'TTS合成中...';
        const t1 = Date.now();
        const r2 = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
        if (!r2.ok) { st.textContent = 'TTS Error: ' + r2.status; return; }
        const audioBlob = await r2.blob();
        const ttsMs = Date.now() - t1;
        const totalMs = Date.now() - t0;
        st.textContent = '完成STT=' + sttMs + 'ms + TTS=' + ttsMs + 'ms = 总计' + totalMs + 'ms';
        res.textContent += '\\nTTS (' + ttsMs + 'ms): ' + (audioBlob.size/1024).toFixed(1) + 'KB';
        pl.src = URL.createObjectURL(audioBlob); pl.style.display = 'block'; pl.play();
    } catch(e) { st.textContent = 'Error: ' + e.message; }
} }
</script> </script>
</body></html>""" </body></html>"""
@ -84,7 +213,6 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
if tts is None or tts._pipeline is None: if tts is None or tts._pipeline is None:
return Response(content="TTS model not loaded", status_code=503) return Response(content="TTS model not loaded", status_code=503)
# Run TTS in thread pool (CPU-bound)
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
def _synth(): def _synth():
samples = [] samples = []
@ -108,3 +236,43 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
return Response(content="TTS produced no audio", status_code=500) return Response(content="TTS produced no audio", status_code=500)
return Response(content=wav_bytes, media_type="audio/wav") return Response(content=wav_bytes, media_type="audio/wav")
@router.post("/stt/transcribe")
async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
    """Transcribe an uploaded audio file to text via faster-whisper.

    Accepts any container the decoder can handle (webm/mp3/wav, ...).
    Returns ``{"text", "language", "duration"}`` on success, or
    ``{"error", "text": ""}`` with HTTP 200 when the model is not loaded,
    so the test page can render the message directly.
    """
    stt = getattr(request.app.state, "stt", None)
    if stt is None or stt._model is None:
        return {"error": "STT model not loaded", "text": ""}

    import os
    import tempfile

    # faster-whisper takes a file path; persist the upload to a temp file and
    # keep the original extension so the decoder picks the right demuxer.
    raw = await audio.read()
    suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
        f.write(raw)
        tmp_path = f.name
    try:
        # Transcription is CPU-bound; run it off the event loop in the default
        # executor. get_running_loop() is the recommended API inside
        # coroutines (get_event_loop() is discouraged in modern asyncio).
        loop = asyncio.get_running_loop()

        def _transcribe():
            segments, info = stt._model.transcribe(
                tmp_path,
                # Force the configured language when set; None = autodetect.
                language=getattr(stt, "language", None) or None,
                beam_size=5,
                vad_filter=True,
            )
            # ``segments`` is lazy; joining here forces the full decode while
            # still inside the worker thread.
            return "".join(seg.text for seg in segments).strip(), info

        text, info = await loop.run_in_executor(None, _transcribe)
        return {
            "text": text,
            "language": getattr(info, "language", ""),
            "duration": round(getattr(info, "duration", 0), 2),
        }
    finally:
        # Always remove the temp upload, even if transcription fails.
        os.unlink(tmp_path)