feat: add STT test and round-trip test to voice test page

- STT: record from mic or upload audio file → faster-whisper transcription
- Round-trip: record → STT → TTS → playback (full pipeline test)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-24 05:08:00 -08:00
parent 0aa20cbc73
commit 0bd050c80f
1 changed file with 196 additions and 28 deletions

View File

@ -1,10 +1,10 @@
"""Temporary TTS test endpoint — hit /api/v1/test/tts in a browser to verify."""
"""Temporary test endpoints for TTS and STT — browser-accessible."""
import asyncio
import io
import struct
import numpy as np
from fastapi import APIRouter, Request, Query
from fastapi import APIRouter, Request, Query, UploadFile, File
from fastapi.responses import HTMLResponse, Response
router = APIRouter()
@ -37,41 +37,170 @@ def _make_wav(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes:
@router.get("/tts", response_class=HTMLResponse)
async def tts_test_page():
"""Simple HTML page to test TTS."""
"""Combined TTS + STT test page."""
return """<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>TTS Test</title>
<html><head><meta charset="utf-8"><title>Voice Test</title>
<style>
body { font-family: sans-serif; max-width: 600px; margin: 40px auto; padding: 0 20px; }
textarea { width: 100%; height: 80px; font-size: 16px; }
button { font-size: 18px; padding: 10px 30px; margin-top: 10px; cursor: pointer; }
#status { margin-top: 15px; color: #666; }
audio { margin-top: 15px; width: 100%; }
body { font-family: sans-serif; max-width: 700px; margin: 30px auto; padding: 0 20px; }
h2 { border-bottom: 2px solid #333; padding-bottom: 8px; }
textarea { width: 100%; height: 70px; font-size: 15px; }
button { font-size: 16px; padding: 8px 24px; margin: 8px 4px 8px 0; cursor: pointer; border-radius: 4px; border: 1px solid #999; }
button:hover { background: #e0e0e0; }
.recording { background: #ff4444 !important; color: white !important; }
.status { margin-top: 10px; color: #666; font-size: 14px; }
audio { margin-top: 10px; width: 100%; }
.section { background: #f8f8f8; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
#stt-result { font-size: 18px; color: #333; margin-top: 10px; padding: 10px; background: white; border: 1px solid #ddd; border-radius: 4px; min-height: 40px; }
</style></head>
<body>
<h2>TTS Test</h2>
<textarea id="text" placeholder="输入要合成的文本...">你好我是IT0运维助手很高兴为您服务</textarea>
<h2>Voice I/O Test</h2>
<div class="section">
<h3>TTS (Text to Speech)</h3>
<textarea id="tts-text" placeholder="输入要合成的文本...">你好我是IT0运维助手很高兴为您服务</textarea>
<br><button onclick="doTTS()">合成语音</button>
<div id="status"></div>
<audio id="player" controls style="display:none"></audio>
<div class="status" id="tts-status"></div>
<audio id="tts-player" controls style="display:none"></audio>
</div>
<div class="section">
<h3>STT (Speech to Text)</h3>
<p style="font-size:14px;color:#888;">点击录音按钮说话松开后自动识别或上传音频文件</p>
<button id="rec-btn" onmousedown="startRec()" onmouseup="stopRec()" ontouchstart="startRec()" ontouchend="stopRec()">按住录音</button>
<label style="cursor:pointer;border:1px solid #999;padding:8px 24px;border-radius:4px;font-size:16px;">
上传音频 <input type="file" id="audio-file" accept="audio/*" style="display:none" onchange="uploadAudio(this)">
</label>
<div class="status" id="stt-status"></div>
<div id="stt-result"></div>
<audio id="stt-player" controls style="display:none"></audio>
</div>
<div class="section">
<h3>Round-trip (STT + TTS)</h3>
<p style="font-size:14px;color:#888;">录音 识别文本 再合成语音播放测试全链路</p>
<button id="rt-btn" onmousedown="startRoundTrip()" onmouseup="stopRoundTrip()" ontouchstart="startRoundTrip()" ontouchend="stopRoundTrip()">按住说话 (Round-trip)</button>
<div class="status" id="rt-status"></div>
<div id="rt-result"></div>
<audio id="rt-player" controls style="display:none"></audio>
</div>
<script>
let mediaRec, audioChunks, recMode;
async function doTTS() {
const text = document.getElementById('text').value.trim();
const text = document.getElementById('tts-text').value.trim();
if (!text) return;
const status = document.getElementById('status');
const player = document.getElementById('player');
status.textContent = '合成中...';
player.style.display = 'none';
const st = document.getElementById('tts-status');
const pl = document.getElementById('tts-player');
st.textContent = '合成中...'; pl.style.display = 'none';
const t0 = Date.now();
try {
const resp = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
if (!resp.ok) { status.textContent = 'Error: ' + resp.status; return; }
const blob = await resp.blob();
const ms = Date.now() - t0;
status.textContent = '完成!耗时 ' + ms + 'ms大小 ' + (blob.size/1024).toFixed(1) + 'KB';
player.src = URL.createObjectURL(blob);
player.style.display = 'block';
player.play();
} catch(e) { status.textContent = 'Error: ' + e.message; }
const r = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
if (!r.ok) { st.textContent = 'Error: ' + r.status + ' ' + await r.text(); return; }
const blob = await r.blob();
st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms大小 ' + (blob.size/1024).toFixed(1) + 'KB';
pl.src = URL.createObjectURL(blob); pl.style.display = 'block'; pl.play();
} catch(e) { st.textContent = 'Error: ' + e.message; }
}
function startRec() { _startRec('stt'); }
function stopRec() { _stopRec('stt'); }
function startRoundTrip() { _startRec('rt'); }
function stopRoundTrip() { _stopRec('rt'); }
async function _startRec(mode) {
recMode = mode;
const btn = document.getElementById(mode === 'rt' ? 'rt-btn' : 'rec-btn');
btn.classList.add('recording');
btn.textContent = '录音中...';
audioChunks = [];
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: { sampleRate: 16000, channelCount: 1 } });
mediaRec = new MediaRecorder(stream, { mimeType: 'audio/webm;codecs=opus' });
mediaRec.ondataavailable = e => { if (e.data.size > 0) audioChunks.push(e.data); };
mediaRec.onstop = () => {
stream.getTracks().forEach(t => t.stop());
const blob = new Blob(audioChunks, { type: 'audio/webm' });
if (mode === 'rt') doRoundTrip(blob);
else doSTT(blob);
};
mediaRec.start();
} catch(e) {
btn.classList.remove('recording');
btn.textContent = mode === 'rt' ? '按住说话 (Round-trip)' : '按住录音';
alert('麦克风权限被拒绝: ' + e.message);
}
}
function _stopRec(mode) {
const btn = document.getElementById(mode === 'rt' ? 'rt-btn' : 'rec-btn');
btn.classList.remove('recording');
btn.textContent = mode === 'rt' ? '按住说话 (Round-trip)' : '按住录音';
if (mediaRec && mediaRec.state === 'recording') mediaRec.stop();
}
async function doSTT(blob) {
const st = document.getElementById('stt-status');
const res = document.getElementById('stt-result');
const pl = document.getElementById('stt-player');
st.textContent = '识别中...'; res.textContent = '';
pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
const t0 = Date.now();
try {
const fd = new FormData(); fd.append('audio', blob, 'recording.webm');
const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
const data = await r.json();
st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
res.textContent = data.text || '(empty)';
} catch(e) { st.textContent = 'Error: ' + e.message; }
}
async function uploadAudio(input) {
if (!input.files[0]) return;
const blob = input.files[0];
const st = document.getElementById('stt-status');
const res = document.getElementById('stt-result');
const pl = document.getElementById('stt-player');
st.textContent = '识别中...'; res.textContent = '';
pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
const t0 = Date.now();
try {
const fd = new FormData(); fd.append('audio', blob, blob.name);
const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
const data = await r.json();
st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
res.textContent = data.text || '(empty)';
} catch(e) { st.textContent = 'Error: ' + e.message; }
input.value = '';
}
async function doRoundTrip(blob) {
const st = document.getElementById('rt-status');
const res = document.getElementById('rt-result');
const pl = document.getElementById('rt-player');
st.textContent = 'STT识别中...'; res.textContent = ''; pl.style.display = 'none';
const t0 = Date.now();
try {
// 1. STT
const fd = new FormData(); fd.append('audio', blob, 'recording.webm');
const r1 = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
const sttData = await r1.json();
const text = sttData.text || '';
const sttMs = Date.now() - t0;
res.textContent = 'STT (' + sttMs + 'ms): ' + (text || '(empty)');
if (!text) { st.textContent = '识别为空'; return; }
// 2. TTS
st.textContent = 'TTS合成中...';
const t1 = Date.now();
const r2 = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
if (!r2.ok) { st.textContent = 'TTS Error: ' + r2.status; return; }
const audioBlob = await r2.blob();
const ttsMs = Date.now() - t1;
const totalMs = Date.now() - t0;
st.textContent = '完成STT=' + sttMs + 'ms + TTS=' + ttsMs + 'ms = 总计' + totalMs + 'ms';
res.textContent += '\\nTTS (' + ttsMs + 'ms): ' + (audioBlob.size/1024).toFixed(1) + 'KB';
pl.src = URL.createObjectURL(audioBlob); pl.style.display = 'block'; pl.play();
} catch(e) { st.textContent = 'Error: ' + e.message; }
}
</script>
</body></html>"""
@ -84,7 +213,6 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
if tts is None or tts._pipeline is None:
return Response(content="TTS model not loaded", status_code=503)
# Run TTS in thread pool (CPU-bound)
loop = asyncio.get_event_loop()
def _synth():
samples = []
@ -108,3 +236,43 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
return Response(content="TTS produced no audio", status_code=500)
return Response(content=wav_bytes, media_type="audio/wav")
@router.post("/stt/transcribe")
async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
"""Transcribe uploaded audio to text via faster-whisper."""
stt = getattr(request.app.state, "stt", None)
if stt is None or stt._model is None:
return {"error": "STT model not loaded", "text": ""}
import tempfile
import os
# Save uploaded file to temp
raw = await audio.read()
suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm"
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
f.write(raw)
tmp_path = f.name
try:
# faster-whisper can handle webm/mp3/wav etc. directly
loop = asyncio.get_event_loop()
def _transcribe():
segments, info = stt._model.transcribe(
tmp_path,
language=stt.language if hasattr(stt, 'language') and stt.language else None,
beam_size=5,
vad_filter=True,
)
text = "".join(seg.text for seg in segments).strip()
return text, info
text, info = await loop.run_in_executor(None, _transcribe)
return {
"text": text,
"language": getattr(info, "language", ""),
"duration": round(getattr(info, "duration", 0), 2),
}
finally:
os.unlink(tmp_path)