feat: add STT test and round-trip test to voice test page
- STT: record from the mic or upload an audio file → faster-whisper transcription
- Round-trip: record → STT → TTS → playback (full-pipeline test)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
0aa20cbc73
commit
0bd050c80f
|
|
@ -1,10 +1,10 @@
|
|||
"""Temporary TTS test endpoint — hit /api/v1/test/tts in a browser to verify."""
|
||||
"""Temporary test endpoints for TTS and STT — browser-accessible."""
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import struct
|
||||
import numpy as np
|
||||
from fastapi import APIRouter, Request, Query
|
||||
from fastapi import APIRouter, Request, Query, UploadFile, File
|
||||
from fastapi.responses import HTMLResponse, Response
|
||||
|
||||
router = APIRouter()
|
||||
|
|
@ -37,41 +37,170 @@ def _make_wav(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes:
|
|||
|
||||
@router.get("/tts", response_class=HTMLResponse)
async def tts_test_page():
    """Combined TTS + STT test page.

    Serves a self-contained HTML/JS page with three sections:
    TTS (text -> audio), STT (mic recording or file upload -> text),
    and a round-trip test (record -> STT -> TTS -> playback).
    All requests go to the sibling /tts/synthesize and /stt/transcribe
    endpoints on this router.
    """
    return """<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>Voice Test</title>
<style>
body { font-family: sans-serif; max-width: 700px; margin: 30px auto; padding: 0 20px; }
h2 { border-bottom: 2px solid #333; padding-bottom: 8px; }
textarea { width: 100%; height: 70px; font-size: 15px; }
button { font-size: 16px; padding: 8px 24px; margin: 8px 4px 8px 0; cursor: pointer; border-radius: 4px; border: 1px solid #999; }
button:hover { background: #e0e0e0; }
.recording { background: #ff4444 !important; color: white !important; }
.status { margin-top: 10px; color: #666; font-size: 14px; }
audio { margin-top: 10px; width: 100%; }
.section { background: #f8f8f8; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
#stt-result { font-size: 18px; color: #333; margin-top: 10px; padding: 10px; background: white; border: 1px solid #ddd; border-radius: 4px; min-height: 40px; }
</style></head>
<body>
<h2>Voice I/O Test</h2>

<div class="section">
<h3>TTS (Text to Speech)</h3>
<textarea id="tts-text" placeholder="输入要合成的文本...">你好,我是IT0运维助手。很高兴为您服务!</textarea>
<br><button onclick="doTTS()">合成语音</button>
<div class="status" id="tts-status"></div>
<audio id="tts-player" controls style="display:none"></audio>
</div>

<div class="section">
<h3>STT (Speech to Text)</h3>
<p style="font-size:14px;color:#888;">点击录音按钮说话,松开后自动识别。或上传音频文件。</p>
<button id="rec-btn" onmousedown="startRec()" onmouseup="stopRec()" ontouchstart="startRec()" ontouchend="stopRec()">按住录音</button>
<label style="cursor:pointer;border:1px solid #999;padding:8px 24px;border-radius:4px;font-size:16px;">
上传音频 <input type="file" id="audio-file" accept="audio/*" style="display:none" onchange="uploadAudio(this)">
</label>
<div class="status" id="stt-status"></div>
<div id="stt-result"></div>
<audio id="stt-player" controls style="display:none"></audio>
</div>

<div class="section">
<h3>Round-trip (STT + TTS)</h3>
<p style="font-size:14px;color:#888;">录音 → 识别文本 → 再合成语音播放。测试全链路。</p>
<button id="rt-btn" onmousedown="startRoundTrip()" onmouseup="stopRoundTrip()" ontouchstart="startRoundTrip()" ontouchend="stopRoundTrip()">按住说话 (Round-trip)</button>
<div class="status" id="rt-status"></div>
<div id="rt-result"></div>
<audio id="rt-player" controls style="display:none"></audio>
</div>

<script>
let mediaRec, audioChunks, recMode;

async function doTTS() {
  const text = document.getElementById('tts-text').value.trim();
  if (!text) return;
  const st = document.getElementById('tts-status');
  const pl = document.getElementById('tts-player');
  st.textContent = '合成中...'; pl.style.display = 'none';
  const t0 = Date.now();
  try {
    const r = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
    if (!r.ok) { st.textContent = 'Error: ' + r.status + ' ' + await r.text(); return; }
    const blob = await r.blob();
    st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms,大小 ' + (blob.size/1024).toFixed(1) + 'KB';
    pl.src = URL.createObjectURL(blob); pl.style.display = 'block'; pl.play();
  } catch(e) { st.textContent = 'Error: ' + e.message; }
}

function startRec() { _startRec('stt'); }
function stopRec() { _stopRec('stt'); }
function startRoundTrip() { _startRec('rt'); }
function stopRoundTrip() { _stopRec('rt'); }

async function _startRec(mode) {
  recMode = mode;
  const btn = document.getElementById(mode === 'rt' ? 'rt-btn' : 'rec-btn');
  btn.classList.add('recording');
  btn.textContent = '录音中...';
  audioChunks = [];
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: { sampleRate: 16000, channelCount: 1 } });
    mediaRec = new MediaRecorder(stream, { mimeType: 'audio/webm;codecs=opus' });
    mediaRec.ondataavailable = e => { if (e.data.size > 0) audioChunks.push(e.data); };
    mediaRec.onstop = () => {
      stream.getTracks().forEach(t => t.stop());
      const blob = new Blob(audioChunks, { type: 'audio/webm' });
      if (mode === 'rt') doRoundTrip(blob);
      else doSTT(blob);
    };
    mediaRec.start();
  } catch(e) {
    btn.classList.remove('recording');
    btn.textContent = mode === 'rt' ? '按住说话 (Round-trip)' : '按住录音';
    alert('麦克风权限被拒绝: ' + e.message);
  }
}

function _stopRec(mode) {
  const btn = document.getElementById(mode === 'rt' ? 'rt-btn' : 'rec-btn');
  btn.classList.remove('recording');
  btn.textContent = mode === 'rt' ? '按住说话 (Round-trip)' : '按住录音';
  if (mediaRec && mediaRec.state === 'recording') mediaRec.stop();
}

async function doSTT(blob) {
  const st = document.getElementById('stt-status');
  const res = document.getElementById('stt-result');
  const pl = document.getElementById('stt-player');
  st.textContent = '识别中...'; res.textContent = '';
  pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
  const t0 = Date.now();
  try {
    const fd = new FormData(); fd.append('audio', blob, 'recording.webm');
    const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
    const data = await r.json();
    st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
    res.textContent = data.text || '(empty)';
  } catch(e) { st.textContent = 'Error: ' + e.message; }
}

async function uploadAudio(input) {
  if (!input.files[0]) return;
  const blob = input.files[0];
  const st = document.getElementById('stt-status');
  const res = document.getElementById('stt-result');
  const pl = document.getElementById('stt-player');
  st.textContent = '识别中...'; res.textContent = '';
  pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
  const t0 = Date.now();
  try {
    const fd = new FormData(); fd.append('audio', blob, blob.name);
    const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
    const data = await r.json();
    st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
    res.textContent = data.text || '(empty)';
  } catch(e) { st.textContent = 'Error: ' + e.message; }
  input.value = '';
}

async function doRoundTrip(blob) {
  const st = document.getElementById('rt-status');
  const res = document.getElementById('rt-result');
  const pl = document.getElementById('rt-player');
  st.textContent = 'STT识别中...'; res.textContent = ''; pl.style.display = 'none';
  const t0 = Date.now();
  try {
    // 1. STT
    const fd = new FormData(); fd.append('audio', blob, 'recording.webm');
    const r1 = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
    const sttData = await r1.json();
    const text = sttData.text || '';
    const sttMs = Date.now() - t0;
    res.textContent = 'STT (' + sttMs + 'ms): ' + (text || '(empty)');
    if (!text) { st.textContent = '识别为空'; return; }
    // 2. TTS
    st.textContent = 'TTS合成中...';
    const t1 = Date.now();
    const r2 = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
    if (!r2.ok) { st.textContent = 'TTS Error: ' + r2.status; return; }
    const audioBlob = await r2.blob();
    const ttsMs = Date.now() - t1;
    const totalMs = Date.now() - t0;
    st.textContent = '完成!STT=' + sttMs + 'ms + TTS=' + ttsMs + 'ms = 总计' + totalMs + 'ms';
    res.textContent += '\\nTTS (' + ttsMs + 'ms): ' + (audioBlob.size/1024).toFixed(1) + 'KB';
    pl.src = URL.createObjectURL(audioBlob); pl.style.display = 'block'; pl.play();
  } catch(e) { st.textContent = 'Error: ' + e.message; }
}
</script>
</body></html>"""
|
||||
|
|
@ -84,7 +213,6 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
|
|||
if tts is None or tts._pipeline is None:
|
||||
return Response(content="TTS model not loaded", status_code=503)
|
||||
|
||||
# Run TTS in thread pool (CPU-bound)
|
||||
loop = asyncio.get_event_loop()
|
||||
def _synth():
|
||||
samples = []
|
||||
|
|
@ -108,3 +236,43 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
|
|||
return Response(content="TTS produced no audio", status_code=500)
|
||||
|
||||
return Response(content=wav_bytes, media_type="audio/wav")
|
||||
|
||||
|
||||
@router.post("/stt/transcribe")
async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
    """Transcribe uploaded audio to text via faster-whisper.

    Accepts any container faster-whisper/ffmpeg can decode (webm, mp3,
    wav, ...). Returns a dict with ``text``, detected ``language`` and
    audio ``duration`` in seconds; if the model is not loaded, returns
    an error payload with empty text instead of raising.
    """
    stt = getattr(request.app.state, "stt", None)
    if stt is None or stt._model is None:
        return {"error": "STT model not loaded", "text": ""}

    import os
    import tempfile

    # faster-whisper wants a file path, so persist the upload to a
    # temp file (keep the original extension as a format hint).
    raw = await audio.read()
    suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
        f.write(raw)
        tmp_path = f.name

    try:
        # Transcription is CPU-bound: run it in the default thread
        # pool so the event loop stays responsive.
        loop = asyncio.get_running_loop()

        def _transcribe():
            segments, info = stt._model.transcribe(
                tmp_path,
                # Empty/None language -> let whisper auto-detect.
                language=getattr(stt, "language", None) or None,
                beam_size=5,
                vad_filter=True,
            )
            text = "".join(seg.text for seg in segments).strip()
            return text, info

        text, info = await loop.run_in_executor(None, _transcribe)
        return {
            "text": text,
            "language": getattr(info, "language", ""),
            "duration": round(getattr(info, "duration", 0), 2),
        }
    finally:
        # Best-effort cleanup: never let a failed unlink mask the
        # real transcription result/error.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
||||
|
|
|
|||
Loading…
Reference in New Issue