feat: add STT test and round-trip test to voice test page
- STT: record from the microphone or upload an audio file → faster-whisper transcription
- Round-trip: record → STT → TTS → playback (full pipeline test)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
0aa20cbc73
commit
0bd050c80f
|
|
@ -1,10 +1,10 @@
|
||||||
"""Temporary TTS test endpoint — hit /api/v1/test/tts in a browser to verify."""
|
"""Temporary test endpoints for TTS and STT — browser-accessible."""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import io
|
import io
|
||||||
import struct
|
import struct
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from fastapi import APIRouter, Request, Query
|
from fastapi import APIRouter, Request, Query, UploadFile, File
|
||||||
from fastapi.responses import HTMLResponse, Response
|
from fastapi.responses import HTMLResponse, Response
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
@ -37,41 +37,170 @@ def _make_wav(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes:
|
||||||
|
|
||||||
@router.get("/tts", response_class=HTMLResponse)
async def tts_test_page():
    """Serve the combined TTS / STT / round-trip browser test page.

    The returned document is self-contained HTML + JavaScript. Its script
    calls two endpoints by absolute path:

    * ``GET  /api/v1/test/tts/synthesize?text=...`` for synthesis
    * ``POST /api/v1/test/stt/transcribe`` (multipart field ``audio``)

    Notes on the embedded script:

    * Recording is press-and-hold. A release that arrives before the
      microphone stream is ready is remembered (``stopPending``) so the
      recorder is never started without a matching stop.
    * Touch handlers call ``preventDefault()`` so the browser does not
      replay the gesture as emulated mouse events (which would start a
      second recording).
    * The recorder MIME type is negotiated with
      ``MediaRecorder.isTypeSupported`` instead of assuming webm/opus.
    * STT responses are checked for HTTP errors and for an ``error`` field
      before being displayed as a result.

    Returns:
        The HTML document as a ``str``; FastAPI wraps it in an
        ``HTMLResponse`` because of ``response_class``.
    """
    # NOTE: this is a regular (non-raw) string, so the only backslash
    # sequence in it, '\\n', reaches the browser as the JS escape \n.
    return """<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>Voice Test</title>
<style>
  body { font-family: sans-serif; max-width: 700px; margin: 30px auto; padding: 0 20px; }
  h2 { border-bottom: 2px solid #333; padding-bottom: 8px; }
  textarea { width: 100%; height: 70px; font-size: 15px; }
  button { font-size: 16px; padding: 8px 24px; margin: 8px 4px 8px 0; cursor: pointer; border-radius: 4px; border: 1px solid #999; }
  button:hover { background: #e0e0e0; }
  .recording { background: #ff4444 !important; color: white !important; }
  .status { margin-top: 10px; color: #666; font-size: 14px; }
  audio { margin-top: 10px; width: 100%; }
  .section { background: #f8f8f8; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
  #stt-result { font-size: 18px; color: #333; margin-top: 10px; padding: 10px; background: white; border: 1px solid #ddd; border-radius: 4px; min-height: 40px; }
  #rt-result { white-space: pre-wrap; }
</style></head>
<body>
<h2>Voice I/O Test</h2>

<div class="section">
  <h3>TTS (Text to Speech)</h3>
  <textarea id="tts-text" placeholder="输入要合成的文本...">你好,我是IT0运维助手。很高兴为您服务!</textarea>
  <br><button onclick="doTTS()">合成语音</button>
  <div class="status" id="tts-status"></div>
  <audio id="tts-player" controls style="display:none"></audio>
</div>

<div class="section">
  <h3>STT (Speech to Text)</h3>
  <p style="font-size:14px;color:#888;">点击录音按钮说话,松开后自动识别。或上传音频文件。</p>
  <button id="rec-btn" onmousedown="startRec()" onmouseup="stopRec()" onmouseleave="stopRec()" ontouchstart="event.preventDefault(); startRec()" ontouchend="event.preventDefault(); stopRec()" ontouchcancel="stopRec()">按住录音</button>
  <label style="cursor:pointer;border:1px solid #999;padding:8px 24px;border-radius:4px;font-size:16px;">
    上传音频 <input type="file" id="audio-file" accept="audio/*" style="display:none" onchange="uploadAudio(this)">
  </label>
  <div class="status" id="stt-status"></div>
  <div id="stt-result"></div>
  <audio id="stt-player" controls style="display:none"></audio>
</div>

<div class="section">
  <h3>Round-trip (STT + TTS)</h3>
  <p style="font-size:14px;color:#888;">录音 → 识别文本 → 再合成语音播放。测试全链路。</p>
  <button id="rt-btn" onmousedown="startRoundTrip()" onmouseup="stopRoundTrip()" onmouseleave="stopRoundTrip()" ontouchstart="event.preventDefault(); startRoundTrip()" ontouchend="event.preventDefault(); stopRoundTrip()" ontouchcancel="stopRoundTrip()">按住说话 (Round-trip)</button>
  <div class="status" id="rt-status"></div>
  <div id="rt-result"></div>
  <audio id="rt-player" controls style="display:none"></audio>
</div>

<script>
// mediaRec: the active MediaRecorder, if any.
// recMode: 'stt' | 'rt' while a press is in progress, otherwise null.
// stopPending: the button was released before the microphone was ready.
let mediaRec = null, recMode = null, stopPending = false;

const IDLE_LABEL = { stt: '按住录音', rt: '按住说话 (Round-trip)' };

function recButton(mode) {
  return document.getElementById(mode === 'rt' ? 'rt-btn' : 'rec-btn');
}

function resetButton(mode) {
  const btn = recButton(mode);
  btn.classList.remove('recording');
  btn.textContent = IDLE_LABEL[mode];
}

function releaseStream(stream) {
  stream.getTracks().forEach(t => t.stop());
}

// First container/codec this browser can record, or '' for the browser default.
function pickMimeType() {
  if (!window.MediaRecorder || !MediaRecorder.isTypeSupported) return '';
  const candidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4'];
  return candidates.find(t => MediaRecorder.isTypeSupported(t)) || '';
}

// Upload filename whose extension matches what was actually recorded.
function recordingName(blob) {
  return blob.type.indexOf('mp4') >= 0 ? 'recording.mp4' : 'recording.webm';
}

// POST audio to the STT endpoint; throws on HTTP errors and on {error: ...} bodies.
async function transcribe(blob, filename) {
  const fd = new FormData(); fd.append('audio', blob, filename);
  const r = await fetch('/api/v1/test/stt/transcribe', { method: 'POST', body: fd });
  if (!r.ok) throw new Error(r.status + ' ' + await r.text());
  const data = await r.json();
  if (data.error) throw new Error(data.error);
  return data;
}

async function doTTS() {
  const text = document.getElementById('tts-text').value.trim();
  if (!text) return;
  const st = document.getElementById('tts-status');
  const pl = document.getElementById('tts-player');
  st.textContent = '合成中...'; pl.style.display = 'none';
  const t0 = Date.now();
  try {
    const r = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
    if (!r.ok) { st.textContent = 'Error: ' + r.status + ' ' + await r.text(); return; }
    const blob = await r.blob();
    st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms,大小 ' + (blob.size/1024).toFixed(1) + 'KB';
    pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
    // Autoplay may be blocked; the visible controls still allow manual playback.
    pl.play().catch(() => {});
  } catch(e) { st.textContent = 'Error: ' + e.message; }
}

function startRec() { _startRec('stt'); }
function stopRec() { _stopRec('stt'); }
function startRoundTrip() { _startRec('rt'); }
function stopRoundTrip() { _stopRec('rt'); }

async function _startRec(mode) {
  if (recMode) return;  // another press (or permission prompt) is still active
  recMode = mode; stopPending = false;
  const btn = recButton(mode);
  btn.classList.add('recording');
  btn.textContent = '录音中...';
  const chunks = [];
  let stream = null;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: { sampleRate: 16000, channelCount: 1 } });
    if (stopPending) {
      // Released while waiting for the microphone: nothing to record.
      releaseStream(stream);
      recMode = null;
      return;
    }
    const mime = pickMimeType();
    const rec = mime ? new MediaRecorder(stream, { mimeType: mime }) : new MediaRecorder(stream);
    rec.ondataavailable = e => { if (e.data.size > 0) chunks.push(e.data); };
    rec.onstop = () => {
      releaseStream(stream);
      const blob = new Blob(chunks, { type: (rec.mimeType || 'audio/webm').split(';')[0] });
      if (mode === 'rt') doRoundTrip(blob);
      else doSTT(blob);
    };
    rec.start();
    mediaRec = rec;
  } catch(e) {
    if (stream) releaseStream(stream);
    mediaRec = null; recMode = null;
    resetButton(mode);
    alert('麦克风权限被拒绝: ' + e.message);
  }
}

function _stopRec(mode) {
  if (recMode !== mode) return;  // e.g. mouseleave on a button that is not recording
  resetButton(mode);
  if (mediaRec && mediaRec.state === 'recording') {
    mediaRec.stop();
    mediaRec = null; recMode = null;
  } else {
    stopPending = true;  // _startRec finishes the cleanup once getUserMedia settles
  }
}

// Shared by microphone recordings and file uploads.
async function runSTT(blob, filename) {
  const st = document.getElementById('stt-status');
  const res = document.getElementById('stt-result');
  const pl = document.getElementById('stt-player');
  st.textContent = '识别中...'; res.textContent = '';
  pl.src = URL.createObjectURL(blob); pl.style.display = 'block';
  const t0 = Date.now();
  try {
    const data = await transcribe(blob, filename);
    st.textContent = '完成!耗时 ' + (Date.now()-t0) + 'ms';
    res.textContent = data.text || '(empty)';
  } catch(e) { st.textContent = 'Error: ' + e.message; }
}

function doSTT(blob) { return runSTT(blob, recordingName(blob)); }

async function uploadAudio(input) {
  const file = input.files[0];
  if (!file) return;
  try { await runSTT(file, file.name); }
  finally { input.value = ''; }  // allow re-selecting the same file
}

async function doRoundTrip(blob) {
  const st = document.getElementById('rt-status');
  const res = document.getElementById('rt-result');
  const pl = document.getElementById('rt-player');
  st.textContent = 'STT识别中...'; res.textContent = ''; pl.style.display = 'none';
  const t0 = Date.now();
  try {
    // 1. STT
    const sttData = await transcribe(blob, recordingName(blob));
    const text = sttData.text || '';
    const sttMs = Date.now() - t0;
    res.textContent = 'STT (' + sttMs + 'ms): ' + (text || '(empty)');
    if (!text) { st.textContent = '识别为空'; return; }
    // 2. TTS
    st.textContent = 'TTS合成中...';
    const t1 = Date.now();
    const r2 = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
    if (!r2.ok) { st.textContent = 'TTS Error: ' + r2.status + ' ' + await r2.text(); return; }
    const audioBlob = await r2.blob();
    const ttsMs = Date.now() - t1;
    const totalMs = Date.now() - t0;
    st.textContent = '完成!STT=' + sttMs + 'ms + TTS=' + ttsMs + 'ms = 总计' + totalMs + 'ms';
    res.textContent += '\\nTTS (' + ttsMs + 'ms): ' + (audioBlob.size/1024).toFixed(1) + 'KB';
    pl.src = URL.createObjectURL(audioBlob); pl.style.display = 'block';
    pl.play().catch(() => {});
  } catch(e) { st.textContent = 'Error: ' + e.message; }
}
</script>
</body></html>"""
|
||||||
|
|
@ -84,7 +213,6 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
|
||||||
if tts is None or tts._pipeline is None:
|
if tts is None or tts._pipeline is None:
|
||||||
return Response(content="TTS model not loaded", status_code=503)
|
return Response(content="TTS model not loaded", status_code=503)
|
||||||
|
|
||||||
# Run TTS in thread pool (CPU-bound)
|
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
def _synth():
|
def _synth():
|
||||||
samples = []
|
samples = []
|
||||||
|
|
@ -108,3 +236,43 @@ async def tts_synthesize(request: Request, text: str = Query(..., min_length=1,
|
||||||
return Response(content="TTS produced no audio", status_code=500)
|
return Response(content="TTS produced no audio", status_code=500)
|
||||||
|
|
||||||
return Response(content=wav_bytes, media_type="audio/wav")
|
return Response(content=wav_bytes, media_type="audio/wav")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/stt/transcribe")
async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
    """Transcribe an uploaded audio file to text via the app's STT model.

    Args:
        request: Incoming request; the STT service object is read from
            ``request.app.state.stt``.
        audio: Multipart upload sent under the ``audio`` form field.

    Returns:
        On success, a dict with ``text`` (concatenated, stripped segment
        text), ``language`` and ``duration`` (rounded to 2 decimals), the
        latter two taken from the model's ``info`` object when present.

        On failure, a JSON body with the keys ``error`` and ``text`` (empty):
        status 503 when the STT model is not loaded (matching the TTS
        endpoint's status for the same condition) and status 400 when the
        upload is empty.
    """
    # Function-scope imports keep this temporary endpoint self-contained.
    import contextlib
    import os
    import tempfile

    from fastapi.responses import JSONResponse

    stt = getattr(request.app.state, "stt", None)
    if stt is None or stt._model is None:
        return JSONResponse(
            {"error": "STT model not loaded", "text": ""}, status_code=503
        )

    raw = await audio.read()
    if not raw:
        # Nothing to decode; fail early with a clear message.
        return JSONResponse(
            {"error": "Empty audio upload", "text": ""}, status_code=400
        )

    # Keep the client's extension on the temp file, defaulting to .webm.
    suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm"
    # Missing or falsy language is passed to the model as None.
    language = getattr(stt, "language", None) or None

    def _transcribe():
        """Write the upload to a temp file and transcribe it (blocking)."""
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
                tmp_path = f.name
                f.write(raw)
            # The model is given a path; per the original note, faster-whisper
            # handles webm/mp3/wav etc. directly.
            segments, info = stt._model.transcribe(
                tmp_path,
                language=language,
                beam_size=5,
                vad_filter=True,
            )
            # Consume the segments here, while the temp file still exists.
            text = "".join(seg.text for seg in segments).strip()
            return text, info
        finally:
            if tmp_path is not None:
                # Best-effort cleanup; never mask the real error.
                with contextlib.suppress(OSError):
                    os.unlink(tmp_path)

    # File I/O and transcription both block, so run them off the event loop.
    loop = asyncio.get_running_loop()
    text, info = await loop.run_in_executor(None, _transcribe)
    return {
        "text": text,
        "language": getattr(info, "language", ""),
        "duration": round(getattr(info, "duration", 0), 2),
    }
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue