feat: add temporary TTS test page at /api/v1/test/tts

Browser-accessible page for testing text-to-speech synthesis without going through the full voice pipeline.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 05:06:02 -08:00 · 2026-02-24 05:06:02 -08:00 · 0aa20cbc73
parent 740f8f5f88
commit 0aa20cbc73
2 changed files with 112 additions and 0 deletions
--- a/packages/services/voice-service/src/api/main.py
+++ b/packages/services/voice-service/src/api/main.py
@ -10,6 +10,7 @@ from ..config.settings import settings
 from .health import router as health_router
 from .session_router import router as session_router
 from .twilio_webhook import router as twilio_router
 from .test_tts import router as test_tts_router
 logger = logging.getLogger(__name__)
@ -29,6 +30,7 @@ app.add_middleware(
 app.include_router(health_router, tags=["health"])
 app.include_router(session_router, prefix="/api/v1/voice", tags=["sessions"])
 app.include_router(twilio_router, prefix="/api/v1/twilio", tags=["twilio"])
 app.include_router(test_tts_router, prefix="/api/v1/test", tags=["test"])
 # ---------------------------------------------------------------------------
--- a/packages/services/voice-service/src/api/test_tts.py
+++ b/packages/services/voice-service/src/api/test_tts.py
@ -0,0 +1,110 @@
 """Temporary TTS test endpoint — hit /api/v1/test/tts in a browser to verify."""
 import asyncio
 import io
 import struct
 import numpy as np
 from fastapi import APIRouter, Request, Query
 from fastapi.responses import HTMLResponse, Response
 # Router holding the temporary TTS test endpoints defined in this module;
 # main.py mounts it under the /api/v1/test prefix.
 router = APIRouter()
 # Source sample rate (Hz) assumed for the audio the TTS pipeline yields;
 # tts_synthesize resamples from this rate down to 16 kHz.
 _SAMPLE_RATE = 24000  # Kokoro native output rate
 def _make_wav(pcm_bytes: bytes, sample_rate: int = 16000) -> bytes:
     """Wrap raw 16-bit mono PCM in a canonical 44-byte-header WAV container.

     Args:
         pcm_bytes: Little-endian signed 16-bit mono samples. A trailing odd
             byte (an incomplete sample) is dropped so the payload always
             matches the sizes declared in the header.
         sample_rate: Sample rate in Hz written into the ``fmt `` chunk.

     Returns:
         The complete WAV file as bytes: RIFF header, ``fmt `` chunk (PCM,
         1 channel, 16 bits per sample) and ``data`` chunk.
     """
     bytes_per_sample = 2  # 16-bit mono => block align of 2 bytes
     # Only whole samples go into the data chunk; writing a dangling byte
     # would make the payload longer than the declared chunk size.
     data_size = len(pcm_bytes) - len(pcm_bytes) % bytes_per_sample
     header = struct.pack(
         "<4sI4s4sIHHIIHH4sI",
         b"RIFF",
         36 + data_size,                  # RIFF chunk size = file size - 8
         b"WAVE",
         b"fmt ",
         16,                              # fmt chunk size
         1,                               # audio format: PCM
         1,                               # channels: mono
         sample_rate,                     # sample rate
         sample_rate * bytes_per_sample,  # byte rate
         bytes_per_sample,                # block align
         16,                              # bits per sample
         b"data",
         data_size,
     )
     return header + bytes(pcm_bytes[:data_size])
@router.get("/tts", response_class=HTMLResponse)
async def tts_test_page():
    """Serve a self-contained HTML page for manually exercising TTS.

    The page contains a textarea, a button and an ``<audio>`` element. Its
    inline script sends the text as a GET query parameter to
    ``/api/v1/test/tts/synthesize``, then shows the elapsed time and payload
    size and plays the returned audio blob.

    Returns:
        The HTML document as a string; it is sent as HTML because the route
        declares ``response_class=HTMLResponse``.
    """
    # The fetch URL inside the script is absolute, so it must match the
    # prefix under which this router is mounted.
    # The script remembers the last blob URL and revokes it before creating a
    # new one; otherwise every synthesis would keep its audio blob alive for
    # the lifetime of the page.
    return """<!DOCTYPE html>
 <html><head><meta charset="utf-8"><title>TTS Test</title>
 <style>
 body { font-family: sans-serif; max-width: 600px; margin: 40px auto; padding: 0 20px; }
 textarea { width: 100%; height: 80px; font-size: 16px; }
 button { font-size: 18px; padding: 10px 30px; margin-top: 10px; cursor: pointer; }
 #status { margin-top: 15px; color: #666; }
 audio { margin-top: 15px; width: 100%; }
 </style></head>
 <body>
 <h2>TTS Test</h2>
 <textarea id="text" placeholder="输入要合成的文本...">你好，我是IT0运维助手。很高兴为您服务！</textarea>
 <br><button onclick="doTTS()">合成语音</button>
 <div id="status"></div>
 <audio id="player" controls style="display:none"></audio>
 <script>
 let lastUrl = null;
 async function doTTS() {
  const text = document.getElementById('text').value.trim();
  if (!text) return;
  const status = document.getElementById('status');
  const player = document.getElementById('player');
  status.textContent = '合成中...';
  player.style.display = 'none';
  const t0 = Date.now();
  try {
    const resp = await fetch('/api/v1/test/tts/synthesize?text=' + encodeURIComponent(text));
    if (!resp.ok) { status.textContent = 'Error: ' + resp.status; return; }
    const blob = await resp.blob();
    const ms = Date.now() - t0;
    status.textContent = '完成！耗时 ' + ms + 'ms，大小 ' + (blob.size/1024).toFixed(1) + 'KB';
    if (lastUrl) URL.revokeObjectURL(lastUrl);
    lastUrl = URL.createObjectURL(blob);
    player.src = lastUrl;
    player.style.display = 'block';
    player.play();
  } catch(e) { status.textContent = 'Error: ' + e.message; }
 }
 </script>
 </body></html>"""
@router.get("/tts/synthesize")
async def tts_synthesize(request: Request, text: str = Query(..., min_length=1, max_length=500)):
    """Synthesize ``text`` and return it as a 16 kHz mono 16-bit WAV.

    The TTS object is read from ``request.app.state.tts`` (presumably set at
    application startup elsewhere; not visible in this module). Its pipeline
    is run in the event loop's default executor so the blocking synthesis
    does not stall other requests.

    Args:
        request: Incoming request, used only to reach ``app.state``.
        text: Text to synthesize; 1 to 500 characters, enforced by ``Query``.

    Returns:
        * 200 with ``audio/wav`` content on success.
        * 503 with a plain-text body when no TTS object or pipeline is loaded.
        * 500 with a plain-text body when the pipeline yields no audio samples.
    """
    output_rate = 16000  # Hz; rate of the WAV handed back to the browser

    tts = getattr(request.app.state, "tts", None)
    if tts is None or tts._pipeline is None:
        return Response(content="TTS model not loaded", status_code=503)

    def _synth() -> bytes:
        """Run the pipeline, resample to ``output_rate`` and wrap as WAV."""
        # Chunks exposing .numpy() (e.g. tensors) are converted; anything else
        # is assumed to already be array-like. Chunks without audio are
        # skipped defensively so np.concatenate never sees None.
        chunks = [
            audio.numpy() if hasattr(audio, "numpy") else audio
            for _, _, audio in tts._pipeline(text, voice=tts.voice)
            if audio is not None
        ]
        if not chunks:
            return b""
        audio_np = np.concatenate(chunks)
        # Resample 24kHz → 16kHz by linear interpolation.
        target_len = int(len(audio_np) / _SAMPLE_RATE * output_rate)
        if target_len == 0:
            # Empty or sub-sample-length output: np.interp rejects an empty
            # sample array, and a header-only WAV is not useful audio.
            return b""
        indices = np.linspace(0, len(audio_np) - 1, target_len)
        resampled = np.interp(indices, np.arange(len(audio_np)), audio_np)
        # Scale float samples to int16, clipping to the representable range.
        pcm = (resampled * 32768).clip(-32768, 32767).astype(np.int16).tobytes()
        return _make_wav(pcm, output_rate)

    # Inside a coroutine the running loop is the correct one to use.
    loop = asyncio.get_running_loop()
    wav_bytes = await loop.run_in_executor(None, _synth)
    if not wav_bytes:
        return Response(content="TTS produced no audio", status_code=500)
    return Response(content=wav_bytes, media_type="audio/wav")