feat: add OpenAI TTS/STT API endpoints for comparison testing

- Add openai package to voice-service requirements
- Add /api/v1/test/tts/synthesize-openai (tts-1/tts-1-hd/gpt-4o-mini-tts)
- Add /api/v1/test/stt/transcribe-openai (gpt-4o-transcribe/whisper-1)
- Add OPENAI_API_KEY and OPENAI_BASE_URL env vars to voice-service
- Flutter test page: SegmentedButton to toggle Local/OpenAI provider
- All endpoints maintain same response format for easy comparison

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-24 07:20:03 -08:00
parent ac0b8ee1c6
commit d43baed3a5
4 changed files with 172 additions and 34 deletions

View File

@ -327,6 +327,8 @@ services:
- KOKORO_MODEL=${KOKORO_MODEL:-kokoro-82m}
- KOKORO_VOICE=${KOKORO_VOICE:-zf_xiaoxiao}
- DEVICE=${VOICE_DEVICE:-cpu}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
healthcheck:
test: ["CMD-SHELL", "python3 -c \"import urllib.request; urllib.request.urlopen('http://localhost:3008/docs')\""]
interval: 30s

View File

@ -10,8 +10,10 @@ import 'package:path/path.dart' as p;
import 'package:permission_handler/permission_handler.dart';
import '../../../../core/network/dio_client.dart';
enum _VoiceProvider { local, openai }
/// Temporary voice I/O test page: TTS + STT + Round-trip.
/// Uses flutter_sound for both recording and playback.
/// Supports switching between Local (Kokoro/faster-whisper) and OpenAI APIs.
class VoiceTestPage extends ConsumerStatefulWidget {
const VoiceTestPage({super.key});
@ -28,6 +30,8 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
bool _playerReady = false;
bool _recorderReady = false;
_VoiceProvider _provider = _VoiceProvider.local;
String _ttsStatus = '';
String _sttStatus = '';
String _sttResult = '';
@ -38,13 +42,22 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
String _recordMode = ''; // 'stt' or 'rt'
String? _recordingPath;
// Endpoint paths based on provider
String get _ttsEndpoint => _provider == _VoiceProvider.openai
? '/api/v1/test/tts/synthesize-openai'
: '/api/v1/test/tts/synthesize';
String get _sttEndpoint => _provider == _VoiceProvider.openai
? '/api/v1/test/stt/transcribe-openai'
: '/api/v1/test/stt/transcribe';
Dio get _dioBinary {
final base = ref.read(dioClientProvider);
return Dio(BaseOptions(
baseUrl: base.options.baseUrl,
headers: Map.from(base.options.headers),
connectTimeout: const Duration(seconds: 30),
receiveTimeout: const Duration(seconds: 60),
receiveTimeout: const Duration(seconds: 120),
responseType: ResponseType.bytes,
))..interceptors.addAll(base.interceptors);
}
@ -77,26 +90,27 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
Future<void> _doTTS() async {
final text = _ttsController.text.trim();
if (text.isEmpty) return;
final label = _provider == _VoiceProvider.openai ? 'OpenAI' : 'Local';
setState(() {
_isSynthesizing = true;
_ttsStatus = '合成中...';
_ttsStatus = '[$label] 合成中...';
});
final sw = Stopwatch()..start();
try {
final resp = await _dioBinary.get(
'/api/v1/test/tts/synthesize',
_ttsEndpoint,
queryParameters: {'text': text},
);
sw.stop();
final bytes = resp.data as List<int>;
setState(() {
_ttsStatus =
'完成!耗时 ${sw.elapsedMilliseconds}ms大小 ${(bytes.length / 1024).toStringAsFixed(1)}KB';
'[$label] 完成!耗时 ${sw.elapsedMilliseconds}ms大小 ${(bytes.length / 1024).toStringAsFixed(1)}KB';
});
await _playWav(Uint8List.fromList(bytes));
} catch (e) {
sw.stop();
setState(() => _ttsStatus = '错误: $e');
setState(() => _ttsStatus = '[$label] 错误: $e');
} finally {
setState(() => _isSynthesizing = false);
}
@ -161,26 +175,26 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
// ---- STT ----
Future<void> _doSTT(String audioPath) async {
setState(() => _sttStatus = '识别中...');
final label = _provider == _VoiceProvider.openai ? 'OpenAI' : 'Local';
setState(() => _sttStatus = '[$label] 识别中...');
final sw = Stopwatch()..start();
try {
final formData = FormData.fromMap({
'audio':
await MultipartFile.fromFile(audioPath, filename: 'recording.wav'),
});
final resp =
await _dioJson.post('/api/v1/test/stt/transcribe', data: formData);
final resp = await _dioJson.post(_sttEndpoint, data: formData);
sw.stop();
final data = resp.data as Map<String, dynamic>;
setState(() {
_sttResult = data['text'] ?? '(empty)';
_sttStatus =
'完成!耗时 ${sw.elapsedMilliseconds}ms时长 ${data['duration'] ?? 0}s';
final extra = data['duration'] != null ? ',时长 ${data['duration']}s' : '';
_sttStatus = '[$label] 完成!耗时 ${sw.elapsedMilliseconds}ms$extra';
});
} catch (e) {
sw.stop();
setState(() {
_sttStatus = '错误: $e';
_sttStatus = '[$label] 错误: $e';
_sttResult = '';
});
} finally {
@ -190,7 +204,8 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
// ---- Round-trip: STT → TTS ----
Future<void> _doRoundTrip(String audioPath) async {
setState(() => _rtStatus = 'STT 识别中...');
final label = _provider == _VoiceProvider.openai ? 'OpenAI' : 'Local';
setState(() => _rtStatus = '[$label] STT 识别中...');
final totalSw = Stopwatch()..start();
try {
// 1. STT
@ -199,24 +214,23 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
'audio':
await MultipartFile.fromFile(audioPath, filename: 'recording.wav'),
});
final sttResp =
await _dioJson.post('/api/v1/test/stt/transcribe', data: formData);
final sttResp = await _dioJson.post(_sttEndpoint, data: formData);
sttSw.stop();
final sttData = sttResp.data as Map<String, dynamic>;
final text = sttData['text'] ?? '';
setState(() {
_rtResult = 'STT (${sttSw.elapsedMilliseconds}ms): $text';
_rtStatus = 'TTS 合成中...';
_rtResult = '[$label] STT (${sttSw.elapsedMilliseconds}ms): $text';
_rtStatus = '[$label] TTS 合成中...';
});
if (text.isEmpty) {
setState(() => _rtStatus = 'STT 识别为空');
setState(() => _rtStatus = '[$label] STT 识别为空');
return;
}
// 2. TTS
final ttsSw = Stopwatch()..start();
final ttsResp = await _dioBinary.get(
'/api/v1/test/tts/synthesize',
_ttsEndpoint,
queryParameters: {'text': text},
);
ttsSw.stop();
@ -224,14 +238,14 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
final audioBytes = ttsResp.data as List<int>;
setState(() {
_rtResult +=
'\nTTS (${ttsSw.elapsedMilliseconds}ms): ${(audioBytes.length / 1024).toStringAsFixed(1)}KB';
'\n[$label] TTS (${ttsSw.elapsedMilliseconds}ms): ${(audioBytes.length / 1024).toStringAsFixed(1)}KB';
_rtStatus =
'完成!STT=${sttSw.elapsedMilliseconds}ms + TTS=${ttsSw.elapsedMilliseconds}ms = ${totalSw.elapsedMilliseconds}ms';
'[$label] STT=${sttSw.elapsedMilliseconds}ms + TTS=${ttsSw.elapsedMilliseconds}ms = ${totalSw.elapsedMilliseconds}ms';
});
await _playWav(Uint8List.fromList(audioBytes));
} catch (e) {
totalSw.stop();
setState(() => _rtStatus = '错误: $e');
setState(() => _rtStatus = '[$label] 错误: $e');
} finally {
_cleanupFile(audioPath);
}
@ -259,6 +273,39 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
body: ListView(
padding: const EdgeInsets.all(16),
children: [
// Provider toggle
Center(
child: SegmentedButton<_VoiceProvider>(
segments: const [
ButtonSegment(
value: _VoiceProvider.local,
label: Text('Local'),
icon: Icon(Icons.computer, size: 18),
),
ButtonSegment(
value: _VoiceProvider.openai,
label: Text('OpenAI'),
icon: Icon(Icons.cloud, size: 18),
),
],
selected: {_provider},
onSelectionChanged: (Set<_VoiceProvider> sel) {
setState(() => _provider = sel.first);
},
),
),
Padding(
padding: const EdgeInsets.only(top: 4, bottom: 12),
child: Center(
child: Text(
_provider == _VoiceProvider.openai
? 'STT: gpt-4o-transcribe | TTS: tts-1'
: 'STT: faster-whisper | TTS: Kokoro',
style: TextStyle(fontSize: 12, color: Colors.grey[500]),
),
),
),
// TTS Section
_buildSection(
title: 'TTS (文本转语音)',
child: Column(
@ -287,12 +334,14 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
Padding(
padding: const EdgeInsets.only(top: 8),
child: Text(_ttsStatus,
style: TextStyle(color: Colors.grey[600], fontSize: 13)),
style:
TextStyle(color: Colors.grey[600], fontSize: 13)),
),
],
),
),
const SizedBox(height: 16),
// STT Section
_buildSection(
title: 'STT (语音转文本)',
child: Column(
@ -304,8 +353,9 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
child: ElevatedButton.icon(
onPressed: () {},
style: ElevatedButton.styleFrom(
backgroundColor:
_isRecording && _recordMode == 'stt' ? Colors.red : null,
backgroundColor: _isRecording && _recordMode == 'stt'
? Colors.red
: null,
),
icon: Icon(_isRecording && _recordMode == 'stt'
? Icons.mic
@ -319,7 +369,8 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
Padding(
padding: const EdgeInsets.only(top: 8),
child: Text(_sttStatus,
style: TextStyle(color: Colors.grey[600], fontSize: 13)),
style:
TextStyle(color: Colors.grey[600], fontSize: 13)),
),
if (_sttResult.isNotEmpty)
Container(
@ -330,12 +381,14 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
color: Colors.grey[100],
borderRadius: BorderRadius.circular(8),
),
child: Text(_sttResult, style: const TextStyle(fontSize: 16)),
child:
Text(_sttResult, style: const TextStyle(fontSize: 16)),
),
],
),
),
const SizedBox(height: 16),
// Round-trip Section
_buildSection(
title: 'Round-trip (STT + TTS)',
subtitle: '录音 → 识别文本 → 合成语音播放',
@ -348,8 +401,9 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
child: ElevatedButton.icon(
onPressed: () {},
style: ElevatedButton.styleFrom(
backgroundColor:
_isRecording && _recordMode == 'rt' ? Colors.red : null,
backgroundColor: _isRecording && _recordMode == 'rt'
? Colors.red
: null,
),
icon: Icon(_isRecording && _recordMode == 'rt'
? Icons.mic
@ -363,7 +417,8 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
Padding(
padding: const EdgeInsets.only(top: 8),
child: Text(_rtStatus,
style: TextStyle(color: Colors.grey[600], fontSize: 13)),
style:
TextStyle(color: Colors.grey[600], fontSize: 13)),
),
if (_rtResult.isNotEmpty)
Container(
@ -374,7 +429,8 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
color: Colors.grey[100],
borderRadius: BorderRadius.circular(8),
),
child: Text(_rtResult, style: const TextStyle(fontSize: 14)),
child:
Text(_rtResult, style: const TextStyle(fontSize: 14)),
),
],
),

View File

@ -6,6 +6,7 @@ misaki==0.7.17
silero-vad==5.1
twilio==9.0.0
anthropic>=0.32.0
openai>=1.30.0
websockets==12.0
pydantic==2.6.0
pydantic-settings==2.2.0

View File

@ -2,7 +2,9 @@
import asyncio
import io
import os
import struct
import tempfile
import numpy as np
from fastapi import APIRouter, Request, Query, UploadFile, File
from fastapi.responses import HTMLResponse, Response
@ -245,9 +247,6 @@ async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
if stt is None or stt._model is None:
return {"error": "STT model not loaded", "text": ""}
import tempfile
import os
# Save uploaded file to temp
raw = await audio.read()
suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm"
@ -276,3 +275,83 @@ async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
}
finally:
os.unlink(tmp_path)
# =====================================================================
# OpenAI Voice API endpoints
# =====================================================================
def _get_openai_client():
    """Build an OpenAI client from environment configuration.

    Reads OPENAI_API_KEY (required) and OPENAI_BASE_URL (optional, e.g. a
    proxy or compatible gateway). Returns None when no API key is set so
    callers can degrade gracefully instead of raising at request time.
    """
    from openai import OpenAI

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        return None
    base_url = os.environ.get("OPENAI_BASE_URL")
    if base_url:
        return OpenAI(api_key=api_key, base_url=base_url)
    return OpenAI(api_key=api_key)
@router.get("/tts/synthesize-openai")
async def tts_synthesize_openai(
    text: str = Query(..., min_length=1, max_length=500),
    # `pattern=` replaces the `regex=` keyword deprecated with Pydantic v2
    # (this project pins pydantic 2.6.0).
    model: str = Query("tts-1", pattern="^(tts-1|tts-1-hd|gpt-4o-mini-tts)$"),
    voice: str = Query("alloy", pattern="^(alloy|ash|ballad|coral|echo|fable|nova|onyx|sage|shimmer)$"),
):
    """Synthesize text to audio via the OpenAI TTS API.

    Mirrors the local /tts/synthesize endpoint: on success it returns raw
    WAV bytes so clients can treat both providers identically.

    Returns:
        200 audio/wav with the synthesized speech;
        503 plain text if OPENAI_API_KEY is not configured;
        500 plain text if the upstream API call fails.
    """
    client = _get_openai_client()
    if client is None:
        return Response(content="OPENAI_API_KEY not configured", status_code=503)

    # We are inside a coroutine, so a loop is guaranteed to be running;
    # get_running_loop() is the non-deprecated way to obtain it.
    loop = asyncio.get_running_loop()

    def _synth() -> bytes:
        # The OpenAI SDK client is synchronous; this runs in the default
        # executor so the event loop is not blocked during network I/O.
        response = client.audio.speech.create(
            model=model,
            voice=voice,
            input=text,
            response_format="wav",
        )
        return response.content

    try:
        wav_bytes = await loop.run_in_executor(None, _synth)
        return Response(content=wav_bytes, media_type="audio/wav")
    except Exception as e:
        return Response(content=f"OpenAI TTS error: {e}", status_code=500)
@router.post("/stt/transcribe-openai")
async def stt_transcribe_openai(
    audio: UploadFile = File(...),
    # `pattern=` replaces the `regex=` keyword deprecated with Pydantic v2
    # (this project pins pydantic 2.6.0).
    model: str = Query("gpt-4o-transcribe", pattern="^(whisper-1|gpt-4o-transcribe|gpt-4o-mini-transcribe)$"),
    # Generalized from the previously hard-coded "zh"; default preserves
    # the old behavior for existing callers.
    language: str = Query("zh", min_length=2, max_length=8),
):
    """Transcribe uploaded audio via the OpenAI STT API.

    Mirrors the local /stt/transcribe endpoint's response shape so the
    client can compare providers directly.

    Returns:
        {"text": ..., "language": ..., "model": ...} on success, or
        {"error": ..., "text": ""} when the key is missing, the upload is
        empty, or the upstream API call fails.
    """
    client = _get_openai_client()
    if client is None:
        return {"error": "OPENAI_API_KEY not configured", "text": ""}

    raw = await audio.read()
    if not raw:
        # Fail fast instead of sending an empty file upstream.
        return {"error": "empty audio upload", "text": ""}

    # The SDK wants a real file handle with a recognizable extension, so
    # spool the upload to a temp file that survives past this `with`.
    suffix = os.path.splitext(audio.filename or "audio.wav")[1] or ".wav"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
        f.write(raw)
        tmp_path = f.name
    try:
        # Non-deprecated replacement for get_event_loop() in a coroutine.
        loop = asyncio.get_running_loop()

        def _transcribe() -> str:
            # Synchronous SDK call; executed in the default thread pool.
            with open(tmp_path, "rb") as af:
                result = client.audio.transcriptions.create(
                    model=model,
                    file=af,
                    language=language,
                )
            return result.text

        text = await loop.run_in_executor(None, _transcribe)
        return {"text": text, "language": language, "model": model}
    except Exception as e:
        return {"error": f"OpenAI STT error: {e}", "text": ""}
    finally:
        # Best-effort cleanup; never let a missing temp file mask the
        # real response or error.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass