feat: add OpenAI TTS/STT API endpoints for comparison testing

- Add openai package to voice-service requirements
- Add /api/v1/test/tts/synthesize-openai (tts-1/tts-1-hd/gpt-4o-mini-tts)
- Add /api/v1/test/stt/transcribe-openai (gpt-4o-transcribe/whisper-1)
- Add OPENAI_API_KEY and OPENAI_BASE_URL env vars to voice-service
- Flutter test page: SegmentedButton to toggle Local/OpenAI provider
- All endpoints maintain same response format for easy comparison

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-24 07:20:03 -08:00
parent ac0b8ee1c6
commit d43baed3a5
4 changed files with 172 additions and 34 deletions

View File

@ -327,6 +327,8 @@ services:
- KOKORO_MODEL=${KOKORO_MODEL:-kokoro-82m}
- KOKORO_VOICE=${KOKORO_VOICE:-zf_xiaoxiao}
- DEVICE=${VOICE_DEVICE:-cpu}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
healthcheck:
test: ["CMD-SHELL", "python3 -c \"import urllib.request; urllib.request.urlopen('http://localhost:3008/docs')\""]
interval: 30s

View File

@ -10,8 +10,10 @@ import 'package:path/path.dart' as p;
import 'package:permission_handler/permission_handler.dart';
import '../../../../core/network/dio_client.dart';
enum _VoiceProvider { local, openai }
/// Temporary voice I/O test page: TTS + STT + Round-trip.
/// Uses flutter_sound for both recording and playback.
/// Supports switching between Local (Kokoro/faster-whisper) and OpenAI APIs.
class VoiceTestPage extends ConsumerStatefulWidget {
const VoiceTestPage({super.key});
@ -28,6 +30,8 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
bool _playerReady = false;
bool _recorderReady = false;
_VoiceProvider _provider = _VoiceProvider.local;
String _ttsStatus = '';
String _sttStatus = '';
String _sttResult = '';
@ -38,13 +42,22 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
String _recordMode = ''; // 'stt' or 'rt'
String? _recordingPath;
// Endpoint paths based on provider
String get _ttsEndpoint => _provider == _VoiceProvider.openai
? '/api/v1/test/tts/synthesize-openai'
: '/api/v1/test/tts/synthesize';
String get _sttEndpoint => _provider == _VoiceProvider.openai
? '/api/v1/test/stt/transcribe-openai'
: '/api/v1/test/stt/transcribe';
Dio get _dioBinary {
final base = ref.read(dioClientProvider);
return Dio(BaseOptions(
baseUrl: base.options.baseUrl,
headers: Map.from(base.options.headers),
connectTimeout: const Duration(seconds: 30),
receiveTimeout: const Duration(seconds: 60),
receiveTimeout: const Duration(seconds: 120),
responseType: ResponseType.bytes,
))..interceptors.addAll(base.interceptors);
}
@ -77,26 +90,27 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
Future<void> _doTTS() async {
final text = _ttsController.text.trim();
if (text.isEmpty) return;
final label = _provider == _VoiceProvider.openai ? 'OpenAI' : 'Local';
setState(() {
_isSynthesizing = true;
_ttsStatus = '合成中...';
_ttsStatus = '[$label] 合成中...';
});
final sw = Stopwatch()..start();
try {
final resp = await _dioBinary.get(
'/api/v1/test/tts/synthesize',
_ttsEndpoint,
queryParameters: {'text': text},
);
sw.stop();
final bytes = resp.data as List<int>;
setState(() {
_ttsStatus =
'完成!耗时 ${sw.elapsedMilliseconds}ms大小 ${(bytes.length / 1024).toStringAsFixed(1)}KB';
'[$label] 完成!耗时 ${sw.elapsedMilliseconds}ms大小 ${(bytes.length / 1024).toStringAsFixed(1)}KB';
});
await _playWav(Uint8List.fromList(bytes));
} catch (e) {
sw.stop();
setState(() => _ttsStatus = '错误: $e');
setState(() => _ttsStatus = '[$label] 错误: $e');
} finally {
setState(() => _isSynthesizing = false);
}
@ -161,26 +175,26 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
// ---- STT ----
Future<void> _doSTT(String audioPath) async {
setState(() => _sttStatus = '识别中...');
final label = _provider == _VoiceProvider.openai ? 'OpenAI' : 'Local';
setState(() => _sttStatus = '[$label] 识别中...');
final sw = Stopwatch()..start();
try {
final formData = FormData.fromMap({
'audio':
await MultipartFile.fromFile(audioPath, filename: 'recording.wav'),
});
final resp =
await _dioJson.post('/api/v1/test/stt/transcribe', data: formData);
final resp = await _dioJson.post(_sttEndpoint, data: formData);
sw.stop();
final data = resp.data as Map<String, dynamic>;
setState(() {
_sttResult = data['text'] ?? '(empty)';
_sttStatus =
'完成!耗时 ${sw.elapsedMilliseconds}ms时长 ${data['duration'] ?? 0}s';
final extra = data['duration'] != null ? ',时长 ${data['duration']}s' : '';
_sttStatus = '[$label] 完成!耗时 ${sw.elapsedMilliseconds}ms$extra';
});
} catch (e) {
sw.stop();
setState(() {
_sttStatus = '错误: $e';
_sttStatus = '[$label] 错误: $e';
_sttResult = '';
});
} finally {
@ -190,7 +204,8 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
// ---- Round-trip: STT → TTS ----
Future<void> _doRoundTrip(String audioPath) async {
setState(() => _rtStatus = 'STT 识别中...');
final label = _provider == _VoiceProvider.openai ? 'OpenAI' : 'Local';
setState(() => _rtStatus = '[$label] STT 识别中...');
final totalSw = Stopwatch()..start();
try {
// 1. STT
@ -199,24 +214,23 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
'audio':
await MultipartFile.fromFile(audioPath, filename: 'recording.wav'),
});
final sttResp =
await _dioJson.post('/api/v1/test/stt/transcribe', data: formData);
final sttResp = await _dioJson.post(_sttEndpoint, data: formData);
sttSw.stop();
final sttData = sttResp.data as Map<String, dynamic>;
final text = sttData['text'] ?? '';
setState(() {
_rtResult = 'STT (${sttSw.elapsedMilliseconds}ms): $text';
_rtStatus = 'TTS 合成中...';
_rtResult = '[$label] STT (${sttSw.elapsedMilliseconds}ms): $text';
_rtStatus = '[$label] TTS 合成中...';
});
if (text.isEmpty) {
setState(() => _rtStatus = 'STT 识别为空');
setState(() => _rtStatus = '[$label] STT 识别为空');
return;
}
// 2. TTS
final ttsSw = Stopwatch()..start();
final ttsResp = await _dioBinary.get(
'/api/v1/test/tts/synthesize',
_ttsEndpoint,
queryParameters: {'text': text},
);
ttsSw.stop();
@ -224,14 +238,14 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
final audioBytes = ttsResp.data as List<int>;
setState(() {
_rtResult +=
'\nTTS (${ttsSw.elapsedMilliseconds}ms): ${(audioBytes.length / 1024).toStringAsFixed(1)}KB';
'\n[$label] TTS (${ttsSw.elapsedMilliseconds}ms): ${(audioBytes.length / 1024).toStringAsFixed(1)}KB';
_rtStatus =
'完成!STT=${sttSw.elapsedMilliseconds}ms + TTS=${ttsSw.elapsedMilliseconds}ms = ${totalSw.elapsedMilliseconds}ms';
'[$label] STT=${sttSw.elapsedMilliseconds}ms + TTS=${ttsSw.elapsedMilliseconds}ms = ${totalSw.elapsedMilliseconds}ms';
});
await _playWav(Uint8List.fromList(audioBytes));
} catch (e) {
totalSw.stop();
setState(() => _rtStatus = '错误: $e');
setState(() => _rtStatus = '[$label] 错误: $e');
} finally {
_cleanupFile(audioPath);
}
@ -259,6 +273,39 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
body: ListView(
padding: const EdgeInsets.all(16),
children: [
// Provider toggle
Center(
child: SegmentedButton<_VoiceProvider>(
segments: const [
ButtonSegment(
value: _VoiceProvider.local,
label: Text('Local'),
icon: Icon(Icons.computer, size: 18),
),
ButtonSegment(
value: _VoiceProvider.openai,
label: Text('OpenAI'),
icon: Icon(Icons.cloud, size: 18),
),
],
selected: {_provider},
onSelectionChanged: (Set<_VoiceProvider> sel) {
setState(() => _provider = sel.first);
},
),
),
Padding(
padding: const EdgeInsets.only(top: 4, bottom: 12),
child: Center(
child: Text(
_provider == _VoiceProvider.openai
? 'STT: gpt-4o-transcribe | TTS: tts-1'
: 'STT: faster-whisper | TTS: Kokoro',
style: TextStyle(fontSize: 12, color: Colors.grey[500]),
),
),
),
// TTS Section
_buildSection(
title: 'TTS (文本转语音)',
child: Column(
@ -287,12 +334,14 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
Padding(
padding: const EdgeInsets.only(top: 8),
child: Text(_ttsStatus,
style: TextStyle(color: Colors.grey[600], fontSize: 13)),
style:
TextStyle(color: Colors.grey[600], fontSize: 13)),
),
],
),
),
const SizedBox(height: 16),
// STT Section
_buildSection(
title: 'STT (语音转文本)',
child: Column(
@ -304,8 +353,9 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
child: ElevatedButton.icon(
onPressed: () {},
style: ElevatedButton.styleFrom(
backgroundColor:
_isRecording && _recordMode == 'stt' ? Colors.red : null,
backgroundColor: _isRecording && _recordMode == 'stt'
? Colors.red
: null,
),
icon: Icon(_isRecording && _recordMode == 'stt'
? Icons.mic
@ -319,7 +369,8 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
Padding(
padding: const EdgeInsets.only(top: 8),
child: Text(_sttStatus,
style: TextStyle(color: Colors.grey[600], fontSize: 13)),
style:
TextStyle(color: Colors.grey[600], fontSize: 13)),
),
if (_sttResult.isNotEmpty)
Container(
@ -330,12 +381,14 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
color: Colors.grey[100],
borderRadius: BorderRadius.circular(8),
),
child: Text(_sttResult, style: const TextStyle(fontSize: 16)),
child:
Text(_sttResult, style: const TextStyle(fontSize: 16)),
),
],
),
),
const SizedBox(height: 16),
// Round-trip Section
_buildSection(
title: 'Round-trip (STT + TTS)',
subtitle: '录音 → 识别文本 → 合成语音播放',
@ -348,8 +401,9 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
child: ElevatedButton.icon(
onPressed: () {},
style: ElevatedButton.styleFrom(
backgroundColor:
_isRecording && _recordMode == 'rt' ? Colors.red : null,
backgroundColor: _isRecording && _recordMode == 'rt'
? Colors.red
: null,
),
icon: Icon(_isRecording && _recordMode == 'rt'
? Icons.mic
@ -363,7 +417,8 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
Padding(
padding: const EdgeInsets.only(top: 8),
child: Text(_rtStatus,
style: TextStyle(color: Colors.grey[600], fontSize: 13)),
style:
TextStyle(color: Colors.grey[600], fontSize: 13)),
),
if (_rtResult.isNotEmpty)
Container(
@ -374,7 +429,8 @@ class _VoiceTestPageState extends ConsumerState<VoiceTestPage> {
color: Colors.grey[100],
borderRadius: BorderRadius.circular(8),
),
child: Text(_rtResult, style: const TextStyle(fontSize: 14)),
child:
Text(_rtResult, style: const TextStyle(fontSize: 14)),
),
],
),

View File

@ -6,6 +6,7 @@ misaki==0.7.17
silero-vad==5.1
twilio==9.0.0
anthropic>=0.32.0
openai>=1.30.0
websockets==12.0
pydantic==2.6.0
pydantic-settings==2.2.0

View File

@ -2,7 +2,9 @@
import asyncio
import io
import os
import struct
import tempfile
import numpy as np
from fastapi import APIRouter, Request, Query, UploadFile, File
from fastapi.responses import HTMLResponse, Response
@ -245,9 +247,6 @@ async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
if stt is None or stt._model is None:
return {"error": "STT model not loaded", "text": ""}
import tempfile
import os
# Save uploaded file to temp
raw = await audio.read()
suffix = os.path.splitext(audio.filename or "audio.webm")[1] or ".webm"
@ -276,3 +275,83 @@ async def stt_transcribe(request: Request, audio: UploadFile = File(...)):
}
finally:
os.unlink(tmp_path)
# =====================================================================
# OpenAI Voice API endpoints
# =====================================================================
def _get_openai_client():
    """Build an OpenAI client from environment configuration.

    Reads OPENAI_API_KEY (required) and OPENAI_BASE_URL (optional, e.g. a
    proxy or compatible gateway). Returns None when no API key is set so
    callers can degrade gracefully instead of raising at request time.
    """
    from openai import OpenAI

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        return None
    base_url = os.environ.get("OPENAI_BASE_URL")
    if base_url:
        return OpenAI(api_key=api_key, base_url=base_url)
    return OpenAI(api_key=api_key)
@router.get("/tts/synthesize-openai")
async def tts_synthesize_openai(
    text: str = Query(..., min_length=1, max_length=500),
    # `pattern=` replaces the `regex=` keyword deprecated with Pydantic v2
    # (this project pins pydantic 2.6.0).
    model: str = Query("tts-1", pattern="^(tts-1|tts-1-hd|gpt-4o-mini-tts)$"),
    voice: str = Query("alloy", pattern="^(alloy|ash|ballad|coral|echo|fable|nova|onyx|sage|shimmer)$"),
):
    """Synthesize text to audio via the OpenAI TTS API.

    Mirrors the local /tts/synthesize endpoint: on success it returns raw
    WAV bytes so clients can treat both providers identically.

    Returns:
        200 audio/wav with the synthesized speech;
        503 plain text if OPENAI_API_KEY is not configured;
        500 plain text if the upstream API call fails.
    """
    client = _get_openai_client()
    if client is None:
        return Response(content="OPENAI_API_KEY not configured", status_code=503)

    # We are inside a coroutine, so a loop is guaranteed to be running;
    # get_running_loop() is the non-deprecated way to obtain it.
    loop = asyncio.get_running_loop()

    def _synth() -> bytes:
        # The OpenAI SDK client is synchronous; this runs in the default
        # executor so the event loop is not blocked during network I/O.
        response = client.audio.speech.create(
            model=model,
            voice=voice,
            input=text,
            response_format="wav",
        )
        return response.content

    try:
        wav_bytes = await loop.run_in_executor(None, _synth)
        return Response(content=wav_bytes, media_type="audio/wav")
    except Exception as e:
        return Response(content=f"OpenAI TTS error: {e}", status_code=500)
@router.post("/stt/transcribe-openai")
async def stt_transcribe_openai(
    audio: UploadFile = File(...),
    # `pattern=` replaces the `regex=` keyword deprecated with Pydantic v2
    # (this project pins pydantic 2.6.0).
    model: str = Query("gpt-4o-transcribe", pattern="^(whisper-1|gpt-4o-transcribe|gpt-4o-mini-transcribe)$"),
    # Generalized from the previously hard-coded "zh"; default preserves
    # the old behavior for existing callers.
    language: str = Query("zh", min_length=2, max_length=8),
):
    """Transcribe uploaded audio via the OpenAI STT API.

    Mirrors the local /stt/transcribe endpoint's response shape so the
    client can compare providers directly.

    Returns:
        {"text": ..., "language": ..., "model": ...} on success, or
        {"error": ..., "text": ""} when the key is missing, the upload is
        empty, or the upstream API call fails.
    """
    client = _get_openai_client()
    if client is None:
        return {"error": "OPENAI_API_KEY not configured", "text": ""}

    raw = await audio.read()
    if not raw:
        # Fail fast instead of sending an empty file upstream.
        return {"error": "empty audio upload", "text": ""}

    # The SDK wants a real file handle with a recognizable extension, so
    # spool the upload to a temp file that survives past this `with`.
    suffix = os.path.splitext(audio.filename or "audio.wav")[1] or ".wav"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
        f.write(raw)
        tmp_path = f.name
    try:
        # Non-deprecated replacement for get_event_loop() in a coroutine.
        loop = asyncio.get_running_loop()

        def _transcribe() -> str:
            # Synchronous SDK call; executed in the default thread pool.
            with open(tmp_path, "rb") as af:
                result = client.audio.transcriptions.create(
                    model=model,
                    file=af,
                    language=language,
                )
            return result.text

        text = await loop.run_in_executor(None, _transcribe)
        return {"text": text, "language": language, "model": model}
    except Exception as e:
        return {"error": f"OpenAI STT error: {e}", "text": ""}
    finally:
        # Best-effort cleanup; never let a missing temp file mask the
        # real response or error.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass