fix: rewrite voice pipeline for direct WebSocket I/O, fix TTS and navigation
Root cause: Pipecat's WebsocketServerTransport creates its own WebSocket server on (host, port) and expects FrameProcessor subclasses. Our code was passing a FastAPI WebSocket object as 'host' and using plain STT/TTS/VAD service classes that aren't FrameProcessors. The pipeline crashed immediately when receiving audio, causing "disconnects when speaking".

Changes:
- **base_pipeline.py**: Complete rewrite — replaced the Pipecat Pipeline with a direct async loop: WebSocket → VAD → STT → Claude LLM → TTS → WebSocket. Supports barge-in (interrupt TTS when the user speaks), audio chunking, and 24kHz→16kHz TTS resampling.
- **session_router.py**: Pass the WebSocket directly to the pipeline instead of wrapping it in AppTransport.
- **app_transport.py**: Deprecated (no longer needed).
- **kokoro_service.py**: Fix misaki compatibility (MutableToken→MToken rename), use the correct Chinese voice 'zf_xiaoxiao', handle torch tensors.
- **main.py**: Apply the misaki monkey-patch before importing kokoro.
- **settings.py**: Change the default TTS voice from 'zh_female_1' (non-existent) to 'zf_xiaoxiao' (valid Kokoro-82M Chinese female voice).
- **requirements.txt**: Remove the pipecat-ai dependency, pin kokoro==0.3.5 + misaki==0.7.17, add Chinese NLP deps (pypinyin, cn2an, jieba, ordered-set).
- **agent_call_page.dart**: Wrap each cleanup step in try/catch to ensure Navigator.pop() always executes after the call ends. Add a 3s timeout on the session delete request.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
6cd53e713c
commit
7afbd54fce
|
|
@ -389,32 +389,37 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
|
||||||
_reconnectTimer?.cancel();
|
_reconnectTimer?.cancel();
|
||||||
_reconnectTimer = null;
|
_reconnectTimer = null;
|
||||||
|
|
||||||
// Stop mic
|
// Cleanup all resources — each wrapped in try/catch to ensure we always
|
||||||
await _micSubscription?.cancel();
|
// reach Navigator.pop() at the end.
|
||||||
_micSubscription = null;
|
try {
|
||||||
|
await _micSubscription?.cancel();
|
||||||
|
_micSubscription = null;
|
||||||
|
} catch (_) {}
|
||||||
try {
|
try {
|
||||||
await _recorder.stop();
|
await _recorder.stop();
|
||||||
} catch (_) {}
|
} catch (_) {}
|
||||||
|
try {
|
||||||
// Stop playback
|
await _pcmPlayer.dispose();
|
||||||
await _pcmPlayer.dispose();
|
} catch (_) {}
|
||||||
|
try {
|
||||||
// Close WebSocket
|
_audioSubscription?.cancel();
|
||||||
_audioSubscription?.cancel();
|
} catch (_) {}
|
||||||
try {
|
try {
|
||||||
await _audioChannel?.sink.close();
|
await _audioChannel?.sink.close();
|
||||||
} catch (_) {}
|
} catch (_) {}
|
||||||
|
|
||||||
// Delete voice session on the server
|
// Delete voice session on the server (fire-and-forget)
|
||||||
if (_sessionId != null) {
|
if (_sessionId != null) {
|
||||||
try {
|
try {
|
||||||
final dio = ref.read(dioClientProvider);
|
final dio = ref.read(dioClientProvider);
|
||||||
await dio.delete('${ApiEndpoints.voice}/sessions/$_sessionId');
|
await dio.delete('${ApiEndpoints.voice}/sessions/$_sessionId')
|
||||||
|
.timeout(const Duration(seconds: 3));
|
||||||
} catch (_) {}
|
} catch (_) {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Navigate back to the dial page
|
||||||
if (mounted) {
|
if (mounted) {
|
||||||
await Future.delayed(const Duration(seconds: 1));
|
await Future.delayed(const Duration(milliseconds: 500));
|
||||||
if (mounted) Navigator.of(context).pop();
|
if (mounted) Navigator.of(context).pop();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,20 @@
|
||||||
fastapi==0.110.0
|
fastapi==0.110.0
|
||||||
uvicorn==0.29.0
|
uvicorn==0.29.0
|
||||||
pipecat-ai==0.0.30
|
|
||||||
faster-whisper==1.2.1
|
faster-whisper==1.2.1
|
||||||
kokoro==0.3.0
|
kokoro==0.3.5
|
||||||
|
misaki==0.7.17
|
||||||
silero-vad==5.1
|
silero-vad==5.1
|
||||||
twilio==9.0.0
|
twilio==9.0.0
|
||||||
anthropic==0.32.0
|
anthropic>=0.32.0
|
||||||
openai>=1.0.0
|
|
||||||
websockets==12.0
|
websockets==12.0
|
||||||
pydantic==2.6.0
|
pydantic==2.6.0
|
||||||
pydantic-settings==2.2.0
|
pydantic-settings==2.2.0
|
||||||
python-dotenv==1.0.0
|
python-dotenv==1.0.0
|
||||||
python-multipart==0.0.9
|
python-multipart==0.0.9
|
||||||
httpx==0.27.0
|
httpx==0.27.0
|
||||||
numpy==1.26.4
|
numpy>=1.26.4
|
||||||
|
torch>=2.0.0
|
||||||
|
ordered-set
|
||||||
|
pypinyin
|
||||||
|
cn2an
|
||||||
|
jieba
|
||||||
|
|
|
||||||
|
|
@ -106,13 +106,15 @@ def _load_models_sync():
|
||||||
|
|
||||||
# TTS
|
# TTS
|
||||||
try:
|
try:
|
||||||
from ..tts.kokoro_service import KokoroTTSService
|
from ..tts.kokoro_service import KokoroTTSService, _patch_misaki_compat
|
||||||
|
|
||||||
|
_patch_misaki_compat()
|
||||||
from kokoro import KPipeline
|
from kokoro import KPipeline
|
||||||
|
|
||||||
tts = KokoroTTSService(model=settings.kokoro_model, voice=settings.kokoro_voice)
|
tts = KokoroTTSService(model=settings.kokoro_model, voice=settings.kokoro_voice)
|
||||||
tts._pipeline = KPipeline(lang_code='z')
|
tts._pipeline = KPipeline(lang_code='z')
|
||||||
app.state.tts = tts
|
app.state.tts = tts
|
||||||
_p(f"[bg] TTS loaded: {settings.kokoro_model}")
|
_p(f"[bg] TTS loaded: {settings.kokoro_model} voice={settings.kokoro_voice}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
app.state.tts = None
|
app.state.tts = None
|
||||||
_p(f"[bg] WARNING: TTS failed: {e}")
|
_p(f"[bg] WARNING: TTS failed: {e}")
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,6 @@ from pydantic import BaseModel
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..config.settings import settings
|
from ..config.settings import settings
|
||||||
from ..pipeline.app_transport import AppTransport
|
|
||||||
from ..pipeline.base_pipeline import create_voice_pipeline
|
from ..pipeline.base_pipeline import create_voice_pipeline
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -213,9 +212,6 @@ async def voice_websocket(websocket: WebSocket, session_id: str):
|
||||||
json.dumps({"type": "session.resumed", "session_id": session_id})
|
json.dumps({"type": "session.resumed", "session_id": session_id})
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create the AppTransport from the websocket connection
|
|
||||||
transport = AppTransport(websocket)
|
|
||||||
|
|
||||||
# Build the session context from stored session data
|
# Build the session context from stored session data
|
||||||
session_context = {
|
session_context = {
|
||||||
"session_id": session_id,
|
"session_id": session_id,
|
||||||
|
|
@ -223,9 +219,9 @@ async def voice_websocket(websocket: WebSocket, session_id: str):
|
||||||
"agent_context": session.get("agent_context", {}),
|
"agent_context": session.get("agent_context", {}),
|
||||||
}
|
}
|
||||||
|
|
||||||
# Create the Pipecat voice pipeline using shared services from app.state
|
# Create the voice pipeline using the WebSocket directly
|
||||||
task = await create_voice_pipeline(
|
task = await create_voice_pipeline(
|
||||||
transport,
|
websocket,
|
||||||
session_context,
|
session_context,
|
||||||
stt=getattr(app.state, "stt", None),
|
stt=getattr(app.state, "stt", None),
|
||||||
tts=getattr(app.state, "tts", None),
|
tts=getattr(app.state, "tts", None),
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,7 @@ class Settings(BaseSettings):
|
||||||
|
|
||||||
# TTS (Kokoro)
|
# TTS (Kokoro)
|
||||||
kokoro_model: str = "kokoro-82m"
|
kokoro_model: str = "kokoro-82m"
|
||||||
kokoro_voice: str = "zh_female_1"
|
kokoro_voice: str = "zf_xiaoxiao"
|
||||||
|
|
||||||
# Device (cpu or cuda)
|
# Device (cpu or cuda)
|
||||||
device: str = "cpu"
|
device: str = "cpu"
|
||||||
|
|
|
||||||
|
|
@ -1,31 +1,2 @@
|
||||||
"""
|
# This module is no longer used.
|
||||||
Flutter App WebSocket audio transport.
|
# Audio transport is now handled directly in base_pipeline.py via FastAPI WebSocket.
|
||||||
|
|
||||||
- Input: PCM 16kHz 16bit mono (Flutter recording format)
|
|
||||||
- Output: PCM 16kHz 16bit mono (Flutter playback format)
|
|
||||||
"""
|
|
||||||
|
|
||||||
from pipecat.transports.network.websocket_server import WebsocketServerTransport, WebsocketServerParams
|
|
||||||
|
|
||||||
|
|
||||||
class AppTransport:
|
|
||||||
"""WebSocket transport for Flutter App audio streaming."""
|
|
||||||
|
|
||||||
def __init__(self, websocket):
|
|
||||||
self.websocket = websocket
|
|
||||||
self.sample_rate = 16000
|
|
||||||
self._transport = WebsocketServerTransport(
|
|
||||||
websocket,
|
|
||||||
params=WebsocketServerParams(
|
|
||||||
audio_in_sample_rate=16000,
|
|
||||||
audio_out_sample_rate=16000,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
def input(self):
|
|
||||||
"""Audio input processor — receives PCM 16kHz 16bit mono from Flutter."""
|
|
||||||
return self._transport.input()
|
|
||||||
|
|
||||||
def output(self):
|
|
||||||
"""Audio output processor — sends PCM 16kHz 16bit mono to Flutter."""
|
|
||||||
return self._transport.output()
|
|
||||||
|
|
|
||||||
|
|
@ -1,69 +1,324 @@
|
||||||
"""
|
"""
|
||||||
Pipecat Pipeline core -- Voice dialogue pipeline definition.
|
Voice dialogue pipeline — direct WebSocket audio I/O.
|
||||||
|
|
||||||
Pipeline: Audio Input -> VAD -> STT -> LLM -> TTS -> Audio Output
|
Pipeline: Audio Input → VAD → STT → LLM → TTS → Audio Output
|
||||||
- Supports interruption (barge-in)
|
|
||||||
- Supports tool_use forwarding to agent-service
|
Runs as an async task that reads binary PCM frames from a FastAPI WebSocket,
|
||||||
|
detects speech with VAD, transcribes with STT, generates a response via
|
||||||
|
Claude LLM, synthesizes speech with TTS, and sends audio back.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pipecat.pipeline.pipeline import Pipeline
|
import asyncio
|
||||||
from pipecat.pipeline.task import PipelineTask
|
import logging
|
||||||
from pipecat.services.anthropic import AnthropicLLMService
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import anthropic
|
||||||
|
import numpy as np
|
||||||
|
from fastapi import WebSocket
|
||||||
|
|
||||||
from ..config.settings import settings
|
from ..config.settings import settings
|
||||||
from ..stt.whisper_service import WhisperSTTService
|
from ..stt.whisper_service import WhisperSTTService
|
||||||
from ..tts.kokoro_service import KokoroTTSService
|
from ..tts.kokoro_service import KokoroTTSService
|
||||||
from ..vad.silero_service import SileroVADService
|
from ..vad.silero_service import SileroVADService
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
async def create_voice_pipeline(transport, session_context, *, stt=None, tts=None, vad=None):
|
# Minimum speech duration in seconds before we transcribe
|
||||||
"""
|
_MIN_SPEECH_SECS = 0.5
|
||||||
Create a Pipecat voice dialogue pipeline.
|
# Silence duration in seconds after speech ends before we process
|
||||||
|
_SILENCE_AFTER_SPEECH_SECS = 0.8
|
||||||
|
# Sample rate
|
||||||
|
_SAMPLE_RATE = 16000
|
||||||
|
# Bytes per sample (16-bit PCM mono)
|
||||||
|
_BYTES_PER_SAMPLE = 2
|
||||||
|
# VAD chunk size (512 samples = 32ms at 16kHz, Silero expects this)
|
||||||
|
_VAD_CHUNK_SAMPLES = 512
|
||||||
|
_VAD_CHUNK_BYTES = _VAD_CHUNK_SAMPLES * _BYTES_PER_SAMPLE
|
||||||
|
# Max audio output chunk size sent over WebSocket (4KB)
|
||||||
|
_WS_AUDIO_CHUNK = 4096
|
||||||
|
|
||||||
|
|
||||||
|
class VoicePipelineTask:
    """Async voice pipeline that bridges a FastAPI WebSocket to STT/LLM/TTS.

    ``run()`` reads binary PCM frames from the WebSocket, slices them into
    VAD-sized chunks and accumulates an utterance while speech is active.
    Once enough trailing silence has been seen, the utterance is handed to a
    background task that transcribes it, asks the LLM for a reply, synthesizes
    the reply and streams the audio back over the same WebSocket.

    The reply runs in its own task so that the receive loop keeps running
    VAD while TTS audio is being sent; this is what makes barge-in possible.
    Replies are serialized: a new reply is not started until the previous one
    has finished (or has been cut short by barge-in).
    """

    # Sample rate assumed for the audio yielded by the TTS pipeline (Kokoro).
    _TTS_SOURCE_RATE = 24000

    def __init__(
        self,
        websocket: "WebSocket",
        session_context: dict,
        *,
        stt: "Optional[WhisperSTTService]" = None,
        tts: "Optional[KokoroTTSService]" = None,
        vad: "Optional[SileroVADService]" = None,
    ):
        """Store the connection, session metadata and shared services.

        Args:
            websocket: WebSocket used for both audio input and output.
            session_context: Session metadata; only ``session_id`` is read here.
            stt: Speech-to-text service, or None (turns are then skipped).
            tts: Text-to-speech service, or None (replies are then text only).
            vad: Voice-activity detector, or None (every chunk counts as speech).
        """
        self.websocket = websocket
        self.session_context = session_context
        self.stt = stt
        self.tts = tts
        self.vad = vad

        # Message history sent to the LLM on every turn. The persona prompt is
        # carried as the first user turn, followed by a canned greeting.
        self._conversation: "list[dict]" = [
            {
                "role": "user",
                "content": (
                    "You are iAgent, an AI voice assistant for IT operations. "
                    "Respond concisely in Chinese. Keep answers under 2 sentences "
                    "when possible. You are in a real-time voice conversation."
                ),
            },
            {
                "role": "assistant",
                "content": "好的,我是 iAgent 智能运维语音助手。有什么可以帮您的?",
            },
        ]
        self._cancelled = False
        self._speaking = False  # True while a reply is being synthesized/sent
        # Set by the receive loop (barge-in) or cancel() to stop the current
        # reply; initialized here so it exists before the first reply.
        self._cancelled_tts = False
        self._response_task = None  # background task producing the current reply
        self._llm_client = None  # created lazily and reused across turns

    async def run(self):
        """Main loop: read audio → VAD → end-of-utterance detection → reply.

        Durations are measured in audio time (number of buffered samples)
        rather than wall-clock time, so endpointing does not depend on how
        frames happen to be batched on the network. The minimum-speech check
        uses the utterance length *excluding* the trailing silence that
        triggered the end of the utterance.
        """
        session_id = self.session_context.get("session_id")
        logger.info("Voice pipeline started for session %s", session_id)

        chunk_secs = _VAD_CHUNK_SAMPLES / _SAMPLE_RATE
        bytes_per_sec = _SAMPLE_RATE * _BYTES_PER_SAMPLE

        speech_buffer = bytearray()  # audio of the utterance in progress
        vad_buffer = bytearray()  # accumulates until _VAD_CHUNK_BYTES
        is_speech_active = False
        trailing_silence_chunks = 0

        try:
            while not self._cancelled:
                try:
                    data = await asyncio.wait_for(
                        self.websocket.receive_bytes(), timeout=30.0
                    )
                except asyncio.TimeoutError:
                    # No data for 30s — keep waiting; this also re-checks
                    # the cancellation flag.
                    continue
                except Exception as exc:
                    # Treated as "WebSocket closed": leave the loop.
                    logger.debug("WebSocket receive ended: %r", exc)
                    break

                vad_buffer.extend(data)

                while len(vad_buffer) >= _VAD_CHUNK_BYTES:
                    chunk = bytes(vad_buffer[:_VAD_CHUNK_BYTES])
                    del vad_buffer[:_VAD_CHUNK_BYTES]

                    if self._detect_speech(chunk):
                        if not is_speech_active:
                            is_speech_active = True
                            # Barge-in: the user started talking while a reply
                            # is being produced — ask the sender to stop.
                            if self._speaking:
                                self._cancelled_tts = True
                                logger.debug("Barge-in detected")
                        speech_buffer.extend(chunk)
                        trailing_silence_chunks = 0
                        continue

                    if not is_speech_active:
                        # Silence outside of an utterance: nothing to keep.
                        continue

                    # Silence inside an utterance: keep it so short pauses
                    # stay part of the audio handed to STT.
                    speech_buffer.extend(chunk)
                    trailing_silence_chunks += 1
                    silence_secs = trailing_silence_chunks * chunk_secs
                    if silence_secs < _SILENCE_AFTER_SPEECH_SECS:
                        continue

                    # End of utterance.
                    spoken_secs = len(speech_buffer) / bytes_per_sec - silence_secs
                    if spoken_secs >= _MIN_SPEECH_SECS:
                        await self._start_response(bytes(speech_buffer))

                    speech_buffer.clear()
                    is_speech_active = False
                    trailing_silence_chunks = 0

        except asyncio.CancelledError:
            logger.info("Voice pipeline cancelled")
        except Exception as exc:
            logger.exception("Voice pipeline error: %s", exc)
        finally:
            pending = self._response_task
            if pending is not None and not pending.done():
                # Do not keep talking to a pipeline that has ended.
                self._cancelled_tts = True
                pending.cancel()
            logger.info("Voice pipeline ended for session %s", session_id)

    def cancel(self):
        """Ask the pipeline to stop.

        The receive loop exits the next time it wakes up (at the latest when
        its 30 s receive timeout fires); any reply being sent stops at the
        next audio chunk.
        """
        self._cancelled = True
        self._cancelled_tts = True

    def _detect_speech(self, chunk: bytes) -> bool:
        """Run VAD on a single chunk. Returns True if speech detected.

        Without a loaded VAD model, or when the VAD raises, the chunk is
        treated as speech.
        """
        if self.vad is None or self.vad._model is None:
            return True
        try:
            return self.vad.detect_speech(chunk)
        except Exception as exc:
            logger.debug("VAD error: %s", exc)
            return True

    async def _start_response(self, audio_data: bytes):
        """Launch the reply for one utterance as a background task.

        If the previous reply is still running it is awaited first, so turns
        never overlap and the conversation history stays in order.
        """
        previous = self._response_task
        if previous is not None and not previous.done():
            # asyncio.wait() does not re-raise the task's own exception.
            await asyncio.wait({previous})
        self._response_task = asyncio.create_task(self._respond(audio_data))

    async def _respond(self, audio_data: bytes):
        """Task body for one turn; logs unexpected errors instead of losing them."""
        try:
            await self._process_speech(audio_data)
        except Exception as exc:
            logger.exception(
                "[%s] Voice turn failed: %s",
                self.session_context.get("session_id", "?"),
                exc,
            )

    async def _send_transcript(self, role: str, text: str):
        """Best-effort notification of a transcript line to the client."""
        import json  # local import, as in the original code of this class

        payload = json.dumps({"type": "transcript", "text": text, "role": role})
        try:
            await self.websocket.send_text(payload)
        except Exception as exc:
            # A failed notification must not abort the turn.
            logger.debug("Could not send %s transcript: %r", role, exc)

    async def _process_speech(self, audio_data: bytes):
        """Transcribe speech, generate LLM response, synthesize and send TTS."""
        session_id = self.session_context.get("session_id", "?")

        # 1. STT
        text = await self._transcribe(audio_data)
        user_text = text.strip() if text else ""
        if not user_text:
            logger.debug("[%s] STT returned empty text, skipping", session_id)
            return

        logger.info("[%s] User said: %s", session_id, user_text)
        await self._send_transcript("user", user_text)

        # 2. LLM
        self._conversation.append({"role": "user", "content": user_text})
        response_text = await self._llm_generate()
        if not response_text:
            logger.warning("[%s] LLM returned empty response", session_id)
            return

        logger.info("[%s] Agent says: %s", session_id, response_text)
        self._conversation.append({"role": "assistant", "content": response_text})
        await self._send_transcript("assistant", response_text)

        # 3. TTS → send audio back
        await self._synthesize_and_send(response_text)

    async def _transcribe(self, audio_data: bytes) -> str:
        """Transcribe audio using the STT service; returns "" on any failure."""
        if self.stt is None or self.stt._model is None:
            logger.warning("STT not available")
            return ""
        try:
            return await self.stt.transcribe(audio_data)
        except Exception as exc:
            logger.error("STT error: %s", exc)
            return ""

    async def _llm_generate(self) -> str:
        """Generate a response using Anthropic Claude.

        Returns a fixed apology string when no API key is configured or the
        request fails, so the caller always has something to speak.
        """
        if not settings.anthropic_api_key:
            logger.warning("Anthropic API key not set, returning default response")
            return "抱歉,语音助手暂时无法连接到AI服务。"

        try:
            if self._llm_client is None:
                # Reuse one client for the whole session instead of building
                # a new one on every turn.
                self._llm_client = anthropic.AsyncAnthropic(
                    api_key=settings.anthropic_api_key
                )
            response = await self._llm_client.messages.create(
                model=settings.claude_model,
                max_tokens=256,
                messages=self._conversation,
            )
            return response.content[0].text if response.content else ""
        except Exception as exc:
            logger.error("LLM error: %s", exc)
            return "抱歉,AI服务暂时不可用,请稍后再试。"

    async def _synthesize_and_send(self, text: str):
        """Synthesize text to speech and send audio chunks over WebSocket.

        Sending stops early when barge-in or cancel() is signalled, or when
        the WebSocket refuses a chunk.
        """
        self._speaking = True
        self._cancelled_tts = False

        try:
            if self.tts is None or self.tts._pipeline is None:
                logger.warning("TTS not available, skipping audio response")
                return

            # Synthesis is CPU-bound: run it in the default thread pool so
            # the event loop (and the receive loop) stays responsive.
            loop = asyncio.get_running_loop()
            audio_bytes = await loop.run_in_executor(None, self._tts_sync, text)

            if not audio_bytes or self._cancelled_tts:
                return

            for offset in range(0, len(audio_bytes), _WS_AUDIO_CHUNK):
                if self._cancelled_tts or self._cancelled:
                    break
                try:
                    await self.websocket.send_bytes(
                        audio_bytes[offset:offset + _WS_AUDIO_CHUNK]
                    )
                except Exception as exc:
                    logger.debug("Audio send stopped: %r", exc)
                    break
                # Small yield to not starve the event loop
                await asyncio.sleep(0.01)

        except Exception as exc:
            logger.error("TTS/send error: %s", exc)
        finally:
            self._speaking = False

    def _tts_sync(self, text: str) -> bytes:
        """Synchronous TTS synthesis (runs in thread pool).

        Returns 16-bit mono PCM at ``_SAMPLE_RATE``, or ``b""`` on failure.
        """
        try:
            segments = [
                audio for _, _, audio in self.tts._pipeline(text, voice=self.tts.voice)
            ]
            return self._float_audio_to_pcm16(
                segments, self._TTS_SOURCE_RATE, _SAMPLE_RATE
            )
        except Exception as exc:
            logger.error("TTS synthesis error: %s", exc)
            return b""

    @staticmethod
    def _float_audio_to_pcm16(segments, source_rate: int, target_rate: int) -> bytes:
        """Join float audio segments, resample, and encode as 16-bit PCM.

        Args:
            segments: Iterable of 1-D float audio segments. Objects exposing
                ``detach()``/``cpu()``/``numpy()`` or just ``numpy()`` (such as
                torch tensors) are converted to NumPy arrays first.
            source_rate: Sample rate of the segments, in Hz.
            target_rate: Sample rate of the returned PCM, in Hz.

        Returns:
            PCM bytes (int16, native byte order); ``b""`` when there is no audio.
        """
        arrays = []
        for segment in segments:
            if hasattr(segment, "detach"):
                segment = segment.detach().cpu().numpy()
            elif hasattr(segment, "numpy"):
                segment = segment.numpy()
            arrays.append(np.asarray(segment, dtype=np.float32).reshape(-1))

        if not arrays:
            return b""

        audio = np.concatenate(arrays)
        if audio.size == 0:
            return b""

        if source_rate != target_rate:
            target_len = audio.size * target_rate // source_rate
            if target_len == 0:
                return b""
            # Plain linear interpolation; no low-pass filter is applied
            # before reducing the sample rate.
            positions = np.linspace(0, audio.size - 1, target_len)
            audio = np.interp(positions, np.arange(audio.size), audio)

        # Scale to the int16 range and clip so values at or beyond ±1.0
        # saturate instead of wrapping around.
        return (audio * 32768).clip(-32768, 32767).astype(np.int16).tobytes()
|
||||||
|
|
||||||
|
|
||||||
|
async def create_voice_pipeline(
    websocket: WebSocket,
    session_context: dict,
    *,
    stt=None,
    tts=None,
    vad=None,
) -> VoicePipelineTask:
    """Build the voice pipeline task bound to one WebSocket connection.

    Nothing is started here; the returned object is only constructed. The
    function performs no awaits itself and is a coroutine purely by
    signature.

    Args:
        websocket: FastAPI WebSocket the pipeline reads audio from and writes to.
        session_context: Session metadata dict, passed through unchanged.
        stt: Pre-initialized STT service, or None.
        tts: Pre-initialized TTS service, or None.
        vad: Pre-initialized VAD service, or None.

    Returns:
        A VoicePipelineTask wrapping the given connection and services.
    """
    services = {"stt": stt, "tts": tts, "vad": vad}
    return VoicePipelineTask(websocket, session_context, **services)
|
||||||
|
|
||||||
# Build the pipeline: input -> VAD -> STT -> LLM -> TTS -> output
|
|
||||||
pipeline = Pipeline([
|
|
||||||
transport.input(),
|
|
||||||
vad,
|
|
||||||
stt,
|
|
||||||
llm,
|
|
||||||
tts,
|
|
||||||
transport.output(),
|
|
||||||
])
|
|
||||||
|
|
||||||
return PipelineTask(pipeline, allow_interruptions=True)
|
|
||||||
|
|
|
||||||
|
|
@ -2,34 +2,48 @@
|
||||||
Kokoro-82M TTS service configuration.
|
Kokoro-82M TTS service configuration.
|
||||||
|
|
||||||
Model: Kokoro-82M (Chinese + English bilingual)
|
Model: Kokoro-82M (Chinese + English bilingual)
|
||||||
Voice: zh_female_1
|
Voice: zf_xiaoxiao (Chinese female)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def _patch_misaki_compat():
    """Alias ``misaki.en.MutableToken`` to ``misaki.en.MToken`` when needed.

    The alias is only added when ``misaki.en`` has an ``MToken`` attribute
    and no ``MutableToken`` attribute. If ``misaki.en`` cannot be imported
    the function does nothing.
    """
    try:
        import misaki.en as misaki_en

        if hasattr(misaki_en, 'MutableToken'):
            # Old name already available — nothing to patch.
            return
        if not hasattr(misaki_en, 'MToken'):
            # New name missing as well — nothing to alias to.
            return
        misaki_en.MutableToken = misaki_en.MToken
    except ImportError:
        return
|
||||||
|
|
||||||
|
|
||||||
class KokoroTTSService:
    """Text-to-Speech service using Kokoro-82M."""

    def __init__(self, model: str = "kokoro-82m", voice: str = "zf_xiaoxiao"):
        """Remember the model and voice names; no model is loaded yet.

        Args:
            model: Model name, kept for reference/logging.
            voice: Voice identifier passed to the pipeline on every call.
        """
        self.model_name = model
        self.voice = voice
        # Set by initialize() (or assigned directly by the application).
        self._pipeline = None

    async def initialize(self):
        """Load the Kokoro TTS model."""
        # The compatibility patch must run before kokoro is imported.
        _patch_misaki_compat()
        from kokoro import KPipeline

        self._pipeline = KPipeline(lang_code='z')  # Chinese

    async def synthesize(self, text: str) -> bytes:
        """Convert text to speech audio (float samples → 16-bit PCM).

        The audio is returned at the pipeline's own sample rate; no
        resampling is done here.

        Args:
            text: Text to speak.

        Returns:
            Mono int16 PCM bytes, or ``b''`` when the pipeline yields nothing.

        Raises:
            RuntimeError: If no pipeline has been loaded yet.
        """
        if self._pipeline is None:
            raise RuntimeError(
                "KokoroTTSService pipeline is not loaded; call initialize() first"
            )

        samples = []
        for _, _, audio in self._pipeline(text, voice=self.voice):
            if hasattr(audio, 'detach'):
                # Tensor-like: drop autograd tracking and move to host memory
                # before converting, otherwise .numpy() can raise.
                audio = audio.detach().cpu().numpy()
            elif hasattr(audio, 'numpy'):
                audio = audio.numpy()
            samples.append(audio)

        if not samples:
            return b''

        audio_np = np.concatenate(samples)
        # Clip after scaling so values at or beyond ±1.0 saturate instead of
        # wrapping around in int16.
        return (audio_np * 32768).clip(-32768, 32767).astype(np.int16).tobytes()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue