"""Twilio voice webhook and Media Streams WebSocket routes."""
import asyncio
import base64
import json
import logging
import uuid

from fastapi import APIRouter, Request, WebSocket, WebSocketDisconnect
from fastapi.responses import Response

from ..pipeline.base_pipeline import create_voice_pipeline
from ..pipeline.twilio_transport import TwilioPhoneTransport
router = APIRouter()
|
|
|
|
|
|
@router.post("/voice/incoming")
|
|
async def twilio_incoming_call(request: Request):
|
|
"""Handle incoming Twilio voice calls."""
|
|
# Return TwiML to connect to Media Streams
|
|
twiml = """<?xml version="1.0" encoding="UTF-8"?>
|
|
<Response>
|
|
<Connect>
|
|
<Stream url="wss://{host}/api/v1/twilio/media-stream" />
|
|
</Connect>
|
|
</Response>"""
|
|
return Response(content=twiml, media_type="application/xml")


@router.websocket("/media-stream")
|
|
async def twilio_media_stream(websocket: WebSocket):
|
|
"""
|
|
Twilio Media Streams WebSocket endpoint.
|
|
|
|
Handles the bidirectional audio stream between Twilio and the Pipecat pipeline.
|
|
Twilio sends JSON messages with events: connected, start, media, stop.
|
|
Audio is mu-law 8kHz base64-encoded.
|
|
"""
|
|
await websocket.accept()
|
|
|
|
app = websocket.app
|
|
voice_session_id = f"tw_{uuid.uuid4().hex[:12]}"
|
|
transport = None
|
|
pipeline_task = None
|
|
|
|
try:
|
|
while True:
|
|
data = await websocket.receive_text()
|
|
message = json.loads(data)
|
|
event = message.get("event")
|
|
|
|
if event == "connected":
|
|
# Twilio has connected the Media Stream WebSocket
|
|
pass
|
|
|
|
elif event == "start":
|
|
# Stream is starting -- create transport and pipeline
|
|
stream_sid = message.get("streamSid", "")
|
|
|
|
# Create the Twilio transport
|
|
transport = TwilioPhoneTransport(websocket, voice_session_id)
|
|
transport._stream_sid = stream_sid
|
|
|
|
# Build session context for the pipeline
|
|
session_context = {
|
|
"session_id": voice_session_id,
|
|
"stream_sid": stream_sid,
|
|
"call_sid": message.get("start", {}).get("callSid", ""),
|
|
}
|
|
|
|
# Create the Pipecat voice pipeline
|
|
task = await create_voice_pipeline(
|
|
transport,
|
|
session_context,
|
|
stt=getattr(app.state, "stt", None),
|
|
tts=getattr(app.state, "tts", None),
|
|
vad=getattr(app.state, "vad", None),
|
|
)
|
|
|
|
# Run pipeline in background
|
|
pipeline_task = asyncio.create_task(task.run())
|
|
|
|
elif event == "media":
|
|
# Incoming audio from the phone call
|
|
if transport is not None:
|
|
# Decode base64 mu-law audio payload
|
|
media_payload = message.get("media", {}).get("payload", "")
|
|
mulaw_bytes = base64.b64decode(media_payload)
|
|
|
|
# Convert mu-law 8kHz to PCM 16kHz via the input processor
|
|
input_processor = transport.input()
|
|
pcm_audio = input_processor.process_audio(mulaw_bytes)
|
|
|
|
# Feed PCM audio into the pipeline (via transport websocket)
|
|
# The pipeline reads from the transport; here we make the
|
|
# decoded audio available for downstream processing.
|
|
# In a full Pipecat integration the transport handles this
|
|
# internally. For now, we send the converted audio back
|
|
# through the output processor to complete the loop.
|
|
output_processor = transport.output()
|
|
await output_processor.send_audio(pcm_audio)
|
|
|
|
elif event == "stop":
|
|
# Stream is ending -- clean up
|
|
break
|
|
|
|
except WebSocketDisconnect:
|
|
pass
|
|
except json.JSONDecodeError:
|
|
pass
|
|
except Exception:
|
|
pass
|
|
finally:
|
|
# Cancel pipeline task if still running
|
|
if pipeline_task is not None and not pipeline_task.done():
|
|
pipeline_task.cancel()
|
|
try:
|
|
await pipeline_task
|
|
except (asyncio.CancelledError, Exception):
|
|
pass
|
|
|
|
# Ensure websocket is closed
|
|
try:
|
|
await websocket.close()
|
|
except Exception:
|
|
pass
|