it0/packages/services/voice-service/src/pipeline/twilio_transport.py

135 lines
4.0 KiB
Python

"""
Twilio Media Streams audio transport.
Used when Pipecat initiates outbound calls via Twilio.
Audio codec: mu-law 8kHz (phone standard) <-> PCM 16kHz (Whisper input)
"""
import asyncio
import audioop
import base64
import functools
import json

from twilio.rest import Client as TwilioClient

from ..config.settings import settings
class TwilioPhoneTransport:
    """Twilio Media Streams transport for phone calls.

    Bundles the WebSocket used for the media stream with the voice session
    it belongs to, and hands out the input/output audio processors that
    convert between the phone codec and the pipeline format.
    """

    def __init__(self, websocket, voice_session_id: str):
        """Store the media-stream WebSocket and the owning voice session id.

        Args:
            websocket: Connection object used to talk to Twilio; the output
                processor awaits its ``send_text`` method.
            voice_session_id: Identifier of the voice session this transport
                serves.
        """
        self.websocket = websocket
        self.voice_session_id = voice_session_id
        # Not assigned anywhere in this class; presumably set by the
        # media-stream handler once Twilio announces the stream -- TODO confirm.
        self._stream_sid = None

    async def initiate_call(self, phone_number: str):
        """Initiate an outbound call via the Twilio REST API.

        The call is created with inline TwiML that tells Twilio to connect a
        Media Stream to this service's WebSocket endpoint, allowing
        bidirectional audio streaming.

        Args:
            phone_number: Destination number, passed to Twilio as ``to``.

        Returns:
            The ``sid`` attribute of the call object returned by Twilio.
        """
        client = TwilioClient(
            settings.twilio_account_sid,
            settings.twilio_auth_token,
        )
        # TwiML instructs Twilio to open a Media Stream back to our server
        twiml = (
            '<?xml version="1.0" encoding="UTF-8"?>'
            "<Response>"
            "<Connect>"
            f'<Stream url="wss://{settings.host}:{settings.port}'
            f'/api/v1/twilio/media-stream" />'
            "</Connect>"
            "</Response>"
        )
        create_call = functools.partial(
            client.calls.create,
            to=phone_number,
            from_=settings.twilio_phone_number,
            twiml=twiml,
        )
        # calls.create is a plain (non-awaitable) call that performs the REST
        # request; run it in the default executor so the event loop is not
        # stalled while the request is in flight.
        loop = asyncio.get_running_loop()
        call = await loop.run_in_executor(None, create_call)
        return call.sid

    def input(self) -> "TwilioInputProcessor":
        """Return the processor for audio coming from the phone.

        The returned TwilioInputProcessor decodes mu-law 8kHz audio and
        converts it to PCM 16kHz 16bit mono for the pipeline.
        """
        return TwilioInputProcessor(self)

    def output(self) -> "TwilioOutputProcessor":
        """Return the processor for audio going to the phone.

        The returned TwilioOutputProcessor converts PCM 16kHz audio from the
        pipeline to mu-law 8kHz and can send it over the media stream.
        """
        return TwilioOutputProcessor(self)
class TwilioInputProcessor:
    """Converts mu-law 8kHz audio from Twilio to PCM 16kHz for the pipeline.

    The processor is stateful: the resampler state is carried from one chunk
    to the next so that a stream split into many small chunks is converted
    exactly as if it had been converted in one piece. Use one instance per
    audio stream.
    """

    def __init__(self, transport: "TwilioPhoneTransport"):
        """Remember the owning transport and start with a fresh resampler.

        Args:
            transport: The transport this processor belongs to.
        """
        self.transport = transport
        # Opaque audioop.ratecv state; None means "start of stream".
        self._ratecv_state = None

    def process_audio(self, mulaw_bytes: bytes) -> bytes:
        """Convert one chunk of mu-law 8kHz audio to PCM 16kHz 16bit mono.

        Steps:
        1. Decode mu-law to PCM 16bit (8kHz)
        2. Resample from 8kHz to 16kHz (2x upsampling)

        Args:
            mulaw_bytes: Mu-law encoded audio, one byte per sample.

        Returns:
            16-bit linear PCM at 16kHz. The very first chunk of a stream
            yields one sample less than twice its input, because there is no
            earlier sample to interpolate from.
        """
        # mu-law to linear PCM 16bit at 8kHz
        pcm_8khz = audioop.ulaw2lin(mulaw_bytes, 2)
        # Resample from 8kHz to 16kHz. Feeding the previous state back in
        # keeps the interpolation continuous across chunk boundaries; passing
        # None every time would restart the resampler and drop a sample per
        # chunk.
        pcm_16khz, self._ratecv_state = audioop.ratecv(
            pcm_8khz, 2, 1, 8000, 16000, self._ratecv_state
        )
        return pcm_16khz
class TwilioOutputProcessor:
    """Converts PCM 16kHz audio from the pipeline to mu-law 8kHz for Twilio.

    The processor is stateful: the resampler state is carried from one chunk
    to the next so that chunk boundaries do not disturb the 2:1 decimation.
    Use one instance per audio stream.
    """

    def __init__(self, transport: "TwilioPhoneTransport"):
        """Remember the owning transport and start with a fresh resampler.

        Args:
            transport: The transport whose ``websocket`` and ``_stream_sid``
                are used when sending audio.
        """
        self.transport = transport
        # Opaque audioop.ratecv state; None means "start of stream".
        self._ratecv_state = None

    def process_audio(self, pcm_bytes: bytes) -> bytes:
        """Convert one chunk of PCM 16kHz 16bit mono audio to mu-law 8kHz.

        Steps:
        1. Resample from 16kHz to 8kHz (2x downsampling)
        2. Encode linear PCM to mu-law

        Args:
            pcm_bytes: 16-bit linear PCM at 16kHz; must contain a whole
                number of 2-byte samples.

        Returns:
            Mu-law encoded audio, one byte per output sample.
        """
        # Resample from 16kHz to 8kHz. Feeding the previous state back in
        # keeps the decimation phase correct when a chunk holds an odd number
        # of samples; passing None every time would emit extra samples.
        pcm_8khz, self._ratecv_state = audioop.ratecv(
            pcm_bytes, 2, 1, 16000, 8000, self._ratecv_state
        )
        # Linear PCM to mu-law
        return audioop.lin2ulaw(pcm_8khz, 2)

    async def send_audio(self, pcm_bytes: bytes):
        """Convert PCM audio to mu-law and send it via the Twilio Media Stream.

        The audio is always run through the resampler so its state stays in
        step with the stream. Nothing is sent when the conversion produced no
        samples, or when the transport has no stream SID yet, since a media
        message without a stream SID cannot be attributed to a stream.

        Args:
            pcm_bytes: 16-bit linear PCM at 16kHz.
        """
        mulaw_bytes = self.process_audio(pcm_bytes)
        stream_sid = self.transport._stream_sid
        if stream_sid is None or not mulaw_bytes:
            return
        # Twilio expects base64-encoded mu-law audio in a JSON media message
        payload = json.dumps({
            "event": "media",
            "streamSid": stream_sid,
            "media": {
                "payload": base64.b64encode(mulaw_bytes).decode("ascii"),
            },
        })
        await self.transport.websocket.send_text(payload)