From afc1ae6fbea2eea0e72d210706024bf5150e760e Mon Sep 17 00:00:00 2001 From: hailin Date: Mon, 9 Mar 2026 06:56:31 -0700 Subject: [PATCH] feat(voice): randomly pick thinking sound from all 7 built-in clips per session Co-Authored-By: Claude Sonnet 4.6 --- packages/services/voice-agent/src/agent.py | 23 +++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py index 60bcfe6..0f03da0 100644 --- a/packages/services/voice-agent/src/agent.py +++ b/packages/services/voice-agent/src/agent.py @@ -387,16 +387,21 @@ async def entrypoint(ctx: JobContext) -> None: _cleanup_ref.append(_on_room_disconnect) # --- Thinking state audio feedback --- - # BackgroundAudioPlayer listens for AgentStateChangedEvent from the - # session. When state transitions to "thinking" (STT done, waiting for - # LLM response), it plays the built-in keyboard typing sound through - # the LiveKit audio track. The sound stops automatically when the agent - # enters "speaking" state (TTS begins). This gives the user audible - # feedback that the AI is processing their request. - # Available built-in clips: KEYBOARD_TYPING, KEYBOARD_TYPING2, - # OFFICE_AMBIENCE, CITY_AMBIENCE, FOREST_AMBIENCE, CROWDED_ROOM, HOLD_MUSIC + # Randomly pick one of all available built-in clips each session. + _all_clips = [ + BuiltinAudioClip.KEYBOARD_TYPING, + BuiltinAudioClip.KEYBOARD_TYPING2, + BuiltinAudioClip.OFFICE_AMBIENCE, + BuiltinAudioClip.CITY_AMBIENCE, + BuiltinAudioClip.FOREST_AMBIENCE, + BuiltinAudioClip.CROWDED_ROOM, + BuiltinAudioClip.HOLD_MUSIC, + ] + import random as _random + _chosen_clip = _random.choice(_all_clips) + logger.info("Thinking sound this session: %s", _chosen_clip) bg_audio = BackgroundAudioPlayer( - thinking_sound=BuiltinAudioClip.KEYBOARD_TYPING, + thinking_sound=_chosen_clip, ) await bg_audio.start(room=ctx.room, agent_session=session)