diff --git a/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart b/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart index 4fe75f7..7345fb1 100644 --- a/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart +++ b/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart @@ -43,7 +43,13 @@ class _AgentCallPageState extends ConsumerState final List _waveHeights = List.generate(20, (_) => 0.3); Timer? _waveTimer; - // Agent state (from lk.agent.state participant attribute) + // Agent state — read from the "lk.agent.state" participant attribute that + // LiveKit AgentSession publishes automatically. Values: + // "initializing" → agent starting up + // "listening" → waiting for user speech + // "thinking" → STT done, LLM processing (show thinking animation) + // "speaking" → TTS playing response + // See ParticipantAttributesChanged listener in _acceptCall(). String _agentState = ''; late AnimationController _thinkingController; @@ -149,6 +155,12 @@ class _AgentCallPageState extends ConsumerState _onCallEnded(); } }) + // Agent state monitoring: LiveKit AgentSession on the server publishes + // "lk.agent.state" as a participant attribute. When it changes, we + // update _agentState to drive UI changes: + // thinking → pulsing dots + "思考中..." + orange avatar glow + // speaking → waveform animation + "语音通话中" + // listening → default call UI ..on((event) { final state = event.attributes['lk.agent.state']; if (state != null && state != _agentState && mounted) { diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py index b1df99c..3ac2f73 100644 --- a/packages/services/voice-agent/src/agent.py +++ b/packages/services/voice-agent/src/agent.py @@ -4,6 +4,27 @@ IT0 Voice Agent — LiveKit Agents v1.x entry point. Uses the official AgentServer + @server.rtc_session() pattern. Pipeline: VAD → STT → LLM (via agent-service) → TTS. +Agent State & Thinking Indicator +--------------------------------- +LiveKit AgentSession (v1.4.3+) automatically publishes the participant +attribute ``lk.agent.state`` with these values: + + initializing → listening → thinking → speaking → listening → ... + +The state transition happens inside the framework: + - RoomIO._on_agent_state_changed() calls + local_participant.set_attributes({"lk.agent.state": state}) + +On the Flutter side (livekit_client v2.6.4), the app listens for +ParticipantAttributesChanged events and reads the ``lk.agent.state`` +attribute to drive UI changes: + - "thinking" → pulsing dots animation + "思考中..." text + orange avatar + - "speaking" → waveform animation driven by audio level + - "listening" → default call UI + +BackgroundAudioPlayer is configured below to play a keyboard typing +sound effect during the "thinking" state as auditory feedback. + Usage: python -m src.agent start """ @@ -310,7 +331,15 @@ async def entrypoint(ctx: JobContext) -> None: room_output_options=room_io.RoomOutputOptions(), ) - # Play keyboard typing sound while agent is thinking (waiting for LLM) + # --- Thinking state audio feedback --- + # BackgroundAudioPlayer listens for AgentStateChangedEvent from the + # session. When state transitions to "thinking" (STT done, waiting for + # LLM response), it plays the built-in keyboard typing sound through + # the LiveKit audio track. The sound stops automatically when the agent + # enters "speaking" state (TTS begins). This gives the user audible + # feedback that the AI is processing their request. + # Available built-in clips: KEYBOARD_TYPING, KEYBOARD_TYPING2, + # OFFICE_AMBIENCE, CITY_AMBIENCE, FOREST_AMBIENCE, CROWDED_ROOM, HOLD_MUSIC bg_audio = BackgroundAudioPlayer( thinking_sound=BuiltinAudioClip.KEYBOARD_TYPING, )