feat(it0_app): add WhatsApp-style voice message with async agent interrupt

New VoiceMicButton widget (press-and-hold to record, release to send): - Records audio to a temp .m4a file via the `record` package - Slide-up gesture cancels recording without sending - Pulsing red mic icon + "松开发送/松开取消" feedback during recording New flow for voice messages: 1. Temp "🎤 识别中..." bubble shown immediately 2. Audio uploaded to POST /api/v1/agent/sessions/:id/voice-message (multipart/form-data; backend runs Whisper STT) 3. Placeholder replaced with real transcript 4. WS stream subscribed via new subscribeExistingTask() to receive agent's streaming response — same pipeline as text chat Voice messages act as async interrupts: if the agent is mid-task the backend hard-cancels it before processing the new voice command, so whoever presses the mic button always takes priority. Files changed: chat_remote_datasource.dart — sendVoiceMessage() multipart upload chat_repository.dart — subscribeExistingTask() interface method chat_repository_impl.dart — implement subscribeExistingTask(); fix sendVoiceMessage() stub chat_providers.dart — ChatNotifier.sendVoiceMessage() voice_mic_button.dart — NEW press-and-hold recording widget chat_page.dart — mic button added to input area Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-06 03:20:41 -08:00 · 2026-03-06 03:20:41 -08:00 · 55b983a950
parent a2af76bcd7
commit 55b983a950
6 changed files with 333 additions and 25 deletions
--- a/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart
+++ b/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart
@ -149,4 +149,27 @@ class ChatRemoteDatasource {
      },
    );
  }
  /// Posts the recording at [audioPath] to the `voice-message` endpoint of
  /// the session identified by [sessionId].
  ///
  /// The request body is multipart form data with two fields: `audio` (the
  /// file, named after the last `/`-separated segment of [audioPath]) and
  /// `language` (defaults to `'zh'`).
  ///
  /// Per the endpoint contract the backend performs STT, interrupts any
  /// running task if needed, and starts a new agent task with the
  /// transcript. The decoded response body is returned as-is and is expected
  /// to look like `{ sessionId, taskId, transcript }`.
  Future<Map<String, dynamic>> sendVoiceMessage({
    required String sessionId,
    required String audioPath,
    String language = 'zh',
  }) async {
    // Use the file's own name as the multipart filename.
    final fileName = audioPath.split('/').last;
    final audioPart = await MultipartFile.fromFile(
      audioPath,
      filename: fileName,
    );

    final payload = FormData.fromMap({
      'audio': audioPart,
      'language': language,
    });

    final url = '${ApiEndpoints.sessions}/$sessionId/voice-message';
    final response = await _dio.post(url, data: payload);
    return response.data as Map<String, dynamic>;
  }
 }
--- a/it0_app/lib/features/chat/data/repositories/chat_repository_impl.dart
+++ b/it0_app/lib/features/chat/data/repositories/chat_repository_impl.dart
@ -82,28 +82,22 @@ class ChatRepositoryImpl implements ChatRepository {
    required String sessionId,
    required String audioPath,
  }) async* {
-    // For voice messages, we POST the audio file, then subscribe to WebSocket events
+    // Kept for interface compatibility — ChatNotifier calls sendVoiceMessage
-    // similar to sendMessage. The backend handles STT + agent processing.
+    // on the datasource directly (to get the transcript), then calls
-    final response = await _remoteDatasource.createTask(
+    // subscribeExistingTask. This method is not used.
-      sessionId: sessionId,
+    yield* subscribeExistingTask(sessionId: sessionId, taskId: '');
-      message: '[voice_input]',
+  }
      attachments: [{'filePath': audioPath, 'mediaType': 'audio/wav'}],
    );
-    final returnedSessionId = response['sessionId'] as String? ??
+  @override
-        response['session_id'] as String? ??
+  Stream<StreamEvent> subscribeExistingTask({
-        sessionId;
+    required String sessionId,
-    final taskId = response['taskId'] as String? ?? response['task_id'] as String?;
+    required String taskId,
-
+  }) async* {
-    // Emit the real sessionId and taskId so the notifier can capture them
+    final token = await _getAccessToken();
-    yield SessionInfoEvent(returnedSessionId);
+    await _webSocketClient.connect('/ws/agent', token: token);
    if (taskId != null) yield TaskInfoEvent(taskId);
    final voiceToken = await _getAccessToken();
    await _webSocketClient.connect('/ws/agent', token: voiceToken);
    _webSocketClient.send({
      'event': 'subscribe_session',
-      'data': {'sessionId': returnedSessionId, 'taskId': taskId},
+      'data': {'sessionId': sessionId, 'taskId': taskId},
    });
    yield* _webSocketClient.messages.transform(
--- a/it0_app/lib/features/chat/domain/repositories/chat_repository.dart
+++ b/it0_app/lib/features/chat/domain/repositories/chat_repository.dart
@ -33,6 +33,14 @@ abstract class ChatRepository {
    required String message,
  });
  /// Subscribes to an already-running agent task's WS stream without
  /// re-submitting a prompt.
  ///
  /// Used after a voice message upload: the upload itself starts the task,
  /// so the client only needs to listen for the agent's streamed response.
  ///
  /// [sessionId] identifies the session the task belongs to and [taskId]
  /// the task whose events should be delivered. Returns a stream of
  /// [StreamEvent]s for that task.
  Stream<StreamEvent> subscribeExistingTask({
    required String sessionId,
    required String taskId,
  });
  /// Confirms a standing order draft proposed by the agent.
  Future<void> confirmStandingOrder(
    String sessionId,
--- a/it0_app/lib/features/chat/presentation/pages/chat_page.dart
+++ b/it0_app/lib/features/chat/presentation/pages/chat_page.dart
@ -12,6 +12,7 @@ import '../widgets/stream_text_widget.dart';
 import '../widgets/approval_action_card.dart';
 import '../widgets/conversation_drawer.dart';
 import '../../../agent_call/presentation/pages/agent_call_page.dart';
 import '../widgets/voice_mic_button.dart';
 // ---------------------------------------------------------------------------
 // Chat page – Timeline workflow style (inspired by Claude Code VSCode)
@ -698,12 +699,22 @@ class _ChatPageState extends ConsumerState<ChatPage> {
                  ],
                )
              else
-                Padding(
+                Row(
-                  padding: const EdgeInsets.only(right: 4),
+                  mainAxisSize: MainAxisSize.min,
-                  child: IconButton(
+                  children: [
-                    icon: const Icon(Icons.send, size: 20),
+                    VoiceMicButton(
-                    onPressed: isAwaitingApproval ? null : _send,
+                      disabled: isAwaitingApproval,
-                  ),
+                      onAudioReady: (path) =>
                          ref.read(chatProvider.notifier).sendVoiceMessage(path),
                    ),
                    Padding(
                      padding: const EdgeInsets.only(right: 4),
                      child: IconButton(
                        icon: const Icon(Icons.send, size: 20),
                        onPressed: isAwaitingApproval ? null : _send,
                      ),
                    ),
                  ],
                ),
            ],
          ),
--- a/it0_app/lib/features/chat/presentation/providers/chat_providers.dart
+++ b/it0_app/lib/features/chat/presentation/providers/chat_providers.dart
@ -488,6 +488,91 @@ class ChatNotifier extends StateNotifier<ChatState> {
    }
  }
  /// Sends the recorded audio file at [audioPath] as a voice message.
  ///
  /// Flow:
  ///   1. Cancels the current event subscription (a voice message acts as an
  ///      interrupt) and flushes pending buffers.
  ///   2. Shows a temporary "识别中..." user message bubble and switches the
  ///      agent status to thinking.
  ///   3. Uploads the audio to the backend voice-message endpoint, which is
  ///      expected to return `sessionId`, `taskId` and `transcript`.
  ///   4. Replaces the placeholder with the returned transcript and stores
  ///      the returned session/task ids.
  ///   5. Subscribes to the WS stream for the new task.
  ///
  /// If the upload throws, the placeholder bubble is removed and the state
  /// is put into [AgentStatus.error] with a user-facing message. Nothing is
  /// written to the state if the notifier was disposed while the upload was
  /// in flight.
  Future<void> sendVoiceMessage(String audioPath) async {
    final tempId = '${DateTime.now().microsecondsSinceEpoch}_voice';
    final tempMsg = ChatMessage(
      id: tempId,
      role: MessageRole.user,
      content: '🎤 识别中...',
      timestamp: DateTime.now(),
      type: MessageType.text,
    );

    // Cancel any ongoing subscription (voice message acts as interrupt).
    _eventSubscription?.cancel();
    _eventSubscription = null;
    _flushBuffersSync();

    state = state.copyWith(
      messages: [...state.messages, tempMsg],
      agentStatus: AgentStatus.thinking,
      error: null,
    );

    try {
      final datasource = _ref.read(chatRemoteDatasourceProvider);
      final sessionId = state.sessionId ?? 'new';

      final result = await datasource.sendVoiceMessage(
        sessionId: sessionId,
        audioPath: audioPath,
      );

      // The notifier may have been disposed while the upload was in flight;
      // touching `state` after that is an error.
      if (!mounted) return;

      // NOTE(review): the temp audio file at [audioPath] is not deleted
      // after upload anywhere in the visible code — confirm who owns cleanup.

      final returnedSessionId = result['sessionId'] as String? ?? sessionId;
      final taskId = result['taskId'] as String?;
      final transcript = result['transcript'] as String? ?? '🎤';

      // Replace placeholder with real transcript.
      final updatedMessages = state.messages
          .map((m) => m.id == tempId ? m.copyWith(content: transcript) : m)
          .toList();
      state = state.copyWith(
        messages: updatedMessages,
        sessionId: returnedSessionId,
        taskId: taskId,
      );

      // Subscribe to the WS stream for the running task.
      final repo = _ref.read(chatRepositoryProvider);
      final stream = repo.subscribeExistingTask(
        sessionId: returnedSessionId,
        taskId: taskId ?? '',
      );

      // Another send may have installed a subscription while the upload was
      // awaited. Cancel it before overwriting the field so it is not leaked
      // and only one stream feeds _handleStreamEvent.
      _eventSubscription?.cancel();
      _eventSubscription = stream.listen(
        (event) => _handleStreamEvent(event),
        onError: (error) {
          state = state.copyWith(
            agentStatus: AgentStatus.error,
            error: '语音消息处理失败: $error',
          );
        },
        onDone: () {
          if (state.agentStatus != AgentStatus.error) {
            state = state.copyWith(agentStatus: AgentStatus.idle);
          }
        },
      );
    } catch (e) {
      // Nothing to update once the notifier is gone.
      if (!mounted) return;

      // Remove placeholder on failure.
      state = state.copyWith(
        messages: state.messages.where((m) => m.id != tempId).toList(),
        agentStatus: AgentStatus.error,
        error: '语音识别失败: $e',
      );
    }
  }
  Future<void> cancelCurrentTask() async {
    final taskId = state.taskId;
    if (taskId == null && state.sessionId == null) return;
--- a/it0_app/lib/features/chat/presentation/widgets/voice_mic_button.dart
+++ b/it0_app/lib/features/chat/presentation/widgets/voice_mic_button.dart
@ -0,0 +1,187 @@
 import 'dart:io';
 import 'package:flutter/material.dart';
 import 'package:path_provider/path_provider.dart';
 import 'package:record/record.dart';
 import '../../../../core/theme/app_colors.dart';
 /// Press-and-hold microphone button in the style of WhatsApp voice notes.
 ///
 /// Gesture contract:
 /// • Long-press and hold → audio is recorded into a temp file.
 /// • Let go → recording stops and [onAudioReady] receives the file path.
 /// • Drag upwards while holding → the recording is cancelled, nothing is sent.
 ///
 /// Microphone permission is checked through the `record` package before a
 /// recording starts.
 class VoiceMicButton extends StatefulWidget {
  const VoiceMicButton({
    super.key,
    required this.onAudioReady,
    this.disabled = false,
  });

  /// Invoked with the temp file path once the user releases the button and
  /// the recording is kept.
  final void Function(String audioPath) onAudioReady;

  /// Whether the button ignores presses (e.g. while awaiting approval).
  final bool disabled;

  @override
  State<VoiceMicButton> createState() {
    return _VoiceMicButtonState();
  }
 }
 /// State for [VoiceMicButton].
 ///
 /// Owns the [AudioRecorder], tracks the long-press gesture, and drives the
 /// pulsing mic animation shown while a recording is in progress.
 ///
 /// Starting a recording is asynchronous (permission check, temp directory
 /// lookup, recorder start), so the press can end before the recorder is
 /// running. [_pressActive] records whether the finger is still down; a
 /// recording that finishes starting after the press ended is stopped and
 /// discarded instead of being left running.
 class _VoiceMicButtonState extends State<VoiceMicButton>
    with SingleTickerProviderStateMixin {
  /// Slide-up cancel threshold (pixels above press origin).
  static const double _cancelThreshold = 60.0;

  /// Recordings smaller than this many bytes are treated as empty
  /// (roughly < 0.3s) and dropped.
  static const int _minRecordingBytes = 2048;

  final _recorder = AudioRecorder();

  /// Whether the recorder is running and the recording UI is shown.
  bool _isRecording = false;

  /// Whether the finger is currently far enough above the press origin that
  /// releasing would cancel instead of send.
  bool _cancelled = false;

  /// Whether a long press is currently being held.
  bool _pressActive = false;

  /// Whether [_startRecording] is in flight; prevents overlapping starts.
  bool _isStarting = false;

  /// Whether [_stopRecording] is in flight; prevents a second stop when both
  /// the end and cancel callbacks fire.
  bool _isStopping = false;

  Offset? _pressOrigin;

  late final AnimationController _pulseController;
  late final Animation<double> _pulseAnimation;

  @override
  void initState() {
    super.initState();
    _pulseController = AnimationController(
      vsync: this,
      duration: const Duration(milliseconds: 800),
    );
    _pulseAnimation = Tween<double>(begin: 1.0, end: 1.25).animate(
      CurvedAnimation(parent: _pulseController, curve: Curves.easeInOut),
    );
  }

  @override
  void dispose() {
    _recorder.dispose();
    _pulseController.dispose();
    super.dispose();
  }

  /// Best-effort removal of a recording that will not be sent.
  Future<void> _discardRecording(String? path) async {
    if (path == null) return;
    try {
      final file = File(path);
      if (await file.exists()) {
        await file.delete();
      }
    } on FileSystemException catch (e) {
      debugPrint('VoiceMicButton: could not delete temp recording: $e');
    }
  }

  /// Checks permission and starts recording into a fresh temp `.m4a` file.
  ///
  /// Aborts without recording if the widget was unmounted or the press ended
  /// while the asynchronous setup was still running.
  Future<void> _startRecording() async {
    if (_isStarting || _isRecording) return;
    _isStarting = true;
    try {
      final hasPermission = await _recorder.hasPermission();
      if (!mounted) return;
      if (!hasPermission) {
        ScaffoldMessenger.of(context).showSnackBar(
          const SnackBar(content: Text('需要麦克风权限')),
        );
        return;
      }
      // The press may have ended while the permission check was pending.
      if (!_pressActive) return;

      final dir = await getTemporaryDirectory();
      if (!mounted || !_pressActive) return;

      final path =
          '${dir.path}/voice_${DateTime.now().millisecondsSinceEpoch}.m4a';
      await _recorder.start(
        const RecordConfig(encoder: AudioEncoder.aacLc, sampleRate: 16000),
        path: path,
      );

      // dispose() has already released the recorder in this case.
      if (!mounted) return;

      if (!_pressActive) {
        // Released while the recorder was starting: no gesture is left to
        // stop it, so stop now and throw the file away.
        await _discardRecording(await _recorder.stop());
        return;
      }

      setState(() {
        _isRecording = true;
        _cancelled = false;
      });
      _pulseController.repeat(reverse: true);
    } on Exception catch (e) {
      debugPrint('VoiceMicButton: failed to start recording: $e');
    } finally {
      _isStarting = false;
    }
  }

  /// Stops the recorder and, unless [cancel] is true or the recording is
  /// too small, hands the file path to [VoiceMicButton.onAudioReady].
  ///
  /// Cancelled and too-small recordings are deleted from disk.
  Future<void> _stopRecording({required bool cancel}) async {
    if (!_isRecording || _isStopping) return;
    _isStopping = true;

    _pulseController.stop();
    _pulseController.reset();

    String? path;
    try {
      path = await _recorder.stop();
    } on Exception catch (e) {
      // Fall through with a null path so the UI still leaves recording mode.
      debugPrint('VoiceMicButton: failed to stop recording: $e');
    } finally {
      _isStopping = false;
    }

    if (!mounted) {
      _isRecording = false;
      await _discardRecording(path);
      return;
    }
    setState(() => _isRecording = false);

    if (path == null) return;
    if (cancel) {
      await _discardRecording(path);
      return;
    }

    // Ignore empty recordings (< ~0.3s).
    try {
      final size = await File(path).length();
      if (size < _minRecordingBytes) {
        await _discardRecording(path);
        return;
      }
    } on FileSystemException {
      return;
    }

    if (!mounted) return;
    widget.onAudioReady(path);
  }

  void _onLongPressStart(LongPressStartDetails details) {
    if (widget.disabled) return;
    _pressOrigin = details.globalPosition;
    _pressActive = true;
    _startRecording();
  }

  void _onLongPressMoveUpdate(LongPressMoveUpdateDetails details) {
    final origin = _pressOrigin;
    if (origin == null || !_isRecording) return;
    // Positive when the finger has moved up from where the press began.
    final dy = origin.dy - details.globalPosition.dy;
    setState(() => _cancelled = dy > _cancelThreshold);
  }

  void _onLongPressEnd(LongPressEndDetails details) {
    _pressActive = false;
    _stopRecording(cancel: _cancelled);
    _pressOrigin = null;
  }

  void _onLongPressCancel() {
    _pressActive = false;
    _stopRecording(cancel: true);
    _pressOrigin = null;
  }

  @override
  Widget build(BuildContext context) {
    if (_isRecording) {
      return _buildRecordingButton();
    }

    return GestureDetector(
      onLongPressStart: _onLongPressStart,
      onLongPressMoveUpdate: _onLongPressMoveUpdate,
      onLongPressEnd: _onLongPressEnd,
      onLongPressCancel: _onLongPressCancel,
      child: IconButton(
        icon: Icon(
          Icons.mic_none,
          size: 22,
          color: widget.disabled ? AppColors.textMuted : null,
        ),
        tooltip: '按住录音',
        // A no-op handler keeps the button visually enabled; recording is
        // driven by the long-press callbacks above.
        onPressed: widget.disabled ? null : () {},
      ),
    );
  }

  /// Builds the pulsing mic icon and release hint shown while recording.
  Widget _buildRecordingButton() {
    final isCancelling = _cancelled;
    final tint = isCancelling ? AppColors.textMuted : AppColors.error;
    return GestureDetector(
      onLongPressMoveUpdate: _onLongPressMoveUpdate,
      onLongPressEnd: _onLongPressEnd,
      onLongPressCancel: _onLongPressCancel,
      child: Padding(
        padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 4),
        child: Row(
          mainAxisSize: MainAxisSize.min,
          children: [
            ScaleTransition(
              scale: _pulseAnimation,
              child: Icon(
                Icons.mic,
                size: 22,
                color: tint,
              ),
            ),
            const SizedBox(width: 4),
            Text(
              isCancelling ? '松开取消' : '松开发送',
              style: TextStyle(
                fontSize: 12,
                color: tint,
              ),
            ),
          ],
        ),
      ),
    );
  }
 }