feat(it0_app): add WhatsApp-style voice message with async agent interrupt
New VoiceMicButton widget (press-and-hold to record, release to send):
- Records audio to a temp .m4a file via the `record` package
- Slide-up gesture cancels recording without sending
- Pulsing red mic icon + "松开发送/松开取消" feedback during recording
New flow for voice messages:
1. Temp "🎤 识别中..." bubble shown immediately
2. Audio uploaded to POST /api/v1/agent/sessions/:id/voice-message
(multipart/form-data; backend runs Whisper STT)
3. Placeholder replaced with real transcript
4. WS stream subscribed via new subscribeExistingTask() to receive
agent's streaming response — same pipeline as text chat
Voice messages act as async interrupts: if the agent is mid-task the
backend hard-cancels it before processing the new voice command,
so whoever presses the mic button always takes priority.
Files changed:
chat_remote_datasource.dart — sendVoiceMessage() multipart upload
chat_repository.dart — subscribeExistingTask() interface method
chat_repository_impl.dart — implement subscribeExistingTask(); fix
sendVoiceMessage() stub
chat_providers.dart — ChatNotifier.sendVoiceMessage()
voice_mic_button.dart — NEW press-and-hold recording widget
chat_page.dart — mic button added to input area
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a2af76bcd7
commit
55b983a950
|
|
@ -149,4 +149,27 @@ class ChatRemoteDatasource {
|
|||
},
|
||||
);
|
||||
}
|
||||
|
||||
/// Uploads a recorded audio file for server-side speech-to-text.
///
/// The backend transcribes the audio (Whisper), interrupts any task that
/// is currently running for the session if needed, and starts a new agent
/// task from the transcript.
///
/// Returns the decoded JSON body: `{ sessionId, taskId, transcript }`.
Future<Map<String, dynamic>> sendVoiceMessage({
  required String sessionId,
  required String audioPath,
  String language = 'zh',
}) async {
  // Use the last path segment as the upload filename.
  final fileName = audioPath.split('/').last;
  final audioPart =
      await MultipartFile.fromFile(audioPath, filename: fileName);

  final response = await _dio.post(
    '${ApiEndpoints.sessions}/$sessionId/voice-message',
    data: FormData.fromMap({
      'audio': audioPart,
      'language': language,
    }),
  );
  return response.data as Map<String, dynamic>;
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -82,28 +82,22 @@ class ChatRepositoryImpl implements ChatRepository {
|
|||
required String sessionId,
|
||||
required String audioPath,
|
||||
}) async* {
|
||||
// For voice messages, we POST the audio file, then subscribe to WebSocket events
|
||||
// similar to sendMessage. The backend handles STT + agent processing.
|
||||
final response = await _remoteDatasource.createTask(
|
||||
sessionId: sessionId,
|
||||
message: '[voice_input]',
|
||||
attachments: [{'filePath': audioPath, 'mediaType': 'audio/wav'}],
|
||||
);
|
||||
// Kept for interface compatibility — ChatNotifier calls sendVoiceMessage
|
||||
// on the datasource directly (to get the transcript), then calls
|
||||
// subscribeExistingTask. This method is not used.
|
||||
yield* subscribeExistingTask(sessionId: sessionId, taskId: '');
|
||||
}
|
||||
|
||||
final returnedSessionId = response['sessionId'] as String? ??
|
||||
response['session_id'] as String? ??
|
||||
sessionId;
|
||||
final taskId = response['taskId'] as String? ?? response['task_id'] as String?;
|
||||
|
||||
// Emit the real sessionId and taskId so the notifier can capture them
|
||||
yield SessionInfoEvent(returnedSessionId);
|
||||
if (taskId != null) yield TaskInfoEvent(taskId);
|
||||
|
||||
final voiceToken = await _getAccessToken();
|
||||
await _webSocketClient.connect('/ws/agent', token: voiceToken);
|
||||
@override
|
||||
Stream<StreamEvent> subscribeExistingTask({
|
||||
required String sessionId,
|
||||
required String taskId,
|
||||
}) async* {
|
||||
final token = await _getAccessToken();
|
||||
await _webSocketClient.connect('/ws/agent', token: token);
|
||||
_webSocketClient.send({
|
||||
'event': 'subscribe_session',
|
||||
'data': {'sessionId': returnedSessionId, 'taskId': taskId},
|
||||
'data': {'sessionId': sessionId, 'taskId': taskId},
|
||||
});
|
||||
|
||||
yield* _webSocketClient.messages.transform(
|
||||
|
|
|
|||
|
|
@ -33,6 +33,14 @@ abstract class ChatRepository {
|
|||
required String message,
|
||||
});
|
||||
|
||||
/// Subscribes to an already-running agent task's WS stream without
|
||||
/// re-submitting a prompt. Used after a voice message upload to receive
|
||||
/// the agent's streamed response.
|
||||
Stream<StreamEvent> subscribeExistingTask({
|
||||
required String sessionId,
|
||||
required String taskId,
|
||||
});
|
||||
|
||||
/// Confirms a standing order draft proposed by the agent.
|
||||
Future<void> confirmStandingOrder(
|
||||
String sessionId,
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import '../widgets/stream_text_widget.dart';
|
|||
import '../widgets/approval_action_card.dart';
|
||||
import '../widgets/conversation_drawer.dart';
|
||||
import '../../../agent_call/presentation/pages/agent_call_page.dart';
|
||||
import '../widgets/voice_mic_button.dart';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Chat page – Timeline workflow style (inspired by Claude Code VSCode)
|
||||
|
|
@ -698,6 +699,14 @@ class _ChatPageState extends ConsumerState<ChatPage> {
|
|||
],
|
||||
)
|
||||
else
|
||||
Row(
|
||||
mainAxisSize: MainAxisSize.min,
|
||||
children: [
|
||||
VoiceMicButton(
|
||||
disabled: isAwaitingApproval,
|
||||
onAudioReady: (path) =>
|
||||
ref.read(chatProvider.notifier).sendVoiceMessage(path),
|
||||
),
|
||||
Padding(
|
||||
padding: const EdgeInsets.only(right: 4),
|
||||
child: IconButton(
|
||||
|
|
@ -709,6 +718,8 @@ class _ChatPageState extends ConsumerState<ChatPage> {
|
|||
),
|
||||
],
|
||||
),
|
||||
],
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -488,6 +488,91 @@ class ChatNotifier extends StateNotifier<ChatState> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Sends a recorded audio file as a voice message.
///
/// Flow:
/// 1. Shows a temporary "识别中..." user message bubble.
/// 2. Uploads audio to the backend voice-message endpoint.
///    Backend runs Whisper STT, optionally interrupts any running task,
///    and starts a new agent task with the transcript.
/// 3. Replaces the placeholder with the real transcript.
/// 4. Subscribes to the WS stream for the new task.
///
/// Acts as an async interrupt: any in-flight event subscription is
/// cancelled before the upload so the voice command takes priority.
Future<void> sendVoiceMessage(String audioPath) async {
  final tempId = '${DateTime.now().microsecondsSinceEpoch}_voice';
  final tempMsg = ChatMessage(
    id: tempId,
    role: MessageRole.user,
    content: '🎤 识别中...',
    timestamp: DateTime.now(),
    type: MessageType.text,
  );

  // Cancel any ongoing subscription (voice message acts as interrupt).
  _eventSubscription?.cancel();
  _eventSubscription = null;
  _flushBuffersSync();

  state = state.copyWith(
    messages: [...state.messages, tempMsg],
    agentStatus: AgentStatus.thinking,
    error: null,
  );

  try {
    final datasource = _ref.read(chatRemoteDatasourceProvider);
    final sessionId = state.sessionId ?? 'new';

    final result = await datasource.sendVoiceMessage(
      sessionId: sessionId,
      audioPath: audioPath,
    );

    // Accept both camelCase and snake_case keys, matching the defensive
    // fallbacks ChatRepositoryImpl uses for the same backend responses.
    final returnedSessionId = result['sessionId'] as String? ??
        result['session_id'] as String? ??
        sessionId;
    final taskId =
        result['taskId'] as String? ?? result['task_id'] as String?;
    final transcript = result['transcript'] as String? ?? '🎤';

    // Replace placeholder with the real transcript.
    final updatedMessages = state.messages
        .map((m) => m.id == tempId ? m.copyWith(content: transcript) : m)
        .toList();

    state = state.copyWith(
      messages: updatedMessages,
      sessionId: returnedSessionId,
      taskId: taskId,
    );

    // Subscribe to the WS stream for the running task.
    final repo = _ref.read(chatRepositoryProvider);
    final stream = repo.subscribeExistingTask(
      sessionId: returnedSessionId,
      taskId: taskId ?? '',
    );

    _eventSubscription = stream.listen(
      _handleStreamEvent, // tear-off instead of a wrapping closure
      onError: (error) {
        state = state.copyWith(
          agentStatus: AgentStatus.error,
          error: '语音消息处理失败: $error',
        );
      },
      onDone: () {
        if (state.agentStatus != AgentStatus.error) {
          state = state.copyWith(agentStatus: AgentStatus.idle);
        }
      },
    );
  } catch (e) {
    // Remove placeholder on failure so a stale "识别中..." bubble
    // doesn't linger in the transcript.
    state = state.copyWith(
      messages: state.messages.where((m) => m.id != tempId).toList(),
      agentStatus: AgentStatus.error,
      error: '语音识别失败: $e',
    );
  }
}
|
||||
|
||||
Future<void> cancelCurrentTask() async {
|
||||
final taskId = state.taskId;
|
||||
if (taskId == null && state.sessionId == null) return;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,187 @@
|
|||
import 'dart:io';
|
||||
import 'package:flutter/material.dart';
|
||||
import 'package:path_provider/path_provider.dart';
|
||||
import 'package:record/record.dart';
|
||||
import '../../../../core/theme/app_colors.dart';
|
||||
|
||||
/// WhatsApp-style press-and-hold mic button.
///
/// Gesture behaviour:
/// • Hold the button down → audio is recorded into a temp file.
/// • Let go → recording stops and the file path is delivered via
///   [onAudioReady].
/// • Drag upward before releasing → the recording is discarded.
///
/// Microphone permission is requested through the `record` package.
class VoiceMicButton extends StatefulWidget {
  const VoiceMicButton({
    super.key,
    required this.onAudioReady,
    this.disabled = false,
  });

  /// Invoked with the temp file path once the user releases the button
  /// without cancelling.
  final void Function(String audioPath) onAudioReady;

  /// Disables recording entirely (e.g. while awaiting an approval).
  final bool disabled;

  @override
  State<VoiceMicButton> createState() => _VoiceMicButtonState();
}
|
||||
|
||||
class _VoiceMicButtonState extends State<VoiceMicButton>
    with SingleTickerProviderStateMixin {
  final _recorder = AudioRecorder();
  bool _isRecording = false;
  bool _cancelled = false;

  // Slide-up cancel threshold (pixels above the press origin).
  static const double _cancelThreshold = 60.0;
  Offset? _pressOrigin;

  late final AnimationController _pulseController;
  late final Animation<double> _pulseAnimation;

  @override
  void initState() {
    super.initState();
    _pulseController = AnimationController(
      vsync: this,
      duration: const Duration(milliseconds: 800),
    );
    _pulseAnimation = Tween<double>(begin: 1.0, end: 1.25).animate(
      CurvedAnimation(parent: _pulseController, curve: Curves.easeInOut),
    );
  }

  @override
  void dispose() {
    _recorder.dispose();
    _pulseController.dispose();
    super.dispose();
  }

  /// Requests permission, starts recording to a temp .m4a file, and kicks
  /// off the pulse animation.
  Future<void> _startRecording() async {
    final hasPermission = await _recorder.hasPermission();
    // Fix: the widget may have been removed while the permission dialog
    // was up — bail out before touching context/state.
    if (!mounted) return;
    if (!hasPermission) {
      ScaffoldMessenger.of(context).showSnackBar(
        const SnackBar(content: Text('需要麦克风权限')),
      );
      return;
    }

    final dir = await getTemporaryDirectory();
    final path =
        '${dir.path}/voice_${DateTime.now().millisecondsSinceEpoch}.m4a';

    await _recorder.start(
      const RecordConfig(encoder: AudioEncoder.aacLc, sampleRate: 16000),
      path: path,
    );
    // Fix: avoid setState / use of a disposed AnimationController if the
    // widget went away during the async recorder start. dispose() already
    // released the recorder, so there is nothing further to clean up here.
    if (!mounted) return;

    setState(() {
      _isRecording = true;
      _cancelled = false;
    });
    _pulseController.repeat(reverse: true);
  }

  /// Stops the recorder; delivers the file to [VoiceMicButton.onAudioReady]
  /// unless [cancel] is set or the recording is too short to be useful.
  Future<void> _stopRecording({required bool cancel}) async {
    if (!_isRecording) return;

    _pulseController.stop();
    _pulseController.reset();

    final path = await _recorder.stop();
    // Fix: guard setState after the await; still deliver the audio below
    // even if the widget unmounted, since the callback targets the notifier.
    if (mounted) setState(() => _isRecording = false);

    if (cancel || path == null) return;

    // Ignore empty recordings (< ~0.3s).
    try {
      final size = await File(path).length();
      if (size < 2048) return;
    } catch (_) {
      // Unreadable/missing file — treat as nothing recorded.
      return;
    }

    widget.onAudioReady(path);
  }

  void _onLongPressStart(LongPressStartDetails details) {
    if (widget.disabled) return;
    _pressOrigin = details.globalPosition;
    _startRecording();
  }

  void _onLongPressMoveUpdate(LongPressMoveUpdateDetails details) {
    if (_pressOrigin == null || !_isRecording) return;
    // Positive dy = finger moved up from where the press began.
    final dy = _pressOrigin!.dy - details.globalPosition.dy;
    setState(() => _cancelled = dy > _cancelThreshold);
  }

  void _onLongPressEnd(LongPressEndDetails details) {
    _stopRecording(cancel: _cancelled);
    _pressOrigin = null;
  }

  void _onLongPressCancel() {
    _stopRecording(cancel: true);
    _pressOrigin = null;
  }

  @override
  Widget build(BuildContext context) {
    if (_isRecording) {
      return _buildRecordingButton();
    }
    return GestureDetector(
      onLongPressStart: _onLongPressStart,
      onLongPressMoveUpdate: _onLongPressMoveUpdate,
      onLongPressEnd: _onLongPressEnd,
      onLongPressCancel: _onLongPressCancel,
      child: IconButton(
        icon: Icon(
          Icons.mic_none,
          size: 22,
          color: widget.disabled ? AppColors.textMuted : null,
        ),
        tooltip: '按住录音',
        // Non-null so the button renders enabled; the actual work happens
        // in the long-press callbacks above.
        onPressed: widget.disabled ? null : () {},
      ),
    );
  }

  /// Pulsing red mic + "松开发送/松开取消" hint shown while recording.
  Widget _buildRecordingButton() {
    final isCancelling = _cancelled;
    return GestureDetector(
      onLongPressMoveUpdate: _onLongPressMoveUpdate,
      onLongPressEnd: _onLongPressEnd,
      onLongPressCancel: _onLongPressCancel,
      child: Padding(
        padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 4),
        child: Row(
          mainAxisSize: MainAxisSize.min,
          children: [
            ScaleTransition(
              scale: _pulseAnimation,
              child: Icon(
                Icons.mic,
                size: 22,
                color: isCancelling ? AppColors.textMuted : AppColors.error,
              ),
            ),
            const SizedBox(width: 4),
            Text(
              isCancelling ? '松开取消' : '松开发送',
              style: TextStyle(
                fontSize: 12,
                color: isCancelling ? AppColors.textMuted : AppColors.error,
              ),
            ),
          ],
        ),
      ),
    );
  }
}
|
||||
Loading…
Reference in New Issue