feat(it0_app): add WhatsApp-style voice message with async agent interrupt

New VoiceMicButton widget (press-and-hold to record, release to send):
- Records audio to a temp .m4a file via the `record` package
- Slide-up gesture cancels recording without sending
- Pulsing red mic icon + "松开发送/松开取消" feedback during recording

New flow for voice messages:
  1. Temp "🎤 识别中..." bubble shown immediately
  2. Audio uploaded to POST /api/v1/agent/sessions/:id/voice-message
     (multipart/form-data; backend runs Whisper STT)
  3. Placeholder replaced with real transcript
  4. WS stream subscribed via new subscribeExistingTask() to receive
     agent's streaming response — same pipeline as text chat

Voice messages act as async interrupts: if the agent is mid-task the
backend hard-cancels it before processing the new voice command,
so whoever presses the mic button always takes priority.

Files changed:
  chat_remote_datasource.dart — sendVoiceMessage() multipart upload
  chat_repository.dart        — subscribeExistingTask() interface method
  chat_repository_impl.dart   — implement subscribeExistingTask(); fix
                                sendVoiceMessage() stub
  chat_providers.dart         — ChatNotifier.sendVoiceMessage()
  voice_mic_button.dart       — NEW press-and-hold recording widget
  chat_page.dart              — mic button added to input area

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-06 03:20:41 -08:00
parent a2af76bcd7
commit 55b983a950
6 changed files with 333 additions and 25 deletions

View File

@ -149,4 +149,27 @@ class ChatRemoteDatasource {
}, },
); );
} }
/// Sends a recorded audio clip to the backend voice-message endpoint.
///
/// Posts the file at [audioPath] as `multipart/form-data` to
/// `POST <sessions>/<sessionId>/voice-message`. The backend runs STT on
/// the clip, interrupts any task already running on the session, and
/// starts a new agent task from the transcript.
///
/// The decoded response map contains `sessionId`, `taskId`, `transcript`.
Future<Map<String, dynamic>> sendVoiceMessage({
  required String sessionId,
  required String audioPath,
  String language = 'zh',
}) async {
  // Derive the upload filename from the last path segment.
  final uploadName = audioPath.split('/').last;
  final audioPart = await MultipartFile.fromFile(
    audioPath,
    filename: uploadName,
  );
  final body = FormData.fromMap({
    'audio': audioPart,
    'language': language,
  });
  final res = await _dio.post(
    '${ApiEndpoints.sessions}/$sessionId/voice-message',
    data: body,
  );
  return res.data as Map<String, dynamic>;
}
} }

View File

@ -82,28 +82,22 @@ class ChatRepositoryImpl implements ChatRepository {
required String sessionId, required String sessionId,
required String audioPath, required String audioPath,
}) async* { }) async* {
// For voice messages, we POST the audio file, then subscribe to WebSocket events // Kept for interface compatibility ChatNotifier calls sendVoiceMessage
// similar to sendMessage. The backend handles STT + agent processing. // on the datasource directly (to get the transcript), then calls
final response = await _remoteDatasource.createTask( // subscribeExistingTask. This method is not used.
sessionId: sessionId, yield* subscribeExistingTask(sessionId: sessionId, taskId: '');
message: '[voice_input]', }
attachments: [{'filePath': audioPath, 'mediaType': 'audio/wav'}],
);
final returnedSessionId = response['sessionId'] as String? ?? @override
response['session_id'] as String? ?? Stream<StreamEvent> subscribeExistingTask({
sessionId; required String sessionId,
final taskId = response['taskId'] as String? ?? response['task_id'] as String?; required String taskId,
}) async* {
// Emit the real sessionId and taskId so the notifier can capture them final token = await _getAccessToken();
yield SessionInfoEvent(returnedSessionId); await _webSocketClient.connect('/ws/agent', token: token);
if (taskId != null) yield TaskInfoEvent(taskId);
final voiceToken = await _getAccessToken();
await _webSocketClient.connect('/ws/agent', token: voiceToken);
_webSocketClient.send({ _webSocketClient.send({
'event': 'subscribe_session', 'event': 'subscribe_session',
'data': {'sessionId': returnedSessionId, 'taskId': taskId}, 'data': {'sessionId': sessionId, 'taskId': taskId},
}); });
yield* _webSocketClient.messages.transform( yield* _webSocketClient.messages.transform(

View File

@ -33,6 +33,14 @@ abstract class ChatRepository {
required String message, required String message,
}); });
/// Subscribes to an already-running agent task's WS stream without
/// re-submitting a prompt. Used after a voice message upload to receive
/// the agent's streamed response.
///
/// [sessionId] identifies the chat session; [taskId] identifies the task
/// started by the backend (implementations forward both over the socket).
/// The returned stream carries the same [StreamEvent] pipeline as the
/// regular text-chat flow.
Stream<StreamEvent> subscribeExistingTask({
  required String sessionId,
  required String taskId,
});
/// Confirms a standing order draft proposed by the agent. /// Confirms a standing order draft proposed by the agent.
Future<void> confirmStandingOrder( Future<void> confirmStandingOrder(
String sessionId, String sessionId,

View File

@ -12,6 +12,7 @@ import '../widgets/stream_text_widget.dart';
import '../widgets/approval_action_card.dart'; import '../widgets/approval_action_card.dart';
import '../widgets/conversation_drawer.dart'; import '../widgets/conversation_drawer.dart';
import '../../../agent_call/presentation/pages/agent_call_page.dart'; import '../../../agent_call/presentation/pages/agent_call_page.dart';
import '../widgets/voice_mic_button.dart';
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Chat page Timeline workflow style (inspired by Claude Code VSCode) // Chat page Timeline workflow style (inspired by Claude Code VSCode)
@ -698,6 +699,14 @@ class _ChatPageState extends ConsumerState<ChatPage> {
], ],
) )
else else
Row(
mainAxisSize: MainAxisSize.min,
children: [
VoiceMicButton(
disabled: isAwaitingApproval,
onAudioReady: (path) =>
ref.read(chatProvider.notifier).sendVoiceMessage(path),
),
Padding( Padding(
padding: const EdgeInsets.only(right: 4), padding: const EdgeInsets.only(right: 4),
child: IconButton( child: IconButton(
@ -709,6 +718,8 @@ class _ChatPageState extends ConsumerState<ChatPage> {
), ),
], ],
), ),
],
),
); );
} }

View File

@ -488,6 +488,91 @@ class ChatNotifier extends StateNotifier<ChatState> {
} }
} }
/// Sends a recorded audio file as a voice message.
///
/// Flow:
/// 1. A temporary "识别中..." user bubble is shown immediately.
/// 2. The audio is uploaded to the backend voice-message endpoint, which
///    runs Whisper STT, interrupts any running task if needed, and starts
///    a new agent task from the transcript.
/// 3. The placeholder bubble is swapped for the real transcript.
/// 4. The new task's WebSocket stream is subscribed for the response.
Future<void> sendVoiceMessage(String audioPath) async {
  final placeholderId = '${DateTime.now().microsecondsSinceEpoch}_voice';
  final placeholder = ChatMessage(
    id: placeholderId,
    role: MessageRole.user,
    content: '🎤 识别中...',
    timestamp: DateTime.now(),
    type: MessageType.text,
  );

  // A voice message acts as an interrupt: drop the current WS
  // subscription and flush any partially buffered output first.
  _eventSubscription?.cancel();
  _eventSubscription = null;
  _flushBuffersSync();

  state = state.copyWith(
    messages: [...state.messages, placeholder],
    agentStatus: AgentStatus.thinking,
    error: null,
  );

  try {
    final datasource = _ref.read(chatRemoteDatasourceProvider);
    final currentSession = state.sessionId ?? 'new';
    final result = await datasource.sendVoiceMessage(
      sessionId: currentSession,
      audioPath: audioPath,
    );

    final newSessionId = result['sessionId'] as String? ?? currentSession;
    final newTaskId = result['taskId'] as String?;
    final transcript = result['transcript'] as String? ?? '🎤';

    // Swap the placeholder bubble for the recognized text.
    state = state.copyWith(
      messages: [
        for (final m in state.messages)
          if (m.id == placeholderId) m.copyWith(content: transcript) else m,
      ],
      sessionId: newSessionId,
      taskId: newTaskId,
    );

    // Attach to the freshly created task's event stream.
    final repo = _ref.read(chatRepositoryProvider);
    _eventSubscription = repo
        .subscribeExistingTask(
          sessionId: newSessionId,
          taskId: newTaskId ?? '',
        )
        .listen(
          _handleStreamEvent,
          onError: (error) {
            state = state.copyWith(
              agentStatus: AgentStatus.error,
              error: '语音消息处理失败: $error',
            );
          },
          onDone: () {
            if (state.agentStatus != AgentStatus.error) {
              state = state.copyWith(agentStatus: AgentStatus.idle);
            }
          },
        );
  } catch (e) {
    // Upload/STT failed: drop the placeholder and surface the error.
    state = state.copyWith(
      messages: [
        for (final m in state.messages)
          if (m.id != placeholderId) m,
      ],
      agentStatus: AgentStatus.error,
      error: '语音识别失败: $e',
    );
  }
}
Future<void> cancelCurrentTask() async { Future<void> cancelCurrentTask() async {
final taskId = state.taskId; final taskId = state.taskId;
if (taskId == null && state.sessionId == null) return; if (taskId == null && state.sessionId == null) return;

View File

@ -0,0 +1,187 @@
import 'dart:io';
import 'package:flutter/material.dart';
import 'package:path_provider/path_provider.dart';
import 'package:record/record.dart';
import '../../../../core/theme/app_colors.dart';
/// Press-and-hold microphone button in the WhatsApp style.
///
/// Holding the button records audio to a temporary file; letting go stops
/// the recording and hands the file path to [onAudioReady]. Dragging the
/// finger upward while holding aborts the recording so nothing is sent.
///
/// Microphone permission is requested through the `record` package.
class VoiceMicButton extends StatefulWidget {
  const VoiceMicButton({
    super.key,
    required this.onAudioReady,
    this.disabled = false,
  });

  /// Invoked with the temp file's path when the user releases the button.
  final void Function(String audioPath) onAudioReady;

  /// Whether the button is inert (e.g. while awaiting approval).
  final bool disabled;

  @override
  State<VoiceMicButton> createState() => _VoiceMicButtonState();
}
class _VoiceMicButtonState extends State<VoiceMicButton>
    with SingleTickerProviderStateMixin {
  final _recorder = AudioRecorder();
  bool _isRecording = false;
  bool _cancelled = false;

  // True while the finger is down. Guards against a race: _startRecording()
  // awaits permission / temp-dir / recorder start, and if the press ends
  // before those awaits complete, _stopRecording() would early-return
  // (_isRecording still false) and the recorder would keep running forever.
  bool _pressActive = false;

  // Slide-up cancel threshold (pixels above press origin).
  static const double _cancelThreshold = 60.0;
  Offset? _pressOrigin;

  late final AnimationController _pulseController;
  late final Animation<double> _pulseAnimation;

  @override
  void initState() {
    super.initState();
    _pulseController = AnimationController(
      vsync: this,
      duration: const Duration(milliseconds: 800),
    );
    _pulseAnimation = Tween<double>(begin: 1.0, end: 1.25).animate(
      CurvedAnimation(parent: _pulseController, curve: Curves.easeInOut),
    );
  }

  @override
  void dispose() {
    _recorder.dispose();
    _pulseController.dispose();
    super.dispose();
  }

  /// Requests mic permission, then starts recording to a temp .m4a file.
  ///
  /// Re-checks [_pressActive] and [mounted] after every await so a press
  /// released (or widget disposed) mid-setup never leaves the recorder on.
  Future<void> _startRecording() async {
    final hasPermission = await _recorder.hasPermission();
    if (!hasPermission) {
      if (mounted) {
        ScaffoldMessenger.of(context).showSnackBar(
          const SnackBar(content: Text('需要麦克风权限')),
        );
      }
      return;
    }
    if (!mounted || !_pressActive) return; // released while awaiting permission

    final dir = await getTemporaryDirectory();
    if (!mounted || !_pressActive) return; // released while resolving temp dir

    final path =
        '${dir.path}/voice_${DateTime.now().millisecondsSinceEpoch}.m4a';
    await _recorder.start(
      const RecordConfig(encoder: AudioEncoder.aacLc, sampleRate: 16000),
      path: path,
    );
    // The press may have ended while start() was in flight; if so, stop
    // immediately and discard the (essentially empty) recording.
    if (!mounted || !_pressActive) {
      await _recorder.stop();
      return;
    }
    setState(() {
      _isRecording = true;
      _cancelled = false;
    });
    _pulseController.repeat(reverse: true);
  }

  /// Stops the recorder; on a normal release forwards the file path to
  /// [VoiceMicButton.onAudioReady]. With [cancel] the audio is discarded.
  Future<void> _stopRecording({required bool cancel}) async {
    if (!_isRecording) return;
    _pulseController.stop();
    _pulseController.reset();
    final path = await _recorder.stop();
    // The widget may have been disposed while stop() was awaited.
    if (mounted) {
      setState(() => _isRecording = false);
    } else {
      _isRecording = false;
    }
    if (cancel || path == null) return;
    // Ignore accidental taps that produced a near-empty file (< ~0.3s).
    try {
      final size = await File(path).length();
      if (size < 2048) return;
    } catch (_) {
      return; // file missing/unreadable — treat as nothing recorded
    }
    widget.onAudioReady(path);
  }

  void _onLongPressStart(LongPressStartDetails details) {
    if (widget.disabled) return;
    _pressActive = true;
    _pressOrigin = details.globalPosition;
    _startRecording();
  }

  void _onLongPressMoveUpdate(LongPressMoveUpdateDetails details) {
    if (_pressOrigin == null || !_isRecording) return;
    // Positive dy = finger moved up from where the press started.
    final dy = _pressOrigin!.dy - details.globalPosition.dy;
    setState(() => _cancelled = dy > _cancelThreshold);
  }

  void _onLongPressEnd(LongPressEndDetails details) {
    _pressActive = false;
    _stopRecording(cancel: _cancelled);
    _pressOrigin = null;
  }

  void _onLongPressCancel() {
    _pressActive = false;
    _stopRecording(cancel: true);
    _pressOrigin = null;
  }

  @override
  Widget build(BuildContext context) {
    if (_isRecording) {
      return _buildRecordingButton();
    }
    return GestureDetector(
      onLongPressStart: _onLongPressStart,
      onLongPressMoveUpdate: _onLongPressMoveUpdate,
      onLongPressEnd: _onLongPressEnd,
      onLongPressCancel: _onLongPressCancel,
      child: IconButton(
        icon: Icon(
          Icons.mic_none,
          size: 22,
          color: widget.disabled ? AppColors.textMuted : null,
        ),
        tooltip: '按住录音',
        // Empty onPressed keeps the button enabled so long-press is caught.
        onPressed: widget.disabled ? null : () {},
      ),
    );
  }

  /// Pulsing red mic + "松开发送/松开取消" hint shown while recording.
  Widget _buildRecordingButton() {
    final isCancelling = _cancelled;
    return GestureDetector(
      onLongPressMoveUpdate: _onLongPressMoveUpdate,
      onLongPressEnd: _onLongPressEnd,
      onLongPressCancel: _onLongPressCancel,
      child: Padding(
        padding: const EdgeInsets.symmetric(horizontal: 4, vertical: 4),
        child: Row(
          mainAxisSize: MainAxisSize.min,
          children: [
            ScaleTransition(
              scale: _pulseAnimation,
              child: Icon(
                Icons.mic,
                size: 22,
                color: isCancelling ? AppColors.textMuted : AppColors.error,
              ),
            ),
            const SizedBox(width: 4),
            Text(
              isCancelling ? '松开取消' : '松开发送',
              style: TextStyle(
                fontSize: 12,
                color: isCancelling ? AppColors.textMuted : AppColors.error,
              ),
            ),
          ],
        ),
      ),
    );
  }
}