From 2182149c4ccc11c66e8787e9f8645242e938b596 Mon Sep 17 00:00:00 2001
From: hailin
Date: Fri, 6 Mar 2026 07:01:39 -0800
Subject: [PATCH] feat(chat): voice-to-text fills input box instead of
 auto-sending

- Add POST /api/v1/agent/transcribe endpoint (STT only, no agent trigger)
- Add transcribeAudio() to chat datasource and provider
- VoiceMicButton now fills the text input field with transcript; user
  reviews and sends manually
- Add OPENAI_API_KEY/OPENAI_BASE_URL to agent-service in docker-compose

Co-Authored-By: Claude Sonnet 4.6
---
 deploy/docker/docker-compose.yml              |  2 ++
 .../datasources/chat_remote_datasource.dart   | 21 ++++++++++++
 .../chat/presentation/pages/chat_page.dart    | 35 +++++++++++++++++--
 .../providers/chat_providers.dart             |  5 +++
 .../rest/controllers/agent.controller.ts      | 37 ++++++++++++++++---
 5 files changed, 94 insertions(+), 6 deletions(-)

diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml
index fa3cf63..90a2ff9 100644
--- a/deploy/docker/docker-compose.yml
+++ b/deploy/docker/docker-compose.yml
@@ -137,6 +137,8 @@ services:
       - AGENT_ENGINE_TYPE=claude_agent_sdk
       - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
       - ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL}
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+      - OPENAI_BASE_URL=${OPENAI_BASE_URL}
       - AGENT_SERVICE_PORT=3002
     healthcheck:
       test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3002/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
diff --git a/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart b/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart
index 2bed529..8ddf06c 100644
--- a/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart
+++ b/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart
@@ -152,5 +152,26 @@ class ChatRemoteDatasource {
+  /// Transcribe audio to text (STT only, does NOT send to agent).
+  /// Returns the transcript string.
+  Future<String> transcribeAudio({
+    required String audioPath,
+    String language = 'zh',
+  }) async {
+    final formData = FormData.fromMap({
+      'audio': await MultipartFile.fromFile(
+        audioPath,
+        filename: audioPath.split('/').last,
+      ),
+      'language': language,
+    });
+    final response = await _dio.post(
+      '${ApiEndpoints.agent}/transcribe',
+      data: formData,
+    );
+    final data = response.data as Map<String, dynamic>;
+    return (data['text'] as String? ?? '').trim();
+  }
+
   /// Uploads an audio file to the voice-message endpoint.
   /// Backend performs STT, interrupts any running task if needed, and
   /// starts a new agent task with the transcript.
   /// Returns { sessionId, taskId, transcript }.
   Future<Map<String, dynamic>> sendVoiceMessage({
diff --git a/it0_app/lib/features/chat/presentation/pages/chat_page.dart b/it0_app/lib/features/chat/presentation/pages/chat_page.dart
index 5256488..2e9e6fd 100644
--- a/it0_app/lib/features/chat/presentation/pages/chat_page.dart
+++ b/it0_app/lib/features/chat/presentation/pages/chat_page.dart
@@ -29,6 +29,7 @@ class _ChatPageState extends ConsumerState<ChatPage> {
   final _messageController = TextEditingController();
   final _scrollController = ScrollController();
   final List _pendingAttachments = [];
+  bool _sttLoading = false;

   // -- Send ------------------------------------------------------------------

@@ -54,6 +55,35 @@ class _ChatPageState extends ConsumerState<ChatPage> {
     _scrollToBottom();
   }

+  Future<void> _transcribeToInput(String audioPath) async {
+    // Keep the user's draft so a failed transcription does not destroy it.
+    final previousText = _messageController.text;
+    setState(() {
+      _sttLoading = true;
+      _messageController.text = '识别中…';
+    });
+    try {
+      final text = await ref.read(chatProvider.notifier).transcribeAudio(audioPath);
+      if (mounted) {
+        setState(() {
+          _messageController.text = text;
+          _messageController.selection = TextSelection.collapsed(
+            offset: text.length,
+          );
+        });
+      }
+    } catch (e) {
+      if (mounted) {
+        setState(() => _messageController.text = previousText);
+        ScaffoldMessenger.of(context).showSnackBar(
+          const SnackBar(content: Text('语音识别失败,请重试')),
+        );
+      }
+    } finally {
+      if (mounted) setState(() => _sttLoading = false);
+    }
+  }
+
   void _scrollToBottom({bool jump = false}) {
     WidgetsBinding.instance.addPostFrameCallback((_) {
       if (!_scrollController.hasClients) return;
@@ -703,9 +733,8 @@ class _ChatPageState extends ConsumerState<ChatPage> {
                   mainAxisSize: MainAxisSize.min,
                   children: [
                     VoiceMicButton(
-                      disabled: isAwaitingApproval,
-                      onAudioReady: (path) =>
-                          ref.read(chatProvider.notifier).sendVoiceMessage(path),
+                      disabled: isAwaitingApproval || _sttLoading,
+                      onAudioReady: _transcribeToInput,
                     ),
                     Padding(
                       padding: const EdgeInsets.only(right: 4),
diff --git a/it0_app/lib/features/chat/presentation/providers/chat_providers.dart b/it0_app/lib/features/chat/presentation/providers/chat_providers.dart
index ae3eac7..347e985 100644
--- a/it0_app/lib/features/chat/presentation/providers/chat_providers.dart
+++ b/it0_app/lib/features/chat/presentation/providers/chat_providers.dart
@@ -573,6 +573,11 @@ class ChatNotifier extends StateNotifier<ChatState> {
     }
   }

+  Future<String> transcribeAudio(String audioPath) async {
+    final datasource = _ref.read(chatRemoteDatasourceProvider);
+    return datasource.transcribeAudio(audioPath: audioPath);
+  }
+
   Future<void> cancelCurrentTask() async {
     final taskId = state.taskId;
     if (taskId == null && state.sessionId == null) return;
diff --git a/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts b/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts
index 7856c63..f16ccb1 100644
--- a/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts
+++ b/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts
@@ -403,6 +403,32 @@ export class AgentController {
     }
   }

+  /**
+   * Transcribe audio to text (STT only — does NOT trigger the agent).
+   *
+   * POST /api/v1/agent/transcribe
+   * Content-Type: multipart/form-data
+   * Fields: audio (file), language? (string, default 'zh')
+   *
+   * Response: { text: string }
+   */
+  @Post('transcribe')
+  @UseInterceptors(FileInterceptor('audio', { storage: memoryStorage() }))
+  async transcribeAudio(
+    @UploadedFile() file: { buffer: Buffer; originalname: string; mimetype: string } | undefined,
+    @Body('language') language?: string,
+  ) {
+    if (!file?.buffer?.length) {
+      throw new BadRequestException('audio file is required');
+    }
+    const text = await this.sttService.transcribe(
+      file.buffer,
+      file.originalname || 'audio.m4a',
+      language ?? 'zh',
+    );
+    return { text: text?.trim() ?? '' };
+  }
+
   /**
    * Voice message endpoint — WhatsApp-style push-to-talk.
    *
@@ -432,9 +458,14 @@
       throw new BadRequestException('audio file is required');
     }

-    const session = await this.sessionRepository.findById(sessionId);
-    if (!session || session.tenantId !== tenantId) {
-      throw new NotFoundException(`Session ${sessionId} not found`);
+    let session = await this.sessionRepository.findById(sessionId);
+    if (session && session.tenantId !== tenantId) {
+      // Session exists but belongs to a different tenant — reject rather than mask it.
+      throw new NotFoundException(`Session ${sessionId} not found`);
+    }
+    if (!session) {
+      // No existing session (e.g. first voice message, sessionId = 'new') — auto-create one
+      session = this.createNewSession(tenantId, this.engineRegistry.getActiveEngine().engineType);
     }

     // STT: transcribe audio → text