From 2182149c4ccc11c66e8787e9f8645242e938b596 Mon Sep 17 00:00:00 2001
From: hailin
Date: Fri, 6 Mar 2026 07:01:39 -0800
Subject: [PATCH] feat(chat): voice-to-text fills input box instead of
 auto-sending

- Add POST /api/v1/agent/transcribe endpoint (STT only, no agent trigger)
- Add transcribeAudio() to chat datasource and provider
- VoiceMicButton now fills the text input field with transcript; user
  reviews and sends manually
- Add OPENAI_API_KEY/OPENAI_BASE_URL to agent-service in docker-compose

Co-Authored-By: Claude Sonnet 4.6
---
 deploy/docker/docker-compose.yml              |  2 ++
 .../datasources/chat_remote_datasource.dart   | 21 ++++++++++++
 .../chat/presentation/pages/chat_page.dart    | 35 +++++++++++++++++--
 .../providers/chat_providers.dart             |  5 +++
 .../rest/controllers/agent.controller.ts      | 37 ++++++++++++++++---
 5 files changed, 94 insertions(+), 6 deletions(-)

diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml
index fa3cf63..90a2ff9 100644
--- a/deploy/docker/docker-compose.yml
+++ b/deploy/docker/docker-compose.yml
@@ -137,6 +137,8 @@ services:
       - AGENT_ENGINE_TYPE=claude_agent_sdk
       - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
       - ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL}
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+      - OPENAI_BASE_URL=${OPENAI_BASE_URL}
       - AGENT_SERVICE_PORT=3002
     healthcheck:
       test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3002/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
diff --git a/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart b/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart
index 2bed529..8ddf06c 100644
--- a/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart
+++ b/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart
@@ -152,5 +152,26 @@ class ChatRemoteDatasource {
+  /// Transcribe audio to text (STT only, does NOT send to agent).
+  /// Returns the transcript string.
+  Future<String> transcribeAudio({
+    required String audioPath,
+    String language = 'zh',
+  }) async {
+    final formData = FormData.fromMap({
+      'audio': await MultipartFile.fromFile(
+        audioPath,
+        filename: audioPath.split('/').last,
+      ),
+      'language': language,
+    });
+    final response = await _dio.post(
+      '${ApiEndpoints.agent}/transcribe',
+      data: formData,
+    );
+    final data = response.data as Map<String, dynamic>;
+    return (data['text'] as String? ?? '').trim();
+  }
+
   /// Uploads an audio file to the voice-message endpoint.
   /// Backend performs STT, interrupts any running task if needed, and
   /// starts a new agent task with the transcript.
   /// Returns { sessionId, taskId, transcript }.
   Future<Map<String, dynamic>> sendVoiceMessage({
diff --git a/it0_app/lib/features/chat/presentation/pages/chat_page.dart b/it0_app/lib/features/chat/presentation/pages/chat_page.dart
index 5256488..2e9e6fd 100644
--- a/it0_app/lib/features/chat/presentation/pages/chat_page.dart
+++ b/it0_app/lib/features/chat/presentation/pages/chat_page.dart
@@ -29,6 +29,7 @@ class _ChatPageState extends ConsumerState<ChatPage> {
   final _messageController = TextEditingController();
   final _scrollController = ScrollController();
   final List _pendingAttachments = [];
+  bool _sttLoading = false;

   // -- Send ------------------------------------------------------------------

@@ -54,6 +55,35 @@ class _ChatPageState extends ConsumerState<ChatPage> {
     _scrollToBottom();
   }

+  Future<void> _transcribeToInput(String audioPath) async {
+    // Keep the user's draft so a failed transcription does not destroy it.
+    final previousText = _messageController.text;
+    setState(() {
+      _sttLoading = true;
+      _messageController.text = '识别中…';
+    });
+    try {
+      final text = await ref.read(chatProvider.notifier).transcribeAudio(audioPath);
+      if (mounted) {
+        setState(() {
+          _messageController.text = text;
+          _messageController.selection = TextSelection.collapsed(
+            offset: text.length,
+          );
+        });
+      }
+    } catch (e) {
+      if (mounted) {
+        setState(() => _messageController.text = previousText);
+        ScaffoldMessenger.of(context).showSnackBar(
+          const SnackBar(content: Text('语音识别失败,请重试')),
+        );
+      }
+    } finally {
+      if (mounted) setState(() => _sttLoading = false);
+    }
+  }
+
   void _scrollToBottom({bool jump = false}) {
     WidgetsBinding.instance.addPostFrameCallback((_) {
       if (!_scrollController.hasClients) return;
@@ -703,9 +733,8 @@ class _ChatPageState extends ConsumerState<ChatPage> {
                   mainAxisSize: MainAxisSize.min,
                   children: [
                     VoiceMicButton(
-                      disabled: isAwaitingApproval,
-                      onAudioReady: (path) =>
-                          ref.read(chatProvider.notifier).sendVoiceMessage(path),
+                      disabled: isAwaitingApproval || _sttLoading,
+                      onAudioReady: _transcribeToInput,
                     ),
                     Padding(
                       padding: const EdgeInsets.only(right: 4),
diff --git a/it0_app/lib/features/chat/presentation/providers/chat_providers.dart b/it0_app/lib/features/chat/presentation/providers/chat_providers.dart
index ae3eac7..347e985 100644
--- a/it0_app/lib/features/chat/presentation/providers/chat_providers.dart
+++ b/it0_app/lib/features/chat/presentation/providers/chat_providers.dart
@@ -573,6 +573,11 @@ class ChatNotifier extends StateNotifier<ChatState> {
     }
   }

+  Future<String> transcribeAudio(String audioPath) async {
+    final datasource = _ref.read(chatRemoteDatasourceProvider);
+    return datasource.transcribeAudio(audioPath: audioPath);
+  }
+
   Future<void> cancelCurrentTask() async {
     final taskId = state.taskId;
     if (taskId == null && state.sessionId == null) return;
diff --git a/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts b/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts
index 7856c63..f16ccb1 100644
--- a/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts
+++ b/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts
@@ -403,6 +403,32 @@ export class AgentController {
     }
   }

+  /**
+   * Transcribe audio to text (STT only — does NOT trigger the agent).
+   *
+   * POST /api/v1/agent/transcribe
+   * Content-Type: multipart/form-data
+   * Fields: audio (file), language? (string, default 'zh')
+   *
+   * Response: { text: string }
+   */
+  @Post('transcribe')
+  @UseInterceptors(FileInterceptor('audio', { storage: memoryStorage() }))
+  async transcribeAudio(
+    @UploadedFile() file: { buffer: Buffer; originalname: string; mimetype: string } | undefined,
+    @Body('language') language?: string,
+  ) {
+    if (!file?.buffer?.length) {
+      throw new BadRequestException('audio file is required');
+    }
+    const text = await this.sttService.transcribe(
+      file.buffer,
+      file.originalname || 'audio.m4a',
+      language ?? 'zh',
+    );
+    return { text: text?.trim() ?? '' };
+  }
+
   /**
    * Voice message endpoint — WhatsApp-style push-to-talk.
    *
@@ -432,9 +458,14 @@
       throw new BadRequestException('audio file is required');
     }

-    const session = await this.sessionRepository.findById(sessionId);
-    if (!session || session.tenantId !== tenantId) {
-      throw new NotFoundException(`Session ${sessionId} not found`);
+    let session = await this.sessionRepository.findById(sessionId);
+    if (session && session.tenantId !== tenantId) {
+      // Session exists but belongs to a different tenant — reject rather than mask it.
+      throw new NotFoundException(`Session ${sessionId} not found`);
+    }
+    if (!session) {
+      // No existing session (e.g. first voice message, sessionId = 'new') — auto-create one
+      session = this.createNewSession(tenantId, this.engineRegistry.getActiveEngine().engineType);
     }

     // STT: transcribe audio → text