feat(chat): voice-to-text fills input box instead of auto-sending

- Add POST /api/v1/agent/transcribe endpoint (STT only, no agent trigger)
- Add transcribeAudio() to chat datasource and provider
- VoiceMicButton now fills the text input field with transcript; user reviews and sends manually
- Add OPENAI_API_KEY/OPENAI_BASE_URL to agent-service in docker-compose

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-06 07:01:39 -08:00 · 2026-03-06 07:01:39 -08:00 · 2182149c4c
parent 5721d75461
commit 2182149c4c
5 changed files with 87 additions and 5 deletions
--- a/deploy/docker/docker-compose.yml
+++ b/deploy/docker/docker-compose.yml
@ -137,6 +137,8 @@ services:
      - AGENT_ENGINE_TYPE=claude_agent_sdk
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
      - ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL}
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - OPENAI_BASE_URL=${OPENAI_BASE_URL}
      - AGENT_SERVICE_PORT=3002
    healthcheck:
      test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3002/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
--- a/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart
+++ b/it0_app/lib/features/chat/data/datasources/chat_remote_datasource.dart
@ -152,6 +152,27 @@ class ChatRemoteDatasource {
  /// Transcribes the audio file at [audioPath] to text.
  ///
  /// STT only: the transcript is NOT sent to the agent. The file is posted
  /// as multipart form data to `${ApiEndpoints.agent}/transcribe` together
  /// with the [language] field (defaults to `'zh'`).
  ///
  /// Returns the trimmed transcript, or an empty string when the response's
  /// `text` entry is missing or null.
  Future<String> transcribeAudio({
    required String audioPath,
    String language = 'zh',
  }) async {
    final formData = FormData.fromMap({
      'audio': await MultipartFile.fromFile(
        audioPath,
        // Last `/`-separated path segment is used as the upload filename.
        filename: audioPath.split('/').last,
      ),
      'language': language,
    });
    final response = await _dio.post(
      '${ApiEndpoints.agent}/transcribe',
      data: formData,
    );
    final data = response.data as Map<String, dynamic>;
    return (data['text'] as String? ?? '').trim();
  }

  /// Uploads an audio file to the voice-message endpoint.
  /// Backend performs STT, interrupts any running task if needed, and
  /// starts a new agent task with the transcript.
  /// Returns { sessionId, taskId, transcript }.
  Future<Map<String, dynamic>> sendVoiceMessage({
--- a/it0_app/lib/features/chat/presentation/pages/chat_page.dart
+++ b/it0_app/lib/features/chat/presentation/pages/chat_page.dart
@ -29,6 +29,7 @@ class _ChatPageState extends ConsumerState<ChatPage> {
  final _messageController = TextEditingController();
  final _scrollController = ScrollController();
  final List<ChatAttachment> _pendingAttachments = [];
  /// Whether a speech-to-text request started by [_transcribeToInput] is
  /// currently in flight; the voice mic button is disabled while this is true.
  bool _sttLoading = false;
  // -- Send ------------------------------------------------------------------
@ -54,6 +55,33 @@ class _ChatPageState extends ConsumerState<ChatPage> {
    _scrollToBottom();
  }
  /// Transcribes the recording at [audioPath] and puts the transcript into
  /// the message input so the user can review it before sending.
  ///
  /// While the request is in flight [_sttLoading] is true and the input shows
  /// a "recognizing…" placeholder. The text that was in the input beforehand
  /// is restored when recognition fails or yields an empty transcript, so an
  /// unsuccessful voice attempt never wipes an existing draft. On failure a
  /// snackbar is shown as well.
  Future<void> _transcribeToInput(String audioPath) async {
    // Re-entrancy guard: a second concurrent call would capture the
    // placeholder as the "draft" and later restore it into the input.
    if (_sttLoading) return;

    // Remember what the user had typed before the placeholder replaces it.
    final draft = _messageController.text;

    // Puts [value] into the input with the caret at the end.
    void fillInput(String value) {
      setState(() {
        _messageController.text = value;
        _messageController.selection = TextSelection.collapsed(
          offset: value.length,
        );
      });
    }

    setState(() {
      _sttLoading = true;
      _messageController.text = '识别中…';
    });
    try {
      final text =
          await ref.read(chatProvider.notifier).transcribeAudio(audioPath);
      // The page may have been disposed while the request was running.
      if (!mounted) return;
      // Nothing recognized: fall back to the draft instead of clearing it.
      fillInput(text.isEmpty ? draft : text);
    } catch (_) {
      if (!mounted) return;
      fillInput(draft);
      ScaffoldMessenger.of(context).showSnackBar(
        const SnackBar(content: Text('语音识别失败，请重试')),
      );
    } finally {
      // Runs on every exit path above, including the early returns.
      if (mounted) setState(() => _sttLoading = false);
    }
  }
  void _scrollToBottom({bool jump = false}) {
    WidgetsBinding.instance.addPostFrameCallback((_) {
      if (!_scrollController.hasClients) return;
@ -703,9 +731,8 @@ class _ChatPageState extends ConsumerState<ChatPage> {
                  mainAxisSize: MainAxisSize.min,
                  children: [
                    VoiceMicButton(
-                      disabled: isAwaitingApproval,
+                      disabled: isAwaitingApproval || _sttLoading,
-                      onAudioReady: (path) =>
+                      onAudioReady: _transcribeToInput,
-                          ref.read(chatProvider.notifier).sendVoiceMessage(path),
                    ),
                    Padding(
                      padding: const EdgeInsets.only(right: 4),
--- a/it0_app/lib/features/chat/presentation/providers/chat_providers.dart
+++ b/it0_app/lib/features/chat/presentation/providers/chat_providers.dart
@ -573,6 +573,11 @@ class ChatNotifier extends StateNotifier<ChatState> {
    }
  }
  /// Transcribes the audio file at [audioPath] to text via the chat remote
  /// datasource and returns the transcript.
  ///
  /// [language] is forwarded to the datasource's `language` parameter and
  /// defaults to `'zh'`, so existing single-argument callers behave as
  /// before. This method does not modify the notifier's [state].
  Future<String> transcribeAudio(
    String audioPath, {
    String language = 'zh',
  }) async {
    final datasource = _ref.read(chatRemoteDatasourceProvider);
    return datasource.transcribeAudio(
      audioPath: audioPath,
      language: language,
    );
  }
  Future<void> cancelCurrentTask() async {
    final taskId = state.taskId;
    if (taskId == null && state.sessionId == null) return;
--- a/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts
+++ b/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts
@ -403,6 +403,32 @@ export class AgentController {
    }
  }
  /**
   * Transcribe audio to text (STT only — does NOT trigger the agent).
   *
   * POST /api/v1/agent/transcribe
   * Content-Type: multipart/form-data
   * Fields: audio (file), language? (string, default 'zh')
   *
   * A missing, blank, or non-string `language` field falls back to 'zh'.
   *
   * Response: { text: string } — the trimmed transcript, or '' when the STT
   * service returns null/undefined.
   *
   * @throws BadRequestException when the `audio` part is missing or empty.
   */
  @Post('transcribe')
  @UseInterceptors(FileInterceptor('audio', { storage: memoryStorage() }))
  async transcribeAudio(
    @UploadedFile() file: { buffer: Buffer; originalname: string; mimetype: string } | undefined,
    @Body('language') language?: string,
  ) {
    if (!file?.buffer?.length) {
      throw new BadRequestException('audio file is required');
    }

    // Form fields are untrusted: `??` alone would forward an empty or
    // whitespace-only value (or a non-string) to the STT service.
    const requested = typeof language === 'string' ? language.trim() : '';
    const effectiveLanguage = requested || 'zh';

    const text = await this.sttService.transcribe(
      file.buffer,
      // Fall back to a default name when the client sent no filename.
      file.originalname || 'audio.m4a',
      effectiveLanguage,
    );
    return { text: text?.trim() ?? '' };
  }
  /**
   * Voice message endpoint — WhatsApp-style push-to-talk.
   *
@ -432,9 +458,10 @@ export class AgentController {
      throw new BadRequestException('audio file is required');
    }
-    const session = await this.sessionRepository.findById(sessionId);
+    let session = await this.sessionRepository.findById(sessionId);
    if (!session || session.tenantId !== tenantId) {
-      throw new NotFoundException(`Session ${sessionId} not found`);
+      // No existing session (e.g. first voice message, sessionId = 'new') — auto-create one
+      session = this.createNewSession(tenantId, this.engineRegistry.getActiveEngine().engineType);
    }
    // STT: transcribe audio → text