feat(chat): voice-to-text fills input box instead of auto-sending
- Add POST /api/v1/agent/transcribe endpoint (STT only, no agent trigger)
- Add transcribeAudio() to chat datasource and provider
- VoiceMicButton now fills the text input field with transcript; user reviews and sends manually
- Add OPENAI_API_KEY/OPENAI_BASE_URL to agent-service in docker-compose

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
5721d75461
commit
2182149c4c
|
|
@ -137,6 +137,8 @@ services:
|
||||||
- AGENT_ENGINE_TYPE=claude_agent_sdk
|
- AGENT_ENGINE_TYPE=claude_agent_sdk
|
||||||
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
|
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
|
||||||
- ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL}
|
- ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL}
|
||||||
|
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||||
|
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
|
||||||
- AGENT_SERVICE_PORT=3002
|
- AGENT_SERVICE_PORT=3002
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3002/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
|
test: ["CMD-SHELL", "node -e \"require('http').get('http://localhost:3002/',r=>{process.exit(r.statusCode<500?0:1)}).on('error',()=>process.exit(1))\""]
|
||||||
|
|
|
||||||
|
|
@ -152,6 +152,27 @@ class ChatRemoteDatasource {
|
||||||
|
|
||||||
/// Transcribes the audio file at [audioPath] to text.
///
/// Speech-to-text only: the transcript is NOT sent to the agent. [language]
/// is forwarded as the `language` form field and defaults to `'zh'`.
///
/// Returns the trimmed transcript, or an empty string when the response has
/// no string `text` field.
///
/// Throws a [FormatException] when the response body is not a JSON object.
Future<String> transcribeAudio({
  required String audioPath,
  String language = 'zh',
}) async {
  final formData = FormData.fromMap({
    'audio': await MultipartFile.fromFile(
      audioPath,
      // Send only the last path segment as the upload filename.
      filename: audioPath.split('/').last,
    ),
    'language': language,
  });
  final response = await _dio.post(
    '${ApiEndpoints.agent}/transcribe',
    data: formData,
  );

  // Check the payload shape instead of force-casting, so a malformed
  // response surfaces as a descriptive exception rather than a cast error.
  final data = response.data;
  if (data is! Map) {
    throw FormatException(
      'Unexpected transcribe response: ${data.runtimeType}',
    );
  }
  final text = data['text'];
  return text is String ? text.trim() : '';
}

/// Uploads an audio file to the voice-message endpoint.
/// Backend performs STT, interrupts any running task if needed, and
/// starts a new agent task with the transcript.
/// Returns { sessionId, taskId, transcript }.
|
||||||
Future<Map<String, dynamic>> sendVoiceMessage({
|
Future<Map<String, dynamic>> sendVoiceMessage({
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,7 @@ class _ChatPageState extends ConsumerState<ChatPage> {
|
||||||
final _messageController = TextEditingController();
|
final _messageController = TextEditingController();
|
||||||
final _scrollController = ScrollController();
|
final _scrollController = ScrollController();
|
||||||
final List<ChatAttachment> _pendingAttachments = [];
|
final List<ChatAttachment> _pendingAttachments = [];
|
||||||
|
bool _sttLoading = false;
|
||||||
|
|
||||||
// -- Send ------------------------------------------------------------------
|
// -- Send ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
@ -54,6 +55,33 @@ class _ChatPageState extends ConsumerState<ChatPage> {
|
||||||
_scrollToBottom();
|
_scrollToBottom();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Transcribes the recording at [audioPath] and places the transcript in the
/// message input so the user can review it before sending.
///
/// While the request is in flight `_sttLoading` is true and the input shows
/// a placeholder. If transcription fails or yields no text, the text that was
/// in the input beforehand is restored; on failure a snackbar is also shown.
Future<void> _transcribeToInput(String audioPath) async {
  // A second request would race the first one for the input field.
  if (_sttLoading) return;

  // Keep the user's draft: the placeholder below overwrites the input.
  final draft = _messageController.text;
  setState(() {
    _sttLoading = true;
    _messageController.text = '识别中…';
  });
  try {
    final transcript =
        await ref.read(chatProvider.notifier).transcribeAudio(audioPath);
    if (!mounted) return;

    // An empty transcript must not wipe what the user had already typed.
    final text = transcript.isEmpty ? draft : transcript;
    setState(() {
      _messageController.text = text;
      // Put the caret at the end so the user can keep typing.
      _messageController.selection = TextSelection.collapsed(
        offset: text.length,
      );
    });
  } catch (_) {
    // UI boundary: any failure is reported to the user, never rethrown.
    if (!mounted) return;
    setState(() => _messageController.text = draft);
    ScaffoldMessenger.of(context).showSnackBar(
      const SnackBar(content: Text('语音识别失败,请重试')),
    );
  } finally {
    // Runs on every path, including the early returns above.
    if (mounted) setState(() => _sttLoading = false);
  }
}
|
||||||
|
|
||||||
void _scrollToBottom({bool jump = false}) {
|
void _scrollToBottom({bool jump = false}) {
|
||||||
WidgetsBinding.instance.addPostFrameCallback((_) {
|
WidgetsBinding.instance.addPostFrameCallback((_) {
|
||||||
if (!_scrollController.hasClients) return;
|
if (!_scrollController.hasClients) return;
|
||||||
|
|
@ -703,9 +731,8 @@ class _ChatPageState extends ConsumerState<ChatPage> {
|
||||||
mainAxisSize: MainAxisSize.min,
|
mainAxisSize: MainAxisSize.min,
|
||||||
children: [
|
children: [
|
||||||
VoiceMicButton(
|
VoiceMicButton(
|
||||||
disabled: isAwaitingApproval,
|
disabled: isAwaitingApproval || _sttLoading,
|
||||||
onAudioReady: (path) =>
|
onAudioReady: _transcribeToInput,
|
||||||
ref.read(chatProvider.notifier).sendVoiceMessage(path),
|
|
||||||
),
|
),
|
||||||
Padding(
|
Padding(
|
||||||
padding: const EdgeInsets.only(right: 4),
|
padding: const EdgeInsets.only(right: 4),
|
||||||
|
|
|
||||||
|
|
@ -573,6 +573,11 @@ class ChatNotifier extends StateNotifier<ChatState> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Runs speech-to-text on the recording at [audioPath] by delegating to the
/// chat remote datasource, and completes with the transcript it returns.
///
/// Does not read or modify the notifier's `state`.
Future<String> transcribeAudio(String audioPath) async => _ref
    .read(chatRemoteDatasourceProvider)
    .transcribeAudio(audioPath: audioPath);
|
||||||
|
|
||||||
Future<void> cancelCurrentTask() async {
|
Future<void> cancelCurrentTask() async {
|
||||||
final taskId = state.taskId;
|
final taskId = state.taskId;
|
||||||
if (taskId == null && state.sessionId == null) return;
|
if (taskId == null && state.sessionId == null) return;
|
||||||
|
|
|
||||||
|
|
@ -403,6 +403,32 @@ export class AgentController {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Transcribe audio to text (STT only — does NOT trigger the agent).
 *
 * POST /api/v1/agent/transcribe
 * Content-Type: multipart/form-data
 * Fields: audio (file), language? (string, default 'zh')
 *
 * Response: { text: string }
 *
 * The upload is held in memory, so its size is capped by the interceptor's
 * `limits` option below.
 *
 * @throws BadRequestException when no audio file (or an empty one) is sent.
 */
@Post('transcribe')
@UseInterceptors(
  FileInterceptor('audio', {
    storage: memoryStorage(),
    // memoryStorage buffers the whole upload in RAM; without a cap a single
    // request can exhaust it.
    // NOTE(review): 25 MB assumes the STT backend is OpenAI's transcription
    // API (documented upload limit); confirm against the backend actually used.
    limits: { fileSize: 25 * 1024 * 1024, files: 1 },
  }),
)
async transcribeAudio(
  @UploadedFile() file: { buffer: Buffer; originalname: string; mimetype: string } | undefined,
  @Body('language') language?: string,
) {
  if (!file?.buffer?.length) {
    throw new BadRequestException('audio file is required');
  }

  // Multipart fields can arrive as empty strings; treat blank like absent so
  // the documented default still applies.
  const lang = language?.trim() || 'zh';

  const text = await this.sttService.transcribe(
    file.buffer,
    file.originalname || 'audio.m4a',
    lang,
  );
  return { text: text?.trim() ?? '' };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Voice message endpoint — WhatsApp-style push-to-talk.
|
* Voice message endpoint — WhatsApp-style push-to-talk.
|
||||||
*
|
*
|
||||||
|
|
@ -432,9 +458,10 @@ export class AgentController {
|
||||||
throw new BadRequestException('audio file is required');
|
throw new BadRequestException('audio file is required');
|
||||||
}
|
}
|
||||||
|
|
||||||
const session = await this.sessionRepository.findById(sessionId);
|
let session = await this.sessionRepository.findById(sessionId);
|
||||||
if (!session || session.tenantId !== tenantId) {
|
if (!session || session.tenantId !== tenantId) {
|
||||||
throw new NotFoundException(`Session ${sessionId} not found`);
|
// No existing session (e.g. first voice message, sessionId = 'new') — auto-create one
|
||||||
|
session = this.createNewSession(tenantId, this.engineRegistry.getActiveEngine().engineType);
|
||||||
}
|
}
|
||||||
|
|
||||||
// STT: transcribe audio → text
|
// STT: transcribe audio → text
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue