feat(stt): support auto language detection for mixed Chinese-English input

- Flutter: language='auto' omits the language field → backend receives none
- Backend: no language field → passes undefined to STT service
- STT service: language=undefined → omits language param from Whisper request
- Whisper auto-detects language per utterance when no hint is provided

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-06 08:13:26 -08:00
parent 23675fa5a5
commit 4c7c05eb37
3 changed files with 9 additions and 6 deletions

View File

@@ -154,17 +154,19 @@ class ChatRemoteDatasource {
   /// Backend performs STT, interrupts any running task if needed, and
   /// Transcribe audio to text (STT only, does NOT send to agent).
   /// Returns the transcript string.
+  /// Pass language='auto' to let Whisper auto-detect (best for mixed-language).
   Future<String> transcribeAudio({
     required String audioPath,
     String language = 'zh',
   }) async {
-    final formData = FormData.fromMap({
+    final fields = <String, dynamic>{
       'audio': await MultipartFile.fromFile(
         audioPath,
         filename: audioPath.split('/').last,
       ),
-      'language': language,
-    });
+    };
+    if (language != 'auto') fields['language'] = language;
+    final formData = FormData.fromMap(fields);
     final response = await _dio.post(
       '${ApiEndpoints.agent}/transcribe',
       data: formData,

View File

@@ -34,7 +34,7 @@ export class OpenAISttService {
   async transcribe(
     audioBuffer: Buffer,
     filename: string,
-    language = 'zh',
+    language?: string,
   ): Promise<string> {
     const url = `${this.baseUrl}/v1/audio/transcriptions`;
     this.logger.log(
@@ -54,7 +54,7 @@ export class OpenAISttService {
     };
     appendField('model', 'whisper-1');
-    appendField('language', language);
+    if (language) appendField('language', language); // omit → Whisper auto-detects
     appendField('response_format', 'json');
     // File field

View File

@@ -421,10 +421,11 @@ export class AgentController {
     if (!file?.buffer?.length) {
       throw new BadRequestException('audio file is required');
     }
+    // language=undefined → Whisper auto-detects (best for mixed-language input)
     const text = await this.sttService.transcribe(
       file.buffer,
       file.originalname || 'audio.m4a',
-      language ?? 'zh',
+      language || undefined,
     );
     return { text: text?.trim() ?? '' };
   }