feat: replace speech_to_text with GTCRN ML noise reduction + backend STT

Replace traditional on-device speech_to_text with a modern pipeline:
- Record audio via `record` package with hardware noise suppression
- Apply GTCRN neural denoising (sherpa-onnx, ICASSP 2024, 48K params)
- Trim silence, POST to backend /voice/transcribe (faster-whisper)

Changes:
- Add /transcribe endpoint to voice-service for audio file upload
- Add SpeechEnhancer wrapper for sherpa-onnx GTCRN model (523KB)
- Rewrite chat_page.dart voice input: record → denoise → transcribe
- Keep NoiseReducer.trimSilence for silence removal only
- Upgrade record to v6.2.0, add sherpa_onnx, path_provider

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-22 07:59:15 -08:00
parent 39c0d83424
commit a568558585
9 changed files with 498 additions and 172 deletions

View File

@ -15,16 +15,6 @@ import io.flutter.embedding.engine.FlutterEngine;
public final class GeneratedPluginRegistrant {
private static final String TAG = "GeneratedPluginRegistrant";
public static void registerWith(@NonNull FlutterEngine flutterEngine) {
try {
flutterEngine.getPlugins().add(new io.flutter.plugins.firebase.core.FlutterFirebaseCorePlugin());
} catch (Exception e) {
Log.e(TAG, "Error registering plugin firebase_core, io.flutter.plugins.firebase.core.FlutterFirebaseCorePlugin", e);
}
try {
flutterEngine.getPlugins().add(new io.flutter.plugins.firebase.messaging.FlutterFirebaseMessagingPlugin());
} catch (Exception e) {
Log.e(TAG, "Error registering plugin firebase_messaging, io.flutter.plugins.firebase.messaging.FlutterFirebaseMessagingPlugin", e);
}
try {
flutterEngine.getPlugins().add(new com.it_nomads.fluttersecurestorage.FlutterSecureStoragePlugin());
} catch (Exception e) {
@ -45,16 +35,16 @@ public final class GeneratedPluginRegistrant {
} catch (Exception e) {
Log.e(TAG, "Error registering plugin permission_handler_android, com.baseflow.permissionhandler.PermissionHandlerPlugin", e);
}
try {
flutterEngine.getPlugins().add(new com.llfbandit.record.RecordPlugin());
} catch (Exception e) {
Log.e(TAG, "Error registering plugin record_android, com.llfbandit.record.RecordPlugin", e);
}
try {
flutterEngine.getPlugins().add(new io.flutter.plugins.sharedpreferences.SharedPreferencesPlugin());
} catch (Exception e) {
Log.e(TAG, "Error registering plugin shared_preferences_android, io.flutter.plugins.sharedpreferences.SharedPreferencesPlugin", e);
}
try {
flutterEngine.getPlugins().add(new com.csdcorp.speech_to_text.SpeechToTextPlugin());
} catch (Exception e) {
Log.e(TAG, "Error registering plugin speech_to_text, com.csdcorp.speech_to_text.SpeechToTextPlugin", e);
}
try {
flutterEngine.getPlugins().add(new io.flutter.plugins.urllauncher.UrlLauncherPlugin());
} catch (Exception e) {

Binary file not shown.

View File

@ -0,0 +1,67 @@
import 'dart:math';
import 'dart:typed_data';
/// Utility for trimming leading/trailing silence from PCM audio.
///
/// The heavy-lifting noise reduction is handled by [SpeechEnhancer] (GTCRN).
/// This class provides a lightweight silence trimmer to reduce data sent
/// to the backend STT.
class NoiseReducer {
  /// Trim leading and trailing silence from 16-bit little-endian mono PCM.
  ///
  /// [pcm16Data] is interpreted as signed 16-bit LE samples at [sampleRate].
  /// Audio is scanned in 20 ms frames; frames whose RMS is below
  /// [silenceThresholdDb] (dBFS relative to Int16 full scale) count as
  /// silence. A [marginMs] buffer is kept around detected speech boundaries.
  ///
  /// Returns an empty list when no speech is detected, and the input
  /// unchanged when it is too short to analyse (fewer than 2 samples).
  static Uint8List trimSilence(
    Uint8List pcm16Data, {
    int sampleRate = 16000,
    double silenceThresholdDb = -40,
    int marginMs = 200,
  }) {
    if (pcm16Data.length < 4) return pcm16Data;
    final sampleCount = pcm16Data.length ~/ 2;
    final byteData = ByteData.sublistView(pcm16Data);
    // 20 ms analysis frames, clamped to [1, sampleCount] so that clips
    // shorter than one frame are analysed as a single frame instead of
    // being dropped (and so the scan loops always make progress).
    final frameSize = min(max(1, (sampleRate * 0.020).round()), sampleCount);
    final marginSamples = (sampleRate * marginMs / 1000).round();
    // dBFS threshold -> linear amplitude relative to Int16 full scale.
    final threshold = 32768 * pow(10, silenceThresholdDb / 20);

    // RMS of the frame starting at [start] (clamped to the buffer end).
    double frameRms(int start) {
      final end = min(start + frameSize, sampleCount);
      var sum = 0.0;
      for (var j = start; j < end; j++) {
        final s = byteData.getInt16(j * 2, Endian.little).toDouble();
        sum += s * s;
      }
      return sqrt(sum / (end - start));
    }

    // Scan forward for the first frame above the threshold.
    var startSample = 0;
    var foundStart = false;
    for (var i = 0; i + frameSize <= sampleCount; i += frameSize) {
      if (frameRms(i) > threshold) {
        startSample = max(0, i - marginSamples);
        foundStart = true;
        break;
      }
    }
    if (!foundStart) return Uint8List(0); // entire clip is silence
    // Scan backward for the last frame above the threshold.
    var endSample = sampleCount;
    for (var i = sampleCount - frameSize; i >= 0; i -= frameSize) {
      if (frameRms(i) > threshold) {
        endSample = min(sampleCount, i + frameSize + marginSamples);
        break;
      }
    }
    if (startSample >= endSample) return Uint8List(0);
    return Uint8List.sublistView(pcm16Data, startSample * 2, endSample * 2);
  }
}

View File

@ -0,0 +1,97 @@
import 'dart:io';
import 'dart:typed_data';
import 'package:flutter/services.dart' show rootBundle;
import 'package:path/path.dart' as p;
import 'package:path_provider/path_provider.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa;
/// Wraps sherpa-onnx's GTCRN model for speech enhancement (noise reduction).
///
/// Uses the GTCRN model (ICASSP 2024, ~48K params, 523KB) which provides
/// lightweight ML-based denoising without over-processing — ideal for
/// preprocessing audio before sending to backend STT (faster-whisper).
///
/// Usage:
///   final enhancer = SpeechEnhancer();
///   await enhancer.init();
///   final denoised = enhancer.enhance(pcm16Bytes);
///   enhancer.dispose();
class SpeechEnhancer {
  static const String _modelAsset = 'assets/gtcrn_simple.onnx';

  sherpa.OfflineSpeechDenoiser? _denoiser;
  // Memoized init future so repeated or concurrent init() calls share a
  // single model load and never construct (and leak) a second native
  // denoiser. Null when uninitialized or after dispose().
  Future<void>? _initFuture;
  String? _modelPath;

  /// Initialize the GTCRN denoiser. Must be called before [enhance].
  ///
  /// Copies the model from Flutter assets to the filesystem on first run.
  /// Safe to call multiple times — including concurrently — all callers
  /// await the same underlying initialization.
  Future<void> init() => _initFuture ??= _doInit();

  // Performs the actual one-time model copy and denoiser construction.
  Future<void> _doInit() async {
    _modelPath = await _copyAssetToFile(_modelAsset);
    final config = sherpa.OfflineSpeechDenoiserConfig(
      model: sherpa.OfflineSpeechDenoiserModelConfig(
        gtcrn: sherpa.OfflineSpeechDenoiserGtcrnModelConfig(
          model: _modelPath!,
        ),
        numThreads: 1,
        debug: false,
        provider: 'cpu',
      ),
    );
    _denoiser = sherpa.OfflineSpeechDenoiser(config);
  }

  /// Denoise PCM 16-bit signed little-endian mono audio at 16 kHz.
  ///
  /// Returns denoised PCM bytes in the same format.
  /// If the enhancer is not initialized, or the input is shorter than
  /// 640 bytes (20 ms), returns the input unchanged.
  Uint8List enhance(Uint8List pcm16Bytes) {
    if (_denoiser == null || pcm16Bytes.length < 640) return pcm16Bytes;
    // Convert Int16 PCM -> Float32 normalized to [-1.0, 1.0].
    final sampleCount = pcm16Bytes.length ~/ 2;
    final float32 = Float32List(sampleCount);
    final byteData = ByteData.sublistView(pcm16Bytes);
    for (int i = 0; i < sampleCount; i++) {
      float32[i] = byteData.getInt16(i * 2, Endian.little) / 32768.0;
    }
    // Run the GTCRN denoiser on the normalized samples.
    final result = _denoiser!.run(samples: float32, sampleRate: 16000);
    // Convert Float32 -> Int16 PCM bytes, clamping to the Int16 range.
    final output = Uint8List(result.samples.length * 2);
    final outView = ByteData.sublistView(output);
    for (int i = 0; i < result.samples.length; i++) {
      final val = (result.samples[i] * 32768.0).round().clamp(-32768, 32767);
      outView.setInt16(i * 2, val, Endian.little);
    }
    return output;
  }

  /// Release native resources. Must be called when done.
  /// The enhancer may be re-initialized with [init] afterwards.
  void dispose() {
    _denoiser?.free();
    _denoiser = null;
    _initFuture = null;
  }

  /// Copy a Flutter asset to the app support directory (cached across runs).
  static Future<String> _copyAssetToFile(String assetPath) async {
    final dir = await getApplicationSupportDirectory();
    final target = p.join(dir.path, p.basename(assetPath));
    final file = File(target);
    if (!await file.exists()) {
      final data = await rootBundle.load(assetPath);
      await file.writeAsBytes(
        data.buffer.asUint8List(data.offsetInBytes, data.lengthInBytes),
      );
    }
    return target;
  }
}

View File

@ -38,6 +38,9 @@ class ApiEndpoints {
static const String contacts = '$comm/contacts';
static const String messages = '$comm/messages';
// Voice
static const String transcribe = '$voice/transcribe';
// WebSocket
static const String wsTerminal = '/ws/terminal';
}

View File

@ -1,7 +1,11 @@
import 'dart:async';
import 'dart:typed_data';
import 'package:dio/dio.dart' show FormData, MultipartFile;
import 'package:flutter/material.dart';
import 'package:flutter_riverpod/flutter_riverpod.dart';
import 'package:speech_to_text/speech_to_text.dart' as stt;
import 'package:record/record.dart';
import '../../../../core/audio/noise_reducer.dart';
import '../../../../core/audio/speech_enhancer.dart';
import '../../../../core/config/api_endpoints.dart';
import '../../../../core/network/dio_client.dart';
import '../../../../core/network/websocket_client.dart';
@ -207,59 +211,101 @@ class _ChatPageState extends ConsumerState<ChatPage> with SingleTickerProviderSt
final _messageController = TextEditingController();
final _scrollController = ScrollController();
// -- Voice input (TODO 40) ------------------------------------------------
late final stt.SpeechToText _speech;
bool _speechAvailable = false;
// -- Voice input (record + GTCRN denoise + backend STT) -------------------
late final AudioRecorder _recorder;
final SpeechEnhancer _enhancer = SpeechEnhancer();
bool _isListening = false;
bool _isTranscribing = false;
List<List<int>> _audioChunks = [];
StreamSubscription<List<int>>? _audioSubscription;
late AnimationController _micPulseController;
@override
void initState() {
super.initState();
_speech = stt.SpeechToText();
_initSpeech();
_recorder = AudioRecorder();
_enhancer.init(); // load GTCRN model in background
_micPulseController = AnimationController(
vsync: this,
duration: const Duration(milliseconds: 800),
);
}
Future<void> _initSpeech() async {
_speechAvailable = await _speech.initialize(
onStatus: (status) {
if (status == 'done' || status == 'notListening') {
_stopListening(autoSubmit: true);
}
},
onError: (_) => _stopListening(),
);
if (mounted) setState(() {});
}
Future<void> _startListening() async {
final hasPermission = await _recorder.hasPermission();
if (!hasPermission || !mounted) return;
void _startListening() {
if (!_speechAvailable) return;
setState(() => _isListening = true);
_micPulseController.repeat(reverse: true);
_speech.listen(
onResult: (result) {
_messageController.text = result.recognizedWords;
if (result.finalResult) {
_stopListening(autoSubmit: true);
}
},
listenFor: const Duration(seconds: 30),
pauseFor: const Duration(seconds: 3),
);
_audioChunks = [];
// Stream raw PCM 16kHz mono with platform noise suppression + AGC
final stream = await _recorder.startStream(const RecordConfig(
encoder: AudioEncoder.pcm16bits,
sampleRate: 16000,
numChannels: 1,
noiseSuppress: true,
autoGain: true,
));
_audioSubscription = stream.listen((data) {
_audioChunks.add(data);
});
}
void _stopListening({bool autoSubmit = false}) {
_speech.stop();
Future<void> _stopListening({bool autoSubmit = false}) async {
if (!_isListening) return;
// Stop recording and stream
await _recorder.stop();
await _audioSubscription?.cancel();
_audioSubscription = null;
_micPulseController.stop();
_micPulseController.reset();
if (!mounted) return;
setState(() => _isListening = false);
if (autoSubmit && _messageController.text.trim().isNotEmpty) {
_send();
if (!autoSubmit || _audioChunks.isEmpty) return;
// Transcribe via backend
setState(() => _isTranscribing = true);
try {
// Combine recorded chunks into a single PCM buffer
final allBytes = _audioChunks.expand((c) => c).toList();
final pcmData = Uint8List.fromList(allBytes);
_audioChunks = [];
// GTCRN ML denoise (light) + trim leading/trailing silence
final denoised = _enhancer.enhance(pcmData);
final trimmed = NoiseReducer.trimSilence(denoised);
if (trimmed.isEmpty) {
if (mounted) setState(() => _isTranscribing = false);
return;
}
// POST to backend /voice/transcribe
final dio = ref.read(dioClientProvider);
final formData = FormData.fromMap({
'audio': MultipartFile.fromBytes(trimmed, filename: 'audio.pcm'),
});
final response = await dio.post(
ApiEndpoints.transcribe,
data: formData,
);
final text =
(response.data as Map<String, dynamic>)['text'] as String? ?? '';
if (text.isNotEmpty && mounted) {
_messageController.text = text;
_send();
}
} catch (_) {
// Voice failed silently user can still type
} finally {
if (mounted) setState(() => _isTranscribing = false);
}
}
@ -383,28 +429,42 @@ class _ChatPageState extends ConsumerState<ChatPage> with SingleTickerProviderSt
),
),
// Voice listening indicator
if (_isListening)
// Voice listening / transcribing indicator
if (_isListening || _isTranscribing)
Container(
padding: const EdgeInsets.symmetric(vertical: 8, horizontal: 16),
color: AppColors.error.withOpacity(0.1),
color: (_isListening ? AppColors.error : AppColors.primary)
.withOpacity(0.1),
child: Row(
children: [
AnimatedBuilder(
animation: _micPulseController,
builder: (context, _) => Icon(
Icons.mic,
color: AppColors.error,
size: 20 + (_micPulseController.value * 4),
if (_isListening)
AnimatedBuilder(
animation: _micPulseController,
builder: (context, _) => Icon(
Icons.mic,
color: AppColors.error,
size: 20 + (_micPulseController.value * 4),
),
)
else
const SizedBox(
width: 20,
height: 20,
child: CircularProgressIndicator(strokeWidth: 2),
),
const SizedBox(width: 8),
Text(
_isListening ? 'Listening...' : 'Transcribing...',
style: TextStyle(
color: _isListening ? AppColors.error : AppColors.primary,
),
),
const SizedBox(width: 8),
const Text('Listening...', style: TextStyle(color: AppColors.error)),
const Spacer(),
TextButton(
onPressed: () => _stopListening(),
child: const Text('Cancel'),
),
if (_isListening)
TextButton(
onPressed: () => _stopListening(),
child: const Text('Cancel'),
),
],
),
),
@ -459,7 +519,9 @@ class _ChatPageState extends ConsumerState<ChatPage> with SingleTickerProviderSt
_messageController.dispose();
_scrollController.dispose();
_micPulseController.dispose();
_speech.stop();
_audioSubscription?.cancel();
_recorder.dispose();
_enhancer.dispose();
super.dispose();
}
}

View File

@ -9,14 +9,6 @@ packages:
url: "https://pub.dev"
source: hosted
version: "85.0.0"
_flutterfire_internals:
dependency: transitive
description:
name: _flutterfire_internals
sha256: "37a42d06068e2fe3deddb2da079a8c4d105f241225ba27b7122b37e9865fd8f7"
url: "https://pub.dev"
source: hosted
version: "1.3.35"
analyzer:
dependency: transitive
description:
@ -177,6 +169,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "3.0.7"
cupertino_icons:
dependency: "direct main"
description:
name: cupertino_icons
sha256: ba631d1c7f7bef6b729a622b7b752645a2d076dba9976925b8f25725a30e1ee6
url: "https://pub.dev"
source: hosted
version: "1.0.8"
custom_lint_core:
dependency: transitive
description:
@ -249,54 +249,6 @@ packages:
url: "https://pub.dev"
source: hosted
version: "7.0.1"
firebase_core:
dependency: "direct main"
description:
name: firebase_core
sha256: "26de145bb9688a90962faec6f838247377b0b0d32cc0abecd9a4e43525fc856c"
url: "https://pub.dev"
source: hosted
version: "2.32.0"
firebase_core_platform_interface:
dependency: transitive
description:
name: firebase_core_platform_interface
sha256: "8bcfad6d7033f5ea951d15b867622a824b13812178bfec0c779b9d81de011bbb"
url: "https://pub.dev"
source: hosted
version: "5.4.2"
firebase_core_web:
dependency: transitive
description:
name: firebase_core_web
sha256: "362e52457ed2b7b180964769c1e04d1e0ea0259fdf7025fdfedd019d4ae2bd88"
url: "https://pub.dev"
source: hosted
version: "2.17.5"
firebase_messaging:
dependency: "direct main"
description:
name: firebase_messaging
sha256: a1662cc95d9750a324ad9df349b873360af6f11414902021f130c68ec02267c4
url: "https://pub.dev"
source: hosted
version: "14.9.4"
firebase_messaging_platform_interface:
dependency: transitive
description:
name: firebase_messaging_platform_interface
sha256: "87c4a922cb6f811cfb7a889bdbb3622702443c52a0271636cbc90d813ceac147"
url: "https://pub.dev"
source: hosted
version: "4.5.37"
firebase_messaging_web:
dependency: transitive
description:
name: firebase_messaging_web
sha256: "0d34dca01a7b103ed7f20138bffbb28eb0e61a677bf9e78a028a932e2c7322d5"
url: "https://pub.dev"
source: hosted
version: "3.8.7"
fixnum:
dependency: transitive
description:
@ -390,6 +342,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "3.1.2"
flutter_svg:
dependency: "direct main"
description:
name: flutter_svg
sha256: "87fbd7c534435b6c5d9d98b01e1fd527812b82e68ddd8bd35fc45ed0fa8f0a95"
url: "https://pub.dev"
source: hosted
version: "2.2.3"
flutter_test:
dependency: "direct dev"
description: flutter
@ -472,6 +432,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "1.1.0"
http:
dependency: transitive
description:
name: http
sha256: "87721a4a50b19c7f1d49001e51409bddc46303966ce89a65af4f4e6004896412"
url: "https://pub.dev"
source: hosted
version: "1.6.0"
http_multi_server:
dependency: transitive
description:
@ -633,15 +601,23 @@ packages:
source: hosted
version: "2.2.0"
path:
dependency: transitive
dependency: "direct main"
description:
name: path
sha256: "75cca69d1490965be98c73ceaea117e8a04dd21217b37b292c9ddbec0d955bc5"
url: "https://pub.dev"
source: hosted
version: "1.9.1"
path_provider:
path_parsing:
dependency: transitive
description:
name: path_parsing
sha256: "883402936929eac138ee0a45da5b0f2c80f89913e6dc3bf77eb65b84b409c6ca"
url: "https://pub.dev"
source: hosted
version: "1.1.0"
path_provider:
dependency: "direct main"
description:
name: path_provider
sha256: "50c5dd5b6e1aaf6fb3a78b33f6aa3afca52bf903a8a5298f53101fdaee55bbcd"
@ -688,14 +664,6 @@ packages:
url: "https://pub.dev"
source: hosted
version: "2.3.0"
pedantic:
dependency: transitive
description:
name: pedantic
sha256: "67fc27ed9639506c856c840ccce7594d0bdcd91bc8d53d6e52359449a1d50602"
url: "https://pub.dev"
source: hosted
version: "1.11.1"
permission_handler:
dependency: "direct main"
description:
@ -744,6 +712,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "0.2.1"
petitparser:
dependency: transitive
description:
name: petitparser
sha256: "91bd59303e9f769f108f8df05e371341b15d59e995e6806aefab827b58336675"
url: "https://pub.dev"
source: hosted
version: "7.0.2"
platform:
dependency: transitive
description:
@ -792,6 +768,70 @@ packages:
url: "https://pub.dev"
source: hosted
version: "3.2.2"
record:
dependency: "direct main"
description:
name: record
sha256: d5b6b334f3ab02460db6544e08583c942dbf23e3504bf1e14fd4cbe3d9409277
url: "https://pub.dev"
source: hosted
version: "6.2.0"
record_android:
dependency: transitive
description:
name: record_android
sha256: "94783f08403aed33ffb68797bf0715b0812eb852f3c7985644c945faea462ba1"
url: "https://pub.dev"
source: hosted
version: "1.5.1"
record_ios:
dependency: transitive
description:
name: record_ios
sha256: "8df7c136131bd05efc19256af29b2ba6ccc000ccc2c80d4b6b6d7a8d21a3b5a9"
url: "https://pub.dev"
source: hosted
version: "1.2.0"
record_linux:
dependency: transitive
description:
name: record_linux
sha256: c31a35cc158cd666fc6395f7f56fc054f31685571684be6b97670a27649ce5c7
url: "https://pub.dev"
source: hosted
version: "1.3.0"
record_macos:
dependency: transitive
description:
name: record_macos
sha256: "084902e63fc9c0c224c29203d6c75f0bdf9b6a40536c9d916393c8f4c4256488"
url: "https://pub.dev"
source: hosted
version: "1.2.1"
record_platform_interface:
dependency: transitive
description:
name: record_platform_interface
sha256: "8a81dbc4e14e1272a285bbfef6c9136d070a47d9b0d1f40aa6193516253ee2f6"
url: "https://pub.dev"
source: hosted
version: "1.5.0"
record_web:
dependency: transitive
description:
name: record_web
sha256: "7e9846981c1f2d111d86f0ae3309071f5bba8b624d1c977316706f08fc31d16d"
url: "https://pub.dev"
source: hosted
version: "1.3.0"
record_windows:
dependency: transitive
description:
name: record_windows
sha256: "223258060a1d25c62bae18282c16783f28581ec19401d17e56b5205b9f039d78"
url: "https://pub.dev"
source: hosted
version: "1.0.7"
riverpod:
dependency: transitive
description:
@ -896,6 +936,54 @@ packages:
url: "https://pub.dev"
source: hosted
version: "2.0.1"
sherpa_onnx:
dependency: "direct main"
description:
name: sherpa_onnx
sha256: "6f14669c62bb0812c3f99adcd4cae8677037106618cc51ce09d285a4c5db828a"
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sherpa_onnx_android:
dependency: transitive
description:
name: sherpa_onnx_android
sha256: f9881cd42347eac0619298186d86f286ce6b74947a27b8506f6729496ebccc5d
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sherpa_onnx_ios:
dependency: transitive
description:
name: sherpa_onnx_ios
sha256: a9c916340eda3bb24ce4598810fc141469f3d9afd0290390d1cc749044ae919d
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sherpa_onnx_linux:
dependency: transitive
description:
name: sherpa_onnx_linux
sha256: "82d4664ab6df87a76c12987cb420cbe112133d710f9b2e30c4e83d7ad1e93fb0"
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sherpa_onnx_macos:
dependency: transitive
description:
name: sherpa_onnx_macos
sha256: "453fa9a6fdff47d4e8aeef5f9c3ed64327e14769401a16b36213b7a3a3b8aae0"
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sherpa_onnx_windows:
dependency: transitive
description:
name: sherpa_onnx_windows
sha256: c70446773ddab00e8f78b415fe1a580723d49c1f78ad7ce751183620b35c1ffd
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sky_engine:
dependency: transitive
description: flutter
@ -925,30 +1013,6 @@ packages:
url: "https://pub.dev"
source: hosted
version: "1.10.2"
speech_to_text:
dependency: "direct main"
description:
name: speech_to_text
sha256: "57fef1d41bdebe298e84842c89bb4ac91f31cdbec7830c8cb1fc6b91d03abd42"
url: "https://pub.dev"
source: hosted
version: "6.6.0"
speech_to_text_macos:
dependency: transitive
description:
name: speech_to_text_macos
sha256: e685750f7542fcaa087a5396ee471e727ec648bf681f4da83c84d086322173f6
url: "https://pub.dev"
source: hosted
version: "1.1.0"
speech_to_text_platform_interface:
dependency: transitive
description:
name: speech_to_text_platform_interface
sha256: a1935847704e41ee468aad83181ddd2423d0833abe55d769c59afca07adb5114
url: "https://pub.dev"
source: hosted
version: "2.3.0"
stack_trace:
dependency: transitive
description:
@ -1093,6 +1157,30 @@ packages:
url: "https://pub.dev"
source: hosted
version: "4.5.2"
vector_graphics:
dependency: transitive
description:
name: vector_graphics
sha256: a4f059dc26fc8295b5921376600a194c4ec7d55e72f2fe4c7d2831e103d461e6
url: "https://pub.dev"
source: hosted
version: "1.1.19"
vector_graphics_codec:
dependency: transitive
description:
name: vector_graphics_codec
sha256: "99fd9fbd34d9f9a32efd7b6a6aae14125d8237b10403b422a6a6dfeac2806146"
url: "https://pub.dev"
source: hosted
version: "1.1.13"
vector_graphics_compiler:
dependency: transitive
description:
name: vector_graphics_compiler
sha256: "5a88dd14c0954a5398af544651c7fb51b457a2a556949bfb25369b210ef73a74"
url: "https://pub.dev"
source: hosted
version: "1.2.0"
vector_math:
dependency: transitive
description:
@ -1121,18 +1209,18 @@ packages:
dependency: transitive
description:
name: web
sha256: "97da13628db363c635202ad97068d47c5b8aa555808e7a9411963c533b449b27"
sha256: "868d88a33d8a87b18ffc05f9f030ba328ffefba92d6c127917a2ba740f9cfe4a"
url: "https://pub.dev"
source: hosted
version: "0.5.1"
version: "1.1.1"
web_socket_channel:
dependency: "direct main"
description:
name: web_socket_channel
sha256: "58c6666b342a38816b2e7e50ed0f1e261959630becd4c879c4f26bfa14aa5a42"
sha256: d88238e5eac9a42bb43ca4e721edba3c08c6354d4a53063afaa568516217621b
url: "https://pub.dev"
source: hosted
version: "2.4.5"
version: "2.4.0"
win32:
dependency: transitive
description:
@ -1149,6 +1237,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "1.1.0"
xml:
dependency: transitive
description:
name: xml
sha256: "971043b3a0d3da28727e40ed3e0b5d18b742fa5a68665cca88e74b7876d5e025"
url: "https://pub.dev"
source: hosted
version: "6.6.1"
xterm:
dependency: "direct main"
description:

View File

@ -9,6 +9,7 @@ environment:
dependencies:
flutter:
sdk: flutter
cupertino_icons: ^1.0.8
# State Management
flutter_riverpod: ^2.5.0
@ -36,13 +37,14 @@ dependencies:
flutter_markdown: ^0.7.0
flutter_svg: ^2.0.10+1
# Push Notifications
firebase_core: ^2.27.0
firebase_messaging: ^14.7.0
# Voice
speech_to_text: ^6.6.0
record: ^6.0.0
flutter_tts: ^4.0.0
sherpa_onnx: ^1.12.25
# File paths
path_provider: ^2.1.0
path: ^1.9.0
# Terminal
xterm: ^4.0.0
@ -70,11 +72,4 @@ flutter:
assets:
- assets/images/
- assets/icons/
- assets/animations/
fonts:
- family: Roboto
fonts:
- asset: assets/fonts/Roboto-Regular.ttf
- asset: assets/fonts/Roboto-Bold.ttf
weight: 700
- assets/gtcrn_simple.onnx

View File

@ -1,7 +1,8 @@
import asyncio
import uuid
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Request
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Request, UploadFile, File
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import Optional
@ -151,3 +152,18 @@ async def voice_websocket(websocket: WebSocket, session_id: str):
await websocket.close()
except Exception:
pass
@router.post("/transcribe")
async def transcribe_audio(req: Request, audio: UploadFile = File(...)):
    """Transcribe uploaded audio (PCM 16kHz 16-bit mono) to text using Whisper.

    Returns ``{"text": "..."}`` with the stripped transcript, an empty
    transcript for empty uploads, or a 503 JSON error while the STT model
    is not available.
    """
    stt = getattr(req.app.state, "stt", None)
    # Guard the private _model handle defensively: the service may still be
    # loading the model at startup, or it may have failed to load entirely.
    if stt is None or getattr(stt, "_model", None) is None:
        return JSONResponse(status_code=503, content={"error": "STT model not loaded"})
    audio_data = await audio.read()
    if not audio_data:
        return {"text": ""}
    text = await stt.transcribe(audio_data)
    # transcribe() may yield None/whitespace for unintelligible audio.
    return {"text": (text or "").strip()}