feat: replace speech_to_text with GTCRN ML noise reduction + backend STT

Replace traditional on-device speech_to_text with a modern pipeline:
- Record audio via `record` package with hardware noise suppression
- Apply GTCRN neural denoising (sherpa-onnx, ICASSP 2024, 48K params)
- Trim silence, POST to backend /voice/transcribe (faster-whisper)

Changes:
- Add /transcribe endpoint to voice-service for audio file upload
- Add SpeechEnhancer wrapper for sherpa-onnx GTCRN model (523KB)
- Rewrite chat_page.dart voice input: record → denoise → transcribe
- Keep NoiseReducer.trimSilence for silence removal only
- Upgrade record to v6.2.0, add sherpa_onnx, path_provider

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-22 07:59:15 -08:00
parent 39c0d83424
commit a568558585
9 changed files with 498 additions and 172 deletions

View File

@ -15,16 +15,6 @@ import io.flutter.embedding.engine.FlutterEngine;
public final class GeneratedPluginRegistrant {
private static final String TAG = "GeneratedPluginRegistrant";
public static void registerWith(@NonNull FlutterEngine flutterEngine) {
try {
flutterEngine.getPlugins().add(new io.flutter.plugins.firebase.core.FlutterFirebaseCorePlugin());
} catch (Exception e) {
Log.e(TAG, "Error registering plugin firebase_core, io.flutter.plugins.firebase.core.FlutterFirebaseCorePlugin", e);
}
try {
flutterEngine.getPlugins().add(new io.flutter.plugins.firebase.messaging.FlutterFirebaseMessagingPlugin());
} catch (Exception e) {
Log.e(TAG, "Error registering plugin firebase_messaging, io.flutter.plugins.firebase.messaging.FlutterFirebaseMessagingPlugin", e);
}
try {
flutterEngine.getPlugins().add(new com.it_nomads.fluttersecurestorage.FlutterSecureStoragePlugin());
} catch (Exception e) {
@ -45,16 +35,16 @@ public final class GeneratedPluginRegistrant {
} catch (Exception e) {
Log.e(TAG, "Error registering plugin permission_handler_android, com.baseflow.permissionhandler.PermissionHandlerPlugin", e);
}
try {
flutterEngine.getPlugins().add(new com.llfbandit.record.RecordPlugin());
} catch (Exception e) {
Log.e(TAG, "Error registering plugin record_android, com.llfbandit.record.RecordPlugin", e);
}
try {
flutterEngine.getPlugins().add(new io.flutter.plugins.sharedpreferences.SharedPreferencesPlugin());
} catch (Exception e) {
Log.e(TAG, "Error registering plugin shared_preferences_android, io.flutter.plugins.sharedpreferences.SharedPreferencesPlugin", e);
}
try {
flutterEngine.getPlugins().add(new com.csdcorp.speech_to_text.SpeechToTextPlugin());
} catch (Exception e) {
Log.e(TAG, "Error registering plugin speech_to_text, com.csdcorp.speech_to_text.SpeechToTextPlugin", e);
}
try {
flutterEngine.getPlugins().add(new io.flutter.plugins.urllauncher.UrlLauncherPlugin());
} catch (Exception e) {

Binary file not shown.

View File

@ -0,0 +1,67 @@
import 'dart:math';
import 'dart:typed_data';
/// Utility for trimming leading/trailing silence from PCM audio.
///
/// The heavy-lifting noise reduction is handled by [SpeechEnhancer] (GTCRN).
/// This class provides a lightweight silence trimmer to reduce data sent
/// to the backend STT.
class NoiseReducer {
  /// Trim leading and trailing silence from 16-bit little-endian mono PCM.
  ///
  /// [pcm16Data] is interpreted as signed 16-bit LE samples at [sampleRate].
  /// Audio is scanned in 20 ms frames; frames whose RMS is below
  /// [silenceThresholdDb] (dBFS relative to Int16 full scale) count as
  /// silence. A [marginMs] buffer is kept around detected speech boundaries.
  ///
  /// Returns an empty list when no speech is detected, and the input
  /// unchanged when it is too short to analyse (fewer than 2 samples).
  static Uint8List trimSilence(
    Uint8List pcm16Data, {
    int sampleRate = 16000,
    double silenceThresholdDb = -40,
    int marginMs = 200,
  }) {
    if (pcm16Data.length < 4) return pcm16Data;
    final sampleCount = pcm16Data.length ~/ 2;
    final byteData = ByteData.sublistView(pcm16Data);
    // 20 ms analysis frames, clamped to [1, sampleCount] so that clips
    // shorter than one frame are analysed as a single frame instead of
    // being dropped (and so the scan loops always make progress).
    final frameSize = min(max(1, (sampleRate * 0.020).round()), sampleCount);
    final marginSamples = (sampleRate * marginMs / 1000).round();
    // dBFS threshold -> linear amplitude relative to Int16 full scale.
    final threshold = 32768 * pow(10, silenceThresholdDb / 20);

    // RMS of the frame starting at [start] (clamped to the buffer end).
    double frameRms(int start) {
      final end = min(start + frameSize, sampleCount);
      var sum = 0.0;
      for (var j = start; j < end; j++) {
        final s = byteData.getInt16(j * 2, Endian.little).toDouble();
        sum += s * s;
      }
      return sqrt(sum / (end - start));
    }

    // Scan forward for the first frame above the threshold.
    var startSample = 0;
    var foundStart = false;
    for (var i = 0; i + frameSize <= sampleCount; i += frameSize) {
      if (frameRms(i) > threshold) {
        startSample = max(0, i - marginSamples);
        foundStart = true;
        break;
      }
    }
    if (!foundStart) return Uint8List(0); // entire clip is silence
    // Scan backward for the last frame above the threshold.
    var endSample = sampleCount;
    for (var i = sampleCount - frameSize; i >= 0; i -= frameSize) {
      if (frameRms(i) > threshold) {
        endSample = min(sampleCount, i + frameSize + marginSamples);
        break;
      }
    }
    if (startSample >= endSample) return Uint8List(0);
    return Uint8List.sublistView(pcm16Data, startSample * 2, endSample * 2);
  }
}

View File

@ -0,0 +1,97 @@
import 'dart:io';
import 'dart:typed_data';
import 'package:flutter/services.dart' show rootBundle;
import 'package:path/path.dart' as p;
import 'package:path_provider/path_provider.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa;
/// Wraps sherpa-onnx's GTCRN model for speech enhancement (noise reduction).
///
/// Uses the GTCRN model (ICASSP 2024, ~48K params, 523KB) which provides
/// lightweight ML-based denoising without over-processing — ideal for
/// preprocessing audio before sending to backend STT (faster-whisper).
///
/// Usage:
///   final enhancer = SpeechEnhancer();
///   await enhancer.init();
///   final denoised = enhancer.enhance(pcm16Bytes);
///   enhancer.dispose();
class SpeechEnhancer {
  static const String _modelAsset = 'assets/gtcrn_simple.onnx';

  sherpa.OfflineSpeechDenoiser? _denoiser;
  // Memoized init future so repeated or concurrent init() calls share a
  // single model load and never construct (and leak) a second native
  // denoiser. Null when uninitialized or after dispose().
  Future<void>? _initFuture;
  String? _modelPath;

  /// Initialize the GTCRN denoiser. Must be called before [enhance].
  ///
  /// Copies the model from Flutter assets to the filesystem on first run.
  /// Safe to call multiple times — including concurrently — all callers
  /// await the same underlying initialization.
  Future<void> init() => _initFuture ??= _doInit();

  // Performs the actual one-time model copy and denoiser construction.
  Future<void> _doInit() async {
    _modelPath = await _copyAssetToFile(_modelAsset);
    final config = sherpa.OfflineSpeechDenoiserConfig(
      model: sherpa.OfflineSpeechDenoiserModelConfig(
        gtcrn: sherpa.OfflineSpeechDenoiserGtcrnModelConfig(
          model: _modelPath!,
        ),
        numThreads: 1,
        debug: false,
        provider: 'cpu',
      ),
    );
    _denoiser = sherpa.OfflineSpeechDenoiser(config);
  }

  /// Denoise PCM 16-bit signed little-endian mono audio at 16 kHz.
  ///
  /// Returns denoised PCM bytes in the same format.
  /// If the enhancer is not initialized, or the input is shorter than
  /// 640 bytes (20 ms), returns the input unchanged.
  Uint8List enhance(Uint8List pcm16Bytes) {
    if (_denoiser == null || pcm16Bytes.length < 640) return pcm16Bytes;
    // Convert Int16 PCM -> Float32 normalized to [-1.0, 1.0].
    final sampleCount = pcm16Bytes.length ~/ 2;
    final float32 = Float32List(sampleCount);
    final byteData = ByteData.sublistView(pcm16Bytes);
    for (int i = 0; i < sampleCount; i++) {
      float32[i] = byteData.getInt16(i * 2, Endian.little) / 32768.0;
    }
    // Run the GTCRN denoiser on the normalized samples.
    final result = _denoiser!.run(samples: float32, sampleRate: 16000);
    // Convert Float32 -> Int16 PCM bytes, clamping to the Int16 range.
    final output = Uint8List(result.samples.length * 2);
    final outView = ByteData.sublistView(output);
    for (int i = 0; i < result.samples.length; i++) {
      final val = (result.samples[i] * 32768.0).round().clamp(-32768, 32767);
      outView.setInt16(i * 2, val, Endian.little);
    }
    return output;
  }

  /// Release native resources. Must be called when done.
  /// The enhancer may be re-initialized with [init] afterwards.
  void dispose() {
    _denoiser?.free();
    _denoiser = null;
    _initFuture = null;
  }

  /// Copy a Flutter asset to the app support directory (cached across runs).
  static Future<String> _copyAssetToFile(String assetPath) async {
    final dir = await getApplicationSupportDirectory();
    final target = p.join(dir.path, p.basename(assetPath));
    final file = File(target);
    if (!await file.exists()) {
      final data = await rootBundle.load(assetPath);
      await file.writeAsBytes(
        data.buffer.asUint8List(data.offsetInBytes, data.lengthInBytes),
      );
    }
    return target;
  }
}

View File

@ -38,6 +38,9 @@ class ApiEndpoints {
static const String contacts = '$comm/contacts';
static const String messages = '$comm/messages';
// Voice
static const String transcribe = '$voice/transcribe';
// WebSocket
static const String wsTerminal = '/ws/terminal';
}

View File

@ -1,7 +1,11 @@
import 'dart:async';
import 'dart:typed_data';
import 'package:dio/dio.dart' show FormData, MultipartFile;
import 'package:flutter/material.dart';
import 'package:flutter_riverpod/flutter_riverpod.dart';
import 'package:speech_to_text/speech_to_text.dart' as stt;
import 'package:record/record.dart';
import '../../../../core/audio/noise_reducer.dart';
import '../../../../core/audio/speech_enhancer.dart';
import '../../../../core/config/api_endpoints.dart';
import '../../../../core/network/dio_client.dart';
import '../../../../core/network/websocket_client.dart';
@ -207,59 +211,101 @@ class _ChatPageState extends ConsumerState<ChatPage> with SingleTickerProviderSt
final _messageController = TextEditingController();
final _scrollController = ScrollController();
// -- Voice input (TODO 40) ------------------------------------------------
late final stt.SpeechToText _speech;
bool _speechAvailable = false;
// -- Voice input (record + GTCRN denoise + backend STT) -------------------
late final AudioRecorder _recorder;
final SpeechEnhancer _enhancer = SpeechEnhancer();
bool _isListening = false;
bool _isTranscribing = false;
List<List<int>> _audioChunks = [];
StreamSubscription<List<int>>? _audioSubscription;
late AnimationController _micPulseController;
@override
void initState() {
super.initState();
_speech = stt.SpeechToText();
_initSpeech();
_recorder = AudioRecorder();
_enhancer.init(); // load GTCRN model in background
_micPulseController = AnimationController(
vsync: this,
duration: const Duration(milliseconds: 800),
);
}
Future<void> _initSpeech() async {
_speechAvailable = await _speech.initialize(
onStatus: (status) {
if (status == 'done' || status == 'notListening') {
_stopListening(autoSubmit: true);
}
},
onError: (_) => _stopListening(),
);
if (mounted) setState(() {});
}
Future<void> _startListening() async {
final hasPermission = await _recorder.hasPermission();
if (!hasPermission || !mounted) return;
void _startListening() {
if (!_speechAvailable) return;
setState(() => _isListening = true);
_micPulseController.repeat(reverse: true);
_speech.listen(
onResult: (result) {
_messageController.text = result.recognizedWords;
if (result.finalResult) {
_stopListening(autoSubmit: true);
}
},
listenFor: const Duration(seconds: 30),
pauseFor: const Duration(seconds: 3),
);
_audioChunks = [];
// Stream raw PCM 16kHz mono with platform noise suppression + AGC
final stream = await _recorder.startStream(const RecordConfig(
encoder: AudioEncoder.pcm16bits,
sampleRate: 16000,
numChannels: 1,
noiseSuppress: true,
autoGain: true,
));
_audioSubscription = stream.listen((data) {
_audioChunks.add(data);
});
}
void _stopListening({bool autoSubmit = false}) {
_speech.stop();
Future<void> _stopListening({bool autoSubmit = false}) async {
if (!_isListening) return;
// Stop recording and stream
await _recorder.stop();
await _audioSubscription?.cancel();
_audioSubscription = null;
_micPulseController.stop();
_micPulseController.reset();
if (!mounted) return;
setState(() => _isListening = false);
if (autoSubmit && _messageController.text.trim().isNotEmpty) {
_send();
if (!autoSubmit || _audioChunks.isEmpty) return;
// Transcribe via backend
setState(() => _isTranscribing = true);
try {
// Combine recorded chunks into a single PCM buffer
final allBytes = _audioChunks.expand((c) => c).toList();
final pcmData = Uint8List.fromList(allBytes);
_audioChunks = [];
// GTCRN ML denoise (light) + trim leading/trailing silence
final denoised = _enhancer.enhance(pcmData);
final trimmed = NoiseReducer.trimSilence(denoised);
if (trimmed.isEmpty) {
if (mounted) setState(() => _isTranscribing = false);
return;
}
// POST to backend /voice/transcribe
final dio = ref.read(dioClientProvider);
final formData = FormData.fromMap({
'audio': MultipartFile.fromBytes(trimmed, filename: 'audio.pcm'),
});
final response = await dio.post(
ApiEndpoints.transcribe,
data: formData,
);
final text =
(response.data as Map<String, dynamic>)['text'] as String? ?? '';
if (text.isNotEmpty && mounted) {
_messageController.text = text;
_send();
}
} catch (_) {
// Voice failed silently user can still type
} finally {
if (mounted) setState(() => _isTranscribing = false);
}
}
@ -383,28 +429,42 @@ class _ChatPageState extends ConsumerState<ChatPage> with SingleTickerProviderSt
),
),
// Voice listening indicator
if (_isListening)
// Voice listening / transcribing indicator
if (_isListening || _isTranscribing)
Container(
padding: const EdgeInsets.symmetric(vertical: 8, horizontal: 16),
color: AppColors.error.withOpacity(0.1),
color: (_isListening ? AppColors.error : AppColors.primary)
.withOpacity(0.1),
child: Row(
children: [
AnimatedBuilder(
animation: _micPulseController,
builder: (context, _) => Icon(
Icons.mic,
color: AppColors.error,
size: 20 + (_micPulseController.value * 4),
if (_isListening)
AnimatedBuilder(
animation: _micPulseController,
builder: (context, _) => Icon(
Icons.mic,
color: AppColors.error,
size: 20 + (_micPulseController.value * 4),
),
)
else
const SizedBox(
width: 20,
height: 20,
child: CircularProgressIndicator(strokeWidth: 2),
),
const SizedBox(width: 8),
Text(
_isListening ? 'Listening...' : 'Transcribing...',
style: TextStyle(
color: _isListening ? AppColors.error : AppColors.primary,
),
),
const SizedBox(width: 8),
const Text('Listening...', style: TextStyle(color: AppColors.error)),
const Spacer(),
TextButton(
onPressed: () => _stopListening(),
child: const Text('Cancel'),
),
if (_isListening)
TextButton(
onPressed: () => _stopListening(),
child: const Text('Cancel'),
),
],
),
),
@ -459,7 +519,9 @@ class _ChatPageState extends ConsumerState<ChatPage> with SingleTickerProviderSt
_messageController.dispose();
_scrollController.dispose();
_micPulseController.dispose();
_speech.stop();
_audioSubscription?.cancel();
_recorder.dispose();
_enhancer.dispose();
super.dispose();
}
}

View File

@ -9,14 +9,6 @@ packages:
url: "https://pub.dev"
source: hosted
version: "85.0.0"
_flutterfire_internals:
dependency: transitive
description:
name: _flutterfire_internals
sha256: "37a42d06068e2fe3deddb2da079a8c4d105f241225ba27b7122b37e9865fd8f7"
url: "https://pub.dev"
source: hosted
version: "1.3.35"
analyzer:
dependency: transitive
description:
@ -177,6 +169,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "3.0.7"
cupertino_icons:
dependency: "direct main"
description:
name: cupertino_icons
sha256: ba631d1c7f7bef6b729a622b7b752645a2d076dba9976925b8f25725a30e1ee6
url: "https://pub.dev"
source: hosted
version: "1.0.8"
custom_lint_core:
dependency: transitive
description:
@ -249,54 +249,6 @@ packages:
url: "https://pub.dev"
source: hosted
version: "7.0.1"
firebase_core:
dependency: "direct main"
description:
name: firebase_core
sha256: "26de145bb9688a90962faec6f838247377b0b0d32cc0abecd9a4e43525fc856c"
url: "https://pub.dev"
source: hosted
version: "2.32.0"
firebase_core_platform_interface:
dependency: transitive
description:
name: firebase_core_platform_interface
sha256: "8bcfad6d7033f5ea951d15b867622a824b13812178bfec0c779b9d81de011bbb"
url: "https://pub.dev"
source: hosted
version: "5.4.2"
firebase_core_web:
dependency: transitive
description:
name: firebase_core_web
sha256: "362e52457ed2b7b180964769c1e04d1e0ea0259fdf7025fdfedd019d4ae2bd88"
url: "https://pub.dev"
source: hosted
version: "2.17.5"
firebase_messaging:
dependency: "direct main"
description:
name: firebase_messaging
sha256: a1662cc95d9750a324ad9df349b873360af6f11414902021f130c68ec02267c4
url: "https://pub.dev"
source: hosted
version: "14.9.4"
firebase_messaging_platform_interface:
dependency: transitive
description:
name: firebase_messaging_platform_interface
sha256: "87c4a922cb6f811cfb7a889bdbb3622702443c52a0271636cbc90d813ceac147"
url: "https://pub.dev"
source: hosted
version: "4.5.37"
firebase_messaging_web:
dependency: transitive
description:
name: firebase_messaging_web
sha256: "0d34dca01a7b103ed7f20138bffbb28eb0e61a677bf9e78a028a932e2c7322d5"
url: "https://pub.dev"
source: hosted
version: "3.8.7"
fixnum:
dependency: transitive
description:
@ -390,6 +342,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "3.1.2"
flutter_svg:
dependency: "direct main"
description:
name: flutter_svg
sha256: "87fbd7c534435b6c5d9d98b01e1fd527812b82e68ddd8bd35fc45ed0fa8f0a95"
url: "https://pub.dev"
source: hosted
version: "2.2.3"
flutter_test:
dependency: "direct dev"
description: flutter
@ -472,6 +432,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "1.1.0"
http:
dependency: transitive
description:
name: http
sha256: "87721a4a50b19c7f1d49001e51409bddc46303966ce89a65af4f4e6004896412"
url: "https://pub.dev"
source: hosted
version: "1.6.0"
http_multi_server:
dependency: transitive
description:
@ -633,15 +601,23 @@ packages:
source: hosted
version: "2.2.0"
path:
dependency: transitive
dependency: "direct main"
description:
name: path
sha256: "75cca69d1490965be98c73ceaea117e8a04dd21217b37b292c9ddbec0d955bc5"
url: "https://pub.dev"
source: hosted
version: "1.9.1"
path_provider:
path_parsing:
dependency: transitive
description:
name: path_parsing
sha256: "883402936929eac138ee0a45da5b0f2c80f89913e6dc3bf77eb65b84b409c6ca"
url: "https://pub.dev"
source: hosted
version: "1.1.0"
path_provider:
dependency: "direct main"
description:
name: path_provider
sha256: "50c5dd5b6e1aaf6fb3a78b33f6aa3afca52bf903a8a5298f53101fdaee55bbcd"
@ -688,14 +664,6 @@ packages:
url: "https://pub.dev"
source: hosted
version: "2.3.0"
pedantic:
dependency: transitive
description:
name: pedantic
sha256: "67fc27ed9639506c856c840ccce7594d0bdcd91bc8d53d6e52359449a1d50602"
url: "https://pub.dev"
source: hosted
version: "1.11.1"
permission_handler:
dependency: "direct main"
description:
@ -744,6 +712,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "0.2.1"
petitparser:
dependency: transitive
description:
name: petitparser
sha256: "91bd59303e9f769f108f8df05e371341b15d59e995e6806aefab827b58336675"
url: "https://pub.dev"
source: hosted
version: "7.0.2"
platform:
dependency: transitive
description:
@ -792,6 +768,70 @@ packages:
url: "https://pub.dev"
source: hosted
version: "3.2.2"
record:
dependency: "direct main"
description:
name: record
sha256: d5b6b334f3ab02460db6544e08583c942dbf23e3504bf1e14fd4cbe3d9409277
url: "https://pub.dev"
source: hosted
version: "6.2.0"
record_android:
dependency: transitive
description:
name: record_android
sha256: "94783f08403aed33ffb68797bf0715b0812eb852f3c7985644c945faea462ba1"
url: "https://pub.dev"
source: hosted
version: "1.5.1"
record_ios:
dependency: transitive
description:
name: record_ios
sha256: "8df7c136131bd05efc19256af29b2ba6ccc000ccc2c80d4b6b6d7a8d21a3b5a9"
url: "https://pub.dev"
source: hosted
version: "1.2.0"
record_linux:
dependency: transitive
description:
name: record_linux
sha256: c31a35cc158cd666fc6395f7f56fc054f31685571684be6b97670a27649ce5c7
url: "https://pub.dev"
source: hosted
version: "1.3.0"
record_macos:
dependency: transitive
description:
name: record_macos
sha256: "084902e63fc9c0c224c29203d6c75f0bdf9b6a40536c9d916393c8f4c4256488"
url: "https://pub.dev"
source: hosted
version: "1.2.1"
record_platform_interface:
dependency: transitive
description:
name: record_platform_interface
sha256: "8a81dbc4e14e1272a285bbfef6c9136d070a47d9b0d1f40aa6193516253ee2f6"
url: "https://pub.dev"
source: hosted
version: "1.5.0"
record_web:
dependency: transitive
description:
name: record_web
sha256: "7e9846981c1f2d111d86f0ae3309071f5bba8b624d1c977316706f08fc31d16d"
url: "https://pub.dev"
source: hosted
version: "1.3.0"
record_windows:
dependency: transitive
description:
name: record_windows
sha256: "223258060a1d25c62bae18282c16783f28581ec19401d17e56b5205b9f039d78"
url: "https://pub.dev"
source: hosted
version: "1.0.7"
riverpod:
dependency: transitive
description:
@ -896,6 +936,54 @@ packages:
url: "https://pub.dev"
source: hosted
version: "2.0.1"
sherpa_onnx:
dependency: "direct main"
description:
name: sherpa_onnx
sha256: "6f14669c62bb0812c3f99adcd4cae8677037106618cc51ce09d285a4c5db828a"
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sherpa_onnx_android:
dependency: transitive
description:
name: sherpa_onnx_android
sha256: f9881cd42347eac0619298186d86f286ce6b74947a27b8506f6729496ebccc5d
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sherpa_onnx_ios:
dependency: transitive
description:
name: sherpa_onnx_ios
sha256: a9c916340eda3bb24ce4598810fc141469f3d9afd0290390d1cc749044ae919d
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sherpa_onnx_linux:
dependency: transitive
description:
name: sherpa_onnx_linux
sha256: "82d4664ab6df87a76c12987cb420cbe112133d710f9b2e30c4e83d7ad1e93fb0"
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sherpa_onnx_macos:
dependency: transitive
description:
name: sherpa_onnx_macos
sha256: "453fa9a6fdff47d4e8aeef5f9c3ed64327e14769401a16b36213b7a3a3b8aae0"
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sherpa_onnx_windows:
dependency: transitive
description:
name: sherpa_onnx_windows
sha256: c70446773ddab00e8f78b415fe1a580723d49c1f78ad7ce751183620b35c1ffd
url: "https://pub.dev"
source: hosted
version: "1.12.25"
sky_engine:
dependency: transitive
description: flutter
@ -925,30 +1013,6 @@ packages:
url: "https://pub.dev"
source: hosted
version: "1.10.2"
speech_to_text:
dependency: "direct main"
description:
name: speech_to_text
sha256: "57fef1d41bdebe298e84842c89bb4ac91f31cdbec7830c8cb1fc6b91d03abd42"
url: "https://pub.dev"
source: hosted
version: "6.6.0"
speech_to_text_macos:
dependency: transitive
description:
name: speech_to_text_macos
sha256: e685750f7542fcaa087a5396ee471e727ec648bf681f4da83c84d086322173f6
url: "https://pub.dev"
source: hosted
version: "1.1.0"
speech_to_text_platform_interface:
dependency: transitive
description:
name: speech_to_text_platform_interface
sha256: a1935847704e41ee468aad83181ddd2423d0833abe55d769c59afca07adb5114
url: "https://pub.dev"
source: hosted
version: "2.3.0"
stack_trace:
dependency: transitive
description:
@ -1093,6 +1157,30 @@ packages:
url: "https://pub.dev"
source: hosted
version: "4.5.2"
vector_graphics:
dependency: transitive
description:
name: vector_graphics
sha256: a4f059dc26fc8295b5921376600a194c4ec7d55e72f2fe4c7d2831e103d461e6
url: "https://pub.dev"
source: hosted
version: "1.1.19"
vector_graphics_codec:
dependency: transitive
description:
name: vector_graphics_codec
sha256: "99fd9fbd34d9f9a32efd7b6a6aae14125d8237b10403b422a6a6dfeac2806146"
url: "https://pub.dev"
source: hosted
version: "1.1.13"
vector_graphics_compiler:
dependency: transitive
description:
name: vector_graphics_compiler
sha256: "5a88dd14c0954a5398af544651c7fb51b457a2a556949bfb25369b210ef73a74"
url: "https://pub.dev"
source: hosted
version: "1.2.0"
vector_math:
dependency: transitive
description:
@ -1121,18 +1209,18 @@ packages:
dependency: transitive
description:
name: web
sha256: "97da13628db363c635202ad97068d47c5b8aa555808e7a9411963c533b449b27"
sha256: "868d88a33d8a87b18ffc05f9f030ba328ffefba92d6c127917a2ba740f9cfe4a"
url: "https://pub.dev"
source: hosted
version: "0.5.1"
version: "1.1.1"
web_socket_channel:
dependency: "direct main"
description:
name: web_socket_channel
sha256: "58c6666b342a38816b2e7e50ed0f1e261959630becd4c879c4f26bfa14aa5a42"
sha256: d88238e5eac9a42bb43ca4e721edba3c08c6354d4a53063afaa568516217621b
url: "https://pub.dev"
source: hosted
version: "2.4.5"
version: "2.4.0"
win32:
dependency: transitive
description:
@ -1149,6 +1237,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "1.1.0"
xml:
dependency: transitive
description:
name: xml
sha256: "971043b3a0d3da28727e40ed3e0b5d18b742fa5a68665cca88e74b7876d5e025"
url: "https://pub.dev"
source: hosted
version: "6.6.1"
xterm:
dependency: "direct main"
description:

View File

@ -9,6 +9,7 @@ environment:
dependencies:
flutter:
sdk: flutter
cupertino_icons: ^1.0.8
# State Management
flutter_riverpod: ^2.5.0
@ -36,13 +37,14 @@ dependencies:
flutter_markdown: ^0.7.0
flutter_svg: ^2.0.10+1
# Push Notifications
firebase_core: ^2.27.0
firebase_messaging: ^14.7.0
# Voice
speech_to_text: ^6.6.0
record: ^6.0.0
flutter_tts: ^4.0.0
sherpa_onnx: ^1.12.25
# File paths
path_provider: ^2.1.0
path: ^1.9.0
# Terminal
xterm: ^4.0.0
@ -70,11 +72,4 @@ flutter:
assets:
- assets/images/
- assets/icons/
- assets/animations/
fonts:
- family: Roboto
fonts:
- asset: assets/fonts/Roboto-Regular.ttf
- asset: assets/fonts/Roboto-Bold.ttf
weight: 700
- assets/gtcrn_simple.onnx

View File

@ -1,7 +1,8 @@
import asyncio
import uuid
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Request
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Request, UploadFile, File
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import Optional
@ -151,3 +152,18 @@ async def voice_websocket(websocket: WebSocket, session_id: str):
await websocket.close()
except Exception:
pass
@router.post("/transcribe")
async def transcribe_audio(req: Request, audio: UploadFile = File(...)):
    """Transcribe uploaded audio (PCM 16kHz 16-bit mono) to text using Whisper.

    Returns ``{"text": "..."}`` with the stripped transcript, an empty
    transcript for empty uploads, or a 503 JSON error while the STT model
    is not available.
    """
    stt = getattr(req.app.state, "stt", None)
    # Guard the private _model handle defensively: the service may still be
    # loading the model at startup, or it may have failed to load entirely.
    if stt is None or getattr(stt, "_model", None) is None:
        return JSONResponse(status_code=503, content={"error": "STT model not loaded"})
    audio_data = await audio.read()
    if not audio_data:
        return {"text": ""}
    text = await stt.transcribe(audio_data)
    # transcribe() may yield None/whitespace for unintelligible audio.
    return {"text": (text or "").strip()}