feat: v1.0.0

This commit is contained in:
liqupan
2026-02-09 21:54:32 +08:00
parent 8f19377517
commit 68d25581e8
49 changed files with 1522 additions and 528 deletions

View File

@@ -1,5 +1,5 @@
# Uncomment this line to define a global platform for your project
# platform :ios, '13.0'
# VAD plugin requires iOS 15.1+
platform :ios, '15.1'
# CocoaPods analytics sends network stats synchronously affecting flutter build latency.
ENV['COCOAPODS_DISABLE_STATS'] = 'true'

View File

@@ -1,17 +1,31 @@
PODS:
- app_links (6.4.1):
- Flutter
- audio_session (0.0.1):
- Flutter
- CwlCatchException (2.2.1):
- CwlCatchExceptionSupport (~> 2.2.1)
- CwlCatchExceptionSupport (2.2.1)
- Flutter (1.0.0)
- flutter_pcm_sound (0.0.1):
- Flutter
- flutter_tts (0.0.1):
- Flutter
- just_audio (0.0.1):
- Flutter
- FlutterMacOS
- onnxruntime-c (1.22.0)
- onnxruntime-objc (1.22.0):
- onnxruntime-objc/Core (= 1.22.0)
- onnxruntime-objc/Core (1.22.0):
- onnxruntime-c (= 1.22.0)
- path_provider_foundation (0.0.1):
- Flutter
- FlutterMacOS
- permission_handler_apple (9.3.0):
- Flutter
- record_ios (1.2.0):
- Flutter
- shared_preferences_foundation (0.0.1):
- Flutter
- FlutterMacOS
@@ -21,52 +35,79 @@ PODS:
- FlutterMacOS
- url_launcher_ios (0.0.1):
- Flutter
- vad (0.0.6):
- Flutter
- onnxruntime-objc (= 1.22.0)
DEPENDENCIES:
- app_links (from `.symlinks/plugins/app_links/ios`)
- audio_session (from `.symlinks/plugins/audio_session/ios`)
- Flutter (from `Flutter`)
- flutter_pcm_sound (from `.symlinks/plugins/flutter_pcm_sound/ios`)
- flutter_tts (from `.symlinks/plugins/flutter_tts/ios`)
- just_audio (from `.symlinks/plugins/just_audio/darwin`)
- path_provider_foundation (from `.symlinks/plugins/path_provider_foundation/darwin`)
- permission_handler_apple (from `.symlinks/plugins/permission_handler_apple/ios`)
- record_ios (from `.symlinks/plugins/record_ios/ios`)
- shared_preferences_foundation (from `.symlinks/plugins/shared_preferences_foundation/darwin`)
- speech_to_text (from `.symlinks/plugins/speech_to_text/darwin`)
- url_launcher_ios (from `.symlinks/plugins/url_launcher_ios/ios`)
- vad (from `.symlinks/plugins/vad/ios`)
SPEC REPOS:
trunk:
- CwlCatchException
- CwlCatchExceptionSupport
- onnxruntime-c
- onnxruntime-objc
EXTERNAL SOURCES:
app_links:
:path: ".symlinks/plugins/app_links/ios"
audio_session:
:path: ".symlinks/plugins/audio_session/ios"
Flutter:
:path: Flutter
flutter_pcm_sound:
:path: ".symlinks/plugins/flutter_pcm_sound/ios"
flutter_tts:
:path: ".symlinks/plugins/flutter_tts/ios"
just_audio:
:path: ".symlinks/plugins/just_audio/darwin"
path_provider_foundation:
:path: ".symlinks/plugins/path_provider_foundation/darwin"
permission_handler_apple:
:path: ".symlinks/plugins/permission_handler_apple/ios"
record_ios:
:path: ".symlinks/plugins/record_ios/ios"
shared_preferences_foundation:
:path: ".symlinks/plugins/shared_preferences_foundation/darwin"
speech_to_text:
:path: ".symlinks/plugins/speech_to_text/darwin"
url_launcher_ios:
:path: ".symlinks/plugins/url_launcher_ios/ios"
vad:
:path: ".symlinks/plugins/vad/ios"
SPEC CHECKSUMS:
app_links: 3dbc685f76b1693c66a6d9dd1e9ab6f73d97dc0a
audio_session: 9bb7f6c970f21241b19f5a3658097ae459681ba0
CwlCatchException: 7acc161b299a6de7f0a46a6ed741eae2c8b4d75a
CwlCatchExceptionSupport: 54ccab8d8c78907b57f99717fb19d4cc3bce02dc
Flutter: cabc95a1d2626b1b06e7179b784ebcf0c0cde467
flutter_pcm_sound: e9c2f6ce580eefcab2af46763f0354484d5c4ac8
flutter_tts: 35ac3c7d42412733e795ea96ad2d7e05d0a75113
just_audio: 4e391f57b79cad2b0674030a00453ca5ce817eed
onnxruntime-c: 7f778680e96145956c0a31945f260321eed2611a
onnxruntime-objc: 83d28b87525bd971259a66e153ea32b5d023de19
path_provider_foundation: bb55f6dbba17d0dccd6737fe6f7f34fbd0376880
permission_handler_apple: 4ed2196e43d0651e8ff7ca3483a069d469701f2d
record_ios: 412daca2350b228e698fffcd08f1f94ceb1e3844
shared_preferences_foundation: 7036424c3d8ec98dfe75ff1667cb0cd531ec82bb
speech_to_text: 3b313d98516d3d0406cea424782ec25470c59d19
url_launcher_ios: 7a95fa5b60cc718a708b8f2966718e93db0cef1b
vad: 7934867589afe53567f492df66fb1615f2185822
PODFILE CHECKSUM: 3c63482e143d1b91d2d2560aee9fb04ecc74ac7e
PODFILE CHECKSUM: 8af221031d17e57937852c3979a7d2c40538cf89
COCOAPODS: 1.16.2

View File

@@ -45,6 +45,26 @@
<string>We need access to your microphone for voice chat with AI characters.</string>
<key>NSSpeechRecognitionUsageDescription</key>
<string>We need speech recognition to convert your voice to text.</string>
<key>NSAppTransportSecurity</key>
<dict>
<key>NSExceptionDomains</key>
<dict>
<key>localhost</key>
<dict>
<key>NSExceptionAllowsInsecureHTTPLoads</key>
<true/>
<key>NSIncludesSubdomains</key>
<true/>
</dict>
<key>127.0.0.1</key>
<dict>
<key>NSExceptionAllowsInsecureHTTPLoads</key>
<true/>
<key>NSIncludesSubdomains</key>
<true/>
</dict>
</dict>
</dict>
<key>CADisableMinimumFrameDurationOnPhone</key>
<true/>
<key>UIApplicationSupportsIndirectInputEvents</key>

View File

@@ -0,0 +1,26 @@
/// MiniMax TTS configuration.
///
/// SECURITY NOTE: connecting to MiniMax directly from the client exposes the
/// API key. Inject it at build time via
/// `--dart-define=MINIMAX_API_KEY=<key>` and rotate the key regularly; the
/// baked-in default below exists only for local debugging and must be
/// revoked/removed before any release build.
class MinimaxConfig {
  // Not instantiable: this class is a pure namespace for constants.
  MinimaxConfig._();

  /// WebSocket endpoint for MiniMax streaming text-to-audio (t2a_v2).
  static const String wsUrl = 'wss://api.minimax.io/ws/v1/t2a_v2';

  /// API key, read from a compile-time define so the debug value can be
  /// overridden without a code change (`--dart-define=MINIMAX_API_KEY=...`).
  static const String apiKey = String.fromEnvironment(
    'MINIMAX_API_KEY',
    defaultValue:
        'eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJHcm91cE5hbWUiOiJ2YmlvZGJkcCIsIlVzZXJOYW1lIjoidHNldCIsIkFjY291bnQiOiIiLCJTdWJqZWN0SUQiOiIxOTkyOTAyNTAzMzg5MjA1NDY3IiwiUGhvbmUiOiIiLCJHcm91cElEIjoiMTk5MjkwMjUwMzM4MDgyMDk1NSIsIlBhZ2VOYW1lIjoiIiwiTWFpbCI6InZiaW9kYmRwQGdtYWlsLmNvbSIsIkNyZWF0ZVRpbWUiOiIyMDI1LTEyLTA2IDE1OjQzOjUxIiwiVG9rZW5UeXBlIjoxLCJpc3MiOiJtaW5pbWF4In0.hf1M4cPe27Sz_QeSyYODqM6yrN8aQ68nRwYB7iQ3uO5nu0NSN7qHQRVxAt2tVuoOf503SEx5F-PfYyC85OFJFhWNNhhDuFuxPIz97LVz1oQUlIejZ_BmCMj4iWwGXTUmEugGK1lzcsI6eJz8eRjQHsxOgJJmxPLXWHTPs1gDqtnckAgjOBRQJSadP58Xe9EdI6n-2_SL_ni3Tqm3LuWq9tUPJa5WgDMZX9IDK7XXyZy0i1GoSXmp8P1O1JmIecBVUoCzyYFwWW787BNdYiyEV3UrFjC_4onJ8Tzh-eGq84-rtxBR5FKO2MpNU_I0xI-W3YJxOEl_JPXXGgX5ASTKNw',
  );

  /// Default model.
  static const String model = 'speech-2.6-turbo';

  /// Default voice id (a MiniMax system voice).
  static const String defaultVoiceId = 'Chinese (Mandarin)_BashfulGirl';

  /// Default audio parameters (raw PCM, suited for streaming playback).
  static const int sampleRate = 32000;
  static const int channels = 1;
  static const String format = 'pcm';

  /// Whether MiniMax TTS can be used at all (an API key is present).
  static bool get isEnabled => apiKey.isNotEmpty;
}

View File

@@ -12,16 +12,31 @@ class STTService {
bool get isListening => _isListening;
// 回调
Function(String text)? _onResult;
Function(String text)? _onFinalResult;
Future<bool> init() async {
if (_isInitialized) return true;
try {
_isInitialized = await _speech.initialize(
onError: (error) => debugPrint('❌ STT Error: $error'),
onError: (error) {
debugPrint('⚠️ STT Error: ${error.errorMsg}');
// error_no_match 是常见的"没听到有效语音"错误
// 不应该中断整个流程
if (error.errorMsg == 'error_no_match') {
debugPrint(' (没有匹配到语音,忽略)');
}
},
onStatus: (status) {
debugPrint('🎤 STT Status: $status');
if (status == 'listening') _isListening = true;
if (status == 'notListening') _isListening = false;
if (status == 'listening') {
_isListening = true;
} else if (status == 'notListening' || status == 'done') {
_isListening = false;
}
},
);
debugPrint('✅ STT Initialized: $_isInitialized');
@@ -44,20 +59,28 @@ class STTService {
if (_isListening) await stop();
_onResult = onResult;
_onFinalResult = onFinalResult;
await _speech.listen(
onResult: (result) {
if (result.finalResult) {
onFinalResult(result.recognizedWords);
} else {
onResult(result.recognizedWords);
if (result.recognizedWords.isNotEmpty) {
if (result.finalResult) {
debugPrint('🎤 Final: "${result.recognizedWords}"');
_onFinalResult?.call(result.recognizedWords);
} else {
_onResult?.call(result.recognizedWords);
}
}
},
localeId: localeId,
listenFor: const Duration(seconds: 30),
pauseFor: const Duration(seconds: 3), // Wait 3s of silence to consider "done"
partialResults: true,
cancelOnError: true,
listenMode: ListenMode.dictation,
listenFor: const Duration(seconds: 60), // 最大监听时长
pauseFor: const Duration(milliseconds: 1500), // 1.5秒静音后视为说完
listenOptions: SpeechListenOptions(
partialResults: true,
cancelOnError: false, // 不要因错误取消
listenMode: ListenMode.dictation,
),
);
}

View File

@@ -0,0 +1,502 @@
import 'dart:async';
import 'dart:convert';
import 'package:flutter/foundation.dart';
import 'package:flutter_pcm_sound/flutter_pcm_sound.dart';
import 'package:web_socket_channel/web_socket_channel.dart';
import '../../config/minimax_config.dart';
import '../../models/character_model.dart';
import 'tts_engine.dart';
import 'ws_client.dart';
/// MiniMax TTS engine (streaming PCM variant with pre-connect optimization).
///
/// Optimizations:
/// - Pre-connect: establishes the WebSocket connection ahead of first use.
/// - Connection reuse: one connection serves multiple TTS requests.
/// - Auto-reconnect: reconnects automatically after a disconnect.
class MiniMaxTtsEngine implements TtsEngine {
  // WebSocket channel to the MiniMax t2a_v2 endpoint; null when disconnected.
  WebSocketChannel? _channel;
  StreamSubscription? _wsSub;
  Timer? _timeoutTimer;
  Timer? _keepAliveTimer;
  bool _isInitialized = false;
  bool _isDisposed = false;
  bool _isCancelled = false;
  bool _isPlaying = false;
  // Connection state.
  bool _isConnected = false;
  bool _isTaskReady = false; // true once the server acked task_start (task_started)
  Completer<void>? _connectCompleter;
  Completer<void>? _speakCompleter;
  // Text queued while the task handshake is still in flight.
  String? _pendingText;
  VoidCallback? _onStart;
  VoidCallback? _onComplete;
  Function(dynamic)? _onError;
  // Per-utterance statistics (chunk count / byte count / elapsed time).
  int _chunkCount = 0;
  int _totalBytes = 0;
  final Stopwatch _stopwatch = Stopwatch();
  // Set when the server sends is_final; actual playback completion is then
  // detected in _onFeedCallback once the PCM buffer drains.
  bool _isFinalReceived = false;

  /// Sets up the PCM player and registers the feed callback.
  @override
  Future<void> init() async {
    if (_isInitialized) return;
    // Initialize the PCM player.
    await FlutterPcmSound.setup(
      sampleRate: MinimaxConfig.sampleRate,
      channelCount: MinimaxConfig.channels,
    );
    FlutterPcmSound.setLogLevel(LogLevel.none);
    FlutterPcmSound.setFeedCallback(_onFeedCallback);
    _isInitialized = true;
    debugPrint('✅ MiniMaxTtsEngine initialized (PCM streaming mode)');
  }

  /// Pre-connects the WebSocket.
  /// Call when the voice page opens to establish the connection early.
  @override
  Future<void> preconnect() async {
    if (!_isInitialized) await init();
    if (_isConnected && _isTaskReady) {
      debugPrint('⚡ TTS 连接已就绪,无需重连');
      return;
    }
    debugPrint('🔌 TTS 预连接中...');
    await _ensureConnection();
    debugPrint('⚡ TTS 预连接完成,已就绪');
  }

  /// Ensures the WebSocket connection is established and in the ready state.
  Future<void> _ensureConnection() async {
    if (_isConnected && _isTaskReady) return;
    // Tear down any stale connection first.
    await _cleanupConnection();
    _connectCompleter = Completer<void>();
    try {
      final channel = connectTtsSocket(
        Uri.parse(MinimaxConfig.wsUrl),
        {
          'Authorization': 'Bearer ${MinimaxConfig.apiKey}',
        },
      );
      _channel = channel;
      // Listen for incoming messages.
      _wsSub = channel.stream.listen(
        _handleConnectionMessage,
        onError: (error) {
          debugPrint('❌ TTS WebSocket 错误: $error');
          _handleDisconnect();
        },
        onDone: () {
          debugPrint('⚠️ TTS WebSocket 连接关闭');
          _handleDisconnect();
        },
        cancelOnError: false,
      );
      // Wait until the connection is ready (connected_success + task_started),
      // guarded by a timeout.
      _timeoutTimer?.cancel();
      _timeoutTimer = Timer(const Duration(seconds: 10), () {
        if (_connectCompleter != null && !_connectCompleter!.isCompleted) {
          _connectCompleter!.completeError('连接超时');
          _handleDisconnect();
        }
      });
      await _connectCompleter!.future;
      _timeoutTimer?.cancel();
      // Start the keep-alive watchdog.
      _startKeepAlive();
    } catch (e) {
      debugPrint('❌ TTS 连接失败: $e');
      _handleDisconnect();
      rethrow;
    }
  }

  /// Dispatches a single server message: handshake events, task lifecycle
  /// events, and hex-encoded audio payloads.
  void _handleConnectionMessage(dynamic message) {
    if (_isDisposed) return;
    try {
      final Map<String, dynamic> data = jsonDecode(message as String);
      final String? event = data['event'] as String?;
      if (event == 'connected_success') {
        debugPrint('📥 TTS connected_success');
        _isConnected = true;
        // Send task_start to move the session into the ready state.
        _sendTaskStart();
        return;
      }
      if (event == 'task_started') {
        debugPrint('📥 TTS task_started (就绪)');
        _isTaskReady = true;
        // If this was a pre-connect, complete the connection future.
        if (_connectCompleter != null && !_connectCompleter!.isCompleted) {
          _connectCompleter!.complete();
        }
        // If text was queued while connecting, send it now.
        if (_pendingText != null) {
          _sendText(_pendingText!);
          _pendingText = null;
        }
        return;
      }
      if (event == 'task_continued') {
        // Normal streaming response; nothing to do here.
      }
      if (event == 'task_failed') {
        final errorInfo = data['base_resp'] ?? data;
        debugPrint('❌ TTS task_failed: ${jsonEncode(errorInfo)}');
        _handleTaskError('task_failed: ${jsonEncode(errorInfo)}');
        return;
      }
      // Handle an audio payload (hex string under data.audio).
      if (data['data'] != null && data['data'] is Map<String, dynamic>) {
        final audioData = data['data'] as Map<String, dynamic>;
        final audioHex = audioData['audio'];
        if (audioHex is String && audioHex.isNotEmpty) {
          final bytes = _hexToBytes(audioHex);
          _chunkCount++;
          _totalBytes += bytes.length;
          _feedAudioData(bytes);
          if (_chunkCount % 20 == 0) {
            debugPrint('📥 已接收 $_chunkCount 块 ($_totalBytes bytes)');
          }
        }
      }
      // Server indicates the task is complete.
      if (data['is_final'] == true) {
        _handleTaskComplete();
      }
    } catch (e) {
      debugPrint('❌ TTS 消息解析错误: $e');
    }
  }

  /// Sends the task_start handshake with the configured voice/audio settings.
  void _sendTaskStart() {
    if (_channel == null) return;
    final payload = {
      'event': 'task_start',
      'model': MinimaxConfig.model,
      'voice_setting': {
        'voice_id': MinimaxConfig.defaultVoiceId,
        'speed': 1,
        'vol': 1,
        'pitch': 0,
        'english_normalization': false,
      },
      'audio_setting': {
        'sample_rate': MinimaxConfig.sampleRate,
        'format': MinimaxConfig.format,
        'channel': MinimaxConfig.channels,
      },
    };
    _channel!.sink.add(jsonEncode(payload));
  }

  /// Sends [text] as a task_continue frame; no-op unless the task is ready.
  void _sendText(String text) {
    if (_channel == null || !_isTaskReady) return;
    debugPrint('📤 发送文本: "$text"');
    _channel!.sink.add(jsonEncode({
      'event': 'task_continue',
      'text': text,
    }));
    _isPlaying = true;
  }

  /// Finalizes the current utterance: logs stats, completes the speak future,
  /// and re-arms the connection for the next request.
  void _handleTaskComplete() {
    _stopwatch.stop();
    _isFinalReceived = true;
    _isTaskReady = false;
    debugPrint('');
    debugPrint('═══════════════════════════════════════');
    debugPrint('📊 TTS 完成: $_chunkCount 块, $_totalBytes bytes, ${_stopwatch.elapsedMilliseconds}ms');
    debugPrint('═══════════════════════════════════════');
    // Send task_finish to close out the server-side task.
    _channel?.sink.add(jsonEncode({'event': 'task_finish'}));
    // Complete the in-flight speak() call.
    if (_speakCompleter != null && !_speakCompleter!.isCompleted) {
      _speakCompleter!.complete();
    }
    // Prepare for the next utterance: re-send task_start after a short delay.
    Future.delayed(const Duration(milliseconds: 100), () {
      if (_isConnected && !_isDisposed && !_isCancelled) {
        _sendTaskStart();
      }
    });
  }

  /// Fails the in-flight speak() call and attempts to re-enter the ready state.
  void _handleTaskError(String error) {
    _isTaskReady = false;
    if (_speakCompleter != null && !_speakCompleter!.isCompleted) {
      _speakCompleter!.completeError(TtsEngineException(error));
    }
    if (_onError != null) _onError!(error);
    // Try to get back into the ready state after a short delay.
    Future.delayed(const Duration(milliseconds: 500), () {
      if (_isConnected && !_isDisposed) {
        _sendTaskStart();
      }
    });
  }

  /// Handles a dropped connection: fails pending futures and, if the engine
  /// is still live, silently reconnects for the next TTS request.
  void _handleDisconnect() {
    final wasConnected = _isConnected;
    _isConnected = false;
    _isTaskReady = false;
    _keepAliveTimer?.cancel();
    if (_connectCompleter != null && !_connectCompleter!.isCompleted) {
      _connectCompleter!.completeError('连接断开');
    }
    if (_speakCompleter != null && !_speakCompleter!.isCompleted) {
      _speakCompleter!.completeError(const TtsEngineException('连接断开'));
    }
    // If we were previously connected, reconnect silently in the background
    // (so the next TTS request is fast).
    if (wasConnected && !_isDisposed && !_isCancelled) {
      Future.delayed(const Duration(milliseconds: 500), () {
        if (!_isDisposed && !_isCancelled) {
          debugPrint('🔄 TTS 自动重连...');
          _ensureConnection().then((_) {
            debugPrint('⚡ TTS 重连成功');
          }).catchError((e) {
            debugPrint('⚠️ TTS 重连失败: $e (下次 speak 时会重试)');
          });
        }
      });
    }
  }

  /// Starts a periodic watchdog that reconnects if the socket has dropped.
  void _startKeepAlive() {
    _keepAliveTimer?.cancel();
    // Check the connection state every 30 seconds.
    _keepAliveTimer = Timer.periodic(const Duration(seconds: 30), (_) {
      if (!_isConnected && !_isDisposed) {
        debugPrint('🔄 TTS 重连中...');
        _ensureConnection().catchError((e) {
          debugPrint('❌ TTS 重连失败: $e');
        });
      }
    });
  }

  /// PCM feed callback: fires completion once the final chunk was received
  /// and the player's buffer has fully drained.
  void _onFeedCallback(int remainingFrames) {
    if (_isFinalReceived && remainingFrames == 0 && _isPlaying) {
      _isPlaying = false;
      debugPrint('🔊 PCM 播放完成');
      if (_onComplete != null) {
        _onComplete!();
      }
    }
  }

  /// Speaks [text] over the (possibly pre-connected) WebSocket session.
  ///
  /// NOTE(review): [voiceConfig] is accepted but not applied here — the voice
  /// settings sent in task_start come from [MinimaxConfig]; confirm whether
  /// per-character voice tuning is intended for this engine.
  @override
  Future<void> speak(
    String text, {
    AiVoiceConfig? voiceConfig,
  }) async {
    if (!_isInitialized) await init();
    if (_isDisposed) return;
    final trimmed = text.trim();
    if (trimmed.isEmpty) return;
    if (!MinimaxConfig.isEnabled) {
      throw const TtsEngineException('MiniMax API key is missing');
    }
    // Reset per-utterance state.
    _isCancelled = false;
    _isFinalReceived = false;
    _chunkCount = 0;
    _totalBytes = 0;
    _stopwatch.reset();
    _stopwatch.start();
    debugPrint('');
    debugPrint('═══════════════════════════════════════');
    debugPrint('🎤 TTS: "$trimmed"');
    debugPrint('═══════════════════════════════════════');
    _speakCompleter = Completer<void>();
    try {
      // Make sure the connection is ready.
      if (!_isConnected || !_isTaskReady) {
        debugPrint('⏳ 等待连接就绪...');
        await _ensureConnection();
      }
      // Send the text now, or queue it if the handshake is still in flight.
      if (_isTaskReady) {
        _sendText(trimmed);
        if (_onStart != null) _onStart!();
      } else {
        // Connection still being established; stash the text for later.
        _pendingText = trimmed;
        if (_onStart != null) _onStart!();
      }
      // Arm a timeout for the whole utterance.
      _timeoutTimer?.cancel();
      _timeoutTimer = Timer(const Duration(seconds: 30), () {
        if (_speakCompleter != null && !_speakCompleter!.isCompleted) {
          _handleTaskError('TTS 超时');
        }
      });
      // Wait for the utterance to finish.
      await _speakCompleter!.future;
      _timeoutTimer?.cancel();
    } catch (e) {
      debugPrint('❌ TTS 异常: $e');
      if (_onError != null) _onError!(e);
      rethrow;
    }
  }

  /// Converts little-endian 16-bit PCM bytes into signed samples and feeds
  /// them to the PCM player.
  void _feedAudioData(Uint8List pcmBytes) {
    if (_isCancelled || _isDisposed) return;
    final int16List = <int>[];
    for (var i = 0; i < pcmBytes.length - 1; i += 2) {
      // Reassemble the little-endian 16-bit sample, then sign-extend it.
      final int16Value = (pcmBytes[i + 1] << 8) | pcmBytes[i];
      final signed = int16Value >= 32768 ? int16Value - 65536 : int16Value;
      int16List.add(signed);
    }
    FlutterPcmSound.feed(PcmArrayInt16.fromList(int16List));
  }

  /// Decodes a hex string into raw bytes; throws [FormatException] on an
  /// odd-length input.
  Uint8List _hexToBytes(String hex) {
    final cleaned = hex.trim();
    final len = cleaned.length;
    if (len == 0) return Uint8List(0);
    if (len % 2 != 0) {
      throw const FormatException('Invalid hex string length');
    }
    final bytes = Uint8List(len ~/ 2);
    for (var i = 0; i < len; i += 2) {
      bytes[i ~/ 2] = int.parse(cleaned.substring(i, i + 2), radix: 16);
    }
    return bytes;
  }

  /// Cancels the stream subscription, closes the socket, and resets state.
  Future<void> _cleanupConnection() async {
    _wsSub?.cancel();
    _wsSub = null;
    _channel?.sink.close();
    _channel = null;
    _isConnected = false;
    _isTaskReady = false;
  }

  /// Stops playback and resets the PCM player, but keeps the WebSocket open
  /// for reuse by the next utterance.
  @override
  Future<void> stop() async {
    if (_isDisposed) return;
    _isCancelled = true;
    _isPlaying = false;
    _isFinalReceived = false;
    _pendingText = null;
    _timeoutTimer?.cancel();
    // Stop the PCM player.
    await FlutterPcmSound.release();
    // Re-create the player so the next speak() can feed immediately.
    if (_isInitialized) {
      await FlutterPcmSound.setup(
        sampleRate: MinimaxConfig.sampleRate,
        channelCount: MinimaxConfig.channels,
      );
      FlutterPcmSound.setFeedCallback(_onFeedCallback);
    }
    // Note: the WebSocket is deliberately kept open for reuse.
    // If a task is in progress, finish it cleanly.
    if (_isTaskReady && _channel != null) {
      _channel!.sink.add(jsonEncode({'event': 'task_finish'}));
      _isTaskReady = false;
      // Re-arm for the next utterance.
      Future.delayed(const Duration(milliseconds: 100), () {
        if (_isConnected && !_isDisposed) {
          _sendTaskStart();
        }
      });
    }
    debugPrint('🛑 TTS 已停止');
  }

  @override
  void setCompletionHandler(VoidCallback handler) {
    _onComplete = handler;
  }

  @override
  void setErrorHandler(Function(dynamic) handler) {
    _onError = handler;
  }

  @override
  void setStartHandler(VoidCallback handler) {
    _onStart = handler;
  }

  /// Releases timers, the socket, and the PCM player. The engine cannot be
  /// used after this.
  @override
  void dispose() {
    _isDisposed = true;
    _keepAliveTimer?.cancel();
    _timeoutTimer?.cancel();
    _cleanupConnection();
    FlutterPcmSound.release();
    debugPrint('🗑️ MiniMaxTtsEngine disposed');
  }
}

View File

@@ -0,0 +1,88 @@
import 'package:flutter/foundation.dart';
import 'package:flutter_tts/flutter_tts.dart';
import '../../models/character_model.dart';
import 'tts_engine.dart';
/// [TtsEngine] backed by the platform's built-in speech synthesizer
/// (via the flutter_tts plugin).
class SystemTtsEngine implements TtsEngine {
  final FlutterTts _flutterTts = FlutterTts();
  bool _isInitialized = false;

  /// Configures the platform synthesizer. On iOS the audio session is shared
  /// and set to play-and-record so TTS can coexist with the microphone.
  @override
  Future<void> init() async {
    if (_isInitialized) return;
    try {
      final needsIosAudioSetup =
          !kIsWeb && defaultTargetPlatform == TargetPlatform.iOS;
      if (needsIosAudioSetup) {
        await _flutterTts.setSharedInstance(true);
        await _flutterTts.setIosAudioCategory(
          IosTextToSpeechAudioCategory.playAndRecord,
          [
            IosTextToSpeechAudioCategoryOptions.allowBluetooth,
            IosTextToSpeechAudioCategoryOptions.allowBluetoothA2DP,
            IosTextToSpeechAudioCategoryOptions.mixWithOthers,
            IosTextToSpeechAudioCategoryOptions.defaultToSpeaker,
          ],
          IosTextToSpeechAudioMode.defaultMode,
        );
      }
      await _flutterTts.setLanguage("zh-CN");
      await _flutterTts.setPitch(1.0);
      await _flutterTts.setSpeechRate(0.5);
      _isInitialized = true;
      debugPrint('✅ SystemTtsEngine initialized');
    } catch (e) {
      debugPrint('❌ SystemTtsEngine init error: $e');
    }
  }

  /// The system synthesizer needs no network warm-up.
  @override
  Future<void> preconnect() async {}

  /// Speaks [text], optionally applying rate/pitch overrides from
  /// [voiceConfig] (clamped to safe ranges).
  @override
  Future<void> speak(
    String text, {
    AiVoiceConfig? voiceConfig,
  }) async {
    if (!_isInitialized) await init();
    if (text.trim().isEmpty) return;
    final config = voiceConfig;
    if (config != null) {
      await _flutterTts.setSpeechRate(config.speed.clamp(0.2, 2.0));
      await _flutterTts.setPitch(config.pitch.clamp(0.5, 2.0));
    }
    debugPrint('🗣️ SystemTtsEngine Speaking: $text');
    await _flutterTts.speak(text);
  }

  /// Interrupts any in-progress utterance.
  @override
  Future<void> stop() async => _flutterTts.stop();

  // Handlers are forwarded straight to the underlying plugin.
  @override
  void setCompletionHandler(VoidCallback handler) =>
      _flutterTts.setCompletionHandler(handler);

  @override
  void setStartHandler(VoidCallback handler) =>
      _flutterTts.setStartHandler(handler);

  @override
  void setErrorHandler(Function(dynamic) handler) =>
      _flutterTts.setErrorHandler(handler);

  @override
  void dispose() {
    _flutterTts.stop();
  }
}

View File

@@ -0,0 +1,37 @@
import 'package:flutter/foundation.dart';
import '../../models/character_model.dart';
/// Contract implemented by every text-to-speech backend.
abstract class TtsEngine {
  /// Performs one-time setup; call before [speak].
  Future<void> init();

  /// Pre-connect (optional override, used to establish connections early).
  Future<void> preconnect() async {}

  /// Speaks [text]; [voiceConfig] optionally tunes the voice.
  Future<void> speak(
    String text, {
    AiVoiceConfig? voiceConfig,
  });

  /// Stops any in-progress speech.
  Future<void> stop();

  /// Called when speech output begins.
  void setStartHandler(VoidCallback handler);

  /// Called when speech output finishes.
  void setCompletionHandler(VoidCallback handler);

  /// Called when the engine reports an error.
  void setErrorHandler(Function(dynamic) handler);

  /// Releases engine resources; the engine is unusable afterwards.
  void dispose();
}
/// Error raised by [TtsEngine] implementations.
class TtsEngineException implements Exception {
  const TtsEngineException(this.message, {this.isCancelled = false});

  /// Marker exception for a deliberately interrupted utterance.
  factory TtsEngineException.cancelled() =>
      const TtsEngineException('cancelled', isCancelled: true);

  /// Human-readable failure description.
  final String message;

  /// Whether this represents a cancellation rather than a real failure.
  final bool isCancelled;

  @override
  String toString() => 'TtsEngineException($message)';
}

View File

@@ -0,0 +1,6 @@
import 'package:web_socket_channel/web_socket_channel.dart';
import 'ws_client_io.dart' if (dart.library.html) 'ws_client_web.dart';
/// Opens a WebSocket for TTS traffic, delegating to the platform-specific
/// implementation selected by the conditional import (dart:io vs browser).
WebSocketChannel connectTtsSocket(Uri uri, Map<String, String> headers) =>
    createWebSocketChannel(uri, headers);

View File

@@ -0,0 +1,9 @@
import 'package:web_socket_channel/io.dart';
import 'package:web_socket_channel/web_socket_channel.dart';
/// dart:io implementation: connects with custom HTTP [headers]
/// (e.g. Authorization) supported by [IOWebSocketChannel].
WebSocketChannel createWebSocketChannel(Uri uri, Map<String, String> headers) =>
    IOWebSocketChannel.connect(uri, headers: headers);

View File

@@ -0,0 +1,6 @@
import 'package:web_socket_channel/web_socket_channel.dart';
/// Browser implementation. The browser WebSocket API cannot send custom
/// headers, so [headers] is deliberately ignored here.
WebSocketChannel createWebSocketChannel(Uri uri, Map<String, String> headers) =>
    WebSocketChannel.connect(uri);

View File

@@ -1,65 +1,115 @@
import 'package:flutter_tts/flutter_tts.dart';
import 'package:flutter/foundation.dart';
import '../config/minimax_config.dart';
import '../models/character_model.dart';
import 'tts/minimax_tts_engine.dart';
import 'tts/system_tts_engine.dart';
import 'tts/tts_engine.dart';
/// Singleton facade over the available TTS engines.
///
/// Prefers [MiniMaxTtsEngine] when an API key is configured (and not on web),
/// falling back to [SystemTtsEngine] otherwise or when MiniMax fails at
/// runtime.
///
/// NOTE(review): the block as rendered contained residual deleted-side diff
/// lines (a leftover `_flutterTts` field, a duplicate `speak` signature);
/// this is the reconstructed post-commit version.
class TTSService {
  static final TTSService _instance = TTSService._internal();
  factory TTSService() => _instance;
  TTSService._internal();

  final TtsEngine _minimaxEngine = MiniMaxTtsEngine();
  final TtsEngine _systemEngine = SystemTtsEngine();
  // The engine currently used for speak(); may switch to the system engine
  // after a MiniMax failure.
  TtsEngine? _activeEngine;
  bool _isInitialized = false;

  // Handlers are cached so they can be re-applied when the active engine
  // changes (e.g. on fallback).
  VoidCallback? _onStart;
  VoidCallback? _onComplete;
  Function(dynamic)? _onError;

  /// Selects and initializes the preferred engine. Errors are logged, not
  /// thrown, so a broken TTS stack never crashes the caller.
  Future<void> init() async {
    if (_isInitialized) return;
    try {
      _activeEngine = _selectEngine();
      _applyHandlers(_activeEngine);
      await _activeEngine!.init();
      _isInitialized = true;
      debugPrint('✅ TTSService initialized (${_activeEngine.runtimeType})');
    } catch (e) {
      debugPrint('❌ TTSService init error: $e');
    }
  }

  /// Speaks [text] with the active engine; on a MiniMax failure, falls back
  /// to the system engine for this and subsequent utterances.
  Future<void> speak(String text, {AiVoiceConfig? voiceConfig}) async {
    if (!_isInitialized) await init();
    if (text.isEmpty) return;
    // init() may have failed without setting an engine; retry lazily.
    if (_activeEngine == null) {
      _activeEngine = _selectEngine();
      _applyHandlers(_activeEngine);
      await _activeEngine!.init();
    }
    debugPrint('🗣️ TTS Speaking: $text');
    if (_activeEngine is MiniMaxTtsEngine) {
      try {
        await _activeEngine!.speak(text, voiceConfig: voiceConfig);
        return;
      } catch (e) {
        debugPrint('⚠️ MiniMax TTS failed, falling back to system TTS: $e');
        await _fallbackSpeak(text, voiceConfig: voiceConfig);
        return;
      }
    }
    await _activeEngine!.speak(text, voiceConfig: voiceConfig);
  }

  /// Pre-connects the TTS backend (call when the voice page opens) to cut
  /// first-utterance latency.
  Future<void> preconnect() async {
    if (!_isInitialized) await init();
    await _activeEngine?.preconnect();
  }

  /// Stops any in-progress speech on the active engine.
  Future<void> stop() async {
    await _activeEngine?.stop();
  }

  /// Registers a completion handler on both engines so fallback keeps
  /// callbacks intact.
  void setCompletionHandler(VoidCallback handler) {
    _onComplete = handler;
    _minimaxEngine.setCompletionHandler(handler);
    _systemEngine.setCompletionHandler(handler);
  }

  /// Registers a start handler on both engines.
  void setStartHandler(VoidCallback handler) {
    _onStart = handler;
    _minimaxEngine.setStartHandler(handler);
    _systemEngine.setStartHandler(handler);
  }

  /// Registers an error handler on both engines.
  void setErrorHandler(Function(dynamic) handler) {
    _onError = handler;
    _minimaxEngine.setErrorHandler(handler);
    _systemEngine.setErrorHandler(handler);
  }

  /// Chooses MiniMax when a key is configured and we're not on web;
  /// otherwise the system synthesizer.
  TtsEngine _selectEngine() {
    if (MinimaxConfig.isEnabled && !kIsWeb) {
      return _minimaxEngine;
    }
    return _systemEngine;
  }

  /// Switches to the system engine and retries the utterance; failures are
  /// reported through the error handler rather than thrown.
  Future<void> _fallbackSpeak(String text, {AiVoiceConfig? voiceConfig}) async {
    try {
      _activeEngine = _systemEngine;
      _applyHandlers(_activeEngine);
      await _activeEngine!.init();
      await _activeEngine!.speak(text, voiceConfig: voiceConfig);
    } catch (e) {
      debugPrint('❌ System TTS failed: $e');
      if (_onError != null) _onError!(e);
    }
  }

  /// Re-applies any cached handlers to [engine] (used after engine switches).
  void _applyHandlers(TtsEngine? engine) {
    if (engine == null) return;
    if (_onStart != null) engine.setStartHandler(_onStart!);
    if (_onComplete != null) engine.setCompletionHandler(_onComplete!);
    if (_onError != null) engine.setErrorHandler(_onError!);
  }
}

View File

@@ -0,0 +1,258 @@
import 'dart:convert';
import 'package:flutter/foundation.dart';
import 'package:http/http.dart' as http;
import 'package:vad/vad.dart';
/// VAD (Voice Activity Detection) + Google Speech-to-Text 服务
///
/// 使用 Silero VAD 模型检测语音开始/结束
/// 然后将录制的音频发送给 Google Cloud Speech-to-Text API
class VadSttService {
static final VadSttService _instance = VadSttService._internal();
factory VadSttService() => _instance;
VadSttService._internal();
VadHandler? _vadHandler;
bool _isInitialized = false;
bool _isListening = false;
// Google Cloud STT 配置
// TODO: 替换为你的 API Key
static const String _googleApiKey = 'AIzaSyD7Dg_Goc5Z9c5LzjTCnhCxLuwCVQz89bk';
static const String _googleSttUrl = 'https://speech.googleapis.com/v1/speech:recognize';
// 回调
Function(String text)? _onResult;
Function(String text)? _onFinalResult;
VoidCallback? _onSpeechStart;
VoidCallback? _onSpeechEnd;
// 统计
int _speechSegmentCount = 0;
final Stopwatch _speechStopwatch = Stopwatch();
bool get isListening => _isListening;
Future<bool> init() async {
if (_isInitialized) return true;
try {
_vadHandler = VadHandler.create(isDebug: true);
// 设置事件监听
_setupEventHandlers();
_isInitialized = true;
debugPrint('✅ VadSttService initialized (Silero VAD + Google STT)');
return true;
} catch (e) {
debugPrint('❌ VadSttService init failed: $e');
return false;
}
}
void _setupEventHandlers() {
if (_vadHandler == null) return;
// 检测到开始说话(可能是误触发)
_vadHandler!.onSpeechStart.listen((_) {
debugPrint('🎤 [VAD] 检测到声音...');
_speechStopwatch.reset();
_speechStopwatch.start();
});
// 确认是真正说话(不是噪音误触发)
_vadHandler!.onRealSpeechStart.listen((_) {
_speechSegmentCount++;
debugPrint('');
debugPrint('═══════════════════════════════════════');
debugPrint('🎤 [VAD] 语音段 #$_speechSegmentCount 开始');
debugPrint('═══════════════════════════════════════');
if (_onSpeechStart != null) _onSpeechStart!();
});
// 说话结束,获取音频数据
_vadHandler!.onSpeechEnd.listen((List<double> samples) async {
_speechStopwatch.stop();
final durationMs = _speechStopwatch.elapsedMilliseconds;
final sampleCount = samples.length;
final estimatedBytes = sampleCount * 2; // 16-bit = 2 bytes per sample
debugPrint('');
debugPrint('═══════════════════════════════════════');
debugPrint('🎤 [VAD] 语音段 #$_speechSegmentCount 结束');
debugPrint(' 时长: ${durationMs}ms');
debugPrint(' 采样点: $sampleCount');
debugPrint(' 数据大小: ~${(estimatedBytes / 1024).toStringAsFixed(1)} KB');
debugPrint('═══════════════════════════════════════');
if (_onSpeechEnd != null) _onSpeechEnd!();
// 太短的语音忽略
if (durationMs < 500) {
debugPrint('⚠️ 语音太短,忽略');
return;
}
// 发送给 Google STT
await _transcribeWithGoogle(samples, durationMs);
});
// 误触发(检测到声音但不是有效语音)
_vadHandler!.onVADMisfire.listen((_) {
_speechStopwatch.stop();
debugPrint('⚠️ [VAD] 误触发(不是有效语音),忽略');
});
// 错误
_vadHandler!.onError.listen((String message) {
debugPrint('❌ [VAD] 错误: $message');
});
}
/// 将 double 采样点转换为 16-bit PCM 字节
Uint8List _convertSamplesToBytes(List<double> samples) {
final bytes = Uint8List(samples.length * 2);
for (int i = 0; i < samples.length; i++) {
// 将 -1.0 ~ 1.0 转换为 -32768 ~ 32767
int sample = (samples[i] * 32767).clamp(-32768, 32767).toInt();
// Little endian
bytes[i * 2] = sample & 0xFF;
bytes[i * 2 + 1] = (sample >> 8) & 0xFF;
}
return bytes;
}
/// 发送给 Google Speech-to-Text API
Future<void> _transcribeWithGoogle(List<double> samples, int durationMs) async {
if (_googleApiKey == 'YOUR_GOOGLE_API_KEY') {
debugPrint('⚠️ [Google STT] 请先配置 API Key!');
debugPrint('📝 [模拟结果] 语音时长 ${durationMs}ms, 采样点 ${samples.length}');
// 模拟返回结果
if (_onFinalResult != null) {
_onFinalResult!('[模拟: ${durationMs}ms 语音]');
}
return;
}
debugPrint('📤 [Google STT] 发送音频数据...');
try {
// 转换为 PCM 字节
final audioBytes = _convertSamplesToBytes(samples);
final audioBase64 = base64Encode(audioBytes);
debugPrint(' 音频大小: ${(audioBytes.length / 1024).toStringAsFixed(1)} KB');
// 构建请求
final requestBody = {
'config': {
'encoding': 'LINEAR16',
'sampleRateHertz': 16000, // VAD 默认采样率
'languageCode': 'zh-CN', // 中文
'enableAutomaticPunctuation': true,
// 不指定 model使用默认模型
},
'audio': {
'content': audioBase64,
},
};
final stopwatch = Stopwatch()..start();
final response = await http.post(
Uri.parse('$_googleSttUrl?key=$_googleApiKey'),
headers: {'Content-Type': 'application/json'},
body: jsonEncode(requestBody),
);
stopwatch.stop();
debugPrint(' 响应时间: ${stopwatch.elapsedMilliseconds}ms');
if (response.statusCode == 200) {
final result = jsonDecode(response.body);
if (result['results'] != null && (result['results'] as List).isNotEmpty) {
final transcript = result['results'][0]['alternatives'][0]['transcript'] as String;
final confidence = result['results'][0]['alternatives'][0]['confidence'] ?? 0.0;
debugPrint('');
debugPrint('═══════════════════════════════════════');
debugPrint('📝 [Google STT] 识别结果:');
debugPrint(' "$transcript"');
debugPrint(' 置信度: ${(confidence * 100).toStringAsFixed(1)}%');
debugPrint('═══════════════════════════════════════');
if (_onFinalResult != null) {
_onFinalResult!(transcript);
}
} else {
debugPrint('⚠️ [Google STT] 没有识别到文字');
}
} else {
debugPrint('❌ [Google STT] 请求失败: ${response.statusCode}');
debugPrint(' ${response.body}');
}
} catch (e) {
debugPrint('❌ [Google STT] 错误: $e');
}
}
Future<void> startListening({
Function(String text)? onResult,
Function(String text)? onFinalResult,
VoidCallback? onSpeechStart,
VoidCallback? onSpeechEnd,
}) async {
if (!_isInitialized) {
bool success = await init();
if (!success) return;
}
if (_isListening) {
debugPrint('⚠️ [VAD] 已经在监听中');
return;
}
_onResult = onResult;
_onFinalResult = onFinalResult;
_onSpeechStart = onSpeechStart;
_onSpeechEnd = onSpeechEnd;
_speechSegmentCount = 0;
debugPrint('');
debugPrint('🎤 [VAD] 开始监听...');
try {
await _vadHandler?.startListening();
_isListening = true;
debugPrint('✅ [VAD] 监听已启动,等待语音输入...');
} catch (e) {
debugPrint('❌ [VAD] 启动监听失败: $e');
}
}
Future<void> stopListening() async {
if (!_isListening) return;
debugPrint('🛑 [VAD] 停止监听');
try {
await _vadHandler?.stopListening();
_isListening = false;
} catch (e) {
debugPrint('❌ [VAD] 停止监听失败: $e');
}
}
/// Releases the VAD handler and resets all service state.
///
/// Also drops the registered session callbacks so the captured closures
/// (and anything they retain, e.g. UI controllers) can be garbage
/// collected after the service is disposed.
void dispose() {
  _vadHandler?.dispose();
  _vadHandler = null;
  _isInitialized = false;
  _isListening = false;
  // Clear callback references to avoid leaking listener closures.
  _onResult = null;
  _onFinalResult = null;
  _onSpeechStart = null;
  _onSpeechEnd = null;
  debugPrint('🗑️ VadSttService disposed');
}
}

View File

@@ -1,6 +1,7 @@
import 'dart:async';
import 'package:flutter/foundation.dart';
import '../../core/core.dart';
import '../../core/services/vad_stt_service.dart';
import 'package:permission_handler/permission_handler.dart';
enum VoiceState {
@@ -25,8 +26,8 @@ class VoiceSessionController extends ChangeNotifier {
String _aiTypingText = '';
bool _isMicMuted = false;
// Services
final STTService _stt = STTService();
// Services - 使用 VAD 替代系统 STT
final VadSttService _vad = VadSttService();
final TTSService _tts = TTSService();
// State getters
@@ -35,9 +36,7 @@ class VoiceSessionController extends ChangeNotifier {
String get aiTypingText => _aiTypingText;
bool get isMicMuted => _isMicMuted;
// Buffer for sentence completion
String _sentenceBuffer = '';
final List<String> _punctuation = ['', '', '', '.', '?', '!', '\n'];
VoiceSessionController({
required this.character,
@@ -52,13 +51,15 @@ class VoiceSessionController extends ChangeNotifier {
await [Permission.microphone, Permission.speech].request();
// Init services
await _stt.init();
await _vad.init();
await _tts.init();
// 预连接 TTS WebSocket减少首次 TTS 延迟)
_tts.preconnect();
// Setup TTS callbacks
_tts.setStartHandler(() {
debugPrint('🔊 TTS Started');
// Already paused STT in _processSpeakQueue
});
_tts.setCompletionHandler(() {
@@ -96,7 +97,7 @@ class VoiceSessionController extends ChangeNotifier {
_state = VoiceState.listening;
_recognizedText = '';
_lastProcessedLength = 0;
notifyListeners();
// Stop TTS if it's playing (Interruption)
@@ -106,7 +107,18 @@ class VoiceSessionController extends ChangeNotifier {
_isSpeaking = false;
}
await _stt.listen(
// 使用 VAD 监听
await _vad.startListening(
onSpeechStart: () {
// 用户开始说话
_recognizedText = 'Listening...';
notifyListeners();
},
onSpeechEnd: () {
// 用户说完了,等待 STT 处理
_recognizedText = 'Processing...';
notifyListeners();
},
onResult: (text) {
_recognizedText = text;
notifyListeners();
@@ -114,14 +126,14 @@ class VoiceSessionController extends ChangeNotifier {
onFinalResult: (text) {
_recognizedText = text;
notifyListeners();
// 发送给 LLM 处理
_processUserMessage(text);
},
localeId: 'zh-CN', // Make dynamic later if needed
);
}
Future<void> stopListening() async {
await _stt.stop();
await _vad.stopListening();
}
Future<void> _processUserMessage(String text) async {
@@ -131,6 +143,8 @@ class VoiceSessionController extends ChangeNotifier {
return;
}
_state = VoiceState.processing;
onUserMessage(text); // Notify UI to show user message
notifyListeners();
@@ -156,37 +170,31 @@ class VoiceSessionController extends ChangeNotifier {
}
_aiTypingText = '';
_sentenceBuffer = '';
_lastProcessedLength = 0;
try {
final fullResponse = await ChatService.sendMessage(
character: character,
messages: messages,
userMessage: text, // ChatService handles appending this if we use the right method
userMessage: text,
onStream: (content) {
_aiTypingText = content;
_processStreamChunk(content);
notifyListeners();
},
);
// Process any remaining text in buffer
if (_sentenceBuffer.isNotEmpty) {
// Interaction finished, save AI message
final aiMsg = ChatMessage.assistant(fullResponse);
onAiMessage(aiMsg);
// Filter emojis and speak full text
final textToSpeak = _filterEmojis(fullResponse);
if (textToSpeak.isNotEmpty) {
if (_state != VoiceState.speaking) {
_state = VoiceState.speaking;
notifyListeners();
}
await _speak(_sentenceBuffer);
await _speak(textToSpeak);
}
// Interaction finished, save AI message
final aiMsg = ChatMessage.assistant(fullResponse);
onAiMessage(aiMsg);
// Note: We do NOT immediately startListening here.
// We rely on the TTS Completion Handler to trigger startListening
// when the entire queue is drained.
} catch (e) {
debugPrint('❌ Voice Process Error: $e');
@@ -197,65 +205,12 @@ class VoiceSessionController extends ChangeNotifier {
}
}
// Better implementation needs to handle state to avoid infinite loops
int _lastProcessedLength = 0;
void _processStreamChunk(String content) {
if (_state != VoiceState.speaking) {
_state = VoiceState.speaking;
notifyListeners();
}
// Calculate delta (new content only)
if (content.length <= _lastProcessedLength) return;
String delta = content.substring(_lastProcessedLength);
_lastProcessedLength = content.length;
_sentenceBuffer += delta;
// Check for punctuation to split sentences
bool foundPunctuation = false;
for (var p in _punctuation) {
if (_sentenceBuffer.contains(p)) {
foundPunctuation = true;
break;
}
}
if (foundPunctuation) {
_processBufferForSentences();
}
}
void _processBufferForSentences() {
String tempBuffer = _sentenceBuffer;
String keepBuffer = '';
// Simple tokenizer: split by punctuation but keep the punctuation attached to the sentence
// This is a naive implementation.
// "Hello! How are you?" -> ["Hello!", "How are you?"]
// We iterate through chars to find split points
int lastSplitIndex = 0;
for (int i = 0; i < tempBuffer.length; i++) {
String char = tempBuffer[i];
if (_punctuation.contains(char)) {
// Found end of a sentence
String sentence = tempBuffer.substring(lastSplitIndex, i + 1);
if (sentence.trim().isNotEmpty) {
_speak(sentence);
}
lastSplitIndex = i + 1;
}
}
// Keep the remaining part that didn't end with punctuation
if (lastSplitIndex < tempBuffer.length) {
keepBuffer = tempBuffer.substring(lastSplitIndex);
}
_sentenceBuffer = keepBuffer;
String _filterEmojis(String text) {
// Regex matches common emoji ranges
final RegExp emojiRegex = RegExp(
r'(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])'
);
return text.replaceAll(emojiRegex, '').trim();
}
Future<void> _speak(String text) async {
@@ -275,9 +230,10 @@ class VoiceSessionController extends ChangeNotifier {
debugPrint('🎤 Queue empty, resuming listening...');
_state = VoiceState.listening;
notifyListeners();
// Debounce STT restart to avoid rapid stop/start deadlocks
// 延迟启动 STT让音频会话有时间从播放切换到录音
_silenceTimer?.cancel();
_silenceTimer = Timer(const Duration(milliseconds: 250), () {
_silenceTimer = Timer(const Duration(milliseconds: 800), () {
debugPrint('🎤 延迟后启动 STT...');
startListening();
});
}
@@ -291,12 +247,12 @@ class VoiceSessionController extends ChangeNotifier {
// Ensure STT is paused while speaking
await stopListening();
await _tts.speak(text);
await _tts.speak(text, voiceConfig: character.aiVoiceConfig);
}
@override
void dispose() {
_stt.stop();
_vad.stopListening();
_tts.stop();
super.dispose();
}

View File

@@ -6,16 +6,24 @@ import FlutterMacOS
import Foundation
import app_links
import audio_session
import flutter_pcm_sound
import flutter_tts
import just_audio
import path_provider_foundation
import record_macos
import shared_preferences_foundation
import speech_to_text
import url_launcher_macos
func RegisterGeneratedPlugins(registry: FlutterPluginRegistry) {
AppLinksMacosPlugin.register(with: registry.registrar(forPlugin: "AppLinksMacosPlugin"))
AudioSessionPlugin.register(with: registry.registrar(forPlugin: "AudioSessionPlugin"))
FlutterPcmSoundPlugin.register(with: registry.registrar(forPlugin: "FlutterPcmSoundPlugin"))
FlutterTtsPlugin.register(with: registry.registrar(forPlugin: "FlutterTtsPlugin"))
JustAudioPlugin.register(with: registry.registrar(forPlugin: "JustAudioPlugin"))
PathProviderPlugin.register(with: registry.registrar(forPlugin: "PathProviderPlugin"))
RecordMacOsPlugin.register(with: registry.registrar(forPlugin: "RecordMacOsPlugin"))
SharedPreferencesPlugin.register(with: registry.registrar(forPlugin: "SharedPreferencesPlugin"))
SpeechToTextPlugin.register(with: registry.registrar(forPlugin: "SpeechToTextPlugin"))
UrlLauncherPlugin.register(with: registry.registrar(forPlugin: "UrlLauncherPlugin"))

View File

@@ -73,6 +73,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "2.13.0"
audio_session:
dependency: transitive
description:
name: audio_session
sha256: "8f96a7fecbb718cb093070f868b4cdcb8a9b1053dce342ff8ab2fde10eb9afb7"
url: "https://pub.dev"
source: hosted
version: "0.2.2"
boolean_selector:
dependency: transitive
description:
@@ -230,6 +238,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "6.0.0"
flutter_pcm_sound:
dependency: "direct main"
description:
name: flutter_pcm_sound
sha256: "15c6894da8195122001375084d51449bd77849579c93fca2800c00b615699dc0"
url: "https://pub.dev"
source: hosted
version: "3.3.3"
flutter_riverpod:
dependency: "direct main"
description:
@@ -368,6 +384,30 @@ packages:
url: "https://pub.dev"
source: hosted
version: "4.10.0"
just_audio:
dependency: "direct main"
description:
name: just_audio
sha256: "9694e4734f515f2a052493d1d7e0d6de219ee0427c7c29492e246ff32a219908"
url: "https://pub.dev"
source: hosted
version: "0.10.5"
just_audio_platform_interface:
dependency: transitive
description:
name: just_audio_platform_interface
sha256: "2532c8d6702528824445921c5ff10548b518b13f808c2e34c2fd54793b999a6a"
url: "https://pub.dev"
source: hosted
version: "4.6.0"
just_audio_web:
dependency: transitive
description:
name: just_audio_web
sha256: "6ba8a2a7e87d57d32f0f7b42856ade3d6a9fbe0f1a11fabae0a4f00bb73f0663"
url: "https://pub.dev"
source: hosted
version: "0.4.16"
jwt_decode:
dependency: transitive
description:
@@ -640,6 +680,70 @@ packages:
url: "https://pub.dev"
source: hosted
version: "2.7.0"
record:
dependency: transitive
description:
name: record
sha256: d5b6b334f3ab02460db6544e08583c942dbf23e3504bf1e14fd4cbe3d9409277
url: "https://pub.dev"
source: hosted
version: "6.2.0"
record_android:
dependency: transitive
description:
name: record_android
sha256: "3bb3c6abbcb5fc1e86719fc6f0acdee89dfe8078543b92caad11854c487e435a"
url: "https://pub.dev"
source: hosted
version: "1.5.0"
record_ios:
dependency: transitive
description:
name: record_ios
sha256: "8df7c136131bd05efc19256af29b2ba6ccc000ccc2c80d4b6b6d7a8d21a3b5a9"
url: "https://pub.dev"
source: hosted
version: "1.2.0"
record_linux:
dependency: transitive
description:
name: record_linux
sha256: c31a35cc158cd666fc6395f7f56fc054f31685571684be6b97670a27649ce5c7
url: "https://pub.dev"
source: hosted
version: "1.3.0"
record_macos:
dependency: transitive
description:
name: record_macos
sha256: f04d1547ff61ae54b4154e9726f656a17ad993f1a90f8f44bc40de94bafa072f
url: "https://pub.dev"
source: hosted
version: "1.2.0"
record_platform_interface:
dependency: transitive
description:
name: record_platform_interface
sha256: "8a81dbc4e14e1272a285bbfef6c9136d070a47d9b0d1f40aa6193516253ee2f6"
url: "https://pub.dev"
source: hosted
version: "1.5.0"
record_web:
dependency: transitive
description:
name: record_web
sha256: "7e9846981c1f2d111d86f0ae3309071f5bba8b624d1c977316706f08fc31d16d"
url: "https://pub.dev"
source: hosted
version: "1.3.0"
record_windows:
dependency: transitive
description:
name: record_windows
sha256: "223258060a1d25c62bae18282c16783f28581ec19401d17e56b5205b9f039d78"
url: "https://pub.dev"
source: hosted
version: "1.0.7"
retry:
dependency: transitive
description:
@@ -861,6 +965,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "2.12.0"
synchronized:
dependency: transitive
description:
name: synchronized
sha256: c254ade258ec8282947a0acbbc90b9575b4f19673533ee46f2f6e9b3aeefd7c0
url: "https://pub.dev"
source: hosted
version: "3.4.0"
term_glyph:
dependency: transitive
description:
@@ -965,6 +1077,22 @@ packages:
url: "https://pub.dev"
source: hosted
version: "3.1.5"
uuid:
dependency: transitive
description:
name: uuid
sha256: a11b666489b1954e01d992f3d601b1804a33937b5a8fe677bd26b8a9f96f96e8
url: "https://pub.dev"
source: hosted
version: "4.5.2"
vad:
dependency: "direct main"
description:
name: vad
sha256: ef6c8b12c5af7a6a519ff5684f074b8a2ac00c434705f544af379ea77bccd258
url: "https://pub.dev"
source: hosted
version: "0.0.7+1"
vector_math:
dependency: transitive
description:
@@ -1006,7 +1134,7 @@ packages:
source: hosted
version: "1.0.1"
web_socket_channel:
dependency: transitive
dependency: "direct main"
description:
name: web_socket_channel
sha256: d645757fb0f4773d602444000a8131ff5d48c9e47adfe9772652dd1a4f2d45c8

View File

@@ -46,6 +46,10 @@ dependencies:
speech_to_text: ^7.3.0
flutter_tts: ^4.2.5
permission_handler: ^12.0.1
just_audio: ^0.10.3
web_socket_channel: ^3.0.1
flutter_pcm_sound: ^3.3.3
vad: ^0.0.7+1
dev_dependencies:
flutter_test:

85
wei_ai_app/优化.md Normal file
View File

@@ -0,0 +1,85 @@
我已经详细阅读了整个 interaction
模块和相关服务代码,以下是完整的架构分析和优化建议:
---
架构概览
screens/interaction/
├── interaction_screen.dart # 文字聊天 UI
├── voice_mode_overlay.dart # 语音通话 UI
└── voice_session_controller.dart # 语音状态编排
core/services/
├── chat_service.dart # LLM API (SSE 流式)
├── chat_storage_service.dart # SharedPreferences 持久化
├── stt_service.dart # 系统 STT (已废弃)
├── vad_stt_service.dart # Silero VAD + Google Cloud STT
├── tts_service.dart # TTS 门面 + 引擎选择
└── tts/
├── tts_engine.dart # 抽象接口
├── minimax_tts_engine.dart # MiniMax WebSocket 流式 TTS
├── system_tts_engine.dart # 系统 TTS 兜底
└── ws_client_*.dart # 跨平台 WebSocket
整体评价: 服务分层清晰,TTS 的抽象接口设计不错,语音对话的"句子级 TTS +
预连接"体验优化做得挺好。但有几个明显需要处理的问题:
---
需要优化的问题 (按优先级)
P0 - 安全问题
1. API Key 硬编码 — minimax_config.dart 中 MiniMax JWT 和 vad_stt_service.dart
中 Google Cloud STT API Key 直接写在代码里。APK 反编译即可提取,存在严重的
Key 被盗刷风险。
- 建议: 所有第三方 API 调用走后端代理,客户端不存放任何 Key
P1 - 潜在 Bug
2. 消息重复保存 — InteractionScreen._sendMessage() 会保存消息到
storage,语音流程中 VoiceSessionController._processUserMessage()
也通过回调保存。如果两个流程不小心交叉,可能产生重复消息。
- 建议: 统一消息保存入口,只在一处写入
3. 句子缓冲丢失 — _sentenceBuffer
在流式结束或网络中断时,未说完的文本会丢失(TTS 不完整但文字显示完整)。
- 建议: 流结束时 flush 残留 buffer,增加超时机制(如 2s 无标点则强制切句)
4. 语音打断时资源泄漏 — 用户在 TTS 播放中关闭语音界面,_speakCompleter
可能不会被 complete,定时器可能继续运行。
- 建议: dispose() 中强制 complete 所有 Completer,取消所有 Timer
5. WebSocket 连接泄漏 — 退出语音模式后 TTS WebSocket
连接没有显式关闭,可能造成连接泄漏。
- 建议: 退出语音模式时显式调用 disconnect
P2 - 架构优化
6. 状态管理不统一 — InteractionScreen 用 Riverpod + setState
混合,VoiceSessionController 用 ChangeNotifier。风格不一致,维护成本高。
- 建议: 统一用 Riverpod,将 VoiceSessionController 迁移为 StateNotifierProvider
7. 服务紧耦合 — VoiceSessionController 直接 new VadSttService() /
TTSService(),无法做单元测试。
- 建议: 通过依赖注入 (Riverpod Provider) 注入服务实例
8. 存储性能 — SharedPreferences 每次写入都序列化整个 session
JSON,消息多了会卡。
- 建议: 考虑换用 Isar/Hive 等本地数据库,支持增量写入和分页加载
P3 - 体验优化
9. 无重试机制 — Google STT API
调用失败时没有重试逻辑,网络抖动直接导致语音识别失败。
- 建议: 增加 1-2 次重试,带指数退避
10. 未完成功能 — VoiceModeOverlay 的扬声器切换按钮是空实现 (onPressed: ()
{}),波形动画是固定的,不跟实际音频挂钩。
11. 流式响应类型安全 — ChatService._sendStreamRequest() 中 JSON 解析用了
dynamic 类型,API 返回异常格式时可能 crash。
---
做得好的地方
- TTS 引擎抽象 + MiniMax/System 自动降级兜底
- WebSocket 预连接 降低首次语音响应延迟
- 句子级流式 TTS 而非等全文,对话体感自然
- 30s keep-alive ping 保持连接复用
- LLM 流式渲染 + 打字动画,用户反馈及时
---
要我针对某个具体问题开始修复吗?比如先处理 P1 的 bug 或 P2 的架构重构?