// 260 lines · 6.9 KiB · Dart
import 'dart:async';
|
||
import 'package:flutter/foundation.dart';
|
||
import '../../core/core.dart';
|
||
import '../../core/services/vad_stt_service.dart';
|
||
import 'package:permission_handler/permission_handler.dart';
|
||
|
||
enum VoiceState {
|
||
listening, // Waiting for user input
|
||
processing, // Sending to AI
|
||
speaking, // AI is talking
|
||
error,
|
||
}
|
||
|
||
class VoiceSessionController extends ChangeNotifier {
|
||
final CharacterModel character;
|
||
final Function(String) onUserMessage;
|
||
final Function(ChatMessage) onAiMessage;
|
||
|
||
// Speaking Queue
|
||
final List<String> _speakQueue = [];
|
||
bool _isSpeaking = false;
|
||
Timer? _silenceTimer;
|
||
|
||
VoiceState _state = VoiceState.listening;
|
||
String _recognizedText = '';
|
||
String _aiTypingText = '';
|
||
bool _isMicMuted = false;
|
||
|
||
// Services - 使用 VAD 替代系统 STT
|
||
final VadSttService _vad = VadSttService();
|
||
final TTSService _tts = TTSService();
|
||
|
||
// State getters
|
||
VoiceState get state => _state;
|
||
String get recognizedText => _recognizedText;
|
||
String get aiTypingText => _aiTypingText;
|
||
bool get isMicMuted => _isMicMuted;
|
||
|
||
|
||
|
||
VoiceSessionController({
|
||
required this.character,
|
||
required this.onUserMessage,
|
||
required this.onAiMessage,
|
||
}) {
|
||
_init();
|
||
}
|
||
|
||
Future<void> _init() async {
|
||
// Request permissions
|
||
await [Permission.microphone, Permission.speech].request();
|
||
|
||
// Init services
|
||
await _vad.init();
|
||
await _tts.init();
|
||
|
||
// 预连接 TTS WebSocket(减少首次 TTS 延迟)
|
||
_tts.preconnect();
|
||
|
||
// Setup TTS callbacks
|
||
_tts.setStartHandler(() {
|
||
debugPrint('🔊 TTS Started');
|
||
});
|
||
|
||
_tts.setCompletionHandler(() {
|
||
debugPrint('✅ TTS Completed');
|
||
_isSpeaking = false;
|
||
_processSpeakQueue(); // Play next
|
||
});
|
||
|
||
_tts.setErrorHandler((msg) {
|
||
debugPrint('❌ TTS Error: $msg');
|
||
_isSpeaking = false;
|
||
_processSpeakQueue();
|
||
});
|
||
|
||
// Start listening immediately
|
||
if (!_isMicMuted) {
|
||
startListening();
|
||
}
|
||
}
|
||
|
||
void toggleMic() {
|
||
_isMicMuted = !_isMicMuted;
|
||
if (_isMicMuted) {
|
||
stopListening();
|
||
} else {
|
||
if (_state == VoiceState.listening) {
|
||
startListening();
|
||
}
|
||
}
|
||
notifyListeners();
|
||
}
|
||
|
||
Future<void> startListening() async {
|
||
if (_isMicMuted) return;
|
||
|
||
_state = VoiceState.listening;
|
||
_recognizedText = '';
|
||
|
||
notifyListeners();
|
||
|
||
// Stop TTS if it's playing (Interruption)
|
||
if (_isSpeaking || _speakQueue.isNotEmpty) {
|
||
_speakQueue.clear();
|
||
await _tts.stop();
|
||
_isSpeaking = false;
|
||
}
|
||
|
||
// 使用 VAD 监听
|
||
await _vad.startListening(
|
||
onSpeechStart: () {
|
||
// 用户开始说话
|
||
_recognizedText = 'Listening...';
|
||
notifyListeners();
|
||
},
|
||
onSpeechEnd: () {
|
||
// 用户说完了,等待 STT 处理
|
||
_recognizedText = 'Processing...';
|
||
notifyListeners();
|
||
},
|
||
onResult: (text) {
|
||
_recognizedText = text;
|
||
notifyListeners();
|
||
},
|
||
onFinalResult: (text) {
|
||
_recognizedText = text;
|
||
notifyListeners();
|
||
// 发送给 LLM 处理
|
||
_processUserMessage(text);
|
||
},
|
||
);
|
||
}
|
||
|
||
Future<void> stopListening() async {
|
||
await _vad.stopListening();
|
||
}
|
||
|
||
Future<void> _processUserMessage(String text) async {
|
||
if (text.trim().isEmpty) {
|
||
// If empty, just listen again
|
||
startListening();
|
||
return;
|
||
}
|
||
|
||
|
||
|
||
_state = VoiceState.processing;
|
||
onUserMessage(text); // Notify UI to show user message
|
||
notifyListeners();
|
||
|
||
// Construct history for context (simplified for now, ideally pass full history)
|
||
// We will rely on ChatService to handle the full history if we pass the latest message
|
||
// But ChatService needs the list. For the voice mode, let's assume interacting adds to DB
|
||
// and we might need to fetch fresh context or pass it in.
|
||
// Ideally, the InteractionScreen manages the source of truth for messages.
|
||
// Here we'll just send the text to prompt the AI.
|
||
|
||
// Construct history
|
||
final session = await ChatStorageService.getSession(character.id);
|
||
var messages = session.messages;
|
||
|
||
// ChatService.sendMessage appends the userMessage automatically.
|
||
// We need to ensure 'messages' doesn't already contain it (if DB write was fast).
|
||
if (messages.isNotEmpty) {
|
||
final lastMsg = messages.last;
|
||
if (lastMsg.isUser && lastMsg.content == text) {
|
||
messages = List.from(messages)..removeLast();
|
||
}
|
||
}
|
||
|
||
_aiTypingText = '';
|
||
|
||
try {
|
||
final fullResponse = await ChatService.sendMessage(
|
||
character: character,
|
||
messages: messages,
|
||
userMessage: text,
|
||
onStream: (content) {
|
||
_aiTypingText = content;
|
||
notifyListeners();
|
||
},
|
||
);
|
||
|
||
// Interaction finished, save AI message
|
||
final aiMsg = ChatMessage.assistant(fullResponse);
|
||
onAiMessage(aiMsg);
|
||
|
||
// Filter emojis and speak full text
|
||
final textToSpeak = _filterEmojis(fullResponse);
|
||
if (textToSpeak.isNotEmpty) {
|
||
if (_state != VoiceState.speaking) {
|
||
_state = VoiceState.speaking;
|
||
notifyListeners();
|
||
}
|
||
await _speak(textToSpeak);
|
||
}
|
||
|
||
} catch (e) {
|
||
debugPrint('❌ Voice Process Error: $e');
|
||
_state = VoiceState.error;
|
||
notifyListeners();
|
||
// Retry listening after error
|
||
Future.delayed(const Duration(seconds: 2), startListening);
|
||
}
|
||
}
|
||
|
||
String _filterEmojis(String text) {
|
||
// Regex matches common emoji ranges
|
||
final RegExp emojiRegex = RegExp(
|
||
r'(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])'
|
||
);
|
||
return text.replaceAll(emojiRegex, '').trim();
|
||
}
|
||
|
||
Future<void> _speak(String text) async {
|
||
// If we are listening (interrupted), ignore (or should check state)
|
||
if (_state == VoiceState.listening) return;
|
||
|
||
_speakQueue.add(text);
|
||
_processSpeakQueue();
|
||
}
|
||
|
||
void _processSpeakQueue() async {
|
||
if (_isSpeaking) return;
|
||
|
||
if (_speakQueue.isEmpty) {
|
||
// All done speaking (or no TTS was produced)
|
||
if (_state == VoiceState.speaking || _state == VoiceState.processing) {
|
||
debugPrint('🎤 Queue empty, resuming listening...');
|
||
_state = VoiceState.listening;
|
||
notifyListeners();
|
||
// 延迟启动 STT,让音频会话有时间从播放切换到录音
|
||
_silenceTimer?.cancel();
|
||
_silenceTimer = Timer(const Duration(milliseconds: 800), () {
|
||
debugPrint('🎤 延迟后启动 STT...');
|
||
startListening();
|
||
});
|
||
}
|
||
return;
|
||
}
|
||
|
||
// Pop first
|
||
String text = _speakQueue.removeAt(0);
|
||
_isSpeaking = true;
|
||
|
||
// Ensure STT is paused while speaking
|
||
await stopListening();
|
||
|
||
await _tts.speak(text, voiceConfig: character.aiVoiceConfig);
|
||
}
|
||
|
||
@override
|
||
void dispose() {
|
||
_vad.stopListening();
|
||
_tts.stop();
|
||
super.dispose();
|
||
}
|
||
}
|