using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Sockets;
using System.Threading.Tasks;
using Convai.Scripts.Runtime.Core;
using Convai.Scripts.Runtime.LoggerSystem;
using Convai.Scripts.Runtime.Utils;
using UnityEngine;
using System.Collections;

namespace Convai.Scripts.Runtime.Multiplayer
{
    /// <summary>
    /// UDP Speech Sender - Simple and reliable approach using events.
    /// Hooks into AudioManager events to capture when clips are about to be played,
    /// then streams each clip to a remote endpoint as a sequence of custom UDP packets
    /// (start / chunk / end per clip, plus a final packet when the NPC stops talking).
    /// </summary>
    public class ConvaiUDPSpeechSender : MonoBehaviour
    {
        [Header("Network Configuration")]
        [SerializeField] private string targetIP = "127.0.0.1";
        [SerializeField] private int targetPort = 12346;
        [SerializeField] private bool enableDebugLogging = true;
        [SerializeField] private bool useGlobalNetworkConfig = true;
        [SerializeField] private NetworkConfig networkConfigAsset;

        [Header("NPC Source")]
        [SerializeField] private bool useActiveNPC = true;
        [SerializeField] private ConvaiNPC sourceNPC;

        [Header("Audio Settings")]
        [SerializeField] private int maxSamplesPerPacket = 8192;
        // NOTE(review): currently unused in code, but kept because it is a serialized
        // Inspector field — removing it would discard scene/prefab data.
        [SerializeField] private bool sendTranscripts = true;

        // Network components
        private UdpClient _udpClient;
        private IPEndPoint _targetEndPoint;
        private bool _isInitialized = false;

        // Speech tracking
        private int _speechSequence = 0;          // monotonically increasing clip sequence number
        private bool _isSendingSpeech = false;    // true between first clip and final packet
        private HashSet<AudioClip> _sentClips = new HashSet<AudioClip>(); // de-dupes clips within one talking session

        // Packet constants (wire protocol — must match the receiver)
        private const uint MAGIC_NUMBER = 0xC0A3; // V3 magic number
        private const byte PACKET_TYPE_AUDIO_START = 0x01;
        private const byte PACKET_TYPE_AUDIO_CHUNK = 0x02;
        private const byte PACKET_TYPE_AUDIO_END = 0x03;
        private const byte PACKET_TYPE_FINAL = 0x05;

        // Events: bool = transmission active/inactive; string = transcript of the sent clip
        public Action<bool> OnSpeechTransmission;
        public Action<string> OnSpeechSent;

        /// <summary>
        /// Applies the global network config (if enabled), then brings up the UDP
        /// socket and wires into the Convai NPC events.
        /// </summary>
        private void Start()
        {
            // Apply global config if enabled; an explicit asset wins over the singleton.
            if (useGlobalNetworkConfig)
            {
                var cfg = networkConfigAsset != null ? networkConfigAsset : NetworkConfig.Instance;
                if (cfg != null)
                {
                    targetIP = cfg.ipAddress;
                    targetPort = cfg.multiplayerSpeechPort;
                }
            }

            InitializeNetwork();
            InitializeConvai();
        }

        private void OnDestroy()
        {
            CleanupNPCSubscriptions();
            CleanupNetwork();
        }

        /// <summary>Creates the UDP client and resolves the target endpoint.</summary>
        private void InitializeNetwork()
        {
            try
            {
                _udpClient = new UdpClient();
                _targetEndPoint = new IPEndPoint(IPAddress.Parse(targetIP), targetPort);
                _isInitialized = true;
                ConvaiLogger.Info($"UDP Speech Sender initialized. Target: {targetIP}:{targetPort}", ConvaiLogger.LogCategory.Character);
            }
            catch (Exception ex)
            {
                ConvaiLogger.Error($"Failed to initialize UDP speech sender: {ex.Message}", ConvaiLogger.LogCategory.Character);
            }
        }

        /// <summary>
        /// Resolves the source NPC (local component first, then the active NPC)
        /// and subscribes to its audio events.
        /// </summary>
        private void InitializeConvai()
        {
            // Prefer local ConvaiNPC on the same GameObject, then fall back to active NPC.
            var localNPC = GetComponent<ConvaiNPC>();
            if (localNPC != null)
            {
                sourceNPC = localNPC;
            }
            else if (useActiveNPC)
            {
                sourceNPC = ConvaiNPCManager.Instance?.GetActiveConvaiNPC();
            }

            SubscribeToNPCEvents();

            // Subscribe to NPC manager events for late NPC activation.
            if (ConvaiNPCManager.Instance != null)
            {
                ConvaiNPCManager.Instance.OnActiveNPCChanged += HandleActiveNPCChanged;
            }
        }

        /// <summary>Hooks talking-state and transcript events on the current source NPC.</summary>
        private void SubscribeToNPCEvents()
        {
            // Explicit null comparisons: Unity objects override == so that destroyed
            // instances compare equal to null; '?.' would bypass that check.
            if (sourceNPC != null && sourceNPC.AudioManager != null)
            {
                // Hook into the character talking events.
                sourceNPC.AudioManager.OnCharacterTalkingChanged += HandleCharacterTalkingChanged;
                sourceNPC.AudioManager.OnAudioTranscriptAvailable += HandleTranscriptAvailable;
                ConvaiLogger.Info($"UDP Speech Sender subscribed to NPC: {sourceNPC.characterName}", ConvaiLogger.LogCategory.Character);
            }
            else
            {
                ConvaiLogger.Warn("No source NPC available for speech transmission", ConvaiLogger.LogCategory.Character);
            }
        }

        /// <summary>
        /// Talking started → begin polling for new AudioClips; talking stopped →
        /// fire-and-forget the final packet (errors are caught inside SendFinalPacket).
        /// </summary>
        private void HandleCharacterTalkingChanged(bool isTalking)
        {
            if (!_isInitialized) return;

            if (isTalking)
            {
                // Start monitoring for audio clips.
                StartCoroutine(MonitorAudioClips());
            }
            else
            {
                // End speech transmission.
                _ = SendFinalPacket();
            }
        }

        private void HandleTranscriptAvailable(string transcript)
        {
            if (enableDebugLogging && !string.IsNullOrEmpty(transcript))
            {
                ConvaiLogger.DebugLog($"📝 NPC transcript: '{transcript}'", ConvaiLogger.LogCategory.Character);
            }
        }

        /// <summary>
        /// Polls the NPC's AudioSource every 100 ms while it is talking; each newly
        /// assigned clip is transmitted exactly once (tracked in <see cref="_sentClips"/>).
        /// </summary>
        private IEnumerator MonitorAudioClips()
        {
            if (sourceNPC == null || sourceNPC.AudioManager == null) yield break;

            AudioSource audioSource = sourceNPC.AudioManager.GetComponent<AudioSource>();
            AudioClip lastClip = null;

            while (sourceNPC.IsCharacterTalking)
            {
                // Explicit null checks so Unity's destroyed-object semantics apply.
                if (audioSource != null && audioSource.clip != null && audioSource.clip != lastClip)
                {
                    // New clip detected!
                    lastClip = audioSource.clip;

                    // Only send if we haven't sent this clip before.
                    if (!_sentClips.Contains(lastClip))
                    {
                        _sentClips.Add(lastClip);

                        // Get the transcript from the most recent available transcript.
                        string transcript = GetRecentTranscript();

                        // Send this clip (exceptions handled inside TransmitAudioClip).
                        _ = TransmitAudioClip(lastClip, transcript);
                    }
                }

                yield return new WaitForSeconds(0.1f); // Check every 100ms
            }

            // Clear sent clips when done.
            _sentClips.Clear();
        }

        /// <summary>
        /// Placeholder for clip→transcript matching; transcripts currently arrive
        /// only via <see cref="HandleTranscriptAvailable"/>.
        /// </summary>
        private string GetRecentTranscript()
        {
            // Try to get transcript from the NPC's recent activity.
            // This is a simple approach - in a more complex setup you might want to match clips to transcripts.
            return ""; // Transcripts come via the transcript event
        }

        /// <summary>
        /// Sends one clip as start → chunks → end packets, all tagged with the same
        /// sequence number; the sequence counter advances only after the full clip.
        /// </summary>
        private async Task TransmitAudioClip(AudioClip audioClip, string transcript)
        {
            if (!_isInitialized || audioClip == null) return;

            try
            {
                // Start transmission if not already started.
                if (!_isSendingSpeech)
                {
                    _isSendingSpeech = true;
                    OnSpeechTransmission?.Invoke(true);
                    ConvaiLogger.Info($"🔊 Starting speech transmission", ConvaiLogger.LogCategory.Character);
                }

                // Use the current speech sequence for this entire clip.
                int clipSequence = _speechSequence;

                // Send start packet with metadata.
                await SendAudioStartPacket(audioClip, transcript, clipSequence);

                // Send audio data in chunks (all with the same sequence).
                await SendAudioClipInChunks(audioClip, clipSequence);

                // Send end packet for this clip (with the same sequence).
                await SendAudioEndPacket(clipSequence);

                // Only increment sequence after the entire clip is sent.
                _speechSequence++;

                OnSpeechSent?.Invoke(transcript);

                if (enableDebugLogging)
                    ConvaiLogger.DebugLog($"✅ Transmitted speech clip: {audioClip.length:F2}s (sequence {clipSequence})", ConvaiLogger.LogCategory.Character);
            }
            catch (Exception ex)
            {
                ConvaiLogger.Error($"Failed to transmit AudioClip: {ex.Message}", ConvaiLogger.LogCategory.Character);
            }
        }

        private async Task SendAudioStartPacket(AudioClip audioClip, string transcript, int sequence)
        {
            byte[] packet = CreateAudioStartPacket(audioClip, transcript, sequence);
            await _udpClient.SendAsync(packet, packet.Length, _targetEndPoint);

            if (enableDebugLogging)
                ConvaiLogger.DebugLog($"📤 Sent start packet {sequence}: {audioClip.samples} samples", ConvaiLogger.LogCategory.Character);
        }

        /// <summary>
        /// Reads the whole clip once, then streams it in chunks of at most
        /// <see cref="maxSamplesPerPacket"/> samples with a small inter-packet delay.
        /// </summary>
        private async Task SendAudioClipInChunks(AudioClip audioClip, int sequence)
        {
            // Get all audio data.
            float[] audioData = new float[audioClip.samples];
            audioClip.GetData(audioData, 0);

            // Send in chunks.
            int totalSamples = audioData.Length;
            int processedSamples = 0;
            int chunkCount = 0;

            while (processedSamples < totalSamples)
            {
                int remainingSamples = totalSamples - processedSamples;
                int currentChunkSize = Mathf.Min(maxSamplesPerPacket, remainingSamples);

                float[] chunkData = new float[currentChunkSize];
                Array.Copy(audioData, processedSamples, chunkData, 0, currentChunkSize);

                byte[] packet = CreateAudioChunkPacket(chunkData, audioClip.frequency, processedSamples, sequence);
                await _udpClient.SendAsync(packet, packet.Length, _targetEndPoint);

                processedSamples += currentChunkSize;
                chunkCount++;

                if (enableDebugLogging && chunkCount % 10 == 0)
                    ConvaiLogger.DebugLog($"📤 Sent chunk {chunkCount} for sequence {sequence}", ConvaiLogger.LogCategory.Character);

                // Small delay to avoid overwhelming the network.
                await Task.Delay(5);
            }

            if (enableDebugLogging)
                ConvaiLogger.DebugLog($"📤 Sent {chunkCount} audio chunks for sequence {sequence}", ConvaiLogger.LogCategory.Character);
        }

        private async Task SendAudioEndPacket(int sequence)
        {
            byte[] packet = CreateAudioEndPacket(sequence);
            await _udpClient.SendAsync(packet, packet.Length, _targetEndPoint);

            if (enableDebugLogging)
                ConvaiLogger.DebugLog($"📤 Sent end packet for sequence {sequence}", ConvaiLogger.LogCategory.Character);
        }

        /// <summary>Signals end-of-speech to the receiver and resets the sending flag.</summary>
        private async Task SendFinalPacket()
        {
            if (!_isSendingSpeech) return;

            try
            {
                byte[] packet = CreateFinalPacket();
                await _udpClient.SendAsync(packet, packet.Length, _targetEndPoint);

                _isSendingSpeech = false;
                OnSpeechTransmission?.Invoke(false);

                ConvaiLogger.Info("🔊 Speech transmission completed", ConvaiLogger.LogCategory.Character);
            }
            catch (Exception ex)
            {
                ConvaiLogger.Error($"Failed to send final packet: {ex.Message}", ConvaiLogger.LogCategory.Character);
            }
        }

        /// <summary>Builds the clip-start packet (little-endian via BitConverter).</summary>
        private byte[] CreateAudioStartPacket(AudioClip audioClip, string transcript, int sequence)
        {
            byte[] transcriptBytes = System.Text.Encoding.UTF8.GetBytes(transcript ?? "");

            // Packet structure:
            // 4 bytes: Magic number
            // 1 byte:  Packet type (0x01 = audio start)
            // 4 bytes: Sequence number
            // 4 bytes: Total samples in clip
            // 4 bytes: Sample rate
            // 4 bytes: Channels
            // 4 bytes: Transcript length
            // N bytes: Transcript (UTF-8)
            int headerSize = 25;
            byte[] packet = new byte[headerSize + transcriptBytes.Length];
            int offset = 0;

            BitConverter.GetBytes(MAGIC_NUMBER).CopyTo(packet, offset); offset += 4;
            packet[offset] = PACKET_TYPE_AUDIO_START; offset += 1;
            BitConverter.GetBytes(sequence).CopyTo(packet, offset); offset += 4;
            BitConverter.GetBytes(audioClip.samples).CopyTo(packet, offset); offset += 4;
            BitConverter.GetBytes(audioClip.frequency).CopyTo(packet, offset); offset += 4;
            BitConverter.GetBytes(audioClip.channels).CopyTo(packet, offset); offset += 4;
            BitConverter.GetBytes(transcriptBytes.Length).CopyTo(packet, offset); offset += 4;
            transcriptBytes.CopyTo(packet, offset);

            return packet;
        }

        /// <summary>Builds a chunk packet; float samples are converted to 16-bit PCM.</summary>
        private byte[] CreateAudioChunkPacket(float[] audioData, int frequency, int startSample, int sequence)
        {
            // Packet structure:
            // 4 bytes: Magic number
            // 1 byte:  Packet type (0x02 = audio chunk)
            // 4 bytes: Sequence number
            // 4 bytes: Start sample position
            // 4 bytes: Sample count in this chunk
            // N bytes: Audio data (as 16-bit PCM)
            int headerSize = 17;
            int audioDataSize = audioData.Length * sizeof(short);
            byte[] packet = new byte[headerSize + audioDataSize];
            int offset = 0;

            BitConverter.GetBytes(MAGIC_NUMBER).CopyTo(packet, offset); offset += 4;
            packet[offset] = PACKET_TYPE_AUDIO_CHUNK; offset += 1;
            BitConverter.GetBytes(sequence).CopyTo(packet, offset); offset += 4;
            BitConverter.GetBytes(startSample).CopyTo(packet, offset); offset += 4;
            BitConverter.GetBytes(audioData.Length).CopyTo(packet, offset); offset += 4;

            // Convert float samples to 16-bit PCM.
            for (int i = 0; i < audioData.Length; i++)
            {
                short sample = (short)(Mathf.Clamp(audioData[i], -1f, 1f) * short.MaxValue);
                BitConverter.GetBytes(sample).CopyTo(packet, offset);
                offset += 2;
            }

            return packet;
        }

        private byte[] CreateAudioEndPacket(int sequence)
        {
            byte[] packet = new byte[13]; // Header only
            int offset = 0;

            BitConverter.GetBytes(MAGIC_NUMBER).CopyTo(packet, offset); offset += 4;
            packet[offset] = PACKET_TYPE_AUDIO_END; offset += 1;
            BitConverter.GetBytes(sequence).CopyTo(packet, offset); offset += 4;
            BitConverter.GetBytes(0).CopyTo(packet, offset); // No additional data

            return packet;
        }

        private byte[] CreateFinalPacket()
        {
            byte[] packet = new byte[13]; // Header only
            int offset = 0;

            BitConverter.GetBytes(MAGIC_NUMBER).CopyTo(packet, offset); offset += 4;
            packet[offset] = PACKET_TYPE_FINAL; offset += 1;
            BitConverter.GetBytes(_speechSequence).CopyTo(packet, offset); offset += 4;
            BitConverter.GetBytes(0).CopyTo(packet, offset); // No additional data

            return packet;
        }

        /// <summary>Unhooks NPC audio events and the NPC-manager change event.</summary>
        private void CleanupNPCSubscriptions()
        {
            if (sourceNPC != null && sourceNPC.AudioManager != null)
            {
                sourceNPC.AudioManager.OnCharacterTalkingChanged -= HandleCharacterTalkingChanged;
                sourceNPC.AudioManager.OnAudioTranscriptAvailable -= HandleTranscriptAvailable;
            }

            if (ConvaiNPCManager.Instance != null)
            {
                ConvaiNPCManager.Instance.OnActiveNPCChanged -= HandleActiveNPCChanged;
            }
        }

        private void CleanupNetwork()
        {
            // UdpClient.Close() disposes the underlying socket; no separate Dispose() needed.
            _udpClient?.Close();
            _udpClient = null;
        }

        /// <summary>Re-targets event subscriptions when the active NPC changes.</summary>
        private void HandleActiveNPCChanged(ConvaiNPC newActiveNPC)
        {
            if (!useActiveNPC) return;

            // Cleanup old subscriptions.
            CleanupNPCSubscriptions();

            // Update to new NPC.
            sourceNPC = newActiveNPC;
            SubscribeToNPCEvents();
        }

        // Public methods for external control

        /// <summary>Redirects transmission to a new IP/port at runtime.</summary>
        public void SetTargetEndpoint(string ip, int port)
        {
            targetIP = ip;
            targetPort = port;
            _targetEndPoint = new IPEndPoint(IPAddress.Parse(ip), port);
        }

        public bool IsSendingSpeech => _isSendingSpeech;
        public bool IsInitialized => _isInitialized;
        public ConvaiNPC SourceNPC => sourceNPC;

        // Debug methods

        /// <summary>Logs a snapshot of sender state for debugging.</summary>
        public void ShowNetworkStatus()
        {
            ConvaiLogger.Info($"=== Speech Sender Status ===", ConvaiLogger.LogCategory.Character);
            ConvaiLogger.Info($"Target: {targetIP}:{targetPort}", ConvaiLogger.LogCategory.Character);
            ConvaiLogger.Info($"Initialized: {_isInitialized}", ConvaiLogger.LogCategory.Character);
            ConvaiLogger.Info($"Sending Speech: {_isSendingSpeech}", ConvaiLogger.LogCategory.Character);
            ConvaiLogger.Info($"Source NPC: {(sourceNPC?.characterName ?? "None")}", ConvaiLogger.LogCategory.Character);
            ConvaiLogger.Info($"Packets sent: {_speechSequence}", ConvaiLogger.LogCategory.Character);
            ConvaiLogger.Info($"Sent clips: {_sentClips.Count}", ConvaiLogger.LogCategory.Character);
        }
    }
}