Master-Arbeit-Tom-Hempel/Unity-Master/Assets/Scripts/Multiplayer/ConvaiUDPSpeechReceiver.cs

using System;
using System.Collections.Generic;
using System.Net;
using System.Threading;
using Convai.Scripts.Runtime.LoggerSystem;
using Convai.Scripts.Runtime.Utils;
using UnityEngine;
namespace Convai.Scripts.Runtime.Multiplayer
{
/// <summary>
/// UDP Speech Receiver - Receives and plays NPC speech audio from the remote player
///
/// FLOW (Player 1 → Player 2):
/// 1. Player 2 speaks to Player 1's NPC
/// 2. Player 1's NPC responds with speech
/// 3. Player 1's ConvaiUDPSpeechSender transmits the audio
/// 4. THIS COMPONENT receives the audio packets
/// 5. Reconstructs AudioClips from the packets
/// 6. Plays them back on local AudioSource
///
/// This component should be on a NetworkManager or similar persistent object.
/// It receives speech from the remote player's NPC.
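///
/// Hypothetical wiring (assumed usage, not part of this file):
/// <code>
/// var receiver = FindObjectOfType&lt;ConvaiUDPSpeechReceiver&gt;();
/// receiver.OnTranscriptReceived += t => Debug.Log($"Remote NPC: {t}");
/// receiver.OnSpeechReceiving += active => Debug.Log($"Receiving speech: {active}");
/// </code>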
/// </summary>
public class ConvaiUDPSpeechReceiver : MonoBehaviour
{
[Header("Network Configuration")]
[SerializeField] private bool enableDebugLogging = true;
[Header("Audio Playback")]
[SerializeField] private AudioSource speechAudioSource;
[SerializeField] private bool createAudioSourceIfMissing = true;
[SerializeField] private float audioVolume = 1.0f;
[SerializeField] private bool spatialAudio = false;
[Header("UI")]
[SerializeField] private bool showTranscripts = true;
// Network components
private IPEndPoint _remoteEndPoint;
private bool _isListening = false;
private int listenPort;
private CancellationTokenSource _cancellationTokenSource;
// Audio reconstruction
private Dictionary<int, IncomingAudioClip> _incomingClips = new Dictionary<int, IncomingAudioClip>();
private Queue<ReconstructedAudioClip> _playbackQueue = new Queue<ReconstructedAudioClip>();
private bool _isPlayingSequence = false;
private int _currentSequence = 0;
// Packet constants (matching sender V3)
private const uint MAGIC_NUMBER = 0xC0A3;
private const byte PACKET_TYPE_AUDIO_START = 0x01;
private const byte PACKET_TYPE_AUDIO_CHUNK = 0x02;
private const byte PACKET_TYPE_AUDIO_END = 0x03;
private const byte PACKET_TYPE_TRANSCRIPT = 0x04;
private const byte PACKET_TYPE_FINAL = 0x05;
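// Wire format (little-endian, as consumed by ParseSpeechPacket):
//   Header    : magic (uint32) | packetType (byte) | sequence (int32)  — 9 bytes
//   START     : header | totalSamples (int32) | sampleRate (int32) | channels (int32) | transcriptLen (int32) | UTF-8 transcript
//   CHUNK     : header | startSample (int32) | chunkSampleCount (int32) | PCM16 samples (int16 each)
//   TRANSCRIPT: header | transcriptLen (int32) | UTF-8 transcript
//   END/FINAL : header only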
// Events
public Action<bool> OnSpeechReceiving;
public Action<string> OnTranscriptReceived;
public Action<AudioClip> OnAudioClipReceived;
// Metrics for debug UI
private int _totalClipsReceived = 0;
private DateTime _lastClipReceivedTime;
public int TotalClipsReceived => _totalClipsReceived;
public float TimeSinceLastReceive => _lastClipReceivedTime != default ?
(float)(DateTime.UtcNow - _lastClipReceivedTime).TotalSeconds : -1f;
public int ListenPort => listenPort;
// Data structures
private struct SpeechPacket
{
public uint magicNumber;
public byte packetType;
public int sequence;
public int totalSamples;
public int sampleRate;
public int channels;
public int startSample;
public int chunkSampleCount;
public short[] audioSamples;
public string transcript;
}
private class IncomingAudioClip
{
public int totalSamples;
public int sampleRate;
public int channels;
public string transcript;
public float[] audioData;
public bool isComplete;
public bool hasStart;
public bool hasEnd;
public int receivedSamples;
public IncomingAudioClip(int totalSamples, int sampleRate, int channels, string transcript)
{
this.totalSamples = totalSamples;
this.sampleRate = sampleRate;
this.channels = channels;
this.transcript = transcript;
this.audioData = new float[totalSamples];
this.isComplete = false;
this.hasStart = false;
this.hasEnd = false;
this.receivedSamples = 0;
}
}
private struct ReconstructedAudioClip
{
public AudioClip audioClip;
public string transcript;
public bool isFinal;
}
private void Start()
{
_cancellationTokenSource = new CancellationTokenSource();
// Get network config from global instance
var cfg = NetworkConfig.Instance;
if (cfg != null)
{
listenPort = cfg.port;
}
else
{
Debug.LogError("NetworkConfig not found! Please ensure NetworkConfig.asset exists in Resources folder.");
listenPort = 1221;
}
InitializeAudio();
InitializeNetwork();
}
private void OnEnable()
{
if (_cancellationTokenSource == null)
{
_cancellationTokenSource = new CancellationTokenSource();
}
StartCoroutine(WaitAndSubscribe());
}
private void OnDestroy()
{
StopListening();
_cancellationTokenSource?.Cancel();
_cancellationTokenSource?.Dispose();
}
private void OnDisable()
{
StopListening();
}
private void Update()
{
// Process playback queue
ProcessPlaybackQueue();
}
private void InitializeAudio()
{
if (speechAudioSource == null)
{
speechAudioSource = GetComponent<AudioSource>();
if (speechAudioSource == null && createAudioSourceIfMissing)
{
speechAudioSource = gameObject.AddComponent<AudioSource>();
ConvaiLogger.Info("Created AudioSource for speech playback", ConvaiLogger.LogCategory.Character);
}
}
if (speechAudioSource != null)
{
speechAudioSource.volume = audioVolume;
speechAudioSource.playOnAwake = false;
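// spatialBlend: 0 = 2D (no positional attenuation), 1 = fully 3D audio.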
speechAudioSource.spatialBlend = spatialAudio ? 1.0f : 0.0f;
}
else
{
ConvaiLogger.Error("No AudioSource available for speech playback", ConvaiLogger.LogCategory.Character);
}
}
private void InitializeNetwork()
{
try
{
StartListening();
}
catch (Exception ex)
{
ConvaiLogger.Error($"Failed to initialize UDP speech receiver: {ex.Message}", ConvaiLogger.LogCategory.Character);
}
}
public void StartListening()
{
if (_isListening || _cancellationTokenSource == null)
return;
try
{
// Subscribe to shared listener
SharedUDPListener.Instance.OnPacketReceived += HandlePacketReceived;
_isListening = true;
ConvaiLogger.Info($"✅ Speech Receiver subscribed to shared listener, listening for magic 0x{MAGIC_NUMBER:X}", ConvaiLogger.LogCategory.Character);
}
catch (Exception ex)
{
ConvaiLogger.Error($"❌ FAILED to subscribe Speech Receiver: {ex.Message}", ConvaiLogger.LogCategory.Character);
}
}
private System.Collections.IEnumerator WaitAndSubscribe()
{
float timeout = 3f;
while (SharedUDPListener.Instance == null && timeout > 0f)
{
timeout -= Time.unscaledDeltaTime;
yield return null;
}
if (SharedUDPListener.Instance == null)
{
ConvaiLogger.Error("SharedUDPListener not ready after wait.", ConvaiLogger.LogCategory.Character);
yield break;
}
StartListening();
}
public void StopListening()
{
if (!_isListening)
return;
_isListening = false;
// Unsubscribe from shared listener
if (SharedUDPListener.Instance != null)
{
SharedUDPListener.Instance.OnPacketReceived -= HandlePacketReceived;
}
// Stop any ongoing playback
StopSpeechPlayback();
ConvaiLogger.Info("Stopped UDP Speech Receiver", ConvaiLogger.LogCategory.Character);
}
private void HandlePacketReceived(byte[] data, IPEndPoint senderEndPoint)
{
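// NOTE: Depending on how SharedUDPListener dispatches, this callback may run
// on a background thread; Unity-API work (e.g. AudioClip.Create) should then
// be marshalled to the main thread (see the MainThreadDispatcher usage below).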
// Check if this is a speech packet (by magic number)
if (data.Length < 4) return;
uint magic = BitConverter.ToUInt32(data, 0);
if (magic != MAGIC_NUMBER) return;
// Update remote endpoint
_remoteEndPoint = senderEndPoint;
// Process speech packet
ProcessReceivedPacket(data, senderEndPoint);
}
private void ProcessReceivedPacket(byte[] data, IPEndPoint sender)
{
try
{
var packetData = ParseSpeechPacket(data);
if (packetData.HasValue)
{
var packet = packetData.Value;
if (enableDebugLogging)
{
string typeStr = packet.packetType switch
{
PACKET_TYPE_AUDIO_START => "start",
PACKET_TYPE_AUDIO_CHUNK => "chunk",
PACKET_TYPE_AUDIO_END => "end",
PACKET_TYPE_TRANSCRIPT => "transcript",
PACKET_TYPE_FINAL => "final",
_ => "unknown"
};
ConvaiLogger.DebugLog($"📥 Received {typeStr} packet {packet.sequence} from {sender}", ConvaiLogger.LogCategory.Character);
}
switch (packet.packetType)
{
case PACKET_TYPE_AUDIO_START:
HandleAudioStartPacket(packet);
break;
case PACKET_TYPE_AUDIO_CHUNK:
HandleAudioChunkPacket(packet);
break;
case PACKET_TYPE_AUDIO_END:
HandleAudioEndPacket(packet);
break;
case PACKET_TYPE_TRANSCRIPT:
HandleTranscriptPacket(packet);
break;
case PACKET_TYPE_FINAL:
HandleFinalPacket();
break;
}
}
else
{
if (enableDebugLogging)
{
// Check if it's a different magic number
if (data.Length >= 4)
{
uint receivedMagic = BitConverter.ToUInt32(data, 0);
ConvaiLogger.Warn($"❌ Invalid speech packet from {sender}. Expected magic: 0x{MAGIC_NUMBER:X}, Got: 0x{receivedMagic:X}", ConvaiLogger.LogCategory.Character);
}
else
{
ConvaiLogger.Warn($"❌ Packet too small from {sender}: {data.Length} bytes", ConvaiLogger.LogCategory.Character);
}
}
}
}
catch (Exception ex)
{
ConvaiLogger.Error($"Error processing speech packet: {ex.Message}", ConvaiLogger.LogCategory.Character);
}
}
private void HandleAudioStartPacket(SpeechPacket packet)
{
// Start new speech sequence if this is the first start packet
if (packet.sequence == 0 && !_isPlayingSequence)
{
StartSpeechReception();
}
// Create new incoming audio clip
var incomingClip = new IncomingAudioClip(packet.totalSamples, packet.sampleRate, packet.channels, packet.transcript);
incomingClip.hasStart = true;
_incomingClips[packet.sequence] = incomingClip;
if (enableDebugLogging)
ConvaiLogger.DebugLog($"🎵 Started receiving audio clip {packet.sequence}: {packet.totalSamples} samples, '{packet.transcript}'", ConvaiLogger.LogCategory.Character);
}
private void HandleAudioChunkPacket(SpeechPacket packet)
{
if (!_incomingClips.TryGetValue(packet.sequence, out var incomingClip)) return;
// Convert short samples back to float and copy to the correct position
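// PCM16 → float: dividing by short.MaxValue maps [-32768, 32767] to roughly [-1.0, 1.0].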
if (packet.audioSamples != null && packet.startSample + packet.chunkSampleCount <= incomingClip.totalSamples)
{
for (int i = 0; i < packet.chunkSampleCount; i++)
{
int targetIndex = packet.startSample + i;
if (targetIndex < incomingClip.audioData.Length)
{
incomingClip.audioData[targetIndex] = packet.audioSamples[i] / (float)short.MaxValue;
}
}
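// NOTE: receivedSamples assumes each chunk arrives at most once; duplicated
// UDP packets would inflate this counter (no deduplication is performed).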
incomingClip.receivedSamples += packet.chunkSampleCount;
}
}
private void HandleAudioEndPacket(SpeechPacket packet)
{
if (!_incomingClips.TryGetValue(packet.sequence, out var incomingClip)) return;
incomingClip.hasEnd = true;
// Mark complete once both start and end markers have arrived.
// Sample completeness is not verified here; clips still missing chunks at FINAL time are handled by ProcessIncompleteClips.
if (incomingClip.hasStart && incomingClip.hasEnd)
{
incomingClip.isComplete = true;
// Create the AudioClip
CreateAndQueueAudioClip(incomingClip, packet.sequence);
// Remove from incoming clips
_incomingClips.Remove(packet.sequence);
}
}
private void HandleTranscriptPacket(SpeechPacket packet)
{
if (showTranscripts && !string.IsNullOrEmpty(packet.transcript))
{
MainThreadDispatcher.Instance.RunOnMainThread(() => {
OnTranscriptReceived?.Invoke(packet.transcript);
if (enableDebugLogging)
ConvaiLogger.Info($"📝 Remote NPC said: '{packet.transcript}'", ConvaiLogger.LogCategory.Character);
});
}
}
private void HandleFinalPacket()
{
// Process any remaining incomplete clips
ProcessIncompleteClips();
// Add final marker to queue
_playbackQueue.Enqueue(new ReconstructedAudioClip
{
audioClip = null,
transcript = "",
isFinal = true
});
StopSpeechReception();
}
private void ProcessIncompleteClips()
{
// Try to create AudioClips from any clips that might be mostly complete
var keysToRemove = new List<int>();
foreach (var kvp in _incomingClips)
{
var incomingClip = kvp.Value;
// If we received a reasonable amount of data, try to create the clip
if (incomingClip.receivedSamples > incomingClip.totalSamples * 0.8f) // 80% received
{
CreateAndQueueAudioClip(incomingClip, kvp.Key);
keysToRemove.Add(kvp.Key);
}
}
foreach (var key in keysToRemove)
{
_incomingClips.Remove(key);
}
}
private void CreateAndQueueAudioClip(IncomingAudioClip incomingClip, int sequence)
{
try
{
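// NOTE: AudioClip.Create and SetData are main-thread-only Unity APIs; this is
// assumed to be reached on the main thread (or via a main-thread dispatcher).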
// Create AudioClip
AudioClip clip = AudioClip.Create($"RemoteSpeech_{sequence}",
incomingClip.totalSamples, incomingClip.channels, incomingClip.sampleRate, false);
clip.SetData(incomingClip.audioData, 0);
// Queue for playback
_playbackQueue.Enqueue(new ReconstructedAudioClip
{
audioClip = clip,
transcript = incomingClip.transcript,
isFinal = false
});
OnAudioClipReceived?.Invoke(clip);
// Update metrics
_totalClipsReceived++;
_lastClipReceivedTime = DateTime.UtcNow;
if (enableDebugLogging)
ConvaiLogger.DebugLog($"✅ Reconstructed audio clip {sequence}: {clip.length:F2}s, '{incomingClip.transcript}'", ConvaiLogger.LogCategory.Character);
}
catch (Exception ex)
{
ConvaiLogger.Error($"Failed to create audio clip from sequence {sequence}: {ex.Message}", ConvaiLogger.LogCategory.Character);
}
}
private void ProcessPlaybackQueue()
{
// If not currently playing and we have queued clips, start playing
if (!_isPlayingSequence && _playbackQueue.Count > 0 && speechAudioSource != null)
{
PlayNextAudioClip();
}
// Check if current clip finished playing
if (_isPlayingSequence && speechAudioSource != null && !speechAudioSource.isPlaying)
{
// Current clip finished, play next one if available
if (_playbackQueue.Count > 0)
{
PlayNextAudioClip();
}
else
{
_isPlayingSequence = false;
}
}
}
private void PlayNextAudioClip()
{
if (_playbackQueue.Count == 0 || speechAudioSource == null) return;
var reconstructedClip = _playbackQueue.Dequeue();
if (reconstructedClip.isFinal)
{
_isPlayingSequence = false;
ConvaiLogger.Info("🔊 Finished playing remote speech sequence", ConvaiLogger.LogCategory.Character);
return;
}
if (reconstructedClip.audioClip != null)
{
speechAudioSource.clip = reconstructedClip.audioClip;
speechAudioSource.Play();
_isPlayingSequence = true;
if (enableDebugLogging)
ConvaiLogger.DebugLog($"🔊 Playing remote speech: {reconstructedClip.audioClip.length:F2}s, '{reconstructedClip.transcript}'", ConvaiLogger.LogCategory.Character);
}
}
private void StartSpeechReception()
{
_isPlayingSequence = false;
_currentSequence = 0;
_incomingClips.Clear();
_playbackQueue.Clear();
OnSpeechReceiving?.Invoke(true);
ConvaiLogger.Info("🔊 Started receiving remote NPC speech", ConvaiLogger.LogCategory.Character);
}
private void StopSpeechReception()
{
OnSpeechReceiving?.Invoke(false);
ConvaiLogger.Info("🔊 Stopped receiving remote NPC speech", ConvaiLogger.LogCategory.Character);
}
private void StopSpeechPlayback()
{
if (speechAudioSource != null && speechAudioSource.isPlaying)
{
speechAudioSource.Stop();
}
_isPlayingSequence = false;
_playbackQueue.Clear();
_incomingClips.Clear();
}
private SpeechPacket? ParseSpeechPacket(byte[] data)
{
if (data.Length < 9) // Minimum header: magic (4) + type (1) + sequence (4); END/FINAL packets are header-only
return null;
try
{
int offset = 0;
// Read magic number
uint magic = BitConverter.ToUInt32(data, offset);
offset += 4;
if (magic != MAGIC_NUMBER)
return null;
// Read packet type
byte packetType = data[offset];
offset += 1;
// Read sequence
int sequence = BitConverter.ToInt32(data, offset);
offset += 4;
var packet = new SpeechPacket
{
magicNumber = magic,
packetType = packetType,
sequence = sequence
};
// Parse based on packet type
switch (packetType)
{
case PACKET_TYPE_AUDIO_START:
if (data.Length < offset + 16) return null; // Need additional fields
packet.totalSamples = BitConverter.ToInt32(data, offset);
offset += 4;
packet.sampleRate = BitConverter.ToInt32(data, offset);
offset += 4;
packet.channels = BitConverter.ToInt32(data, offset);
offset += 4;
int transcriptLength = BitConverter.ToInt32(data, offset);
offset += 4;
if (transcriptLength > 0 && data.Length >= offset + transcriptLength)
{
packet.transcript = System.Text.Encoding.UTF8.GetString(data, offset, transcriptLength);
}
break;
case PACKET_TYPE_AUDIO_CHUNK:
if (data.Length < offset + 8) return null; // Need start sample + count
packet.startSample = BitConverter.ToInt32(data, offset);
offset += 4;
packet.chunkSampleCount = BitConverter.ToInt32(data, offset);
offset += 4;
// Read audio data
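// BitConverter uses the host byte order; sender and receiver are assumed to
// run on little-endian platforms (typical for Unity targets).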
if (packet.chunkSampleCount > 0 && data.Length >= offset + packet.chunkSampleCount * 2)
{
packet.audioSamples = new short[packet.chunkSampleCount];
for (int i = 0; i < packet.chunkSampleCount; i++)
{
packet.audioSamples[i] = BitConverter.ToInt16(data, offset);
offset += 2;
}
}
break;
case PACKET_TYPE_AUDIO_END:
case PACKET_TYPE_FINAL:
// These packets have no additional data beyond the header
break;
case PACKET_TYPE_TRANSCRIPT:
// Similar to start packet transcript handling
if (data.Length >= offset + 4)
{
int transcriptLen = BitConverter.ToInt32(data, offset);
offset += 4;
if (transcriptLen > 0 && data.Length >= offset + transcriptLen)
{
packet.transcript = System.Text.Encoding.UTF8.GetString(data, offset, transcriptLen);
}
}
break;
default:
return null;
}
return packet;
}
catch (Exception ex)
{
ConvaiLogger.Error($"Error parsing speech packet V2: {ex.Message}", ConvaiLogger.LogCategory.Character);
return null;
}
}
// Public properties for debugging
public bool IsListening => _isListening;
public bool IsPlayingSequence => _isPlayingSequence;
public int QueuedClipCount => _playbackQueue.Count;
public int IncomingClipCount => _incomingClips.Count;
// Debug methods
public void ShowNetworkStatus()
{
ConvaiLogger.Info($"=== Speech Receiver Status ===", ConvaiLogger.LogCategory.Character);
ConvaiLogger.Info($"Listening: {_isListening} on port {listenPort}", ConvaiLogger.LogCategory.Character);
ConvaiLogger.Info($"Playing Sequence: {_isPlayingSequence}", ConvaiLogger.LogCategory.Character);
ConvaiLogger.Info($"Current Sequence: {_currentSequence}", ConvaiLogger.LogCategory.Character);
ConvaiLogger.Info($"Queued Clips: {_playbackQueue.Count}", ConvaiLogger.LogCategory.Character);
ConvaiLogger.Info($"Incoming Clips: {_incomingClips.Count}", ConvaiLogger.LogCategory.Character);
if (speechAudioSource != null)
{
ConvaiLogger.Info($"Audio Source: {speechAudioSource.name} (Volume: {speechAudioSource.volume})", ConvaiLogger.LogCategory.Character);
}
}
}
}