# VBAF.RL.PPO.ps1
# (BUG FIX: this filename line was previously a bare token, which PowerShell
#  executed as a command and raised CommandNotFoundException on load.)

#Requires -Version 5.1
<#
.SYNOPSIS
    Proximal Policy Optimization (PPO) Agent for Reinforcement Learning
.DESCRIPTION
    Implements the PPO algorithm combining:
      - Actor Network : maps state -> action probabilities (policy)
      - Critic Network : maps state -> value estimate (baseline)
      - GAE : Generalized Advantage Estimation
      - Clipped update : limits policy change per update step
    Requires VBAF.Core.AllClasses.ps1 to be loaded first (via VBAF.LoadAll.ps1).
.NOTES
    Part of VBAF - Phase 3 Reinforcement Learning Module
    PS 5.1 compatible - dependency injection pattern used
    to avoid parse-time type resolution errors in classes.
#>

# Set base path
# NOTE(review): $basePath is not referenced anywhere in this file - confirm
# whether scripts that dot-source this one depend on it before removing.
$basePath = $PSScriptRoot

# ============================================================
# Hyper-parameter bundle shared by the PPO agent and the training runner.
# All values are plain defaults; callers overwrite them field-by-field.
class PPOConfig {
    # --- Network architecture ---
    [int]    $StateSize    = 4          # observation vector length
    [int]    $ActionSize   = 2          # number of discrete actions
    [int[]]  $ActorHidden  = @(64, 64)  # hidden layer widths (policy net)
    [int[]]  $CriticHidden = @(64, 64)  # hidden layer widths (value net)

    # --- Optimization ---
    [double] $LearningRate = 0.001
    [double] $Gamma        = 0.99       # reward discount factor
    [double] $LambdaGAE    = 0.95       # GAE exponential smoothing
    [double] $ClipEpsilon  = 0.2        # PPO surrogate clip range
    [double] $EntropyBonus = 0.01       # exploration bonus weight

    # --- Schedule ---
    [int]    $UpdateEpochs = 4          # training passes per rollout
    [int]    $RolloutSteps = 64         # transitions collected per update
    [int]    $MaxSteps     = 200        # per-episode step cap
}

# ============================================================
# ============================================================
# PPOAgent - clipped-surrogate PPO with GAE for a discrete action space.
# Actor and Critic networks are injected pre-built (see Invoke-PPOTraining):
# PS 5.1 classes cannot reference types declared in other files at parse time.
# NOTE(review): the "actor update" below is a supervised nudge toward an
# adjusted probability target rather than a true policy-gradient step on the
# clipped surrogate - confirm this approximation is intentional.
class PPOAgent {
    # [object] for all cross-file types - PS 5.1 requirement
    [object] $Actor    # policy net: state -> one raw logit per action (Predict/TrainSample)
    [object] $Critic   # value net : state -> single-element output array
    [object] $Config   # PPOConfig: gamma, lambda, clip epsilon, epochs, ...

    # Stats
    [int]    $TotalSteps     = 0     # transitions stored across all rollouts
    [int]    $TotalEpisodes  = 0
    [int]    $UpdateCount    = 0     # completed calls to Update()
    [double] $LastActorLoss  = 0.0   # mean losses from the most recent update
    [double] $LastCriticLoss = 0.0
    [double] $LastEntropy    = 0.0

    [System.Collections.Generic.List[double]] $EpisodeRewards
    [System.Collections.Generic.List[double]] $ActorLossHistory
    [System.Collections.Generic.List[double]] $CriticLossHistory

    # Rollout buffer - parallel lists, one entry per stored transition.
    hidden [System.Collections.ArrayList] $States
    hidden [System.Collections.ArrayList] $Actions
    hidden [System.Collections.ArrayList] $Rewards
    hidden [System.Collections.ArrayList] $Values    # critic estimate at time of action
    hidden [System.Collections.ArrayList] $LogProbs  # log pi_old(a|s), frozen for the PPO ratio
    hidden [System.Collections.ArrayList] $Dones

    hidden [System.Random] $Rng

    # -------------------------------------------------------
    # Constructor - receives pre-built networks (PS 5.1 safe).
    # config : PPOConfig; actor/critic : networks exposing
    # Predict([double[]]) and TrainSample([double[]], target).
    # -------------------------------------------------------
    PPOAgent([object]$config, [object]$actor, [object]$critic) {
        $this.Config  = $config
        $this.Actor   = $actor
        $this.Critic  = $critic
        $this.Rng     = [System.Random]::new()

        $this.EpisodeRewards    = [System.Collections.Generic.List[double]]::new()
        $this.ActorLossHistory  = [System.Collections.Generic.List[double]]::new()
        $this.CriticLossHistory = [System.Collections.Generic.List[double]]::new()

        $this.ClearRollout()

        Write-Host "✅ PPOAgent created" -ForegroundColor Green
        Write-Host " State size : $($config.StateSize)"                          -ForegroundColor Cyan
        Write-Host " Action size : $($config.ActionSize)"                         -ForegroundColor Cyan
        Write-Host " Actor hidden : $($config.ActorHidden -join ' -> ')"          -ForegroundColor Cyan
        Write-Host " Critic hidden : $($config.CriticHidden -join ' -> ')"          -ForegroundColor Cyan
        Write-Host " Clip epsilon : $($config.ClipEpsilon)"                        -ForegroundColor Cyan
        Write-Host " Rollout steps : $($config.RolloutSteps)"                       -ForegroundColor Cyan
    }

    # -------------------------------------------------------
    # Softmax helper - converts raw logits to a probability
    # distribution. Subtracts the max logit before Exp for
    # numerical stability (prevents overflow).
    # -------------------------------------------------------
    hidden [double[]] Softmax([double[]]$logits) {
        $max  = ($logits | Measure-Object -Maximum).Maximum
        # @(0.0) * n builds an n-element zero array (PS array replication)
        $exps = @(0.0) * $logits.Length
        $sum  = 0.0
        for ($i = 0; $i -lt $logits.Length; $i++) {
            $exps[$i] = [Math]::Exp($logits[$i] - $max)
            $sum += $exps[$i]
        }
        $probs = @(0.0) * $logits.Length
        for ($i = 0; $i -lt $logits.Length; $i++) {
            $probs[$i] = $exps[$i] / $sum
        }
        return $probs
    }

    # -------------------------------------------------------
    # Sample an action index from the distribution via
    # inverse-CDF sampling on a uniform draw.
    # -------------------------------------------------------
    hidden [int] SampleAction([double[]]$probs) {
        $r   = $this.Rng.NextDouble()
        $cum = 0.0
        for ($i = 0; $i -lt $probs.Length; $i++) {
            $cum += $probs[$i]
            if ($r -le $cum) { return $i }
        }
        # Fallback for floating-point rounding when cum sums to < 1.0
        return $probs.Length - 1
    }

    # -------------------------------------------------------
    # Log probability of the chosen action; probability is
    # floored at 1e-8 to avoid Log(0) = -Infinity.
    # -------------------------------------------------------
    hidden [double] LogProb([double[]]$probs, [int]$action) {
        $p = [Math]::Max($probs[$action], 1e-8)
        return [Math]::Log($p)
    }

    # -------------------------------------------------------
    # Shannon entropy (in nats) of the action distribution;
    # higher entropy = more exploratory policy.
    # -------------------------------------------------------
    hidden [double] Entropy([double[]]$probs) {
        $h = 0.0
        foreach ($p in $probs) {
            if ($p -gt 1e-8) { $h -= $p * [Math]::Log($p) }
        }
        return $h
    }

    # -------------------------------------------------------
    # Select action stochastically - returns hashtable with
    # keys Action (int), LogProb (double), Value (double,
    # critic baseline), Probs (double[]).
    # -------------------------------------------------------
    [hashtable] Act([double[]]$state) {
        $logits = $this.Actor.Predict($state)
        $probs  = $this.Softmax($logits)
        $action = $this.SampleAction($probs)
        $logP   = $this.LogProb($probs, $action)

        # Critic value estimate (network outputs a 1-element array)
        $valueOut = $this.Critic.Predict($state)
        $value    = $valueOut[0]

        return @{ Action = $action; LogProb = $logP; Value = $value; Probs = $probs }
    }

    # -------------------------------------------------------
    # Greedy (argmax) action for evaluation - no sampling.
    # -------------------------------------------------------
    [int] Predict([double[]]$state) {
        $logits = $this.Actor.Predict($state)
        $probs  = $this.Softmax($logits)
        $best   = 0
        for ($i = 1; $i -lt $probs.Length; $i++) {
            if ($probs[$i] -gt $probs[$best]) { $best = $i }
        }
        return $best
    }

    # -------------------------------------------------------
    # Store one transition in the rollout buffer. value and
    # logProb must come from the same Act() call that chose
    # the action (they are the "old policy" quantities).
    # -------------------------------------------------------
    [void] StoreTransition([double[]]$state, [int]$action, [double]$reward,
                           [double]$value, [double]$logProb, [bool]$done) {
        $this.States.Add($state)
        $this.Actions.Add($action)
        $this.Rewards.Add($reward)
        $this.Values.Add($value)
        $this.LogProbs.Add($logProb)
        $this.Dones.Add($done)
        $this.TotalSteps++
    }

    # -------------------------------------------------------
    # Reset the rollout buffer (fresh ArrayLists).
    # -------------------------------------------------------
    [void] ClearRollout() {
        $this.States   = [System.Collections.ArrayList]::new()
        $this.Actions  = [System.Collections.ArrayList]::new()
        $this.Rewards  = [System.Collections.ArrayList]::new()
        $this.Values   = [System.Collections.ArrayList]::new()
        $this.LogProbs = [System.Collections.ArrayList]::new()
        $this.Dones    = [System.Collections.ArrayList]::new()
    }

    # -------------------------------------------------------
    # Compute GAE advantages and discounted returns by a
    # backward recursion over the buffer:
    #   delta_t = r_t + gamma*V(s_{t+1}) - V(s_t)
    #   A_t     = delta_t + gamma*lambda*A_{t+1}
    # lastValue = critic estimate of the state after the
    # rollout ends (bootstrap; ignored at terminal steps).
    # Returns @{ Advantages = double[]; Returns = double[] }
    # with advantages normalized to ~zero mean / unit std.
    # -------------------------------------------------------
    hidden [hashtable] ComputeGAE([double]$lastValue) {
        $n          = $this.Rewards.Count
        $advantages = @(0.0) * $n
        $returns    = @(0.0) * $n
        $gaeVal     = 0.0

        for ($t = $n - 1; $t -ge 0; $t--) {
            $done     = [bool]$this.Dones[$t]
            $reward   = [double]$this.Rewards[$t]
            $value    = [double]$this.Values[$t]
            $nextVal  = if ($t -eq $n - 1) { $lastValue } else { [double]$this.Values[$t + 1] }

            # Episode boundary: no bootstrap across terminals, and the
            # running GAE accumulator must not leak between episodes.
            if ($done) { $nextVal = 0.0; $gaeVal = 0.0 }

            $delta        = $reward + $this.Config.Gamma * $nextVal - $value
            $gaeVal       = $delta + $this.Config.Gamma * $this.Config.LambdaGAE * $gaeVal
            $advantages[$t] = $gaeVal
            $returns[$t]    = $gaeVal + $value
        }

        # Normalize advantages (1e-8 guards division by a zero std;
        # NOTE(review): with n == 1 the std is ~0 and the normalized
        # advantage can be very large - confirm acceptable)
        $mean   = ($advantages | Measure-Object -Average).Average
        $sq     = $advantages | ForEach-Object { ($_ - $mean) * ($_ - $mean) }
        $stdDev = [Math]::Sqrt(($sq | Measure-Object -Average).Average + 1e-8)
        for ($i = 0; $i -lt $n; $i++) {
            $advantages[$i] = ($advantages[$i] - $mean) / $stdDev
        }

        return @{ Advantages = $advantages; Returns = $returns }
    }

    # -------------------------------------------------------
    # PPO Update - train actor and critic on the collected
    # rollout for Config.UpdateEpochs passes, then clear the
    # buffer. lastValue bootstraps GAE past the buffer end.
    # -------------------------------------------------------
    [void] Update([double]$lastValue) {
        $gae         = $this.ComputeGAE($lastValue)
        $advantages  = $gae.Advantages
        $returns     = $gae.Returns
        $n           = $this.States.Count

        $totalActorLoss  = 0.0
        $totalCriticLoss = 0.0
        $totalEntropy    = 0.0
        $updateSamples   = 0

        for ($epoch = 0; $epoch -lt $this.Config.UpdateEpochs; $epoch++) {
            for ($t = 0; $t -lt $n; $t++) {
                $state      = [double[]]$this.States[$t]
                $action     = [int]$this.Actions[$t]
                $oldLogProb = [double]$this.LogProbs[$t]
                $advantage  = $advantages[$t]
                $ret        = $returns[$t]

                # ---- Critic update ----
                # Regress V(s) toward the discounted return (1-element target)
                $criticTarget    = @($ret)
                $criticLoss      = $this.Critic.TrainSample($state, $criticTarget)
                $totalCriticLoss += $criticLoss

                # ---- Actor update ----
                # Re-evaluate current policy probabilities for this state
                $logits   = $this.Actor.Predict($state)
                $probs    = $this.Softmax($logits)
                $newLogP  = $this.LogProb($probs, $action)
                $entropy  = $this.Entropy($probs)
                $totalEntropy += $entropy

                # PPO ratio pi_new/pi_old; clipRatio is the ratio clamped
                # to [1 - eps, 1 + eps] (written as clamp(ratio-1) + 1)
                $ratio      = [Math]::Exp($newLogP - $oldLogProb)
                $clipRatio  = [Math]::Max($this.Config.ClipEpsilon * -1,
                              [Math]::Min($this.Config.ClipEpsilon,
                              $ratio - 1.0)) + 1.0

                # Build actor target: nudge probability of taken action
                # in direction of advantage, clipped by ratio.
                # NOTE(review): [Math]::Min(ratio, clipRatio) mirrors the
                # pessimistic min of the PPO objective only for positive
                # advantages; the 0.1 step size and the entropy term folded
                # into the nudge are heuristics - confirm intended.
                $effectiveRatio = [Math]::Min($ratio, $clipRatio)
                $actorTarget    = $probs.Clone()
                $nudge          = $advantage * $effectiveRatio * 0.1 + $this.Config.EntropyBonus * $entropy
                $actorTarget[$action] = [Math]::Max(0.01,
                                        [Math]::Min(0.99,
                                        $probs[$action] + $nudge))

                # Renormalize so the target is a valid distribution
                $sum = ($actorTarget | Measure-Object -Sum).Sum
                for ($i = 0; $i -lt $actorTarget.Length; $i++) {
                    $actorTarget[$i] = $actorTarget[$i] / $sum
                }

                $actorLoss      = $this.Actor.TrainSample($state, $actorTarget)
                $totalActorLoss += $actorLoss
                $updateSamples++
            }
        }

        if ($updateSamples -gt 0) {
            $this.LastActorLoss  = $totalActorLoss  / $updateSamples
            $this.LastCriticLoss = $totalCriticLoss / $updateSamples
            $this.LastEntropy    = $totalEntropy     / $updateSamples
            $this.ActorLossHistory.Add($this.LastActorLoss)
            $this.CriticLossHistory.Add($this.LastCriticLoss)
        }

        $this.UpdateCount++
        $this.ClearRollout()
    }

    # -------------------------------------------------------
    # Record the end of an episode and its total reward.
    # -------------------------------------------------------
    [void] EndEpisode([double]$totalReward) {
        $this.TotalEpisodes++
        $this.EpisodeRewards.Add($totalReward)
    }

    # -------------------------------------------------------
    # Summary statistics; Avg* fields are trailing-100 means.
    # -------------------------------------------------------
    [hashtable] GetStats() {
        $avgReward      = 0.0
        $avgActorLoss   = 0.0
        $avgCriticLoss  = 0.0

        if ($this.EpisodeRewards.Count -gt 0) {
            $slice     = $this.EpisodeRewards | Select-Object -Last 100
            $avgReward = ($slice | Measure-Object -Average).Average
        }
        if ($this.ActorLossHistory.Count -gt 0) {
            $slice         = $this.ActorLossHistory | Select-Object -Last 100
            $avgActorLoss  = ($slice | Measure-Object -Average).Average
        }
        if ($this.CriticLossHistory.Count -gt 0) {
            $slice          = $this.CriticLossHistory | Select-Object -Last 100
            $avgCriticLoss  = ($slice | Measure-Object -Average).Average
        }

        return @{
            TotalEpisodes   = $this.TotalEpisodes
            TotalSteps      = $this.TotalSteps
            UpdateCount     = $this.UpdateCount
            LastActorLoss   = [Math]::Round($this.LastActorLoss,  6)
            LastCriticLoss  = [Math]::Round($this.LastCriticLoss, 6)
            LastEntropy     = [Math]::Round($this.LastEntropy,     4)
            AvgReward100    = [Math]::Round($avgReward,            3)
            AvgActorLoss    = [Math]::Round($avgActorLoss,         6)
            AvgCriticLoss   = [Math]::Round($avgCriticLoss,        6)
        }
    }

    # -------------------------------------------------------
    # Pretty-print GetStats() to the host in a boxed table.
    # -------------------------------------------------------
    [void] PrintStats() {
        $s = $this.GetStats()
        Write-Host ""
        Write-Host "╔══════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host "║ PPO Agent Statistics ║" -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Episodes : {0,-20}║" -f $s.TotalEpisodes)   -ForegroundColor White
        Write-Host ("║ Total Steps : {0,-20}║" -f $s.TotalSteps)      -ForegroundColor White
        Write-Host ("║ PPO Updates : {0,-20}║" -f $s.UpdateCount)     -ForegroundColor White
        Write-Host ("║ Avg Reward : {0,-20}║" -f $s.AvgReward100)    -ForegroundColor Green
        Write-Host ("║ Entropy : {0,-20}║" -f $s.LastEntropy)     -ForegroundColor Yellow
        Write-Host ("║ Actor Loss : {0,-20}║" -f $s.LastActorLoss)   -ForegroundColor Magenta
        Write-Host ("║ Critic Loss : {0,-20}║" -f $s.LastCriticLoss)  -ForegroundColor Magenta
        Write-Host "╚══════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }
}

# ============================================================
# CartPole-style environment (same as DQN, self-contained)
# ============================================================
# CartPole-style environment (same dynamics as the DQN module, self-contained).
# State is (cart position, cart velocity, pole angle, pole angular velocity);
# actions are 0 = push left, 1 = push right.
class PPOEnvironment {
    [double] $Position
    [double] $Velocity
    [double] $Angle
    [double] $AngularVelocity
    [int]    $Steps
    [int]    $MaxSteps
    hidden [System.Random] $Rng

    PPOEnvironment() {
        $this.Rng      = [System.Random]::new()
        $this.MaxSteps = 200
        $this.Reset()
    }

    # Uniform noise in [-0.05, 0.05) used to jitter the initial state.
    hidden [double] SmallNoise() {
        return ($this.Rng.NextDouble() - 0.5) * 0.1
    }

    # Start a new episode near the upright equilibrium; returns initial state.
    [double[]] Reset() {
        $this.Position        = $this.SmallNoise()
        $this.Velocity        = $this.SmallNoise()
        $this.Angle           = $this.SmallNoise()
        $this.AngularVelocity = $this.SmallNoise()
        $this.Steps           = 0
        return $this.GetState()
    }

    # Current observation as a 4-element vector.
    [double[]] GetState() {
        return @($this.Position, $this.Velocity, $this.Angle, $this.AngularVelocity)
    }

    # Advance one timestep. Returns @{ NextState; Reward; Done }.
    # Reward is 1.0 per non-terminal step, 0.0 on the terminal step.
    [hashtable] Step([int]$action) {
        $this.Steps++

        # Physics constants (classic cart-pole values)
        $g        = 9.8     # gravity
        $massCart = 1.0
        $massPole = 0.1
        $massSum  = $massCart + $massPole
        $poleHalf = 0.25    # half pole length
        $tau      = 0.02    # integration timestep

        $push = if ($action -eq 1) { 1.0 } else { -1.0 }
        $c    = [Math]::Cos($this.Angle)
        $s    = [Math]::Sin($this.Angle)

        # Equations of motion for the cart-pole system
        $common = ($push + $massPole * $poleHalf * $this.AngularVelocity * $this.AngularVelocity * $s) / $massSum
        $angAcc = ($g * $s - $c * $common) / ($poleHalf * (4.0/3.0 - $massPole * $c * $c / $massSum))
        $linAcc = $common - $massPole * $poleHalf * $angAcc * $c / $massSum

        # Euler integration
        $this.Position        += $tau * $this.Velocity
        $this.Velocity        += $tau * $linAcc
        $this.Angle           += $tau * $this.AngularVelocity
        $this.AngularVelocity += $tau * $angAcc

        # Termination: time limit, cart off track, or pole fallen over
        $timedOut    = $this.Steps -ge $this.MaxSteps
        $offTrack    = [Math]::Abs($this.Position) -gt 2.4
        $poleDown    = [Math]::Abs($this.Angle)    -gt 0.21
        $done        = $timedOut -or $offTrack -or $poleDown

        $reward = 0.0
        if (-not $done) { $reward = 1.0 }

        return @{ NextState = $this.GetState(); Reward = $reward; Done = $done }
    }
}

# ============================================================
# TRAINING RUNNER
# All external types instantiated HERE (script level) - PS 5.1 safe
# ============================================================
function Invoke-PPOTraining {
    <#
    .SYNOPSIS
        Trains a PPOAgent on the built-in CartPole-style PPOEnvironment.
    .DESCRIPTION
        Builds a PPOConfig, instantiates actor/critic [NeuralNetwork] objects
        at script level (PS 5.1 safe - class bodies cannot name cross-file
        types), injects them into a PPOAgent, and runs the standard
        collect-rollout / PPO-update loop.
        Requires [NeuralNetwork] to be loaded first (via VBAF.LoadAll.ps1).
    .PARAMETER Episodes
        Number of training episodes (default 100; FastMode lowers to 50
        when left at the default).
    .PARAMETER PrintEvery
        Print a progress line every N episodes. Values less than 1 disable
        progress output instead of crashing (previously `$ep % 0` threw a
        divide-by-zero RuntimeException).
    .PARAMETER Quiet
        Suppress per-episode progress lines.
    .PARAMETER FastMode
        Shrinks the networks and episode/rollout lengths for a quick smoke test.
    .OUTPUTS
        The trained PPOAgent, wrapped in a one-element array (use [-1] to unwrap).
    #>
    param(
        [int]    $Episodes   = 100,
        [int]    $PrintEvery = 10,
        [switch] $Quiet,
        [switch] $FastMode
    )

    # ---- Settings ----
    $actorHidden  = @(64, 64)
    $criticHidden = @(64, 64)
    $maxSteps     = 200
    $rolloutSteps = 64

    if ($FastMode) {
        $actorHidden  = @(16, 16)
        $criticHidden = @(16, 16)
        $maxSteps     = 30
        $rolloutSteps = 32
        # Only override Episodes/PrintEvery if the caller left them at defaults
        if ($Episodes  -eq 100) { $Episodes   = 50 }
        if ($PrintEvery -eq 10) { $PrintEvery  = 5  }
        Write-Host ""
        Write-Host "⚡ FAST MODE ENABLED" -ForegroundColor Yellow
        Write-Host " Actor/Critic : 16 -> 16" -ForegroundColor Yellow
        Write-Host " MaxSteps : $maxSteps"  -ForegroundColor Yellow
        Write-Host " RolloutSteps : $rolloutSteps" -ForegroundColor Yellow
        Write-Host " Episodes : $Episodes"  -ForegroundColor Yellow
    }

    Write-Host ""
    Write-Host "🚀 VBAF PPO Training Started" -ForegroundColor Green
    Write-Host " Episodes: $Episodes"        -ForegroundColor Cyan
    Write-Host ""

    # ---- Config ----
    $config                 = [PPOConfig]::new()
    $config.StateSize       = 4
    $config.ActionSize      = 2
    $config.ActorHidden     = $actorHidden
    $config.CriticHidden    = $criticHidden
    $config.LearningRate    = 0.001
    $config.Gamma           = 0.99
    $config.LambdaGAE      = 0.95
    $config.ClipEpsilon     = 0.2
    $config.EntropyBonus    = 0.01
    $config.UpdateEpochs    = 4
    $config.RolloutSteps    = $rolloutSteps
    $config.MaxSteps        = $maxSteps

    # ---- Build layer arrays: input -> hidden... -> output ----
    $actorLayers  = [System.Collections.Generic.List[int]]::new()
    $actorLayers.Add($config.StateSize)
    foreach ($h in $config.ActorHidden)  { $actorLayers.Add($h) }
    $actorLayers.Add($config.ActionSize)

    $criticLayers = [System.Collections.Generic.List[int]]::new()
    $criticLayers.Add($config.StateSize)
    foreach ($h in $config.CriticHidden) { $criticLayers.Add($h) }
    $criticLayers.Add(1)   # Critic outputs single value

    # ---- Instantiate networks at script level (PS 5.1 safe) ----
    $actor  = [NeuralNetwork]::new($actorLayers.ToArray(),  $config.LearningRate)
    $critic = [NeuralNetwork]::new($criticLayers.ToArray(), $config.LearningRate)

    # ---- Inject into PPOAgent ----
    $agent = [PPOAgent]::new($config, $actor, $critic)

    $env          = [PPOEnvironment]::new()
    $env.MaxSteps = $maxSteps

    $bestReward  = 0.0
    $stepCounter = 0   # global step count, drives rollout-full updates

    for ($ep = 1; $ep -le $Episodes; $ep++) {
        $state       = $env.Reset()
        $totalReward = 0.0
        $done        = $false

        while (-not $done) {
            # Get action (plus old log-prob and value baseline) from the agent
            $result  = $agent.Act($state)
            $action  = $result.Action
            $logProb = $result.LogProb
            $value   = $result.Value

            # Step environment
            $step    = $env.Step($action)
            $ns      = $step.NextState
            $reward  = $step.Reward
            $done    = $step.Done

            # Store in rollout buffer
            $agent.StoreTransition($state, $action, $reward, $value, $logProb, $done)
            $state        = $ns
            $totalReward += $reward
            $stepCounter++

            # Update when rollout buffer is full; bootstrap GAE with the
            # critic's estimate of the state after the buffer ends (GAE
            # ignores it for transitions flagged done)
            if ($stepCounter % $config.RolloutSteps -eq 0) {
                $lastValOut = $agent.Critic.Predict($state)
                $lastVal    = $lastValOut[0]
                $agent.Update($lastVal)
            }
        }

        $agent.EndEpisode($totalReward)
        if ($totalReward -gt $bestReward) { $bestReward = $totalReward }

        # FIX: guard $PrintEvery -gt 0 so -PrintEvery 0 cannot trigger a
        # divide-by-zero in the modulus below
        if (-not $Quiet -and $PrintEvery -gt 0 -and ($ep % $PrintEvery -eq 0)) {
            $stats = $agent.GetStats()
            Write-Host (" Ep {0,4} Reward: {1,5:F0} Best: {2,5:F0} Updates: {3,4} Entropy: {4:F3} CriticLoss: {5:F5}" -f `
                $ep, $totalReward, $bestReward,
                $stats.UpdateCount, $stats.LastEntropy, $stats.LastCriticLoss) -ForegroundColor White
        }
    }

    # Final update on remaining rollout (all episodes ended, so the terminal
    # transitions make the 0.0 bootstrap value irrelevant)
    if ($agent.States.Count -gt 0) {
        $agent.Update(0.0)
    }

    Write-Host ""
    Write-Host "✅ Training Complete!" -ForegroundColor Green
    $agent.PrintStats()
    ,$agent  # comma operator forces return as single object in PS 5.1
}

# ============================================================
# MANUAL TEST RECIPE (run in a PS 5.1+ session):
#   1. Run VBAF.LoadAll.ps1 (provides [NeuralNetwork])
#   2. $agent = (Invoke-PPOTraining -Episodes 20 -PrintEvery 2 -FastMode)[-1]
#   3. $agent = (Invoke-PPOTraining -Episodes 50 -PrintEvery 5 -FastMode)[-1]
#   4. $agent.PrintStats()
# ============================================================
# Load-time banner: confirms the file parsed and lists its exports.
Write-Host "📦 VBAF.RL.PPO.ps1 loaded" -ForegroundColor Green
Write-Host " Classes : PPOConfig, PPOAgent, PPOEnvironment"              -ForegroundColor Cyan
Write-Host " Function: Invoke-PPOTraining"                               -ForegroundColor Cyan
Write-Host ""
Write-Host " Quick start:"                                               -ForegroundColor Yellow
Write-Host ' $agent = (Invoke-PPOTraining -Episodes 50 -PrintEvery 5 -FastMode)[-1]'  -ForegroundColor White
Write-Host ' $agent.PrintStats()'                                        -ForegroundColor White
Write-Host ""