# VBAF.RL.PPO.ps1
# (BUG FIX: this filename line was previously a bare token, which PowerShell
#  executed as a command and raised CommandNotFoundException on load.)

#Requires -Version 5.1
<#
.SYNOPSIS
    Proximal Policy Optimization (PPO) Agent for Reinforcement Learning
.DESCRIPTION
    Implements the PPO algorithm combining:
      - Actor Network : maps state -> action probabilities (policy)
      - Critic Network : maps state -> value estimate (baseline)
      - GAE : Generalized Advantage Estimation
      - Clipped update : limits policy change per update step
    Requires VBAF.Core.AllClasses.ps1 to be loaded first (via VBAF.LoadAll.ps1).
.NOTES
    Part of VBAF - Phase 3 Reinforcement Learning Module
    PS 5.1 compatible - dependency injection pattern used
    to avoid parse-time type resolution errors in classes.
#>

# Set base path
# NOTE(review): $basePath is not referenced anywhere in this file - confirm
# whether scripts that dot-source this one depend on it before removing.
$basePath = $PSScriptRoot

# ============================================================
# Hyper-parameter bundle shared by the PPO agent and the training runner.
# All values are plain defaults; callers overwrite them field-by-field.
class PPOConfig {
    # --- Network architecture ---
    [int]    $StateSize    = 4          # observation vector length
    [int]    $ActionSize   = 2          # number of discrete actions
    [int[]]  $ActorHidden  = @(64, 64)  # hidden layer widths (policy net)
    [int[]]  $CriticHidden = @(64, 64)  # hidden layer widths (value net)

    # --- Optimization ---
    [double] $LearningRate = 0.001
    [double] $Gamma        = 0.99       # reward discount factor
    [double] $LambdaGAE    = 0.95       # GAE exponential smoothing
    [double] $ClipEpsilon  = 0.2        # PPO surrogate clip range
    [double] $EntropyBonus = 0.01       # exploration bonus weight

    # --- Schedule ---
    [int]    $UpdateEpochs = 4          # training passes per rollout
    [int]    $RolloutSteps = 64         # transitions collected per update
    [int]    $MaxSteps     = 200        # per-episode step cap
}

# ============================================================
# ============================================================
# PPOAgent - clipped-surrogate PPO with GAE for a discrete action space.
# Actor and Critic networks are injected pre-built (see Invoke-PPOTraining):
# PS 5.1 classes cannot reference types declared in other files at parse time.
# NOTE(review): the "actor update" below is a supervised nudge toward an
# adjusted probability target rather than a true policy-gradient step on the
# clipped surrogate - confirm this approximation is intentional.
class PPOAgent {
    # [object] for all cross-file types - PS 5.1 requirement
    [object] $Actor    # policy net: state -> one raw logit per action (Predict/TrainSample)
    [object] $Critic   # value net : state -> single-element output array
    [object] $Config   # PPOConfig: gamma, lambda, clip epsilon, epochs, ...

    # Stats
    [int]    $TotalSteps     = 0     # transitions stored across all rollouts
    [int]    $TotalEpisodes  = 0
    [int]    $UpdateCount    = 0     # completed calls to Update()
    [double] $LastActorLoss  = 0.0   # mean losses from the most recent update
    [double] $LastCriticLoss = 0.0
    [double] $LastEntropy    = 0.0

    [System.Collections.Generic.List[double]] $EpisodeRewards
    [System.Collections.Generic.List[double]] $ActorLossHistory
    [System.Collections.Generic.List[double]] $CriticLossHistory

    # Rollout buffer - parallel lists, one entry per stored transition.
    hidden [System.Collections.ArrayList] $States
    hidden [System.Collections.ArrayList] $Actions
    hidden [System.Collections.ArrayList] $Rewards
    hidden [System.Collections.ArrayList] $Values    # critic estimate at time of action
    hidden [System.Collections.ArrayList] $LogProbs  # log pi_old(a|s), frozen for the PPO ratio
    hidden [System.Collections.ArrayList] $Dones

    hidden [System.Random] $Rng

    # -------------------------------------------------------
    # Constructor - receives pre-built networks (PS 5.1 safe).
    # config : PPOConfig; actor/critic : networks exposing
    # Predict([double[]]) and TrainSample([double[]], target).
    # -------------------------------------------------------
    PPOAgent([object]$config, [object]$actor, [object]$critic) {
        $this.Config  = $config
        $this.Actor   = $actor
        $this.Critic  = $critic
        $this.Rng     = [System.Random]::new()

        $this.EpisodeRewards    = [System.Collections.Generic.List[double]]::new()
        $this.ActorLossHistory  = [System.Collections.Generic.List[double]]::new()
        $this.CriticLossHistory = [System.Collections.Generic.List[double]]::new()

        $this.ClearRollout()

        Write-Host "✅ PPOAgent created" -ForegroundColor Green
        Write-Host " State size : $($config.StateSize)"                          -ForegroundColor Cyan
        Write-Host " Action size : $($config.ActionSize)"                         -ForegroundColor Cyan
        Write-Host " Actor hidden : $($config.ActorHidden -join ' -> ')"          -ForegroundColor Cyan
        Write-Host " Critic hidden : $($config.CriticHidden -join ' -> ')"          -ForegroundColor Cyan
        Write-Host " Clip epsilon : $($config.ClipEpsilon)"                        -ForegroundColor Cyan
        Write-Host " Rollout steps : $($config.RolloutSteps)"                       -ForegroundColor Cyan
    }

    # -------------------------------------------------------
    # Softmax helper - converts raw logits to a probability
    # distribution. Subtracts the max logit before Exp for
    # numerical stability (prevents overflow).
    # -------------------------------------------------------
    hidden [double[]] Softmax([double[]]$logits) {
        $max  = ($logits | Measure-Object -Maximum).Maximum
        # @(0.0) * n builds an n-element zero array (PS array replication)
        $exps = @(0.0) * $logits.Length
        $sum  = 0.0
        for ($i = 0; $i -lt $logits.Length; $i++) {
            $exps[$i] = [Math]::Exp($logits[$i] - $max)
            $sum += $exps[$i]
        }
        $probs = @(0.0) * $logits.Length
        for ($i = 0; $i -lt $logits.Length; $i++) {
            $probs[$i] = $exps[$i] / $sum
        }
        return $probs
    }

    # -------------------------------------------------------
    # Sample an action index from the distribution via
    # inverse-CDF sampling on a uniform draw.
    # -------------------------------------------------------
    hidden [int] SampleAction([double[]]$probs) {
        $r   = $this.Rng.NextDouble()
        $cum = 0.0
        for ($i = 0; $i -lt $probs.Length; $i++) {
            $cum += $probs[$i]
            if ($r -le $cum) { return $i }
        }
        # Fallback for floating-point rounding when cum sums to < 1.0
        return $probs.Length - 1
    }

    # -------------------------------------------------------
    # Log probability of the chosen action; probability is
    # floored at 1e-8 to avoid Log(0) = -Infinity.
    # -------------------------------------------------------
    hidden [double] LogProb([double[]]$probs, [int]$action) {
        $p = [Math]::Max($probs[$action], 1e-8)
        return [Math]::Log($p)
    }

    # -------------------------------------------------------
    # Shannon entropy (in nats) of the action distribution;
    # higher entropy = more exploratory policy.
    # -------------------------------------------------------
    hidden [double] Entropy([double[]]$probs) {
        $h = 0.0
        foreach ($p in $probs) {
            if ($p -gt 1e-8) { $h -= $p * [Math]::Log($p) }
        }
        return $h
    }

    # -------------------------------------------------------
    # Select action stochastically - returns hashtable with
    # keys Action (int), LogProb (double), Value (double,
    # critic baseline), Probs (double[]).
    # -------------------------------------------------------
    [hashtable] Act([double[]]$state) {
        $logits = $this.Actor.Predict($state)
        $probs  = $this.Softmax($logits)
        $action = $this.SampleAction($probs)
        $logP   = $this.LogProb($probs, $action)

        # Critic value estimate (network outputs a 1-element array)
        $valueOut = $this.Critic.Predict($state)
        $value    = $valueOut[0]

        return @{ Action = $action; LogProb = $logP; Value = $value; Probs = $probs }
    }

    # -------------------------------------------------------
    # Greedy (argmax) action for evaluation - no sampling.
    # -------------------------------------------------------
    [int] Predict([double[]]$state) {
        $logits = $this.Actor.Predict($state)
        $probs  = $this.Softmax($logits)
        $best   = 0
        for ($i = 1; $i -lt $probs.Length; $i++) {
            if ($probs[$i] -gt $probs[$best]) { $best = $i }
        }
        return $best
    }

    # -------------------------------------------------------
    # Store one transition in the rollout buffer. value and
    # logProb must come from the same Act() call that chose
    # the action (they are the "old policy" quantities).
    # -------------------------------------------------------
    [void] StoreTransition([double[]]$state, [int]$action, [double]$reward,
                           [double]$value, [double]$logProb, [bool]$done) {
        $this.States.Add($state)
        $this.Actions.Add($action)
        $this.Rewards.Add($reward)
        $this.Values.Add($value)
        $this.LogProbs.Add($logProb)
        $this.Dones.Add($done)
        $this.TotalSteps++
    }

    # -------------------------------------------------------
    # Reset the rollout buffer (fresh ArrayLists).
    # -------------------------------------------------------
    [void] ClearRollout() {
        $this.States   = [System.Collections.ArrayList]::new()
        $this.Actions  = [System.Collections.ArrayList]::new()
        $this.Rewards  = [System.Collections.ArrayList]::new()
        $this.Values   = [System.Collections.ArrayList]::new()
        $this.LogProbs = [System.Collections.ArrayList]::new()
        $this.Dones    = [System.Collections.ArrayList]::new()
    }

    # -------------------------------------------------------
    # Compute GAE advantages and discounted returns by a
    # backward recursion over the buffer:
    #   delta_t = r_t + gamma*V(s_{t+1}) - V(s_t)
    #   A_t     = delta_t + gamma*lambda*A_{t+1}
    # lastValue = critic estimate of the state after the
    # rollout ends (bootstrap; ignored at terminal steps).
    # Returns @{ Advantages = double[]; Returns = double[] }
    # with advantages normalized to ~zero mean / unit std.
    # -------------------------------------------------------
    hidden [hashtable] ComputeGAE([double]$lastValue) {
        $n          = $this.Rewards.Count
        $advantages = @(0.0) * $n
        $returns    = @(0.0) * $n
        $gaeVal     = 0.0

        for ($t = $n - 1; $t -ge 0; $t--) {
            $done     = [bool]$this.Dones[$t]
            $reward   = [double]$this.Rewards[$t]
            $value    = [double]$this.Values[$t]
            $nextVal  = if ($t -eq $n - 1) { $lastValue } else { [double]$this.Values[$t + 1] }

            # Episode boundary: no bootstrap across terminals, and the
            # running GAE accumulator must not leak between episodes.
            if ($done) { $nextVal = 0.0; $gaeVal = 0.0 }

            $delta        = $reward + $this.Config.Gamma * $nextVal - $value
            $gaeVal       = $delta + $this.Config.Gamma * $this.Config.LambdaGAE * $gaeVal
            $advantages[$t] = $gaeVal
            $returns[$t]    = $gaeVal + $value
        }

        # Normalize advantages (1e-8 guards division by a zero std;
        # NOTE(review): with n == 1 the std is ~0 and the normalized
        # advantage can be very large - confirm acceptable)
        $mean   = ($advantages | Measure-Object -Average).Average
        $sq     = $advantages | ForEach-Object { ($_ - $mean) * ($_ - $mean) }
        $stdDev = [Math]::Sqrt(($sq | Measure-Object -Average).Average + 1e-8)
        for ($i = 0; $i -lt $n; $i++) {
            $advantages[$i] = ($advantages[$i] - $mean) / $stdDev
        }

        return @{ Advantages = $advantages; Returns = $returns }
    }

    # -------------------------------------------------------
    # PPO Update - train actor and critic on the collected
    # rollout for Config.UpdateEpochs passes, then clear the
    # buffer. lastValue bootstraps GAE past the buffer end.
    # -------------------------------------------------------
    [void] Update([double]$lastValue) {
        $gae         = $this.ComputeGAE($lastValue)
        $advantages  = $gae.Advantages
        $returns     = $gae.Returns
        $n           = $this.States.Count

        $totalActorLoss  = 0.0
        $totalCriticLoss = 0.0
        $totalEntropy    = 0.0
        $updateSamples   = 0

        for ($epoch = 0; $epoch -lt $this.Config.UpdateEpochs; $epoch++) {
            for ($t = 0; $t -lt $n; $t++) {
                $state      = [double[]]$this.States[$t]
                $action     = [int]$this.Actions[$t]
                $oldLogProb = [double]$this.LogProbs[$t]
                $advantage  = $advantages[$t]
                $ret        = $returns[$t]

                # ---- Critic update ----
                # Regress V(s) toward the discounted return (1-element target)
                $criticTarget    = @($ret)
                $criticLoss      = $this.Critic.TrainSample($state, $criticTarget)
                $totalCriticLoss += $criticLoss

                # ---- Actor update ----
                # Re-evaluate current policy probabilities for this state
                $logits   = $this.Actor.Predict($state)
                $probs    = $this.Softmax($logits)
                $newLogP  = $this.LogProb($probs, $action)
                $entropy  = $this.Entropy($probs)
                $totalEntropy += $entropy

                # PPO ratio pi_new/pi_old; clipRatio is the ratio clamped
                # to [1 - eps, 1 + eps] (written as clamp(ratio-1) + 1)
                $ratio      = [Math]::Exp($newLogP - $oldLogProb)
                $clipRatio  = [Math]::Max($this.Config.ClipEpsilon * -1,
                              [Math]::Min($this.Config.ClipEpsilon,
                              $ratio - 1.0)) + 1.0

                # Build actor target: nudge probability of taken action
                # in direction of advantage, clipped by ratio.
                # NOTE(review): [Math]::Min(ratio, clipRatio) mirrors the
                # pessimistic min of the PPO objective only for positive
                # advantages; the 0.1 step size and the entropy term folded
                # into the nudge are heuristics - confirm intended.
                $effectiveRatio = [Math]::Min($ratio, $clipRatio)
                $actorTarget    = $probs.Clone()
                $nudge          = $advantage * $effectiveRatio * 0.1 + $this.Config.EntropyBonus * $entropy
                $actorTarget[$action] = [Math]::Max(0.01,
                                        [Math]::Min(0.99,
                                        $probs[$action] + $nudge))

                # Renormalize so the target is a valid distribution
                $sum = ($actorTarget | Measure-Object -Sum).Sum
                for ($i = 0; $i -lt $actorTarget.Length; $i++) {
                    $actorTarget[$i] = $actorTarget[$i] / $sum
                }

                $actorLoss      = $this.Actor.TrainSample($state, $actorTarget)
                $totalActorLoss += $actorLoss
                $updateSamples++
            }
        }

        if ($updateSamples -gt 0) {
            $this.LastActorLoss  = $totalActorLoss  / $updateSamples
            $this.LastCriticLoss = $totalCriticLoss / $updateSamples
            $this.LastEntropy    = $totalEntropy     / $updateSamples
            $this.ActorLossHistory.Add($this.LastActorLoss)
            $this.CriticLossHistory.Add($this.LastCriticLoss)
        }

        $this.UpdateCount++
        $this.ClearRollout()
    }

    # -------------------------------------------------------
    # Record the end of an episode and its total reward.
    # -------------------------------------------------------
    [void] EndEpisode([double]$totalReward) {
        $this.TotalEpisodes++
        $this.EpisodeRewards.Add($totalReward)
    }

    # -------------------------------------------------------
    # Summary statistics; Avg* fields are trailing-100 means.
    # -------------------------------------------------------
    [hashtable] GetStats() {
        $avgReward      = 0.0
        $avgActorLoss   = 0.0
        $avgCriticLoss  = 0.0

        if ($this.EpisodeRewards.Count -gt 0) {
            $slice     = $this.EpisodeRewards | Select-Object -Last 100
            $avgReward = ($slice | Measure-Object -Average).Average
        }
        if ($this.ActorLossHistory.Count -gt 0) {
            $slice         = $this.ActorLossHistory | Select-Object -Last 100
            $avgActorLoss  = ($slice | Measure-Object -Average).Average
        }
        if ($this.CriticLossHistory.Count -gt 0) {
            $slice          = $this.CriticLossHistory | Select-Object -Last 100
            $avgCriticLoss  = ($slice | Measure-Object -Average).Average
        }

        return @{
            TotalEpisodes   = $this.TotalEpisodes
            TotalSteps      = $this.TotalSteps
            UpdateCount     = $this.UpdateCount
            LastActorLoss   = [Math]::Round($this.LastActorLoss,  6)
            LastCriticLoss  = [Math]::Round($this.LastCriticLoss, 6)
            LastEntropy     = [Math]::Round($this.LastEntropy,     4)
            AvgReward100    = [Math]::Round($avgReward,            3)
            AvgActorLoss    = [Math]::Round($avgActorLoss,         6)
            AvgCriticLoss   = [Math]::Round($avgCriticLoss,        6)
        }
    }

    # -------------------------------------------------------
    # Pretty-print GetStats() to the host in a boxed table.
    # -------------------------------------------------------
    [void] PrintStats() {
        $s = $this.GetStats()
        Write-Host ""
        Write-Host "╔══════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host "║ PPO Agent Statistics ║" -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Episodes : {0,-20}║" -f $s.TotalEpisodes)   -ForegroundColor White
        Write-Host ("║ Total Steps : {0,-20}║" -f $s.TotalSteps)      -ForegroundColor White
        Write-Host ("║ PPO Updates : {0,-20}║" -f $s.UpdateCount)     -ForegroundColor White
        Write-Host ("║ Avg Reward : {0,-20}║" -f $s.AvgReward100)    -ForegroundColor Green
        Write-Host ("║ Entropy : {0,-20}║" -f $s.LastEntropy)     -ForegroundColor Yellow
        Write-Host ("║ Actor Loss : {0,-20}║" -f $s.LastActorLoss)   -ForegroundColor Magenta
        Write-Host ("║ Critic Loss : {0,-20}║" -f $s.LastCriticLoss)  -ForegroundColor Magenta
        Write-Host "╚══════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }
}

# ============================================================
# CartPole-style environment (same as DQN, self-contained)
# ============================================================
# CartPole-style environment (same dynamics as the DQN module, self-contained).
# State is (cart position, cart velocity, pole angle, pole angular velocity);
# actions are 0 = push left, 1 = push right.
class PPOEnvironment {
    [double] $Position
    [double] $Velocity
    [double] $Angle
    [double] $AngularVelocity
    [int]    $Steps
    [int]    $MaxSteps
    hidden [System.Random] $Rng

    PPOEnvironment() {
        $this.Rng      = [System.Random]::new()
        $this.MaxSteps = 200
        $this.Reset()
    }

    # Uniform noise in [-0.05, 0.05) used to jitter the initial state.
    hidden [double] SmallNoise() {
        return ($this.Rng.NextDouble() - 0.5) * 0.1
    }

    # Start a new episode near the upright equilibrium; returns initial state.
    [double[]] Reset() {
        $this.Position        = $this.SmallNoise()
        $this.Velocity        = $this.SmallNoise()
        $this.Angle           = $this.SmallNoise()
        $this.AngularVelocity = $this.SmallNoise()
        $this.Steps           = 0
        return $this.GetState()
    }

    # Current observation as a 4-element vector.
    [double[]] GetState() {
        return @($this.Position, $this.Velocity, $this.Angle, $this.AngularVelocity)
    }

    # Advance one timestep. Returns @{ NextState; Reward; Done }.
    # Reward is 1.0 per non-terminal step, 0.0 on the terminal step.
    [hashtable] Step([int]$action) {
        $this.Steps++

        # Physics constants (classic cart-pole values)
        $g        = 9.8     # gravity
        $massCart = 1.0
        $massPole = 0.1
        $massSum  = $massCart + $massPole
        $poleHalf = 0.25    # half pole length
        $tau      = 0.02    # integration timestep

        $push = if ($action -eq 1) { 1.0 } else { -1.0 }
        $c    = [Math]::Cos($this.Angle)
        $s    = [Math]::Sin($this.Angle)

        # Equations of motion for the cart-pole system
        $common = ($push + $massPole * $poleHalf * $this.AngularVelocity * $this.AngularVelocity * $s) / $massSum
        $angAcc = ($g * $s - $c * $common) / ($poleHalf * (4.0/3.0 - $massPole * $c * $c / $massSum))
        $linAcc = $common - $massPole * $poleHalf * $angAcc * $c / $massSum

        # Euler integration
        $this.Position        += $tau * $this.Velocity
        $this.Velocity        += $tau * $linAcc
        $this.Angle           += $tau * $this.AngularVelocity
        $this.AngularVelocity += $tau * $angAcc

        # Termination: time limit, cart off track, or pole fallen over
        $timedOut    = $this.Steps -ge $this.MaxSteps
        $offTrack    = [Math]::Abs($this.Position) -gt 2.4
        $poleDown    = [Math]::Abs($this.Angle)    -gt 0.21
        $done        = $timedOut -or $offTrack -or $poleDown

        $reward = 0.0
        if (-not $done) { $reward = 1.0 }

        return @{ NextState = $this.GetState(); Reward = $reward; Done = $done }
    }
}

# ============================================================
# TRAINING RUNNER
# All external types instantiated HERE (script level) - PS 5.1 safe
# ============================================================
function Invoke-PPOTraining {
    <#
    .SYNOPSIS
        Trains a PPOAgent on the built-in CartPole-style PPOEnvironment.
    .DESCRIPTION
        Builds a PPOConfig, instantiates actor/critic [NeuralNetwork] objects
        at script level (PS 5.1 safe - class bodies cannot name cross-file
        types), injects them into a PPOAgent, and runs the standard
        collect-rollout / PPO-update loop.
        Requires [NeuralNetwork] to be loaded first (via VBAF.LoadAll.ps1).
    .PARAMETER Episodes
        Number of training episodes (default 100; FastMode lowers to 50
        when left at the default).
    .PARAMETER PrintEvery
        Print a progress line every N episodes. Values less than 1 disable
        progress output instead of crashing (previously `$ep % 0` threw a
        divide-by-zero RuntimeException).
    .PARAMETER Quiet
        Suppress per-episode progress lines.
    .PARAMETER FastMode
        Shrinks the networks and episode/rollout lengths for a quick smoke test.
    .OUTPUTS
        The trained PPOAgent, wrapped in a one-element array (use [-1] to unwrap).
    #>
    param(
        [int]    $Episodes   = 100,
        [int]    $PrintEvery = 10,
        [switch] $Quiet,
        [switch] $FastMode
    )

    # ---- Settings ----
    $actorHidden  = @(64, 64)
    $criticHidden = @(64, 64)
    $maxSteps     = 200
    $rolloutSteps = 64

    if ($FastMode) {
        $actorHidden  = @(16, 16)
        $criticHidden = @(16, 16)
        $maxSteps     = 30
        $rolloutSteps = 32
        # Only override Episodes/PrintEvery if the caller left them at defaults
        if ($Episodes  -eq 100) { $Episodes   = 50 }
        if ($PrintEvery -eq 10) { $PrintEvery  = 5  }
        Write-Host ""
        Write-Host "⚡ FAST MODE ENABLED" -ForegroundColor Yellow
        Write-Host " Actor/Critic : 16 -> 16" -ForegroundColor Yellow
        Write-Host " MaxSteps : $maxSteps"  -ForegroundColor Yellow
        Write-Host " RolloutSteps : $rolloutSteps" -ForegroundColor Yellow
        Write-Host " Episodes : $Episodes"  -ForegroundColor Yellow
    }

    Write-Host ""
    Write-Host "🚀 VBAF PPO Training Started" -ForegroundColor Green
    Write-Host " Episodes: $Episodes"        -ForegroundColor Cyan
    Write-Host ""

    # ---- Config ----
    $config                 = [PPOConfig]::new()
    $config.StateSize       = 4
    $config.ActionSize      = 2
    $config.ActorHidden     = $actorHidden
    $config.CriticHidden    = $criticHidden
    $config.LearningRate    = 0.001
    $config.Gamma           = 0.99
    $config.LambdaGAE      = 0.95
    $config.ClipEpsilon     = 0.2
    $config.EntropyBonus    = 0.01
    $config.UpdateEpochs    = 4
    $config.RolloutSteps    = $rolloutSteps
    $config.MaxSteps        = $maxSteps

    # ---- Build layer arrays: input -> hidden... -> output ----
    $actorLayers  = [System.Collections.Generic.List[int]]::new()
    $actorLayers.Add($config.StateSize)
    foreach ($h in $config.ActorHidden)  { $actorLayers.Add($h) }
    $actorLayers.Add($config.ActionSize)

    $criticLayers = [System.Collections.Generic.List[int]]::new()
    $criticLayers.Add($config.StateSize)
    foreach ($h in $config.CriticHidden) { $criticLayers.Add($h) }
    $criticLayers.Add(1)   # Critic outputs single value

    # ---- Instantiate networks at script level (PS 5.1 safe) ----
    $actor  = [NeuralNetwork]::new($actorLayers.ToArray(),  $config.LearningRate)
    $critic = [NeuralNetwork]::new($criticLayers.ToArray(), $config.LearningRate)

    # ---- Inject into PPOAgent ----
    $agent = [PPOAgent]::new($config, $actor, $critic)

    $env          = [PPOEnvironment]::new()
    $env.MaxSteps = $maxSteps

    $bestReward  = 0.0
    $stepCounter = 0   # global step count, drives rollout-full updates

    for ($ep = 1; $ep -le $Episodes; $ep++) {
        $state       = $env.Reset()
        $totalReward = 0.0
        $done        = $false

        while (-not $done) {
            # Get action (plus old log-prob and value baseline) from the agent
            $result  = $agent.Act($state)
            $action  = $result.Action
            $logProb = $result.LogProb
            $value   = $result.Value

            # Step environment
            $step    = $env.Step($action)
            $ns      = $step.NextState
            $reward  = $step.Reward
            $done    = $step.Done

            # Store in rollout buffer
            $agent.StoreTransition($state, $action, $reward, $value, $logProb, $done)
            $state        = $ns
            $totalReward += $reward
            $stepCounter++

            # Update when rollout buffer is full; bootstrap GAE with the
            # critic's estimate of the state after the buffer ends (GAE
            # ignores it for transitions flagged done)
            if ($stepCounter % $config.RolloutSteps -eq 0) {
                $lastValOut = $agent.Critic.Predict($state)
                $lastVal    = $lastValOut[0]
                $agent.Update($lastVal)
            }
        }

        $agent.EndEpisode($totalReward)
        if ($totalReward -gt $bestReward) { $bestReward = $totalReward }

        # FIX: guard $PrintEvery -gt 0 so -PrintEvery 0 cannot trigger a
        # divide-by-zero in the modulus below
        if (-not $Quiet -and $PrintEvery -gt 0 -and ($ep % $PrintEvery -eq 0)) {
            $stats = $agent.GetStats()
            Write-Host (" Ep {0,4} Reward: {1,5:F0} Best: {2,5:F0} Updates: {3,4} Entropy: {4:F3} CriticLoss: {5:F5}" -f `
                $ep, $totalReward, $bestReward,
                $stats.UpdateCount, $stats.LastEntropy, $stats.LastCriticLoss) -ForegroundColor White
        }
    }

    # Final update on remaining rollout (all episodes ended, so the terminal
    # transitions make the 0.0 bootstrap value irrelevant)
    if ($agent.States.Count -gt 0) {
        $agent.Update(0.0)
    }

    Write-Host ""
    Write-Host "✅ Training Complete!" -ForegroundColor Green
    $agent.PrintStats()
    ,$agent  # comma operator forces return as single object in PS 5.1
}

# ============================================================
# MANUAL TEST RECIPE (run in a PS 5.1+ session):
#   1. Run VBAF.LoadAll.ps1 (provides [NeuralNetwork])
#   2. $agent = (Invoke-PPOTraining -Episodes 20 -PrintEvery 2 -FastMode)[-1]
#   3. $agent = (Invoke-PPOTraining -Episodes 50 -PrintEvery 5 -FastMode)[-1]
#   4. $agent.PrintStats()
# ============================================================
# Load-time banner: confirms the file parsed and lists its exports.
Write-Host "📦 VBAF.RL.PPO.ps1 loaded" -ForegroundColor Green
Write-Host " Classes : PPOConfig, PPOAgent, PPOEnvironment"              -ForegroundColor Cyan
Write-Host " Function: Invoke-PPOTraining"                               -ForegroundColor Cyan
Write-Host ""
Write-Host " Quick start:"                                               -ForegroundColor Yellow
Write-Host ' $agent = (Invoke-PPOTraining -Episodes 50 -PrintEvery 5 -FastMode)[-1]'  -ForegroundColor White
Write-Host ' $agent.PrintStats()'                                        -ForegroundColor White
Write-Host ""