# VBAF.RL.A3C.ps1

#Requires -Version 5.1
<#
.SYNOPSIS
    Advantage Actor-Critic (A3C) Agent for Reinforcement Learning
.DESCRIPTION
    Implements the A3C algorithm with:
      - Shared Actor-Critic Network : single network, two output heads
      - Advantage estimation : A(s,a) = R - V(s)
      - Entropy regularization : encourages exploration
      - n-step returns : bootstrapped multi-step rewards
      - Multiple worker rollouts : simulated synchronously (PS 5.1)
    Note: True async threading not available in PS 5.1.
    Workers run sequentially, gradients accumulated before update.
    Requires VBAF.Core.AllClasses.ps1 (via VBAF.LoadAll.ps1).
.NOTES
    Part of VBAF - Phase 3 Reinforcement Learning Module
    PS 5.1 compatible - dependency injection pattern throughout
#>

# Set base path to the directory containing this script.
# NOTE(review): $basePath is never referenced later in this file —
# presumably consumed by sibling VBAF scripts after dot-sourcing; confirm.
$basePath = $PSScriptRoot

# ============================================================
# Hyperparameter container for the A3C agent, workers and trainer.
# All members are plain mutable properties so callers can override
# individual values after construction (see Invoke-A3CTraining).
class A3CConfig {
    [int]    $StateSize      = 4          # Length of the environment observation vector
    [int]    $ActionSize     = 2          # Number of discrete actions (policy head width)
    [int[]]  $SharedHidden   = @(64, 64)  # Shared layers for actor+critic
    [double] $LearningRate   = 0.001      # Step size for the shared network optimizer
    [double] $Gamma          = 0.99       # Discount factor
    [double] $EntropyBonus   = 0.01       # Exploration bonus
    [double] $ValueLossCoeff = 0.5        # Weight of critic loss
    [int]    $NSteps         = 5          # n-step return length
    [int]    $NumWorkers     = 4          # Simulated parallel workers
    [int]    $MaxSteps       = 200        # Max steps per episode
}

# ============================================================
class A3CWorker {
    # One rollout worker. Each worker owns its OWN local copy of the
    # actor-critic network and collects experience independently; the
    # trainer periodically re-syncs it from the global network.
    [object] $LocalNetwork        # local actor-critic network (must expose Predict)
    [object] $Config              # A3CConfig (typed [object] for PS 5.1 injection)
    [int]    $WorkerId
    [int]    $EpisodesDone = 0    # episodes this worker has finished
    [double] $LastReward   = 0.0  # total reward of the most recently finished episode

    hidden [System.Random] $Rng

    # The worker id seeds the RNG so each worker's action sampling is
    # deterministic yet distinct from its siblings.
    A3CWorker([int]$workerId, [object]$config, [object]$localNetwork) {
        $this.WorkerId      = $workerId
        $this.Config        = $config
        $this.LocalNetwork  = $localNetwork
        $this.Rng           = [System.Random]::new($workerId * 42 + 7)
    }

    # Numerically stable softmax over raw logits: subtracts the max
    # logit before exponentiating to avoid overflow.
    [double[]] Softmax([double[]]$logits) {
        $max  = ($logits | Measure-Object -Maximum).Maximum
        $exps = @(0.0) * $logits.Length
        $sum  = 0.0
        for ($i = 0; $i -lt $logits.Length; $i++) {
            $exps[$i] = [Math]::Exp($logits[$i] - $max)
            $sum += $exps[$i]
        }
        $probs = @(0.0) * $logits.Length
        for ($i = 0; $i -lt $logits.Length; $i++) {
            $probs[$i] = $exps[$i] / $sum
        }
        return $probs
    }

    # Sample an action index from a probability distribution via
    # inverse-CDF sampling. The final return absorbs floating-point
    # rounding in the cumulative sum (probs may not sum to exactly 1).
    [int] SampleAction([double[]]$probs) {
        $r   = $this.Rng.NextDouble()
        $cum = 0.0
        for ($i = 0; $i -lt $probs.Length; $i++) {
            $cum += $probs[$i]
            if ($r -le $cum) { return $i }
        }
        return $probs.Length - 1
    }

    # Run an n-step rollout on $env and return the experience batch:
    # @{ States; Actions; Rewards; Dones; LastValue; LastState } where
    # LastValue is the critic's bootstrap estimate of the final state.
    # Episodes that terminate mid-rollout are reset and the rollout
    # continues from the fresh initial state.
    [hashtable] RunRollout([object]$env) {
        $states   = [System.Collections.ArrayList]::new()
        $actions  = [System.Collections.ArrayList]::new()
        $rewards  = [System.Collections.ArrayList]::new()
        $dones    = [System.Collections.ArrayList]::new()

        $state = $env.GetState()
        $done  = $false
        $totalReward = 0.0

        for ($step = 0; $step -lt $this.Config.NSteps; $step++) {
            # Forward pass: first ActionSize outputs = policy logits,
            # last output = value estimate.
            $out    = $this.LocalNetwork.Predict($state)
            $nA     = $this.Config.ActionSize
            $logits = $out[0..($nA-1)]
            $probs  = $this.Softmax($logits)
            $action = $this.SampleAction($probs)

            $result = $env.Step($action)
            $ns     = $result.NextState
            $reward = $result.Reward
            $done   = $result.Done

            $states.Add($state)   | Out-Null
            $actions.Add($action) | Out-Null
            $rewards.Add($reward) | Out-Null
            $dones.Add($done)     | Out-Null

            $totalReward += $reward
            $state = $ns

            if ($done) {
                # FIX: continue the rollout from the freshly reset state.
                # Previously Reset()'s return value was discarded and the
                # terminal state leaked into the next forward pass and the
                # bootstrap Predict below.
                $state = $env.Reset()
                $this.EpisodesDone++
                $this.LastReward = $totalReward
                $totalReward = 0.0
                $done = $false
            }
        }

        # Bootstrap value for the state the rollout stopped in. If the
        # final transition was terminal, ComputeReturns zeroes this out.
        $lastOut   = $this.LocalNetwork.Predict($state)
        $lastValue = $lastOut[$this.Config.ActionSize]  # Last output = value

        return @{
            States    = $states
            Actions   = $actions
            Rewards   = $rewards
            Dones     = $dones
            LastValue = $lastValue
            LastState = $state
        }
    }
}

# ============================================================
class A3CAgent {
    # Holds the global (shared) actor-critic network and performs the
    # "global update" step of A3C: each worker's n-step batch is turned
    # into policy/value targets and trained into this network.
    # All collaborator types are [object] for PS 5.1 dependency injection.
    [object] $GlobalNetwork   # must expose Predict / TrainSample / ExportState
    [object] $Config          # A3CConfig (duck-typed)

    # Running statistics
    [int]    $TotalSteps     = 0    # training samples consumed
    [int]    $TotalEpisodes  = 0
    [int]    $UpdateCount    = 0    # global updates performed
    [double] $LastLoss       = 0.0
    [double] $LastEntropy    = 0.0
    [double] $LastValue      = 0.0  # most recent bootstrap value seen

    [System.Collections.Generic.List[double]] $EpisodeRewards
    [System.Collections.Generic.List[double]] $LossHistory
    [System.Collections.ArrayList]            $Workers

    hidden [System.Random] $Rng

    # -------------------------------------------------------
    # Constructor - receives the pre-built global network and the
    # worker list (built at script level - PS 5.1 safe).
    # -------------------------------------------------------
    A3CAgent([object]$config, [object]$globalNetwork, [System.Collections.ArrayList]$workers) {
        $this.Config         = $config
        $this.GlobalNetwork  = $globalNetwork
        $this.Workers        = $workers
        $this.Rng            = [System.Random]::new()
        $this.EpisodeRewards = [System.Collections.Generic.List[double]]::new()
        $this.LossHistory    = [System.Collections.Generic.List[double]]::new()

        Write-Host "✅ A3CAgent created" -ForegroundColor Green
        Write-Host " State size : $($config.StateSize)"              -ForegroundColor Cyan
        Write-Host " Action size : $($config.ActionSize)"             -ForegroundColor Cyan
        Write-Host " Shared hidden: $($config.SharedHidden -join ' -> ')" -ForegroundColor Cyan
        Write-Host " Workers : $($config.NumWorkers)"             -ForegroundColor Cyan
        Write-Host " n-steps : $($config.NSteps)"                 -ForegroundColor Cyan
        Write-Host " Entropy bonus: $($config.EntropyBonus)"           -ForegroundColor Cyan
    }

    # -------------------------------------------------------
    # Numerically stable softmax (max-subtraction trick).
    # Duplicated on A3CWorker so each class stays self-contained.
    # -------------------------------------------------------
    hidden [double[]] Softmax([double[]]$logits) {
        $max  = ($logits | Measure-Object -Maximum).Maximum
        $exps = @(0.0) * $logits.Length
        $sum  = 0.0
        for ($i = 0; $i -lt $logits.Length; $i++) {
            $exps[$i] = [Math]::Exp($logits[$i] - $max)
            $sum += $exps[$i]
        }
        $probs = @(0.0) * $logits.Length
        for ($i = 0; $i -lt $logits.Length; $i++) {
            $probs[$i] = $exps[$i] / $sum
        }
        return $probs
    }

    # -------------------------------------------------------
    # Shannon entropy of a probability distribution (natural log).
    # The 1e-8 floor avoids Log(0) on near-deterministic policies.
    # -------------------------------------------------------
    hidden [double] Entropy([double[]]$probs) {
        $h = 0.0
        foreach ($p in $probs) {
            if ($p -gt 1e-8) { $h -= $p * [Math]::Log($p) }
        }
        return $h
    }

    # -------------------------------------------------------
    # Compute n-step discounted returns, bootstrapped with $lastValue.
    # A done flag at step t zeroes the running return BEFORE adding
    # reward[t], so value never leaks across episode boundaries.
    # -------------------------------------------------------
    hidden [double[]] ComputeReturns([System.Collections.ArrayList]$rewards,
                                     [System.Collections.ArrayList]$dones,
                                     [double]$lastValue) {
        $n       = $rewards.Count
        $returns = @(0.0) * $n
        $R       = $lastValue

        for ($t = $n - 1; $t -ge 0; $t--) {
            if ([bool]$dones[$t]) { $R = 0.0 }
            $R           = [double]$rewards[$t] + $this.Config.Gamma * $R
            $returns[$t] = $R
        }
        return $returns
    }

    # -------------------------------------------------------
    # Global update from one worker's experience batch.
    # NOTE: this is a target-nudging approximation of the A3C policy
    # gradient (the sampled action's probability is moved by a scaled
    # advantage + entropy bonus), not an exact gradient computation.
    # -------------------------------------------------------
    [void] UpdateFromWorker([hashtable]$batch, [int]$workerId) {
        $states    = $batch.States
        $actions   = $batch.Actions
        $rewards   = $batch.Rewards
        $dones     = $batch.Dones
        $bootValue = $batch.LastValue
        $n         = $states.Count
        $nA        = $this.Config.ActionSize

        # FIX: guard against an empty batch. Previously $totalLoss / $n
        # produced NaN that polluted LossHistory and all later stats.
        if ($n -eq 0) { return }

        $returns  = $this.ComputeReturns($rewards, $dones, $bootValue)

        $totalLoss    = 0.0
        $totalEntropy = 0.0

        for ($t = 0; $t -lt $n; $t++) {
            $state  = [double[]]$states[$t]
            $action = [int]$actions[$t]
            $ret    = $returns[$t]

            # Forward pass on global network:
            # outputs [0..nA-1] = policy logits, [nA] = value estimate.
            $out    = $this.GlobalNetwork.Predict($state)
            $logits = $out[0..($nA-1)]
            $value  = $out[$nA]
            $probs  = $this.Softmax($logits)

            # Advantage = n-step return - critic's value estimate
            $advantage = $ret - $value
            $entropy   = $this.Entropy($probs)
            $totalEntropy += $entropy

            # Build combined target output:
            # policy head: nudge the taken action's probability by the
            # advantage, clamped to (0.01, 0.99) to keep the target sane.
            $targetOut = $out.Clone()
            $nudge     = $advantage * 0.1 + $this.Config.EntropyBonus * $entropy
            $targetOut[$action] = [Math]::Max(0.01,
                                  [Math]::Min(0.99, $probs[$action] + $nudge))

            # Renormalize the policy outputs to a valid distribution
            $pSum = 0.0
            for ($i = 0; $i -lt $nA; $i++) { $pSum += $targetOut[$i] }
            for ($i = 0; $i -lt $nA; $i++) { $targetOut[$i] = $targetOut[$i] / $pSum }

            # Value head: target is the n-step return
            $targetOut[$nA] = $ret

            # Train global network on this single (state, target) pair
            $loss       = $this.GlobalNetwork.TrainSample($state, $targetOut)
            $totalLoss += $loss
            $this.TotalSteps++
        }

        $this.LastLoss    = $totalLoss / $n
        $this.LastEntropy = $totalEntropy / $n
        $this.LastValue   = $bootValue
        $this.LossHistory.Add($this.LastLoss)
        $this.UpdateCount++
    }

    # -------------------------------------------------------
    # Sync a worker's local network from the global network by
    # exporting/importing the full weight state.
    # -------------------------------------------------------
    [void] SyncWorker([object]$worker) {
        $state = $this.GlobalNetwork.ExportState()
        $worker.LocalNetwork.ImportState($state)
    }

    # -------------------------------------------------------
    # Greedy (argmax-probability) action from the global network,
    # used for evaluation rather than exploration.
    # -------------------------------------------------------
    [int] Predict([double[]]$state) {
        $out    = $this.GlobalNetwork.Predict($state)
        $nA     = $this.Config.ActionSize
        $logits = $out[0..($nA-1)]
        $probs  = $this.Softmax($logits)
        $best   = 0
        for ($i = 1; $i -lt $probs.Length; $i++) {
            if ($probs[$i] -gt $probs[$best]) { $best = $i }
        }
        return $best
    }

    # -------------------------------------------------------
    # Record the end of one (averaged) training episode.
    # -------------------------------------------------------
    [void] EndEpisode([double]$totalReward) {
        $this.TotalEpisodes++
        $this.EpisodeRewards.Add($totalReward)
    }

    # -------------------------------------------------------
    # Snapshot of training statistics; averages cover the most
    # recent 100 episodes / updates.
    # -------------------------------------------------------
    [hashtable] GetStats() {
        $avgReward = 0.0
        $avgLoss   = 0.0

        if ($this.EpisodeRewards.Count -gt 0) {
            $slice     = $this.EpisodeRewards | Select-Object -Last 100
            $avgReward = ($slice | Measure-Object -Average).Average
        }
        if ($this.LossHistory.Count -gt 0) {
            $slice   = $this.LossHistory | Select-Object -Last 100
            $avgLoss = ($slice | Measure-Object -Average).Average
        }

        return @{
            TotalEpisodes = $this.TotalEpisodes
            TotalSteps    = $this.TotalSteps
            UpdateCount   = $this.UpdateCount
            AvgReward100  = [Math]::Round($avgReward,         3)
            LastLoss      = [Math]::Round($this.LastLoss,     6)
            AvgLoss       = [Math]::Round($avgLoss,           6)
            LastEntropy   = [Math]::Round($this.LastEntropy,  4)
            LastValue     = [Math]::Round($this.LastValue,    4)
        }
    }

    # -------------------------------------------------------
    # Pretty-print the current statistics box to the console.
    # -------------------------------------------------------
    [void] PrintStats() {
        $s = $this.GetStats()
        Write-Host ""
        Write-Host "╔══════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host "║ A3C Agent Statistics ║" -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Episodes : {0,-20}║" -f $s.TotalEpisodes) -ForegroundColor White
        Write-Host ("║ Total Steps : {0,-20}║" -f $s.TotalSteps)    -ForegroundColor White
        Write-Host ("║ Global Updates: {0,-20}║" -f $s.UpdateCount)   -ForegroundColor White
        Write-Host ("║ Avg Reward : {0,-20}║" -f $s.AvgReward100)  -ForegroundColor Green
        Write-Host ("║ Last Entropy : {0,-20}║" -f $s.LastEntropy)   -ForegroundColor Yellow
        Write-Host ("║ Last Loss : {0,-20}║" -f $s.LastLoss)      -ForegroundColor Magenta
        Write-Host ("║ Avg Loss : {0,-20}║" -f $s.AvgLoss)       -ForegroundColor Magenta
        Write-Host ("║ Last Value : {0,-20}║" -f $s.LastValue)     -ForegroundColor White
        Write-Host "╚══════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }
}

# ============================================================
# CartPole-style environment (self-contained, no external deps)
# ============================================================
class A3CEnvironment {
    # Minimal self-contained CartPole clone: a pole balanced on a cart,
    # integrated with explicit Euler steps. No external dependencies.
    [double] $Position
    [double] $Velocity
    [double] $Angle
    [double] $AngularVelocity
    [int]    $Steps
    [int]    $MaxSteps
    hidden [System.Random] $Rng

    # Seeded constructor so runs are reproducible per worker.
    A3CEnvironment([int]$seed) {
        $this.MaxSteps = 200
        $this.Rng      = [System.Random]::new($seed)
        $this.Reset()
    }

    # Re-randomize all four state variables with small perturbations
    # in [-0.05, 0.05) and return the fresh observation.
    [double[]] Reset() {
        # Draw in a fixed order so RNG reproducibility is preserved.
        $draws = @(0.0) * 4
        for ($k = 0; $k -lt 4; $k++) {
            $draws[$k] = ($this.Rng.NextDouble() - 0.5) * 0.1
        }
        $this.Position        = $draws[0]
        $this.Velocity        = $draws[1]
        $this.Angle           = $draws[2]
        $this.AngularVelocity = $draws[3]
        $this.Steps           = 0
        return $this.GetState()
    }

    # Current observation: [position, velocity, angle, angular velocity].
    [double[]] GetState() {
        $obs = @($this.Position, $this.Velocity, $this.Angle, $this.AngularVelocity)
        return $obs
    }

    # Advance one physics step. Action 1 pushes right, anything else
    # pushes left. Returns @{ NextState; Reward; Done }.
    [hashtable] Step([int]$action) {
        $this.Steps++

        # Physical constants of the classic CartPole task
        $g         = 9.8
        $massCart  = 1.0
        $massPole  = 0.1
        $massTotal = $massCart + $massPole
        $poleHalf  = 0.25
        $tau       = 0.02

        if ($action -eq 1) { $push = 1.0 } else { $push = -1.0 }

        $cosTheta = [Math]::Cos($this.Angle)
        $sinTheta = [Math]::Sin($this.Angle)

        # Equations of motion (Barto/Sutton CartPole formulation)
        $common   = ($push + $massPole * $poleHalf * $this.AngularVelocity * $this.AngularVelocity * $sinTheta) / $massTotal
        $thetaAcc = ($g * $sinTheta - $cosTheta * $common) / ($poleHalf * (4.0/3.0 - $massPole * $cosTheta * $cosTheta / $massTotal))
        $xAcc     = $common - $massPole * $poleHalf * $thetaAcc * $cosTheta / $massTotal

        # Explicit Euler integration of the four state variables
        $this.Position        += $tau * $this.Velocity
        $this.Velocity        += $tau * $xAcc
        $this.Angle           += $tau * $this.AngularVelocity
        $this.AngularVelocity += $tau * $thetaAcc

        # Episode ends on step cap, cart out of bounds, or pole fallen
        $capReached  = $this.Steps -ge $this.MaxSteps
        $outOfBounds = [Math]::Abs($this.Position) -gt 2.4
        $poleFell    = [Math]::Abs($this.Angle)    -gt 0.21
        $finished    = $capReached -or $outOfBounds -or $poleFell

        # +1 per surviving step, 0 on the terminal transition
        $payoff = 0.0
        if (-not $finished) { $payoff = 1.0 }

        return @{ NextState = $this.GetState(); Reward = $payoff; Done = $finished }
    }
}

# ============================================================
# TRAINING RUNNER
# All external types instantiated HERE (script level) - PS 5.1 safe
# Global network + worker networks built and injected.
# ============================================================
function Invoke-A3CTraining {
    <#
    .SYNOPSIS
        Build and train an A3C agent on the built-in CartPole environment.
    .DESCRIPTION
        Constructs all external collaborators at script level (PS 5.1 safe):
        one global NeuralNetwork, one local network + environment per worker,
        then runs sequential ("simulated parallel") n-step rollouts, updating
        the global network after each worker's batch.
    .PARAMETER Episodes
        Number of training iterations (each iteration = one rollout per worker).
    .PARAMETER PrintEvery
        Progress line interval, in episodes.
    .PARAMETER Quiet
        Suppress per-episode progress output.
    .PARAMETER FastMode
        Shrink network/worker/step counts for a quick smoke run.
    .OUTPUTS
        The trained A3CAgent (last element of the pipeline output).
    #>
    param(
        [int]    $Episodes   = 100,
        [int]    $PrintEvery = 10,
        [switch] $Quiet,
        [switch] $FastMode
    )

    # ---- Settings ----
    $sharedHidden = @(64, 64)
    $maxSteps     = 200
    $numWorkers   = 4
    $nSteps       = 5

    if ($FastMode) {
        $sharedHidden = @(16, 16)
        $maxSteps     = 30
        $numWorkers   = 2
        $nSteps       = 5
        # Only shrink Episodes/PrintEvery when the caller left the defaults
        if ($Episodes  -eq 100) { $Episodes   = 50 }
        if ($PrintEvery -eq 10) { $PrintEvery  = 5  }
        Write-Host ""
        Write-Host "⚡ FAST MODE ENABLED" -ForegroundColor Yellow
        Write-Host " Shared hidden: 16 -> 16" -ForegroundColor Yellow
        Write-Host " MaxSteps : $maxSteps" -ForegroundColor Yellow
        Write-Host " Workers : $numWorkers" -ForegroundColor Yellow
        Write-Host " n-steps : $nSteps"   -ForegroundColor Yellow
        Write-Host " Episodes : $Episodes"  -ForegroundColor Yellow
    }

    Write-Host ""
    Write-Host "🚀 VBAF A3C Training Started" -ForegroundColor Green
    Write-Host " Episodes: $Episodes"        -ForegroundColor Cyan
    Write-Host ""

    # ---- Config ----
    $config                 = [A3CConfig]::new()
    $config.StateSize       = 4
    $config.ActionSize      = 2
    $config.SharedHidden    = $sharedHidden
    $config.LearningRate    = 0.001
    $config.Gamma           = 0.99
    $config.EntropyBonus    = 0.01
    $config.ValueLossCoeff  = 0.5
    $config.NSteps          = $nSteps
    $config.NumWorkers      = $numWorkers
    $config.MaxSteps        = $maxSteps

    # ---- Build layer array ----
    # A3C uses ONE shared network with (ActionSize + 1) outputs:
    # [0..ActionSize-1] = policy logits, [ActionSize] = value
    $layers = [System.Collections.Generic.List[int]]::new()
    $layers.Add($config.StateSize)
    foreach ($h in $config.SharedHidden) { $layers.Add($h) }
    $layers.Add($config.ActionSize + 1)   # policy + value head
    $layerArray = $layers.ToArray()

    # ---- Build global network (script level - PS 5.1 safe) ----
    $globalNetwork = [NeuralNetwork]::new($layerArray, $config.LearningRate)

    # ---- Build worker local networks + environments ----
    $workers  = [System.Collections.ArrayList]::new()
    $envs     = [System.Collections.ArrayList]::new()

    for ($w = 0; $w -lt $numWorkers; $w++) {
        $localNet = [NeuralNetwork]::new($layerArray, $config.LearningRate)
        $worker   = [A3CWorker]::new($w, $config, $localNet)
        # Distinct seed per worker environment
        $env      = [A3CEnvironment]::new($w * 13 + 1)
        $env.MaxSteps = $maxSteps
        $workers.Add($worker) | Out-Null
        $envs.Add($env)       | Out-Null
    }

    # ---- Inject into A3CAgent ----
    $agent = [A3CAgent]::new($config, $globalNetwork, $workers)

    # Sync all workers with global network at start
    foreach ($worker in $workers) { $agent.SyncWorker($worker) }

    $bestReward = 0.0

    for ($ep = 1; $ep -le $Episodes; $ep++) {

        $totalRewardThisEp = 0.0

        # Each worker runs a rollout, then the global network is updated
        # and the worker is re-synced (sequential stand-in for async A3C).
        for ($w = 0; $w -lt $numWorkers; $w++) {
            $worker = $workers[$w]
            $env    = $envs[$w]

            # Run n-step rollout
            $batch  = $worker.RunRollout($env)

            # Update global network from this worker's experience
            $agent.UpdateFromWorker($batch, $w)

            # Sync worker local network from updated global
            $agent.SyncWorker($worker)

            $totalRewardThisEp += $worker.LastReward
        }

        $avgEpReward = $totalRewardThisEp / $numWorkers
        $agent.EndEpisode($avgEpReward)
        if ($avgEpReward -gt $bestReward) { $bestReward = $avgEpReward }

        if (-not $Quiet -and ($ep % $PrintEvery -eq 0)) {
            $stats = $agent.GetStats()
            Write-Host (" Ep {0,4} Reward: {1,5:F1} Best: {2,5:F1} Updates: {3,5} Entropy: {4:F3} Loss: {5:F5}" -f `
                $ep, $avgEpReward, $bestReward,
                $stats.UpdateCount, $stats.LastEntropy, $stats.LastLoss) -ForegroundColor White
        }
    }

    Write-Host ""
    Write-Host "✅ Training Complete!" -ForegroundColor Green
    $agent.PrintStats()
    ,$agent  # comma operator forces return as single object in PS 5.1
}

# ============================================================
# LOAD BANNER + QUICK-START INSTRUCTIONS
# Usage (after dot-sourcing via VBAF.LoadAll.ps1, which provides
# the NeuralNetwork class this module depends on):
# 1. Run VBAF.LoadAll.ps1
# 2. $agent = (Invoke-A3CTraining -Episodes 20 -PrintEvery 2 -FastMode)[-1]
# 3. $agent = (Invoke-A3CTraining -Episodes 50 -PrintEvery 5 -FastMode)[-1]
# 4. $agent.PrintStats()
# ============================================================
Write-Host "📦 VBAF.RL.A3C.ps1 loaded" -ForegroundColor Green
Write-Host " Classes : A3CConfig, A3CAgent, A3CWorker, A3CEnvironment" -ForegroundColor Cyan
Write-Host " Function: Invoke-A3CTraining"                             -ForegroundColor Cyan
Write-Host ""
Write-Host " Quick start:"                                                             -ForegroundColor Yellow
Write-Host ' $agent = (Invoke-A3CTraining -Episodes 20 -PrintEvery 2 -FastMode)[-1]'        -ForegroundColor White
Write-Host ' $agent.PrintStats()'                                                      -ForegroundColor White
Write-Host ""