# VBAF 5.0.0
#
# VBAF.RL.Example-CastleLearning.ps1

                                #Requires -Version 5.1

<#

.SYNOPSIS

    Q-Learning Castle Agent -- Training Demo

.DESCRIPTION

    Demonstrates a Q-Learning agent learning to generate castle sequences.

    WHAT YOU ARE LEARNING HERE:

    ============================

    This example shows Q-Learning applied to a creative problem --

    generating sequences of castle types that are visually varied

    and engaging.

    Unlike XOR (which has one correct answer), this is an OPTIMISATION

    problem -- there is no single right sequence, but some sequences

    are better than others according to a reward function.

    THE ENVIRONMENT:

    ================

    State:  the last 1-2 castle types chosen (recent history)

    Actions: choose the next castle type from 8 options

    Reward: +2 for variety, -1 for repetition, plus visual balance

            and engagement scores (simulated here with random values)

    WHAT THE AGENT LEARNS:

    =======================

    Over 100 episodes the agent discovers that:

    - Repeating the same castle type is penalised

    - Mixing different types earns higher rewards

    - Some transitions (e.g. Gothic -> Fairy Tale) score better

      than others on average

    THE Q-TABLE GROWS AS THE AGENT EXPLORES:

    =========================================

    Episode 1:   Q-table has ~0 entries (nothing visited yet)

    Episode 10:  Q-table growing -- common transitions recorded

    Episode 100: Q-table stable -- agent exploiting learned values

    Watch the Q-table size grow during training.

    Watch epsilon decay from 1.0 (random) toward 0.01 (learned).

    Watch recent average reward increase as the agent improves.

    EXPLORATION vs EXPLOITATION IN PRACTICE:

    =========================================

    Early episodes: epsilon ~1.0 -- agent tries everything randomly

    Middle episodes: epsilon ~0.5 -- mix of random and learned choices

    Late episodes: epsilon ~0.01 -- agent mostly uses learned Q-values

    This gradual shift is called the epsilon schedule.

    Too fast: agent stops exploring before finding good strategies.

    Too slow: agent wastes time exploring when it already knows what works.

    REWARD DESIGN NOTE:

    ===================

    In this example, visual balance and engagement are SIMULATED

    with random values. In a real application, these would come from

    user feedback, aesthetic scoring algorithms, or A/B test results.

    The random simulation still teaches the variety reward correctly.

.NOTES

    Part of VBAF (Visual AI & Reinforcement Learning Framework)

    Educational use -- compare output with VBAF.RL.DQN.ps1 to see

    how neural networks handle larger state spaces.

    Requires: VBAF.RL.QTable.ps1, VBAF.RL.ExperienceReplay.ps1,

              VBAF.RL.QLearningAgent.ps1

#>

#  LOAD DEPENDENCIES
# Dot-source the three VBAF RL scripts expected to sit next to this file.
# The original load order (QTable, ExperienceReplay, QLearningAgent) is kept.
#
# $PSScriptRoot is empty when this code is pasted into a console rather than
# run from a file; Join-Path rejects an empty path, so fall back to the
# current location in that case.
$basePath = if ([string]::IsNullOrEmpty($PSScriptRoot)) { (Get-Location).Path } else { $PSScriptRoot }

# NOTE: the foreach statement does not create a new scope, so the dot-sourced
# definitions still land in this script's scope.
foreach ($dependency in @(
        "VBAF.RL.QTable.ps1",
        "VBAF.RL.ExperienceReplay.ps1",
        "VBAF.RL.QLearningAgent.ps1")) {

    $dependencyPath = Join-Path $basePath $dependency

    # Fail fast with a clear message; otherwise the script keeps running and
    # only breaks later when the agent type cannot be found.
    if (-not (Test-Path -LiteralPath $dependencyPath -PathType Leaf)) {
        throw "Required VBAF script not found: $dependencyPath"
    }

    . $dependencyPath
}

#  BANNER
Write-Host ""
$bannerLines = @(
    "+----------------------------------------------+"
    "|   Q-LEARNING CASTLE AGENT - TRAINING DEMO   |"
    "+----------------------------------------------+"
)
foreach ($bannerLine in $bannerLines) {
    Write-Host $bannerLine -ForegroundColor Cyan
}

#  THE ACTION SPACE
# The castle types below are the discrete actions; the agent picks exactly
# one of them per step.
# The Q-table holds at most (number of states x 8 actions) entries.
# NOTE(review): the number of distinct states depends on how GetState encodes
# the recent history (defined in VBAF.RL.QLearningAgent.ps1, not visible
# here) -- confirm before quoting an exact upper bound.
# The space is small enough for a tabular method (no neural network needed).
$castleTypes = @(
    "Gothic"
    "FairyTale"
    "Fortress"
    "Palace"
    "Wizard"
    "Cathedral"
    "Oriental"
    "Ruins"
)

Write-Host ""
Write-Host "Available Castle Types (the action space):" -ForegroundColor Yellow
$castleTypes | ForEach-Object { Write-Host "  - $_" }

#  CREATE THE AGENT
# The agent is built from the list of castle types only; its hyperparameters
# are whatever the QLearningAgent constructor assigns.
# Expected defaults (set in VBAF.RL.QLearningAgent.ps1, not visible here):
#   alpha (learning rate) = 0.1
#   gamma (discount)      = 0.9
#   epsilon               = 1.0 (start fully random)
Write-Host ""
Write-Host "Creating Q-Learning Agent..." -ForegroundColor Yellow

# The leading comma wraps the array so that it reaches the constructor as ONE
# argument instead of being spread over eight separate arguments.
$constructorArgs = @(,$castleTypes)
$agent = New-Object -TypeName QLearningAgent -ArgumentList $constructorArgs

# Echo the hyperparameters actually held by the agent.
@(
    "  Alpha (learning rate) : $($agent.Alpha)   -- how fast Q-values update"
    "  Gamma (discount)      : $($agent.Gamma)   -- how much future rewards matter"
    "  Epsilon (exploration) : $($agent.Epsilon) -- start 100% random"
) | ForEach-Object { Write-Host $_ }

#  TRAINING CONFIGURATION
# 100 episodes x 10 steps = 1000 (state, action, reward) interactions in
# total; each interaction is passed to the agent's Learn method once.
$episodes        = 100
$stepsPerEpisode = 10
$totalInteractions = $episodes * $stepsPerEpisode

Write-Host ""
Write-Host "Training Configuration:" -ForegroundColor Yellow
Write-Host "  Episodes           : $episodes"
Write-Host "  Steps per episode  : $stepsPerEpisode"
Write-Host "  Total interactions : $totalInteractions"

# Rolling history of the most recently chosen castle types.
# It is handed to the agent (as RecentTypes) to derive the STATE, i.e. what
# the agent currently knows about the sequence so far.
$recentCastles = [System.Collections.ArrayList]::new()

# Section header for the training output.
Write-Host ""
$sectionRule = "-" * 60
foreach ($headerLine in @($sectionRule, "TRAINING IN PROGRESS", $sectionRule)) {
    Write-Host $headerLine -ForegroundColor Cyan
}
Write-Host ""

#  MAIN TRAINING LOOP
# Per step: observe state -> choose action -> score outcome -> update history
# -> observe next state -> learn. Per episode: hand the summed reward to the
# agent and print a progress line on selected episodes.
# NOTE(review): $recentCastles is not cleared between episodes, so each
# episode starts from the history left by the previous one -- confirm this
# is intended.
$progressFormat = "Episode {0,3} | Reward: {1,6:F2} | Epsilon: {2:F3} | Exploit: {3,5:F1}% | Q-Table: {4,3} entries"

for ($ep = 1; $ep -le $episodes; $ep++) {
    $episodeReward = 0.0

    for ($step = 1; $step -le $stepsPerEpisode; $step++) {
        # OBSERVE: the agent turns the recent history into a state key
        $state = $agent.GetState(@{ RecentTypes = $recentCastles })

        # ACT: the agent picks the next castle type
        $action = $agent.ChooseAction($state)

        # ENVIRONMENT RESPONSE:
        # The very first pick always counts as varied; afterwards a pick is
        # varied when it differs from the most recent castle in the history.
        if ($recentCastles.Count -eq 0) {
            $isVaried = $true
        } else {
            $isVaried = [bool]($recentCastles[-1] -ne $action)
        }

        # VisualBalance and Engagement are simulated with random values; a
        # real system would take them from user ratings or a scoring step.
        $outcome = @{}
        $outcome.CastleType    = $action
        $outcome.IsVaried      = $isVaried
        $outcome.VisualBalance = Get-Random -Minimum 0.0 -Maximum 1.0
        $outcome.Engagement    = Get-Random -Minimum 0.0 -Maximum 1.0

        # REWARD: computed by the agent from the outcome
        $reward = $agent.CalculateReward($outcome)
        $episodeReward += $reward

        # UPDATE STATE: append the pick and cap the history at 5 entries
        [void]$recentCastles.Add($action)
        if ($recentCastles.Count -gt 5) {
            $recentCastles.RemoveAt(0)
        }

        # OBSERVE NEXT STATE, then LEARN from (state, action, reward, next state)
        $nextState = $agent.GetState(@{ RecentTypes = $recentCastles })
        $agent.Learn($state, $action, $reward, $nextState)
    }

    # END EPISODE: report the episode's total reward to the agent
    $agent.EndEpisode($episodeReward)

    # Progress is printed for the first episode, the last one, and every 10th
    $isReportEpisode = ($ep -eq 1) -or ($ep -eq $episodes) -or ($ep % 10 -eq 0)
    if (-not $isReportEpisode) { continue }

    $stats        = $agent.GetStats()
    $totalActions = $stats.ExplorationCount + $stats.ExploitationCount

    # Share of exploiting actions so far; stays 0.0 when nothing was counted
    $exploitPct = 0.0
    if ($totalActions -gt 0) {
        $exploitPct = ($stats.ExploitationCount / $totalActions) * 100
    }

    Write-Host ($progressFormat -f $ep, $episodeReward, $stats.Epsilon, $exploitPct, $stats.QTableSize)
}

Write-Host ""
Write-Host "  Training complete!" -ForegroundColor Green

#  FINAL RESULTS
# Prints the summary statistics reported by the agent after training.
$headingColor = "Yellow"
$resultsRule  = "-" * 60

Write-Host ""
foreach ($headerLine in @($resultsRule, "FINAL RESULTS", $resultsRule)) {
    Write-Host $headerLine -ForegroundColor Cyan
}

# Snapshot of the agent's statistics; also read by the improvement check later.
$finalStats = $agent.GetStats()

# -- Reward summary --
Write-Host ""
Write-Host "Learning Progress:" -ForegroundColor $headingColor
Write-Host "  Total Episodes         : $($finalStats.Episode)"
Write-Host "  Total Reward           : $($finalStats.TotalReward.ToString('F2'))"
Write-Host "  Average Reward         : $($finalStats.AverageReward.ToString('F2'))"
Write-Host "  Recent Average (last 10): $($finalStats.RecentAverageReward.ToString('F2'))"

# -- How often the agent explored vs. exploited --
Write-Host ""
Write-Host "Exploration vs Exploitation:" -ForegroundColor $headingColor
Write-Host "  Explorations  : $($finalStats.ExplorationCount)  (random actions taken)"
Write-Host "  Exploitations : $($finalStats.ExploitationCount)  (learned actions taken)"
Write-Host "  Final Epsilon : $($finalStats.Epsilon.ToString('F3'))  (target: 0.010)"

# -- Size of what was learned / stored --
Write-Host ""
Write-Host "Knowledge Base:" -ForegroundColor $headingColor
Write-Host "  Q-Table Entries    : $($finalStats.QTableSize)  (state-action pairs learned)"
Write-Host "  Experiences Stored : $($finalStats.MemorySize)"

#  INSPECT WHAT WAS LEARNED
# A tabular agent can be read directly: for every state key in the Q-table,
# list the non-zero Q-values from highest to lowest. States whose values are
# all zero are skipped.
# (A DQN keeps its knowledge in network weights, which is much harder to inspect.)
Write-Host ""
Write-Host "Learned Q-Values by State:" -ForegroundColor Yellow
Write-Host "  (Positive = agent prefers this castle type in this state)" -ForegroundColor DarkGray
Write-Host "  (Negative = agent avoids this castle type in this state)" -ForegroundColor DarkGray

# State keys that have at least one non-zero Q-value.
$statesFound = [System.Collections.Generic.List[object]]::new()

foreach ($stateKey in $agent.QTable.Keys) {
    $qValues = $agent.GetQValues($stateKey)

    # Does this state hold any non-zero value at all?
    $hasLearning = $false
    foreach ($val in $qValues.Values) {
        if ($val -ne 0) {
            $hasLearning = $true
            break
        }
    }
    if (-not $hasLearning) { continue }

    $statesFound.Add($stateKey)

    Write-Host ""
    Write-Host "  State '$stateKey':" -ForegroundColor Cyan

    # Sort all entries first, then drop the zeros, so the printed order
    # matches a plain descending sort of the full set.
    $sorted = $qValues.GetEnumerator() | Sort-Object Value -Descending
    foreach ($item in ($sorted | Where-Object { $_.Value -ne 0 })) {
        # Green for positive, Red for negative, Gray for anything else
        $color = "Gray"
        if ($item.Value -gt 0) {
            $color = "Green"
        } elseif ($item.Value -lt 0) {
            $color = "Red"
        }
        Write-Host ("    {0,-15} {1,8:F4}" -f $item.Key, $item.Value) -ForegroundColor $color
    }
}

# Summary: either name the states that carry learned values or flag that
# nothing was learned.
Write-Host ""
if ($statesFound.Count -eq 0) {
    Write-Host "  No learning detected -- Q-Learning update may not be working." -ForegroundColor Red
} else {
    Write-Host "  Learning detected in states: $($statesFound -join ', ')" -ForegroundColor Green
}

#  IMPROVEMENT CHECK
# Compares the recent average reward with the overall average reward taken
# from $finalStats. A higher recent average means the later episodes scored
# better than the run as a whole.
$agentImproved = $finalStats.RecentAverageReward -gt $finalStats.AverageReward

Write-Host ""
if ($agentImproved) {
    Write-Host "  Agent IMPROVED -- recent rewards higher than overall average!" -ForegroundColor Green
    Write-Host "  Q-learning successfully shifted from exploration to exploitation." -ForegroundColor DarkGray
} else {
    Write-Host "  Agent performance stable -- try more episodes for further improvement." -ForegroundColor Yellow
}

Write-Host ""

# ============================================================================

# WHAT TO TRY NEXT:

# =================

# 1. Increase episodes to 500 -- watch Q-table grow and epsilon reach 0.01

# 2. Change stepsPerEpisode to 20 -- more interactions per episode

# 3. Print agent.QTable directly to see every learned value:

#       $agent.QTable | Format-Table

# 4. Compare with DQN on the same problem -- does the neural network learn faster?

# 5. Move on to: VBAF.Business.Test.CompanyMarket.ps1 -- multi-agent competition

# ============================================================================