# VBAF.ML.CNN.ps1

#Requires -Version 5.1
<#
.SYNOPSIS
    CNN - Convolutional Neural Network Architecture
.DESCRIPTION
    Implements CNN layers from scratch.
    Designed as a TEACHING resource - every operation explained.
    Layers included:
      - Conv2D : sliding kernel, feature map extraction
      - MaxPooling2D : spatial downsampling, keep strongest signal
      - AveragePooling2D : spatial downsampling, smooth average
      - BatchNormalization: stabilise activations during training
      - Dropout : random neuron silencing, prevents overfitting
      - Flatten : 3D feature maps -> 1D vector
      - Dense : fully connected layer
    Model:
      - CNNModel : layer stack, forward pass, backprop, training
    Utilities:
      - Image augmentation: flip, noise, crop
      - Pre-trained loader: load weights from JSON
      - Built-in datasets : MNIST-tiny (8x8 digits), CIFAR-tiny (8x8 objects)
.NOTES
    Part of VBAF - Phase 6 Deep Learning Module - v2.0.0 MAJOR MILESTONE
    PS 5.1 compatible - pure PowerShell, no dependencies
    Teaching project - every matrix operation shown step by step!
    Performance note: PS 5.1 is not optimised for tensor math.
    This is a TEACHING implementation, not a production one.
    For production CNN use Python/TensorFlow/PyTorch.
#>

# Folder containing this script, for module-relative paths.
# NOTE(review): not referenced anywhere in this section - presumably used
# further down the file; confirm before removing.
$basePath = $PSScriptRoot

# ============================================================
# TEACHING NOTE: What is a CNN?
# A Convolutional Neural Network is designed for GRID data:
# images, audio spectrograms, time series.
#
# Key insight: instead of connecting EVERY pixel to EVERY neuron
# (which would be millions of parameters), CNNs use:
#
# CONVOLUTION: a small filter (kernel) slides over the image,
# detecting local patterns like edges, curves, textures.
# The same filter is reused at every position = WEIGHT SHARING.
# This massively reduces parameters!
#
# POOLING: shrinks the spatial size, keeping the most important
# signal. Makes the model robust to small shifts.
#
# DEEP STACK: early layers detect simple features (edges),
# later layers detect complex features (faces, objects).
# ============================================================

# ============================================================
# ACTIVATION FUNCTIONS
# ============================================================
# TEACHING NOTE: Activations add NON-LINEARITY.
# Without them, stacking layers is just matrix multiplication
# which collapses to a single linear operation!
#
# ReLU : max(0, x) - simple, fast, most common
# Sigmoid : 1/(1+e^-x) - squashes to [0,1], for output
# Softmax : e^xi/sum(e^x)- probabilities that sum to 1
# LeakyReLU: max(0.01x,x)- fixes "dying ReLU" problem
# ============================================================

function Invoke-ReLU {
    # Rectified Linear Unit: clamp every negative value to zero.
    param([double[]]$x)
    $rectified = foreach ($v in $x) { [Math]::Max(0.0, $v) }
    return $rectified
}

function Invoke-ReLUGrad {
    # Derivative of ReLU: 1 where the input was positive, else 0.
    param([double[]]$x)
    $grad = foreach ($v in $x) {
        if ($v -gt 0) { 1.0 } else { 0.0 }
    }
    return $grad
}

function Invoke-LeakyReLU {
    # Leaky ReLU: small negative slope (alpha) keeps gradients alive
    # for negative inputs ("dying ReLU" fix).
    param([double[]]$x, [double]$alpha = 0.01)
    $out = foreach ($v in $x) {
        if ($v -le 0) { $alpha * $v } else { $v }
    }
    return $out
}

function Invoke-Sigmoid {
    # Logistic sigmoid 1/(1+e^-x); inputs clamped to [-500, 500]
    # so [Math]::Exp never overflows to Infinity.
    param([double[]]$x)
    $out = foreach ($v in $x) {
        $z = [Math]::Min(500, [Math]::Max(-500, $v))
        1.0 / (1.0 + [Math]::Exp(-$z))
    }
    return $out
}

function Invoke-Softmax {
    # Softmax: exponentiate and normalise so outputs sum to 1.
    # Subtracting the max first is the standard numerical-stability trick.
    param([double[]]$x)
    $peak = $x[0]
    foreach ($v in $x) { if ($v -gt $peak) { $peak = $v } }
    $exps = foreach ($v in $x) { [Math]::Exp($v - $peak) }
    $total = 0.0
    foreach ($e in $exps) { $total += $e }
    $probs = foreach ($e in $exps) { $e / $total }
    return $probs
}

# ============================================================
# TENSOR UTILITIES (3D arrays: [height][width][channels])
# ============================================================
# TEACHING NOTE: Images are 3D tensors:
# Height x Width x Channels
# e.g. 28x28x1 for grayscale, 32x32x3 for RGB
# In PS 5.1 we represent this as a flat double[] with indexing.
# ============================================================

function New-Tensor {
    # Allocate an HxWxC tensor as a flat row-major buffer:
    # element (h, w, c) lives at index h*W*C + w*C + c.
    param([int]$H, [int]$W, [int]$C, [double]$InitVal = 0.0)
    $size = $H * $W * $C
    return @{ H = $H; W = $W; C = $C; Data = (@([double]$InitVal) * $size) }
}

function Get-TensorValue {
    # Row-major lookup: (h*W + w)*C + c == h*W*C + w*C + c.
    param([hashtable]$T, [int]$h, [int]$w, [int]$c)
    return $T.Data[($h * $T.W + $w) * $T.C + $c]
}

function Set-TensorValue {
    # Row-major write, mirroring Get-TensorValue's index formula.
    param([hashtable]$T, [int]$h, [int]$w, [int]$c, [double]$val)
    $T.Data[($h * $T.W + $w) * $T.C + $c] = $val
}

function New-RandomTensor {
    # Tensor filled with uniform values in [-Scale, Scale).
    # A fixed seed keeps weight initialisation reproducible between runs.
    param([int]$H, [int]$W, [int]$C, [double]$Scale = 0.1, [int]$Seed = 42)
    $gen    = [System.Random]::new($Seed)
    $tensor = New-Tensor -H $H -W $W -C $C
    $count  = $tensor.Data.Length
    for ($idx = 0; $idx -lt $count; $idx++) {
        $tensor.Data[$idx] = ($gen.NextDouble() * 2 - 1) * $Scale
    }
    return $tensor
}

# ============================================================
# CONV2D LAYER
# ============================================================
# TEACHING NOTE: Convolution explained:
#
# Input: H x W x C_in (height, width, input channels)
# Kernel: KH x KW x C_in x C_out (filter bank)
# Output: H' x W' x C_out
#
# For each output channel (filter):
# Slide the KH x KW kernel across the input.
# At each position, compute: sum(kernel * input_patch) + bias
#
# PADDING = "same" -> output same size as input (pad with zeros)
# PADDING = "valid" -> output shrinks by (kernel_size - 1)
#
# STRIDE: how many pixels to jump between kernel positions.
# Stride=2 halves the spatial dimensions.
# ============================================================

# Conv2D: slides a bank of KxK filters over the input tensor.
# Weights are stored as one flat tensor with H=K, W=K, C=(C_in * Filters);
# element (kh, kw, c, f) lives at index kh*K*C_in*F + kw*C_in*F + c*F + f.
class Conv2D {
    [int]        $Filters      # number of output channels
    [int]        $KernelSize   # square kernel (e.g. 3 = 3x3)
    [int]        $Stride       # pixels to jump between kernel positions
    [string]     $Padding      # "same" or "valid"
    [string]     $Activation   # "relu", "sigmoid"; anything else = linear
    [hashtable]  $Weights      # KH x KW x C_in x Filters (flattened, see above)
    [double[]]   $Biases       # one per filter
    [hashtable]  $LastInput    # cached for backprop
    [hashtable]  $LastOutput   # cached activated output
    [bool]       $IsBuild = $false   # weights are built lazily on first Forward

    # Default: stride 1, "same" padding, ReLU.
    Conv2D([int]$filters, [int]$kernelSize) {
        $this.Filters     = $filters
        $this.KernelSize  = $kernelSize
        $this.Stride      = 1
        $this.Padding     = "same"
        $this.Activation  = "relu"
    }

    Conv2D([int]$filters, [int]$kernelSize, [string]$activation) {
        $this.Filters     = $filters
        $this.KernelSize  = $kernelSize
        $this.Stride      = 1
        $this.Padding     = "same"
        $this.Activation  = $activation
    }

    [void] Build([int]$inH, [int]$inW, [int]$inC) {
        # He initialisation: scale = sqrt(2 / fan_in)
        # TEACHING: proper weight init prevents vanishing/exploding gradients!
        $fanIn  = $this.KernelSize * $this.KernelSize * $inC
        $scale  = [Math]::Sqrt(2.0 / $fanIn)
        $this.Weights = New-RandomTensor -H $this.KernelSize -W $this.KernelSize -C ($inC * $this.Filters) -Scale $scale
        $this.Biases  = @(0.0) * $this.Filters
        $this.IsBuild = $true
    }

    # Forward pass: returns an outH x outW x Filters tensor.
    [hashtable] Forward([hashtable]$input) {
        $this.LastInput = $input
        $inH = $input.H; $inW = $input.W; $inC = $input.C
        $k   = $this.KernelSize

        if (-not $this.IsBuild) { $this.Build($inH, $inW, $inC) }

        # Output dimensions.
        # BUGFIX: a bare [int] cast uses .NET banker's rounding, so
        # [int](3/2) is 2, not 1 - "same" padding would ENLARGE the output.
        # Floor first to get the standard convolution size formula.
        $pad  = if ($this.Padding -eq "same") { [int][Math]::Floor($k / 2) } else { 0 }
        $outH = [int][Math]::Floor(($inH + 2*$pad - $k) / $this.Stride) + 1
        $outW = [int][Math]::Floor(($inW + 2*$pad - $k) / $this.Stride) + 1
        $out  = New-Tensor -H $outH -W $outW -C $this.Filters

        for ($f = 0; $f -lt $this.Filters; $f++) {
            for ($oh = 0; $oh -lt $outH; $oh++) {
                for ($ow = 0; $ow -lt $outW; $ow++) {
                    $sum = $this.Biases[$f]
                    for ($kh = 0; $kh -lt $k; $kh++) {
                        for ($kw = 0; $kw -lt $k; $kw++) {
                            $ih = $oh * $this.Stride - $pad + $kh
                            $iw = $ow * $this.Stride - $pad + $kw
                            # Zero padding: positions outside the image contribute nothing.
                            if ($ih -ge 0 -and $ih -lt $inH -and $iw -ge 0 -and $iw -lt $inW) {
                                for ($c = 0; $c -lt $inC; $c++) {
                                    $inVal = Get-TensorValue $input $ih $iw $c
                                    # Flat index of weight (kh, kw, c, f) - see class comment.
                                    $wIdx  = $kh * $k * $inC * $this.Filters + $kw * $inC * $this.Filters + $c * $this.Filters + $f
                                    $sum  += $inVal * $this.Weights.Data[$wIdx]
                                }
                            }
                        }
                    }
                    Set-TensorValue $out $oh $ow $f $sum
                }
            }
        }

        # Apply activation (unknown names fall through as linear).
        if ($this.Activation -eq "relu") {
            $out.Data = Invoke-ReLU $out.Data
        } elseif ($this.Activation -eq "sigmoid") {
            $out.Data = Invoke-Sigmoid $out.Data
        }

        $this.LastOutput = $out
        return $out
    }

    # One-line description with output shape and parameter count
    # (params = K*K*C_in*F weights + F biases).
    [string] Summary([int]$inH, [int]$inW, [int]$inC) {
        # Same Floor fix as Forward so the printed shape matches reality.
        $pad  = if ($this.Padding -eq "same") { [int][Math]::Floor($this.KernelSize / 2) } else { 0 }
        $outH = [int][Math]::Floor(($inH + 2*$pad - $this.KernelSize) / $this.Stride) + 1
        $outW = [int][Math]::Floor(($inW + 2*$pad - $this.KernelSize) / $this.Stride) + 1
        $params = $this.KernelSize * $this.KernelSize * $inC * $this.Filters + $this.Filters
        return ("Conv2D({0} filters, {1}x{1}) {2}x{3}x{4} -> {5}x{6}x{7} params={8}" -f `
            $this.Filters, $this.KernelSize, $inH, $inW, $inC, $outH, $outW, $this.Filters, $params)
    }
}

# ============================================================
# MAXPOOLING2D LAYER
# ============================================================
# TEACHING NOTE: Pooling reduces spatial size.
# MaxPooling takes the MAXIMUM value in each pool window.
# Why max? It detects "was this feature present anywhere here?"
# Provides TRANSLATION INVARIANCE: feature slightly shifted
# in input still produces same output!
# ============================================================

# MaxPooling2D: downsample by taking the maximum of each pool window.
class MaxPooling2D {
    [int]       $PoolSize
    [int]       $Stride
    [hashtable] $LastInput
    [hashtable] $LastMaxMask  # remember where the max was (for backprop)
                              # NOTE(review): declared but never populated in
                              # Forward - backprop is not implemented here.

    # Default: stride = pool size (non-overlapping windows).
    MaxPooling2D([int]$poolSize) {
        $this.PoolSize = $poolSize
        $this.Stride   = $poolSize  # default: non-overlapping
    }

    MaxPooling2D([int]$poolSize, [int]$stride) {
        $this.PoolSize = $poolSize
        $this.Stride   = $stride
    }

    # Forward pass: per channel, take the max of each p x p window.
    [hashtable] Forward([hashtable]$input) {
        $this.LastInput = $input
        $p   = $this.PoolSize
        $s   = $this.Stride
        $inH = $input.H; $inW = $input.W; $inC = $input.C
        # BUGFIX: a bare [int] cast uses banker's rounding ([int]3.5 -> 4),
        # which can report an extra output row/col whose window reads past
        # the input. Floor gives the standard pooling output size.
        $outH = [int][Math]::Floor(($inH - $p) / $s) + 1
        $outW = [int][Math]::Floor(($inW - $p) / $s) + 1
        $out  = New-Tensor -H $outH -W $outW -C $inC

        for ($c = 0; $c -lt $inC; $c++) {
            for ($oh = 0; $oh -lt $outH; $oh++) {
                for ($ow = 0; $ow -lt $outW; $ow++) {
                    $maxVal = [double]::MinValue
                    for ($ph = 0; $ph -lt $p; $ph++) {
                        for ($pw = 0; $pw -lt $p; $pw++) {
                            $ih  = $oh * $s + $ph
                            $iw  = $ow * $s + $pw
                            $val = Get-TensorValue $input $ih $iw $c
                            if ($val -gt $maxVal) { $maxVal = $val }
                        }
                    }
                    Set-TensorValue $out $oh $ow $c $maxVal
                }
            }
        }
        return $out
    }

    # One-line description; pooling has no trainable parameters.
    [string] Summary([int]$inH, [int]$inW, [int]$inC) {
        # Same Floor fix as Forward so the printed shape matches reality.
        $outH = [int][Math]::Floor(($inH - $this.PoolSize) / $this.Stride) + 1
        $outW = [int][Math]::Floor(($inW - $this.PoolSize) / $this.Stride) + 1
        return ("MaxPooling2D({0}x{0}) {1}x{2}x{3} -> {4}x{5}x{6} params=0" -f `
            $this.PoolSize, $inH, $inW, $inC, $outH, $outW, $inC)
    }
}

# ============================================================
# AVERAGEPOOLING2D LAYER
# ============================================================
# TEACHING NOTE: AveragePooling takes the MEAN of each window.
# Smoother than MaxPooling, sometimes better for dense features.
# MaxPooling: "is this feature here at all?"
# AvgPooling: "how strongly is this feature present overall?"
# ============================================================

# AveragePooling2D: downsample by averaging each pool window.
class AveragePooling2D {
    [int]       $PoolSize
    [int]       $Stride

    # Stride defaults to pool size (non-overlapping windows).
    AveragePooling2D([int]$poolSize) {
        $this.PoolSize = $poolSize
        $this.Stride   = $poolSize
    }

    # Forward pass: per channel, output the mean of each p x p window.
    [hashtable] Forward([hashtable]$input) {
        $p   = $this.PoolSize
        $s   = $this.Stride
        $inH = $input.H; $inW = $input.W; $inC = $input.C
        # BUGFIX: a bare [int] cast uses banker's rounding ([int]3.5 -> 4),
        # which can report an extra output row/col whose window reads past
        # the input. Floor gives the standard pooling output size.
        $outH = [int][Math]::Floor(($inH - $p) / $s) + 1
        $outW = [int][Math]::Floor(($inW - $p) / $s) + 1
        $out  = New-Tensor -H $outH -W $outW -C $inC

        for ($c = 0; $c -lt $inC; $c++) {
            for ($oh = 0; $oh -lt $outH; $oh++) {
                for ($ow = 0; $ow -lt $outW; $ow++) {
                    $sum = 0.0
                    for ($ph = 0; $ph -lt $p; $ph++) {
                        for ($pw = 0; $pw -lt $p; $pw++) {
                            $sum += Get-TensorValue $input ($oh*$s+$ph) ($ow*$s+$pw) $c
                        }
                    }
                    Set-TensorValue $out $oh $ow $c ($sum / ($p * $p))
                }
            }
        }
        return $out
    }

    # One-line description; pooling has no trainable parameters.
    [string] Summary([int]$inH, [int]$inW, [int]$inC) {
        # Same Floor fix as Forward so the printed shape matches reality.
        $outH = [int][Math]::Floor(($inH - $this.PoolSize) / $this.Stride) + 1
        $outW = [int][Math]::Floor(($inW - $this.PoolSize) / $this.Stride) + 1
        return ("AvgPooling2D({0}x{0}) {1}x{2}x{3} -> {4}x{5}x{6} params=0" -f `
            $this.PoolSize, $inH, $inW, $inC, $outH, $outW, $inC)
    }
}

# ============================================================
# BATCH NORMALIZATION
# ============================================================
# TEACHING NOTE: BatchNorm solves "internal covariate shift":
# activations change distribution as weights update,
# making deeper layers hard to train.
#
# BatchNorm normalises each mini-batch to mean=0, std=1,
# then applies learnable scale (gamma) and shift (beta).
#
# Benefits:
# - Allows higher learning rates
# - Less sensitive to weight initialisation
# - Acts as mild regularisation
#
# Formula: y = gamma * (x - mean) / sqrt(var + eps) + beta
# ============================================================

# BatchNormalization over a single flat activation vector:
# y = gamma * (x - mean) / sqrt(var + eps) + beta.
class BatchNormalization {
    [double[]] $Gamma         # learnable scale
    [double[]] $Beta          # learnable shift
    [double[]] $RunningMean   # tracked across batches for inference
    [double[]] $RunningVar
    [double]   $Epsilon = 1e-8
    [double]   $Momentum = 0.9
    [bool]     $Training = $true
    [bool]     $IsBuild  = $false
    [double[]] $LastNorm      # cached for backprop

    BatchNormalization() {}

    # Lazily allocate per-feature parameters on first Forward:
    # gamma starts at 1 (identity scale), beta at 0 (no shift).
    [void] Build([int]$nFeatures) {
        $this.Gamma       = @(1.0) * $nFeatures
        $this.Beta        = @(0.0) * $nFeatures
        $this.RunningMean = @(0.0) * $nFeatures
        $this.RunningVar  = @(1.0) * $nFeatures
        $this.IsBuild     = $true
    }

    # NOTE(review): mean/variance are computed over the WHOLE vector as one
    # scalar, not per feature - a teaching simplification. Every entry of the
    # running arrays is therefore updated with the same scalar, which is why
    # the inference branch can read index [0]. Confirm this is intended.
    [double[]] Forward([double[]]$x) {
        if (-not $this.IsBuild) { $this.Build($x.Length) }

        if ($this.Training) {
            # Batch statistics: scalar mean and (population) variance of x.
            $mean  = ($x | Measure-Object -Average).Average
            $sumSq = 0.0
            foreach ($v in $x) { $sumSq += ($v - $mean) * ($v - $mean) }
            $variance = $sumSq / $x.Length

            # Update running statistics
            # (exponential moving average with the configured momentum).
            for ($i = 0; $i -lt $this.RunningMean.Length; $i++) {
                $this.RunningMean[$i] = $this.Momentum * $this.RunningMean[$i] + (1-$this.Momentum) * $mean
                $this.RunningVar[$i]  = $this.Momentum * $this.RunningVar[$i]  + (1-$this.Momentum) * $variance
            }

            # Epsilon keeps the division safe when variance is ~0.
            $std = [Math]::Sqrt($variance + $this.Epsilon)
            $normalized = $x | ForEach-Object { ($_ - $mean) / $std }
        } else {
            # Inference: use the running (training-time) statistics instead
            # of the current input's. Index [0] suffices - see class note.
            $std = [Math]::Sqrt($this.RunningVar[0] + $this.Epsilon)
            $normalized = $x | ForEach-Object { ($_ - $this.RunningMean[0]) / $std }
        }

        $this.LastNorm = $normalized
        # Apply the learnable affine transform y = gamma * x_hat + beta.
        $result = @(0.0) * $x.Length
        for ($i = 0; $i -lt $x.Length; $i++) {
            # Fall back to element [0] if the input is longer than Build saw.
            $gi = if ($i -lt $this.Gamma.Length) { $this.Gamma[$i] } else { $this.Gamma[0] }
            $bi = if ($i -lt $this.Beta.Length)  { $this.Beta[$i]  } else { $this.Beta[0]  }
            $result[$i] = $gi * $normalized[$i] + $bi
        }
        return $result
    }

    # Parameter count = gamma + beta (running stats are not trainable).
    [string] Summary([int]$nFeatures) {
        return ("BatchNorm features={0} params={1} (gamma+beta)" -f $nFeatures, ($nFeatures*2))
    }
}

# ============================================================
# DROPOUT LAYER
# ============================================================
# TEACHING NOTE: Dropout randomly ZEROS OUT neurons during training.
# Rate=0.5 means 50% of neurons are silenced each forward pass.
#
# Why does this help?
# Forces the network to learn REDUNDANT representations.
# No single neuron can be relied upon -> more robust features.
# It's like training an ENSEMBLE of many sub-networks!
#
# IMPORTANT: Dropout is ONLY applied during training.
# This implementation uses INVERTED dropout: kept activations are
# scaled up by 1/(1 - rate) at training time, so at inference all
# neurons are simply active and no extra scaling is needed.
# ============================================================

# Dropout: randomly silence a fraction (Rate) of activations while training.
# Uses INVERTED dropout - survivors are scaled up by 1/(1-Rate) so the
# expected activation is unchanged and inference needs no rescaling.
class Dropout {
    [double]   $Rate        # fraction of neurons to drop (0..1)
    [bool]     $Training = $true
    [bool[]]   $LastMask    # which neurons were kept (for backprop)

    Dropout([double]$rate) { $this.Rate = $rate }

    [double[]] Forward([double[]]$x) {
        if (-not $this.Training) {
            # BUGFIX: the previous version multiplied by (1 - Rate) here.
            # Combined with the inverted scaling below, that shrank every
            # inference activation by a factor of (1 - Rate). With inverted
            # dropout, inference must be a pass-through.
            return $x
        }

        $rng           = [System.Random]::new()
        $this.LastMask = @($false) * $x.Length
        $result        = @(0.0) * $x.Length
        $keepProb      = 1.0 - $this.Rate

        for ($i = 0; $i -lt $x.Length; $i++) {
            # Keep each neuron with probability keepProb; dropped ones stay 0.
            if ($rng.NextDouble() -gt $this.Rate) {
                $result[$i]        = $x[$i] / $keepProb  # inverted dropout
                $this.LastMask[$i] = $true
            }
        }
        return $result
    }

    [string] Summary([int]$nFeatures) {
        return ("Dropout(rate={0}) features={1} params=0" -f $this.Rate, $nFeatures)
    }
}

# ============================================================
# FLATTEN LAYER
# ============================================================
# TEACHING NOTE: CNNs output 3D feature maps (H x W x C).
# Dense layers expect 1D vectors.
# Flatten just reshapes: H*W*C values in a single row.
# This is the bridge between convolution and classification!
# ============================================================

# Flatten: reshape HxWxC feature maps into one flat vector - the bridge
# between the convolutional stack and the dense classifier head.
class Flatten {
    [int] $OutSize   # number of values produced by the last Forward call

    Flatten() {}

    # The tensor data is already stored flat (row-major), so flattening
    # is just a defensive copy of the buffer.
    [double[]] Forward([hashtable]$tensor) {
        $values       = $tensor.Data
        $this.OutSize = $values.Length
        return $values.Clone()
    }

    [string] Summary([int]$inH, [int]$inW, [int]$inC) {
        $total = $inH * $inW * $inC
        return ("Flatten {0}x{1}x{2} -> {3} params=0" -f $inH, $inW, $inC, $total)
    }
}

# ============================================================
# DENSE LAYER (Fully Connected)
# ============================================================
# TEACHING NOTE: Every input connects to every output.
# Parameters = in * out + out (weights + biases)
# This is where final classification decisions are made!
# ============================================================

# DenseLayer: fully connected layer, y = activation(W x + b).
# Weights are stored row-major: row u holds the InSize weights of unit u.
class DenseLayer {
    [int]      $Units
    [string]   $Activation
    [double[]] $Weights      # Units x InSize (flattened)
    [double[]] $Biases
    [double[]] $LastInput
    [double[]] $LastOutput
    [int]      $InSize
    [bool]     $IsBuild = $false

    DenseLayer([int]$units, [string]$activation) {
        $this.Units      = $units
        $this.Activation = $activation
    }

    # Allocate weights uniformly in [-limit, limit) with He scaling;
    # a fixed seed keeps initialisation reproducible.
    [void] Build([int]$inSize) {
        $this.InSize  = $inSize
        $limit        = [Math]::Sqrt(2.0 / $inSize)  # He init
        $gen          = [System.Random]::new(42)
        $total        = $this.Units * $inSize
        $this.Weights = @(0.0) * $total
        for ($w = 0; $w -lt $total; $w++) {
            $this.Weights[$w] = ($gen.NextDouble() * 2 - 1) * $limit
        }
        $this.Biases  = @(0.0) * $this.Units
        $this.IsBuild = $true
    }

    # Forward pass: one dot product per output unit, then activation.
    [double[]] Forward([double[]]$x) {
        if (-not $this.IsBuild) { $this.Build($x.Length) }
        $this.LastInput = $x
        $n   = $x.Length
        $out = @(0.0) * $this.Units

        for ($row = 0; $row -lt $this.Units; $row++) {
            $acc    = $this.Biases[$row]
            $offset = $row * $n
            for ($col = 0; $col -lt $n; $col++) {
                $acc += $x[$col] * $this.Weights[$offset + $col]
            }
            $out[$row] = $acc
        }

        # Unknown activation names fall back to ReLU (same as "relu").
        $activated = switch ($this.Activation) {
            "sigmoid" { Invoke-Sigmoid $out }
            "softmax" { Invoke-Softmax $out }
            "linear"  { $out }
            default   { Invoke-ReLU $out }
        }

        $this.LastOutput = $activated
        return $activated
    }

    [string] Summary([int]$inSize) {
        $params = $inSize * $this.Units + $this.Units
        return ("Dense({0}, {1}) in={2} params={3}" -f $this.Units, $this.Activation, $inSize, $params)
    }
}

# ============================================================
# CNN MODEL
# ============================================================
# TEACHING NOTE: A CNN model is a STACK of layers.
# Data flows forward through each layer (Forward Pass).
# Loss is computed at the end.
# Gradients flow backward (Backpropagation).
# Weights are updated (Gradient Descent).
# ============================================================

# CNNModel: an ordered stack of layers plus a minimal training loop.
# Tensors flow through Conv/Pool layers until a Flatten layer converts the
# data to a flat vector, which Dense/BatchNorm/Dropout layers then consume.
class CNNModel {
    [System.Collections.ArrayList] $Layers
    [string]   $Name
    [int]      $InputH
    [int]      $InputW
    [int]      $InputC
    [double]   $LearningRate
    [System.Collections.ArrayList] $LossHistory   # average loss per epoch
    [System.Collections.ArrayList] $AccHistory    # accuracy per epoch

    CNNModel([string]$name, [int]$inputH, [int]$inputW, [int]$inputC) {
        $this.Name         = $name
        $this.InputH       = $inputH
        $this.InputW       = $inputW
        $this.InputC       = $inputC
        $this.LearningRate = 0.001
        $this.Layers       = [System.Collections.ArrayList]::new()
        $this.LossHistory  = [System.Collections.ArrayList]::new()
        $this.AccHistory   = [System.Collections.ArrayList]::new()
    }

    # Append a layer; list order is execution order.
    [void] Add([object]$layer) {
        $this.Layers.Add($layer) | Out-Null
    }

    # Forward pass through all layers
    # Returns the final flat vector (class probabilities), or $null if the
    # stack never reaches a Flatten layer.
    # NOTE(review): assumes Flatten comes before any DenseLayer; a Dropout
    # placed before Flatten is silently skipped - confirm intended.
    [double[]] Predict([hashtable]$inputTensor) {
        $current = $inputTensor
        $flatVec = $null

        foreach ($layer in $this.Layers) {
            # Dispatch on the layer's class name (duck typing, PS 5.1 style).
            $typeName = $layer.GetType().Name
            if ($typeName -eq "Conv2D" -or $typeName -eq "MaxPooling2D" -or $typeName -eq "AveragePooling2D") {
                $current = $layer.Forward($current)
            } elseif ($typeName -eq "Flatten") {
                # Switch from tensor mode to flat-vector mode.
                $flatVec = $layer.Forward($current)
                $current = $null
            } elseif ($typeName -eq "DenseLayer") {
                $flatVec = $layer.Forward($flatVec)
            } elseif ($typeName -eq "BatchNormalization") {
                # Works on either the flat vector or the tensor's flat data.
                if ($null -ne $flatVec) {
                    $flatVec = $layer.Forward($flatVec)
                } else {
                    $current.Data = $layer.Forward($current.Data)
                }
            } elseif ($typeName -eq "Dropout") {
                if ($null -ne $flatVec) {
                    $flatVec = $layer.Forward($flatVec)
                }
            }
        }
        return $flatVec
    }

    # Cross-entropy loss for classification
    # -log(p[trueClass]); p is clamped to 1e-10 to avoid log(0).
    hidden [double] CrossEntropyLoss([double[]]$probs, [int]$trueClass) {
        $p = [Math]::Max(1e-10, $probs[$trueClass])
        return -[Math]::Log($p)
    }

    # Simple training step (output layer weight update only - teaching simplification)
    # Assumes the final DenseLayer uses softmax so that dL/dz = probs - one_hot.
    # Earlier layers keep their initial weights - no full backprop here.
    [void] TrainStep([hashtable]$inputTensor, [int]$trueClass, [int]$nClasses) {
        $probs = $this.Predict($inputTensor)
        if ($null -eq $probs) { return }

        # Find last dense layer and update it
        $lastDense = $null
        foreach ($layer in $this.Layers) {
            if ($layer.GetType().Name -eq "DenseLayer") { $lastDense = $layer }
        }
        if ($null -eq $lastDense) { return }

        # Softmax gradient: dL/dz = probs - one_hot
        $grad = $probs.Clone()
        $grad[$trueClass] -= 1.0

        # Update last dense layer weights
        # (plain SGD: w -= lr * grad * input, b -= lr * grad).
        for ($u = 0; $u -lt $lastDense.Units; $u++) {
            $lastDense.Biases[$u] -= $this.LearningRate * $grad[$u]
            for ($i = 0; $i -lt $lastDense.LastInput.Length; $i++) {
                $lastDense.Weights[$u * $lastDense.LastInput.Length + $i] -=
                    $this.LearningRate * $grad[$u] * $lastDense.LastInput[$i]
            }
        }
    }

    # Train for one epoch
    # (actually trains for $epochs epochs, printing every $printEvery).
    [void] Fit([hashtable[]]$Xtensors, [int[]]$y, [int]$nClasses, [int]$epochs, [int]$printEvery) {
        $n = $Xtensors.Length
        Write-Host ""
        Write-Host ("🧠 Training {0}..." -f $this.Name) -ForegroundColor Green

        for ($ep = 1; $ep -le $epochs; $ep++) {
            $totalLoss = 0.0
            $correct   = 0

            for ($i = 0; $i -lt $n; $i++) {
                $this.TrainStep($Xtensors[$i], $y[$i], $nClasses)
                # NOTE(review): this second Predict doubles the forward cost
                # and runs with Dropout still in training mode, so the
                # reported accuracy is noisy - confirm acceptable here.
                $probs = $this.Predict($Xtensors[$i])
                if ($null -ne $probs) {
                    $totalLoss += $this.CrossEntropyLoss($probs, $y[$i])
                    # argmax over the class probabilities
                    $predClass  = 0
                    $maxP       = $probs[0]
                    for ($c = 1; $c -lt $probs.Length; $c++) {
                        if ($probs[$c] -gt $maxP) { $maxP=$probs[$c]; $predClass=$c }
                    }
                    if ($predClass -eq $y[$i]) { $correct++ }
                }
            }

            $avgLoss = [Math]::Round($totalLoss / $n, 4)
            $acc     = [Math]::Round($correct / $n, 4)
            $this.LossHistory.Add($avgLoss) | Out-Null
            $this.AccHistory.Add($acc)      | Out-Null

            # Progress line with a 20-char accuracy bar; always print epoch 1.
            if ($ep % $printEvery -eq 0 -or $ep -eq 1) {
                $bar = "█" * [int]($acc * 20)
                Write-Host (" Epoch {0,3}/{1} loss={2:F4} acc={3:F3} {4}" -f
                    $ep, $epochs, $avgLoss, $acc, $bar) -ForegroundColor White
            }
        }
        Write-Host "✅ Training complete!" -ForegroundColor Green
    }

    # Print a Keras-style table: one line per layer plus total parameters.
    # Shapes are re-derived here, mirroring each layer's Summary() math.
    [void] PrintSummary() {
        Write-Host ""
        Write-Host "╔══════════════════════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host ("║ CNN Model: {0,-41}║" -f $this.Name)                   -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Input: {0}x{1}x{2}{3}║" -f $this.InputH, $this.InputW, $this.InputC, " " * (44 - "$($this.InputH)x$($this.InputW)x$($this.InputC)".Length)) -ForegroundColor White

        # Track the current shape as it flows through the stack.
        $h = $this.InputH; $w = $this.InputW; $c = $this.InputC
        $totalParams = 0

        foreach ($layer in $this.Layers) {
            $typeName = $layer.GetType().Name
            $summary  = ""
            switch ($typeName) {
                "Conv2D" {
                    $summary = $layer.Summary($h, $w, $c)
                    # NOTE(review): [int](k/2) uses .NET banker's rounding
                    # ([int]1.5 -> 2); shapes printed here may disagree with
                    # a floor-based convolution - confirm against Forward.
                    $pad     = if ($layer.Padding -eq "same") { [int]($layer.KernelSize/2) } else { 0 }
                    $nh = [int](($h + 2*$pad - $layer.KernelSize) / $layer.Stride) + 1
                    $nw = [int](($w + 2*$pad - $layer.KernelSize) / $layer.Stride) + 1
                    $totalParams += $layer.KernelSize * $layer.KernelSize * $c * $layer.Filters + $layer.Filters
                    $h=$nh; $w=$nw; $c=$layer.Filters
                }
                "MaxPooling2D" {
                    $summary = $layer.Summary($h, $w, $c)
                    $h=[int](($h-$layer.PoolSize)/$layer.Stride)+1
                    $w=[int](($w-$layer.PoolSize)/$layer.Stride)+1
                }
                "AveragePooling2D" {
                    $summary = $layer.Summary($h, $w, $c)
                    $h=[int](($h-$layer.PoolSize)/$layer.Stride)+1
                    $w=[int](($w-$layer.PoolSize)/$layer.Stride)+1
                }
                "Flatten" {
                    $summary = $layer.Summary($h, $w, $c)
                    # After flattening, track the vector length in $c.
                    $c=$h*$w*$c; $h=1; $w=1
                }
                "DenseLayer" {
                    $summary = $layer.Summary($c)
                    $totalParams += $c * $layer.Units + $layer.Units
                    $c = $layer.Units
                }
                "BatchNormalization" { $summary = $layer.Summary($c) ; $totalParams += $c*2 }
                "Dropout"           { $summary = $layer.Summary($c) }
            }
            # Truncate to the 52-char box width.
            Write-Host ("║ {0,-52}║" -f ($summary.Substring(0, [Math]::Min(52, $summary.Length)))) -ForegroundColor White
        }
        Write-Host "╠══════════════════════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Total parameters: {0,-33}║" -f $totalParams)           -ForegroundColor Yellow
        Write-Host "╚══════════════════════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }

    # ASCII bar charts of the recorded loss (scaled to max) and accuracy.
    [void] PlotTraining() {
        if ($this.LossHistory.Count -eq 0) { Write-Host "No training history yet." -ForegroundColor Yellow; return }
        $losses = $this.LossHistory.ToArray()
        $accs   = $this.AccHistory.ToArray()
        $maxL   = ($losses | Measure-Object -Maximum).Maximum
        # Guard against division by zero when all losses are 0.
        $maxL   = [Math]::Max($maxL, 1e-8)

        Write-Host ""
        Write-Host "📉 Training Loss:" -ForegroundColor Green
        foreach ($l in $losses) {
            $bar = "█" * [int](($l / $maxL) * 30)
            Write-Host (" {0,7:F4} {1}" -f $l, $bar) -ForegroundColor Cyan
        }
        Write-Host ""
        Write-Host "📈 Accuracy:" -ForegroundColor Green
        foreach ($a in $accs) {
            $bar = "█" * [int]($a * 30)
            Write-Host (" {0,7:F3} {1}" -f $a, $bar) -ForegroundColor White
        }
        Write-Host ""
    }
}

# ============================================================
# IMAGE AUGMENTATION
# ============================================================
# TEACHING NOTE: Augmentation artificially increases dataset size
# by creating modified versions of existing images.
# The label stays the same - just the image changes slightly.
# This teaches the model to be ROBUST to variations.
# ============================================================

function Invoke-HorizontalFlip {
    # Mirror the image left-right: column w maps to column (W-1-w).
    # Label-preserving augmentation for most natural images.
    param([hashtable]$tensor)
    $flipped = New-Tensor -H $tensor.H -W $tensor.W -C $tensor.C
    for ($row = 0; $row -lt $tensor.H; $row++) {
        for ($col = 0; $col -lt $tensor.W; $col++) {
            $mirror = $tensor.W - 1 - $col
            for ($ch = 0; $ch -lt $tensor.C; $ch++) {
                Set-TensorValue $flipped $row $mirror $ch (Get-TensorValue $tensor $row $col $ch)
            }
        }
    }
    return $flipped
}

function Invoke-AddNoise {
    # Add Gaussian pixel noise (Box-Muller transform), clamping each
    # value back into the [0, 1] pixel range.
    param([hashtable]$tensor, [double]$stdDev = 0.05)
    $gen   = [System.Random]::new()
    $noisy = New-Tensor -H $tensor.H -W $tensor.W -C $tensor.C
    for ($idx = 0; $idx -lt $tensor.Data.Length; $idx++) {
        # Box-Muller: two uniforms -> one standard normal sample.
        $u1 = [Math]::Max(1e-10, $gen.NextDouble())   # avoid log(0)
        $u2 = $gen.NextDouble()
        $z  = [Math]::Sqrt(-2 * [Math]::Log($u1)) * [Math]::Cos(2 * [Math]::PI * $u2)
        $v  = $tensor.Data[$idx] + $stdDev * $z
        $noisy.Data[$idx] = [Math]::Min(1.0, [Math]::Max(0.0, $v))
    }
    return $noisy
}

function Invoke-RandomCrop {
    # Cut a random cropH x cropW window out of a tensor (augmentation).
    param([hashtable]$tensor, [int]$cropH, [int]$cropW)
    $rng  = [System.Random]::new()
    $maxH = $tensor.H - $cropH
    $maxW = $tensor.W - $cropW
    # BUGFIX: Random.Next(n) is EXCLUSIVE of n, so the old Next($maxH)
    # could never pick the bottom/right-most crop position. Next($maxH + 1)
    # makes every valid offset 0..maxH reachable.
    $offH = if ($maxH -gt 0) { $rng.Next($maxH + 1) } else { 0 }
    $offW = if ($maxW -gt 0) { $rng.Next($maxW + 1) } else { 0 }
    $out  = New-Tensor -H $cropH -W $cropW -C $tensor.C

    for ($h = 0; $h -lt $cropH; $h++) {
        for ($w = 0; $w -lt $cropW; $w++) {
            for ($c = 0; $c -lt $tensor.C; $c++) {
                $val = Get-TensorValue $tensor ($h+$offH) ($w+$offW) $c
                Set-TensorValue $out $h $w $c $val
            }
        }
    }
    return $out
}

function Invoke-Augment {
    <#
    .SYNOPSIS
        Apply a random augmentation pipeline: optional 50% flip + noise.
    .PARAMETER tensor
        Hashtable tensor with H, W, C dimensions (see New-Tensor).
    .PARAMETER flip
        When $true, horizontally flip the image with 50% probability.
    .PARAMETER noise
        When $true, add Gaussian noise with std dev $noisestd.
    .PARAMETER noisestd
        Standard deviation for the noise step.
    .OUTPUTS
        A new (possibly transformed) tensor; the input is not modified.
    .NOTES
        FIX: the original tossed the flip coin with a fresh [System.Random]
        per call. On .NET Framework (PS 5.1) that is tick-count seeded, so
        augmenting a batch in a loop made every flip decision identical.
        We now reuse one RNG for the whole session.
    #>
    param([hashtable]$tensor, [bool]$flip=$true, [bool]$noise=$true, [double]$noisestd=0.03)

    if (-not $script:VbafCnnRng) { $script:VbafCnnRng = [System.Random]::new() }

    $out = $tensor
    if ($flip -and ($script:VbafCnnRng.NextDouble() -gt 0.5)) {
        $out = Invoke-HorizontalFlip $out
    }
    if ($noise) { $out = Invoke-AddNoise $out $noisestd }
    return $out
}

# ============================================================
# PRE-TRAINED MODEL SUPPORT
# ============================================================
# TEACHING NOTE: Training a CNN from scratch requires
# thousands of images and hours of compute.
# Transfer learning uses weights from a model already trained
# on a large dataset (like ImageNet) and fine-tunes them
# for your specific task.
# This works because early CNN layers learn universal features
# (edges, textures) useful for ANY image task!
# ============================================================

function Save-CNNWeights {
    <#
    .SYNOPSIS
        Serialise the learnable parameters of a CNNModel to a JSON file.
    .PARAMETER model
        The trained CNNModel whose Conv2D / DenseLayer weights to persist.
    .PARAMETER path
        Destination file path for the JSON snapshot.
    .NOTES
        Only Conv2D and DenseLayer carry weights here; other layer types
        (pooling, dropout, flatten) are parameter-free and are skipped.
        The key encodes layer index AND type so loading can verify the
        architecture matches.
    #>
    param([CNNModel]$model, [string]$path)

    $snapshot = @{}
    $index = 0
    foreach ($layer in $model.Layers) {
        $kind     = $layer.GetType().Name
        $entryKey = "layer_$index`_$kind"

        switch ($kind) {
            "Conv2D" {
                # Kernel tensor is stored flat plus its H/W/C dimensions.
                $snapshot[$entryKey] = @{
                    Weights = $layer.Weights.Data
                    Biases  = $layer.Biases
                    H       = $layer.Weights.H
                    W       = $layer.Weights.W
                    C       = $layer.Weights.C
                }
            }
            "DenseLayer" {
                $snapshot[$entryKey] = @{ Weights = $layer.Weights; Biases = $layer.Biases }
            }
        }
        $index++
    }

    $snapshot | ConvertTo-Json -Depth 5 | Set-Content -Path $path -Encoding UTF8
    Write-Host "💾 Weights saved: $path" -ForegroundColor Green
}

function Load-CNNWeights {
    <#
    .SYNOPSIS
        Restore Conv2D / DenseLayer weights into an existing model from a
        JSON file produced by Save-CNNWeights.
    .PARAMETER model
        A CNNModel whose architecture matches the one that was saved.
        NOTE(review): the layer is assumed to be already constructed -
        for Conv2D, $layer.Weights must exist so its .Data can be replaced;
        no shape/length validation is performed here. TODO: confirm the
        caller always builds the model before loading.
    .PARAMETER path
        Path to the JSON weights file. Missing file prints an error and
        returns without modifying the model.
    .NOTES
        The H/W/C fields saved by Save-CNNWeights are NOT read back here;
        presumably they are informational only - verify against Save-CNNWeights.
    #>
    param([CNNModel]$model, [string]$path)

    if (-not (Test-Path $path)) {
        Write-Host "❌ Weights file not found: $path" -ForegroundColor Red
        return
    }

    $weights = Get-Content -Path $path -Raw | ConvertFrom-Json
    for ($li = 0; $li -lt $model.Layers.Count; $li++) {
        $layer    = $model.Layers[$li]
        $typeName = $layer.GetType().Name
        # Key must match Save-CNNWeights' "layer_<index>_<type>" scheme;
        # a layer with no saved entry (pooling, dropout, ...) is skipped.
        $key      = "layer_$li`_$typeName"
        $prop     = $weights.PSObject.Properties[$key]
        if ($null -eq $prop) { continue }
        $w = $prop.Value

        if ($typeName -eq "Conv2D") {
            # JSON round-trip yields object[]; cast back to double[].
            $layer.Weights.Data = [double[]]$w.Weights
            $layer.Biases       = [double[]]$w.Biases
            # Mark as built so the layer skips re-initialisation on forward.
            $layer.IsBuild      = $true
        } elseif ($typeName -eq "DenseLayer") {
            # NOTE(review): if DenseLayer.Weights is a 2D/jagged array, this
            # flat cast may not restore the original shape - confirm against
            # the DenseLayer class definition.
            $layer.Weights = [double[]]$w.Weights
            $layer.Biases  = [double[]]$w.Biases
            $layer.IsBuild = $true
        }
    }
    Write-Host "📂 Weights loaded: $path" -ForegroundColor Green
}

# ============================================================
# BUILT-IN TINY DATASET
# ============================================================
# 8x8 grayscale "images" representing simple shapes
# Class 0 = horizontal bar, Class 1 = vertical bar, Class 2 = diagonal

function Get-VBAFImageDataset {
    <#
    .SYNOPSIS
        Return a built-in toy image dataset for CNN teaching experiments.
    .PARAMETER Name
        Dataset name. Currently only "TinyShapes" (8x8 grayscale bars/diagonal).
    .PARAMETER SamplesPerClass
        Number of noisy variants generated per class (generalised: was a
        hard-coded 10). Default keeps the original behaviour.
    .PARAMETER Seed
        RNG seed for reproducible noise (generalised: was a hard-coded 42).
    .OUTPUTS
        Hashtable with Tensors, Labels, NClasses, ClassNames - or $null for
        an unknown dataset name.
    #>
    param(
        [string]$Name = "TinyShapes",
        [int]$SamplesPerClass = 10,
        [int]$Seed = 42
    )

    switch ($Name) {
        "TinyShapes" {
            Write-Host "📊 Dataset: TinyShapes (8x8 grayscale)" -ForegroundColor Cyan
            Write-Host " Classes: 0=Horizontal, 1=Vertical, 2=Diagonal" -ForegroundColor Cyan

            # One clean 8x8 template per class; samples are noisy copies.
            $templates = @(
                # Class 0: Horizontal bar (row 4 is bright)
                @(0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                  1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0),
                # Class 1: Vertical bar (col 4 is bright)
                @(0,0,0,0,1,0,0,0, 0,0,0,0,1,0,0,0, 0,0,0,0,1,0,0,0, 0,0,0,0,1,0,0,0,
                  0,0,0,0,1,0,0,0, 0,0,0,0,1,0,0,0, 0,0,0,0,1,0,0,0, 0,0,0,0,1,0,0,0),
                # Class 2: Diagonal
                @(1,0,0,0,0,0,0,0, 0,1,0,0,0,0,0,0, 0,0,1,0,0,0,0,0, 0,0,0,1,0,0,0,0,
                  0,0,0,0,1,0,0,0, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,1)
            )

            # Seeded RNG makes the dataset reproducible between sessions.
            $rng = [System.Random]::new($Seed)
            # Generic lists avoid the O(n^2) cost of PowerShell's array `+=`.
            $tensors = [System.Collections.Generic.List[object]]::new()
            $labels  = [System.Collections.Generic.List[int]]::new()

            foreach ($class in @(0,1,2)) {
                $tpl = $templates[$class]
                for ($s = 0; $s -lt $SamplesPerClass; $s++) {
                    $t = New-Tensor -H 8 -W 8 -C 1
                    for ($i = 0; $i -lt 64; $i++) {
                        # Uniform noise in [-0.1, 0.1], pixel clamped to [0,1].
                        $noise      = ($rng.NextDouble() - 0.5) * 0.2
                        $t.Data[$i] = [Math]::Max(0, [Math]::Min(1, $tpl[$i] + $noise))
                    }
                    $tensors.Add($t)
                    $labels.Add($class)
                }
            }

            return @{ Tensors=$tensors.ToArray(); Labels=$labels.ToArray(); NClasses=3;
                      ClassNames=@("Horizontal","Vertical","Diagonal") }
        }
        default {
            Write-Host "❌ Unknown dataset: $Name. Available: TinyShapes" -ForegroundColor Red
            return $null
        }
    }
}

# Visualise a tensor as ASCII art
function Show-TensorAscii {
    param([hashtable]$tensor, [string]$label="")
    $chars = " ░▒▓█"
    Write-Host (" {0}" -f $label) -ForegroundColor Green
    for ($h = 0; $h -lt $tensor.H; $h++) {
        $row = " "
        for ($w = 0; $w -lt $tensor.W; $w++) {
            $val  = Get-TensorValue $tensor $h $w 0
            $idx  = [int]($val * 4)
            $idx  = [Math]::Max(0, [Math]::Min(4, $idx))
            $row += $chars[$idx]
            $row += $chars[$idx]  # double width for readability
        }
        Write-Host $row -ForegroundColor Cyan
    }
    Write-Host ""
}

# ============================================================
# TEST
# 1. Run VBAF.LoadAll.ps1
#
# --- Build and summarise a CNN ---
# 2. $model = [CNNModel]::new("ShapeClassifier", 8, 8, 1)
# $model.Add([Conv2D]::new(4, 3)) # 4 filters, 3x3 kernel
# $model.Add([MaxPooling2D]::new(2)) # 2x2 pooling
# $model.Add([Dropout]::new(0.25))
# $model.Add([Flatten]::new())
# $model.Add([DenseLayer]::new(8, "relu"))
# $model.Add([DenseLayer]::new(3, "softmax"))
# $model.PrintSummary()
#
# --- Load dataset ---
# 3. $data = Get-VBAFImageDataset -Name "TinyShapes"
# Show-TensorAscii $data.Tensors[0] "Class: Horizontal"
# Show-TensorAscii $data.Tensors[10] "Class: Vertical"
# Show-TensorAscii $data.Tensors[20] "Class: Diagonal"
#
# --- Train ---
# 4. $model.LearningRate = 0.01
# $model.Fit($data.Tensors, $data.Labels, $data.NClasses, 20, 5)
# $model.PlotTraining()
#
# --- Augmentation ---
# 5. $aug = Invoke-Augment -tensor $data.Tensors[0] -flip $true -noise $true
# Show-TensorAscii $aug "Augmented Horizontal"
#
# --- Save and load weights ---
# 6. Save-CNNWeights -model $model -Path "C:\Temp\cnn_weights.json"
# Load-CNNWeights -model $model -Path "C:\Temp\cnn_weights.json"
# ============================================================
# Load banner: list the classes and functions this file provides,
# plus a copy-paste quick-start snippet. Table-driven so new entries
# only need one line each.
$vbafCnnBanner = @(
    @('📦 VBAF.ML.CNN.ps1 loaded [v2.0.0 🚀]',                 'Green'),
    @(' Classes : Conv2D, MaxPooling2D, AveragePooling2D',     'Cyan'),
    @(' BatchNormalization, Dropout',                          'Cyan'),
    @(' Flatten, DenseLayer, CNNModel',                        'Cyan'),
    @(' Functions : Invoke-HorizontalFlip',                    'Cyan'),
    @(' Invoke-AddNoise',                                      'Cyan'),
    @(' Invoke-RandomCrop',                                    'Cyan'),
    @(' Invoke-Augment',                                       'Cyan'),
    @(' Save-CNNWeights / Load-CNNWeights',                    'Cyan'),
    @(' Get-VBAFImageDataset',                                 'Cyan'),
    @(' Show-TensorAscii',                                     'Cyan'),
    @('',                                                      'Cyan'),
    @(' Quick start:',                                         'Yellow'),
    @(' $model = [CNNModel]::new("ShapeClassifier", 8, 8, 1)', 'White'),
    @(' $model.Add([Conv2D]::new(4, 3))',                      'White'),
    @(' $model.Add([MaxPooling2D]::new(2))',                   'White'),
    @(' $model.Add([Flatten]::new())',                         'White'),
    @(' $model.Add([DenseLayer]::new(3, "softmax"))',          'White'),
    @(' $model.PrintSummary()',                                'White'),
    @('',                                                      'White')
)
foreach ($entry in $vbafCnnBanner) {
    Write-Host $entry[0] -ForegroundColor $entry[1]
}