# VBAF.ML.DataPipeline.ps1
#Requires -Version 5.1
<#
.SYNOPSIS
    Data Pipeline - Comprehensive Data Preprocessing Utilities
.DESCRIPTION
    Implements data preprocessing from scratch. Designed as a TEACHING
    resource - every step explained.

    Features included:
    - Missing value handling : mean, median, mode, constant imputation
    - Outlier detection      : IQR method, Z-score method
    - Normalization          : StandardScaler, MinMaxScaler, RobustScaler
    - Categorical encoding   : OneHot, Label, Target encoding
    - Train/Test splitting   : random and stratified
    - Stratified sampling    : preserves class distribution
    - Pipeline chaining      : apply steps in sequence

    Standalone - no external VBAF dependencies required.
.NOTES
    Part of VBAF - Phase 5 Data Pipeline Module
    PS 5.1 compatible
    Teaching project - every transformation explained!
#>

# Folder containing this script (kept for parity with other VBAF modules;
# NOTE(review): not referenced anywhere else in this file).
$basePath = $PSScriptRoot

# ============================================================
# TEACHING NOTE: Why Data Preprocessing?
# Real-world data is MESSY:
#   - Missing values   : sensors fail, people skip questions
#   - Outliers         : typos, rare events, measurement errors
#   - Different scales : age (0-100) vs salary (0-1000000)
#   - Text categories  : "red","green","blue" -> numbers
# ML algorithms expect clean, numeric, similarly-scaled data.
# Preprocessing is often 80% of the actual ML work!
# ============================================================

# ============================================================
# MISSING VALUE IMPUTATION
# ============================================================
# TEACHING NOTE: What to do with missing values?
#   Mean    : replace with average - good for symmetric data
#   Median  : replace with middle value - robust to outliers
#   Mode    : replace with most common - good for categories
#   Constant: replace with fixed value (e.g. 0, "Unknown")
# Never just DELETE rows - you lose information!
# ============================================================
class MissingValueImputer {
    [string]   $Strategy      # "mean", "median", "mode", "constant"
    [object]   $ConstValue    # fill value used when Strategy = "constant"
    [double[]] $FillValues    # per-feature fill values learned by Fit()
    [bool]     $IsFitted = $false

    MissingValueImputer() {
        $this.Strategy = "mean"
    }

    MissingValueImputer([string]$strategy) {
        $this.Strategy = $strategy
    }

    MissingValueImputer([string]$strategy, [object]$constValue) {
        $this.Strategy   = $strategy
        $this.ConstValue = $constValue
    }

    # A value counts as missing when it is $null, an empty string,
    # the text markers "NA" / "?", or a numeric NaN.
    hidden [bool] IsMissing([object]$val) {
        if ($null -eq $val) { return $true }
        if ("$val" -eq "" -or "$val" -eq "NA" -or "$val" -eq "?") { return $true }
        $d = 0.0
        if ([double]::TryParse("$val", [ref]$d)) {
            return [double]::IsNaN($d)
        }
        return $false
    }

    # Learn one fill value per feature column from the non-missing entries.
    # Columns that are entirely missing fall back to 0.
    [void] Fit([object[][]]$X) {
        $nFeatures = $X[0].Length
        $this.FillValues = @(0.0) * $nFeatures
        for ($f = 0; $f -lt $nFeatures; $f++) {
            $vals = @()
            foreach ($row in $X) {
                if (-not $this.IsMissing($row[$f])) { $vals += [double]$row[$f] }
            }
            if ($vals.Length -eq 0) {
                $this.FillValues[$f] = 0.0
                continue
            }
            switch ($this.Strategy) {
                "mean" {
                    $this.FillValues[$f] = ($vals | Measure-Object -Average).Average
                }
                "median" {
                    $sorted = $vals | Sort-Object
                    # BUGFIX: was [int]($sorted.Length / 2). PowerShell's [int]
                    # cast rounds half-to-even, so for odd lengths 3, 7, 11, ...
                    # it picked the element ABOVE the true median. Floor gives
                    # the correct middle index for every length.
                    $mid = [int][Math]::Floor($sorted.Length / 2)
                    $this.FillValues[$f] = if ($sorted.Length % 2 -eq 0) {
                        ($sorted[$mid - 1] + $sorted[$mid]) / 2.0
                    } else {
                        $sorted[$mid]
                    }
                }
                "mode" {
                    # Tally occurrences keyed by string form; keep the most frequent.
                    $counts = @{}
                    foreach ($v in $vals) {
                        $k = "$v"
                        if ($counts.ContainsKey($k)) { $counts[$k]++ } else { $counts[$k] = 1 }
                    }
                    $best = 0.0
                    $bestCount = -1
                    foreach ($kv in $counts.GetEnumerator()) {
                        if ($kv.Value -gt $bestCount) {
                            $bestCount = $kv.Value
                            $best      = [double]$kv.Key
                        }
                    }
                    $this.FillValues[$f] = $best
                }
                "constant" {
                    $this.FillValues[$f] = [double]$this.ConstValue
                }
            }
        }
        $this.IsFitted = $true
    }

    # Replace every missing entry with the fill value learned for its column.
    [double[][]] Transform([object[][]]$X) {
        $result = @()
        foreach ($row in $X) {
            $newRow = @(0.0) * $row.Length
            for ($f = 0; $f -lt $row.Length; $f++) {
                $newRow[$f] = if ($this.IsMissing($row[$f])) {
                    $this.FillValues[$f]
                } else {
                    [double]$row[$f]
                }
            }
            $result += ,$newRow
        }
        return $result
    }

    # Convenience: Fit then Transform on the same data.
    [double[][]] FitTransform([object[][]]$X) {
        $this.Fit($X)
        return $this.Transform($X)
    }

    # Pretty-print the learned per-feature fill values.
    [void] PrintSummary() {
        Write-Host ""
        Write-Host "╔══════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host "║ Missing Value Imputer ║" -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Strategy : {0,-24}║" -f $this.Strategy) -ForegroundColor Yellow
        for ($f = 0; $f -lt $this.FillValues.Length; $f++) {
            Write-Host ("║ f{0} fill : {1,-24}║" -f $f, [Math]::Round($this.FillValues[$f],4)) -ForegroundColor White
        }
        Write-Host "╚══════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }
}

# ============================================================
# OUTLIER DETECTION AND TREATMENT
# ============================================================
# TEACHING NOTE: What is an outlier?
# A value far from the rest - could be error or rare event.
#
# IQR Method (Interquartile Range):
#   Q1 = 25th percentile, Q3 = 75th percentile
#   IQR = Q3 - Q1
#   Outlier if: x < Q1 - 1.5*IQR OR x > Q3 + 1.5*IQR
#   This is ROBUST - not affected by the outliers themselves!
#
# Z-Score Method:
#   z = (x - mean) / std
#   Outlier if |z| > threshold (usually 3)
#   Assumes normal distribution!
#
# Treatment options:
#   Remove : delete the row entirely
#   Clip   : cap to the boundary value (Winsorizing)
#   Flag   : add a column marking outliers (keep data!)
# ============================================================
class OutlierDetector {
    [string]   $Method        # "iqr" or "zscore"
    [string]   $Treatment     # "remove", "clip", "flag"
    [double]   $Threshold     # IQR multiplier or Z-score cutoff
    [double[]] $LowerBounds   # learned per-feature lower limits
    [double[]] $UpperBounds   # learned per-feature upper limits
    [bool]     $IsFitted = $false

    OutlierDetector() {
        $this.Method    = "iqr"
        $this.Treatment = "clip"
        $this.Threshold = 1.5
    }

    OutlierDetector([string]$method, [string]$treatment, [double]$threshold) {
        $this.Method    = $method
        $this.Treatment = $treatment
        $this.Threshold = $threshold
    }

    # Linear-interpolation percentile on an already-sorted array.
    hidden [double] Percentile([double[]]$sorted, [double]$p) {
        $pos   = $p / 100.0 * ($sorted.Length - 1)
        $below = [int][Math]::Floor($pos)
        $above = [int][Math]::Ceiling($pos)
        if ($below -eq $above) { return $sorted[$below] }
        return $sorted[$below] + ($pos - $below) * ($sorted[$above] - $sorted[$below])
    }

    # Learn per-feature [lower, upper] acceptance bounds from the data.
    [void] Fit([double[][]]$X) {
        $nFeatures = $X[0].Length
        $this.LowerBounds = @(0.0) * $nFeatures
        $this.UpperBounds = @(0.0) * $nFeatures
        for ($f = 0; $f -lt $nFeatures; $f++) {
            $column = $X | ForEach-Object { $_[$f] }
            $sorted = $column | Sort-Object
            if ($this.Method -eq "iqr") {
                # Quartile-based bounds are robust: the extremes themselves
                # do not shift Q1/Q3.
                $q1  = $this.Percentile($sorted, 25)
                $q3  = $this.Percentile($sorted, 75)
                $iqr = $q3 - $q1
                $this.LowerBounds[$f] = $q1 - $this.Threshold * $iqr
                $this.UpperBounds[$f] = $q3 + $this.Threshold * $iqr
            }
            else {
                # Z-score bounds: mean +/- threshold * population std.
                $mean  = ($column | Measure-Object -Average).Average
                $sumSq = 0.0
                foreach ($v in $column) { $sumSq += ($v - $mean) * ($v - $mean) }
                $std = [Math]::Sqrt($sumSq / $column.Count)
                $std = [Math]::Max($std, 1e-8)   # guard against zero variance
                $this.LowerBounds[$f] = $mean - $this.Threshold * $std
                $this.UpperBounds[$f] = $mean + $this.Threshold * $std
            }
        }
        $this.IsFitted = $true
    }

    # Apply the learned bounds.
    # Returns: @{ Data = treated rows; OutlierMask = one bool per INPUT row }
    # "remove" drops rows containing an outlier, "clip" caps values to the
    # boundary (Winsorizing), "flag" leaves the data untouched (use the mask).
    [hashtable] Transform([double[][]]$X) {
        $kept = @()
        $mask = @()
        foreach ($row in $X) {
            $flagged = $false
            $treated = $row.Clone()
            for ($f = 0; $f -lt $row.Length; $f++) {
                $lo = $this.LowerBounds[$f]
                $hi = $this.UpperBounds[$f]
                if ($row[$f] -lt $lo -or $row[$f] -gt $hi) {
                    $flagged = $true
                    if ($this.Treatment -eq "clip") {
                        $treated[$f] = [Math]::Max($lo, [Math]::Min($hi, $row[$f]))
                    }
                }
            }
            $mask += $flagged
            if ($this.Treatment -ne "remove" -or -not $flagged) {
                $kept += ,$treated
            }
        }
        return @{ Data = $kept; OutlierMask = $mask }
    }

    # Pretty-print configuration and learned bounds.
    [void] PrintSummary() {
        Write-Host ""
        Write-Host "╔══════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host "║ Outlier Detector ║" -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Method : {0,-24}║" -f $this.Method) -ForegroundColor Yellow
        Write-Host ("║ Treatment : {0,-24}║" -f $this.Treatment) -ForegroundColor Yellow
        Write-Host ("║ Threshold : {0,-24}║" -f $this.Threshold) -ForegroundColor Yellow
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        for ($f = 0; $f -lt $this.LowerBounds.Length; $f++) {
            Write-Host ("║ f{0} bounds : [{1,8:F2}, {2,8:F2}] ║" -f $f, $this.LowerBounds[$f], $this.UpperBounds[$f]) -ForegroundColor White
        }
        Write-Host "╚══════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }
}

# ============================================================
# ROBUST SCALER
# ============================================================
# TEACHING NOTE: StandardScaler uses mean and std -
# both are sensitive to outliers!
# RobustScaler uses MEDIAN and IQR instead:
#   z = (x - median) / IQR
# Much more robust when data has outliers!
# ============================================================
class RobustScaler {
    [double[]] $Medians   # per-feature median learned by Fit()
    [double[]] $IQRs      # per-feature interquartile range (floored at 1e-8)
    [bool]     $IsFitted = $false

    RobustScaler() {}

    # Linear-interpolation percentile on an already-sorted array.
    hidden [double] Percentile([double[]]$sorted, [double]$p) {
        $pos     = $p / 100.0 * ($sorted.Length - 1)
        $lowIdx  = [int][Math]::Floor($pos)
        $highIdx = [int][Math]::Ceiling($pos)
        if ($lowIdx -eq $highIdx) { return $sorted[$lowIdx] }
        return $sorted[$lowIdx] + ($pos - $lowIdx) * ($sorted[$highIdx] - $sorted[$lowIdx])
    }

    # Learn the median and IQR of every feature column.
    [void] Fit([double[][]]$X) {
        $nFeatures = $X[0].Length
        $this.Medians = @(0.0) * $nFeatures
        $this.IQRs    = @(1.0) * $nFeatures
        for ($f = 0; $f -lt $nFeatures; $f++) {
            $sorted = ($X | ForEach-Object { $_[$f] }) | Sort-Object
            $q1 = $this.Percentile($sorted, 25)
            $q3 = $this.Percentile($sorted, 75)
            $this.Medians[$f] = $this.Percentile($sorted, 50)
            # Floor the IQR so a constant column never divides by zero.
            $this.IQRs[$f] = [Math]::Max($q3 - $q1, 1e-8)
        }
        $this.IsFitted = $true
    }

    # Scale each entry as (x - median) / IQR.
    [double[][]] Transform([double[][]]$X) {
        $out = @()
        foreach ($row in $X) {
            $z = @(0.0) * $row.Length
            for ($f = 0; $f -lt $row.Length; $f++) {
                $z[$f] = ($row[$f] - $this.Medians[$f]) / $this.IQRs[$f]
            }
            $out += ,$z
        }
        return $out
    }

    # Convenience: Fit then Transform on the same data.
    [double[][]] FitTransform([double[][]]$X) {
        $this.Fit($X)
        return $this.Transform($X)
    }
}

# ============================================================
# CATEGORICAL ENCODING
# ============================================================
# TEACHING NOTE: ML algorithms need NUMBERS, not text.
# Three ways to encode categories:
#
# Label Encoding: "red"=0, "green"=1, "blue"=2
#   PROBLEM: implies red < green < blue (false ordering!)
#   Use ONLY for ordinal categories (small, medium, large)
#
# One-Hot Encoding: "red" -> [1,0,0], "green" -> [0,1,0]
#   Each category gets its own binary column
#   No false ordering! But adds many columns.
#
# Target Encoding: replace category with mean of target
#   "red" -> mean(y) for all red rows
#   Powerful but can OVERFIT - use carefully!
# ============================================================
class LabelEncoder {
    [hashtable] $Mapping          # category -> integer code
    [hashtable] $InverseMapping   # integer code -> category
    [bool]      $IsFitted = $false

    LabelEncoder() {}

    # Assign consecutive integer codes to the sorted unique categories.
    [void] Fit([string[]]$categories) {
        $this.Mapping        = @{}
        $this.InverseMapping = @{}
        $code = 0
        foreach ($category in ($categories | Select-Object -Unique | Sort-Object)) {
            $this.Mapping[$category]    = $code
            $this.InverseMapping[$code] = $category
            $code++
        }
        $this.IsFitted = $true
    }

    # Map categories to integer codes; categories unseen at fit time become -1.
    [int[]] Transform([string[]]$categories) {
        $codes = @(0) * $categories.Length
        for ($i = 0; $i -lt $categories.Length; $i++) {
            $codes[$i] = if ($this.Mapping.ContainsKey($categories[$i])) {
                $this.Mapping[$categories[$i]]
            } else {
                -1   # unknown category
            }
        }
        return $codes
    }

    [int[]] FitTransform([string[]]$categories) {
        $this.Fit($categories)
        return $this.Transform($categories)
    }

    # Map codes back to category names; unknown codes become "unknown".
    [string[]] InverseTransform([int[]]$labels) {
        $names = @("") * $labels.Length
        for ($i = 0; $i -lt $labels.Length; $i++) {
            $names[$i] = if ($this.InverseMapping.ContainsKey($labels[$i])) {
                $this.InverseMapping[$labels[$i]]
            } else {
                "unknown"
            }
        }
        return $names
    }

    # Pretty-print the category -> code table.
    [void] PrintMapping() {
        Write-Host ""
        Write-Host "🏷️ Label Encoder Mapping:" -ForegroundColor Green
        foreach ($kv in $this.Mapping.GetEnumerator() | Sort-Object Value) {
            Write-Host (" '{0}' -> {1}" -f $kv.Key, $kv.Value) -ForegroundColor White
        }
        Write-Host ""
    }
}

class OneHotEncoder {
    [string[]] $Categories   # sorted unique categories learned by Fit()
    [bool]     $IsFitted = $false

    OneHotEncoder() {}

    [void] Fit([string[]]$categories) {
        $this.Categories = $categories | Select-Object -Unique | Sort-Object
        $this.IsFitted = $true
    }

    # One binary column per learned category; unseen values encode as all zeros.
    [double[][]] Transform([string[]]$categories) {
        $rows = @()
        foreach ($value in $categories) {
            $indicator = @(0.0) * $this.Categories.Length
            for ($c = 0; $c -lt $this.Categories.Length; $c++) {
                if ($this.Categories[$c] -eq $value) { $indicator[$c] = 1.0 }
            }
            $rows += ,$indicator
        }
        return $rows
    }

    [double[][]] FitTransform([string[]]$categories) {
        $this.Fit($categories)
        return $this.Transform($categories)
    }

    # Pretty-print the column layout and one vector per category.
    [void] PrintMapping() {
        Write-Host ""
        Write-Host "🔢 One-Hot Encoder Mapping:" -ForegroundColor Green
        Write-Host (" Columns: [{0}]" -f ($this.Categories -join ", ")) -ForegroundColor Cyan
        foreach ($cat in $this.Categories) {
            $vec = $this.Transform(@($cat))[0]
            Write-Host (" '{0}' -> [{1}]" -f $cat, ($vec -join ", ")) -ForegroundColor White
        }
        Write-Host ""
    }
}

class TargetEncoder {
    [hashtable] $Mapping      # category -> mean target value
    [double]    $GlobalMean   # fallback for categories unseen at fit time
    [bool]      $IsFitted = $false

    TargetEncoder() {}

    # Learn the mean of y per category, plus the overall mean as fallback.
    [void] Fit([string[]]$categories, [double[]]$y) {
        $this.Mapping    = @{}
        $this.GlobalMean = ($y | Measure-Object -Average).Average
        foreach ($category in ($categories | Select-Object -Unique)) {
            $targets = @()
            for ($i = 0; $i -lt $categories.Length; $i++) {
                if ($categories[$i] -eq $category) { $targets += $y[$i] }
            }
            $this.Mapping[$category] = ($targets | Measure-Object -Average).Average
        }
        $this.IsFitted = $true
    }

    # Replace each category with its learned target mean (global mean if unseen).
    [double[]] Transform([string[]]$categories) {
        $encoded = @(0.0) * $categories.Length
        for ($i = 0; $i -lt $categories.Length; $i++) {
            $encoded[$i] = if ($this.Mapping.ContainsKey($categories[$i])) {
                $this.Mapping[$categories[$i]]
            } else {
                $this.GlobalMean
            }
        }
        return $encoded
    }

    [double[]] FitTransform([string[]]$categories, [double[]]$y) {
        $this.Fit($categories, $y)
        return $this.Transform($categories)
    }

    # Pretty-print the category -> mean table, ordered by value.
    [void] PrintMapping() {
        Write-Host ""
        Write-Host "🎯 Target Encoder Mapping:" -ForegroundColor Green
        foreach ($kv in $this.Mapping.GetEnumerator() | Sort-Object Value) {
            Write-Host (" '{0}' -> {1:F4}" -f $kv.Key, $kv.Value) -ForegroundColor White
        }
        Write-Host ""
    }
}

# ============================================================
# TRAIN/TEST SPLIT
# ============================================================
# TEACHING NOTE: Why split data?
# We train on one part and TEST on another part we haven't seen.
# This gives an HONEST estimate of how well our model generalises.
# Typical split: 80% train, 20% test
#
# Stratified split: ensures SAME CLASS RATIO in both sets.
# e.g. if 30% spam in full data -> 30% spam in train AND test
# ============================================================
function Split-TrainTest {
    <#
    .SYNOPSIS
        Splits X/y into train and test sets (random or stratified).
    .PARAMETER X
        Feature matrix (array of rows).
    .PARAMETER y
        Target values, one per row of X.
    .PARAMETER TestSize
        Fraction of samples placed in the test set (default 0.2).
    .PARAMETER Stratify
        When $true, shuffles and splits WITHIN each class so the class
        ratio is preserved; every class contributes at least 1 test row.
    .PARAMETER Seed
        RNG seed, so the same call always produces the same split.
    .OUTPUTS
        Hashtable with keys XTrain, yTrain, XTest, yTest.
    #>
    param(
        [double[][]] $X,
        [object[]] $y,
        [double] $TestSize = 0.2,
        [bool] $Stratify = $false,
        [int] $Seed = 42
    )
    $rng = [System.Random]::new($Seed)
    $n = $X.Length
    if ($Stratify) {
        # Stratified: shuffle and split each class separately.
        $classes = $y | Select-Object -Unique
        $trainIdx = [System.Collections.ArrayList]::new()
        $testIdx = [System.Collections.ArrayList]::new()
        foreach ($c in $classes) {
            # Collect indices of this class (string compare handles mixed label types).
            $classIdx = @()
            for ($i = 0; $i -lt $n; $i++) {
                if ("$($y[$i])" -eq "$c") { $classIdx += $i }
            }
            $shuffled = $classIdx | Sort-Object { $rng.Next() }
            # Every class gets at least one test sample.
            $nTest = [Math]::Max(1, [int]([Math]::Round($classIdx.Length * $TestSize)))
            for ($i = 0; $i -lt $shuffled.Length; $i++) {
                if ($i -lt $nTest) { $testIdx.Add($shuffled[$i]) | Out-Null }
                else { $trainIdx.Add($shuffled[$i]) | Out-Null }
            }
        }
    }
    else {
        # Random split: shuffle all indices once, first chunk becomes the test set.
        $shuffled = 0..($n-1) | Sort-Object { $rng.Next() }
        $nTest = [int]([Math]::Round($n * $TestSize))
        $trainIdx = [System.Collections.ArrayList]::new()
        $testIdx = [System.Collections.ArrayList]::new()
        for ($i = 0; $i -lt $shuffled.Length; $i++) {
            if ($i -lt $nTest) { $testIdx.Add($shuffled[$i]) | Out-Null }
            else { $trainIdx.Add($shuffled[$i]) | Out-Null }
        }
    }
    # Materialize the four output arrays from the index lists.
    $XTrain = @(); $yTrain = @()
    $XTest = @(); $yTest = @()
    foreach ($i in $trainIdx) { $XTrain += ,$X[$i]; $yTrain += $y[$i] }
    foreach ($i in $testIdx) { $XTest += ,$X[$i]; $yTest += $y[$i] }
    Write-Host ""
    Write-Host "✂️ Train/Test Split" -ForegroundColor Green
    Write-Host (" Total : {0} samples" -f $n) -ForegroundColor Cyan
    Write-Host (" Train : {0} samples ({1:F0}%)" -f $XTrain.Length, (100*(1-$TestSize))) -ForegroundColor White
    Write-Host (" Test : {0} samples ({1:F0}%)" -f $XTest.Length, (100*$TestSize)) -ForegroundColor White
    Write-Host (" Stratify: {0}" -f $Stratify) -ForegroundColor White
    Write-Host ""
    return @{ XTrain=$XTrain; yTrain=$yTrain; XTest=$XTest; yTest=$yTest }
}

# ============================================================
# DATA SUMMARY UTILITY
# ============================================================
# TEACHING NOTE: Always LOOK at your data before modelling!
# Check: ranges, missing values, distributions.
# ============================================================
function Get-DataSummary {
    <#
    .SYNOPSIS
        Prints min/max/mean/std and NaN count for every feature column.
    .PARAMETER X
        Feature matrix (array of rows).
    .PARAMETER FeatureNames
        Optional display names; columns without one are labelled fN.
    #>
    param(
        [double[][]] $X,
        [string[]] $FeatureNames = @()
    )
    $n = $X.Length
    $nFeatures = $X[0].Length
    Write-Host ""
    Write-Host "📋 Data Summary" -ForegroundColor Green
    Write-Host (" Samples : {0}" -f $n) -ForegroundColor Cyan
    Write-Host (" Features : {0}" -f $nFeatures) -ForegroundColor Cyan
    Write-Host ""
    Write-Host (" {0,-12} {1,8} {2,8} {3,8} {4,8} {5,8}" -f "Feature","Min","Max","Mean","Std","Missing") -ForegroundColor Yellow
    Write-Host (" {0}" -f ("-" * 58)) -ForegroundColor DarkGray
    for ($f = 0; $f -lt $nFeatures; $f++) {
        $vals = $X | ForEach-Object { $_[$f] }
        $name = if ($f -lt $FeatureNames.Length) { $FeatureNames[$f] } else { "f$f" }
        $min = ($vals | Measure-Object -Minimum).Minimum
        $max = ($vals | Measure-Object -Maximum).Maximum
        $mean = ($vals | Measure-Object -Average).Average
        # Population standard deviation (divide by N, not N-1).
        $sumSq = 0.0
        foreach ($v in $vals) { $sumSq += ($v - $mean) * ($v - $mean) }
        $std = [Math]::Sqrt($sumSq / $vals.Count)
        # NaN entries count as missing; rows with any are highlighted.
        $missing = ($vals | Where-Object { [double]::IsNaN($_) }).Count
        $color = if ($missing -gt 0) { "Yellow" } else { "White" }
        Write-Host (" {0,-12} {1,8:F2} {2,8:F2} {3,8:F2} {4,8:F2} {5,8}" -f $name, $min, $max, $mean, $std, $missing) -ForegroundColor $color
    }
    Write-Host ""
}

# ============================================================
# BUILT-IN DEMO DATASET WITH ISSUES
# ============================================================
function Get-VBAFPipelineDataset {
    <#
    .SYNOPSIS
        Returns a small built-in demo dataset for pipeline exercises.
    .PARAMETER Name
        "MessyHousePrice" (default) or "IrisWithCategories".
        Unknown names print an error and return $null.
    #>
    # BUGFIX: default was the misspelled "MestyHousePrice", which matched no
    # switch case, so calling this function with no arguments always hit the
    # error branch and returned $null.
    param([string]$Name = "MessyHousePrice")
    switch ($Name) {
        "MessyHousePrice" {
            Write-Host "📊 Dataset: MessyHousePrice (messy version!)" -ForegroundColor Cyan
            Write-Host " Has: missing values, outliers, categories" -ForegroundColor Yellow
            # Features: size_sqm, bedrooms, age_years, condition (cat), price
            $rawData = @(
                @{size=50.0;  beds=1.0;   age=20.0; cond="good";      price=150.0},
                @{size=75.0;  beds=2.0;   age=15.0; cond="good";      price=220.0},
                @{size=$null; beds=3.0;   age=10.0; cond="excellent"; price=310.0},  # missing size
                @{size=120.0; beds=3.0;   age=5.0;  cond="excellent"; price=370.0},
                @{size=150.0; beds=4.0;   age=2.0;  cond="excellent"; price=450.0},
                @{size=60.0;  beds=2.0;   age=25.0; cond="fair";      price=175.0},
                @{size=80.0;  beds=$null; age=18.0; cond="good";      price=240.0},  # missing beds
                @{size=90.0;  beds=3.0;   age=12.0; cond="good";      price=270.0},
                @{size=999.0; beds=3.0;   age=8.0;  cond="good";      price=340.0},  # outlier size!
                @{size=130.0; beds=4.0;   age=3.0;  cond="excellent"; price=400.0},
                @{size=55.0;  beds=1.0;   age=22.0; cond="fair";      price=160.0},
                @{size=70.0;  beds=2.0;   age=16.0; cond="good";      price=210.0},
                @{size=95.0;  beds=3.0;   age=11.0; cond="good";      price=290.0},
                @{size=115.0; beds=3.0;   age=6.0;  cond="excellent"; price=355.0},
                @{size=140.0; beds=4.0;   age=1.0;  cond="excellent"; price=430.0},
                @{size=65.0;  beds=2.0;   age=19.0; cond="fair";      price=$null},  # missing price
                @{size=85.0;  beds=2.0;   age=14.0; cond="good";      price=255.0},
                @{size=105.0; beds=3.0;   age=9.0;  cond="good";      price=320.0},
                @{size=125.0; beds=4.0;   age=4.0;  cond="excellent"; price=385.0},
                @{size=160.0; beds=5.0;   age=1.0;  cond="excellent"; price=500.0}
            )
            $X = @(); $y = @(); $conditions = @()
            foreach ($row in $rawData) {
                $X += ,@($row.size, $row.beds, $row.age)
                $y += $row.price
                $conditions += $row.cond
            }
            return @{
                X=$X; y=$y; Conditions=$conditions
                Features=@("size_sqm","bedrooms","age_years")
            }
        }
        "IrisWithCategories" {
            Write-Host "📊 Dataset: IrisWithCategories" -ForegroundColor Cyan
            Write-Host " Has: numeric features + text labels" -ForegroundColor Yellow
            $X = @(
                @(5.1,3.5,1.4,0.2),@(4.9,3.0,1.4,0.2),@(4.7,3.2,1.3,0.2),
                @(5.0,3.6,1.4,0.2),@(5.4,3.9,1.7,0.4),@(4.6,3.4,1.4,0.3),
                @(7.0,3.2,4.7,1.4),@(6.4,3.2,4.5,1.5),@(6.9,3.1,4.9,1.5),
                @(5.5,2.3,4.0,1.3),@(6.5,2.8,4.6,1.5),@(5.7,2.8,4.5,1.3),
                @(6.3,3.3,6.0,2.5),@(5.8,2.7,5.1,1.9),@(7.1,3.0,5.9,2.1)
            )
            $labels = @("setosa","setosa","setosa","setosa","setosa","setosa",
                        "versicolor","versicolor","versicolor","versicolor","versicolor","versicolor",
                        "virginica","virginica","virginica")
            return @{ X=$X; Labels=$labels; Features=@("sepal_l","sepal_w","petal_l","petal_w") }
        }
        default {
            Write-Host "❌ Unknown dataset: $Name" -ForegroundColor Red
            Write-Host " Available: MessyHousePrice, IrisWithCategories" -ForegroundColor Yellow
            return $null
        }
    }
}

# ============================================================
# TEST
#   1. Run VBAF.LoadAll.ps1
#
#   --- Full pipeline on messy data ---
#   2. $data = Get-VBAFPipelineDataset -Name "MessyHousePrice"
#
#   3. # Step 1: Look at raw data
#      Get-DataSummary -X $data.X -FeatureNames $data.Features
#
#   4. # Step 2: Impute missing values
#      $imp = [MissingValueImputer]::new("median")
#      $Ximp = $imp.FitTransform($data.X)
#      $imp.PrintSummary()
#
#   5. # Step 3: Detect and clip outliers
#      $od = [OutlierDetector]::new("iqr", "clip", 1.5)
#      $od.Fit($Ximp)
#      $od.PrintSummary()
#      $result = $od.Transform($Ximp)
#      $Xclean = $result.Data
#      Write-Host "Outliers found: $(($result.OutlierMask | Where-Object {$_}).Count)"
#
#   6. # Step 4: Encode categories
#      $ohe = [OneHotEncoder]::new()
#      $Xcond = $ohe.FitTransform($data.Conditions)
#      $ohe.PrintMapping()
#
#   7. # Step 5: Scale features
#      $scaler = [RobustScaler]::new()
#      $Xscaled = $scaler.FitTransform($Xclean)
#      Get-DataSummary -X $Xscaled -FeatureNames $data.Features
#
#   8. # Step 6: Train/Test split (stratified not needed for regression)
#      $split = Split-TrainTest -X $Xscaled -y $data.y -TestSize 0.2
#
#   --- Label and Target encoding ---
#   9. $le = [LabelEncoder]::new()
#      $enc = $le.FitTransform($data.Conditions)
#      $le.PrintMapping()
#
#  10. $te = [TargetEncoder]::new()
#      $tenc = $te.FitTransform($data.Conditions, [double[]]$data.y)
#      $te.PrintMapping()
# ============================================================

Write-Host "📦 VBAF.ML.DataPipeline.ps1 loaded" -ForegroundColor Green
Write-Host " Classes : MissingValueImputer" -ForegroundColor Cyan
Write-Host " OutlierDetector" -ForegroundColor Cyan
Write-Host " RobustScaler" -ForegroundColor Cyan
Write-Host " LabelEncoder" -ForegroundColor Cyan
Write-Host " OneHotEncoder" -ForegroundColor Cyan
Write-Host " TargetEncoder" -ForegroundColor Cyan
Write-Host " Functions : Split-TrainTest" -ForegroundColor Cyan
Write-Host " Get-DataSummary" -ForegroundColor Cyan
Write-Host " Get-VBAFPipelineDataset" -ForegroundColor Cyan
Write-Host ""
Write-Host " Quick start:" -ForegroundColor Yellow
Write-Host ' $data = Get-VBAFPipelineDataset -Name "MessyHousePrice"' -ForegroundColor White
Write-Host ' $imp = [MissingValueImputer]::new("median")' -ForegroundColor White
Write-Host ' $Ximp = $imp.FitTransform($data.X)' -ForegroundColor White
Write-Host ' $imp.PrintSummary()' -ForegroundColor White
Write-Host ""