# VBAF.ML.DataPipeline.ps1
#Requires -Version 5.1
<#
.SYNOPSIS
    Data Pipeline - Comprehensive Data Preprocessing Utilities
.DESCRIPTION
    Implements data preprocessing from scratch. Designed as a TEACHING
    resource - every step explained.

    Features included:
    - Missing value handling : mean, median, mode, constant imputation
    - Outlier detection      : IQR method, Z-score method
    - Normalization          : StandardScaler, MinMaxScaler, RobustScaler
    - Categorical encoding   : OneHot, Label, Target encoding
    - Train/Test splitting   : random and stratified
    - Stratified sampling    : preserves class distribution
    - Pipeline chaining      : apply steps in sequence

    Standalone - no external VBAF dependencies required.
.NOTES
    Part of VBAF - Phase 5 Data Pipeline Module
    PS 5.1 compatible
    Teaching project - every transformation explained!
#>

# Folder containing this script (kept for parity with other VBAF modules;
# NOTE(review): not referenced anywhere else in this file).
$basePath = $PSScriptRoot

# ============================================================
# TEACHING NOTE: Why Data Preprocessing?
# Real-world data is MESSY:
#   - Missing values   : sensors fail, people skip questions
#   - Outliers         : typos, rare events, measurement errors
#   - Different scales : age (0-100) vs salary (0-1000000)
#   - Text categories  : "red","green","blue" -> numbers
# ML algorithms expect clean, numeric, similarly-scaled data.
# Preprocessing is often 80% of the actual ML work!
# ============================================================

# ============================================================
# MISSING VALUE IMPUTATION
# ============================================================
# TEACHING NOTE: What to do with missing values?
#   Mean    : replace with average - good for symmetric data
#   Median  : replace with middle value - robust to outliers
#   Mode    : replace with most common - good for categories
#   Constant: replace with fixed value (e.g. 0, "Unknown")
# Never just DELETE rows - you lose information!
# ============================================================
class MissingValueImputer {
    [string]   $Strategy      # "mean", "median", "mode", "constant"
    [object]   $ConstValue    # fill value used when Strategy = "constant"
    [double[]] $FillValues    # per-feature fill values learned by Fit()
    [bool]     $IsFitted = $false

    MissingValueImputer() {
        $this.Strategy = "mean"
    }

    MissingValueImputer([string]$strategy) {
        $this.Strategy = $strategy
    }

    MissingValueImputer([string]$strategy, [object]$constValue) {
        $this.Strategy   = $strategy
        $this.ConstValue = $constValue
    }

    # A value counts as missing when it is $null, an empty string,
    # the text markers "NA" / "?", or a numeric NaN.
    hidden [bool] IsMissing([object]$val) {
        if ($null -eq $val) { return $true }
        if ("$val" -eq "" -or "$val" -eq "NA" -or "$val" -eq "?") { return $true }
        $d = 0.0
        if ([double]::TryParse("$val", [ref]$d)) {
            return [double]::IsNaN($d)
        }
        return $false
    }

    # Learn one fill value per feature column from the non-missing entries.
    # Columns that are entirely missing fall back to 0.
    [void] Fit([object[][]]$X) {
        $nFeatures = $X[0].Length
        $this.FillValues = @(0.0) * $nFeatures
        for ($f = 0; $f -lt $nFeatures; $f++) {
            $vals = @()
            foreach ($row in $X) {
                if (-not $this.IsMissing($row[$f])) { $vals += [double]$row[$f] }
            }
            if ($vals.Length -eq 0) {
                $this.FillValues[$f] = 0.0
                continue
            }
            switch ($this.Strategy) {
                "mean" {
                    $this.FillValues[$f] = ($vals | Measure-Object -Average).Average
                }
                "median" {
                    $sorted = $vals | Sort-Object
                    # BUGFIX: was [int]($sorted.Length / 2). PowerShell's [int]
                    # cast rounds half-to-even, so for odd lengths 3, 7, 11, ...
                    # it picked the element ABOVE the true median. Floor gives
                    # the correct middle index for every length.
                    $mid = [int][Math]::Floor($sorted.Length / 2)
                    $this.FillValues[$f] = if ($sorted.Length % 2 -eq 0) {
                        ($sorted[$mid - 1] + $sorted[$mid]) / 2.0
                    } else {
                        $sorted[$mid]
                    }
                }
                "mode" {
                    # Tally occurrences keyed by string form; keep the most frequent.
                    $counts = @{}
                    foreach ($v in $vals) {
                        $k = "$v"
                        if ($counts.ContainsKey($k)) { $counts[$k]++ } else { $counts[$k] = 1 }
                    }
                    $best = 0.0
                    $bestCount = -1
                    foreach ($kv in $counts.GetEnumerator()) {
                        if ($kv.Value -gt $bestCount) {
                            $bestCount = $kv.Value
                            $best      = [double]$kv.Key
                        }
                    }
                    $this.FillValues[$f] = $best
                }
                "constant" {
                    $this.FillValues[$f] = [double]$this.ConstValue
                }
            }
        }
        $this.IsFitted = $true
    }

    # Replace every missing entry with the fill value learned for its column.
    [double[][]] Transform([object[][]]$X) {
        $result = @()
        foreach ($row in $X) {
            $newRow = @(0.0) * $row.Length
            for ($f = 0; $f -lt $row.Length; $f++) {
                $newRow[$f] = if ($this.IsMissing($row[$f])) {
                    $this.FillValues[$f]
                } else {
                    [double]$row[$f]
                }
            }
            $result += ,$newRow
        }
        return $result
    }

    # Convenience: Fit then Transform on the same data.
    [double[][]] FitTransform([object[][]]$X) {
        $this.Fit($X)
        return $this.Transform($X)
    }

    # Pretty-print the learned per-feature fill values.
    [void] PrintSummary() {
        Write-Host ""
        Write-Host "╔══════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host "║ Missing Value Imputer ║" -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Strategy : {0,-24}║" -f $this.Strategy) -ForegroundColor Yellow
        for ($f = 0; $f -lt $this.FillValues.Length; $f++) {
            Write-Host ("║ f{0} fill : {1,-24}║" -f $f, [Math]::Round($this.FillValues[$f],4)) -ForegroundColor White
        }
        Write-Host "╚══════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }
}

# ============================================================
# OUTLIER DETECTION AND TREATMENT
# ============================================================
# TEACHING NOTE: What is an outlier?
# A value far from the rest - could be error or rare event.
#
# IQR Method (Interquartile Range):
#   Q1 = 25th percentile, Q3 = 75th percentile
#   IQR = Q3 - Q1
#   Outlier if: x < Q1 - 1.5*IQR OR x > Q3 + 1.5*IQR
#   This is ROBUST - not affected by the outliers themselves!
#
# Z-Score Method:
#   z = (x - mean) / std
#   Outlier if |z| > threshold (usually 3)
#   Assumes normal distribution!
#
# Treatment options:
#   Remove : delete the row entirely
#   Clip   : cap to the boundary value (Winsorizing)
#   Flag   : add a column marking outliers (keep data!)
# ============================================================
class OutlierDetector {
    [string]   $Method        # "iqr" or "zscore"
    [string]   $Treatment     # "remove", "clip", "flag"
    [double]   $Threshold     # IQR multiplier or Z-score cutoff
    [double[]] $LowerBounds   # learned per-feature lower limits
    [double[]] $UpperBounds   # learned per-feature upper limits
    [bool]     $IsFitted = $false

    OutlierDetector() {
        $this.Method    = "iqr"
        $this.Treatment = "clip"
        $this.Threshold = 1.5
    }

    OutlierDetector([string]$method, [string]$treatment, [double]$threshold) {
        $this.Method    = $method
        $this.Treatment = $treatment
        $this.Threshold = $threshold
    }

    # Linear-interpolation percentile on an already-sorted array.
    hidden [double] Percentile([double[]]$sorted, [double]$p) {
        $pos   = $p / 100.0 * ($sorted.Length - 1)
        $below = [int][Math]::Floor($pos)
        $above = [int][Math]::Ceiling($pos)
        if ($below -eq $above) { return $sorted[$below] }
        return $sorted[$below] + ($pos - $below) * ($sorted[$above] - $sorted[$below])
    }

    # Learn per-feature [lower, upper] acceptance bounds from the data.
    [void] Fit([double[][]]$X) {
        $nFeatures = $X[0].Length
        $this.LowerBounds = @(0.0) * $nFeatures
        $this.UpperBounds = @(0.0) * $nFeatures
        for ($f = 0; $f -lt $nFeatures; $f++) {
            $column = $X | ForEach-Object { $_[$f] }
            $sorted = $column | Sort-Object
            if ($this.Method -eq "iqr") {
                # Quartile-based bounds are robust: the extremes themselves
                # do not shift Q1/Q3.
                $q1  = $this.Percentile($sorted, 25)
                $q3  = $this.Percentile($sorted, 75)
                $iqr = $q3 - $q1
                $this.LowerBounds[$f] = $q1 - $this.Threshold * $iqr
                $this.UpperBounds[$f] = $q3 + $this.Threshold * $iqr
            }
            else {
                # Z-score bounds: mean +/- threshold * population std.
                $mean  = ($column | Measure-Object -Average).Average
                $sumSq = 0.0
                foreach ($v in $column) { $sumSq += ($v - $mean) * ($v - $mean) }
                $std = [Math]::Sqrt($sumSq / $column.Count)
                $std = [Math]::Max($std, 1e-8)   # guard against zero variance
                $this.LowerBounds[$f] = $mean - $this.Threshold * $std
                $this.UpperBounds[$f] = $mean + $this.Threshold * $std
            }
        }
        $this.IsFitted = $true
    }

    # Apply the learned bounds.
    # Returns: @{ Data = treated rows; OutlierMask = one bool per INPUT row }
    # "remove" drops rows containing an outlier, "clip" caps values to the
    # boundary (Winsorizing), "flag" leaves the data untouched (use the mask).
    [hashtable] Transform([double[][]]$X) {
        $kept = @()
        $mask = @()
        foreach ($row in $X) {
            $flagged = $false
            $treated = $row.Clone()
            for ($f = 0; $f -lt $row.Length; $f++) {
                $lo = $this.LowerBounds[$f]
                $hi = $this.UpperBounds[$f]
                if ($row[$f] -lt $lo -or $row[$f] -gt $hi) {
                    $flagged = $true
                    if ($this.Treatment -eq "clip") {
                        $treated[$f] = [Math]::Max($lo, [Math]::Min($hi, $row[$f]))
                    }
                }
            }
            $mask += $flagged
            if ($this.Treatment -ne "remove" -or -not $flagged) {
                $kept += ,$treated
            }
        }
        return @{ Data = $kept; OutlierMask = $mask }
    }

    # Pretty-print configuration and learned bounds.
    [void] PrintSummary() {
        Write-Host ""
        Write-Host "╔══════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host "║ Outlier Detector ║" -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Method : {0,-24}║" -f $this.Method) -ForegroundColor Yellow
        Write-Host ("║ Treatment : {0,-24}║" -f $this.Treatment) -ForegroundColor Yellow
        Write-Host ("║ Threshold : {0,-24}║" -f $this.Threshold) -ForegroundColor Yellow
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        for ($f = 0; $f -lt $this.LowerBounds.Length; $f++) {
            Write-Host ("║ f{0} bounds : [{1,8:F2}, {2,8:F2}] ║" -f $f, $this.LowerBounds[$f], $this.UpperBounds[$f]) -ForegroundColor White
        }
        Write-Host "╚══════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }
}

# ============================================================
# ROBUST SCALER
# ============================================================
# TEACHING NOTE: StandardScaler uses mean and std -
# both are sensitive to outliers!
# RobustScaler uses MEDIAN and IQR instead:
#   z = (x - median) / IQR
# Much more robust when data has outliers!
# ============================================================
class RobustScaler {
    [double[]] $Medians   # per-feature median learned by Fit()
    [double[]] $IQRs      # per-feature interquartile range (floored at 1e-8)
    [bool]     $IsFitted = $false

    RobustScaler() {}

    # Linear-interpolation percentile on an already-sorted array.
    hidden [double] Percentile([double[]]$sorted, [double]$p) {
        $pos     = $p / 100.0 * ($sorted.Length - 1)
        $lowIdx  = [int][Math]::Floor($pos)
        $highIdx = [int][Math]::Ceiling($pos)
        if ($lowIdx -eq $highIdx) { return $sorted[$lowIdx] }
        return $sorted[$lowIdx] + ($pos - $lowIdx) * ($sorted[$highIdx] - $sorted[$lowIdx])
    }

    # Learn the median and IQR of every feature column.
    [void] Fit([double[][]]$X) {
        $nFeatures = $X[0].Length
        $this.Medians = @(0.0) * $nFeatures
        $this.IQRs    = @(1.0) * $nFeatures
        for ($f = 0; $f -lt $nFeatures; $f++) {
            $sorted = ($X | ForEach-Object { $_[$f] }) | Sort-Object
            $q1 = $this.Percentile($sorted, 25)
            $q3 = $this.Percentile($sorted, 75)
            $this.Medians[$f] = $this.Percentile($sorted, 50)
            # Floor the IQR so a constant column never divides by zero.
            $this.IQRs[$f] = [Math]::Max($q3 - $q1, 1e-8)
        }
        $this.IsFitted = $true
    }

    # Scale each entry as (x - median) / IQR.
    [double[][]] Transform([double[][]]$X) {
        $out = @()
        foreach ($row in $X) {
            $z = @(0.0) * $row.Length
            for ($f = 0; $f -lt $row.Length; $f++) {
                $z[$f] = ($row[$f] - $this.Medians[$f]) / $this.IQRs[$f]
            }
            $out += ,$z
        }
        return $out
    }

    # Convenience: Fit then Transform on the same data.
    [double[][]] FitTransform([double[][]]$X) {
        $this.Fit($X)
        return $this.Transform($X)
    }
}

# ============================================================
# CATEGORICAL ENCODING
# ============================================================
# TEACHING NOTE: ML algorithms need NUMBERS, not text.
# Three ways to encode categories:
#
# Label Encoding: "red"=0, "green"=1, "blue"=2
#   PROBLEM: implies red < green < blue (false ordering!)
#   Use ONLY for ordinal categories (small, medium, large)
#
# One-Hot Encoding: "red" -> [1,0,0], "green" -> [0,1,0]
#   Each category gets its own binary column
#   No false ordering! But adds many columns.
#
# Target Encoding: replace category with mean of target
#   "red" -> mean(y) for all red rows
#   Powerful but can OVERFIT - use carefully!
# ============================================================
class LabelEncoder {
    [hashtable] $Mapping          # category -> integer code
    [hashtable] $InverseMapping   # integer code -> category
    [bool]      $IsFitted = $false

    LabelEncoder() {}

    # Assign consecutive integer codes to the sorted unique categories.
    [void] Fit([string[]]$categories) {
        $this.Mapping        = @{}
        $this.InverseMapping = @{}
        $code = 0
        foreach ($category in ($categories | Select-Object -Unique | Sort-Object)) {
            $this.Mapping[$category]    = $code
            $this.InverseMapping[$code] = $category
            $code++
        }
        $this.IsFitted = $true
    }

    # Map categories to integer codes; categories unseen at fit time become -1.
    [int[]] Transform([string[]]$categories) {
        $codes = @(0) * $categories.Length
        for ($i = 0; $i -lt $categories.Length; $i++) {
            $codes[$i] = if ($this.Mapping.ContainsKey($categories[$i])) {
                $this.Mapping[$categories[$i]]
            } else {
                -1   # unknown category
            }
        }
        return $codes
    }

    [int[]] FitTransform([string[]]$categories) {
        $this.Fit($categories)
        return $this.Transform($categories)
    }

    # Map codes back to category names; unknown codes become "unknown".
    [string[]] InverseTransform([int[]]$labels) {
        $names = @("") * $labels.Length
        for ($i = 0; $i -lt $labels.Length; $i++) {
            $names[$i] = if ($this.InverseMapping.ContainsKey($labels[$i])) {
                $this.InverseMapping[$labels[$i]]
            } else {
                "unknown"
            }
        }
        return $names
    }

    # Pretty-print the category -> code table.
    [void] PrintMapping() {
        Write-Host ""
        Write-Host "🏷️ Label Encoder Mapping:" -ForegroundColor Green
        foreach ($kv in $this.Mapping.GetEnumerator() | Sort-Object Value) {
            Write-Host (" '{0}' -> {1}" -f $kv.Key, $kv.Value) -ForegroundColor White
        }
        Write-Host ""
    }
}

class OneHotEncoder {
    [string[]] $Categories   # sorted unique categories learned by Fit()
    [bool]     $IsFitted = $false

    OneHotEncoder() {}

    [void] Fit([string[]]$categories) {
        $this.Categories = $categories | Select-Object -Unique | Sort-Object
        $this.IsFitted = $true
    }

    # One binary column per learned category; unseen values encode as all zeros.
    [double[][]] Transform([string[]]$categories) {
        $rows = @()
        foreach ($value in $categories) {
            $indicator = @(0.0) * $this.Categories.Length
            for ($c = 0; $c -lt $this.Categories.Length; $c++) {
                if ($this.Categories[$c] -eq $value) { $indicator[$c] = 1.0 }
            }
            $rows += ,$indicator
        }
        return $rows
    }

    [double[][]] FitTransform([string[]]$categories) {
        $this.Fit($categories)
        return $this.Transform($categories)
    }

    # Pretty-print the column layout and one vector per category.
    [void] PrintMapping() {
        Write-Host ""
        Write-Host "🔢 One-Hot Encoder Mapping:" -ForegroundColor Green
        Write-Host (" Columns: [{0}]" -f ($this.Categories -join ", ")) -ForegroundColor Cyan
        foreach ($cat in $this.Categories) {
            $vec = $this.Transform(@($cat))[0]
            Write-Host (" '{0}' -> [{1}]" -f $cat, ($vec -join ", ")) -ForegroundColor White
        }
        Write-Host ""
    }
}

class TargetEncoder {
    [hashtable] $Mapping      # category -> mean target value
    [double]    $GlobalMean   # fallback for categories unseen at fit time
    [bool]      $IsFitted = $false

    TargetEncoder() {}

    # Learn the mean of y per category, plus the overall mean as fallback.
    [void] Fit([string[]]$categories, [double[]]$y) {
        $this.Mapping    = @{}
        $this.GlobalMean = ($y | Measure-Object -Average).Average
        foreach ($category in ($categories | Select-Object -Unique)) {
            $targets = @()
            for ($i = 0; $i -lt $categories.Length; $i++) {
                if ($categories[$i] -eq $category) { $targets += $y[$i] }
            }
            $this.Mapping[$category] = ($targets | Measure-Object -Average).Average
        }
        $this.IsFitted = $true
    }

    # Replace each category with its learned target mean (global mean if unseen).
    [double[]] Transform([string[]]$categories) {
        $encoded = @(0.0) * $categories.Length
        for ($i = 0; $i -lt $categories.Length; $i++) {
            $encoded[$i] = if ($this.Mapping.ContainsKey($categories[$i])) {
                $this.Mapping[$categories[$i]]
            } else {
                $this.GlobalMean
            }
        }
        return $encoded
    }

    [double[]] FitTransform([string[]]$categories, [double[]]$y) {
        $this.Fit($categories, $y)
        return $this.Transform($categories)
    }

    # Pretty-print the category -> mean table, ordered by value.
    [void] PrintMapping() {
        Write-Host ""
        Write-Host "🎯 Target Encoder Mapping:" -ForegroundColor Green
        foreach ($kv in $this.Mapping.GetEnumerator() | Sort-Object Value) {
            Write-Host (" '{0}' -> {1:F4}" -f $kv.Key, $kv.Value) -ForegroundColor White
        }
        Write-Host ""
    }
}

# ============================================================
# TRAIN/TEST SPLIT
# ============================================================
# TEACHING NOTE: Why split data?
# We train on one part and TEST on another part we haven't seen.
# This gives an HONEST estimate of how well our model generalises.
# Typical split: 80% train, 20% test
#
# Stratified split: ensures SAME CLASS RATIO in both sets.
# e.g. if 30% spam in full data -> 30% spam in train AND test
# ============================================================
function Split-TrainTest {
    <#
    .SYNOPSIS
        Splits X/y into train and test sets (random or stratified).
    .PARAMETER X
        Feature matrix (array of rows).
    .PARAMETER y
        Target values, one per row of X.
    .PARAMETER TestSize
        Fraction of samples placed in the test set (default 0.2).
    .PARAMETER Stratify
        When $true, shuffles and splits WITHIN each class so the class
        ratio is preserved; every class contributes at least 1 test row.
    .PARAMETER Seed
        RNG seed, so the same call always produces the same split.
    .OUTPUTS
        Hashtable with keys XTrain, yTrain, XTest, yTest.
    #>
    param(
        [double[][]] $X,
        [object[]] $y,
        [double] $TestSize = 0.2,
        [bool] $Stratify = $false,
        [int] $Seed = 42
    )
    $rng = [System.Random]::new($Seed)
    $n = $X.Length
    if ($Stratify) {
        # Stratified: shuffle and split each class separately.
        $classes = $y | Select-Object -Unique
        $trainIdx = [System.Collections.ArrayList]::new()
        $testIdx = [System.Collections.ArrayList]::new()
        foreach ($c in $classes) {
            # Collect indices of this class (string compare handles mixed label types).
            $classIdx = @()
            for ($i = 0; $i -lt $n; $i++) {
                if ("$($y[$i])" -eq "$c") { $classIdx += $i }
            }
            $shuffled = $classIdx | Sort-Object { $rng.Next() }
            # Every class gets at least one test sample.
            $nTest = [Math]::Max(1, [int]([Math]::Round($classIdx.Length * $TestSize)))
            for ($i = 0; $i -lt $shuffled.Length; $i++) {
                if ($i -lt $nTest) { $testIdx.Add($shuffled[$i]) | Out-Null }
                else { $trainIdx.Add($shuffled[$i]) | Out-Null }
            }
        }
    }
    else {
        # Random split: shuffle all indices once, first chunk becomes the test set.
        $shuffled = 0..($n-1) | Sort-Object { $rng.Next() }
        $nTest = [int]([Math]::Round($n * $TestSize))
        $trainIdx = [System.Collections.ArrayList]::new()
        $testIdx = [System.Collections.ArrayList]::new()
        for ($i = 0; $i -lt $shuffled.Length; $i++) {
            if ($i -lt $nTest) { $testIdx.Add($shuffled[$i]) | Out-Null }
            else { $trainIdx.Add($shuffled[$i]) | Out-Null }
        }
    }
    # Materialize the four output arrays from the index lists.
    $XTrain = @(); $yTrain = @()
    $XTest = @(); $yTest = @()
    foreach ($i in $trainIdx) { $XTrain += ,$X[$i]; $yTrain += $y[$i] }
    foreach ($i in $testIdx) { $XTest += ,$X[$i]; $yTest += $y[$i] }
    Write-Host ""
    Write-Host "✂️ Train/Test Split" -ForegroundColor Green
    Write-Host (" Total : {0} samples" -f $n) -ForegroundColor Cyan
    Write-Host (" Train : {0} samples ({1:F0}%)" -f $XTrain.Length, (100*(1-$TestSize))) -ForegroundColor White
    Write-Host (" Test : {0} samples ({1:F0}%)" -f $XTest.Length, (100*$TestSize)) -ForegroundColor White
    Write-Host (" Stratify: {0}" -f $Stratify) -ForegroundColor White
    Write-Host ""
    return @{ XTrain=$XTrain; yTrain=$yTrain; XTest=$XTest; yTest=$yTest }
}

# ============================================================
# DATA SUMMARY UTILITY
# ============================================================
# TEACHING NOTE: Always LOOK at your data before modelling!
# Check: ranges, missing values, distributions.
# ============================================================
function Get-DataSummary {
    <#
    .SYNOPSIS
        Prints min/max/mean/std and NaN count for every feature column.
    .PARAMETER X
        Feature matrix (array of rows).
    .PARAMETER FeatureNames
        Optional display names; columns without one are labelled fN.
    #>
    param(
        [double[][]] $X,
        [string[]] $FeatureNames = @()
    )
    $n = $X.Length
    $nFeatures = $X[0].Length
    Write-Host ""
    Write-Host "📋 Data Summary" -ForegroundColor Green
    Write-Host (" Samples : {0}" -f $n) -ForegroundColor Cyan
    Write-Host (" Features : {0}" -f $nFeatures) -ForegroundColor Cyan
    Write-Host ""
    Write-Host (" {0,-12} {1,8} {2,8} {3,8} {4,8} {5,8}" -f "Feature","Min","Max","Mean","Std","Missing") -ForegroundColor Yellow
    Write-Host (" {0}" -f ("-" * 58)) -ForegroundColor DarkGray
    for ($f = 0; $f -lt $nFeatures; $f++) {
        $vals = $X | ForEach-Object { $_[$f] }
        $name = if ($f -lt $FeatureNames.Length) { $FeatureNames[$f] } else { "f$f" }
        $min = ($vals | Measure-Object -Minimum).Minimum
        $max = ($vals | Measure-Object -Maximum).Maximum
        $mean = ($vals | Measure-Object -Average).Average
        # Population standard deviation (divide by N, not N-1).
        $sumSq = 0.0
        foreach ($v in $vals) { $sumSq += ($v - $mean) * ($v - $mean) }
        $std = [Math]::Sqrt($sumSq / $vals.Count)
        # NaN entries count as missing; rows with any are highlighted.
        $missing = ($vals | Where-Object { [double]::IsNaN($_) }).Count
        $color = if ($missing -gt 0) { "Yellow" } else { "White" }
        Write-Host (" {0,-12} {1,8:F2} {2,8:F2} {3,8:F2} {4,8:F2} {5,8}" -f $name, $min, $max, $mean, $std, $missing) -ForegroundColor $color
    }
    Write-Host ""
}

# ============================================================
# BUILT-IN DEMO DATASET WITH ISSUES
# ============================================================
function Get-VBAFPipelineDataset {
    <#
    .SYNOPSIS
        Returns a small built-in demo dataset for pipeline exercises.
    .PARAMETER Name
        "MessyHousePrice" (default) or "IrisWithCategories".
        Unknown names print an error and return $null.
    #>
    # BUGFIX: default was the misspelled "MestyHousePrice", which matched no
    # switch case, so calling this function with no arguments always hit the
    # error branch and returned $null.
    param([string]$Name = "MessyHousePrice")
    switch ($Name) {
        "MessyHousePrice" {
            Write-Host "📊 Dataset: MessyHousePrice (messy version!)" -ForegroundColor Cyan
            Write-Host " Has: missing values, outliers, categories" -ForegroundColor Yellow
            # Features: size_sqm, bedrooms, age_years, condition (cat), price
            $rawData = @(
                @{size=50.0;  beds=1.0;   age=20.0; cond="good";      price=150.0},
                @{size=75.0;  beds=2.0;   age=15.0; cond="good";      price=220.0},
                @{size=$null; beds=3.0;   age=10.0; cond="excellent"; price=310.0},  # missing size
                @{size=120.0; beds=3.0;   age=5.0;  cond="excellent"; price=370.0},
                @{size=150.0; beds=4.0;   age=2.0;  cond="excellent"; price=450.0},
                @{size=60.0;  beds=2.0;   age=25.0; cond="fair";      price=175.0},
                @{size=80.0;  beds=$null; age=18.0; cond="good";      price=240.0},  # missing beds
                @{size=90.0;  beds=3.0;   age=12.0; cond="good";      price=270.0},
                @{size=999.0; beds=3.0;   age=8.0;  cond="good";      price=340.0},  # outlier size!
                @{size=130.0; beds=4.0;   age=3.0;  cond="excellent"; price=400.0},
                @{size=55.0;  beds=1.0;   age=22.0; cond="fair";      price=160.0},
                @{size=70.0;  beds=2.0;   age=16.0; cond="good";      price=210.0},
                @{size=95.0;  beds=3.0;   age=11.0; cond="good";      price=290.0},
                @{size=115.0; beds=3.0;   age=6.0;  cond="excellent"; price=355.0},
                @{size=140.0; beds=4.0;   age=1.0;  cond="excellent"; price=430.0},
                @{size=65.0;  beds=2.0;   age=19.0; cond="fair";      price=$null},  # missing price
                @{size=85.0;  beds=2.0;   age=14.0; cond="good";      price=255.0},
                @{size=105.0; beds=3.0;   age=9.0;  cond="good";      price=320.0},
                @{size=125.0; beds=4.0;   age=4.0;  cond="excellent"; price=385.0},
                @{size=160.0; beds=5.0;   age=1.0;  cond="excellent"; price=500.0}
            )
            $X = @(); $y = @(); $conditions = @()
            foreach ($row in $rawData) {
                $X += ,@($row.size, $row.beds, $row.age)
                $y += $row.price
                $conditions += $row.cond
            }
            return @{
                X=$X; y=$y; Conditions=$conditions
                Features=@("size_sqm","bedrooms","age_years")
            }
        }
        "IrisWithCategories" {
            Write-Host "📊 Dataset: IrisWithCategories" -ForegroundColor Cyan
            Write-Host " Has: numeric features + text labels" -ForegroundColor Yellow
            $X = @(
                @(5.1,3.5,1.4,0.2),@(4.9,3.0,1.4,0.2),@(4.7,3.2,1.3,0.2),
                @(5.0,3.6,1.4,0.2),@(5.4,3.9,1.7,0.4),@(4.6,3.4,1.4,0.3),
                @(7.0,3.2,4.7,1.4),@(6.4,3.2,4.5,1.5),@(6.9,3.1,4.9,1.5),
                @(5.5,2.3,4.0,1.3),@(6.5,2.8,4.6,1.5),@(5.7,2.8,4.5,1.3),
                @(6.3,3.3,6.0,2.5),@(5.8,2.7,5.1,1.9),@(7.1,3.0,5.9,2.1)
            )
            $labels = @("setosa","setosa","setosa","setosa","setosa","setosa",
                        "versicolor","versicolor","versicolor","versicolor","versicolor","versicolor",
                        "virginica","virginica","virginica")
            return @{ X=$X; Labels=$labels; Features=@("sepal_l","sepal_w","petal_l","petal_w") }
        }
        default {
            Write-Host "❌ Unknown dataset: $Name" -ForegroundColor Red
            Write-Host " Available: MessyHousePrice, IrisWithCategories" -ForegroundColor Yellow
            return $null
        }
    }
}

# ============================================================
# TEST
#   1. Run VBAF.LoadAll.ps1
#
#   --- Full pipeline on messy data ---
#   2. $data = Get-VBAFPipelineDataset -Name "MessyHousePrice"
#
#   3. # Step 1: Look at raw data
#      Get-DataSummary -X $data.X -FeatureNames $data.Features
#
#   4. # Step 2: Impute missing values
#      $imp = [MissingValueImputer]::new("median")
#      $Ximp = $imp.FitTransform($data.X)
#      $imp.PrintSummary()
#
#   5. # Step 3: Detect and clip outliers
#      $od = [OutlierDetector]::new("iqr", "clip", 1.5)
#      $od.Fit($Ximp)
#      $od.PrintSummary()
#      $result = $od.Transform($Ximp)
#      $Xclean = $result.Data
#      Write-Host "Outliers found: $(($result.OutlierMask | Where-Object {$_}).Count)"
#
#   6. # Step 4: Encode categories
#      $ohe = [OneHotEncoder]::new()
#      $Xcond = $ohe.FitTransform($data.Conditions)
#      $ohe.PrintMapping()
#
#   7. # Step 5: Scale features
#      $scaler = [RobustScaler]::new()
#      $Xscaled = $scaler.FitTransform($Xclean)
#      Get-DataSummary -X $Xscaled -FeatureNames $data.Features
#
#   8. # Step 6: Train/Test split (stratified not needed for regression)
#      $split = Split-TrainTest -X $Xscaled -y $data.y -TestSize 0.2
#
#   --- Label and Target encoding ---
#   9. $le = [LabelEncoder]::new()
#      $enc = $le.FitTransform($data.Conditions)
#      $le.PrintMapping()
#
#  10. $te = [TargetEncoder]::new()
#      $tenc = $te.FitTransform($data.Conditions, [double[]]$data.y)
#      $te.PrintMapping()
# ============================================================

Write-Host "📦 VBAF.ML.DataPipeline.ps1 loaded" -ForegroundColor Green
Write-Host " Classes : MissingValueImputer" -ForegroundColor Cyan
Write-Host " OutlierDetector" -ForegroundColor Cyan
Write-Host " RobustScaler" -ForegroundColor Cyan
Write-Host " LabelEncoder" -ForegroundColor Cyan
Write-Host " OneHotEncoder" -ForegroundColor Cyan
Write-Host " TargetEncoder" -ForegroundColor Cyan
Write-Host " Functions : Split-TrainTest" -ForegroundColor Cyan
Write-Host " Get-DataSummary" -ForegroundColor Cyan
Write-Host " Get-VBAFPipelineDataset" -ForegroundColor Cyan
Write-Host ""
Write-Host " Quick start:" -ForegroundColor Yellow
Write-Host ' $data = Get-VBAFPipelineDataset -Name "MessyHousePrice"' -ForegroundColor White
Write-Host ' $imp = [MissingValueImputer]::new("median")' -ForegroundColor White
Write-Host ' $Ximp = $imp.FitTransform($data.X)' -ForegroundColor White
Write-Host ' $imp.PrintSummary()' -ForegroundColor White
Write-Host ""