# VBAF.ML.NaiveBayes.ps1
#Requires -Version 5.1
<#
.SYNOPSIS
    Naive Bayes Classification Algorithms
.DESCRIPTION
    Implements Naive Bayes variants from scratch. Designed as a TEACHING
    resource - every step explained.
    Algorithms included:
      - Gaussian Naive Bayes    : continuous features, assumes normal distribution
      - Multinomial Naive Bayes : count-based features (word counts, frequencies)
      - Bernoulli Naive Bayes   : binary features (word present/absent)
    Utilities included:
      - Text preprocessing : tokenize, stopwords, feature vectors
      - Built-in datasets  : Iris-style numeric + spam-style text
    Standalone - no external VBAF dependencies required.
.NOTES
    Part of VBAF - Phase 4 Machine Learning Module
    PS 5.1 compatible
    Teaching project - Bayes theorem shown step by step!
#>

# Folder this script lives in; kept for parity with other VBAF modules.
# NOTE(review): not referenced elsewhere in this file - presumably used by
# scripts that dot-source this one.
$basePath = $PSScriptRoot

# ============================================================
# TEACHING NOTE: What is Naive Bayes?
# Based on BAYES THEOREM:
#   P(class | features) = P(features | class) * P(class) / P(features)
#
# Translation:
#   "What is the probability this is class C, given these features?"
#   = "How likely are these features if it IS class C?"
#   * "How common is class C overall?"
#   / "How common are these features overall?"
#
# The "NAIVE" part: we assume all features are INDEPENDENT.
# This is rarely true in reality, but works surprisingly well!
#
# Three variants for different data types:
#   Gaussian    : features are continuous numbers (height, weight)
#   Multinomial : features are counts (word frequencies in text)
#   Bernoulli   : features are 0/1 (word present or absent)
# ============================================================

# ============================================================
# GAUSSIAN NAIVE BAYES
# ============================================================
# TEACHING NOTE: For continuous features, we assume each feature
# follows a Gaussian (normal) distribution within each class.
# We learn: mean and variance of each feature per class.
# Then for a new point:
#   P(feature_i | class) = Gaussian(feature_i; mean, variance)
#   Gaussian(x; mu, sigma^2) = (1/sqrt(2*pi*sigma^2)) * exp(-(x-mu)^2 / (2*sigma^2))
# ============================================================
class GaussianNaiveBayes {
    [hashtable] $ClassPriors    # P(class) for each class, keyed by class label as string
    [hashtable] $FeatureMeans   # per-class array: mean of each feature
    [hashtable] $FeatureVars    # per-class array: (population) variance of each feature
    [object[]]  $Classes        # unique class labels, sorted
    [bool]      $IsFitted = $false

    GaussianNaiveBayes() {}

    # Gaussian probability density function evaluated at x.
    # Variance is clamped to 1e-9 so a zero-variance feature (all samples of a
    # class share one value) cannot cause division by zero.
    hidden [double] GaussianPDF([double]$x, [double]$mean, [double]$variance) {
        $variance = [Math]::Max($variance, 1e-9) # avoid division by zero
        $exponent = -($x - $mean) * ($x - $mean) / (2.0 * $variance)
        $coeff = 1.0 / [Math]::Sqrt(2.0 * [Math]::PI * $variance)
        return $coeff * [Math]::Exp($exponent)
    }

    # Learn per-class priors plus per-feature mean and variance.
    # X: array of feature rows (double[]), y: parallel array of class labels.
    # Labels are compared by their string form, so 0 and "0" are the same class.
    [void] Fit([double[][]]$X, [object[]]$y) {
        $n = $X.Length
        $nFeatures = $X[0].Length
        # Get unique classes
        $this.Classes = $y | Select-Object -Unique | Sort-Object
        $this.ClassPriors = @{}
        $this.FeatureMeans = @{}
        $this.FeatureVars = @{}
        foreach ($c in $this.Classes) {
            $key = "$c"
            # Collect rows for this class
            $classRows = @()
            for ($i = 0; $i -lt $n; $i++) {
                if ("$($y[$i])" -eq $key) { $classRows += ,$X[$i] }
            }
            # P(class) = count / total (PowerShell '/' yields a double here)
            $this.ClassPriors[$key] = $classRows.Length / $n
            # Mean and variance per feature
            $means = @(0.0) * $nFeatures
            $vars = @(0.0) * $nFeatures
            for ($f = 0; $f -lt $nFeatures; $f++) {
                $vals = $classRows | ForEach-Object { $_[$f] }
                $mu = ($vals | Measure-Object -Average).Average
                $means[$f] = $mu
                $sumSq = 0.0
                foreach ($v in $vals) { $sumSq += ($v - $mu) * ($v - $mu) }
                # Population variance (divide by N, not N-1)
                $vars[$f] = $sumSq / $vals.Count
            }
            $this.FeatureMeans[$key] = $means
            $this.FeatureVars[$key] = $vars
        }
        $this.IsFitted = $true
    }

    # Per-class UNNORMALIZED log scores (log prior + sum of log likelihoods).
    # Log scale avoids underflow when many small PDFs are multiplied; the PDF
    # is floored at 1e-300 so Log never sees zero.
    # Returns hashtable: class key (string) -> log score.
    [hashtable] PredictProba([double[]]$x) {
        $logProbs = @{}
        foreach ($c in $this.Classes) {
            $key = "$c"
            $logProb = [Math]::Log($this.ClassPriors[$key])
            for ($f = 0; $f -lt $x.Length; $f++) {
                $pdf = $this.GaussianPDF($x[$f], $this.FeatureMeans[$key][$f], $this.FeatureVars[$key][$f])
                $logProb += [Math]::Log([Math]::Max($pdf, 1e-300))
            }
            $logProbs[$key] = $logProb
        }
        return $logProbs
    }

    # Predicted class for one sample: the argmax over log scores.
    # NOTE: returns the class label as a STRING key, not the original type.
    [object] PredictOne([double[]]$x) {
        $logProbs = $this.PredictProba($x)
        $best = $null
        $bestProb = [double]::MinValue
        foreach ($kv in $logProbs.GetEnumerator()) {
            if ($kv.Value -gt $bestProb) {
                $bestProb = $kv.Value
                $best = $kv.Key
            }
        }
        return $best
    }

    # Predict a class label for every row of X.
    [object[]] Predict([double[][]]$X) {
        $preds = @()
        foreach ($row in $X) { $preds += $this.PredictOne($row) }
        return $preds
    }

    # Pretty-print fitted priors and per-feature mean/variance to the console.
    [void] PrintSummary() {
        Write-Host ""
        Write-Host "╔══════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host "║ Gaussian Naive Bayes Summary ║" -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        foreach ($c in $this.Classes) {
            $key = "$c"
            $prior = [Math]::Round($this.ClassPriors[$key], 4)
            Write-Host ("║ Class {0} prior={1,-28}║" -f $key, $prior) -ForegroundColor White
            $means = $this.FeatureMeans[$key]
            $vars = $this.FeatureVars[$key]
            for ($f = 0; $f -lt $means.Length; $f++) {
                Write-Host ("║ f{0}: mean={1,-8} var={2,-16}║" -f $f, [Math]::Round($means[$f],3), [Math]::Round($vars[$f],3)) -ForegroundColor DarkGray
            }
        }
        Write-Host "╚══════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }
}

# ============================================================
# MULTINOMIAL NAIVE BAYES
# ============================================================
# TEACHING NOTE: For COUNT features (e.g. word frequencies).
# Instead of Gaussian, we use:
#   P(word_i | class) = (count of word_i in class + alpha) /
#                       (total words in class + alpha * vocab_size)
# The alpha is LAPLACE SMOOTHING - adds 1 to every count
# so we never get P=0 for unseen words!
# ============================================================
class MultinomialNaiveBayes {
    [hashtable] $ClassPriors      # P(class), keyed by class label as string
    [hashtable] $FeatureLogProbs  # per-class array: log P(feature | class)
    [object[]]  $Classes          # unique class labels, sorted
    [double]    $Alpha            # Laplace smoothing strength (1.0 = add-one)
    [bool]      $IsFitted = $false

    MultinomialNaiveBayes() { $this.Alpha = 1.0 }
    MultinomialNaiveBayes([double]$alpha) { $this.Alpha = $alpha }

    # Learn class priors and smoothed log P(feature | class) from count data.
    # X: rows of non-negative counts (double[]), y: parallel class labels.
    [void] Fit([double[][]]$X, [object[]]$y) {
        $n = $X.Length
        $nFeatures = $X[0].Length
        $this.Classes = $y | Select-Object -Unique | Sort-Object
        $this.ClassPriors = @{}
        $this.FeatureLogProbs = @{}
        $nTotal = $y.Length # store total count explicitly
        foreach ($c in $this.Classes) {
            $key = "$c"
            # Collect rows belonging to this class (string-compare on labels)
            $classRows = @()
            for ($i = 0; $i -lt $nTotal; $i++) {
                if ("$($y[$i])" -eq $key) { $classRows += ,$X[$i] }
            }
            $this.ClassPriors[$key] = $classRows.Length / $nTotal
            # Sum counts per feature across all class documents
            $featureCounts = @(0.0) * $nFeatures
            foreach ($row in $classRows) {
                for ($f = 0; $f -lt $nFeatures; $f++) { $featureCounts[$f] += $row[$f] }
            }
            # Total count + smoothing (denominator of the Laplace formula)
            $totalCount = ($featureCounts | Measure-Object -Sum).Sum
            $totalSmoothed = $totalCount + $this.Alpha * $nFeatures
            # Log probabilities with Laplace smoothing: alpha keeps every
            # probability strictly positive, so Log is always defined.
            $logProbs = @(0.0) * $nFeatures
            for ($f = 0; $f -lt $nFeatures; $f++) {
                $logProbs[$f] = [Math]::Log(($featureCounts[$f] + $this.Alpha) / $totalSmoothed)
            }
            $this.FeatureLogProbs[$key] = $logProbs
        }
        $this.IsFitted = $true
    }

    # Predicted class for one count vector: argmax of
    # log P(class) + sum_f count_f * log P(feature_f | class).
    # Zero counts are skipped - they contribute nothing to the sum.
    # Returns the winning class key as a STRING.
    [object] PredictOne([double[]]$x) {
        $best = $null
        $bestProb = [double]::MinValue
        foreach ($c in $this.Classes) {
            $key = "$c"
            $logProb = [Math]::Log($this.ClassPriors[$key])
            for ($f = 0; $f -lt $x.Length; $f++) {
                if ($x[$f] -gt 0) {
                    $logProb += $x[$f] * $this.FeatureLogProbs[$key][$f]
                }
            }
            if ($logProb -gt $bestProb) {
                $bestProb = $logProb
                $best = $key
            }
        }
        return $best
    }

    # Predict a class label for every row of X.
    [object[]] Predict([double[][]]$X) {
        $preds = @()
        foreach ($row in $X) { $preds += $this.PredictOne($row) }
        return $preds
    }

    # Pretty-print smoothing constant and fitted class priors.
    [void] PrintSummary() {
        Write-Host ""
        Write-Host "╔══════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host "║ Multinomial Naive Bayes Summary ║" -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Alpha (smoothing): {0,-18}║" -f $this.Alpha) -ForegroundColor Yellow
        foreach ($c in $this.Classes) {
            $key = "$c"
            $prior = [Math]::Round($this.ClassPriors[$key], 4)
            Write-Host ("║ Class {0} prior={1,-28}║" -f $key, $prior) -ForegroundColor White
        }
        Write-Host "╚══════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }
}

# ============================================================
# BERNOULLI NAIVE BAYES
# ============================================================
# TEACHING NOTE: For BINARY features (0 or 1).
# e.g. "does this email contain the word FREE? yes/no"
#   P(feature_i=1 | class) = (count of docs with feature_i + alpha) /
#                            (count of class docs + 2*alpha)
# Key difference from Multinomial:
#   - Bernoulli explicitly models ABSENCE of features too
#   - "word NOT present" also carries information!
# ============================================================
class BernoulliNaiveBayes {
    [hashtable] $ClassPriors   # P(class), keyed by class label as string
    [hashtable] $FeatureProbs  # per-class array: P(feature=1 | class)
    [object[]]  $Classes       # unique class labels, sorted
    [double]    $Alpha         # Laplace smoothing strength
    [bool]      $IsFitted = $false

    BernoulliNaiveBayes() { $this.Alpha = 1.0 }
    BernoulliNaiveBayes([double]$alpha) { $this.Alpha = $alpha }

    # Learn class priors and smoothed per-feature presence probabilities.
    # X: rows of binary features (any value > 0 counts as "present"),
    # y: parallel class labels (compared by string form).
    [void] Fit([double[][]]$X, [object[]]$y) {
        $n = $X.Length
        $nFeatures = $X[0].Length
        $this.Classes = $y | Select-Object -Unique | Sort-Object
        $this.ClassPriors = @{}
        $this.FeatureProbs = @{}
        $nTotal2 = $y.Length
        foreach ($c in $this.Classes) {
            $key = "$c"
            $classRows = @()
            for ($i = 0; $i -lt $nTotal2; $i++) {
                if ("$($y[$i])" -eq $key) { $classRows += ,$X[$i] }
            }
            $nc = $classRows.Length
            $this.ClassPriors[$key] = $nc / $nTotal2
            # Count docs where each feature = 1
            $featurePresent = @(0.0) * $nFeatures
            foreach ($row in $classRows) {
                for ($f = 0; $f -lt $nFeatures; $f++) {
                    if ($row[$f] -gt 0) { $featurePresent[$f]++ }
                }
            }
            # P(feature=1 | class) with Laplace smoothing; the 2*alpha in the
            # denominator accounts for the two outcomes (present / absent)
            $probs = @(0.0) * $nFeatures
            for ($f = 0; $f -lt $nFeatures; $f++) {
                $probs[$f] = ($featurePresent[$f] + $this.Alpha) / ($nc + 2.0 * $this.Alpha)
            }
            $this.FeatureProbs[$key] = $probs
        }
        $this.IsFitted = $true
    }

    # Predicted class for one binary vector: argmax of
    # log P(class) + sum_f [ x_f ? log p_f : log(1 - p_f) ].
    # p is clamped away from 0 and 1 so Log stays finite.
    # Returns the winning class key as a STRING.
    [object] PredictOne([double[]]$x) {
        $best = $null
        $bestProb = [double]::MinValue
        foreach ($c in $this.Classes) {
            $key = "$c"
            $logProb = [Math]::Log($this.ClassPriors[$key])
            for ($f = 0; $f -lt $x.Length; $f++) {
                $p = $this.FeatureProbs[$key][$f]
                $p = [Math]::Max(1e-10, [Math]::Min(1 - 1e-10, $p))
                if ($x[$f] -gt 0) {
                    $logProb += [Math]::Log($p)
                } else {
                    # Bernoulli explicitly penalizes absent features too!
                    $logProb += [Math]::Log(1.0 - $p)
                }
            }
            if ($logProb -gt $bestProb) {
                $bestProb = $logProb
                $best = $key
            }
        }
        return $best
    }

    # Predict a class label for every row of X.
    [object[]] Predict([double[][]]$X) {
        $preds = @()
        foreach ($row in $X) { $preds += $this.PredictOne($row) }
        return $preds
    }

    # Pretty-print smoothing constant, priors, and the first few
    # per-feature presence probabilities (capped at 5 per class).
    [void] PrintSummary() {
        Write-Host ""
        Write-Host "╔══════════════════════════════════════╗" -ForegroundColor Cyan
        Write-Host "║ Bernoulli Naive Bayes Summary ║" -ForegroundColor Cyan
        Write-Host "╠══════════════════════════════════════╣" -ForegroundColor Cyan
        Write-Host ("║ Alpha (smoothing): {0,-18}║" -f $this.Alpha) -ForegroundColor Yellow
        foreach ($c in $this.Classes) {
            $key = "$c"
            $prior = [Math]::Round($this.ClassPriors[$key], 4)
            Write-Host ("║ Class {0} prior={1,-28}║" -f $key, $prior) -ForegroundColor White
            $probs = $this.FeatureProbs[$key]
            for ($f = 0; $f -lt [Math]::Min($probs.Length, 5); $f++) {
                Write-Host ("║ f{0}: P(present)={1,-22}║" -f $f, [Math]::Round($probs[$f], 4)) -ForegroundColor DarkGray
            }
            if ($probs.Length -gt 5) {
                Write-Host ("║ ... ({0} features total){1,-12}║" -f $probs.Length, "") -ForegroundColor DarkGray
            }
        }
        Write-Host "╚══════════════════════════════════════╝" -ForegroundColor Cyan
        Write-Host ""
    }
}

# ============================================================
# TEXT PREPROCESSING UTILITIES
# ============================================================
# TEACHING NOTE: Before classifying text we need to:
#   1. Tokenize: split "Hello World" -> ["hello", "world"]
#   2. Remove stopwords: remove "the", "a", "is", etc.
#   3. Build vocabulary: list of all unique words
#   4. Vectorize: convert text to a numeric feature vector
# 4. Vectorize: convert text to feature vector
#      Multinomial: count how many times each word appears
#      Bernoulli  : 1 if word appears, 0 if not
# ============================================================

# Common English words carrying little class signal; removed before vectorizing.
$script:STOPWORDS = @("the","a","an","is","it","in","on","at","to","for",
                      "of","and","or","but","not","this","that","with","are","was")

# Lowercase, strip non-letters, split on whitespace, keep tokens of length > 1.
# -RemoveStopwords additionally filters against $script:STOPWORDS.
# Returns the surviving tokens (may be empty).
function ConvertTo-Tokens {
    param([string]$text, [switch]$RemoveStopwords)
    $words = $text.ToLower() -replace '[^a-z\s]','' -split '\s+' |
        Where-Object { $_.Length -gt 1 }
    if ($RemoveStopwords) {
        $words = $words | Where-Object { $script:STOPWORDS -notcontains $_ }
    }
    return $words
}

# Build a sorted vocabulary of unique (stopword-filtered) tokens across texts.
# FIX: membership is tracked in a HashSet (O(1) per token) instead of
# List.Contains, which made vocabulary construction O(n^2) in token count.
function New-Vocabulary {
    param([string[]]$texts)
    $seen = [System.Collections.Generic.HashSet[string]]::new()
    $vocab = [System.Collections.Generic.List[string]]::new()
    foreach ($text in $texts) {
        $tokens = ConvertTo-Tokens -text $text -RemoveStopwords
        foreach ($token in $tokens) {
            # HashSet.Add returns $true only the first time a token is seen
            if ($seen.Add($token)) { $vocab.Add($token) }
        }
    }
    return ($vocab | Sort-Object)
}

# Turn text into a count vector aligned with $vocabulary (Multinomial NB input).
# FIX: counts all tokens in a single pass into a hashtable (O(V+T)) instead of
# rescanning the full token list once per vocabulary word (O(V*T)).
function ConvertTo-CountVector {
    param([string]$text, [string[]]$vocabulary)
    $tokens = ConvertTo-Tokens -text $text -RemoveStopwords
    $counts = @{}
    foreach ($t in $tokens) {
        if ($counts.ContainsKey($t)) { $counts[$t]++ } else { $counts[$t] = 1 }
    }
    $vec = @(0.0) * $vocabulary.Length
    for ($i = 0; $i -lt $vocabulary.Length; $i++) {
        if ($counts.ContainsKey($vocabulary[$i])) {
            $vec[$i] = [double]$counts[$vocabulary[$i]]
        }
    }
    return $vec
}

# Turn text into a 0/1 presence vector aligned with $vocabulary (Bernoulli NB input).
function ConvertTo-BinaryVector {
    param([string]$text, [string[]]$vocabulary)
    $tokens = ConvertTo-Tokens -text $text -RemoveStopwords
    $vec = @(0.0) * $vocabulary.Length
    for ($i = 0; $i -lt $vocabulary.Length; $i++) {
        $vec[$i] = if ($tokens -contains $vocabulary[$i]) { 1.0 } else { 0.0 }
    }
    return $vec
}

# ============================================================
# BUILT-IN DATASETS
# ============================================================

# Return a small built-in teaching dataset by name.
#   Iris3Class -> @{ X; y; Task } numeric classification (30 samples, 3 classes)
#   SpamHam    -> @{ Texts; Labels; Task } text classification (16 messages)
# Unknown names print an error and return $null.
function Get-VBAFNBDataset {
    param([string]$Name = "Iris3Class")
    switch ($Name) {
        "Iris3Class" {
            # 3-class Iris subset - good for Gaussian NB
            Write-Host "📊 Dataset: Iris3Class (30 samples)" -ForegroundColor Cyan
            Write-Host " Features: [sepal_length, sepal_width, petal_length, petal_width]" -ForegroundColor Cyan
            Write-Host " Target : 0=Setosa, 1=Versicolor, 2=Virginica" -ForegroundColor Cyan
            $X = @(
                @(5.1,3.5,1.4,0.2),@(4.9,3.0,1.4,0.2),@(4.7,3.2,1.3,0.2),
                @(5.0,3.6,1.4,0.2),@(5.4,3.9,1.7,0.4),@(4.6,3.4,1.4,0.3),
                @(5.0,3.4,1.5,0.2),@(4.4,2.9,1.4,0.2),@(4.9,3.1,1.5,0.1),@(5.4,3.7,1.5,0.2),
                @(7.0,3.2,4.7,1.4),@(6.4,3.2,4.5,1.5),@(6.9,3.1,4.9,1.5),
                @(5.5,2.3,4.0,1.3),@(6.5,2.8,4.6,1.5),@(5.7,2.8,4.5,1.3),
                @(6.3,3.3,4.7,1.6),@(4.9,2.4,3.3,1.0),@(6.6,2.9,4.6,1.3),@(5.2,2.7,3.9,1.4),
                @(6.3,3.3,6.0,2.5),@(5.8,2.7,5.1,1.9),@(7.1,3.0,5.9,2.1),
                @(6.3,2.9,5.6,1.8),@(6.5,3.0,5.8,2.2),@(7.6,3.0,6.6,2.1),
                @(4.9,2.5,4.5,1.7),@(7.3,2.9,6.3,1.8),@(6.7,2.5,5.8,1.8),@(7.2,3.6,6.1,2.5)
            )
            $y = @(0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2)
            return @{ X=$X; y=[object[]]$y; Task="classification" }
        }
        "SpamHam" {
            # Simple spam/ham text classification
            Write-Host "📊 Dataset: SpamHam (16 messages)" -ForegroundColor Cyan
            Write-Host " Features: text messages" -ForegroundColor Cyan
            Write-Host " Target : spam / ham" -ForegroundColor Cyan
            $texts = @(
                "free money win prize claim now",
                "win cash prize free offer limited",
                "click here free gift money now",
                "congratulations you won free prize",
                "free discount offer buy now limited",
                "urgent claim your free money prize",
                "exclusive free offer win cash today",
                "limited time free money win now",
                "hey are you coming to lunch today",
                "meeting at three pm conference room",
                "please review the report by friday",
                "can we reschedule our meeting tomorrow",
                "project update attached please review",
                "see you at the office tomorrow morning",
                "quarterly results report is ready",
                "team lunch wednesday at noon confirm"
            )
            $labels = @("spam","spam","spam","spam","spam","spam","spam","spam",
                        "ham","ham","ham","ham","ham","ham","ham","ham")
            return @{ Texts=$texts; Labels=$labels; Task="text" }
        }
        default {
            Write-Host "❌ Unknown dataset: $Name" -ForegroundColor Red
            Write-Host " Available: Iris3Class, SpamHam" -ForegroundColor Yellow
            return $null
        }
    }
}
# ============================================================
# TEST
# 1. Run VBAF.LoadAll.ps1
#
# --- Gaussian NB on Iris ---
# 2. $data = Get-VBAFNBDataset -Name "Iris3Class"
#    $gnb = [GaussianNaiveBayes]::new()
#    $gnb.Fit($data.X, $data.y)
#    $gnb.PrintSummary()
#    $preds = $gnb.Predict($data.X)
#    # Count correct (compare $preds vs $data.y)
#
# --- Multinomial NB on spam text ---
# 3. $data2 = Get-VBAFNBDataset -Name "SpamHam"
#    $vocab = New-Vocabulary -texts $data2.Texts
#    $Xcount = $data2.Texts | ForEach-Object { ConvertTo-CountVector -text $_ -vocabulary $vocab }
#    $mnb = [MultinomialNaiveBayes]::new()
#    $mnb.Fit($Xcount, $data2.Labels)
#    $mnb.PrintSummary()
#    $preds2 = $mnb.Predict($Xcount)
#
# --- Bernoulli NB on spam text ---
# 4. $Xbin = $data2.Texts | ForEach-Object { ConvertTo-BinaryVector -text $_ -vocabulary $vocab }
#    $bnb = [BernoulliNaiveBayes]::new()
#    $bnb.Fit($Xbin, $data2.Labels)
#    $bnb.PrintSummary()
#    $preds3 = $bnb.Predict($Xbin)
#
# --- Classify new text ---
# 5. $newMsg = "win free money now prize"
#    $vec = ConvertTo-CountVector -text $newMsg -vocabulary $vocab
#    $mnb.PredictOne($vec)   # should predict "spam"
#    $newMsg2 = "meeting tomorrow at the office"
#    $vec2 = ConvertTo-CountVector -text $newMsg2 -vocabulary $vocab
#    $mnb.PredictOne($vec2)  # should predict "ham"
# ============================================================

# Loader banner: confirms the module parsed and lists what it provides.
# FIX: removed a stray trailing '|' after the final Write-Host - a pipe with
# no downstream command is a parse error in PowerShell.
Write-Host "📦 VBAF.ML.NaiveBayes.ps1 loaded" -ForegroundColor Green
Write-Host " Classes : GaussianNaiveBayes" -ForegroundColor Cyan
Write-Host " MultinomialNaiveBayes" -ForegroundColor Cyan
Write-Host " BernoulliNaiveBayes" -ForegroundColor Cyan
Write-Host " Functions : ConvertTo-Tokens" -ForegroundColor Cyan
Write-Host " New-Vocabulary" -ForegroundColor Cyan
Write-Host " ConvertTo-CountVector" -ForegroundColor Cyan
Write-Host " ConvertTo-BinaryVector" -ForegroundColor Cyan
Write-Host " Get-VBAFNBDataset" -ForegroundColor Cyan
Write-Host ""
Write-Host " Quick start:" -ForegroundColor Yellow
Write-Host ' $data = Get-VBAFNBDataset -Name "Iris3Class"' -ForegroundColor White
Write-Host ' $gnb = [GaussianNaiveBayes]::new()' -ForegroundColor White
Write-Host ' $gnb.Fit($data.X, $data.y)' -ForegroundColor White
Write-Host ' $gnb.PrintSummary()' -ForegroundColor White
Write-Host ""