PSAISuiteBenchmarks/Public/Invoke-Benchmark.ps1
|
function Invoke-Benchmark {
    <#
    .SYNOPSIS
    Runs benchmark prompts against one or more models, scores the responses,
    and prints a per-category/per-model summary.

    .DESCRIPTION
    Dot-sources every *.ps1 file found directly under BenchmarksPath and treats
    whatever each file outputs as one or more benchmark definitions. Each
    definition is read for the members Prompt, ExpectedAnswer, ScoringType,
    Notes, Category and Id.

    For every benchmark, the prompt is sent to all models concurrently with
    ForEach-Object -Parallel (calling Invoke-ChatCompletion), and each response
    is scored by Invoke-BenchmarkScore, which is dot-sourced inside every
    parallel runspace from Invoke-BenchmarkScore.ps1 next to this script.

    After all benchmarks have run, the results are optionally exported to CSV,
    a summary table is written to the host, and a warning is emitted for every
    model whose RawScore is 0 on all 'InstructionFollowing' benchmarks.

    .PARAMETER Models
    One or more model identifiers passed to Invoke-ChatCompletion -Model.

    .PARAMETER Category
    Optional list of categories; only benchmarks whose Category is in this list
    are run. When omitted, all loaded benchmarks are run.

    .PARAMETER BenchmarksPath
    Directory containing the benchmark definition scripts (*.ps1).
    Defaults to "..\benchmarks" relative to this script's directory.

    .PARAMETER OutputPath
    Optional CSV file path. When supplied, the detailed results are written
    there with Export-Csv -NoTypeInformation.

    .OUTPUTS
    One PSCustomObject per (benchmark, model) pair with the properties Model,
    Category, BenchmarkId, Prompt, Response, RawScore, Passed, NeedsReview,
    ElapsedTime, ScoringType and Notes. Nothing is returned (only a warning is
    written) when the benchmarks path is missing or no benchmark matches.

    .NOTES
    Uses ForEach-Object -Parallel, which is only available in PowerShell 7+.
    #>
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string[]]$Models,

        [Parameter(Mandatory = $false)]
        [string[]]$Category,

        [Parameter(Mandatory = $false)]
        [string]$BenchmarksPath = "$PSScriptRoot\..\benchmarks",

        [Parameter(Mandatory = $false)]
        [string]$OutputPath
    )

    # 1. Load all .ps1 files from BenchmarksPath
    if (-not (Test-Path $BenchmarksPath)) {
        Write-Warning "Benchmarks path not found: $BenchmarksPath"
        return
    }

    $benchmarkFiles = Get-ChildItem -Path $BenchmarksPath -Filter "*.ps1" -File

    # NOTE(review): the files are dot-sourced, so they execute in this
    # function's scope and could overwrite local variables or parameters.
    # Confirm that benchmark files only emit data before relying on this.
    # @(...) guarantees an array even when zero or one definition is loaded.
    $allBenchmarks = @(
        foreach ($file in $benchmarkFiles) {
            $loaded = . $file.FullName
            if ($loaded) {
                $loaded
            }
        }
    )

    # 2. Filter by Category if provided
    if ($Category -and $Category.Count -gt 0) {
        # Without @(...), a single match would be unwrapped to a scalar; for a
        # hashtable benchmark, .Count would then report its key count instead
        # of 1 and skew the progress totals below.
        $allBenchmarks = @($allBenchmarks | Where-Object { $_.Category -in $Category })
    }

    if ($allBenchmarks.Count -eq 0) {
        Write-Warning "No benchmarks found to run."
        return
    }

    $scoreScriptPath = Join-Path -Path $PSScriptRoot -ChildPath 'Invoke-BenchmarkScore.ps1'

    # 3 & 4. Run models in parallel and score
    # A List avoids re-copying the whole result array on every append.
    $results = [System.Collections.Generic.List[object]]::new()
    $totalBenchmarks = $allBenchmarks.Count
    $completedBenchmarks = 0

    try {
        foreach ($benchmark in $allBenchmarks) {
            $percentComplete = [int](($completedBenchmarks / $totalBenchmarks) * 100)
            $status = "Benchmark [$($completedBenchmarks + 1)/$totalBenchmarks]: $($benchmark.Category)/$($benchmark.Id) across $($Models.Count) model(s)"
            Write-Progress -Id 1 -Activity 'Running PSAISuiteBenchmarks' -Status $status -PercentComplete $percentComplete

            # Copy the fields needed by the parallel runspaces into a plain
            # object that is handed over through $using:.
            $benchmarkData = [PSCustomObject]@{
                Prompt         = $benchmark.Prompt
                ExpectedAnswer = $benchmark.ExpectedAnswer
                ScoringType    = $benchmark.ScoringType
                Notes          = $benchmark.Notes
                Category       = $benchmark.Category
                Id             = $benchmark.Id
            }

            $benchmarkResults = $Models | ForEach-Object -Parallel {
                $model = $_
                $b = $using:benchmarkData

                # Each parallel runspace starts clean, so the scoring function
                # has to be loaded in every iteration.
                . $using:scoreScriptPath

                # Call Invoke-ChatCompletion
                $chatResult = Invoke-ChatCompletion -Model $model -Prompt $b.Prompt -Raw -IncludeElapsedTime

                # Call Invoke-BenchmarkScore
                $scoreResult = Invoke-BenchmarkScore -Response $chatResult.Response -ExpectedAnswer $b.ExpectedAnswer -ScoringType $b.ScoringType -Notes $b.Notes

                # Normalise the elapsed time to a TimeSpan. A missing or
                # unparsable value becomes $null instead of raising an error;
                # the summary below ignores non-TimeSpan values.
                $elapsed = $null
                $rawElapsed = $chatResult.ElapsedTime
                if ($rawElapsed -is [TimeSpan]) {
                    $elapsed = $rawElapsed
                }
                elseif ($null -ne $rawElapsed) {
                    $parsedElapsed = [TimeSpan]::Zero
                    if ([TimeSpan]::TryParse([string]$rawElapsed, [ref]$parsedElapsed)) {
                        $elapsed = $parsedElapsed
                    }
                }

                # 5. Output PSCustomObject
                [PSCustomObject]@{
                    Model       = $model
                    Category    = $b.Category
                    BenchmarkId = $b.Id
                    Prompt      = $b.Prompt
                    Response    = $chatResult.Response
                    RawScore    = $scoreResult.RawScore
                    Passed      = $scoreResult.Passed
                    NeedsReview = $scoreResult.NeedsReview
                    ElapsedTime = $elapsed
                    ScoringType = $scoreResult.ScoringType
                    Notes       = $scoreResult.Notes
                }
            }

            foreach ($row in $benchmarkResults) {
                if ($null -ne $row) {
                    $results.Add($row)
                }
            }

            $completedBenchmarks++
        }
    }
    finally {
        # Always dismiss the progress bar, even if a terminating error escapes.
        Write-Progress -Id 1 -Activity 'Running PSAISuiteBenchmarks' -Completed
    }

    # 6. Export to CSV if OutputPath provided
    if ($OutputPath) {
        $results | Export-Csv -Path $OutputPath -NoTypeInformation
    }

    # 7. Print summary table to console
    $summary = [System.Collections.Generic.List[object]]::new()
    $groupedByCategory = $results | Group-Object Category

    foreach ($catGroup in $groupedByCategory) {
        $groupedByModel = $catGroup.Group | Group-Object Model

        foreach ($modelGroup in $groupedByModel) {
            $rows = @($modelGroup.Group)

            # @(...) keeps every count correct when a filter yields zero or
            # one element.
            $total = $rows.Count
            $passed = @($rows | Where-Object Passed -eq $true).Count
            $needsReview = @($rows | Where-Object NeedsReview -eq $true).Count
            $failed = @($rows | Where-Object { $_.Passed -eq $false -and $_.NeedsReview -eq $false }).Count

            # Average only over rows that carry a real TimeSpan.
            $validTimes = @($rows | ForEach-Object { $_.ElapsedTime } | Where-Object { $_ -is [TimeSpan] })
            if ($validTimes.Count -gt 0) {
                $avgTicks = ($validTimes | Measure-Object -Property Ticks -Average).Average
                $avgTime = [TimeSpan]::FromTicks([long]$avgTicks)
            }
            else {
                $avgTime = [TimeSpan]::Zero
            }

            $summary.Add([PSCustomObject]@{
                    Category       = $catGroup.Name
                    Model          = $modelGroup.Name
                    TotalTests     = $total
                    Passed         = $passed
                    Failed         = $failed
                    NeedsReview    = $needsReview
                    AvgElapsedTime = $avgTime
                })
        }
    }

    $summary | Format-Table -AutoSize | Out-Host

    # Highlight models that failed all InstructionFollowing tests
    $ifGroups = $results | Where-Object Category -eq 'InstructionFollowing' | Group-Object Model
    foreach ($grp in $ifGroups) {
        $allZero = $true
        foreach ($r in $grp.Group) {
            if ($r.RawScore -ne 0) {
                $allZero = $false
                break
            }
        }

        if ($allZero) {
            Write-Warning "$($grp.Name) failed all instruction following tests - not safe for agent pipelines"
        }
    }

    # Return the results to the pipeline
    return $results
}