AITriad

0.7.4

Public/Test-AITJudgeModel.ps1

                                # Copyright (c) 2026 Jeffrey Snover. All rights reserved.

# Licensed under the MIT License. See LICENSE file in the project root.

<#
.SYNOPSIS
    Tests whether a different AI model improves debate turn validation by replaying
    the Stage-B judge prompt across multiple models and comparing verdicts.
.DESCRIPTION
    Runs a cross-model audit on completed debates. For each statement turn, the
    judge prompt is sent to every specified model and the verdicts are compared.
    Surfaces blind spots (turns one model flags but others pass), agreement rates,
    and per-model statistics.

    The audit itself is performed by lib/debate/judgeAudit.ts, launched through
    `npx tsx`. Progress written by the CLI to stderr is echoed to the host; the
    CLI's stdout is returned (parsed as JSON when possible).

    This is a one-off experiment tool — it does not modify any debate files.
.PARAMETER DebateCount
    Number of top-ranked debates to audit (1-50, default 3). Debates are ranked by
    their number of turn validations; only debates with at least 6 statement or
    opening turns are considered.
.PARAMETER All
    Audit every ranked debate that has at least one turn validation instead of
    only the top -DebateCount.
.PARAMETER DebatePath
    One or more explicit debate JSON files to audit. Skips automatic selection.
.PARAMETER Models
    Models to compare. Accepts an array (-Models haiku,gemini) or a single
    comma-separated string (-Models 'haiku,gemini'). Default: haiku, gemini.
.PARAMETER MaxTurnsPerDebate
    Value passed to the CLI as --max-turns (1-100, default 50).
.PARAMETER OutputPath
    File path passed to the CLI as --output. Relative paths are resolved against
    the current location. When given, the resulting file item is returned.
.OUTPUTS
    With -OutputPath: the item returned by Get-Item for the report file.
    Otherwise: the CLI's stdout parsed from JSON, or the raw text if parsing fails.
.EXAMPLE
    Test-AITJudgeModel -DebateCount 3
    # Audits the 3 debates with the most validated turns using haiku + gemini-3.1-flash-lite-preview
.EXAMPLE
    Test-AITJudgeModel -DebatePath ../ai-triad-data/debates/debate-4bc8ae8a-1459-4d33-b306-4bdb2308d423.json -Models haiku,sonnet,gemini
.EXAMPLE
    Test-AITJudgeModel -DebateCount 5 -Models haiku,gemini,groq -MaxTurnsPerDebate 10 -OutputPath ./judge-audit.json
.EXAMPLE
    Test-AITJudgeModel -All -Models haiku,gemini -MaxTurnsPerDebate 5
    # Quick sweep across all debates with turn validations
#>
function Test-AITJudgeModel {
    [CmdletBinding(DefaultParameterSetName = 'Auto')]
    param(
        [Parameter(ParameterSetName = 'Auto')]
        [ValidateRange(1, 50)]
        [int]$DebateCount = 3,

        [Parameter(ParameterSetName = 'Auto')]
        [switch]$All,

        [Parameter(Mandatory, ParameterSetName = 'Explicit')]
        [string[]]$DebatePath,

        # string[] so that the documented `-Models haiku,gemini` form binds as a list
        # instead of being coerced to the space-joined string "haiku gemini".
        [Parameter()]
        [string[]]$Models = @('haiku', 'gemini'),

        [Parameter()]
        [ValidateRange(1, 100)]
        [int]$MaxTurnsPerDebate = 50,

        [Parameter()]
        [string]$OutputPath
    )

    Set-StrictMode -Version Latest

    # ── Normalise model list ─────────────────────────────
    # Accept both 'a,b' and a,b (array); trim stray whitespace and drop empties.
    $ModelNames = @(
        $Models |
            ForEach-Object { $_ -split ',' } |
            ForEach-Object { $_.Trim() } |
            Where-Object { $_ }
    )
    if ($ModelNames.Count -eq 0) {
        throw "At least one model name is required."
    }
    $ModelList = $ModelNames -join ','

    # ── Verify prerequisites ─────────────────────────────
    $NpxCmd = Get-Command npx.cmd -ErrorAction SilentlyContinue
    if (-not $NpxCmd) { $NpxCmd = Get-Command npx -ErrorAction SilentlyContinue }
    if (-not $NpxCmd) {
        throw "npx is required. Install Node.js (v18+): https://nodejs.org"
    }

    $RepoRoot = Get-CodeRoot
    $CliPath = Join-Path $RepoRoot 'lib' 'debate' 'judgeAudit.ts'
    if (-not (Test-Path -LiteralPath $CliPath)) {
        throw "Judge audit CLI not found at: $CliPath"
    }

    # ── Resolve debate files ─────────────────────────────
    if ($PSCmdlet.ParameterSetName -eq 'Auto') {
        try {
            $DebatesDir = Get-DebatesDir
        } catch {
            throw "Cannot locate debates directory. Set AI_TRIAD_DATA_ROOT or check .aitriad.json."
        }

        # @() keeps .Count meaningful when zero or one file is returned.
        $AllDebateFiles = @(Get-ChildItem -Path $DebatesDir -Filter 'debate-*.json' -File)
        if ($AllDebateFiles.Count -eq 0) {
            throw "No debate files found in $DebatesDir"
        }

        # Rank by number of validated turns (debates with turn_validations are most interesting)
        $Ranked = @(
            $AllDebateFiles | ForEach-Object {
                $File = $_
                try {
                    $D = Get-Content -LiteralPath $File.FullName -Raw | ConvertFrom-Json -ErrorAction Stop
                    $Props = $D.PSObject.Properties

                    # Under strict mode, reading a missing property throws. Probe via
                    # PSObject.Properties so debates lacking optional fields are ranked
                    # with zero/empty values instead of being silently discarded.
                    $Transcript = if ($Props['transcript'] -and $null -ne $D.transcript) { @($D.transcript) } else { @() }
                    $StmtCount = @(
                        $Transcript | Where-Object {
                            $_ -and $_.PSObject.Properties['type'] -and ($_.type -in 'statement', 'opening')
                        }
                    ).Count

                    $TvCount = 0
                    if ($Props['turn_validations'] -and $D.turn_validations) {
                        $TvCount = @($D.turn_validations.PSObject.Properties).Count
                    }

                    $Title = if ($Props['title'] -and $null -ne $D.title) { [string]$D.title } else { '' }

                    [PSCustomObject]@{
                        Path       = $File.FullName
                        Title      = $Title.Substring(0, [Math]::Min(60, $Title.Length))
                        Statements = $StmtCount
                        Validated  = $TvCount
                    }
                } catch {
                    # Unreadable or malformed debate file: skip it, but leave a trace.
                    Write-Verbose "Skipping $($File.FullName): $_"
                }
            } |
                Where-Object { $_.Statements -ge 6 } |
                Sort-Object -Property Validated -Descending
        )

        if ($All) {
            $Selected = @($Ranked | Where-Object { $_.Validated -gt 0 })
        } else {
            $Selected = @($Ranked | Select-Object -First $DebateCount)
        }

        if ($Selected.Count -eq 0) {
            throw "No debates with enough statement turns found."
        }

        Write-Host "Selected $($Selected.Count) debate(s):" -ForegroundColor Cyan
        foreach ($Item in $Selected) {
            Write-Host "  $($Item.Statements) turns | $($Item.Validated) validated | $($Item.Title)" -ForegroundColor DarkCyan
        }
        Write-Host ""

        $DebatePaths = @($Selected | ForEach-Object { $_.Path })
    } else {
        $DebatePaths = @(
            foreach ($Candidate in $DebatePath) {
                $Resolved = Resolve-Path $Candidate -ErrorAction SilentlyContinue
                if (-not $Resolved) { throw "Debate file not found: $Candidate" }
                $Resolved.Path
            }
        )
    }

    # ── Build CLI arguments ──────────────────────────────
    $DebateArgs = ($DebatePaths | ForEach-Object { "--debate `"$_`"" }) -join ' '
    $FullArgs = "tsx `"$CliPath`" $DebateArgs --models $ModelList --max-turns $MaxTurnsPerDebate"

    $ResolvedOutput = $null
    if ($OutputPath) {
        # Resolves relative paths against the current location and leaves absolute
        # paths intact; the target does not need to exist yet.
        $ResolvedOutput = $PSCmdlet.GetUnresolvedProviderPathFromPSPath($OutputPath)
        $FullArgs += " --output `"$ResolvedOutput`""
    }

    Write-Host "Running judge audit: $($DebatePaths.Count) debate(s), models=$ModelList" -ForegroundColor Yellow
    Write-Host ""

    # ── Execute ──────────────────────────────────────────
    $Psi = [System.Diagnostics.ProcessStartInfo]::new()
    $Psi.FileName = $NpxCmd.Source
    $Psi.Arguments = $FullArgs
    $Psi.WorkingDirectory = $RepoRoot
    $Psi.RedirectStandardOutput = $true
    $Psi.RedirectStandardError = $true
    $Psi.UseShellExecute = $false
    $Psi.CreateNoWindow = $true

    $TimeoutMs = 1200000   # 20 minutes, covering the whole run
    $StdOut = $null
    $ExitCode = 0
    $Proc = $null
    try {
        try {
            $Proc = [System.Diagnostics.Process]::Start($Psi)
        } catch {
            throw "Failed to start judge audit process: $_"
        }

        $Clock = [System.Diagnostics.Stopwatch]::StartNew()

        # Drain stdout in the background. Reading stderr to EOF first while stdout
        # is left unread can deadlock once the child fills the stdout pipe buffer.
        $StdOutTask = $Proc.StandardOutput.ReadToEndAsync()

        # Stream progress from stderr, bounded by the overall deadline so the
        # timeout is enforced even while the child is still writing/alive.
        $TimedOut = $false
        while ($true) {
            $Remaining = $TimeoutMs - $Clock.ElapsedMilliseconds
            if ($Remaining -le 0) { $TimedOut = $true; break }

            $LineTask = $Proc.StandardError.ReadLineAsync()
            if (-not $LineTask.Wait([int]$Remaining)) { $TimedOut = $true; break }

            $Line = $LineTask.Result
            if ($null -eq $Line) { break }          # stderr closed
            if ($Line) { Write-Host $Line -ForegroundColor DarkGray }
        }

        if (-not $TimedOut) {
            $Remaining = [Math]::Max(0, $TimeoutMs - $Clock.ElapsedMilliseconds)
            $TimedOut = -not $Proc.WaitForExit([int]$Remaining)
        }

        if ($TimedOut) {
            # npx spawns node as a child; kill the whole tree. Best effort only.
            try { $Proc.Kill($true) } catch { }
            throw "Judge audit timed out after 20 minutes."
        }

        $StdOut = $StdOutTask.GetAwaiter().GetResult()
        $ExitCode = $Proc.ExitCode
    } finally {
        if ($Proc) { $Proc.Dispose() }
    }

    if ($ExitCode -ne 0) {
        if (-not $StdOut) {
            throw "Judge audit failed with exit code $ExitCode."
        }
        # Output was produced, so it is still returned — but do not hide the failure.
        Write-Warning "Judge audit exited with code $ExitCode; returning the output it produced."
    }

    # ── Return result ────────────────────────────────────
    if ($OutputPath) {
        Write-Host "`nReport saved to: $ResolvedOutput" -ForegroundColor Green
        return Get-Item -LiteralPath $ResolvedOutput
    }

    if ($StdOut) {
        try {
            return $StdOut | ConvertFrom-Json
        } catch {
            # Not valid JSON: hand back the raw text rather than failing.
            return $StdOut
        }
    }
}