Public/Test-AITJudgeModel.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

<#
.SYNOPSIS
    Tests whether a different AI model improves debate turn validation by replaying
    the Stage-B judge prompt across multiple models and comparing verdicts.
.DESCRIPTION
    Runs a cross-model audit on completed debates. For each statement turn, the
    judge prompt is sent to every specified model and the verdicts are compared.
    Surfaces blind spots (turns one model flags but others pass), agreement rates,
    and per-model statistics.
 
    This is a one-off experiment tool — it does not modify any debate files.
.EXAMPLE
    Test-AITJudgeModel -DebateCount 3
    # Audits the 3 debates with the most validated turns using haiku + gemini-3.1-flash-lite-preview
.EXAMPLE
    Test-AITJudgeModel -DebatePath ../ai-triad-data/debates/debate-4bc8ae8a-1459-4d33-b306-4bdb2308d423.json -Models haiku,sonnet,gemini
.EXAMPLE
    Test-AITJudgeModel -DebateCount 5 -Models haiku,gemini,groq -MaxTurnsPerDebate 10 -OutputPath ./judge-audit.json
.EXAMPLE
    Test-AITJudgeModel -All -Models haiku,gemini -MaxTurnsPerDebate 5
    # Quick sweep across all debates with turn validations
#>

function Test-AITJudgeModel {
    [CmdletBinding(DefaultParameterSetName = 'Auto')]
    param(
        [Parameter(ParameterSetName = 'Auto')]
        [ValidateRange(1, 50)]
        [int]$DebateCount = 3,

        [Parameter(ParameterSetName = 'Auto')]
        [switch]$All,

        [Parameter(Mandatory, ParameterSetName = 'Explicit')]
        [string[]]$DebatePath,

        [Parameter()]
        [string]$Models = 'haiku,gemini',

        [Parameter()]
        [ValidateRange(1, 100)]
        [int]$MaxTurnsPerDebate = 50,

        [Parameter()]
        [string]$OutputPath
    )

    Set-StrictMode -Version Latest

    # ── Verify prerequisites ─────────────────────────────
    $NpxCmd = Get-Command npx.cmd -ErrorAction SilentlyContinue
    if (-not $NpxCmd) { $NpxCmd = Get-Command npx -ErrorAction SilentlyContinue }
    if (-not $NpxCmd) {
        throw "npx is required. Install Node.js (v18+): https://nodejs.org"
    }

    $RepoRoot = Get-CodeRoot
    $CliPath = Join-Path $RepoRoot 'lib' 'debate' 'judgeAudit.ts'
    if (-not (Test-Path $CliPath)) {
        throw "Judge audit CLI not found at: $CliPath"
    }

    # ── Resolve debate files ─────────────────────────────
    if ($PSCmdlet.ParameterSetName -eq 'Auto') {
        try {
            $DebatesDir = Get-DebatesDir
        } catch {
            throw "Cannot locate debates directory. Set AI_TRIAD_DATA_ROOT or check .aitriad.json."
        }

        $AllDebateFiles = Get-ChildItem -Path $DebatesDir -Filter 'debate-*.json' -File
        if ($AllDebateFiles.Count -eq 0) {
            throw "No debate files found in $DebatesDir"
        }

        # Rank by number of validated turns (debates with turn_validations are most interesting)
        $Ranked = $AllDebateFiles | ForEach-Object {
            try {
                $D = Get-Content $_.FullName -Raw | ConvertFrom-Json -ErrorAction Stop
                $StmtCount = ($D.transcript | Where-Object { $_.type -eq 'statement' -or $_.type -eq 'opening' }).Count
                $TvCount = if ($D.turn_validations) { ($D.turn_validations.PSObject.Properties).Count } else { 0 }
                [PSCustomObject]@{
                    Path       = $_.FullName
                    Title      = ($D.title ?? '').Substring(0, [Math]::Min(60, ($D.title ?? '').Length))
                    Statements = $StmtCount
                    Validated  = $TvCount
                }
            } catch {
                $null
            }
        } | Where-Object { $_ -ne $null -and $_.Statements -ge 6 } |
            Sort-Object -Property Validated -Descending

        if ($All) {
            $Selected = $Ranked | Where-Object { $_.Validated -gt 0 }
        } else {
            $Selected = $Ranked | Select-Object -First $DebateCount
        }

        if ($Selected.Count -eq 0) {
            throw "No debates with enough statement turns found."
        }

        Write-Host "Selected $($Selected.Count) debate(s):" -ForegroundColor Cyan
        $Selected | ForEach-Object {
            Write-Host " $($_.Statements) turns | $($_.Validated) validated | $($_.Title)" -ForegroundColor DarkCyan
        }
        Write-Host ""

        $DebatePaths = $Selected | ForEach-Object { $_.Path }
    } else {
        $DebatePaths = $DebatePath | ForEach-Object {
            $Resolved = Resolve-Path $_ -ErrorAction SilentlyContinue
            if (-not $Resolved) { throw "Debate file not found: $_" }
            $Resolved.Path
        }
    }

    # ── Build CLI arguments ──────────────────────────────
    $DebateArgs = ($DebatePaths | ForEach-Object { "--debate `"$_`"" }) -join ' '
    $FullArgs = "tsx `"$CliPath`" $DebateArgs --models $Models --max-turns $MaxTurnsPerDebate"

    if ($OutputPath) {
        $ResolvedOutput = Join-Path $PWD $OutputPath
        $FullArgs += " --output `"$ResolvedOutput`""
    }

    Write-Host "Running judge audit: $($DebatePaths.Count) debate(s), models=$Models" -ForegroundColor Yellow
    Write-Host ""

    # ── Execute ──────────────────────────────────────────
    $Psi = [System.Diagnostics.ProcessStartInfo]::new()
    $Psi.FileName = $NpxCmd.Source
    $Psi.Arguments = $FullArgs
    $Psi.WorkingDirectory = $RepoRoot
    $Psi.RedirectStandardOutput = $true
    $Psi.RedirectStandardError = $true
    $Psi.UseShellExecute = $false
    $Psi.CreateNoWindow = $true

    try {
        $Proc = [System.Diagnostics.Process]::Start($Psi)
    } catch {
        throw "Failed to start judge audit process: $_"
    }

    # Stream progress from stderr
    while (-not $Proc.StandardError.EndOfStream) {
        $Line = $Proc.StandardError.ReadLine()
        if ($Line) { Write-Host $Line -ForegroundColor DarkGray }
    }

    $StdOut = $Proc.StandardOutput.ReadToEnd()
    if (-not $Proc.WaitForExit(1200000)) {
        try { $Proc.Kill() } catch { }
        throw "Judge audit timed out after 20 minutes."
    }

    if ($Proc.ExitCode -ne 0 -and -not $StdOut) {
        throw "Judge audit failed with exit code $($Proc.ExitCode)."
    }

    # ── Return result ────────────────────────────────────
    if ($OutputPath) {
        Write-Host "`nReport saved to: $ResolvedOutput" -ForegroundColor Green
        return Get-Item $ResolvedOutput
    }

    if ($StdOut) {
        try {
            return $StdOut | ConvertFrom-Json
        } catch {
            return $StdOut
        }
    }
}