# Public/Test-AzLocalClusterHealth.ps1

function Test-AzLocalClusterHealth {
    <#
    .SYNOPSIS
        Validates cluster health before applying updates by checking for blocking health check failures.

    .DESCRIPTION
        Queries the health check results from each cluster's update summary to identify
        failed checks. Critical and Warning failures are reported; Informational
        failures are ignored. Critical failures block updates from being applied.

        This function can be used as a standalone pre-flight check or is called
        automatically by Start-AzLocalClusterUpdate before applying updates.

        Health check data is stored in ARM on the cluster's updateSummaries resource
        and is refreshed approximately every 24 hours. The summaries for all
        resolved clusters are fetched with a single Azure Resource Graph query.

        A cluster for which no update summary (or no healthCheckResult) is found is
        reported with HealthState "No Data" and is treated as passed.

    .PARAMETER ClusterResourceIds
        An array of full Azure Resource IDs for the clusters to check.
        Surrounding whitespace and trailing '/' characters are ignored.

    .PARAMETER ClusterNames
        An array of Azure Local cluster names to check.

    .PARAMETER ScopeByUpdateRingTag
        Find clusters by their 'UpdateRing' tag value via Azure Resource Graph.

    .PARAMETER UpdateRingValue
        The value of the 'UpdateRing' tag to match when using -ScopeByUpdateRingTag.

    .PARAMETER ResourceGroupName
        Resource group containing the clusters (only used with -ClusterNames).

    .PARAMETER SubscriptionId
        Azure subscription ID. When supplied it is forwarded to the Azure Resource
        Graph queries; when omitted the query helper's default scope is used.

    .PARAMETER BlockingOnly
        Show only Critical severity failures (the ones that block updates).

    .PARAMETER ApiVersion
        Azure REST API version. Defaults to the module-level $script:DefaultApiVersion.
        Accepted for backward compatibility; the Azure Resource Graph based
        implementation in this function does not read it.

    .PARAMETER ExportPath
        Export results to CSV (.csv), JSON (.json), or JUnit XML (.xml) file.
        The file is only written when at least one failure was found.

    .PARAMETER ExportFormat
        Explicit format to use when writing -ExportPath. One of: Auto, Csv, Json, JUnitXml.
        Default: Auto (resolved from the file extension of -ExportPath; unknown extensions fall back to Csv).
        Use this to write a specific format regardless of extension (e.g. a JUnit XML file with a .xml name but CI-picked parser).

    .PARAMETER UpdateSummary
        Pre-fetched update summary object from Get-AzLocalUpdateSummary.
        When provided, skips the internal summary fetch to avoid redundant API calls.
        Only used when exactly one cluster is resolved; otherwise it is ignored.

    .PARAMETER PassThru
        Emit the per-cluster result objects to the pipeline. Without this switch
        the function only logs its findings (and optionally exports them).

    .OUTPUTS
        PSCustomObject[] - Array of health check results per cluster (ClusterName,
        HealthState, Passed, CriticalCount, WarningCount, Failures).
        Only emitted when -PassThru is specified.

    .EXAMPLE
        Test-AzLocalClusterHealth -ClusterResourceIds @("/subscriptions/.../clusters/Seattle")
        Checks health for a single cluster by resource ID.

    .EXAMPLE
        Test-AzLocalClusterHealth -ScopeByUpdateRingTag -UpdateRingValue "Wave1" -BlockingOnly
        Shows only Critical (update-blocking) health failures for all Wave1 clusters.

    .EXAMPLE
        Test-AzLocalClusterHealth -ClusterNames "MyCluster" -ExportPath "C:\Reports\health.csv"
        Checks health and exports results to CSV.

    .EXAMPLE
        $results = Test-AzLocalClusterHealth -ClusterNames "MyCluster" -PassThru
        Checks health and captures the per-cluster result objects.
    #>

    [CmdletBinding(DefaultParameterSetName = 'ByResourceId')]
    [OutputType([PSCustomObject[]])]
    param(
        [Parameter(Mandatory = $true, ParameterSetName = 'ByResourceId')]
        [string[]]$ClusterResourceIds,

        [Parameter(Mandatory = $true, ParameterSetName = 'ByName')]
        [string[]]$ClusterNames,

        [Parameter(Mandatory = $true, ParameterSetName = 'ByTag')]
        [switch]$ScopeByUpdateRingTag,

        [ValidatePattern('^(\*\*\*|[A-Za-z0-9_-]{1,64}(;[A-Za-z0-9_-]{1,64})*)$')]
        [Parameter(Mandatory = $true, ParameterSetName = 'ByTag')]
        [string]$UpdateRingValue,

        [Parameter(Mandatory = $false, ParameterSetName = 'ByName')]
        [string]$ResourceGroupName,

        [Parameter(Mandatory = $false, ParameterSetName = 'ByName')]
        [Parameter(Mandatory = $false, ParameterSetName = 'ByResourceId')]
        [Parameter(Mandatory = $false, ParameterSetName = 'ByTag')]
        [string]$SubscriptionId,

        [Parameter(Mandatory = $false)]
        [switch]$BlockingOnly,

        [Parameter(Mandatory = $false)]
        [string]$ApiVersion = $script:DefaultApiVersion,

        [Parameter(Mandatory = $false)]
        [string]$ExportPath,

        [Parameter(Mandatory = $false)]
        [ValidateSet('Auto', 'Csv', 'Json', 'JUnitXml')]
        [string]$ExportFormat = 'Auto',

        [Parameter(Mandatory = $false)]
        [object]$UpdateSummary,

        [Parameter(Mandatory = $false)]
        [switch]$PassThru
    )

    # Escapes a value for embedding inside a single-quoted KQL string literal.
    # Backslash is the KQL escape character, so it must be doubled before the
    # quote itself is escaped. Valid Azure names never contain either character,
    # so well-formed input produces exactly the same query text as before.
    $escapeKql = {
        param([string]$Value)
        $Value.Replace('\', '\\').Replace("'", "\'")
    }

    # Pre-flight: Validate export path is writable before expensive operations
    if ($ExportPath) {
        try { Test-ExportPathWritable -Path $ExportPath | Out-Null }
        catch { Write-Warning $_.Exception.Message; return }
    }

    Write-Log -Message "" -Level Info
    Write-Log -Message "========================================" -Level Header
    Write-Log -Message "Azure Local Cluster Health Validation" -Level Header
    Write-Log -Message "========================================" -Level Header

    # Verify Azure CLI
    Test-AzCliAvailable | Out-Null
    try {
        $null = az account show 2>$null
        if ($LASTEXITCODE -ne 0) { throw "Not logged in" }
        Write-Log -Message "Azure CLI authentication verified" -Level Success
    }
    catch {
        Write-Log -Message "Azure CLI is not logged in. Please run 'az login' first." -Level Error
        return
    }

    # Ensure resource-graph extension is installed (the cmdlet is fully
    # ARG-driven from v0.7.68 - single batched updatesummaries query replaces
    # the per-cluster ARM REST fan-out).
    if (-not (Install-AzGraphExtension)) {
        Write-Error "Failed to install Azure CLI 'resource-graph' extension. Please install manually: az extension add --name resource-graph"
        return
    }

    # Build cluster list (reuse existing patterns)
    $clustersToCheck = @()

    if ($PSCmdlet.ParameterSetName -eq 'ByTag') {
        # -UpdateRingValue is constrained by ValidatePattern above, so it cannot
        # carry quote characters into the KQL filter.
        $ringFilter = ConvertTo-AzLocalUpdateRingKqlFilter -UpdateRingValue $UpdateRingValue
        $argQuery = "resources | where type =~ 'microsoft.azurestackhci/clusters' $ringFilter | project id, name, resourceGroup, subscriptionId"
        try {
            $argParams = @{ Query = $argQuery }
            if ($SubscriptionId) { $argParams['SubscriptionId'] = $SubscriptionId }
            $clusters = Invoke-AzResourceGraphQuery @argParams
        }
        catch {
            Write-Log -Message "Azure Resource Graph query failed: $($_.Exception.Message)" -Level Error
            return
        }
        if (-not $clusters -or $clusters.Count -eq 0) {
            Write-Log -Message "No clusters found with UpdateRing = '$UpdateRingValue'" -Level Warning
            return @()
        }
        foreach ($c in $clusters) {
            $clustersToCheck += @{ ResourceId = $c.id; Name = $c.name }
        }
    }
    elseif ($PSCmdlet.ParameterSetName -eq 'ByResourceId') {
        foreach ($rid in $ClusterResourceIds) {
            # Tolerate stray whitespace / trailing slashes: they would otherwise
            # yield an empty cluster name and an ID that never matches a summary
            # row (which is then reported as a passing "No Data" result).
            $normalizedId = $rid.Trim().TrimEnd('/')
            $clustersToCheck += @{ ResourceId = $normalizedId; Name = ($normalizedId -split '/')[-1] }
        }
    }
    else {
        # ByName - v0.7.68: single ARG batch lookup replaces the per-name
        # Get-AzLocalClusterInfo ARM REST loop.
        # Invariant lowercasing keeps the values aligned with KQL tolower()
        # regardless of the session culture (e.g. Turkish dotless i).
        $nameListKql = ($ClusterNames | ForEach-Object { "'$(& $escapeKql ($_.ToLowerInvariant()))'" }) -join ','
        $rgFilter = ''
        if ($ResourceGroupName) { $rgFilter = "| where tolower(resourceGroup) =~ '$(& $escapeKql ($ResourceGroupName.ToLowerInvariant()))'" }
        $argQuery = "resources | where type =~ 'microsoft.azurestackhci/clusters' | where tolower(name) in~ ($nameListKql) $rgFilter | project id, name, resourceGroup, subscriptionId"
        try {
            $argParams = @{ Query = $argQuery }
            if ($SubscriptionId) { $argParams['SubscriptionId'] = $SubscriptionId }
            $clusterRows = Invoke-AzResourceGraphQuery @argParams
        }
        catch {
            Write-Log -Message "Error resolving cluster names via Azure Resource Graph: $_" -Level Error
            return
        }
        $foundNames = @{}
        foreach ($row in @($clusterRows)) {
            # @($null) is a one-element array; skip such placeholder rows.
            if ($null -eq $row -or -not $row.name) { continue }
            $rowKey = $row.name.ToLowerInvariant()
            if ($foundNames.ContainsKey($rowKey)) {
                # The last matching row wins (unchanged behaviour); make the
                # ambiguity visible instead of silently picking one.
                Write-Log -Message "Cluster name '$($row.name)' matches more than one cluster; using '$($row.id)'. Use -ResourceGroupName, -SubscriptionId or -ClusterResourceIds to disambiguate." -Level Warning
            }
            $foundNames[$rowKey] = $row
        }
        foreach ($name in $ClusterNames) {
            $nameKey = $name.ToLowerInvariant()
            if ($foundNames.ContainsKey($nameKey)) {
                $match = $foundNames[$nameKey]
                $clustersToCheck += @{ ResourceId = $match.id; Name = $match.name }
            }
            else {
                Write-Log -Message "Cluster '$name' not found - skipping" -Level Warning
            }
        }
    }

    if (-not $clustersToCheck -or $clustersToCheck.Count -eq 0) {
        Write-Log -Message "No clusters resolved for health validation." -Level Warning
        return @()
    }

    Write-Log -Message "Checking health for $($clustersToCheck.Count) cluster(s)..." -Level Info

    $results = @()
    $overallPassed = $true

    # v0.7.68: Batch every cluster's update summary into one Azure Resource
    # Graph query. The previous design made one ARM REST call per cluster
    # (optionally parallelised across Start-Job runspaces). ARG returns the
    # same `properties.healthCheckResult` shape as the ARM REST response, so
    # the downstream parsing logic is unchanged.
    #
    # Fast-path: when the caller pre-fetched the summary (used by
    # Start-AzLocalClusterUpdate's single-cluster invocation), skip the
    # ARG query and use the supplied object directly.
    $summaryByCluster = @{}
    if ($UpdateSummary -and $clustersToCheck.Count -eq 1) {
        $summaryByCluster[$clustersToCheck[0].ResourceId.ToLowerInvariant()] = $UpdateSummary
    }
    else {
        $idListKql = ($clustersToCheck | ForEach-Object { "'$(& $escapeKql ($_.ResourceId.ToLowerInvariant()))'" }) -join ','
        # The query rebuilds each parent cluster ID from the summary's own ID
        # segments and lowercases it, so it can be matched against $idListKql.
        $summariesKql = "extensibilityresources | where type =~ 'microsoft.azurestackhci/clusters/updatesummaries' | extend ids = split(id, '/') | extend ClusterResourceId_ = tolower(strcat('/subscriptions/', tostring(ids[2]), '/resourceGroups/', tostring(ids[4]), '/providers/Microsoft.AzureStackHCI/clusters/', tostring(ids[8]))) | where ClusterResourceId_ in~ ($idListKql) | project id, name, properties, ClusterResourceId_"
        try {
            $argParams = @{ Query = $summariesKql }
            if ($SubscriptionId) { $argParams['SubscriptionId'] = $SubscriptionId }
            $summaryRows = Invoke-AzResourceGraphQuery @argParams
        }
        catch {
            Write-Log -Message "Azure Resource Graph query for update summaries failed: $($_.Exception.Message)" -Level Error
            return
        }
        # Drop null placeholders so they are neither indexed nor counted.
        $summaryRows = @($summaryRows | Where-Object { $null -ne $_ })
        foreach ($row in $summaryRows) {
            $summaryByCluster[[string]$row.ClusterResourceId_] = $row
        }
        Write-Log -Message "Returned $($summaryRows.Count) update-summary record(s) via Azure Resource Graph" -Level Success
    }

    foreach ($cluster in $clustersToCheck) {
        $clusterName = $cluster.Name
        Write-Host " Checking: $clusterName..." -ForegroundColor Gray -NoNewline

        try {
            $lookupKey = $cluster.ResourceId.ToLowerInvariant()
            $summary = if ($summaryByCluster.ContainsKey($lookupKey)) { $summaryByCluster[$lookupKey] } else { $null }

            # Missing health data is reported but does not block (Passed = $true).
            if (-not $summary -or -not $summary.properties.healthCheckResult) {
                Write-Host " No Health Data" -ForegroundColor Yellow
                $results += [PSCustomObject]@{
                    ClusterName = $clusterName; HealthState = "No Data"; Passed = $true
                    CriticalCount = 0; WarningCount = 0; Failures = @()
                }
                continue
            }

            $healthState = if ($summary.properties.healthState) { [string]$summary.properties.healthState } else { "Unknown" }
            $healthChecks = $summary.properties.healthCheckResult

            # Extract failures (Critical and Warning only; use -BlockingOnly for Critical only)
            $failures = @()
            # Track seen rows for dedup. The ARM updateSummaries.healthCheckResult feed
            # sometimes emits byte-identical duplicate entries for the same logical
            # check (observed in v0.7.76 on a 2-node Azure Local cluster where the
            # "Test Network intent on existing cluster nodes" check emitted two
            # rows with identical CheckName/Severity/Description/Remediation/
            # TargetResourceName/Timestamp). Faithfully echoing those into the
            # operator's CSV doubled the displayed failure count and made
            # Step.5_assess-update-readiness.yml reports confusing. We dedup
            # by the COMPLETE row tuple: if every field is identical the row is
            # a duplicate; per-node distinct findings (different TargetResource
            # Name or Timestamp) stay separate.
            $seenKeys = New-Object 'System.Collections.Generic.HashSet[string]'
            # Composite-key field separator. U+001F (UNIT SEPARATOR) is not
            # expected in human-readable strings, so field values should not
            # collide with a separator boundary. Windows PowerShell 5.1 does
            # not support the `u{XXXX} escape so we build the char explicitly.
            $usSep = [char]0x1F
            foreach ($check in $healthChecks) {
                if ($check.status -eq "Failed") {
                    $sev = if ($check.severity) { $check.severity } else { "Unknown" }
                    if ($BlockingOnly -and $sev -ne "Critical") { continue }
                    if ($sev -eq "Informational") { continue }
                    $displayName = if ($check.displayName) { $check.displayName } elseif ($check.name) { ($check.name -split '/')[0] } else { "Unknown" }
                    $description = if ($check.description) { $check.description } else { "" }
                    $remediation = if ($check.remediation) { $check.remediation } else { "" }
                    $targetResName = if ($check.targetResourceName) { $check.targetResourceName } else { "" }
                    $timestamp = if ($check.timestamp) { $check.timestamp } else { "" }
                    $dedupKey = $clusterName + $usSep + $displayName + $usSep + $sev + $usSep + $description + $usSep + $remediation + $usSep + $targetResName + $usSep + $timestamp
                    # HashSet.Add returns $false when the key is already present.
                    if (-not $seenKeys.Add($dedupKey)) {
                        Write-Verbose "Suppressing duplicate healthCheckResult row for cluster '$clusterName' check '$displayName' target '$targetResName' timestamp '$timestamp' (ARM upstream duplicate)."
                        continue
                    }
                    $failures += [PSCustomObject]@{
                        ClusterName        = $clusterName
                        CheckName          = $displayName
                        Severity           = $sev
                        Description        = $description
                        Remediation        = $remediation
                        TargetResourceName = $targetResName
                        Timestamp          = $timestamp
                    }
                }
            }

            $critCount = @($failures | Where-Object { $_.Severity -eq "Critical" }).Count
            $warnCount = @($failures | Where-Object { $_.Severity -eq "Warning" }).Count
            # Only Critical failures block an update.
            $passed = ($critCount -eq 0)
            if (-not $passed) { $overallPassed = $false }

            # Console output
            if ($passed -and $failures.Count -eq 0) {
                Write-Host " Healthy" -ForegroundColor Green
            }
            elseif ($passed) {
                Write-Host " Warnings ($warnCount)" -ForegroundColor Yellow
            }
            else {
                Write-Host " BLOCKED ($critCount critical)" -ForegroundColor Red
            }

            $results += [PSCustomObject]@{
                ClusterName   = $clusterName
                HealthState   = $healthState
                Passed        = $passed
                CriticalCount = $critCount
                WarningCount  = $warnCount
                Failures      = $failures
            }
        }
        catch {
            # A cluster whose data cannot be evaluated is treated as failed.
            Write-Host " Error: $($_.Exception.Message)" -ForegroundColor Red
            $results += [PSCustomObject]@{
                ClusterName = $clusterName; HealthState = "Error"; Passed = $false
                CriticalCount = 0; WarningCount = 0; Failures = @()
            }
            $overallPassed = $false
        }
    }

    # Summary
    Write-Log -Message "" -Level Info
    Write-Log -Message "========================================" -Level Header
    Write-Log -Message "Health Validation Summary" -Level Header
    Write-Log -Message "========================================" -Level Header

    $totalClusters = $results.Count
    $passedCount = @($results | Where-Object { $_.Passed -eq $true }).Count
    $failedCount = $totalClusters - $passedCount

    Write-Log -Message "Total Clusters: $totalClusters" -Level Info
    Write-Log -Message "Passed: $passedCount (no critical failures)" -Level $(if ($passedCount -eq $totalClusters) { "Success" } else { "Info" })
    Write-Log -Message "Blocked: $failedCount (critical failures present)" -Level $(if ($failedCount -gt 0) { "Error" } else { "Info" })

    # Display failure details
    $allFailures = @($results | ForEach-Object { $_.Failures } | Where-Object { $_ })
    if ($allFailures.Count -gt 0) {
        Write-Log -Message "" -Level Info
        Write-Log -Message "Health Check Failures:" -Level Header
        $allFailures | Format-Table ClusterName, Severity, CheckName, TargetResourceName, Description -AutoSize -Wrap | Out-String -Stream | ForEach-Object {
            if ($_ -ne "") { Write-Log -Message $_ -Level Info }
        }

        # Show remediation for Critical failures
        $criticalFailures = @($allFailures | Where-Object { $_.Severity -eq "Critical" })
        if ($criticalFailures.Count -gt 0) {
            Write-Log -Message "" -Level Info
            Write-Log -Message "Remediation for Critical (Update-Blocking) Failures:" -Level Warning
            foreach ($f in $criticalFailures) {
                if ($f.Remediation) {
                    $nodeInfo = if ($f.TargetResourceName) { " ($($f.TargetResourceName))" } else { "" }
                    Write-Log -Message " $($f.ClusterName) - $($f.CheckName)$nodeInfo`: $($f.Remediation)" -Level Warning
                }
            }
        }
    }
    else {
        Write-Log -Message "" -Level Info
        Write-Log -Message "No health check failures detected. All clusters are ready for updates." -Level Success
    }

    # Overall result
    Write-Log -Message "" -Level Info
    if ($overallPassed) {
        Write-Log -Message "HEALTH VALIDATION PASSED - All clusters are ready for updates" -Level Success
    }
    else {
        Write-Log -Message "HEALTH VALIDATION FAILED - Critical health issues must be resolved before updates can proceed" -Level Error
    }

    # Export if path specified (only when there is at least one failure to write)
    if ($ExportPath -and $allFailures.Count -gt 0) {
        try {
            $ExportPath = Resolve-SafeOutputPath -Path $ExportPath
            $exportDir = Split-Path -Path $ExportPath -Parent
            if ($exportDir -and -not (Test-Path $exportDir)) {
                New-Item -ItemType Directory -Path $exportDir -Force | Out-Null
            }
            # Resolve effective format: explicit -ExportFormat wins; 'Auto' falls back
            # to file-extension detection for backward compatibility.
            $effectiveFormat = $ExportFormat
            if ($effectiveFormat -eq 'Auto') {
                $extension = [System.IO.Path]::GetExtension($ExportPath).ToLowerInvariant()
                $effectiveFormat = switch ($extension) {
                    '.csv'  { 'Csv' }
                    '.json' { 'Json' }
                    '.xml'  { 'JUnitXml' }
                    default { 'Csv' }
                }
            }
            switch ($effectiveFormat) {
                'Csv' {
                    $allFailures | ConvertTo-SafeCsvCollection | Export-Csv -Path $ExportPath -NoTypeInformation -Encoding UTF8
                    Write-Log -Message "Results exported to CSV: $ExportPath" -Level Success
                }
                'Json' {
                    $exportData = @{
                        Timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
                        OverallPassed = $overallPassed
                        TotalClusters = $totalClusters
                        Passed = $passedCount
                        Blocked = $failedCount
                        Failures = $allFailures
                    }
                    Write-Utf8NoBomFile -Path $ExportPath -Content ($exportData | ConvertTo-Json -Depth 10)
                    Write-Log -Message "Results exported to JSON: $ExportPath" -Level Success
                }
                'JUnitXml' {
                    # Map each failure onto the property names that are passed
                    # to Export-ResultsToJUnitXml below.
                    $junitResults = $allFailures | ForEach-Object {
                        $junitNodeInfo = if ($_.TargetResourceName) { " (Node: $($_.TargetResourceName))" } else { "" }
                        [PSCustomObject]@{
                            ClusterName = $_.ClusterName; Status = "Failed"
                            Message = "$($_.Severity): $($_.CheckName)$junitNodeInfo - $($_.Description)"
                            UpdateName = $_.CheckName; CurrentState = $_.Severity
                        }
                    }
                    Export-ResultsToJUnitXml -Results $junitResults -OutputPath $ExportPath `
                        -TestSuiteName "AzureLocalClusterHealth" -OperationType "HealthCheck"
                    Write-Log -Message "Results exported to JUnit XML: $ExportPath" -Level Success
                }
            }
        }
        catch {
            Write-Log -Message "Failed to export results: $($_.Exception.Message)" -Level Error
        }
    }

    if ($PassThru) {
        return $results
    }
}