Public/Invoke-AzLocalReadinessGatedFailedUpdateRetry.ps1

function Invoke-AzLocalReadinessGatedFailedUpdateRetry {
    <#
    .SYNOPSIS
        Step.6 thin-YAML adapter that retries previously FAILED cluster updates
        for the clusters in the readiness CSV, emits per-status step outputs,
        persists per-cluster results to apply-retry-results.json, and appends a
        dedicated 'Failed Update Single-Retry' section to the consolidated
        pipeline summary.
    .DESCRIPTION
        v0.8.95 opt-in companion to Invoke-AzLocalReadinessGatedClusterUpdate.

        The apply-updates pipeline already produces readiness-report.csv (one row
        per cluster in the UpdateRing scope, with ReadyForUpdate, UpdateState and
        ClusterResourceId columns). The readiness gate marks clusters whose latest
        run terminated in a FAILED state as ReadyForUpdate=False (NotReady), so the
        primary apply step never touches them. This adapter consumes the SAME CSV
        and selects exactly those failed-state rows, then calls the per-cluster
        primitive Invoke-AzLocalFailedUpdateRetry once for each.

        The guard against re-applying in a loop lives in the primitive (the
        durable UpdateRetryAttempted cluster tag), NOT here - this adapter is a
        thin fan-out. It invokes the primitive with -Confirm:$false (unattended)
        but WITHOUT -Force, so the one-time guard stays in force: a cluster that
        was already retried once for the same update version returns
        'RetryAlreadyAttempted' and is not re-applied. The guard tag is
        auto-cleared once the retried run reaches Succeeded
        (Invoke-AzLocalSideloadedAutoResetForCluster), re-arming a future retry.

        This step is OPT-IN at the pipeline layer: the bundled apply-updates.yml
        pipelines only invoke it when the FAILED_UPDATES_SINGLE_RETRY pipeline
        variable is 'true'. The cmdlet itself never reads that variable - the
        gating is entirely in the YAML.

        Failed updateSummary states recognised: NeedsAttention, UpdateFailed,
        PreparationFailed. UpdateInProgress (including stalled/orphaned runs) is
        intentionally NOT retried - the primitive skips those.

        The per-cluster results file is always written as a JSON array: '[]'
        when nothing was eligible, a one-element array for a single cluster.
    .PARAMETER ReadinessCsvPath
        Path to readiness-report.csv produced by the check-readiness job. Treated
        as a literal path (no wildcard expansion). The function throws when the
        file does not exist, or when it has rows but lacks the ClusterResourceId
        or UpdateState column.
    .PARAMETER UpdateRing
        UpdateRing label used in console/summary output. Cosmetic only; the retry
        scope comes from the CSV.
    .PARAMETER DryRun
        Switch. When set, the primitive is invoked with -WhatIf so no retry is
        actually issued.
    .PARAMETER OutputDirectory
        Directory where apply-retry-results.json is written. Defaults to the
        readiness CSV's parent directory. Created when it does not exist.
    .PARAMETER RetryResultsJsonFileName
        Per-cluster JSON filename. Default 'apply-retry-results.json'.
    .PARAMETER SummaryFileName
        Per-task markdown filename (ADO/Local only). Default
        'azlocal-step6-retry-summary.md'.
    .PARAMETER ApiVersion
        Azure REST API version forwarded to the primitive. Defaults to the module
        default API version.
    .PARAMETER PassThru
        Returns a PSCustomObject with the four counters (RetryStarted,
        RetryAlreadyAttempted, RetrySkipped, RetryFailed) plus Results,
        RetryResultsJsonPath, FailedResourceIds and SummaryPath.
    .OUTPUTS
        PSCustomObject (with -PassThru) or none.
    .NOTES
        Author : AzLocal.UpdateManagement
        Version : 0.8.95
    #>

    [CmdletBinding(SupportsShouldProcess = $true)]
    [OutputType([void])]
    [OutputType([pscustomobject])]
    param(
        [Parameter(Mandatory = $true)]
        [ValidateNotNullOrEmpty()]
        [string]$ReadinessCsvPath,

        [Parameter(Mandatory = $false)]
        [AllowEmptyString()]
        [string]$UpdateRing = '',

        [switch]$DryRun,

        [Parameter(Mandatory = $false)]
        [AllowEmptyString()]
        [string]$OutputDirectory = '',

        [Parameter(Mandatory = $false)]
        [ValidateNotNullOrEmpty()]
        [string]$RetryResultsJsonFileName = 'apply-retry-results.json',

        [Parameter(Mandatory = $false)]
        [ValidateNotNullOrEmpty()]
        [string]$SummaryFileName = 'azlocal-step6-retry-summary.md',

        [Parameter(Mandatory = $false)]
        [string]$ApiVersion = $script:DefaultApiVersion,

        [switch]$PassThru
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    $pipelineHost = Get-AzLocalPipelineHost

    # Host-specific prefix for log lines that should surface as warnings. Azure
    # DevOps uses the ##vso logging command; every other host keeps the
    # '::warning::' workflow-command form this function has always emitted.
    $warningPrefix = if ($pipelineHost -eq 'AzureDevOps') { '##vso[task.logissue type=warning]' } else { '::warning::' }

    # updateSummary states that mark a terminal failed/aborted run. Mirrors the
    # gate in Invoke-AzLocalFailedUpdateRetry; the primitive re-validates per
    # cluster, so this CSV-level filter is only a coarse pre-selection.
    $failedStates = @('NeedsAttention', 'UpdateFailed', 'PreparationFailed')

    # OutputDirectory default: parent of the readiness CSV.
    if (-not $OutputDirectory) {
        $OutputDirectory = Split-Path -Path $ReadinessCsvPath -Parent
        if (-not $OutputDirectory) { $OutputDirectory = '.' }
    }
    if (-not (Test-Path -LiteralPath $OutputDirectory)) {
        New-Item -ItemType Directory -Path $OutputDirectory -Force | Out-Null
    }
    $retryJsonPath = Join-Path -Path $OutputDirectory -ChildPath $RetryResultsJsonFileName

    # Per-host step-output naming: GH UPPER_SNAKE, ADO PascalCase.
    if ($pipelineHost -eq 'AzureDevOps') {
        $nStarted = 'RetryStarted'; $nAlready = 'RetryAlready'
        $nSkipped = 'RetrySkipped'; $nFailed = 'RetryFailed'
    }
    else {
        $nStarted = 'RETRY_STARTED'; $nAlready = 'RETRY_ALREADY'
        $nSkipped = 'RETRY_SKIPPED'; $nFailed = 'RETRY_FAILED'
    }

    # Lexically-scoped emitter (no .GetNewClosure() - that severs module
    # SessionState and hides the private Set-AzLocalPipelineOutput function).
    $emitCounters = {
        param($st, $al, $sk, $f)
        Set-AzLocalPipelineOutput -Name $nStarted -Value "$st" -CrossJob
        Set-AzLocalPipelineOutput -Name $nAlready -Value "$al" -CrossJob
        Set-AzLocalPipelineOutput -Name $nSkipped -Value "$sk" -CrossJob
        Set-AzLocalPipelineOutput -Name $nFailed  -Value "$f"  -CrossJob
    }

    if (-not (Test-Path -LiteralPath $ReadinessCsvPath)) {
        throw "Invoke-AzLocalReadinessGatedFailedUpdateRetry: Readiness CSV not found at '$ReadinessCsvPath'. The check-readiness job did not upload a readiness-report artifact - cannot determine which clusters to retry."
    }

    # -LiteralPath to match the Test-Path above: a path containing wildcard
    # characters such as '[' or ']' must not be glob-expanded.
    $readinessRows = @(Import-Csv -LiteralPath $ReadinessCsvPath)
    if ($readinessRows.Count -gt 0) {
        # Validate the schema up front; under StrictMode a missing column would
        # otherwise surface as an opaque property-not-found error in the filter.
        $cols = $readinessRows[0].PSObject.Properties.Name
        if (-not ($cols -contains 'ClusterResourceId')) {
            throw "Invoke-AzLocalReadinessGatedFailedUpdateRetry: Readiness CSV at '$ReadinessCsvPath' is missing the 'ClusterResourceId' column. Re-run check-readiness with v0.7.62+ or refresh the pipeline YAML via Copy-AzLocalPipelineExample -Update."
        }
        if (-not ($cols -contains 'UpdateState')) {
            throw "Invoke-AzLocalReadinessGatedFailedUpdateRetry: Readiness CSV at '$ReadinessCsvPath' is missing the 'UpdateState' column - cannot identify failed clusters to retry."
        }
    }

    # Coarse pre-selection: failed updateSummary state AND a usable resource id.
    # Whitespace-only ids are not usable, so they are excluded rather than
    # handed to the primitive; surviving ids are trimmed.
    $failedRows = @($readinessRows | Where-Object {
            (-not [string]::IsNullOrWhiteSpace($_.ClusterResourceId)) -and ($_.UpdateState -in $failedStates)
        })
    [string[]]$failedResourceIds = @($failedRows | ForEach-Object { ([string]$_.ClusterResourceId).Trim() })

    Write-Host "Readiness CSV: $($readinessRows.Count) row(s), $($failedResourceIds.Count) in a failed update state eligible for single-retry."

    # Local helper to render + emit the summary section, so every early-return
    # path still contributes a section to the consolidated summary. Reads
    # $UpdateRing, $DryRun and $SummaryFileName from the enclosing function scope.
    $renderSummary = {
        param($Rows, $StartedN, $AlreadyN, $SkippedN, $FailedN)
        $okIcon = ':white_check_mark:'; $alreadyIcon = ':fast_forward:'
        $skipIcon = ':no_entry:'; $failIcon = ':x:'
        $sb = New-Object System.Text.StringBuilder
        [void]$sb.AppendLine('## Failed Update Single-Retry')
        [void]$sb.AppendLine('')
        if (-not [string]::IsNullOrWhiteSpace($UpdateRing)) {
            [void]$sb.AppendLine("Target UpdateRing: **$UpdateRing**")
            [void]$sb.AppendLine('')
        }
        if ($DryRun) {
            [void]$sb.AppendLine('> This was a dry run - no retries were actually issued.')
            [void]$sb.AppendLine('')
        }
        [void]$sb.AppendLine('| Result | Count |')
        [void]$sb.AppendLine('|---|---|')
        [void]$sb.AppendLine("| $okIcon Retry started | $StartedN |")
        [void]$sb.AppendLine("| $alreadyIcon Already retried (guard) | $AlreadyN |")
        [void]$sb.AppendLine("| $skipIcon Skipped | $SkippedN |")
        [void]$sb.AppendLine("| $failIcon Failed | $FailedN |")
        [void]$sb.AppendLine('')
        if ($Rows -and @($Rows).Count -gt 0) {
            [void]$sb.AppendLine('| Cluster | Status | Update | Message |')
            [void]$sb.AppendLine('|---|---|---|---|')
            foreach ($row in @($Rows)) {
                # Escape table delimiters and flatten newlines so a multi-line
                # error message cannot break the markdown table row.
                $msg = if ($row.PSObject.Properties['Message'] -and $row.Message) {
                    ([string]$row.Message) -replace '\|', '\|' -replace '\r?\n', ' '
                }
                else { '' }
                if ($msg.Length -gt 200) { $msg = $msg.Substring(0, 197) + '...' }
                $upd = if ($row.PSObject.Properties['UpdateName'] -and $row.UpdateName) { [string]$row.UpdateName } else { '' }
                [void]$sb.AppendLine(('| {0} | {1} | {2} | {3} |' -f $row.ClusterName, $row.Status, $upd, $msg))
            }
            [void]$sb.AppendLine('')
        }
        else {
            [void]$sb.AppendLine('No clusters were in a failed update state - nothing to retry.')
            [void]$sb.AppendLine('')
        }
        return (Add-AzLocalPipelineStepSummary -Markdown $sb.ToString() -SummaryFileName $SummaryFileName)
    }

    if ($failedResourceIds.Count -eq 0) {
        & $emitCounters 0 0 0 0
        $summaryPath = & $renderSummary @() 0 0 0 0
        # -InputObject is required here: piping an empty array sends nothing to
        # ConvertTo-Json, which would leave an empty (invalid JSON) file instead
        # of '[]'.
        ConvertTo-Json -InputObject @() -Depth 4 | Out-File -FilePath $retryJsonPath -Encoding utf8 -Force
        if ($PassThru) {
            return [pscustomobject]@{
                RetryStarted          = 0
                RetryAlreadyAttempted = 0
                RetrySkipped          = 0
                RetryFailed           = 0
                Results               = @()
                RetryResultsJsonPath  = $retryJsonPath
                FailedResourceIds     = @()
                SummaryPath           = $summaryPath
            }
        }
        return
    }

    Write-Host ""
    Write-Host "========================================" -ForegroundColor Cyan
    Write-Host "Single-retry of FAILED updates (UpdateRing: $UpdateRing)" -ForegroundColor Cyan
    Write-Host " Clusters in failed state (from readiness CSV): $($failedResourceIds.Count)" -ForegroundColor Cyan
    if ($DryRun) { Write-Host " DRY RUN MODE - no retries will be issued" -ForegroundColor Yellow }
    Write-Host "========================================" -ForegroundColor Cyan

    # Generic list instead of array '+=' so accumulation stays linear.
    $collected = New-Object 'System.Collections.Generic.List[object]'
    foreach ($resId in $failedResourceIds) {
        $retryParams = @{
            ClusterResourceId = $resId
            ApiVersion        = $ApiVersion
            Confirm           = $false
        }
        if ($DryRun) { $retryParams['WhatIf'] = $true }
        # Primitive is best-effort per cluster; never let one cluster abort the
        # fan-out (the primitive already wraps its body in try/catch and returns
        # a Failed row, but guard the invocation itself defensively).
        try {
            foreach ($item in @(Invoke-AzLocalFailedUpdateRetry @retryParams)) {
                $collected.Add($item)
            }
        }
        catch {
            $name = ($resId -split '/')[-1]
            Write-Host "${warningPrefix}Retry invocation threw for '$name': $($_.Exception.Message)"
            # Synthesize a row shaped like the fields this function reads from
            # the primitive's output so counters, JSON and summary still work.
            $collected.Add([pscustomobject]@{
                    ClusterName = $name
                    Status      = 'Failed'
                    Message     = $_.Exception.Message
                    UpdateName  = $null
                    StartTime   = (Get-Date)
                    EndTime     = (Get-Date)
                    Duration    = '00:00:00'
                })
        }
    }
    $results = $collected.ToArray()

    Write-Host ""
    Write-Host "Single-retry operation complete"

    # Bucket per-cluster statuses into the four step-output counters.
    $retryStarted = @($results | Where-Object { $_.Status -eq 'RetryStarted' }).Count
    $retryAlready = @($results | Where-Object { $_.Status -eq 'RetryAlreadyAttempted' }).Count
    $retrySkipped = @($results | Where-Object { $_.Status -in @('Skipped', 'NotFound', 'WhatIf') }).Count
    $retryFailed  = @($results | Where-Object { $_.Status -in @('Failed', 'Error') }).Count

    & $emitCounters $retryStarted $retryAlready $retrySkipped $retryFailed

    # Pass the rows via -InputObject so a single result still serialises as a
    # one-element JSON array (piping would unwrap it into a bare object).
    $jsonRows = @($results | Select-Object ClusterName, Status, UpdateName, Duration, Message)
    ConvertTo-Json -InputObject $jsonRows -Depth 4 |
        Out-File -FilePath $retryJsonPath -Encoding utf8 -Force
    Write-Host "Wrote per-cluster retry results to $retryJsonPath"

    $summaryPath = & $renderSummary $results $retryStarted $retryAlready $retrySkipped $retryFailed

    # ADO-only per-bucket warning lines (preserve CI surface parity).
    if ($pipelineHost -eq 'AzureDevOps') {
        if ($retryFailed -gt 0) {
            # Name the file actually written, which may be overridden via
            # -RetryResultsJsonFileName.
            Write-Host "##vso[task.logissue type=warning]$retryFailed cluster(s) failed to retry. See $RetryResultsJsonFileName for per-cluster detail."
        }
        if ($retryAlready -gt 0) {
            Write-Host "##vso[task.logissue type=warning]$retryAlready cluster(s) were already retried once (one-time guard). Use Invoke-AzLocalFailedUpdateRetry -Force to override manually."
        }
    }

    if ($PassThru) {
        return [pscustomobject]@{
            RetryStarted          = $retryStarted
            RetryAlreadyAttempted = $retryAlready
            RetrySkipped          = $retrySkipped
            RetryFailed           = $retryFailed
            Results               = $results
            RetryResultsJsonPath  = $retryJsonPath
            FailedResourceIds     = $failedResourceIds
            SummaryPath           = $summaryPath
        }
    }
}