Public/Invoke-AzLocalFailedUpdateRetry.ps1
|
function Invoke-AzLocalFailedUpdateRetry {
    <#
    .SYNOPSIS
        Re-applies (retries) a previously FAILED Azure Local cluster update, once.

    .DESCRIPTION
        Issues the same `updates/{updateName}/apply` ARM action that the Azure portal's
        "Try again" button uses, to resume a cluster update whose latest run reached a
        terminal FAILED state. Azure Local update runs are resumable, so re-applying the
        same update version resumes it from where it failed.

        Guard rails (all enforced before any apply is issued):

        1. FAILED-state gate. The cluster's updateSummary state MUST be one of
           'NeedsAttention', 'UpdateFailed' or 'PreparationFailed'. A cluster that is
           still 'UpdateInProgress' (including a STALLED / orphaned run whose
           lastUpdatedTime has frozen) is deliberately SKIPPED - ARM rejects an apply
           over an in-flight run, and the orphaned run must be cleared first. This is
           the safe, intended behaviour for clusters like the Arizona orphaned-run case.

        2. One-time guard (idempotent). The retry is recorded in a DEDICATED
           'UpdateRetryAttempted' cluster tag (the durable one-time guard). On a
           subsequent invocation, if that tag already records an attempt for the SAME
           update version, the cluster is SKIPPED with status 'RetryAlreadyAttempted'
           so a scheduled pipeline never re-applies in a loop. Use -Force to override
           and retry again. Both tags (see below) are auto-cleared together by the
           normal post-success reconciliation
           (Invoke-AzLocalSideloadedAutoResetForCluster) once the retried update
           reaches Succeeded, re-arming a future retry naturally.

        Two tags are written by the retry, sharing one value format:

        - 'UpdateLastAttempt' (Outcome 'UpdateRetried' / 'Failed') - the generic
          last-attempt audit pointer, so the retry surfaces in the Step.08 audit trail
          like any other attempt. This tag is overwritten by every Start attempt, so it
          is NOT used as the retry guard.
        - 'UpdateRetryAttempted' (Outcome 'RetryStarted' / 'RetryFailed') - the
          dedicated, durable one-time guard read in step 5 of the function body.

        Both tag values are length-clamped to the Azure 256 character limit by
        Format-AzLocalUpdateLastAttemptTagValue.

    .PARAMETER ClusterName
        Name of a single Azure Local cluster to retry. Resolved via Azure Resource
        Graph / ARM. Use with -ResourceGroupName / -SubscriptionId to disambiguate.

    .PARAMETER ResourceGroupName
        Resource group containing the cluster (only used with -ClusterName).

    .PARAMETER ClusterResourceId
        Full Azure resource ID of a single cluster (alternative to -ClusterName).

    .PARAMETER SubscriptionId
        Azure subscription ID. Defaults to the current az CLI subscription.

    .PARAMETER UpdateName
        The failed update version to retry (e.g. 'Solution12.2604.1003.1005'). When
        omitted, the cmdlet auto-detects it from the cluster's most recent update run
        (which must itself be Failed/Error). Supply this explicitly for a targeted
        retry, or when the latest run cannot be auto-classified.

    .PARAMETER ApiVersion
        Azure REST API version. Defaults to the module default API version.

    .PARAMETER Force
        Override the one-time guard (retry even if the 'UpdateRetryAttempted' tag
        already records a 'RetryStarted' / 'RetryFailed' attempt for this update
        version) and suppress the confirmation prompt.

    .OUTPUTS
        PSCustomObject with ClusterName, Status, Message, UpdateName, StartTime,
        EndTime, Duration. Status is one of: RetryStarted, RetryAlreadyAttempted,
        Skipped, NotFound, Failed, WhatIf.

    .EXAMPLE
        Invoke-AzLocalFailedUpdateRetry -ClusterName 'Arizona'

        Auto-detects the failed update on 'Arizona' and retries it once (prompts for
        confirmation). Skips if the cluster is still UpdateInProgress.

    .EXAMPLE
        Invoke-AzLocalFailedUpdateRetry -ClusterName 'Arizona' -UpdateName 'Solution12.2604.1003.1005' -Force

        Targeted, unattended retry of a specific failed update version, overriding the
        one-time guard.

    .NOTES
        Author : AzLocal.UpdateManagement
        Version: 0.8.95
    #>
    [CmdletBinding(SupportsShouldProcess = $true, ConfirmImpact = 'High', DefaultParameterSetName = 'ByName')]
    [OutputType([PSCustomObject])]
    param(
        [Parameter(Mandatory = $true, ParameterSetName = 'ByName', Position = 0, ValueFromPipeline = $true, ValueFromPipelineByPropertyName = $true)]
        [ValidateNotNullOrEmpty()]
        [string]$ClusterName,

        [Parameter(Mandatory = $false, ParameterSetName = 'ByName')]
        [string]$ResourceGroupName,

        [Parameter(Mandatory = $true, ParameterSetName = 'ByResourceId', ValueFromPipelineByPropertyName = $true)]
        [ValidateNotNullOrEmpty()]
        [string]$ClusterResourceId,

        [Parameter(Mandatory = $false)]
        [string]$SubscriptionId,

        [Parameter(Mandatory = $false)]
        [ValidateNotNullOrEmpty()]
        [string]$UpdateName,

        [Parameter(Mandatory = $false)]
        [string]$ApiVersion = $script:DefaultApiVersion,

        [Parameter(Mandatory = $false)]
        [switch]$Force
    )

    begin {
        Test-AzCliAvailable | Out-Null

        # updateSummary states that indicate a terminal failed/aborted update run.
        # InProgress (incl. stalled/orphaned) is intentionally absent - see help.
        $failedStates = @('NeedsAttention', 'UpdateFailed', 'PreparationFailed')

        # States that confirm the latest run itself terminated in failure.
        $failedRunStates = @('Failed', 'Error')

        # Outcome written to the generic UpdateLastAttempt audit tag on success.
        $retryOutcome = 'UpdateRetried'

        # Outcomes written to the dedicated UpdateRetryAttempted one-time guard tag.
        $retryGuardStartedOutcome = 'RetryStarted'
        $retryGuardFailedOutcome = 'RetryFailed'

        # Honour -Force as "do not prompt" without forcing the caller to also pass
        # -Confirm:$false against the High ConfirmImpact declared above.
        if ($Force -and -not $PSBoundParameters.ContainsKey('Confirm')) {
            $ConfirmPreference = 'None'
        }
    }

    process {
        $startTime = Get-Date

        # Display name used for logging and for results produced before the cluster
        # is resolved: the last segment of the resource ID, or the supplied name.
        $clusterDisplay = if ($PSCmdlet.ParameterSetName -eq 'ByResourceId') {
            ($ClusterResourceId -split '/')[-1]
        }
        else {
            $ClusterName
        }

        # Local helper to build a consistent result row. It reads $startTime from
        # the enclosing process scope and stamps EndTime/Duration when invoked.
        $newResult = {
            param($Name, $Status, $Message, $Update)
            $end = Get-Date
            [PSCustomObject]@{
                ClusterName = $Name
                Status      = $Status
                Message     = $Message
                UpdateName  = $Update
                StartTime   = $startTime
                EndTime     = $end
                Duration    = ($end - $startTime).ToString('hh\:mm\:ss')
            }
        }

        Write-Log -Message "" -Level Info
        Write-Log -Message "Retry failed update: $clusterDisplay" -Level Header

        try {
            # 1. Resolve cluster resource (id, tags, name).
            if ($PSCmdlet.ParameterSetName -eq 'ByResourceId') {
                $uri = "https://management.azure.com$ClusterResourceId`?api-version=$ApiVersion"
                $clusterInfo = (Invoke-AzRestJson -Uri $uri).Data
                # NOTE(review): relies on Invoke-AzRestJson leaving $LASTEXITCODE set by
                # its underlying native call - confirm against that helper.
                if ($LASTEXITCODE -ne 0) { $clusterInfo = $null }
            }
            else {
                if (-not $SubscriptionId) {
                    $SubscriptionId = (az account show --query id -o tsv)
                    Write-Log -Message "Using current subscription: $SubscriptionId" -Level Info
                }
                $clusterInfo = Get-AzLocalClusterInfo -ClusterName $ClusterName `
                    -ResourceGroupName $ResourceGroupName `
                    -SubscriptionId $SubscriptionId `
                    -ApiVersion $ApiVersion
            }

            if (-not $clusterInfo -or -not $clusterInfo.id) {
                Write-Log -Message "Cluster '$clusterDisplay' not found." -Level Warning
                return (& $newResult $clusterDisplay 'NotFound' 'Cluster not found' $null)
            }

            # Prefer the name reported by ARM; fall back to the display name.
            $resolvedName = if ($clusterInfo.PSObject.Properties['name'] -and $clusterInfo.name) {
                [string]$clusterInfo.name
            }
            else {
                $clusterDisplay
            }
            Write-Log -Message "Found cluster: $($clusterInfo.id)" -Level Success

            # 2. Update summary state (empty string when the summary has no state).
            $summary = Get-AzLocalUpdateSummary -ClusterResourceId $clusterInfo.id -ApiVersion $ApiVersion
            $state = if ($summary -and $summary.properties -and $summary.properties.PSObject.Properties['state']) {
                [string]$summary.properties.state
            }
            else {
                ''
            }
            Write-Log -Message "Update summary state: $state" -Level Info

            # 3. FAILED-state gate.
            if ($state -notin $failedStates) {
                $reason = if ($state -match 'InProgress') {
                    "Cluster state is '$state' - an in-flight (or stalled/orphaned) run cannot be retried; clear the run first"
                }
                else {
                    "Cluster state is '$state' - no failed update to retry"
                }
                Write-Log -Message $reason -Level Warning
                return (& $newResult $resolvedName 'Skipped' $reason $null)
            }

            # 4. Determine the failed update name to retry.
            $targetUpdateName = $null
            if ($UpdateName) {
                $targetUpdateName = $UpdateName
                Write-Log -Message "Using caller-supplied update name: $targetUpdateName" -Level Info
            }
            else {
                # Auto-detect: only the most recently started run is considered, and
                # only when that run itself ended in a failed state.
                $allRuns = Get-AzLocalClusterUpdateRuns -resourceId $clusterInfo.id -apiVer $ApiVersion
                $latestRun = $allRuns |
                    Sort-Object { $_.properties.timeStarted } -Descending |
                    Select-Object -First 1
                if ($latestRun) {
                    $formattedRun = Format-AzLocalUpdateRun -run $latestRun -clusterResourceId $clusterInfo.id
                    if ($formattedRun.State -in $failedRunStates -and $formattedRun.UpdateName) {
                        $targetUpdateName = $formattedRun.UpdateName
                        Write-Log -Message "Auto-detected failed update from latest run: $targetUpdateName (run state: $($formattedRun.State))" -Level Info
                    }
                    else {
                        Write-Log -Message "Latest run state '$($formattedRun.State)' did not confirm a failed update name." -Level Warning
                    }
                }
            }

            if (-not $targetUpdateName) {
                $msg = 'Could not determine a failed update to retry; re-run with -UpdateName'
                Write-Log -Message $msg -Level Warning
                return (& $newResult $resolvedName 'Skipped' $msg $null)
            }

            # 5. One-time guard via the dedicated UpdateRetryAttempted tag.
            #    ANY recorded retry attempt (RetryStarted OR RetryFailed) for the
            #    same update version blocks a second automatic retry; -Force overrides.
            $retryGuardRaw = Get-TagValue -Tags $clusterInfo.tags -Name $script:UpdateRetryAttemptedTagName
            $parsedGuard = ConvertFrom-AzLocalUpdateLastAttemptTagValue -Value $retryGuardRaw
            if (-not $Force -and $parsedGuard `
                    -and -not [string]::IsNullOrWhiteSpace($parsedGuard.UpdateName) `
                    -and $parsedGuard.UpdateName.Trim() -ieq $targetUpdateName.Trim()) {

                # Format the recorded timestamp defensively: a guard tag whose timestamp
                # is missing or not a date must still yield a clean 'RetryAlreadyAttempted'
                # skip rather than throwing into the catch block and reporting 'Failed'.
                $recordedValue = $parsedGuard.AttemptUtc
                $recordedAt = if ($recordedValue -is [datetime] -or $recordedValue -is [datetimeoffset]) {
                    $recordedValue.ToString('yyyy-MM-ddTHH:mm:ssZ')
                }
                elseif ($null -ne $recordedValue -and -not [string]::IsNullOrWhiteSpace([string]$recordedValue)) {
                    [string]$recordedValue
                }
                else {
                    'unknown time'
                }

                $msg = "Already retried update '$targetUpdateName' once (recorded $recordedAt, outcome '$($parsedGuard.Outcome)'); use -Force to retry again"
                Write-Log -Message $msg -Level Warning
                return (& $newResult $resolvedName 'RetryAlreadyAttempted' $msg $targetUpdateName)
            }

            # 6. Apply (ShouldProcess + High confirmation unless -Force).
            #    ShouldProcess returns $false both for -WhatIf and when the user declines
            #    the confirmation prompt; both cases are reported with status 'WhatIf'.
            if (-not $PSCmdlet.ShouldProcess($resolvedName, "Retry failed update '$targetUpdateName'")) {
                return (& $newResult $resolvedName 'WhatIf' "Would retry update '$targetUpdateName'" $targetUpdateName)
            }

            Write-Log -Message "Retrying update '$targetUpdateName' on cluster '$resolvedName'..." -Level Info
            $applyResult = Invoke-AzLocalUpdateApply -ClusterResourceId $clusterInfo.id `
                -UpdateName $targetUpdateName `
                -ApiVersion $ApiVersion

            # Arguments shared by every tag write below (audit tag and guard tag alike).
            $tagArgs = @{
                ClusterResourceId = $clusterInfo.id
                ClusterName       = $resolvedName
                AttemptUtc        = $startTime.ToUniversalTime()
                UpdateName        = $targetUpdateName
                ApiVersion        = $ApiVersion
            }

            if ($applyResult) {
                Write-Log -Message "Retry initiated successfully." -Level Success
                Write-Log -Message "Monitor progress using: Get-AzLocalUpdateRuns -ClusterName '$resolvedName'" -Level Info

                # Generic last-attempt audit pointer (default tag name).
                Write-AzLocalUpdateLastAttemptTag @tagArgs `
                    -Outcome $retryOutcome `
                    -Reason 'One-time retry of failed update initiated'

                # Dedicated durable one-time guard.
                Write-AzLocalUpdateLastAttemptTag @tagArgs `
                    -Outcome $retryGuardStartedOutcome `
                    -Reason 'One-time retry of failed update initiated' `
                    -TagName $script:UpdateRetryAttemptedTagName

                return (& $newResult $resolvedName 'RetryStarted' 'Retry initiated successfully' $targetUpdateName)
            }
            else {
                Write-Log -Message "Retry apply call was rejected for cluster '$resolvedName'." -Level Error

                # Generic last-attempt audit pointer (default tag name).
                Write-AzLocalUpdateLastAttemptTag @tagArgs `
                    -Outcome 'Failed' `
                    -Reason 'Retry apply call was rejected by ARM'

                # Record the attempt in the guard tag too so a rejected call still
                # consumes the one-time retry (use -Force to try again).
                Write-AzLocalUpdateLastAttemptTag @tagArgs `
                    -Outcome $retryGuardFailedOutcome `
                    -Reason 'Retry apply call was rejected by ARM' `
                    -TagName $script:UpdateRetryAttemptedTagName

                return (& $newResult $resolvedName 'Failed' 'Retry apply call was rejected by ARM' $targetUpdateName)
            }
        }
        catch {
            # Any exception from the steps above is reported as a 'Failed' row rather
            # than terminating the pipeline, so remaining clusters are still processed.
            Write-Log -Message "Failed to retry update on '$clusterDisplay': $($_.Exception.Message)" -Level Error
            return (& $newResult $clusterDisplay 'Failed' $_.Exception.Message $null)
        }
    }
}