Public/Invoke-VMMetricsCollection.ps1

function Invoke-VMMetricsCollection {
    <#
    .SYNOPSIS
        Discovers VMs in a subscription, pulls their platform metrics, and exports
        VMPerformance-compatible rows for FinOps rightsizing.

    .DESCRIPTION
        End-to-end orchestrator:
          1. For each subscription (or the current context): inventories VMs via Azure
             Resource Graph (falls back to Get-AzVM) and excludes managed / ephemeral /
             short-running compute (VM Scale Set members, Spot, Databricks/AKS managed RGs,
             tagged or pattern-matched VMs) using cheap control-plane signals -- no metric calls.
          2. For each remaining VM pulls 'Percentage CPU' + 'Available Memory Bytes' and
             aggregates to monthly Min/Median/P95/P99/Max rows.
          3. Writes ONE combined CSV (+ manifest) across all subscriptions and returns a
             run report (including what was excluded and why).
          4. Optionally uploads the CSV (+ manifest) to blob storage.

        Platform metrics retain only ~93 days, so -WindowDays is limited to 1-93 by
        parameter validation (larger values are rejected). For longer history, run this on
        a schedule so rows accumulate, or stand up a diagnostic-setting export to Log
        Analytics (see README).

        The caller's original subscription context is restored when the sweep ends, even
        if the sweep is interrupted or fails part-way through.

    .PARAMETER SubscriptionId
        One or more subscriptions to operate in. If omitted, the current Az context is used.
        All subscriptions are aggregated into a single output file. Pass
        (Get-AzSubscription).Id to sweep everything visible to the signed-in identity.

    .PARAMETER ResourceGroupName
        Optional resource-group filter.

    .PARAMETER VMName
        Optional VM-name filter (one or more exact names).

    .PARAMETER WindowDays
        Look-back window in days, 1-93, default 90 (93 is the platform retention limit;
        larger values fail parameter validation).

    .PARAMETER TimeGrain
        Aggregation grain, default 00:05:00 (PT5M). A coarser grain (e.g. 01:00:00) is
        lighter and faster for very large fleets with negligible impact on monthly percentiles.

    .PARAMETER ThrottleLimit
        How many VMs to pull concurrently (default 12, max 50). Metric pulls are network-bound,
        so parallelism is the main speed lever. Lower it if you hit throttling (HTTP 429).

    .PARAMETER IncludeScaleSetMember
        Keep VMs that belong to a VM Scale Set (Flex). Default: excluded (ephemeral/managed).

    .PARAMETER IncludeSpot
        Keep Spot-priority VMs. Default: excluded (interruptible/ephemeral).

    .PARAMETER ExcludeResourceGroupPattern
        Extra resource-group regex(es) to exclude, added to the built-in defaults
        ('^databricks-rg-', '^MC_').

    .PARAMETER ExcludeNamePattern
        VM-name regex(es) to exclude.

    .PARAMETER ExcludeTag
        Tag rules to exclude, e.g. @{ Vendor = 'Databricks'; environment = '*' }. Value '*'
        matches any value for that key. Merged with the built-in default @{ Vendor = 'Databricks' };
        a caller-supplied rule for the same key replaces the default.

    .PARAMETER MinAgeDays
        Exclude VMs created within the last N days (too little history to size). Default 0 (off);
        only effective when Resource Graph is available (it provides creation time).

    .PARAMETER NoDefaultExclusions
        Turn off the built-in resource-group and tag defaults (Databricks/AKS/Vendor). The
        -IncludeSpot / -IncludeScaleSetMember switches still control those two categories.

    .PARAMETER OutputPath
        Directory to write the CSV + manifest into. Default: current directory.

    .PARAMETER Format
        'CSV' (default) or 'Parquet'.

    .PARAMETER StorageAccountName
        Optional. When set, the CSV (+ manifest) is uploaded to this storage account under
        {Customer}/{date}/ after the local write. Requires -Customer.

    .PARAMETER ContainerName
        Blob container for the upload (default 'vmperformance'); created if missing.

    .PARAMETER Customer
        Real customer name; first blob path segment. Required when -StorageAccountName is set.

    .PARAMETER StorageAccountKey
        Account key for the upload. If omitted, -StorageAccountResourceGroup is used to fetch it.

    .PARAMETER StorageAccountResourceGroup
        Resource group of the storage account, used to fetch the key when none is supplied.

    .PARAMETER SasToken
        Container-scoped SAS token for the upload. When given, key auth is bypassed -- use
        this for cross-tenant uploads from a customer's Cloud Shell.

    .EXAMPLE
        Invoke-VMMetricsCollection -WindowDays 30 -OutputPath ./out

        Collects 30 days of metrics for the VMs in the current Az context's subscription
        and writes VMPerformance-<date>.csv into ./out.

    .EXAMPLE
        Invoke-VMMetricsCollection -SubscriptionId (Get-AzSubscription).Id -TimeGrain 01:00:00 -ThrottleLimit 24

        Sweeps every visible subscription at an hourly grain with 24 concurrent pulls and
        aggregates everything into a single output file.

    .OUTPUTS
        [pscustomobject] run report: OutputFile, BlobUri, Subscriptions, VMsProcessed,
        VMsSkipped, VMsExcluded, ExcludedByReason, RowCount, Skipped (per-VM reasons), Window.

    .NOTES
        Requires PowerShell 7+ (uses ForEach-Object -Parallel).
        -WhatIf / -Confirm gate only the local export; metrics are still collected. When
        the export is declined nothing is written, so the optional upload is skipped too.
    #>

    [CmdletBinding(SupportsShouldProcess)]
    [OutputType([pscustomobject])]
    param(
        [string[]] $SubscriptionId,
        [string]   $ResourceGroupName,
        [string[]] $VMName,
        [ValidateRange(1, 93)] [int] $WindowDays = 90,
        [timespan] $TimeGrain = ([timespan]'00:05:00'),
        [ValidateRange(1, 50)] [int] $ThrottleLimit = 12,
        [string]   $OutputPath = (Get-Location).Path,
        [ValidateSet('CSV', 'Parquet')] [string] $Format = 'CSV',

        # --- Exclusion of managed / ephemeral / short-running compute ---
        [switch]    $IncludeScaleSetMember,
        [switch]    $IncludeSpot,
        [string[]]  $ExcludeResourceGroupPattern,
        [string[]]  $ExcludeNamePattern,
        [hashtable] $ExcludeTag,
        [int]       $MinAgeDays = 0,
        [switch]    $NoDefaultExclusions,

        # --- Optional blob upload (account-key or SAS auth) ---
        [string]   $StorageAccountName,
        [string]   $ContainerName = 'vmperformance',
        [string]   $Customer,
        [string]   $StorageAccountKey,
        [string]   $StorageAccountResourceGroup,
        [string]   $SasToken
    )

    # Fail fast, before any Azure call: the upload path is built from -Customer.
    if ($StorageAccountName -and -not $Customer) {
        throw "-Customer is required when -StorageAccountName is set (it is the first blob path segment)."
    }

    # Resolve the subscription list. Default to the current context's subscription.
    if ($SubscriptionId) {
        $subList = @($SubscriptionId)
    }
    else {
        $cur = Get-AzContext -ErrorAction Stop
        if (-not $cur) { throw "No Azure context. Run Connect-AzAccount first." }
        $subList = @($cur.Subscription.Id)
    }

    # One fixed UTC window for the whole run, so every VM and the manifest agree on it.
    $endTime   = (Get-Date).ToUniversalTime()
    $startTime = $endTime.AddDays(-$WindowDays)

    $allRows   = [System.Collections.Generic.List[object]]::new()
    $skipped   = [System.Collections.Generic.List[object]]::new()
    $processed = 0
    $subsDone  = [System.Collections.Generic.List[string]]::new()

    # Build the effective exclusion config (built-in defaults + caller additions).
    $rgPatterns = [System.Collections.Generic.List[string]]::new()
    $tagRules   = @{}
    if (-not $NoDefaultExclusions) {
        $rgPatterns.Add('^databricks-rg-')   # Azure Databricks managed RG
        $rgPatterns.Add('^MC_')              # AKS node managed RG
        $tagRules['Vendor'] = 'Databricks'
    }
    foreach ($p in $ExcludeResourceGroupPattern) { if ($p) { $rgPatterns.Add($p) } }
    # Caller tag rules are applied last, so they override a default with the same key.
    if ($ExcludeTag) { foreach ($k in $ExcludeTag.Keys) { $tagRules[$k] = $ExcludeTag[$k] } }
    $excludeArgs = @{
        IncludeScaleSetMember       = $IncludeScaleSetMember
        IncludeSpot                 = $IncludeSpot
        ExcludeResourceGroupPattern = $rgPatterns.ToArray()
        ExcludeNamePattern          = $ExcludeNamePattern
        ExcludeTag                  = $tagRules
        MinAgeDays                  = $MinAgeDays
        Now                         = (Get-Date).ToUniversalTime()
    }
    $excludedTotal   = 0
    $excludedReasons = @{}   # reason -> count

    # Remember the caller's context so a multi-subscription sweep doesn't leave them
    # parked on the last subscription.
    $originalSub = (Get-AzContext -ErrorAction SilentlyContinue).Subscription.Id

    $subTotal = $subList.Count
    $subIdx   = 0
    try {
        foreach ($sid in $subList) {
            $subIdx++
            Write-Progress -Id 0 -Activity "FinOpsVMMetrics: collecting VM metrics" `
                -Status "Subscription $subIdx/$subTotal ($sid) - $processed VM(s) so far" `
                -PercentComplete ([int](($subIdx - 1) / [math]::Max($subTotal, 1) * 100))

            try {
                Write-Verbose "Setting subscription context to $sid"
                Set-AzContext -Subscription $sid -ErrorAction Stop | Out-Null
            }
            catch {
                # An inaccessible subscription is recorded and skipped, not fatal to the sweep.
                Write-Warning "Cannot select subscription ${sid}: $($_.Exception.Message)"
                $skipped.Add([pscustomobject]@{ Subscription = $sid; VM = '(subscription)'; Reason = $_.Exception.Message })
                continue
            }
            $subsDone.Add($sid)

            # @() guarantees an array so .Count is reliable even for a single-VM result.
            $inventory = @(Get-VMInventory -SubscriptionId $sid -ResourceGroupName $ResourceGroupName -VMName $VMName)
            if ($inventory.Count -eq 0) {
                Write-Verbose "No VMs in subscription $sid for the given filters."
                continue
            }

            # Exclude managed / ephemeral / short-running compute before any metric pull.
            $vmKeep = [System.Collections.Generic.List[object]]::new()
            foreach ($rec in $inventory) {
                # A non-empty return value is treated as the exclusion reason.
                $reason = Test-VMExcluded -VM $rec @excludeArgs
                if ($reason) {
                    $excludedTotal++
                    if ($excludedReasons.ContainsKey($reason)) { $excludedReasons[$reason]++ }
                    else { $excludedReasons[$reason] = 1 }
                    Write-Verbose "Excluded $($rec.Name): $reason"
                }
                else {
                    $vmKeep.Add($rec)
                }
            }
            if ($vmKeep.Count -eq 0) {
                Write-Verbose "Subscription ${sid}: all $($inventory.Count) VM(s) excluded."
                continue
            }
            Write-Verbose "Subscription ${sid}: $($vmKeep.Count) VM(s) to pull ($($inventory.Count - $vmKeep.Count) excluded)."

            $vmAll   = $vmKeep.ToArray()
            $vmTotal = $vmAll.Count

            # Pre-resolve SKU -> total RAM (MB) serially (cached). The parallel runspaces have no
            # Az.Compute loaded, so they read RAM from this map instead of calling Azure.
            $ramMap = @{}
            foreach ($vm in $vmAll) {
                $key = "$($vm.Location)|$($vm.SkuName)"
                if (-not $ramMap.ContainsKey($key)) {
                    try { $ramMap[$key] = Resolve-VMSkuRam -SkuName $vm.SkuName -Location $vm.Location }
                    # Best effort: an unresolved SKU is cached as $null and TotalRamMB is then
                    # simply not passed to the row conversion for those VMs.
                    catch { $ramMap[$key] = $null }
                }
            }

            # Per-subscription context for concurrency-safe auth (passed as -DefaultProfile).
            $azCtx = Get-AzContext

            # Inject the self-contained functions the parallel block needs, so the runspaces
            # don't have to import the whole module (slow + fragile). They are passed as source
            # strings (a $using value can't be a scriptblock) and rebuilt inside each runspace.
            $fnSeries  = ${function:Get-VMMetricSeries}.ToString()
            $fnConvert = ${function:ConvertTo-VMPerformanceRow}.ToString()
            $fnPctile  = ${function:Measure-Percentile}.ToString()

            # One parallel pass per subscription (runspaces are reused across all its VMs, so the
            # per-runspace Az auto-load is paid once). -AsJob lets us poll completion for a live
            # progress bar. Raw datapoints are released per VM inside each runspace; only the small
            # monthly rows come back, so memory stays bounded even for large fleets.
            $job = $vmAll | ForEach-Object -ThrottleLimit $ThrottleLimit -Parallel {
                ${function:Get-VMMetricSeries}         = [scriptblock]::Create($using:fnSeries)
                ${function:ConvertTo-VMPerformanceRow} = [scriptblock]::Create($using:fnConvert)
                ${function:Measure-Percentile}         = [scriptblock]::Create($using:fnPctile)
                $ctx   = $using:azCtx
                $rm    = $using:ramMap
                $vm    = $_
                $ramMB = $rm["$($vm.Location)|$($vm.SkuName)"]
                try {
                    $series = Get-VMMetricSeries -ResourceId $vm.Id `
                        -MetricName 'Percentage CPU', 'Available Memory Bytes' `
                        -StartTime $using:startTime -EndTime $using:endTime -TimeGrain $using:TimeGrain -AzContext $ctx
                    $convertArgs = @{
                        VMName     = $vm.Name
                        CpuPoint   = @($series['Percentage CPU'])
                        MemPoint   = @($series['Available Memory Bytes'])
                        ResourceId = $vm.Id
                        SkuName    = $vm.SkuName
                    }
                    if ($null -ne $ramMB) { $convertArgs['TotalRamMB'] = $ramMB }
                    [pscustomobject]@{ VM = $vm.Name; Rows = (ConvertTo-VMPerformanceRow @convertArgs); Reason = $null }
                }
                catch {
                    # Report the failure as data so one bad VM does not fail the whole job.
                    [pscustomobject]@{ VM = $vm.Name; Rows = $null; Reason = $_.Exception.Message }
                }
            } -AsJob

            try {
                while ($job.State -eq 'Running') {
                    $done = @($job.ChildJobs | Where-Object { $_.State -in 'Completed', 'Failed', 'Stopped' }).Count
                    Write-Progress -Id 1 -ParentId 0 `
                        -Activity "Pulling platform metrics (sub $subIdx/$subTotal, ${ThrottleLimit}x parallel)" `
                        -Status "VM $done/$vmTotal" `
                        -PercentComplete ([int]($done / [math]::Max($vmTotal, 1) * 100))
                    Start-Sleep -Seconds 1
                }
                # -Wait blocks until the job has really finished, so results are never read
                # early if the polling loop exited on a state other than 'Running'.
                $results = Receive-Job -Job $job -Wait
            }
            finally {
                # Always dispose of the job (also on Ctrl+C or an error). It is an internal
                # resource, so it must not be subject to the caller's -WhatIf / -Confirm.
                Remove-Job -Job $job -Force -WhatIf:$false -Confirm:$false
                Write-Progress -Id 1 -ParentId 0 -Activity "Pulling platform metrics" -Completed
            }

            foreach ($res in $results) {
                if ($res.Rows -and @($res.Rows).Count -gt 0) {
                    foreach ($r in $res.Rows) { $allRows.Add($r) }
                    $processed++
                }
                elseif ($res.Reason) {
                    Write-Warning "Failed for $($res.VM): $($res.Reason)"
                    $skipped.Add([pscustomobject]@{ Subscription = $sid; VM = $res.VM; Reason = $res.Reason })
                }
                else {
                    # No rows and no error: the pull succeeded but returned nothing to aggregate.
                    $skipped.Add([pscustomobject]@{ Subscription = $sid; VM = $res.VM; Reason = 'No metric data in window' })
                }
            }
        }
    }
    finally {
        Write-Progress -Id 0 -Activity "FinOpsVMMetrics: collecting VM metrics" -Completed

        # Restore the caller's original subscription context after the sweep. Done in
        # 'finally' so a failure or interruption mid-sweep does not leave the session
        # parked on whichever subscription was being processed.
        if ($originalSub -and $subsDone.Count -and $subsDone[-1] -ne $originalSub) {
            Set-AzContext -Subscription $originalSub -ErrorAction SilentlyContinue | Out-Null
        }
    }

    if ($excludedTotal -gt 0) {
        $summary = ($excludedReasons.GetEnumerator() | Sort-Object Value -Descending |
            ForEach-Object { "$($_.Value) $($_.Key)" }) -join ', '
        Write-Verbose "Excluded $excludedTotal VM(s): $summary"
    }

    # Nothing to export: return a report of the same shape, with no file and no upload.
    if ($allRows.Count -eq 0) {
        Write-Warning "No VM metric data collected across $($subList.Count) subscription(s)."
        return [pscustomobject]@{
            OutputFile = $null; BlobUri = $null; Subscriptions = $subsDone.ToArray()
            VMsProcessed = 0; VMsSkipped = $skipped.Count; VMsExcluded = $excludedTotal
            ExcludedByReason = $excludedReasons; RowCount = 0
            Skipped = $skipped.ToArray(); Window = @{ Start = $startTime; End = $endTime; Days = $WindowDays }
        }
    }

    # The file name is stamped with the window's end date (UTC).
    $stamp    = $endTime.ToString('yyyy-MM-dd')
    $fileName = "VMPerformance-$stamp.$($Format.ToLowerInvariant())"
    $dataPath = Join-Path $OutputPath $fileName

    $manifest = @{
        subscriptions  = $subsDone.ToArray()
        windowStartUtc = $startTime.ToString('o')
        windowEndUtc   = $endTime.ToString('o')
        windowDays     = $WindowDays
        timeGrain      = $TimeGrain.ToString()
        vmsProcessed   = $processed
        vmsSkipped     = $skipped.Count
        vmsExcluded    = $excludedTotal
        excludedByReason = $excludedReasons
    }
    if ($Customer) { $manifest['customer'] = $Customer }

    # $written stays $null when the export is declined (-WhatIf / -Confirm), which also
    # suppresses the upload below.
    $written = $null
    if ($PSCmdlet.ShouldProcess($dataPath, "Export $($allRows.Count) rows")) {
        $written = Export-VMPerformanceData -Row $allRows.ToArray() -Path $dataPath -Format $Format -Manifest $manifest
    }

    # Optional upload to blob storage.
    $blobUri = $null
    if ($StorageAccountName -and $written) {
        $uploaded = Publish-VMPerformanceData -Path $written -Customer $Customer `
            -StorageAccountName $StorageAccountName -ContainerName $ContainerName `
            -StorageAccountKey $StorageAccountKey -StorageAccountResourceGroup $StorageAccountResourceGroup `
            -SasToken $SasToken -Date $endTime.ToString('yyyy-MM-dd')
        $blobUri = $uploaded.BlobUri
    }

    [pscustomobject]@{
        OutputFile       = $written
        BlobUri          = $blobUri
        Subscriptions    = $subsDone.ToArray()
        VMsProcessed     = $processed
        VMsSkipped       = $skipped.Count
        VMsExcluded      = $excludedTotal
        ExcludedByReason = $excludedReasons
        RowCount         = $allRows.Count
        Skipped          = $skipped.ToArray()
        Window           = @{ Start = $startTime; End = $endTime; Days = $WindowDays }
    }
}