Common/Get-RedactionRules.ps1

<#
.SYNOPSIS
    Deterministic PII redaction rules for the sanitized evidence package (D4 #788).
.DESCRIPTION
    Pure function module. Provides Invoke-RedactionRules for stripping
    user-identifiable information from arbitrary text content while preserving
    join keys via SHA-256-truncated tokens.

    Replacements use stable hashes: the same UPN always produces the same
    <user-xxxxxxxx> token across all artifacts in the package. This lets an
    auditor still see correlations ("user-a3f81b29 fails MFA on CA-001 and has
    admin role on ROLE-001") without ever seeing the underlying UPN.

    Categories redacted:
      - UPNs / email addresses -> <user-{hash}>
      - IPv4 / IPv6 addresses -> <ip-{hash}>
      - Application/Tenant GUIDs -> <guid-{hash}> (typed token; the 8-4-4-4-12 layout is not kept)

    Tenant display name is redacted via -TenantDisplayName param when the
    caller knows it; we don't try to discover it from text alone since
    "Contoso" inside a control description shouldn't be touched.
.NOTES
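    The hash is SHA-256(lowercased value) truncated to 8 hex chars. 8 chars *
    4 bits = 32 bits; for the typical tenant size (<10k principals) the chance
    that a given value collides with any other is < 10^-5 (the chance of at
    least one collision anywhere in the set is roughly 1%), which is good
    enough for join keys. The hash is unsalted, so a token does not display
    the underlying value but can be confirmed by anyone able to guess it.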
#>


function Get-RedactionToken {
    <#
    .SYNOPSIS
        Builds the stable placeholder that stands in for one redacted value.
    .DESCRIPTION
        The value is lowercased (invariant culture), encoded as UTF-8 and
        hashed with SHA-256; the first four digest bytes are rendered as
        eight lowercase hex characters. Identical input (ignoring case)
        therefore always yields the identical token.
    .PARAMETER Value
        The plaintext value to redact. An empty value yields '<{prefix}-empty>'.
    .PARAMETER Prefix
        Token prefix (e.g. 'user', 'ip', 'guid').
    .OUTPUTS
        String of the form '<{prefix}-{8 hex chars}>'.
    #>

    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)]
        [AllowEmptyString()]
        [string]$Value,

        [Parameter(Mandatory)]
        [ValidateNotNullOrEmpty()]
        [string]$Prefix
    )

    if ([string]::IsNullOrEmpty($Value)) {
        # Nothing to hash: use a fixed marker instead of a digest.
        $suffix = 'empty'
    }
    else {
        $hasher = [System.Security.Cryptography.SHA256]::Create()
        try {
            # Lowercasing first makes the token case-insensitive with
            # respect to the input.
            $normalized = $Value.ToLowerInvariant()
            $digest     = $hasher.ComputeHash([System.Text.Encoding]::UTF8.GetBytes($normalized))

            # First 4 bytes -> 8 hex characters.
            $hexPairs = foreach ($byte in $digest[0..3]) { $byte.ToString('x2') }
            $suffix   = $hexPairs -join ''
        }
        finally {
            $hasher.Dispose()
        }
    }

    return ('<{0}-{1}>' -f $Prefix, $suffix)
}

function Invoke-RedactionRules {
    <#
    .SYNOPSIS
        Applies the full PII redaction ruleset to a string of text.
    .DESCRIPTION
        Passes run in a fixed order: email/UPN, tenant display name, IPv4,
        IPv6, GUID. Every match is replaced by the token Get-RedactionToken
        returns for it, except the tenant display name, which is replaced by
        the literal <tenant>.
    .PARAMETER Text
        Input text. Returned unchanged if empty or null.
    .PARAMETER TenantDisplayName
        Optional. When provided, all case-insensitive occurrences of the
        tenant display name are replaced with <tenant>.
    .OUTPUTS
        Redacted string.
    #>

    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)]
        [AllowEmptyString()]
        [AllowNull()]
        [string]$Text,

        [Parameter()]
        [string]$TenantDisplayName
    )
    if ([string]::IsNullOrEmpty($Text)) { return $Text }
    $result = $Text

    # Email / UPN pass FIRST. Running tenant-name first would eat the domain
    # portion of any email containing the tenant name (admin@contoso.com ->
    # admin@<tenant>.com), leaving the address half-redacted and undetectable
    # by later regexes. Replacing the whole address with <user-{hash}> first
    # neutralises that risk.
    $emailPattern = '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
    $result = [regex]::Replace($result, $emailPattern, {
        param($m) Get-RedactionToken -Value $m.Value -Prefix 'user'
    })

    # Tenant display name pass -- runs after email so only bare mentions in
    # narrative text are caught. Case-insensitive. The name is regex-escaped
    # so it is matched literally.
    if (-not [string]::IsNullOrWhiteSpace($TenantDisplayName)) {
        $escaped = [regex]::Escape($TenantDisplayName)
        $result = [regex]::Replace($result, $escaped, '<tenant>', [System.Text.RegularExpressions.RegexOptions]::IgnoreCase)
    }

    # IPv4 (4 octets 0-255). The \b anchors only stop a match from starting
    # or ending in the middle of a longer word/digit run (1234.1.1.1,
    # v1.2.3.4). A free-standing dotted quad -- including a four-part version
    # number -- still matches and is redacted.
    $ipv4Pattern = '\b(?:(?:25[0-5]|2[0-4]\d|1\d\d|\d{1,2})\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|\d{1,2})\b'
    $result = [regex]::Replace($result, $ipv4Pattern, {
        param($m) Get-RedactionToken -Value $m.Value -Prefix 'ip'
    })

    # IPv6: full form (8 colon-separated segments) OR compact form (any
    # segments + :: + any segments). Loose -- catches common shapes without
    # enforcing full RFC 4291 validity.
    $ipv6Full    = '(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}'
    $ipv6Compact = '(?:[0-9a-fA-F]{1,4}(?::[0-9a-fA-F]{1,4})*)?::(?:[0-9a-fA-F]{1,4}(?::[0-9a-fA-F]{1,4})*)?'

    # Boundary guards: without them the compact form matches inside ordinary
    # text that merely contains '::' ("Namespace::Add" -> "e::Add",
    # "[regex]::Escape" -> "::E") and replaces a fragment of it with an
    # <ip-...> token. A match may not be preceded by a word character, ':'
    # or ']' (static-member syntax such as [Type]::Member), nor followed by
    # a word character or ':'. Bracketed literals like [::1]:443 still match
    # because the address is preceded by '[' and followed by ']'.
    $ipv6Pattern = '(?<![\w:\]])(?:' + $ipv6Full + '|' + $ipv6Compact + ')(?![\w:])'
    $result = [regex]::Replace($result, $ipv6Pattern, {
        param($m)
        # Skip false positives: pure '::', single-token, or no hex digits.
        $v = $m.Value
        if ($v -eq '::' -or $v.Length -lt 3) { return $v }
        if ($v -notmatch '[0-9a-fA-F]')      { return $v }
        Get-RedactionToken -Value $v -Prefix 'ip'
    })

    # GUIDs (8-4-4-4-12). The replacement token does not keep the GUID
    # layout; its 'guid' prefix is what tells consumers the original value
    # was a GUID. Token is shorter than a real GUID, so it's visually
    # distinct.
    $guidPattern = '\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b'
    $result = [regex]::Replace($result, $guidPattern, {
        param($m) Get-RedactionToken -Value $m.Value -Prefix 'guid'
    })

    return $result
}