Private/Encoding/Get-FileEncoding.ps1
|
function Get-FileEncoding {
    <#
    .SYNOPSIS
        Detects the encoding of a file by inspecting BOM bytes and content heuristics.

    .DESCRIPTION
        Reads the whole file into memory and classifies it in this order:
          1. Byte-order mark: UTF-32LE, UTF-32BE, UTF-8, UTF-16LE, UTF-16BE.
          2. No BOM, but the bytes form strictly valid UTF-8 (this includes
             empty files and pure ASCII) -> 'UTF-8'.
          3. Otherwise Windows-1252, or ISO-8859-1 when code page 1252 cannot
             be obtained from the runtime.

    .PARAMETER Path
        Path of the file to inspect. Relative paths are resolved against the
        current PowerShell location.

    .OUTPUTS
        A hashtable with keys: Name (string), HasBom (bool), Encoding (System.Text.Encoding)
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)]
        [string] $Path
    )

    # .NET file APIs resolve relative paths against the *process* working
    # directory, which is not updated by Set-Location. Resolve through the
    # PowerShell provider first. This call does not require the file to exist,
    # so a missing file is still reported by ReadAllBytes.
    $fullPath = $PSCmdlet.GetUnresolvedProviderPathFromPSPath($Path)
    $bytes = [System.IO.File]::ReadAllBytes($fullPath)

    # BOM signatures. The 4-byte UTF-32LE mark (FF FE 00 00) begins with the
    # 2-byte UTF-16LE mark (FF FE), so the UTF-32 entries must be tested first.
    $signatures = @(
        @{ Name = 'UTF-32LE';  Bom = [byte[]](0xFF, 0xFE, 0x00, 0x00); Encoding = [System.Text.Encoding]::UTF32 }
        @{ Name = 'UTF-32BE';  Bom = [byte[]](0x00, 0x00, 0xFE, 0xFF); Encoding = (New-Object System.Text.UTF32Encoding -ArgumentList $true, $true) }
        @{ Name = 'UTF-8-BOM'; Bom = [byte[]](0xEF, 0xBB, 0xBF);       Encoding = [System.Text.Encoding]::UTF8 }
        @{ Name = 'UTF-16LE';  Bom = [byte[]](0xFF, 0xFE);             Encoding = [System.Text.Encoding]::Unicode }
        @{ Name = 'UTF-16BE';  Bom = [byte[]](0xFE, 0xFF);             Encoding = [System.Text.Encoding]::BigEndianUnicode }
    )

    foreach ($signature in $signatures) {
        $bom = $signature.Bom
        if ($bytes.Length -lt $bom.Length) {
            continue
        }

        $isMatch = $true
        for ($i = 0; $i -lt $bom.Length; $i++) {
            if ($bytes[$i] -ne $bom[$i]) {
                $isMatch = $false
                break
            }
        }

        if ($isMatch) {
            return @{ Name = $signature.Name; HasBom = $true; Encoding = $signature.Encoding }
        }
    }

    # No BOM: validate the bytes with a strict UTF-8 decoder. A lenient decode
    # followed by a search for U+FFFD would misclassify valid UTF-8 files that
    # genuinely contain the replacement character (bytes EF BF BD).
    $strictUtf8 = New-Object System.Text.UTF8Encoding -ArgumentList $false, $true
    $isValidUtf8 = $true
    try {
        # GetCharCount validates the input without materialising the string.
        [void] $strictUtf8.GetCharCount($bytes)
    }
    catch [System.Text.DecoderFallbackException] {
        $isValidUtf8 = $false
    }

    if ($isValidUtf8) {
        return @{ Name = 'UTF-8'; HasBom = $false; Encoding = [System.Text.Encoding]::UTF8 }
    }

    # Fallback: Windows-1252 (most common legacy Western subtitle encoding)
    try {
        $win1252 = [System.Text.Encoding]::GetEncoding(1252)
        return @{ Name = 'Windows-1252'; HasBom = $false; Encoding = $win1252 }
    }
    catch {
        # If Windows-1252 is unavailable in this runtime, fall back to Latin-1
        $latin1 = [System.Text.Encoding]::GetEncoding('iso-8859-1')
        return @{ Name = 'ISO-8859-1'; HasBom = $false; Encoding = $latin1 }
    }
}