Functions/Get-IBHFileEncoding.ps1
<#
    .SYNOPSIS
        Guess the encoding of the specified file.

    .DESCRIPTION
        The encoding is determined in three steps:
        1. The first 5 lines are checked for non-printable characters. If any
           are found, the file is treated as binary and an error is thrown.
        2. The first 4 bytes are compared against the known byte order marks
           (BOM). UTF-8, UTF-16 and UTF-32 (both byte orders) are returned as
           encoding objects. UTF-7, UTF-1, UTF-EBCDIC, SCSU, BOCU-1 and
           GB-18030 are recognized but not supported, an error is thrown.
        3. If no BOM was found, the whole file content is inspected: if any
           byte has a value greater than 127, UTF-8 is assumed, else ASCII.

    .OUTPUTS
        System.Text.Encoding. Encoding of the file.

    .EXAMPLE
        PS C:\> Get-IBHFileEncoding -Path 'C:\Temp\demo.txt'
        Guess the encoding of the demo.txt file.

    .LINK
        https://github.com/claudiospizzi/InvokeBuildHelper
#>
function Get-IBHFileEncoding
{
    [CmdletBinding()]
    [OutputType([System.Text.Encoding])]
    param
    (
        # Path to the file.
        [Parameter(Mandatory = $true)]
        [System.String]
        $Path
    )

    # The parameter to read a file as bytes differs between Windows PowerShell
    # (-Encoding Byte) and PowerShell 6+ (-AsByteStream).
    if ($PSVersionTable.PSVersion.Major -lt 6)
    {
        $byteStreamParam = @{ Encoding = 'Byte' }
    }
    else
    {
        $byteStreamParam = @{ AsByteStream = $true }
    }

    # Read the first 4 bytes of the file. Stop on errors, so that a missing or
    # unreadable file is not silently reported as an ASCII file. The array
    # sub-expression ensures an empty file results in an empty array.
    [System.Byte[]] $bytes = @(Get-Content -Path $Path -TotalCount 4 -ErrorAction 'Stop' @byteStreamParam)

    # Binary
    # Read the first 5 lines of the file and check them for non printable
    # characters. If we find any, it's a binary file.
    $nonPrintable = [System.Char[]] (0..8 + 10..31 + 127 + 129 + 141 + 143 + 144 + 157)
    $affectedLineCount =
        Get-Content -Path $Path -TotalCount 5 |
            Where-Object { $_.IndexOfAny($nonPrintable) -ne -1 } |
                Measure-Object |
                    Select-Object -ExpandProperty 'Count'
    if ($affectedLineCount -gt 0)
    {
        throw 'Binary files have no encoding!'
    }

    # UTF8 (EF BB BF)
    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0xef -and $bytes[1] -eq 0xbb -and $bytes[2] -eq 0xbf)
    {
        return [System.Text.Encoding]::UTF8
    }

    # The UTF32 checks must run before the UTF16 checks, because the UTF32
    # Little-Endian BOM (FF FE 00 00) starts with the UTF16 Little-Endian BOM
    # (FF FE) and would be mistaken for UTF16 otherwise.

    # UTF32 Big-Endian (00 00 FE FF)
    if ($bytes.Length -ge 4 -and $bytes[0] -eq 0x00 -and $bytes[1] -eq 0x00 -and $bytes[2] -eq 0xfe -and $bytes[3] -eq 0xff)
    {
        # [System.Text.Encoding]::UTF32 is little-endian, so the big-endian
        # variant (with BOM) has to be created explicitly.
        return (New-Object -TypeName 'System.Text.UTF32Encoding' -ArgumentList $true, $true)
    }

    # UTF32 Little-Endian (FF FE 00 00)
    if ($bytes.Length -ge 4 -and $bytes[0] -eq 0xff -and $bytes[1] -eq 0xfe -and $bytes[2] -eq 0x00 -and $bytes[3] -eq 0x00)
    {
        return [System.Text.Encoding]::UTF32
    }

    # UTF16 Big-Endian (FE FF)
    if ($bytes.Length -ge 2 -and $bytes[0] -eq 0xfe -and $bytes[1] -eq 0xff)
    {
        return [System.Text.Encoding]::BigEndianUnicode
    }

    # UTF16 Little-Endian (FF FE)
    if ($bytes.Length -ge 2 -and $bytes[0] -eq 0xff -and $bytes[1] -eq 0xfe)
    {
        return [System.Text.Encoding]::Unicode
    }

    # UTF7 (2B 2F 76 38|39|2B|2F)
    if ($bytes.Length -ge 4 -and $bytes[0] -eq 0x2b -and $bytes[1] -eq 0x2f -and $bytes[2] -eq 0x76 -and ($bytes[3] -eq 0x38 -or $bytes[3] -eq 0x39 -or $bytes[3] -eq 0x2b -or $bytes[3] -eq 0x2f))
    {
        throw 'UTF7 is not a supported encoding!'
    }

    # UTF-1 (F7 64 4C)
    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0xf7 -and $bytes[1] -eq 0x64 -and $bytes[2] -eq 0x4c)
    {
        throw 'UTF-1 is not a supported encoding!'
    }

    # UTF-EBCDIC (DD 73 66 73)
    if ($bytes.Length -ge 4 -and $bytes[0] -eq 0xdd -and $bytes[1] -eq 0x73 -and $bytes[2] -eq 0x66 -and $bytes[3] -eq 0x73)
    {
        throw 'UTF-EBCDIC is not a supported encoding!'
    }

    # SCSU (0E FE FF)
    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0x0e -and $bytes[1] -eq 0xfe -and $bytes[2] -eq 0xff)
    {
        throw 'SCSU is not a supported encoding!'
    }

    # BOCU-1 (FB EE 28)
    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0xfb -and $bytes[1] -eq 0xee -and $bytes[2] -eq 0x28)
    {
        throw 'BOCU-1 is not a supported encoding!'
    }

    # GB-18030 (84 31 95 33)
    if ($bytes.Length -ge 4 -and $bytes[0] -eq 0x84 -and $bytes[1] -eq 0x31 -and $bytes[2] -eq 0x95 -and $bytes[3] -eq 0x33)
    {
        throw 'GB-18030 is not a supported encoding!'
    }

    # If the function will reach this point, the encoding was NOT found by
    # parsing the BOM header. Starting from here, we are guessing based on the
    # file content.

    # We are checking, if any byte has a value greater than 127, this indicates
    # it's a UTF8 encoded file. The bytes must be compared numerically: a regex
    # match against a byte array tests the decimal string of each byte (e.g.
    # "195"), which always looks like ASCII. The whole file is inspected, as
    # the first non-ASCII byte can be located anywhere in the file.
    $content = Get-Content -Path $Path -Raw @byteStreamParam
    foreach ($byte in $content)
    {
        if ($byte -gt 0x7f)
        {
            return [System.Text.Encoding]::UTF8
        }
    }

    return [System.Text.Encoding]::ASCII
}