# unishell.psm1
# Module parameters (supplied via Import-Module -ArgumentList):
#   DataFilesDirectory      - directory handed to lib.ps1; defaults to this module's own directory.
#   DefaultDisplayEncodings - encodings restored by Get-UniCodepoint after a temporary -Encoding override.
#   AutoDownloadDataFiles   - not referenced in this file; presumably consumed by lib.ps1 (TODO confirm).
param(
    $DataFilesDirectory,
    $DefaultDisplayEncodings = @('utf-8', 'utf-16'),
    $AutoDownloadDataFiles
)

$scriptDir = Split-Path $psCommandPath
if (-not $dataFilesDirectory) {
    $dataFilesDirectory = $scriptDir
}

# Dot-source the implementation library into this module's scope.
# NOTE(review): loadStub, updateFormatting, getChar, expandString, getValue and $allEncodings
# are not defined in this file; presumably they come from lib.ps1 - confirm.
. $scriptDir/lib.ps1 -datafilesdirectory $dataFilesDirectory
if (-not $?) {
    # lib.ps1 reported failure; stop processing the module.
    exit
}

<#
.SYNOPSIS
Gets Unicode codepoint information from an input string or list of integer codepoints.

.DESCRIPTION
Gets Unicode codepoint information from an input string or list of integer codepoints.

Returned items each represent a single Unicode codepoint, and carry information about
various properties and binary encodings of the codepoint.  All properties are sourced
from public Unicode Consortium data files.

By default, codepoints are displayed in TABLE format and display only a few properties.
To see all available information about a codepoint, pipe output to Format-List to view
it in LIST format.

.PARAMETER InputString
Specifies the string that will be decomposed into its constituent codepoints.

.PARAMETER Codepoint
Specifies explicitly the integer codepoints that will be returned.

.PARAMETER Encoding
Specifies the subset of available encodings which will be displayed.
By default, utf-8 and utf-16 encodings are displayed.

Note that all encodings are always available on the returned items, even if they are
not displayed by default. Use "Format-List *" to force-display all encodings.

.EXAMPLE
# display codepoints of a simple Latin string
Get-UniCodepoint 'Dude'

    Dude

       Codepoint Name                   utf-8 utf-16 Value
       --------- ----                   ----- ------ -----
    ┌─ U+0044    LATIN CAPITAL LETTER D 44    44 00  D
    ├─ U+0075    LATIN SMALL LETTER U   75    75 00  u
    ├─ U+0064    LATIN SMALL LETTER D   64    64 00  d
    └─ U+0065    LATIN SMALL LETTER E   65    65 00  e

.EXAMPLE
# display codepoints of a more interesting string
'(͡° ͜ʖ ͡°)' | Get-UniCodepoint

    (͡° ͜ʖ ͡°)

       Codepoint Name                               utf-8 utf-16 Value
       --------- ----                               ----- ------ -----
    ┌┬ U+0028    LEFT PARENTHESIS                   28    28 00  (
    │└ U+0361    COMBINING DOUBLE INVERTED BREVE    CD A1 61 03  ͡
    ├─ U+00B0    DEGREE SIGN                        C2 B0 B0 00  °
    ├┬ U+0020    SPACE                              20    20 00
    │└ U+035C    COMBINING DOUBLE BREVE BELOW       CD 9C 5C 03  ͜
    ├─ U+0296    LATIN LETTER INVERTED GLOTTAL STOP CA 96 96 02  ʖ
    ├┬ U+0020    SPACE                              20    20 00
    │└ U+0361    COMBINING DOUBLE INVERTED BREVE    CD A1 61 03  ͡
    ├─ U+00B0    DEGREE SIGN                        C2 B0 B0 00  °
    └─ U+0029    RIGHT PARENTHESIS                  29    29 00  )

.EXAMPLE
# display codepoints based on explicit integer codepoint values
0x1f480..0x1f485 | Get-UniCodepoint

    Codepoint Name                    utf-8       utf-16      Value
    --------- ----                    -----       ------      -----
    U+1F480   SKULL                   F0 9F 92 80 3D D8 80 DC 💀
    U+1F481   INFORMATION DESK PERSON F0 9F 92 81 3D D8 81 DC 💁
    U+1F482   GUARDSMAN               F0 9F 92 82 3D D8 82 DC 💂
    U+1F483   DANCER                  F0 9F 92 83 3D D8 83 DC 💃
    U+1F484   LIPSTICK                F0 9F 92 84 3D D8 84 DC 💄
    U+1F485   NAIL POLISH             F0 9F 92 85 3D D8 85 DC 💅

.EXAMPLE
# display other encodings
'señor' | Get-UniCodepoint -Encoding iso-8859-1,utf-16BE,utf-8

    señor

       Codepoint Name                            iso-8859-1 utf-16BE utf-8 Value
       --------- ----                            ---------- -------- ----- -----
    ┌─ U+0073    LATIN SMALL LETTER S            73         00 73    73    s
    ├─ U+0065    LATIN SMALL LETTER E            65         00 65    65    e
    ├─ U+00F1    LATIN SMALL LETTER N WITH TILDE F1         00 F1    C3 B1 ñ
    ├─ U+006F    LATIN SMALL LETTER O            6F         00 6F    6F    o
    └─ U+0072    LATIN SMALL LETTER R            72         00 72    72    r

.EXAMPLE
# view full details of a codepoint by viewing in list format
0x0414 | Get-UniCodepoint | Format-List

    Value                     : Д
    Codepoint                 : U+0414
    Name                      : CYRILLIC CAPITAL LETTER DE
    Block                     : Cyrillic
    Plane                     : 0 - Basic Multilingual Plane
    UnicodeVersion            : 1.1
    Script                    : Cyrillic
    LineBreakClass            : AL - Alphabetic
    Category                  : Lu - Letter, Uppercase
    CanonicalCombiningClasses : 0 - Spacing, split, enclosing, reordrant, and Tibetan subjoined
    BidiCategory              : L - Left-to-Right
    DecompositionMapping      :
    DecimalDigitValue         :
    DigitValue                :
    NumericValue              :
    Mirrored                  : False
    UppercaseMapping          :
    LowercaseMapping          : U+0434
    TitlecaseMapping          :
    utf-8                     : D0 94
    utf-16                    : 14 04
#>
function Get-UniCodepoint {
    [CmdletBinding(DefaultParameterSetName = 'string')]
    param(
        [Parameter(Mandatory = $true, ParameterSetName = 'string', Position = 0, ValueFromPipeline = $true)]
        [string[]] $InputString,

        [Parameter(Mandatory = $true, ParameterSetName = 'codepoint', Position = 0, ValueFromPipeline = $true)]
        [int[]] $Codepoint,

        [string[]] $Encoding
    )

    begin {
        loadStub

        # Tracks whether the display formatting was overridden, so end{} can restore it.
        $changedFormatting = $false

        if ($encoding) {
            # Each -Encoding value is used as a wildcard pattern against the known WebNames.
            $displayEncodings = $encoding |
                ForEach-Object { $allEncodings.WebName -like $_ } |
                Select-Object -Unique

            if (-not $displayEncodings) {
                # Non-terminating: 'return' only leaves begin{}, so the input is still
                # processed and shown with the current formatting.
                Write-Error "The encoding '$encoding' does not match any available encoding"
                return
            }
            else {
                updateFormatting $displayEncodings
                $changedFormatting = $true
            }
        }
    }

    process {
        if ($psCmdlet.ParameterSetName -eq 'codepoint') {
            foreach ($c in $codepoint) {
                getChar $c
            }
        }
        elseif ($psCmdlet.ParameterSetName -eq 'string') {
            foreach ($s in $inputString) {
                expandString $s
            }
        }
    }

    end {
        # Put the module-level default display encodings back if they were overridden.
        if ($changedFormatting) {
            updateFormatting $script:defaultDisplayEncodings
        }
    }
}

<#
.SYNOPSIS
Gets the bytes associated with a binary encoding of the specified Unicode string or codepoints.

.DESCRIPTION
Gets the bytes associated with a binary encoding of the specified Unicode string or codepoints.

UTF-8 encoding is used by default, but any available encoding is supported.

.PARAMETER InputString
Specifies the string whose encoded bytes will be returned.

.PARAMETER Codepoint
Specifies a sequence of integer Unicode codepoints whose encoded bytes will be returned.
This parameter can be populated via raw integers or by piping output from Get-UniCodepoint.

.PARAMETER Encoding
Specifies the encoding to use when converting the input string or codepoints to bytes.
UTF-8 is used by default.

.EXAMPLE
# get the encoded bytes of a simple Latin string
Get-UniByte 'Sweet'

    83
    119
    101
    101
    116

.EXAMPLE
# get the UTF-16 bytes of the Mandarin word 筷子
'筷子' | Get-UniByte -Encoding utf-16

    119
    123
    80
    91

.EXAMPLE
# get the bytes of an integer codepoint
0x1F937 | Get-UniByte -Encoding utf-32

    55
    249
    1
    0
#>
function Get-UniByte {
    param(
        [Parameter(Mandatory = $true, Position = 0, ValueFromPipeline = $true, ParameterSetName = 'string')]
        [string[]] $InputString,

        [Parameter(Mandatory = $true, Position = 0, ValueFromPipeline = $true, ValueFromPipelineByPropertyName = $true, ParameterSetName = 'codepoint')]
        [int[]] $Codepoint,

        [string] $Encoding = 'utf-8'
    )

    begin {
        # Exact (case-insensitive) match on WebName; no wildcards here.
        $encodingImpl = $allEncodings | Where-Object WebName -eq $Encoding
        if (-not $encodingImpl) {
            Write-Error "The encoding '$encoding' does not match any available encoding"
            return
        }
    }

    process {
        # 'return' in begin{} does not stop the pipeline, so process{} still runs after
        # an unknown encoding was reported. Skip the input instead of calling a method
        # on $null and emitting a second, misleading error per item.
        if (-not $encodingImpl) {
            return
        }

        if ($PSCmdlet.ParameterSetName -eq 'string') {
            foreach ($s in $inputString) {
                $encodingImpl.GetBytes($s)
            }
        }
        elseif ($PSCmdlet.ParameterSetName -eq 'codepoint') {
            foreach ($c in $codepoint) {
                $value = getValue $c
                # $null on the left keeps this a scalar null test even if getValue
                # ever returns a collection.
                if ($null -eq $value) {
                    return
                }
                $encodingImpl.GetBytes($value)
            }
        }
    }
}

<#
.SYNOPSIS
Gets the string generated by decoding the input bytes according to the specified encoding,
or by combining the specified codepoints.

.DESCRIPTION
Gets the string generated by decoding the input bytes according to the specified encoding,
or by combining the specified codepoints.

When specifying bytes, UTF-8 encoding is used by default.

.PARAMETER Bytes
Specifies the bytes that will be decoded to generate the resulting string.

.PARAMETER Encoding
Specifies the encoding that will be used to decode the input bytes.
UTF-8 encoding is used by default.

.PARAMETER Codepoint
Specifies the integer codepoints that will be combined to generate the resulting string.
This parameter can be populated via raw integers or by piping output from Get-UniCodepoint.

.EXAMPLE
# decode a simple Latin word based on its UTF-8 bytes
83, 119, 101, 101, 116 | Get-UniString

    Sweet

.EXAMPLE
# decode a Mandarin word based on its UTF-16 bytes
119, 123, 80, 91 | Get-UniString -Encoding utf-16

    筷子

.EXAMPLE
# combine codepoints to form a string
0x6d,0x65,0x68,0x20,0x1f937 | Get-UniString

    meh 🤷

.EXAMPLE
# discover conspiracies!
'畂桳栠摩琠敨映捡獴' | Get-UniByte -Encoding utf-16 | Get-UniString -Encoding utf-8

    Bush hid the facts
#>
function Get-UniString {
    [CmdletBinding(DefaultParameterSetName = 'bytes')]
    param(
        [Parameter(Mandatory = $true, Position = 0, ValueFromPipeline = $true, ParameterSetName = 'bytes')]
        [byte[]] $Bytes,

        [Parameter(ParameterSetName = 'bytes')]
        [string] $Encoding = 'utf-8',

        [Parameter(Mandatory = $true, Position = 0, ValueFromPipeline = $true, ValueFromPipelineByPropertyName = $true, ParameterSetName = 'codepoint')]
        [int[]] $Codepoint
    )

    begin {
        # Pipeline input arrives piecemeal; accumulate it and emit a single string in end{}.
        $byteBuffer = New-Object 'System.Collections.Generic.List[byte]'
        $sb = [System.Text.StringBuilder]::new()

        $encodingImpl = $allEncodings | Where-Object WebName -eq $Encoding
        # The encoding only matters when decoding bytes.
        if ((-not $encodingImpl) -and ($psCmdlet.ParameterSetName -eq 'bytes')) {
            Write-Error "The encoding '$encoding' does not match any available encoding"
            return
        }
    }

    process {
        if ($PSCmdlet.ParameterSetName -eq 'bytes') {
            foreach ($b in $bytes) {
                $byteBuffer.Add($b)
            }
        }
        elseif ($PSCmdlet.ParameterSetName -eq 'codepoint') {
            foreach ($c in $codepoint) {
                $value = getValue $c
                if ($null -eq $value) {
                    return
                }
                $null = $sb.Append($value)
            }
        }
    }

    end {
        if ($PSCmdlet.ParameterSetName -eq 'bytes') {
            # begin{} has already reported an unknown encoding; do not follow it with a
            # null-method-call error.
            if ($encodingImpl) {
                $encodingImpl.GetString($byteBuffer.ToArray())
            }
        }
        elseif ($PSCmdlet.ParameterSetName -eq 'codepoint') {
            $sb.ToString()
        }
    }
}

# tab completion through all available encodings for 'Encoding' arg on all relevant cmdlets
# (Register-ArgumentCompleter is not present on every host, hence the probe.)
if (Get-Command 'Register-ArgumentCompleter' -ErrorAction SilentlyContinue) {
    Register-ArgumentCompleter -CommandName 'Get-UniCodepoint', 'Get-UniByte', 'Get-UniString' -ParameterName 'Encoding' -ScriptBlock {
        param($commandName, $parameterName, $wordToComplete, $commandAst, $boundParameters)

        # maintain quotes if user has added them
        $quote = $null
        if ($wordToComplete -match "^(`"|')") {
            $quote = $matches[1]
            $wordToComplete = $wordToComplete -replace "^(`"|')+"
        }
        # strip any closing quote(s) the user has already typed
        $wordToComplete = $wordToComplete -replace "(`"|')+$"

        $script:allEncodings |
            Where-Object WebName -like "$wordToComplete*" |
            ForEach-Object WebName |
            ForEach-Object {
                # Re-apply the user's quote; otherwise single-quote only names containing whitespace.
                $localQuote = if ($quote) { $quote } elseif ($_ -match '\s') { "'" } else { $null }
                "$localQuote$_$localQuote"
            }
    }
}

New-Alias unicode Get-UniCodepoint
New-Alias unibyte Get-UniByte
New-Alias unistring Get-UniString

Export-ModuleMember `
    -Function 'Get-UniCodepoint', 'Get-UniByte', 'Get-UniString' `
    -Alias 'unicode', 'unibyte', 'unistring'