Functions/Get-HomoglyphsInFile.ps1
<#
.Synopsis Gets homoglyphs in the specified file .DESCRIPTION Gets homoglyphs in the specified file. The user must specify how to parse the file using parse elements, regular expressions, or pre-defined methods. .EXAMPLE Get-HomoglyphsInFile -File .\testfile.txt -ParseElements @(' ', '.', '(', ')', '{', '}', ';', '?', '\', '/', '&', '%', '!', '<<', '>>',"`n") The text in testfile.txt is split into words based on the parse elements defined. .EXAMPLE Get-HomoglyphsInFile -FileName .\testTextFile.txt -Regex "[a-z]+" -RemoveUninteresting Name OCRValue Type File ---- -------- ---- ---- 123 123 customRegex .\testTextFile.txt 123… 123 customRegex .\testTextFile.txt 123 123 customRegex .\testTextFile.txt Found all the number values that were separated by text that were homoglyphs .EXAMPLE Get-HomoglyphsInFile -File .\testfile.txt -ParseElements @(' ','.',',',"`n") -RemoveUninteresting Scans a typical text file for homoglyphs .EXAMPLE ls testfi*.ps1 | Get-HomoglyphsInFile -Predefined PowerShell -RemoveUninteresting Get all files like testfi*.ps1 and remove any homoglyphs across all matching files .EXAMPLE $results = ls testfi*.ps1 | %{Get-HomoglyphsInFile $_ -Predefined PowerShell -RemoveUninteresting} $results.name | group | select count,name Get all files like test*.ps1 and remove homoglyphs. Group them to show which items were found, and what they're homoglyphs of and how many of each are found. .EXAMPLE $results = @() $count = 0 $files = gci C:\myFolder\ -Recurse -Include "*.ps1" -File ForEach($file in $files){ Write-Progress -PercentComplete ($count/$($files.count)*100) -Activity 'Scanning Files' -CurrentOperation "($count/$($files.count)) - $($file.FullName)" $results += Get-HomoglyphsInFile -FullName $file.FullName -Predefined PowerShell $count++ } $refinedResults = $refinedResults -Array $results Searches all files in C:\myFolder - This is about 130 times faster than scanning the same repo with Find-HomoglyphsInRepo .PARAMETER FileName The path and name of the file to scan for homoglyphs. .PARAMETER ParseElements An array of seperator strings to use to define how to split the text file into words for comparison. Elements can be multiple characters long. .PARAMETER Regex A regular expression defining how to split the text file into words for comparison. .PARAMETER RemoveUninteresting Compares all words in all files analyzed then removes those that are not homoglyphs of another value .PARAMETER Predefined Use one of the predefined parsing criteria .Link https://github.com/paulhcode #> Function Get-HomoglyphsInFile { [CmdletBinding(DefaultParameterSetName = 'Predefined')] Param ( [Parameter(Mandatory = $true, ValueFromPipeline = $true, Position = 0, ParameterSetName = 'ParseElement')] [Parameter(Mandatory = $true, ValueFromPipeline = $true, Position = 0, ParameterSetName = 'Regex')] [Parameter(Mandatory = $true, ValueFromPipeline = $true, Position = 0, ParameterSetName = 'Predefined')] [string] [ValidateNotNullOrEmpty()] [ValidateScript({ Test-Path -PathType Leaf $_ })] $FullName, [Parameter(Mandatory = $true, ValueFromPipelineByPropertyName = $true, Position = 1, ParameterSetName = 'ParseElement')] [string[]] $ParseElements, #maybe something like: @(' ', '.', '(', ')', '{', '}', ';', '?', '\', '/', '&', '%', '!', '<<', '>>',"`n") [Parameter(Mandatory = $true, ValueFromPipelineByPropertyName = $true, Position = 1, ParameterSetName = 'Regex')] [string] $Regex, [Parameter(Mandatory = $true, ValueFromPipelineByPropertyName = $true, Position = 0, ParameterSetName = 'Predefined')] [ValidateSet('PowerShell', 'Text')] [string] $Predefined, [Parameter(Mandatory = $false, ValueFromPipelineByPropertyName = $true, ParameterSetName = 'ParseElement')] [Parameter(Mandatory = $false, ValueFromPipelineByPropertyName = $true, ParameterSetName = 'Regex')] [Parameter(Mandatory = $false, ValueFromPipelineByPropertyName = $true, ParameterSetName = 'Predefined')] [switch] $RemoveUninteresting ) begin { $foundWords = @() } process { #Get the file and split it If ($Predefined) { switch ($Predefined) { 'PowerShell' { $NeedsAnalysis = $false # $foundWords = @() $foundWords += Get-ValuesFromPS -FileName $FullName } 'Text' { $NeedsAnalysis = $true $ParseElements = @(' ', '.', "`n", '(', ')') } } } If ($ParseElements) { $items = Split-ByParseElements -Text (Get-Content $FullName -Raw) -ParseElements $ParseElements $type = 'customParse' } ElseIf ($Regex) { $items = [regex]::Split((Get-Content $FullName -Raw), $Regex) | Where-Object { $Null -ne $_ -and '' -ne $_ } $type = 'customRegex' } #Analyze it If (!$Predefined -or $NeedsAnalysis) { $WordsInThisFile = $items | ForEach-Object { Get-ValueReport -Name $_ -Type $type -File $FullName } $foundWords += $WordsInThisFile | Group-Object Name, Type, File | ForEach-Object { $_.Group | Select-Object * -First 1 } } } end { If ($RemoveUninteresting) { Remove-Uninteresting -Array $foundWords } Else { $foundWords } } } |