Find-PSOneDuplicateFile.ps1
function Find-PSOneDuplicateFile
{
  <#
      .SYNOPSIS
      Identifies files with duplicate content

      .DESCRIPTION
      Returns a hashtable with the hashes that have at least two files (duplicates)

      .EXAMPLE
      $Path = [Environment]::GetFolderPath('MyDocuments')
      Find-PSOneDuplicateFile -Path $Path
      Find duplicate files in the user documents folder

      .EXAMPLE
      Find-PSOneDuplicateFile -Path c:\windows -Filter *.log
      find log files in the Windows folder with duplicate content

      .LINK
      https://powershell.one/tricks/filesystem/finding-duplicate-files
  #>

  param
  (
    # Path of folder to recursively search
    [String]
    [Parameter(Mandatory)]
    $Path,

    # Filter to apply. Default is '*' (all files)
    [String]
    $Filter = '*'
  )

  # get a hashtable of all files of size greater than 0,
  # grouped by their length


  # ENUMERATE ALL FILES RECURSIVELY
  # call scriptblocks directly and pipe them together
  # this is by far the fastest way and much faster than
  # using Foreach-Object:
  & {
    try
    {
      # try and use the fast API way of enumerating files recursively
      # this FAILS whenever there are any "Access Denied" errors
      Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method'
      [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories')
    }
    catch
    {
      # use PowerShell's own (slow) way of enumerating files if any error occurs:
      Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method'
      Get-ChildItem -Path $Path -Filter $Filter -File -Recurse -ErrorAction Ignore
    }
  } |
  # EXCLUDE EMPTY FILES:
  # use direct process blocks with IF (which is much faster than Where-Object):
  & {
    process
    {
      # if the file has content...
      if ($_.Length -gt 0)
      {
        # let it pass through:
        $_
      }
    }
  } |
  # GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE
  # OTHER FILE WITH THE SAME SIZE
  # use direct scriptblocks with own hashtable (which is much faster than Group-Object):
  & {
    begin
    {
      # start with an empty hashtable
      $hash = @{}
    }

    process
    {
      # group files by their length
      # (use "length" as hashtable key)
      $file = $_
      $key = $file.Length.ToString()

      # if we see this key for the first time, create a generic
      # list to hold group items, and store FileInfo objects in this list
      # (specialized generic lists are faster than ArrayList):
      if ($hash.ContainsKey($key) -eq $false)
      {
        $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new()
      }
      # add file to appropriate hashtable key:
      $hash[$key].Add($file)
    }

    end
    {
      # return only the files from groups with at least two files
      # (if there is only one file with a given length, then it
      # cannot have any duplicates for sure):
      foreach ($pile in $hash.Values)
      {
        # are there at least 2 files in this pile?
        if ($pile.Count -gt 1)
        {
          # yes, add it to the candidates
          $pile
        }
      }
    }
  } |
  # CALCULATE THE NUMBER OF FILES TO HASH
  # collect all files and hand them over en-bloc:
  & {
    end { ,@($input) }
  } |
  # GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES:
  # use a direct scriptblock call with a hashtable (much faster than Group-Object):
  & {
    begin
    {
      # start with an empty hashtable
      $hash = @{}

      # since this is a lengthy procedure, a progress bar is in order
      # keep a counter of processed files:
      $c = 0
    }

    process
    {
      $totalNumber = $_.Count
      foreach ($file in $_)
      {
        # update the progress bar every 20 files:
        $c++
        if ($c % 20 -eq 0)
        {
          $percentComplete = $c * 100 / $totalNumber
          Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete
        }

        # use the file hash of this file PLUS the file length as a key to the hashtable
        # use the fastest algorithm, SHA1:
        $result = Get-FileHash -Path $file.FullName -Algorithm SHA1
        $key = '{0}:{1}' -f $result.Hash, $file.Length

        # if we see this key for the first time, add a generic list to this key:
        if ($hash.ContainsKey($key) -eq $false)
        {
          $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new())
        }

        # add the file to the appropriate group:
        $hash[$key].Add($file)
      }
    }

    end
    {
      # remove all hashtable keys with only one file in them

      # first, CLONE the list of hashtable keys
      # (we cannot remove hashtable keys while enumerating the live
      # keys list):
      $keys = @($hash.Keys).Clone()

      # enumerate all keys...
      foreach ($key in $keys)
      {
        # ...if the key has only one file, remove it:
        if ($hash[$key].Count -eq 1)
        {
          $hash.Remove($key)
        }
      }

      # return the hashtable with only duplicate files left:
      $hash
    }
  }
}
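
A minimal usage sketch (not part of the original script): it assumes the function has been saved as Find-PSOneDuplicateFile.ps1 in the current directory and dot-sourced into the session; the sample folder and output formatting are illustrative only.

# dot-source the script, then search the user documents folder:
. .\Find-PSOneDuplicateFile.ps1
$Path = [Environment]::GetFolderPath('MyDocuments')
$duplicates = Find-PSOneDuplicateFile -Path $Path

# the result is a hashtable: each key is "SHA1hash:length", each value is a
# list of FileInfo objects whose content is identical
foreach ($key in $duplicates.Keys)
{
  '{0} identical files for key {1}:' -f $duplicates[$key].Count, $key
  $duplicates[$key].FullName
}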