public/Get-DuplicateItem.ps1
<#
.SYNOPSIS
Gets duplicate or non-duplicate files.

.DESCRIPTION
Gets duplicate or non-duplicate files within a folder by comparing file content hashes.

.PARAMETER InputObject
Folder to search. Accepts pipeline input.

.PARAMETER Path
Folder to search.

.PARAMETER LiteralPath
Folder to search. Used exactly as typed; no characters are interpreted as wildcards.

.PARAMETER Recurse
Expands the scope of the duplicate file search to include all descendant files of the given folder.

.PARAMETER Exclude
Omits the specified items. The value of this parameter qualifies the -Path parameter. Enter a path element or pattern, such as "*.txt". Wildcards are permitted.

.PARAMETER Include
Gets only the specified items. The value of this parameter qualifies the -Path parameter. Enter a path element or pattern, such as "*.txt". Wildcards are permitted.

.PARAMETER Inverse
Gets only non-duplicate files. By default the cmdlet returns duplicate files.

.PARAMETER AsHashtable
Returns the result as a hashtable, where files are grouped by file hash.

.EXAMPLE
# Get duplicate files in 'C:/folder1' only
Get-DuplicateItem -Path 'C:/folder1'

.EXAMPLE
# Get duplicate files in 'C:/folder1' and its descendants
Get-DuplicateItem -Path 'C:/folder1' -Recurse

.EXAMPLE
# Get non-duplicate files in 'C:/folder1' and its descendants
Get-DuplicateItem -Path 'C:/folder1' -Recurse -Inverse

.EXAMPLE
# Remove all duplicate items
Get-DuplicateItem -Path 'C:/folder1' | Remove-Item

.EXAMPLE
# Remove all non-duplicate items
Get-DuplicateItem 'C:/folder1' -Inverse | Remove-Item

.NOTES
The cmdlet calculates the MD5 hash of each descendant file in order to identify duplicates and non-duplicates. If there are many large descendant files, it is therefore normal for the cmdlet to take several seconds to several minutes to complete.
#>
function Get-DuplicateItem {
    [CmdletBinding(DefaultParameterSetName='Path')]
    param(
        [Parameter(ParameterSetName="Path", Mandatory=$true, Position=0)]
        [string]$Path
    ,
        [Parameter(ParameterSetName="LiteralPath", Mandatory=$true)]
        [string]$LiteralPath
    ,
        [Parameter()]
        [switch]$Recurse
    ,
        [Parameter()]
        [string]$Exclude = ''
    ,
        [Parameter()]
        [string]$Include = ''
    ,
        [Parameter()]
        [switch]$Inverse
    ,
        [Parameter()]
        [switch]$AsHashtable
    ,
        [Parameter(ValueFromPipeline, ParameterSetName="Pipeline", Mandatory=$false)]
        [string]$InputObject
    )
    process {
        try {
            if ($InputObject) {
                $Path = $InputObject
            }
            if ($Path) {
                if (! (Test-Path -Path $Path -ErrorAction SilentlyContinue) ) {
                    throw "Path $Path does not exist."
                }
            }
            if ($LiteralPath) {
                if (! (Test-Path -LiteralPath $LiteralPath -ErrorAction SilentlyContinue) ) {
                    throw "LiteralPath $LiteralPath does not exist."
                }
            }

            $fileSearchParams = @{
                File = $true
                Recurse = $Recurse
                #ReadOnly = $true
            }
            if ($Path) { $fileSearchParams['Path'] = $Path }
            if ($LiteralPath) { $fileSearchParams['LiteralPath'] = $LiteralPath }
            if ($Exclude) { $fileSearchParams['Exclude'] = $Exclude }
            if ($Include) { $fileSearchParams['Include'] = $Include }

            $hashesUnique = @{}     # format: md5str => FileInfo[]
            $hashesDuplicates = @{} # format: md5str => FileInfo[]

            # Get all files found only within this directory
            Get-ChildItem @fileSearchParams | Sort-Object Name, Extension | ForEach-Object {
                # MD5 hash of this file
                $md5 = (Get-FileHash -LiteralPath $_.FullName -Algorithm MD5).Hash
                if ( ! $hashesUnique.ContainsKey($md5) ) {
                    # First file seen with this hash
                    $hashesUnique[$md5] = [System.Collections.ArrayList]@()
                    $hashesUnique[$md5].Add( $_ ) > $null
                }else {
                    # Duplicate! Seed the group with the original file, then append this one
                    if ( ! $hashesDuplicates.ContainsKey($md5) ) {
                        $hashesDuplicates[$md5] = [System.Collections.ArrayList]@()
                        $hashesDuplicates[$md5].Add( $hashesUnique[$md5][0] ) > $null
                    }
                    $hashesDuplicates[$md5].Add( $_ ) > $null
                }
            }

            # The first object will be the Original object (shortest file name).
            # @($hashesDuplicates.Keys) | ForEach-Object {
            #     $md5 = $_
            #     $hashesDuplicates[$md5] = $hashesDuplicates[$md5] | Sort-Object { $_.Name.Length }
            # }

            if ($Inverse) {
                # Remove any keys that are in the duplicates hash
                $( $hashesUnique.Keys ) | Where-Object { $hashesDuplicates.ContainsKey($_) } | ForEach-Object {
                    $hashesUnique.Remove($_) > $null
                }
                if ($AsHashtable) {
                    $hashesUnique
                }else {
                    # Unwrap the ArrayList so we return System.IO.FileInfo
                    $hashesUnique.Values | ForEach-Object { $_ }
                }
            }else {
                if ($AsHashtable) {
                    $hashesDuplicates
                }else {
                    # Unwrap the ArrayList so we return System.IO.FileInfo
                    $hashesDuplicates.Values | ForEach-Object { $_ }
                }
            }
        }catch {
            Write-Error -ErrorRecord $_
        }
    }
}
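
# Usage sketch (kept as a comment so it does not run when this file is dot-sourced):
# one way to consume the -AsHashtable output, which maps each MD5 hash to an
# ArrayList of FileInfo objects whose index 0 is the first file encountered for
# that hash. 'C:/folder1' is a hypothetical path; -WhatIf previews the removals.
#
#   $dupes = Get-DuplicateItem -Path 'C:/folder1' -Recurse -AsHashtable
#   foreach ($group in $dupes.Values) {
#       # Keep the first-seen file in each duplicate group, remove the rest
#       $group | Select-Object -Skip 1 | Remove-Item -WhatIf
#   }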