public/Get-Duplicate.ps1
<#
.SYNOPSIS A Powershell module that find duplicate files within a given folder. .DESCRIPTION A Powershell module that find duplicate files within a given folder. It may also expand it's search scope to all descendent items of that folder. .PARAMETER InputObject Parameter description .PARAMETER Path Folder to search for duplicate files. .PARAMETER LiteralPath Folder to search for duplicate files. .PARAMETER Recurse Expand the scope of the duplicate file search to be across all descendent files of the given folder. .PARAMETER Exclude Omits the specified items. The value of this parameter qualifies the -Path parameter. Enter a path element or pattern, such as "*.txt". Wildcards are permitted. .PARAMETER Include Gets only the specified items. The value of this parameter qualifies the -Path parameter. Enter a path element or pattern, such as "*.txt". Wildcards are permitted. .PARAMETER ExcludeDirectory Omits searching any descendent directory matching the entered name or pattern. Enter a name or pattern, such as "*secret". Wildcards are permitted. .PARAMETER Inverse Get only non-duplicate files. By default the Cmdlet returns duplicate files. .PARAMETER AsHashtable Get the result as a Hashtable, where duplicates are grouped in file hashes. .EXAMPLE Get-Duplicate -Path 'C:/my_folder_with_duplicates' .EXAMPLE Get-Duplicate -Path 'C:/my_folder_with_duplicates' -Recurse -ExcludeDirectory 'specialDirectory' .NOTES When using the -Recurse parameter, the md5 hash of each descendent file has to be calculated, in order for comparison against all other descendent files' md5 hash. Therefore, if using Get-Duplicate with the -Recurse parameter on a folder containing many large descendent files, it is to be expected that the Cmdlet might take several seconds to several minutes to complete, depending on the size of those files. #> function Get-Duplicate { [CmdletBinding(DefaultParameterSetName='Path')] param( [Parameter(ParameterSetName="Path", Mandatory=$true, Position=0)] [string]$Path , [Parameter(ParameterSetName="LiteralPath", Mandatory=$true)] [string]$LiteralPath , [Parameter()] [switch]$Recurse , [Parameter()] [string]$Exclude = '' , [Parameter()] [string]$Include = '' , [Parameter()] [string]$ExcludeDirectory = '' , [Parameter()] [switch]$Inverse , [Parameter()] [switch]$AsHashtable , [Parameter(ValueFromPipeline, ParameterSetName="Pipeline", Mandatory=$false)] [string]$InputObject ) begin { $callerEA = $ErrorActionPreference $ErrorActionPreference = "Stop" if ($callerEA -ne $ErrorActionPreference) { $PSDefaultParameterValues['Get-ChildItem:ErrorAction'] = $callerEA $PSDefaultParameterValues['Where-Object:ErrorAction'] = $callerEA $PSDefaultParameterValues['ForEach-Object:ErrorAction'] = $callerEA } } process { try { if ($InputObject) { $Path = $_ } if ($Path) { if (! (Test-Path -Path $Path -ErrorAction SilentlyContinue) ) { throw "Path $Path does not exist." } } if ($LiteralPath) { if (! (Test-Path -LiteralPath $Path -ErrorAction SilentlyContinue) ) { throw "LiteralPath $Path does not exist." } } $fileSearchParams = @{ File = $true Recurse = $Recurse #ReadOnly = $true } if ($Path) { $fileSearchParams['Path'] = $Path } if ($LiteralPath) { $fileSearchParams['LiteralPath'] = $LiteralPath } if ($Exclude) { $fileSearchParams['Exclude'] = $Exclude } if ($Include) { $fileSearchParams['Include'] = $Include } $hashes_unique = @{} # format: md5str => FileInfo[] $hashes_duplicates = @{} # format: md5str => FileInfo[] # Get all files found only within this directory & { if ($ExcludeDirectory) { Get-ChildItem @fileSearchParams | Where-Object { $_.Directory.Name -notmatch "^$( [regex]::Escape($ExcludeDirectory) )$" } }else { Get-ChildItem @fileSearchParams } } | Sort-Object Name, Extension | ForEach-Object { $md5 = (Get-FileHash -LiteralPath $_.FullName -Algorithm MD5).Hash # md5 hash of this file if ( ! $hashes_unique.ContainsKey($md5) ) { $hashes_unique[$md5] = @( $_ ) }else { # Duplicate! if (!$hashes_duplicates.ContainsKey($md5)) { $hashes_duplicates[$md5] = [System.Collections.Arraylist]@() $hashes_duplicates[$md5].Add($hashes_unique[$md5][0]) > $null } $hashes_duplicates[$md5].Add($_) > $null } } # The first object will be the Original object (shortest file name). # @($hashes_duplicates.Keys) | ForEach-Object { # $md5 = $_ # $hashes_duplicates[$md5] = $hashes_duplicates[$md5] | Sort-Object { $_.Name.Length } # } if ($Inverse) { # Remove any keys that are in the duplicates hash $( $hashes_unique.Keys ) | ? { $hashes_duplicates.ContainsKey($_) } | ForEach-Object { $hashes_unique.Remove($_) > $null } if ($AsHashtable) { $hashes_unique }else { $hashes_unique.Values } }else { if ($AsHashtable) { $hashes_duplicates }else { $hashes_duplicates.Values } } }catch { Write-Error -ErrorRecord $_ -ErrorAction $callerEA } } } |