# DimDuplicateFiles.psm1
<#
.SYNOPSIS
Generates a list of duplicate files in the current or specified directory.
.DESCRIPTION
Duplicate files are calculated using a file hash. This can be very time consuming
and resource intensive. Files of equal length are grouped first, so hashes are only
computed for files that could possibly be duplicates. Only files between 10 KB and
1 TB are scanned by default.
.EXAMPLE
Get-DuplicateFiles
This command scans for duplicate files in the current directory.
.EXAMPLE
Get-DuplicateFiles -MinimumFileSize 1KB -MaximumFileSize 1GB
This command scans for duplicate files in the current directory specifying alternate file sizes.
.EXAMPLE
Get-DuplicateFiles -Path C:\Data -Filter *.PDF -Verbose
This command scans for duplicate PDF files in the specified directory, and displays verbose output.
.EXAMPLE
Get-DuplicateFiles -Path C:\Data -Verbose | Export-CliXml Dups.xml
This command scans for duplicate files in the specified directory, displays verbose output,
and writes to a CliXml file for future reference.
.EXAMPLE
Get-DuplicateFiles | Out-GridView -OutputMode Multiple | Remove-Item -Confirm
This command scans for duplicate files in the current directory and displays a graphical
listing. Any selected files will be removed after confirmation.
.LINK
www.dimensionit.nl
.NOTES
By Dimitri Koens. Contact me through: http://www.DimensionIT.tv
Twitter: @DimensionIT https://twitter.com/DimensionIT
Linkedin: http://nl.linkedin.com/in/dimitrikoens
This function uses the Get-FileHash function introduced in PowerShell 4.
#>
function Get-DuplicateFiles {
    # NOTE(review): '#Requires -Version 4.0' belongs at script scope, which is why the
    # original remark said it "didn't work" inside the function.
    [CmdletBinding()]
    param(
        # Specify a path with files to be checked for duplicates.
        [string[]]$Path = (Get-Location),

        # Items matching these patterns are excluded; qualifies the Path parameter.
        [string]$Exclude = $null,

        # Specify a filesystem filter for filenames or extensions to be checked.
        [string]$Filter = '*',

        # Minimum file size to be checked. Most JPGs are at least 10 KB.
        [int64]$MinimumFileSize = 10KB,

        # Maximum file size to be checked. Large files can take a long time to check.
        [int64]$MaximumFileSize = 1TB,

        # Different algorithms can have a huge impact on performance.
        # SHA1 and MD5 are fast but regarded least reliable.
        [ValidateSet('SHA1', 'SHA256', 'SHA384', 'SHA512', 'MACTripleDES', 'MD5', 'RIPEMD160')]
        [string]$Algorithm = 'SHA256',

        # Suppress writing progress messages to the log file.
        [switch]$NoLogging
    )

    # Write a timestamped message to the log file (unless -NoLogging was given)
    # and to the verbose stream (unless -NoScreenOutput is given).
    function DupLog {
        param(
            [string]$message,
            [string]$FileName = 'log-Get-DuplicateFiles.txt',
            [switch]$NoScreenOutput
        )
        $date = Get-Date -UFormat '%Y-%m-%d %T'
        $msg = "{0} {1}" -f $date, $message
        if (!($NoLogging)) { $msg | Out-File -Append -FilePath $FileName }
        if (!($NoScreenOutput)) { Write-Verbose $msg }
    }

    # --- Traverse the file system ---
    $startTraversal = Get-Date
    DupLog "Traversing $Path"
    DupLog ("Minimum file size: {0} Maximum file size: {1}" -f (Convert-HumanReadable -Bytes $MinimumFileSize), (Convert-HumanReadable -Bytes $MaximumFileSize))
    Write-Progress -Activity 'Preparations' -CurrentOperation "Traversing $Path" -PercentComplete 10
    try {
        # BUG FIX: the declared -Filter parameter was documented but never passed on.
        # TODO(review): [io.Directory]::EnumerateFiles($Path,'*','AllDirectories') may be
        # much faster; measure before switching.
        $Files = Get-ChildItem -Path $Path -Recurse -File -Filter $Filter -Exclude $Exclude |
            Where-Object { $_.Length -ge $MinimumFileSize -and $_.Length -le $MaximumFileSize }
    }
    catch {
        # BUG FIX: original passed a scriptblock literal to DupLog instead of a string.
        DupLog 'Error during traversal'
        Write-Error 'Error during traversal'
    }
    DupLog "Time taken for traversal: $(New-TimeSpan -Start $startTraversal)"

    # --- Group files of identical length; only those can possibly be duplicates ---
    DupLog 'Finding suspects'
    Write-Progress -Activity 'Preparations' -CurrentOperation 'Finding suspects...' -PercentComplete 70
    # Group-Object received a massive performance improvement in PowerShell 6.1.
    # BUG FIX: original tested the undefined variable $totFiles here.
    if ($PSVersionTable.PSVersion.Major -lt 7 -and $Files.Count -gt 1000) {
        DupLog 'Run this program on at least PowerShell 7 for a massive performance improvement during grouping!'
    }
    $Files = $Files | Group-Object -Property Length | Where-Object { $_.Count -gt 1 }
    # Side effect: files are now sorted on length, from small to large.

    # --- Totals for progress reporting ---
    Write-Progress -Activity 'Preparations' -CurrentOperation 'Calculating total file size and number of suspects...' -PercentComplete 95
    $SuspectsFound = 0
    [long]$TotalSize = 0
    $Files | ForEach-Object {
        $_.Group | ForEach-Object {
            $TotalSize += $_.Length
            $SuspectsFound++
        }
    }
    Write-Progress -Activity 'Preparations' -Completed
    DupLog ("$("{0:N2}" -f ($TotalSize/1GB)) GB in $SuspectsFound suspect files. Beginning hash calculations.")

    $i = 0
    $start = Get-Date
    $datetime = Get-Date -UFormat '%Y-%m-%d %T'
    [long]$bytesProcessed = 0
    $DuplicateFilesFound = 0
    # BUG FIX: this counter was reset inside the per-length loop, so the final
    # summary only reflected the last length group. Initialize it once, here.
    [long]$DuplicateBytesFound = 0

    # --- Calculate file hashes per length group and emit duplicates ---
    $Activity = "Calculating hash values on {0}, started at {1}" -f (Convert-HumanReadable -Bytes $TotalSize), $datetime
    $Files | ForEach-Object {
        $_.Group | ForEach-Object {
            # Progress reporting.
            $i++
            $pct = [int]($bytesProcessed/$TotalSize*100)
            $CurrentOp = "File {1} of {2}. Duplicates found: {0}. {5} processed. {3} in {4}" -f $DuplicateFilesFound, $i, $SuspectsFound, (Convert-HumanReadable -Bytes $_.Length), $_.FullName, (Convert-HumanReadable -Bytes $bytesProcessed)
            # Only include remaining time once 1 MB is processed and 10 s have elapsed,
            # so the early estimate is not wildly inaccurate.
            if ($bytesProcessed -ge 1MB -and (New-TimeSpan -Start $start).TotalSeconds -gt 10) {
                $bytesPerSecond = $bytesProcessed / (New-TimeSpan -Start $start).TotalSeconds
                [long]$secRemaining = ($TotalSize - $bytesProcessed) / $bytesPerSecond
                Write-Progress -Activity $Activity -CurrentOperation $CurrentOp -PercentComplete $pct -SecondsRemaining $secRemaining
            }
            else {
                Write-Progress -Activity $Activity -CurrentOperation $CurrentOp -PercentComplete $pct
            }

            # Calculate the file hash.
            DupLog "Calculating filehash on $($_.Length) bytes from $($_.FullName)" -NoScreenOutput
            # BUG FIX: clear $FileHash first; on a Get-FileHash failure the variable
            # otherwise kept the PREVIOUS file's hash and tagged this file with it,
            # producing false duplicates.
            $FileHash = $null
            try {
                $FileHash = (Get-FileHash -LiteralPath $_.FullName -Algorithm $Algorithm).Hash
            }
            catch {
                DupLog "Could not produce file hash for: $($_.FullName)"
            }
            # -Force lets the same objects be processed twice without Add-Member throwing.
            $_ | Add-Member -MemberType NoteProperty -Name FileHash -Value $FileHash -Force
            $_ | Add-Member -MemberType NoteProperty -Name Algorithm -Value $Algorithm -Force
            $bytesProcessed += $_.Length
        }

        # Group members have been amended with a FileHash property; now group on it.
        # A null FileHash means the file could not be opened for reading — skip it.
        $_.Group |
            Where-Object { $null -ne $_.FileHash } |
            Group-Object -Property FileHash |
            Where-Object { $_.Count -gt 1 } |
            ForEach-Object {
                $DuplicateFilesFound++
                # Produce output, unselected. TODO: use a types.ps1xml for display.
                $_.Group
                # BUG FIX: a Group-Object result has no Length property (the original
                # silently added nothing); sum the lengths of the member files instead.
                $_.Group | ForEach-Object {
                    $DuplicateBytesFound += $_.Length
                    DupLog ("{0,12} bytes in {1}" -f $_.Length, $_.FullName) -NoScreenOutput
                }
            }
    }
    Write-Progress -Activity "Calculating hash value" -Completed

    # Summary.
    DupLog ("{0:N1} GB found in {1:N0} duplicate file groups. Time taken for hash calculations: {2:N1} hours" -f ($DuplicateBytesFound/1GB), $DuplicateFilesFound, (New-TimeSpan -Start $start).TotalHours)
    if ($DuplicateFilesFound -eq 0) {
        DupLog 'No duplicate files found'
    }
}

<#
.SYNOPSIS
This command uses Out-GridView to list duplicate files; any rows the user
selects are written to the pipeline.
#>
function Show-DuplicateFiles {
    [CmdletBinding()]
    param(
        # Duplicate file objects, typically the output of Get-DuplicateFiles.
        [Parameter(Mandatory=$true, ValueFromPipeline=$true, ValueFromPipelineByPropertyName=$true)]
        [PSCustomObject]$File
    )
    begin {
        # Collect all pipeline input before showing the grid.
        $fileCollection = New-Object System.Collections.ArrayList
        Write-Progress -Activity 'Receiving files from pipeline'
    }
    process {
        # ArrayList.Add returns the new index; discard it.
        $fileCollection.Add($_) | Out-Null
    }
    end {
        Write-Progress -Activity 'Receiving files from pipeline' -Completed
        $fileCollection |
            Select-Object FileHash, Length, FullName, FullNameOriginal |
            Out-GridView -OutputMode Multiple
    }
}

<#
.SYNOPSIS
Converts a byte count to a human-readable string (e.g. "1.50 GB"), or a
human-readable size string (e.g. "10KB") back to a byte count.
#>
function Convert-HumanReadable {
    [CmdletBinding()]
    param(
        # Byte count to format as a human-readable string.
        [Parameter(ValueFromPipeline=$true)]
        [long]$Bytes,

        # Size string (e.g. '10KB', '2 GB') to convert to a byte count.
        [Parameter(ValueFromPipeline=$true)]
        [string]$Text,

        # Pad the formatted result to a fixed width of 11 characters.
        [switch]$Pad
    )
    # BUG FIX: also check whether -Bytes was explicitly bound, so that
    # 'Convert-HumanReadable -Bytes 0' formats '0 B' instead of falling
    # through to the (empty) text branch.
    if ($PSBoundParameters.ContainsKey('Bytes') -or $Bytes) {
        switch ($Bytes) {
            { $_ -ge 1PB } { $r = "{0:n2} PB" -f ($_ / 1PB); break }
            { $_ -ge 1TB } { $r = "{0:n2} TB" -f ($_ / 1TB); break }
            { $_ -ge 1GB } { $r = "{0:n2} GB" -f ($_ / 1GB); break }
            { $_ -ge 1MB } { $r = "{0:n2} MB" -f ($_ / 1MB); break }
            { $_ -ge 1KB } { $r = "{0:n2} KB" -f ($_ / 1KB) }
            default        { $r = "{0} B " -f $_ }
        }
        if ($Pad) { $r.PadLeft(11) } else { $r }
    }
    else {
        # TODO: implement punctuation and globalization/culture support.
        # BUG FIX: original pattern '\.\,' only matched the literal sequence ".,";
        # use a character class to reject any punctuation as intended.
        if ($Text -match '[.,]') {
            Write-Error 'No punctuation allowed (yet)'
        }
        else {
            # Replacement via -replace is case insensitive by default.
            switch -regex ($Text) {
                "EB" { $r = [int]($Text -replace 'EB') * 1PB * 1KB; break } # max 16 EB
                "PB" { $r = [int]($Text -replace 'PB') * 1PB; break }
                "TB" { $r = [int]($Text -replace 'TB') * 1TB; break }
                "GB" { $r = [int]($Text -replace 'GB') * 1GB; break }
                "MB" { $r = [int]($Text -replace 'MB') * 1MB; break }
                "KB" { $r = [int]($Text -replace 'KB') * 1KB }
                default { $r = [int]($Text -replace 'B') }
            }
            # BUG FIX: original computed $r but never emitted it, so text mode
            # returned nothing.
            $r
        }
    }
}