DimDuplicateFiles.psm1

<#
.SYNOPSIS
    Generates a list of duplicate files in the current or specified directory.
.DESCRIPTION
    Duplicate files are calculated using a filehash. This can be very time consuming and resource intensive. Only files between 10 KB and 1 TB are scanned.
.EXAMPLE
    Get-DuplicateFiles
    This command scans for duplicate files in the current directory.
.EXAMPLE
    Get-DuplicateFiles -MinimumFileSize 1KB -MaximumFileSize 1GB
    This command scans for duplicate files in the current directory specifying alternate file sizes.
.EXAMPLE
    Get-DuplicateFiles -Path C:\Data -Filter *.PDF -Verbose
    This command scans for duplicate PDF files in the specified directory, and displays verbose output.
.EXAMPLE
    Get-DuplicateFiles -Path C:\Data -Verbose | Export-CliXml Dups.xml
    This command scans for duplicate files in the specified directory, displays verbose output, and writes to a CliXml file for future reference.
.EXAMPLE
    Get-DuplicateFiles | Out-GridView -OutputMode Multiple | Remove-Item -Confirm
    This command scans for duplicate files in the current directory and displays a graphical listing. Any selected files will be removed after confirmation.
.LINK
    www.dimensionit.nl
.NOTES
    By Dimitri Koens
 
    Contact me through:
    http://www.DimensionIT.tv
    Twitter: @DimensionIT https://twitter.com/DimensionIT
    Linkedin: http://nl.linkedin.com/in/dimitrikoens
 
    This function uses Get-FileHash function introduced in PowerShell 4.
#>

function Get-DuplicateFiles {
    # Requires -Version 4.0 werkt niet?
    
    [CmdletBinding()]
    param(
        # Specify a path with files to be checked for duplicates.
        [string[]]$Path = (Get-Location),   #(Get-Location).ProviderPath

        # Specifies, as a string array, a property or property that this cmdlet excludes from the operation. The value of this parameter qualifies the Path parameter.
        [string]$Exclude = $null,

        # Specify a filesystem filter for filenames or extensions to be checked.
        [string]$Filter = '*',

        # Specify to filter multimedia files.
        #[switch]$MultimediaFilesOnly,

        #[string]$MultimediaFilter = 'jpg|ORF|dng|cr2|MP4|MOV|AVI|xmp|crw|png|bmp|wav|jpeg|VCF|3gp|gif|tif|m4a|MPG|m4v|wmv|mpeg|m2v',

        # Minimum file size to be checked.
        [int64]$MinimumFileSize = 10KB,   # most JPGs are at least 10KB

        # Maximum file size to be checked. Large files can take a long time to check.
        [int64]$MaximumFileSize = 1TB,

        # Different algorithms can have a huge impact on performance. SHA1 and MD5 are fast but regarded least reliable.
        [ValidateSet('SHA1', 'SHA256', 'SHA384', 'SHA512', 'MACTripleDES', 'MD5', 'RIPEMD160')]
        [string]$Algorithm='SHA256',

        # When specified sorts files by length descending.
        #[switch]$SortOnLength,

        [switch]$NoLogging

    )

    function DupLog {
        param(
            [string]$message,
            [string]$FileName = 'log-Get-DuplicateFiles.txt', # $date = Get-Date -UFormat '%Y-%m-%d %T'
            [switch]$NoScreenOutput
        )
        $date = Get-Date -UFormat '%Y-%m-%d %T'
        $msg = "{0} {1}" -f $date, $message
        if (!($NoLogging)) { $msg | Out-File -Append -FilePath $FileName }
        if (!($NoScreenOutput)) { Write-Verbose $msg }
    }

    # traversing filepaths
    $startTraversal = Get-Date
    DupLog "Traversing $Path"
    DupLog ("Minimum file size: {0} Maximum file size: {1}" -f (Convert-HumanReadable -Bytes $MinimumFileSize), (Convert-HumanReadable -Bytes $MaximumFileSize))
    Write-Progress -Activity 'Preparations' -CurrentOperation "Traversing $path" -Percent 10   # path needs to be in double quotes (because of array?)

    try {
        $Files = Get-ChildItem -Path $Path -Recurse -File -Exclude $Exclude |
        Where-Object { $_.Length -ge $MinimumFileSize -and $_.Length -le $MaximumFileSize }
        # proberen met: [io.Directory]::EnumerateFiles("C:\Program Files","*","AllDirectories") kan veel sneller zijn
        # eventueel foreach -parallel, meet performance winst!
    }
    catch {
        DupLog { 'Error during traversal' }
        Write-Error 'Error during traversal'
    }
    DupLog "Time taken for traversal: $(New-TimeSpan -Start $startTraversal)"

    <#
    if ($MultimediaFilesOnly) {
        $msg = "Applying multimedia filter: $MultimediaFilter"
        Write-Progress -Activity 'Preparations' -CurrentOperation $msg -Percent 50
        DupLog $msg
        $Files = $Files | Where-Object Extension -match $MultimediaFilter
    }
    #>


    # obsolete: group on length automatically sorts on length
    # sort on length when required
    <#
    if ($SortOnLength) {
        Write-Progress -Activity 'Preparations' -CurrentOperation 'Sorting on length' -Percent 60
        DupLog 'Sorting files on length'
        $Files = $Files | Sort-Object Length -Descending
    }
    #>


    # group files on same length
    DupLog 'Finding suspects'
    Write-Progress -Activity 'Preparations' -CurrentOperation 'Finding suspects...' -Percent 70

    # massive performance improvement in pwsh 6.1
    if ($PSVersionTable.PSVersion.Major -lt 7 -and $totFiles.count -gt 1000) { 
        DupLog 'Run this program on at least PowerShell 7 for a massive performance improvement during grouping!' 
    }

    $Files = $Files |
        Group-Object -Property Length |
        Where-Object { $_.Count -gt 1 }
    # undesired result: files are now sorted on length, from small to large
    # optionally expand all groups and sort on filehash: similar files close. fullname: similar source close.

    Write-Progress -Activity 'Preparations' -CurrentOperation 'Calculating total file size and number of suspects...' -Percent 95
    $SuspectsFound = 0
    [long]$TotalSize = 0
    $Files | ForEach-Object {
        $_.Group | ForEach-Object { 
            $TotalSize += $_.Length
            $SuspectsFound++
        }
    }

    Write-Progress -Activity 'Preparations' -Completed
    DupLog ("$("{0:N2}" -f ($TotalSize/1GB)) GB in $SuspectsFound suspect files. Beginning hash calculations.")

    $i = 0
    $start = Get-Date
    $datetime = Get-Date -UFormat '%Y-%m-%d %T'
    [long]$bytesProcessed = 0
    $DuplicateFilesFound = 0

    # calculating file hashes
    # uitwerken: met foreach -parallel, maar hoe doe je dan write-progress???

    $Activity = "Calculating hash values on {0}, started at {1}" -f (Convert-HumanReadable -Bytes $TotalSize), $datetime  # UITWERKEN
    $Files | ForEach-Object {

        $_.Group | ForEach-Object {

            # progress
            $i++
            $pct = [int]($bytesProcessed/$TotalSize*100)
            $CurrentOp = "File {1} of {2}. Duplicates found: {0}. {5} processed. {3} in {4}" -f $DuplicateFilesFound, $i, $SuspectsFound, (Convert-HumanReadable -Bytes $_.Length), $_.FullName, (Convert-HumanReadable -Bytes $bytesProcessed)

            # Write-Progress (only include remaining time when 1MB processed and 10s has elapsed)
            if ($bytesProcessed -ge 1MB -and (New-TimeSpan -start $start).TotalSeconds -gt 10) {
                # calc seconds remaining
                $bytesPerSecond = $bytesProcessed / (New-TimeSpan -Start $start).TotalSeconds
                [long]$secRemaining = ($TotalSize - $bytesProcessed)/$bytesPerSecond  # was (100-$pct)*($TimeElapsed.TotalSeconds/$pct)

                Write-Progress -Activity $Activity  -CurrentOperation $CurrentOp -Percent $pct -SecondsRemaining $secRemaining

            } else {

                Write-Progress -Activity $Activity -CurrentOperation $CurrentOp -Percent $pct

            }

            # optionally read first KB to prevent invoking Get-FileHash when files are not duplicate, for instance, VOB or split ZIP files with same size
            # todo
            #Get-Content -Raw -AsByteStream # AsByteStream requires ps6! Raw requires ps3!
            
            # calculating file hash
            DupLog "Calculating filehash on $($_.length) bytes from $($_.fullname)" -NoScreenOutput

            try {
                $FileHash = (Get-FileHash -LiteralPath $_.FullName -Algorithm $Algorithm).Hash
            }
            catch {
                DupLog "Could not produce file hash for: $($_.FullName)"
            }

            
            $_ | Add-Member -MemberType NoteProperty -Name FileHash  -Value $FileHash
            $_ | Add-Member -MemberType NoteProperty -Name Algorithm -Value $Algorithm
            #$_ | Add-Member -MemberType NoteProperty -Name LiteralPath -Value $_.FullName # uitproberen pipe naar remove-item

            $bytesProcessed += $_.length

        }
        # group items have been amended with a filehash property

        # now group all files on FileHash property and produce results
        $DuplicateBytesFound = 0
        $_.Group |
        Where-Object { $null -ne $_.FileHash } |   # if FileHash is null file could probably not be opened for reading - filehash is still in properties, CHECK !!!!!!!!!!!!!!!!!!!!!!!!
        Group-Object -Property FileHash |
        Where-Object { $_.Count -gt 1 } |
        ForEach-Object {
            $DuplicateFilesFound++
            # produce output, unselected
            # to do: use type ps1xml
            $_.Group
            $DuplicateBytesFound += $_.Length
            $_.Group | ForEach-Object { DupLog ($("{0,12} bytes in {1}" -f $_.Length, $_.FullName)) -NoScreenOutput }
        }
    }

    Write-Progress -Activity "Calculating hash value" -Completed

    # summary: how many files traversed? how many files checked? how many dups found? how many GB dup out of total? how much time it took?
    # Convert-HumanReadable
    DupLog ("{0:N1} GB found in {1:N0} duplicate file groups. Time taken for hash calculations: {2:N1} hours" -f ($DuplicateBytesFound/1GB), $DuplicateFilesFound, (New-TimeSpan -start $start).TotalHours)

    if ($DuplicateFilesFound -EQ 0) { 
        DupLog 'No duplicate files found'
    } else {
        #DupLog ("{0} found in {1} files" -f ($DuplicateBytesFound), $DuplicateFilesFound)
    }
}




<#
.SYNOPSIS
This command uses Out-GridView to list duplicate files
#>

function Show-DuplicateFiles {
    [CmdletBinding()]
    Param(
        [Parameter(Mandatory=$true,
            ValueFromPipeline=$true,
            ValueFromPipelineByPropertyName=$true)]
        [PSCustomObject]$File
    )

    begin {
        # create empty collection
        $fileCollection = New-Object System.Collections.ArrayList
        Write-Progress -Activity 'Receiving files from pipeline'
    }

    process {
        $fileCollection.Add($_) | Out-Null
    }

    end {
        Write-Progress -Activity 'Receiving files from pipeline' -Completed
        $fileCollection | Select-Object FileHash, Length, FullName, FullNameOriginal | Out-GridView -OutputMode Multiple
    }
}




function Convert-HumanReadable {

    [CmdletBinding()]
    Param(
        [Parameter(ValueFromPipeline=$true)]
        [long]$Bytes,    # [decimal]?

        [Parameter(ValueFromPipeline=$true)]
        [string]$Text,    # [decimal]?

        [switch]$Pad
    )

    if ($Bytes) {
        switch($Bytes) {
            { $_ -ge 1PB }     { $r = "{0:n2} PB" -f ($_ / 1PB); break }
            { $_ -ge 1TB }     { $r = "{0:n2} TB" -f ($_ / 1TB); break }
            { $_ -ge 1GB }     { $r = "{0:n2} GB" -f ($_ / 1GB); break }
            { $_ -ge 1MB }     { $r = "{0:n2} MB" -f ($_ / 1MB); break }
            { $_ -ge 1KB }     { $r = "{0:n2} KB" -f ($_ / 1KB) }
            default            { $r = "{0} B " -f $_} 
        }
        if ($Pad) {
            $r.PadLeft(11)
        } else {
            $r
        }
    } else {
        # trim
        #$Text = $Text.Replace(' ', '')

        # to do: implement punctuation and globalization/culture
        if ($text -match '\.\,') {
            Write-Error 'No punctuation allowed (yet)'
        } else {
            Write-Verbose "$r"
            switch -regex ($Text) {
                # replace case insensitive
                "EB"    { $r = [int]($Text -Replace 'EB') * 1PB*1KB; break } # max 16 EB
                "PB"    { $r = [int]($Text -Replace 'PB') * 1PB; break } # max 16 EB
                "TB"    { $r = [int]($Text -Replace 'TB') * 1TB; break } # max 16 EB
                "GB"    { $r = [int]($Text -Replace 'GB') * 1GB; break } # max 16 EB
                "MB"    { $r = [int]($Text -Replace 'MB') * 1MB; break } # max 2GB
                "KB"    { $r = [int]($Text -Replace 'KB') * 1KB }
                default { $r = [int]($Text -Replace  'B') }
            }
        }
    }
}