## Test-BloomFilter.ps1

<#PSScriptInfo
.VERSION 1.0
.GUID b46be54d-9f7e-42db-bace-18557fa1a91e
.AUTHOR Lee Holmes
#>


<#
 
.DESCRIPTION
Tests for the existence of items in a data set, based on the output of New-BloomFilter.
If this function returns 'False' (or emits the item in -PassThru mode), you can be 100%
certain that the item was not part of the original training data set.
 
The efficiency of this encoding comes with a very slight false positive rate. That is, one
in a billion times, this function will return 'True' (or not emit an item in -PassThru
mode) for an item that was not in the original data set.
 
.EXAMPLE
## Create the data set based on strings of name, last write time, and length
PS > $system32files = dir c:\windows\system32 | % { "{0},{1},{2}" -f $_.Name,$_.LastWriteTime,$_.Length }
PS > $filterData = $system32files | New-BloomFilter -ExpectedItemCount $system32files.Length
 
## Tamper with one item in the data set, and then test
PS > $system32files[337] = "KBDDIV1.DLL,12/7/2021 1:08:55 AM,7685"
PS > $system32files | Test-BloomFilter.ps1 -FilterData $filterData -PassThru
 
KBDDIV1.DLL,12/7/2021 1:08:55 AM,7685
 
This example creates a data set of all files in System32 (Name, Last Write Time, and Length),
and outputs any that change.
 
.EXAMPLE
## Create the data set based on the string output of Get-FileHash
PS > $system32hashes = dir c:\windows\system32 -File | Get-FileHash | Out-String -Stream -Width 999
PS > $filterData = $system32hashes | New-BloomFilter -ExpectedItemCount $system32hashes.Length
 
## Tamper with one item in the data set, and then test
PS > $system32hashes[337] = $system32hashes[337] -replace '0','1'
PS > $system32hashes | Test-BloomFilter.ps1 -FilterData $filterData -PassThru
 
SHA256 9A47888C8118A34111475E92663C1A4D9E3E1E26289B6B16558886631F6FD89B C:\Windows\System32\BrowserSettingSync.dll
 
This example creates a data set of all files in System32 based on the output of Get-FileHash, and uses the
Out-String -Stream cmdlet to capture this output as strings. It then outputs any that change.
 
#>

param(
    ## Candidate item to look up in the filter. Accepted from the pipeline, one
    ## item at a time. Although declared as [object], the process block rejects
    ## any value that is not a String.
    [Parameter(Mandatory = $true, ValueFromPipeline = $true)]
    [object] $InputObject,

    ## Base64-encoded filter produced by New-BloomFilter from the training data.
    [Parameter(Mandatory = $true)]
    [string] $FilterData,

    ## Optional key that is mixed into every hash, much like a password salt.
    ## Supplying one stops an attacker who knows a bloom filter is in use from
    ## crafting data that deliberately lands on a false positive.
    [Parameter()]
    [string] $DatasetKey,

    ## When set, items that were NOT in the original data set are written to the
    ## output and nothing is written for items that match. When omitted, the
    ## script writes $True / $False for each input item instead.
    [Parameter()]
    [switch] $PassThru
)

begin
{
    ## One-time setup shared by every pipeline item: decode the filter into a
    ## bit array and create the hash algorithm used to derive bit positions.

    ## The filter is designed for a one in a billion false positive rate.
    ## Using 30 hash functions for this false positive rate and most data
    ## set sizes creates an optimal data set size.
    $hashFunctionCount = 30
    [uint32] $bitsetCount = 0

    ## Retrieve the bitset and bitset size from the encoded FilterData representation.
    ## FromBase64String throws if FilterData is not valid Base64.
    [byte[]] $filterDataBytes = [Convert]::FromBase64String($FilterData)

    ## Whitespace-only input decodes to zero bytes. Fail here with a clear message
    ## rather than dividing by a zero bit count for every item in the process block.
    if($filterDataBytes.Count -eq 0)
    {
        throw "FilterData did not decode to any bytes. Supply the Base64 output of New-BloomFilter."
    }

    $bitsetCount = $filterDataBytes.Count * 8
    $bitset = [Collections.BitArray]::New($filterDataBytes)

    ## Use a SHA256 hash for the bytes of the multiple hash indexes.
    ## Create() is the static factory declared on the SHA256 base class, so it
    ## is called there directly instead of through a derived type name.
    $hasher = [System.Security.Cryptography.SHA256]::Create()
}
process
{
    ## Runs once per pipeline item. Writes $false (or the item itself when
    ## -PassThru is set) if the item is definitely not in the filter, and
    ## $true (or nothing when -PassThru is set) if every probed bit is set.

    ## Only strings are supported; anything else is rejected with guidance.
    if($InputObject -isnot [String])
    {
        throw "Input objects must be supplied as strings. To use the formatted output you " +
            "see in PowerShell, use 'Out-String -Stream -Width 999' as shown in the help examples."
    }

    ## Derive one bit position per hash function and check that bit in the bitset.
    for($currentHashIndex = 0; $currentHashIndex -lt $hashFunctionCount; $currentHashIndex++)
    {
        ## The hashed text is the data set key, then the hash index, then the item,
        ## encoded as UTF-16LE. Mixing in the index yields a different hash per round.
        $itemBytes = [System.Text.Encoding]::Unicode.GetBytes($DatasetKey + $currentHashIndex + $InputObject)
        $hashBytes = $hasher.ComputeHash($itemBytes)

        ## Map the first four hash bytes onto a position inside the bitset.
        $currentHashOffset = [BitConverter]::ToUInt32($hashBytes, 0) % $bitsetCount

        ## If the bitset does not have an entry set at this position, we can know for certain that
        ## the item was not part of the training data.
        if(-not $bitset[$currentHashOffset])
        {
            ## In -PassThru mode, emit the object itself
            if($PassThru.IsPresent)
            {
                return $InputObject
            }

            ## Otherwise just report that the item was not in the training data.
            return $false
        }
    }

    ## If all the bits were set, this very probably was part of the training data.
    ## Only emit $true / $false if they didn't specify -PassThru
    if(-not $PassThru.IsPresent)
    {
        return $true
    }
}
end
{
    ## Release the hash algorithm created in the begin block once all input
    ## has been processed.
    if($hasher)
    {
        $hasher.Dispose()
    }
}