MergeCsv.psm1
<#
.SYNOPSIS Merges an arbitrary amount of CSV files based on an ID column or several combined ID columns. Also works on custom PowerShell objects, with the InputObject parameter. Copyright Joakim Svendsen (C) 2014-2016 All rights reserved. MIT license. Online documentation: http://www.powershelladmin.com/wiki/Merge_CSV_files_or_PSObjects_in_PowerShell .PARAMETER Id Shared ID property/header (multiple supported). .PARAMETER Path CSV files to process. .PARAMETER InputObject Custom PowerShell objects to process. .PARAMETER Delimiter Optional delimiter that's used if you pass file paths (default is a comma). .PARAMETER Separator Optional multi-ID column string separator (default "#Merge-Csv-Separator#"). .PARAMETER AllowDuplicates Allow and aggregate duplicate entries (IDs) in the order they occur. .EXAMPLE ipcsv users.csv | ft -AutoSize Username Department -------- ---------- John IT Jane HR PS C:\> ipcsv user-mail.csv | ft -AutoSize Username Email -------- ----- John john@example.com Jane jane@example.com PS C:\> Merge-Csv -Path users.csv, user-mail.csv -Id Username | Export-Csv -enc UTF8 merged.csv PS C:\> ipcsv .\merged.csv | ft -AutoSize Username Department Email -------- ---------- ----- John IT john@example.com Jane HR jane@example.com .EXAMPLE Merge-Csv -In (ipcsv .\csv1.csv), (ipcsv csv2.csv), (ipcsv csv3.csv) -Id Username | Sort-Object username | ft -AutoSize Merging three files. WARNING: Duplicate identifying (shared column(s) ID) entry found in CSV data/file 0: user42 WARNING: Identifying column entry 'firstOnly' was not found in all CSV data objects/files. Found in object/file no.: 1 WARNING: Identifying column entry '2only' was not found in all CSV data objects/files. Found in object/file no.: 2 WARNING: Identifying column entry 'user2and3only' was not found in all CSV data objects/files. Found in object/file no.: 2, 3 Username File1A File1B TestID File2A File2B TestX File3 -------- ------ ------ ------ ------ ------ ----- ----- 2only a b c firstOnly firstOnlyA1 firstOnlyB1 foo user1 1A1 1B1 same 1A3 2A3 same same user2 2A1 2B1 diff2 2A3 2B3 diff2_2 testC2 user2and3only 2and3A2 2and3B2 test2and3X testID user3 3A1 3B1 same 3A3 3B3 same same user42 42A1 42B1 same42 testA42 testB42 testX42 testC42 .EXAMPLE Merge-Csv -Path csvmerge1.csv, csvmerge2.csv, csvmerge3.csv -Id Username, TestID | Sort-Object username | ft -a Two shared/ID column, three files. WARNING: Duplicate identifying (shared column(s) ID) entry found in CSV data/file 1: user42, same42 WARNING: Identifying column entry 'user2, diff2' was not found in all CSV data objects/files. Found in object/file no.: 1 WARNING: Identifying column entry 'user2and3only, testID' was not found in all CSV data objects/files. Found in object/file no.: 3 WARNING: Identifying column entry 'user2, testC2' was not found in all CSV data objects/files. Found in object/file no.: 3 WARNING: Identifying column entry '2only, c' was not found in all CSV data objects/files. Found in object/file no.: 2 WARNING: Identifying column entry 'user2and3only, test2and3X' was not found in all CSV data objects/files. Found in object/file no.: 2 WARNING: Identifying column entry 'user2, diff2_2' was not found in all CSV data objects/files. Found in object/file no.: 2 WARNING: Identifying column entry 'firstOnly, foo' was not found in all CSV data objects/files. Found in object/file no.: 1 Username TestID File1A File1B File2A File2B -------- ------ ------ ------ ------ ------ 2only c a b firstOnly foo firstOnlyA1 firstOnlyB1 user1 same 1A1 1B1 1A3 2A3 user2 diff2 2A1 2B1 user2 diff2_2 2A3 2B3 user2 testC2 user2and3only testID user2and3only test2and3X 2and3A2 2and3B2 user3 same 3A1 3B1 3A3 3B3 user42 same42 42A1 42B1 testA42 testB42 .EXAMPLE Merge-Csv -Path csv1.csv, csv2.csv, csv3.csv -Id ID -AllowDuplicates | ft -AutoSize ID 1Title1 1Title2 1Title3 2Title1 2Title2 3Title1 -- ------- ------- ------- ------- ------- ------- FooBar x y z blorp dongs first3 FooBar xxx yyy second3 FooBar third3 Svendsen a b c e f SvenData3 Svendsen aa bb cc ee ff Svendsen aaa eee fff #> function Merge-Csv { [CmdletBinding( DefaultParameterSetName='Files' )] param( # Shared ID column(s)/header(s). [Parameter(Mandatory=$true)][ValidateNotNullOrEmpty()][string[]] $Id, # CSV files to process. [Parameter(ParameterSetName='Files',Mandatory=$true)][ValidateScript({Test-Path $_ -PathType Leaf})][string[]] $Path, # Custom PowerShell objects to process. [Parameter(ParameterSetName='Objects',Mandatory=$true)][psobject[]] $InputObject, # Optional delimiter that's used if you pass file paths (default is a comma). [Parameter(ParameterSetName='Files')][string] $Delimiter = ',', # Optional multi-ID column string separator (default "#Merge-Csv-Separator#"). [string] $Separator = '#Merge-Csv-Separator#', # Allow duplicate entries (IDs). [switch] $AllowDuplicates) # v1.4 as a module - 2016-10-28 - adding module format prerequisites, cleaning up redundant code # v1.4 - 2016-09-16 - Added support for handling duplicate IDs. [psobject[]] $CsvObjects = @() if ($PSCmdlet.ParameterSetName -eq 'Files') { $CsvObjects = foreach ($File in $Path) { ,@(Import-Csv -Delimiter $Delimiter -Path $File) } } else { $CsvObjects = $InputObject } $Headers = @() foreach ($Csv in $CsvObjects) { $Headers += , @($Csv | Get-Member -MemberType NoteProperty | Select-Object -ExpandProperty Name) } $Counter = 0 foreach ($h in $Headers) { $Counter++ foreach ($Column in $Id) { if ($h -notcontains $Column) { Write-Error "Headers in object/file $Counter don't include $Column. Exiting." return } } } $HeadersFlatNoShared = @($Headers | ForEach-Object { $_ } | Where-Object { $Id -notcontains $_ }) if ($HeadersFlatNoShared.Count -ne @($HeadersFlatNoShared | Sort-Object -Unique).Count) { Write-Error "Some headers are shared. Are you just looking for '@(ipcsv csv1) + @(ipcsv csv2) | Export-Csv ...'?`nTo remove duplicate (between the files to merge) headers from a CSV file, Import-Csv it, pass it to Select-Object, and omit the duplicate header(s)/column(s).`nExiting." return } $SharedColumnHashes = @() $SharedColumnCount = $Id.Count $Counter = 0 foreach ($Csv in $CsvObjects) { $SharedColumnHashes += @{} $Csv | ForEach-Object { $CurrentID = $(for ($i = 0; $i -lt $SharedColumnCount; $i++) { $_ | Select-Object -ExpandProperty $Id[$i] -EA SilentlyContinue }) -join $Separator if (-not $SharedColumnHashes[$Counter].ContainsKey($CurrentID)) { $SharedColumnHashes[$Counter].Add($CurrentID, @($_ | Select-Object -Property $Headers[$Counter])) } else { if ($AllowDuplicates) { $SharedColumnHashes[$Counter].$CurrentID += $_ | Select-Object -Property $Headers[$Counter] } else { Write-Warning ("Duplicate identifying (shared column(s) ID) entry found in CSV data/file $($Counter+1): " + ($CurrentID -replace [regex]::Escape($Separator), ', ')) } } } $Counter++ } $Result = @{} $NotFound = @{} foreach ($Counter in 0..($SharedColumnHashes.Count-1)) { foreach ($InnerCounter in (0..($SharedColumnHashes.Count-1) | Where-Object { $_ -ne $Counter })) { foreach ($Key in $SharedColumnHashes[$Counter].Keys) { Write-Verbose "Key: $Key, Counter: $Counter, InnerCounter: $InnerCounter" $Obj = New-Object -TypeName PSObject if ($SharedColumnHashes[$InnerCounter].ContainsKey($Key)) { foreach ($Header in $Headers[$InnerCounter] | Where-Object { $Id -notcontains $_ }) { Add-Member -InputObject $Obj -MemberType NoteProperty -Name $Header -Value ($SharedColumnHashes[$InnerCounter].$Key | Select-Object $Header) } } else { foreach ($Header in $Headers[$Counter]) { if ($Id -notcontains $Header) { Add-Member -InputObject $Obj -MemberType NoteProperty -Name $Header -Value ($SharedColumnHashes[$Counter].$Key | Select-Object $Header) } } if (-not $NotFound.ContainsKey($Key)) { $NotFound.Add($Key, @($Counter)) } else { $NotFound[$Key] += $Counter } } if (-not $Result.ContainsKey($Key)) { $Result.$Key = $Obj } else { foreach ($Property in @($Obj | Get-Member -MemberType NoteProperty | Select-Object -ExpandProperty Name)) { if (-not ($Result.$Key | Get-Member -MemberType NoteProperty -Name $Property)) { Add-Member -InputObject $Result.$Key -MemberType NoteProperty -Name $Property -Value $Obj.$Property #-EA SilentlyContinue } } } } } } if ($NotFound) { foreach ($Key in $NotFound.Keys) { Write-Warning "Identifying column entry '$($Key -replace [regex]::Escape($Separator), ', ')' was not found in all CSV data objects/files. Found in object/file no.: $( if ($NotFound.$Key) { ($NotFound.$Key | ForEach-Object { ([int]$_)+1 } | Sort-Object -Unique) -join ', '} elseif ($CsvObjects.Count -eq 2) { '1' } else { 'none' } )" } } #$Global:Result = $Result $Counter = 0 [hashtable[]] $SharedHeadersNoDuplicate = $Id | ForEach-Object { @{n="$($Id[$Counter])";e=[scriptblock]::Create("(`$_.Name -split ([regex]::Escape('$Separator')))[$Counter]")} $Counter++ } [hashtable[]] $HeaderPropertiesNoDuplicate = $HeadersFlatNoShared | ForEach-Object { @{n=$_.ToString(); e=[scriptblock]::Create("`$_.Value.'$_' | Select -ExpandProperty '$_'")} } [int] $HeadersFlatNoSharedCount = $HeadersFlatNoShared.Count # Return results. if (-not $AllowDuplicates) { $Result.GetEnumerator() | Select-Object -Property ($SharedHeadersNoDuplicate + $HeaderPropertiesNoDuplicate) } else { $Result.GetEnumerator() | ForEach-Object { # Latching on support for duplicate objects. Insanely inefficient. # Variable for the count of duplicates we find. Initialize to 1 for each array of PSobjects for each ID. $MaxDuplicateCount = 1 foreach ($Title in $_.Value | Get-Member -MemberType NoteProperty | Select-Object -ExpandProperty Name) { $Count = @($_.Value.$Title).Count # find max count for this instance (if at all higher than 1) # duplicates are processed in the order they occur if ($MaxDuplicateCount -lt $Count) { $MaxDuplicateCount = $Count } } Write-Verbose "Max duplicate count: $MaxDuplicateCount" foreach ($i in 0..($MaxDuplicateCount-1)) { # Add ID(s) once to each object. $Obj = $null $Obj = New-Object -TypeName PSObject foreach ($TempID in $Id) { Add-Member -InputObject $Obj -MemberType NoteProperty -Name $TempID -Value $_.Name } foreach ($NumHeader in 0..($HeadersFlatNoSharedCount-1)) { try { $Value = ($_.Value.($HeadersFlatNoShared[$NumHeader]))[$i] | Select-Object -ExpandProperty $HeadersFlatNoShared[$NumHeader] } catch { Write-Verbose "Caught out of bounds in array." $Value = '' | Select-Object -Property $HeadersFlatNoShared[$NumHeader] } Add-Member -InputObject $Obj -MemberType NoteProperty -Name $HeadersFlatNoShared[$NumHeader] -Value $Value } $Obj | Select-Object -Property ($Id + $HeadersFlatNoShared) } } } } #Export-ModuleMember -Function Merge-Csv |