Public/TMD.DataManipulation.ps1
## Data Reduction Functions Function Get-NameAndId { param( [Parameter(Mandatory = $false, ValueFromPipeline = $true)][AllowNull()]$Data = $null, [Parameter(Mandatory = $false, ValueFromPipeline = $false)][string]$IDName = 'id', [Parameter(Mandatory = $false, ValueFromPipeline = $false)][string]$IDValueFrom = 'Id', [Parameter(Mandatory = $false, ValueFromPipeline = $false)]$NameName = 'name', [Parameter(Mandatory = $false, ValueFromPipeline = $false)]$NameValueFrom = 'Name' ) Begin { } Process { ## Get the Name and ID of each of the objects. ## Using the defaults you'll have the JSON equivilent to { id: $_.Id; name: $_.Name } if ($Data) { ## Ensure that our data is wrapped with an array if ($Data.GetType() -eq 'System.Array') { foreach ($Item in $Data) { [System.Collections.Hashtable]@{ $IDName = $Item.$IDValueFrom $NameName = $Item.$NameValueFrom } } } else { [System.Collections.Hashtable]@{ $IDName = $Data.$IDValueFrom $NameName = $Data.$NameValueFrom } } } } End { } } Function ConvertTo-PlainObject { [CmdletBinding()] param ( [Parameter(Mandatory = $true, Position = 0, ValueFromPipeline = $true)][AllowNull()] $InputObject ) begin { } process { return [System.Management.Automation.PSSerializer]::Deserialize([System.Management.Automation.PSSerializer]::Serialize($InputObject)) } end { } } Function Optimize-DataObject { param( [Parameter(Mandatory = $false, ValueFromPipeline = $true)][AllowNull()]$Data = $null, [Parameter(Mandatory = $false, ValueFromPipeline = $false)][scriptblock]$ProcessingMap, [Parameter(Mandatory = $false, ValueFromPipeline = $false)][String]$StartLabel, [Parameter(Mandatory = $false, ValueFromPipeline = $false)][String]$EndLabel, [Parameter(Mandatory = $false, ValueFromPipeline = $false)][int]$Level = 0 ) Begin { ## Print Start Label, Accomodate Level if ($StartLabel) { for ($i = 0; $i -lt $Level; $i++) { Write-Host ' ' -NoNewline } Write-Host "$StartLabel" } } Process { if ($Data) { foreach ($Global:Item in $Data) { Invoke-Command -ScriptBlock $ProcessingMap -NoNewScope -ErrorAction Continue } } } End { ## Print Start Label, Accomodate Level if ($EndLabel) { Write-Host "$EndLabel" } } } Function Test-VariableExists { param( [Parameter(Mandatory = $true)][String]$Name, [Parameter(Mandatory = $false)][Switch]$PassThru ) if (Get-Variable -Name $Name -ErrorAction SilentlyContinue) { if ($PassThru) { $var = Get-Variable $Name return $var } else { return $true } return $true } else { return $false } } Function New-VariableName ([String]$string) { if ($string.Length -gt 0) { $delimeters = @(' ', '.', ',', '_', '-', '(', ')', '\', '/', '?', ':', '?') $string = Replace-VariableCharWithCamelCase -string $string -delimeter $delimeters $string = $string.trim() $string = $string.replace('#', 'Num') $string = Lowercase-FirstCharacter -string $string $string += 'Var' $string } else { $string } } Function ConvertTo-LowercaseFirstCharacter([String]$string) { $string.Substring(0, 1).ToLower() + $string.Substring(1, $string.Length - 1) } Function ConvertTo-UppercaseFirstCharacter([String]$string) { $string.Substring(0, 1).ToUpper() + $string.Substring(1, $string.Length - 1) } Function ConvertTo-VariableCharWithCamelCase( [String]$string, [String[]]$delimeter ) { foreach ($delim in $delimeter) { if ($string.contains($delim)) { $stringArr = $string.Split($delim) $tempString = '' for ($i = 0; $i -lt $stringArr.Count; $i++) { $word = $stringArr[$i].trim() if ($word.length -gt 0) { $tempString += Uppercase-FirstCharacter -string $word } } $string = $tempString } } $string } Function ConvertTo-Array { [CmdletBinding()] param( [Parameter(Mandatory = $true, ValueFromPipeline = $True)][AllowNull()]$InputObject ) begin { } process { ## Initalize the Return Array $ResultArray = @() ## Return an empty array on Null objects if (-Not $InputObject) { return $ResultArray } ## Switch based on the Input Object type $ObjectType = $InputObject.GetType().BaseType switch ($ObjectType) { 'string' { ## A string may be an array object, as a Json string, for example $PossibleArray = $InputObject #| ConvertFrom-Json if (($PossibleArray.GetType()).BaseType -eq 'System.Array') { $ResultArray = @($InputObject) break } ## Replace end characters and quotes $InputObject = $InputObject.Replace('[', '') $InputObject = $InputObject.Replace(']', '') $InputObject = $InputObject.Replace("'", '') $InputObject = $InputObject.Replace('"', '') ## Split on a comma if there is one if ($InputObject.contains(',')) { $ResultArray = $InputObject -Split ',', ' ' | ForEach-Object { $_.Trim() } } else { if ($InputObject -ne '') { $ResultArray = @($InputObject) } } break } 'System.Object[]' { if ($ObjectType.BaseType.FullName -eq 'System.Array') { $ResultArray = $InputObject break } break } 'System.Array' { return $InputObject } ## No Array type objects were matched, return whatever was passed in as the only object in an array Default { $ResultArray = @($InputObject) break } } ## The comma below is deliberate. PowerShell will (oddly) strip an array down to it's most basic form (a null, a string). ## It's related to pipeline optimization Reasons. See https://www.reddit.com/r/PowerShell/comments/6yogs2/functions_that_return_arrays/ ## using the Comma ultimately 'returns 2 objects back', of which the first (an empty object before the comma) is discarded, and the array ## then passed through, even if it is @(null) or @('One String') return , $ResultArray } end { } } Function ConvertTo-Boolean { param( [AllowNull()]$InputObject ) if ($null -ne $InputObject) { switch ($InputObject.GetType().ToString()) { 'System.String' { $trueStrings = @('yes', 'y', 'true', 't', '1') $falseStrings = @('no', 'n', 'false', 'f', '0') if ($trueStrings.Contains($InputObject.ToLower())) { return $true } if ($falseStrings.Contains($InputObject.ToLower())) { return $false } return $false } 'System.Boolean' { if ($InputObject) { return $true } else { return $false } return $false } 'System.Int32' { if ($InputObject -gt 0) { return $true } else { return $false } return $false } Default { try { $asString = [System.Convert]::ToString($InputObject) $stringLikeTrue = [System.Convert]::ToBoolean($asString) if ($stringLikeTrue) { return $true } else { return $false } } catch { return $false } } } } } function Convert-Size { [cmdletbinding()] param( [validateset('Bytes', 'KB', 'MB', 'GB', 'TB')] [string]$From, [validateset('Bytes', 'KB', 'MB', 'GB', 'TB')] [string]$To, [Parameter(Mandatory = $true)] [double]$Value, [int]$Precision = 4 ) switch ($From) { 'Bytes' { $value = $Value } 'KB' { $value = $Value * 1024 } 'MB' { $value = $Value * 1024 * 1024 } 'GB' { $value = $Value * 1024 * 1024 * 1024 } 'TB' { $value = $Value * 1024 * 1024 * 1024 * 1024 } } switch ($To) { 'Bytes' { return $value } 'KB' { $Value = $Value / 1KB } 'MB' { $Value = $Value / 1MB } 'GB' { $Value = $Value / 1GB } 'TB' { $Value = $Value / 1TB } } return [Math]::Round($value, $Precision, [MidPointRounding]::AwayFromZero) } ## Method to assess an object and return the key at the provided node (Obj.Prop.SubProp.Value) function Get-Value($object, $key) { $p1, $p2 = $key.Split('.') if ($p2) { return Get-Value -object $object.$p1 -key $p2 } else { return $object.$p1 } } ## Method for setting values in an object node.address.format like Get-Value above function Set-Value($object, $key, $Value) { ## Deal with appropriate typing if ($Value -match '^\d+$') { $Value = [Int64]$Value } if ($Value -eq 'true') { $Value = $True } if ($Value -eq 'false') { $Value = $False } ## Split the Key into comparable parts # $p1, $p2 = $key.Split(".") | Select-Object -First 2 $p1, $p2 = $key.Split('.') ## Is the node an array reference if ($p1 -like '*]') { ## This node is an array object, separte the array node and the array pointer $p1Node = $p1 | Select-String -Pattern '.*[a-zA-Z]' | ForEach-Object { $_.Matches[0].Value } $p1Pointer = $p1 | Select-String -Pattern '[0-9]' | ForEach-Object { $_.Matches[0].Value } if ($p2) { Set-Value -object $object.$p1Node[$p1Pointer] -key $p2 -Value $Value } else { $object.$p1 = $Value } } else { ## This object is a flat node object, not an array. # Write-Host "Node is an object" if ($p2) { Set-Value -object $object.$p1 -key $p2 -Value $Value } else { $object.$p1 = $Value } } } ## Method for getting a TimeSpan that is Human Readable format function Get-TimeSpanString { [CmdletBinding()] param ( [Parameter(Mandatory = $false, ParameterSetName = 'DateTime')] [datetime] $Start = (Get-Date), [Parameter(Mandatory = $True, ParameterSetName = 'DateTime', Position = 0 )] [datetime] $End, [Parameter(Mandatory = $True, ParameterSetName = 'Timespan', Position = 0)] [timespan] $Timespan, [Parameter(Mandatory = $False)] [int]$TimespanUnits = 3 ) process { $SecondsReference = @{ Year = (60 * 60 * 24 * 365) Month = (60 * 60 * 24 * 30) Day = (60 * 60 * 24) Hour = (60 * 60) Minute = 60 } ## Calculate the Timespan if ($PSCmdlet.ParameterSetName -eq 'DateTime') { $Timespan = (New-TimeSpan -Start $Start -End $End) } ## Get the Total number of seconds of the timspand (abs, to force a positive number) $TimespanSeconds = [math]::Abs($Timespan.TotalSeconds) ## Create a Span String to collect the required parts to $SpanStringParts = [System.Collections.ArrayList]@() ## Record how many years $Years = [math]::Floor($TimespanSeconds / $SecondsReference.Year) if ($Years -and ($SpanStringParts.Count -lt $TimespanUnits)) { [void]($SpanStringParts.Add("$($Years)y")) $TimespanSeconds -= ($Years * $SecondsReference.Year) } ## Record how many Months $Months = [math]::Floor($TimespanSeconds / $SecondsReference.Month) if ($Months -and ($SpanStringParts.Count -lt $TimespanUnits)) { [void]($SpanStringParts.Add("$($Months)Mo")) $TimespanSeconds -= ($Months * $SecondsReference.Month) } ## Record how many Days $Days = [math]::Floor($TimespanSeconds / $SecondsReference.Day) if ($Days -and ($SpanStringParts.Count -lt $TimespanUnits)) { [void]($SpanStringParts.Add("$($Days)d")) $TimespanSeconds -= ($Days * $SecondsReference.Day) } ## Record how many Hours $Hours = [math]::Floor($TimespanSeconds / $SecondsReference.Hour) if ($Hours -and ($SpanStringParts.Count -lt $TimespanUnits)) { [void]($SpanStringParts.Add("$($Hours)h")) $TimespanSeconds -= ($Hours * $SecondsReference.Hour) } ## Record how many Minutes $Minutes = [math]::Floor($TimespanSeconds / $SecondsReference.Minute) if ($Minutes -and ($SpanStringParts.Count -lt $TimespanUnits)) { [void]($SpanStringParts.Add("$($Minutes)m")) $TimespanSeconds -= ($Minutes * $SecondsReference.Minute) } ## Record how many Seconds if ($TimespanSeconds -and ($SpanStringParts.Count -lt $TimespanUnits)) { [void]($SpanStringParts.Add("$([math]::Floor($TimespanSeconds))s")) } ## Record how many Seconds if ($SpanStringParts.Count -lt $TimespanUnits) { [void]($SpanStringParts.Add("$($Timespan.Milliseconds)ms")) } ## Combine and return the Span String $SpanStringParts -join ' ' } } function Split-CsvFile { <# .SYNOPSIS Split a CSV file into small batches .DESCRIPTION This function consumes a CSV file and splits it into batches, using the defined splitting parameters .PARAMETER Path Path to the CSV File to split .PARAMETER OutputFolder Folder path to save the split files to. The folder (path) will be created if it does not already exist .PARAMETER BatchSize The number of lines of CSV data to add to each of the output files .PARAMETER Excel Converts the output files to Xlsx format. .EXAMPLE Split-CsvFile -Path 'Big.csv' -BatchSize 5000 .EXAMPLE Split-CsvFile -Path 'Big.csv' -BatchSize 20000 -OutFolderPath './Split' .EXAMPLE Split-CsvFile -Path 'Big.csv' -BatchSize 20000 -OutFolderPath './Split' -Excel .EXAMPLE Split-CsvFile -Path @('Medium.csv', 'Big.csv') -BatchSize 5000 .OUTPUTS None #> [CmdletBinding()] # Always add CmdletBinding to expose the common Cmdlet variables [OutputType([Bool])] # Add this if the function has an output param( [Parameter(Mandatory = $true, Position = 0, ValueFromPipeline = $true, # Make sure you have a process block if the function accepts value(s) from the pipeline ValueFromPipelineByPropertyName = $true)] [String[]]$Path, [Parameter(Mandatory = $false, Position = 1, ValueFromPipeline = $true, # Make sure you have a process block if the function accepts value(s) from the pipeline ValueFromPipelineByPropertyName = $true)] [String]$OutFolderPath, [Parameter(Mandatory = $False, Position = 2, ValueFromPipeline = $true, # Make sure you have a process block if the function accepts value(s) from the pipeline ValueFromPipelineByPropertyName = $true)] [Int32]$BatchSize = 25000, [Parameter(Mandatory = $False, Position = 2, ValueFromPipeline = $true, # Make sure you have a process block if the function accepts value(s) from the pipeline ValueFromPipelineByPropertyName = $true)] [Switch]$Excel, [Parameter(Mandatory = $False, ValueFromPipeline = $true, # Make sure you have a process block if the function accepts value(s) from the pipeline ValueFromPipelineByPropertyName = $true)] [Switch]$RenameSource ) process { if (-Not $OutFolderPath) { $OutFolderPath = ($Path[0] | Split-Path -Parent) } if ($Excel.IsPresent) { Import-Module ImportExcel } ## Create the OutFolderPath Test-FolderPath $OutFolderPath # Iterate over the Files provided foreach ($File in $Path) { ## Read the Header Row if (-Not (Test-Path $File)) { throw "File: $File does not exist" } ## Begin a FileStream Reader $BaseFileName = Split-Path -LeafBase -Path $File if ((Split-Path -Extension -Path $File) -ne '.csv') { throw "The provided file is not a csv file" } $OriginalFileReader = New-Object System.IO.StreamReader $File $HeaderRow = $OriginalFileReader.ReadLine() ## Start our Counters and Read the New File $BatchNumber = 1 $OutputLines = [System.Collections.ArrayList]@() while ($null -ne ($Line = $OriginalFileReader.ReadLine())) { ## Add this line [void]($OutputLines.Add($Line)) ## Carve off a job to analyze these rows if ($OutputLines.Count -eq $BatchSize) { ## Add the Header Row to the data $OutputLines.Insert(0, $HeaderRow) # Write the batch to the file if ($Excel.IsPresent) { $OutFilePath = Join-Path $OutFolderPath ("$($BaseFileName)_part_$($BatchNumber).xlsx") $Data = ($OutputLines -join "`r`n" | Out-String | ConvertFrom-Csv) Export-Excel -Path $OutFilePath -InputObject $data Remove-Variable Data } else { $OutFilePath = Join-Path $OutFolderPath ("$($BaseFileName)_part_$($BatchNumber).csv") Set-Content -Path $OutFilePath -Force -Encoding utf8 -Value $OutputLines } ## Empty the batch array and increment our Batch Number $OutputLines.Clear() $BatchNumber++ } } # Close the File Reader $OriginalFileReader.Close() Remove-Variable OriginalFileReader ## Add this line [void]($OutputLines.Add($Line)) if ($Excel.IsPresent) { $OutFilePath = Join-Path $OutFolderPath ("$($BaseFileName)_part_$($BatchNumber).xlsx") $Data = ($OutputLines -join "`r`n" | Out-String | ConvertFrom-Csv) Export-Excel -Path $OutFilePath -InputObject $data -AutoSize Remove-Variable Data } else { ## Write the remainder of the data to the final CSV file $OutFilePath = Join-Path $OutFolderPath ("$($BaseFileName)_part_$($BatchNumber).csv") Set-Content -Path $OutFilePath -Encoding utf8 -Force -Value $OutputLines } ## Rename the input file so it's not split again if ($RenameSource.IsPresent) { Move-Item $File -Destination ("$File.split") -Force } } } } |