Read-HtmlTable.ps1
<#PSScriptInfo
.VERSION 2.0.1
.GUID 6ddb4b24-29bc-4268-a62f-402b3ee28e3d
.AUTHOR iRon
.COMPANYNAME
.COPYRIGHT
.TAGS Read Extract Scrape ConvertFrom Html Table
.LICENSE https://github.com/iRon7/Read-HtmlTable/LICENSE
.PROJECTURI https://github.com/iRon7/Read-HtmlTable
.ICON https://raw.githubusercontent.com/iRon7/Read-HtmlTable/main/Read-HtmlTable.png
.EXTERNALMODULEDEPENDENCIES
.REQUIREDSCRIPTS
.EXTERNALSCRIPTDEPENDENCIES
.RELEASENOTES
.PRIVATEDATA
#>

<#
.SYNOPSIS
    Reads a HTML table

.DESCRIPTION
    Scrapes (extracts) a HTML table from a string or the internet location

.INPUTS
    String or Uri

.OUTPUTS
    PSCustomObject[]

.PARAMETER InputObject
    The html content (string) that contains a html table.

    If the string is less than 2048 characters and contains a valid uri protocol, the content is downloaded
    from the concerned location.

.PARAMETER Uri
    A uri location referring to the html content that contains the html table

.PARAMETER Header
    Specifies an alternate column header row for the imported string. The column header determines the property
    names of the objects created by ConvertFrom-Csv.

    Enter column headers as a comma-separated list. Do not enclose the header string in quotation marks.
    Enclose each column header in single quotation marks.

    If you enter fewer column headers than there are data columns, the remaining data columns are discarded.
    If you enter more column headers than there are data columns, the additional column headers are created
    with empty data columns.

    A $Null instead of a column name, will span the respective column with previous column.

    Note: To select specific columns or skip any data (or header) rows, use Select-Object cmdlet

.PARAMETER TableIndex
    Specifies which tables should be selected from the html content (where 0 refers to the first table).
    By default, all tables are extracted from the content.

    Note: in case of multiple tables, the headers should be unified to properly output or display of each table.
    (see: https://github.com/PowerShell/PowerShell/issues/13906)

.PARAMETER Separator
    Specifies the characters used to join a header which is spanned over multiple columns.
    (default: space character)

.PARAMETER Delimiter
    Specifies the characters used to join a header which is spanned over multiple rows.
    (default: the newline characters used by the operating system)

.PARAMETER NoTrim
    By default, all header - and data text is trimmed, to disable trimming, use the -NoTrim parameter.

.EXAMPLE

    Read-HTMLTable https://github.com/iRon7/Read-HtmlTable

    Product            Invoice           Invoice    Invoice
    Item               Qauntity          @          Price
    ------------------ ----------------- ---------- --------------
    Paperclips (Box)   100               1.15       115.00
    Paper (Case)       10                45.99      459.90
    Wastepaper Baskets 10                17.99      35.98
    Subtotal           Subtotal          Subtotal   610.88
    Tax                Tax               7%         42.76
    Total              Total             Total      653.64

.LINK
    https://github.com/iRon7/Read-HtmlTable
#>

[CmdletBinding(DefaultParameterSetName='Html')][OutputType([Object[]])] param(
    [Parameter(ParameterSetName='Html', ValueFromPipeLine = $True, Mandatory = $True, Position = 0)][String]$InputObject,
    [Parameter(ParameterSetName='Uri', ValueFromPipeLine = $True, Mandatory = $True)][Uri]$Uri,
    [Object[]]$Header,
    [Int[]]$TableIndex,
    # NOTE(review): $Separator is declared and documented but not referenced anywhere in the visible script.
    [String]$Separator = ' ',
    [String]$Delimiter = [System.Environment]::NewLine,
    [Switch]$NoTrim
)

Begin {
    # Writes the given string into a new 'HTMLFile' COM document and returns that document.
    # The caller is responsible for releasing the returned COM object.
    function ParseHtml($String) {
        $Unicode = [System.Text.Encoding]::Unicode.GetBytes($String)
        $Html = New-Object -Com 'HTMLFile'
        # Use whichever write method the COM object exposes.
        if ($Html.PSObject.Methods.Name -Contains 'IHTMLDocument2_Write') { $Html.IHTMLDocument2_Write($Unicode) } else { $Html.write($Unicode) }
        $Html.Close()
        $Html
    }

    # Emits the pipeline element itself when its tagName is one of $TagName (case-insensitive),
    # otherwise recurses into its children. Descent stops at the first match on each branch,
    # so elements nested inside a matching element are not returned.
    filter GetTopElement([String[]]$TagName) {
        if ($TagName -Contains $_.tagName) { $_ }
        else { @($_.Children).Where{ $_ } | GetTopElement -TagName $TagName }
    }

    # Returns $Data[$x][$y] when both indices are within range; otherwise returns nothing.
    function GetUnit($Data, [int]$x, [int]$y) {
        if ($x -lt $Data.Count -and $y -lt $Data[$x].Count) { $Data[$x][$y] }
    }

    # Stores $Unit at $Data[$x][$y], first growing the column list and the
    # column itself (padded with $Null) until the position exists.
    function SetUnit($Data, [int]$x, [int]$y, [HashTable]$Unit) {
        while ($x -ge $Data.Count) { $Data.Add([System.Collections.Generic.List[HashTable]]::new()) }
        while ($y -ge $Data[$x].Count) { $Data[$x].Add($Null) }
        $Data[$x][$y] = $Unit
    }

    # Converts table rows into a column-major grid ($Data[$x][$y]) of hashtables.
    # Each unit has a Text entry plus ColSpan/RowSpan flags marking positions that
    # were filled by a cell spanning from another column/row.
    function GetData([__ComObject[]]$TRs) {
        $Data = [System.Collections.Generic.List[System.Collections.Generic.List[HashTable]]]::new()
        $y = 0
        foreach($TR in $TRs) {
            $x = 0
            foreach($TD in ($TR |GetTopElement 'th', 'td')) {
                while ($True) { # Skip any row spans
                    $Unit = GetUnit -Data $Data -x $x -y $y
                    if (!$Unit) { break }
                    $x++
                }
                $Text = if ($Null -ne $TD.innerText) { if ($NoTrim) { $TD.innerText } else { $TD.innerText.Trim() } }
                for ($r = 0; $r -lt $TD.rowspan; $r++) {
                    $y1 = $y + $r
                    for ($c = 0; $c -lt $TD.colspan; $c++) {
                        $x1 = $x + $c
                        $Unit = GetUnit -Data $Data -x $x1 -y $y1
                        if ($Unit) { SetUnit -Data $Data -x $x1 -y $y1 -Unit @{ ColSpan = $True; Text = $Unit.Text, $Text } } # RowSpan/ColSpan overlap
                        else { SetUnit -Data $Data -x $x1 -y $y1 -Unit @{ ColSpan = $c -gt 0; RowSpan = $r -gt 0; Text = $Text } }
                    }
                }
                $x++
            }
            $y++
        }
        ,$Data # the unary comma prevents the list from being unrolled on output
    }
}

Process {
    # Resolve the download location in a local variable: assigning to $Uri itself would
    # persist into the next pipeline record and cause a later html string to be ignored.
    $Location = $Uri
    if (!$Location -and $InputObject.Length -le 2048 -and ([Uri]$InputObject).AbsoluteUri) { $Location = [Uri]$InputObject }
    $Response = if ($Location -is [Uri] -and $Location.AbsoluteUri) { Try { Invoke-WebRequest $Location } Catch { Throw $_ } }
    $Html = if ($Response) { ParseHtml $Response.RawContent } else { ParseHtml $InputObject }

    try {
        # Dedicated table counter; it must not be shared with any inner loop index.
        $TableNumber = 0
        foreach($Table in ($Html.Body |GetTopElement 'table')) {
            $Selected = !$PSBoundParameters.ContainsKey('TableIndex') -or $TableNumber -In $TableIndex
            $TableNumber++
            if (!$Selected) { continue }

            # @() guarantees an array, also when the table holds a single row.
            $Rows = @($Table |GetTopElement 'tr')
            if (!$Rows) { continue } # skip an empty table but keep processing the following ones

            if ($PSBoundParameters.ContainsKey('Header')) {
                $HeadRows = @()
                $Data = GetData $Rows
            }
            else {
                # Tag each row with an id so the header rows can be excluded from the data rows below.
                for ($r = 0; $r -lt $Rows.Count; $r++) { $Rows[$r].id = "id_$r" }
                $THead = $Table |GetTopElement 'thead'
                $HeadRows = @(
                    if ($THead) { $THead |GetTopElement 'tr' }
                    else { $Rows.Where({ !($_ |GetTopElement 'th') }, 'Until' ) }
                )
                # No header rows found (or every row looks like a header): use the first row as header.
                if (!$HeadRows -or $HeadRows.Count -eq $Rows.Count) { $HeadRows = $Rows[0] }
                $Head = GetData $HeadRows
                $Data = GetData ($Rows.Where{ $_.id -notin $HeadRows.id })
                $Header = @(
                    for ($x = 0; $x -lt $Head.Count; $x++) {
                        if ($Head[$x].Where({ !$_.ColSpan }, 'First') ) { ,@($Head[$x].Where{ !$_.RowSpan }.ForEach{ $_.Text }) }
                        else { $Null } # aka spanned header column
                    }
                    # Data columns beyond the header width get an empty name (replaced by '1' below).
                    for ($x = $Head.Count; $x -lt $Data.Count; $x++) {
                        if ($Null -ne $Data[$x].Where({ $_ -and !$_.ColSpan }, 'First') ) { '' }
                    }
                )
            }

            # Join multi-row header parts with $Delimiter; empty names become '1'; $Null (spanned) is kept.
            $Header = $Header.ForEach{
                if ($Null -eq $_) { $Null }
                else {
                    $Name = [String[]]$_
                    $Name = if ($NoTrim) { $Name -Join $Delimiter }
                            else { (($Name.ForEach{ $_.Trim() }) -Join $Delimiter).Trim() }
                    if ($Name) { $Name } else { '1' }
                }
            }

            # Make the header names unique (case-insensitive) by incrementing a numeric suffix.
            $Unique = [System.Collections.Generic.HashSet[String]]::new([StringComparer]::InvariantCultureIgnoreCase)
            $Duplicates = @(
                for ($h = 0; $h -lt $Header.Count; $h++) {
                    if ($Null -ne $Header[$h] -and !$Unique.Add($Header[$h])) { $h }
                }
            )
            $Duplicates.ForEach{
                do {
                    $Name, $Number = ([Regex]::Match($Header[$_], '^([\s\S]*?)(\d*)$$')).Groups.Value[1, 2]
                    $Digits = '0' * $Number.Length # preserve the zero padding of an existing suffix
                    $Header[$_] = "$Name{0:$Digits}" -f (1 + $Number)
                } while (!$Unique.Add($Header[$_]))
            }

            # The longest column determines the number of output rows; computed once per table.
            $RowCount = ($Data |ForEach-Object Count |Measure-Object -Maximum).Maximum
            for ($y = 0; $y -lt $RowCount; $y++) {
                $Name = $Null # (custom) -Header parameter started with a spanned ($Null) column
                $Properties = [ordered]@{}
                for ($x = 0; $x -lt $Header.Count; $x++) {
                    $Unit = GetUnit -Data $Data -x $x -y $y
                    if ($Null -ne $Header[$x]) {
                        $Name = $Header[$x]
                        $Properties[$Name] = if ($Unit) { $Unit.Text } # else $Null (align column overflow)
                    }
                    elseif ($Name -and !$Unit.ColSpan) {
                        # Spanned header column: append this unit's text to the previous named property.
                        $Properties[$Name] = $Properties[$Name], $Unit.Text
                    }
                }
                [pscustomobject]$Properties
            }
        }
    }
    finally {
        # Always release the COM document, also on errors or when the pipeline is stopped downstream.
        $Null = [System.Runtime.Interopservices.Marshal]::ReleaseComObject($Html)
    }
}