Public/ConvertFrom-HTML.ps1
<#
.SYNOPSIS Takes an HTML input and converts it to an HTMLAgilityPack htmlNode object that can be navigated using Linq .DESCRIPTION Long description .EXAMPLE PS C:\> <example usage> Explanation of what the example does .INPUTS [String[]] [System.IO.FileInfo[]] .OUTPUTS [HtmlAgilityPack.HtmlDocument] [HtmlAgilityPack.HtmlNode] .NOTES General notes #> function ConvertFrom-Html { [CmdletBinding(DefaultParameterSetName="String")] param ( #The HTML text to parse. Accepts multiple separate documents as an array. This also accepts pipeline from Invoke-WebRequest [Parameter(ParameterSetName="String",Mandatory,ValueFromPipeline,ValueFromPipelineByPropertyName,Position=0)] [String[]]$Content, #The URI or URIs from which to retrieve content. This may be faster than using Invoke-WebRequest but is less flexible in the method of retrieval (for instance, no POST) [Parameter(ParameterSetName="URI",Mandatory,ValueFromPipeline,ValueFromPipelineByPropertyName)] [System.URI[]]$URI, #Path to file or files containing HTML content to convert. This accepts pipeline from Get-Childitem or Get-Item [Parameter(ParameterSetName="Path",Mandatory,ValueFromPipeline,ValueFromPipelineByPropertyName)] [System.IO.FileInfo[]]$Path, #Do not return the Linq documentnode, instead return the HTMLDocument object. This is useful if you want to do XPath queries instead of Linq queries [switch]$Raw ) begin { } process { #Find the type of input and bind it to inputObject $inputObject = $null foreach ($contentType in "Content","URI","Path") { if ((Get-Variable -erroraction SilentlyContinue $contentType).value) { $inputObject = (Get-Variable $contentType).value break } } if (-not $inputObject) {write-error "Input Object Type Not Identified. If you see this then ConvertFrom-HTML needs better input validation"} #Unwrap any arrays. This allows us to accept both pipeline and parameter input $inputObject | ForEach-Object { $inputItem = $PSItem $htmlDoc = new-object HtmlAgilityPack.HtmlDocument #Process all object types into a common HTML document format switch ($inputItem.GetType().FullName) { "System.String" { $htmlDoc.LoadHtml($inputItem) } "System.Uri" { $htmlDoc = (new-object HtmlAgilityPack.HtmlWeb).Load($inputItem) } "System.IO.FileInfo" { $htmlDoc.Load($inputItem) } Default { write-error "Object Type not supported or implemented. If you see this error then ConvertFrom-HTML has improper input validation" continue } } if ($inputItem) { if ($Raw) { $htmlDoc } else { $htmlDoc.DocumentNode } } } } } |