Get-WebLinksFromSite.psm1
|
function Get-InnerUrl {
    <#
    .SYNOPSIS
        Returns the URL embedded in a redirect-style query parameter, if any.

    .DESCRIPTION
        Looks for a parameter named u, url or target (case-insensitive) whose
        value is an http, https or ftp URL. The value may be written literally
        (url=http://host/path) or percent-encoded (u=https%3A%2F%2Fhost%2Fpath).
        Percent-encoded values are decoded before being returned; values that
        already contain a literal "://" are returned exactly as found.

    .PARAMETER Url
        The URL to inspect.

    .OUTPUTS
        System.String. The embedded URL when one is found, otherwise the
        input URL unchanged.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Url
    )

    # The lookbehind stops the parameter name from starting in the middle of a
    # word (e.g. "menu=" must not be read as "u="), while still accepting
    # names such as "redirect_url=". ":" and "/" may each appear literally or
    # percent-encoded, because redirect links usually encode their target.
    $pattern = '(?i)(?<![a-z0-9])(?:u|url|target)=((?:https?|ftp)(?::|%3A)(?:/|%2F){2}[^&]+)'
    $match = [regex]::Match($Url, $pattern)

    if (-not $match.Success) {
        return $Url
    }

    $inner = $match.Groups[1].Value

    # Decode only when the scheme separator itself was encoded; a value that
    # already reads "scheme://..." is passed through untouched so existing
    # escapes inside it (e.g. %20) are preserved.
    if (-not [regex]::IsMatch($inner, '^(?i)[a-z][a-z0-9+.\-]*://')) {
        $inner = [System.Uri]::UnescapeDataString($inner)
    }

    return $inner
}

function Get-WebLinks {
    <#
    .SYNOPSIS
        Downloads a page and emits one object per hyperlink found on it.

    .DESCRIPTION
        Requests the page with browser-like headers using
        Invoke-WebRequest -UseBasicParsing, resolves each link's href against
        the requested URL, unwraps redirect-style links via Get-InnerUrl, and
        emits the link text together with the final URL.

    .PARAMETER Url
        The address of the page to scan.

    .OUTPUTS
        PSCustomObject with the properties Text and Url, one per link that
        has an href. Nothing is emitted (a warning is written instead) when
        the response contains no links.

    .NOTES
        Throws a string error "Failed to retrieve '<Url>'. <message>" when the
        request fails.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory, Position = 0)]
        [ValidateNotNullOrEmpty()]
        [string]$Url
    )

    # Pretend to be a real Chrome browser
    $headers = @{
        "User-Agent"      = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
        "Accept"          = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
        "Accept-Language" = "en-US,en;q=0.9"
        #"Connection" = "keep-alive"
    }

    try {
        $response = Invoke-WebRequest -Uri $Url -Headers $headers -UseBasicParsing
    }
    catch {
        throw "Failed to retrieve '$Url'. $($_.Exception.Message)"
    }

    if (-not $response.Links) {
        Write-Warning "No links detected. The site may require JavaScript or blocked the request."
        return
    }

    $baseUri = [System.Uri]$Url

    foreach ($link in $response.Links) {
        # Skip if no href
        if (-not $link.href) { continue }

        # Resolve relative URLs against the requested page; hrefs that cannot
        # be combined with the base are skipped.
        $resolved = if ([System.Uri]::IsWellFormedUriString($link.href, 'Absolute')) {
            $link.href
        }
        else {
            try {
                (New-Object System.Uri($baseUri, $link.href)).AbsoluteUri
            }
            catch {
                continue
            }
        }

        # Extract embedded URL if present
        $finalUrl = Get-InnerUrl -Url $resolved

        # Prefer innerText when the link object provides it. Otherwise fall
        # back to the link's outerHTML with tags removed and entities decoded,
        # so Text is not silently empty when only outerHTML is available
        # (NOTE(review): believed to be the case for basic parsing on newer
        # PowerShell versions - confirm on the target runtime).
        $text = if ($link.innerText) {
            $link.innerText.Trim()
        }
        elseif ($link.outerHTML) {
            $withoutTags = $link.outerHTML -replace '<[^>]+>', ''
            [System.Net.WebUtility]::HtmlDecode($withoutTags).Trim()
        }
        else {
            ""
        }

        [PSCustomObject]@{
            Text = $text
            Url  = $finalUrl
        }
    }
}

Export-ModuleMember -Function Get-WebLinks, Get-InnerUrl