# Get-WebLinksFromSite.psm1

function Get-InnerUrl {
    <#
    .SYNOPSIS
        Extracts a URL embedded in a redirect-style query parameter.

    .DESCRIPTION
        Scans the input for query parameters named u, url or target
        (case-insensitive) and returns the first one whose value is an
        http, https or ftp URL. The value may be given literally or
        percent-encoded (e.g. url=https%3A%2F%2Fexample.com); an encoded value
        is returned decoded, a literal value is returned exactly as written.
        When no such parameter is found the input is returned unchanged.

    .PARAMETER Url
        The URL (or bare query string) to inspect.

    .OUTPUTS
        System.String. The embedded URL, or the original input.

    .EXAMPLE
        Get-InnerUrl -Url 'https://r.example/l?u=https%3A%2F%2Fexample.com%2Fa'
        # -> https://example.com/a
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Url
    )

    # The parameter name must start at the beginning of the input or right
    # after '?' / '&'; otherwise 'menu=http://...' would match via its 'u='.
    $paramPattern = '(?i)(?:^|[?&])(?:u|url|target)=([^&]+)'

    # A value counts as a URL only if it has a supported scheme followed by
    # at least one character.
    $schemePattern = '(?is)^(?:https?|ftp)://.'

    foreach ($candidate in [regex]::Matches($Url, $paramPattern)) {
        $rawValue = $candidate.Groups[1].Value

        # Literal URL: hand it back untouched so existing escapes survive.
        if ([regex]::IsMatch($rawValue, $schemePattern)) {
            return $rawValue
        }

        # Redirect links normally percent-encode the target; decode and re-check.
        $decodedValue = [System.Uri]::UnescapeDataString($rawValue)
        if ([regex]::IsMatch($decodedValue, $schemePattern)) {
            return $decodedValue
        }
    }

    return $Url
}

function Get-WebLinks {
    <#
    .SYNOPSIS
        Downloads a page and emits one object per hyperlink found on it.

    .DESCRIPTION
        Requests the page with browser-like headers using basic parsing, then
        walks the parsed links. Each href is resolved to an absolute URL,
        passed through Get-InnerUrl to unwrap redirect-style links, and emitted
        together with the link text.

        Writes a warning and emits nothing when the response contains no links.

    .PARAMETER Url
        Address of the page to scan.

    .OUTPUTS
        PSCustomObject with properties:
          Text - trimmed link text (empty string when none can be determined)
          Url  - absolute URL, or the embedded URL when the link wraps one

    .NOTES
        Throws a terminating error with the message
        "Failed to retrieve '<Url>'. <reason>" when the request fails.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory, Position = 0)]
        [ValidateNotNullOrEmpty()]
        [string]$Url
    )

    # Browser-like headers; some sites reject the default PowerShell user agent.
    $headers = @{
        "User-Agent"      = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
        "Accept"          = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
        "Accept-Language" = "en-US,en;q=0.9"
    }

    try {
        $response = Invoke-WebRequest -Uri $Url -Headers $headers -UseBasicParsing
    }
    catch {
        throw "Failed to retrieve '$Url'. $($_.Exception.Message)"
    }

    if (-not $response.Links) {
        Write-Warning "No links detected. The site may require JavaScript or blocked the request."
        return
    }

    # Relative hrefs must be resolved against the address that actually served
    # the page, which differs from $Url after a redirect. The final URI lives
    # in a different place depending on the underlying response type, so probe
    # both and fall back to the requested URL.
    $baseUri = [System.Uri]$Url
    $baseResponse = $response.BaseResponse
    if ($baseResponse) {
        $finalUri = $baseResponse.ResponseUri                    # HttpWebResponse shape
        if (-not $finalUri -and $baseResponse.RequestMessage) {
            $finalUri = $baseResponse.RequestMessage.RequestUri  # HttpResponseMessage shape
        }
        if ($finalUri -and $finalUri.IsAbsoluteUri) {
            $baseUri = $finalUri
        }
    }

    foreach ($link in $response.Links) {

        $href = $link.href
        if (-not $href) { continue }

        if ([System.Uri]::IsWellFormedUriString($href, [System.UriKind]::Absolute)) {
            $resolved = $href
        }
        else {
            try {
                $resolved = (New-Object System.Uri($baseUri, $href)).AbsoluteUri
            }
            catch {
                # Best effort: an href that cannot be resolved is skipped, but
                # leave a trace for anyone running with -Verbose.
                Write-Verbose "Skipping unresolvable href '$href': $($_.Exception.Message)"
                continue
            }
        }

        # Unwrap redirect-style links (…?url=<target>) to their target.
        $finalUrl = Get-InnerUrl -Url $resolved

        # Prefer innerText when the link object carries it. Basic-parsing link
        # objects may expose only the raw tag, so otherwise derive the text by
        # stripping markup from outerHTML and decoding HTML entities.
        $text = ""
        if ($link.innerText) {
            $text = $link.innerText.Trim()
        }
        elseif ($link.outerHTML) {
            $withoutTags = [regex]::Replace([string]$link.outerHTML, '<[^>]+>', '')
            $decoded = [System.Net.WebUtility]::HtmlDecode($withoutTags)
            $text = [regex]::Replace($decoded, '\s+', ' ').Trim()
        }

        [PSCustomObject]@{
            Text = $text
            Url  = $finalUrl
        }
    }
}

# Public surface of the module: the link scraper and its URL-unwrapping helper.
Export-ModuleMember -Function 'Get-WebLinks', 'Get-InnerUrl'