khd.ps1

# SPDX-License-Identifier: MPL-2.0
# Original author and source code: https://github.com/Mahtwo/KHDownloader

#region Header
# Help needs a blank before and after to be parsed

<#PSScriptInfo
.VERSION 1.0.0
 
.GUID e6a05bc7-7650-4f36-8745-409565470730
 
.AUTHOR Mahtwo
 
.TAGS
Music Download CrossPlatform PSEdition_Core Windows Linux MacOS
 
.LICENSEURI
https://mozilla.org/MPL/2.0/
 
.PROJECTURI
https://github.com/Mahtwo/KHDownloader
 
.RELEASENOTES
https://github.com/Mahtwo/KHDownloader/releases/1.0.0
#>


<#
.SYNOPSIS
Downloads an album from KHInsider Video Game Music with robust resume functionality.
 
.DESCRIPTION
The khd.ps1 script downloads an album from the KHInsider Video Game Music website
specified by the user. If the script stops for any reason, it will resume where
it previously was when run again with the same album.
 
.PARAMETER Url
URL of the KHInsider album to download,
like https://downloads.khinsider.com/game-soundtracks/album/name-of-the-album.
 
.PARAMETER Format
Audio format to prioritize (like FLAC, M4A, etc.), if not available will fallback to MP3.
This parameter supports tab completion based on the value of the URL parameter.
 
.PARAMETER NoCoverArt
Disables downloading the album cover art.
 
.INPUTS
None. You can't pipe objects to khd.ps1.
 
.OUTPUTS
None. khd.ps1 doesn't generate any output to the pipeline.
 
.EXAMPLE
& ./khd.ps1 'https://downloads.khinsider.com/game-soundtracks/album/malicious-fallen-original-soundtrack-2017' m4a
 
.EXAMPLE
$items = @(
    @{
        Url = 'https://downloads.khinsider.com/game-soundtracks/album/malicious-fallen-original-soundtrack-2017'
        Format = 'm4a'
    },
    @{
        Url = 'https://downloads.khinsider.com/game-soundtracks/album/the-legend-of-zelda-breath-of-the-wild'
    }
)
foreach ($item in $items) {& ./khd.ps1 @item}
 
.EXAMPLE
do {
    $loop = $false
    try {
        & ./khd.ps1 'https://downloads.khinsider.com/game-soundtracks/album/the-legend-of-zelda-breath-of-the-wild' flac
    }
    # Will still throw on other errors (for example parameters errors)
    catch [System.Net.Http.HttpRequestException] {
        $loop = $true
    }
} while ($loop)
 
.LINK
KHInsider home page : https://downloads.khinsider.com
 
.LINK
https://github.com/Mahtwo/KHDownloader
#>


#Requires -PSEdition Core
# Suppress some PSScript-Analyzer warnings
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSReviewUnusedParameter', 'commandAst', Justification = 'variable not used in ArgumentCompleter of formats')]
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSReviewUnusedParameter', 'commandName', Justification = 'variable not used in ArgumentCompleter of formats')]
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSReviewUnusedParameter', 'parameterName', Justification = 'variable not used in ArgumentCompleter of formats')]
# TODO : PSGallery (and -UpdateKHD parameter only available on GitHub release?)
# TODO : Only get songs page URL from the last downloaded song to last song of the album (since it may be partially downloaded). That means only last song if album is fully downloaded
#endregion Header

#region Parameters
param (
    [Parameter(Position = 0, Mandatory, HelpMessage = 'URL of the album to download, like https://downloads.khinsider.com/game-soundtracks/album/name-of-the-album')]
    [Alias('u', 'Uri')]
    [ValidateScript({
            if ($_ -notmatch '^(https?://)?downloads.khinsider.com/game-soundtracks/album/[^/]+$') {
                throw 'Invalid URL, it should look like https://downloads.khinsider.com/game-soundtracks/album/name-of-the-album'
            }

            if ($null -eq $_.Scheme) {
                [uri]$url = "https://$_"
            } else {
                [uri]$url = $_
            }

            # Check internet by trying to connect to the URL
            if (-not (Test-Connection $url.Host -Count 1 -Quiet -ErrorAction SilentlyContinue)) {
                throw "$($url.Host) is unreachable, check your internet connection."
            }

            $mainPageFile = Join-Path ([System.IO.Path]::GetTempPath()) ($url.Segments[-1] + '.html')
            # Skip check if main page HTML file already exist
            if (Test-Path -PathType Leaf $mainPageFile) {
                return $true
            }


            # The file will only be created after downlading it entirely
            $mainPage = (Invoke-WebRequest -PassThru -ErrorAction Stop -OutFile $mainPageFile $url).Content
            # Check if the URL is a valid album URL
            if ($mainPage -match '<title>\s*Error\s*<\/title>') {
                Remove-Item -LiteralPath $mainPageFile
                throw "The album $_ does not exist"
            }

            return $true
        }
    )]
    [uri]$Url,

    [Parameter(Position = 1, HelpMessage = 'Format to prioritize (FLAC, M4A, etc.), default/fallback is MP3. Can TAB complete based on -Url')]
    [Alias('f')]
    [ArgumentCompletions('')] # Disable suggesting files from working directory when ArgumentCompleter returns nothing
    # Returning MP3 is technically useless but it's helpful for users not knowing MP3 is always available
    # Returning MP3 also tells the user formats have been gotten correctly when it's the only format available
    [ArgumentCompleter({
            param ($commandName, $parameterName, $wordToComplete, $commandAst, $fakeBoundParameters)
            if (-not $fakeBoundParameters.ContainsKey('Url')) {
                return
            }

            # fakeBoundParameters values are of primitive types (string int etc.), so we cannot use Scheme property
            if ($fakeBoundParameters['Url'] -match '^https?') {
                [uri]$url = $fakeBoundParameters['Url']
            } else {
                [uri]$url = "https://$($fakeBoundParameters['Url'])"
            }

            $mainPageFile = Join-Path ([System.IO.Path]::GetTempPath()) ($url.Segments[-1] + '.html')
            if (Test-Path -PathType Leaf $mainPageFile) {
                $mainPage = Get-Content -Raw -LiteralPath $mainPageFile
            } else {
                # Will silently fail if no internet connection (which is what we want)
                $mainPage = (Invoke-WebRequest -PassThru -ErrorAction Stop -OutFile $mainPageFile $url).Content
            }
            # Check if the URL is a valid album URL
            if ($mainPage -match '<title>\s*Error\s*<\/title>') {
                return
            }

            # SingleLine makes . match new line characters, [\s\S] would work with -replace but it's more cumbersome
            # .*? does the shortest match while .* does the biggest match
            # Get entire tr of songlist_header
            $tableHeader = [regex]::Replace($mainPage, '.*(<tr[^>]*songlist_header.*?</tr>).*', '$1', 'SingleLine')
            # Get each th value and remove all HTML tags
            $tableHeaderValues = [regex]::Matches($tableHeader, '<th[^>]*>(.*?)</th[^>]*>', 'SingleLine') |
                ForEach-Object { $_.Groups[1].Value -replace '<[^>]*>' }
            $availableFormats = @()
            for ($i = $tableHeaderValues.Length - 3; $tableHeaderValues[$i] -ne 'Song Name'; $i--) {
                $availableFormats += $tableHeaderValues[$i]
            }
            [array]::Reverse($availableFormats)

            if ($wordToComplete) {
                return $availableFormats | Where-Object { $_ -like "$wordToComplete*" }
            } else {
                return $availableFormats
            }
        }
    )]
    [string]$Format = 'MP3',

    [Alias('nca')]
    [switch]$NoCoverArt
)

# Formatting arguments
if ($null -eq $Url.Scheme) {
    $Url = "https://$Url"
}
$Format = $Format.ToUpperInvariant() # No null check needed as a string cannot be null
#endregion Parameters

#region Helpers
function Write-ProgressHelper {
    param(
        [string]$Status,
        [int]$PercentComplete,
        [switch]$Completed,
        [switch]$WaitUpdate
    )

    if ($WaitUpdate) {
        # Write-Progress only updates every 200ms and does not update to the last "missed" Write-Progress even after 200ms
        if (-not $timerWriteProgressHelper) {
            $timerWriteProgressHelper = [System.Threading.Tasks.Task]::Delay(2000)
        }
        $timerWriteProgressHelper.Wait()
    }
    $timerWriteProgressHelper = [System.Threading.Tasks.Task]::Delay(2000)
    # Cannot combine -Status $Status and -Completed:$Completed because it would throw when Status is not specified
    if ($Completed) {
        Write-Progress -Completed
    } else {
        Write-Progress -Activity "Downloading album $albumName" -Status $Status -PercentComplete $PercentComplete
    }
}
function Write-WarningHelper {
    param(
        [ValidateNotNull()]
        [string]$Message
    )
    Write-Warning "${albumName}: $Message"
}
function ConvertTo-ValidPath {
    param(
        [Parameter(ValueFromPipeline)]
        [ValidateNotNull()]
        [string]$Path
    )

    begin {
        # Removing Windows illegal characters so it's compatible between OS (Linux and macOS are subsets)
        # Some characters are technically valid like horizontal tabulation " " but it's not important
        $illegalCharacters = ([char[]](0..31) + [char[]]':*?"<>|') -join ''
    }
    process {
        # Replace consecutive illegal characters and whitespaces with a single space...
        # ...Trim whitespace characters at the beginning and whitespace + dot "." characters at the end
        return ($Path -replace "[$illegalCharacters\s]+", ' ') -replace '^\s*|[\s\.]*$'
    }
}
#endregion Helpers

#region Get album name
# Main page HTML file already exist because of argument URL ValidateScript
$mainPageFile = Join-Path ([System.IO.Path]::GetTempPath()) ($Url.Segments[-1] + '.html')
$mainPage = Get-Content -Raw -LiteralPath $mainPageFile
# Get first h2, replace illegal path characters and consecutive spaces to one space
$albumName = ([regex]::Match($mainPage, '<h2[^>]*>(.*?)</h2[^>]*>')).Groups[1] | ConvertTo-ValidPath
#endregion Get album name

#region Get songs page URL
$tempFile = Join-Path ([System.IO.Path]::GetTempPath()) ($Url.Segments[-1] + '.khd')
if (-not (Test-Path -PathType Leaf $tempFile)) {
    Write-ProgressHelper -Status 'Getting each song page URL' -PercentComplete 0

    # Get page URL of each song
    # Get from playlistDownloadSong to shortest href and capture href content
    $songsPageURL = [regex]::Matches($mainPage, 'playlistDownloadSong.*?href="([^"]*)"', 'SingleLine') |
        ForEach-Object { $_.Groups[1].Value }
    $pDSLength = $songsPageURL.Length
    $songsURL = [string[]]::new($pDSLength)
    # Fast enough, parallelization would be slower
    for ($index = 0; $index -lt $pDSLength; $index++) {
        Write-ProgressHelper -Status "Getting each song page URL ($index/$pDSLength)" -PercentComplete ([System.Math]::Floor($index / $pDSLength * 5))
        $songsURL[$index] = $Url.GetLeftPart([System.UriPartial]::Authority) + $songsPageURL[$index]
    }

    # Create file containing all songs page URLs
    # There is a weird bug with Add-Content -LiteralPath (but not -Path or Out-File -LiteralPath) where
    # if the path points to a file that does not exist AND the path use a PSDrive with a root that does not end with
    # a trailing path separator (which is the case with Pester TestDrive), Add-Content does not create the file
    New-Item -Path $tempFile -ItemType File > $null
    Add-Content -LiteralPath $tempFile -Value $songsURL

    Write-ProgressHelper -Status "Getting each song page URL ($index/$pDSLength)" -PercentComplete 5
} else {
    $songsURL = Get-Content -LiteralPath $tempFile
}
#endregion Get songs page URL

#region Get songs download URL
if (($songsURL -join '').Contains('downloads.khinsider.com/game-soundtracks/album/')) {
    Write-ProgressHelper -Status 'Converting each song page URL to download URL' -PercentComplete 5

    if ($Format -ne 'MP3') {
        # Check if the format is available for this album
        $formatAvailable = $false
        # Get entire tr of songlist_header
        $tableHeader = [regex]::Replace($mainPage, '.*(<tr[^>]*songlist_header.*?</tr>).*', '$1', 'SingleLine')
        # Get each th value and remove all HTML tags
        $tableHeaderValues = [regex]::Matches($tableHeader, '<th[^>]*>(.*?)</th[^>]*>', 'SingleLine') |
            ForEach-Object { $_.Groups[1].Value -replace '<[^>]*>' }
        for ($i = $tableHeaderValues.Length - 3; $tableHeaderValues[$i] -ne 'Song Name'; $i--) {
            if ($tableHeaderValues[$i] -eq $Format) {
                $formatAvailable = $true
                break
            }
        }
        if (-not $formatAvailable) {
            Write-WarningHelper "Format $Format not available, fallbacking to MP3"
            $Format = 'MP3'
        }
    }

    $sULength = $songsURL.Length
    try {
        # Put helper functions in variable to use them inside the job ($Using:Function:... does not work)
        $jobFunctions = Get-ChildItem -Path Function: | Where-Object Name -In Write-WarningHelper, ConvertTo-ValidPath |
            Select-Object -Property Name, Definition

        # We assume more CPU cores means more RAM too. -ThrottleLimit has diminishing returns anyway
        # editorconfig-checker-disable-next-line because splitting by pipeline adds an indentation and the closing brace } isn't aligned
        $getSongsDownloadURLJob = 0..($sULength - 1) | Where-Object { $songsURL[$_].Contains('downloads.khinsider.com/game-soundtracks/album/') } | ForEach-Object -AsJob -ThrottleLimit ([System.Environment]::ProcessorCount * 5) -Parallel {
            #region Get songs download URL - Job
            #region Get songs download URL - Job setup
            $songsURL = $Using:songsURL # No need for thread safe array since each runspace only modifiy their index
            $albumName = $Using:albumName # Used by Write-WarningHelper
            $Format = $Using:Format
            foreach ($jobFunction in $Using:jobFunctions) {
                New-Item -Path Function: -Name $jobFunction.Name -Value $jobFunction.Definition > $null
            }
            #endregion Get songs download URL - Job setup

            $songPageURL = $songsURL[$_]
            try {
                $SongPage = (Invoke-WebRequest -ErrorAction Stop $songPageURL).Content
            } catch {
                throw $_
            }
            # Matches : Get from href to shortest songDownloadLink (without encountering another href) to shortest </span>
            $songDownloadLinks = [regex]::Matches($SongPage, 'href(?:(?!href).)*?songDownloadLink.*?</span[^>]*>', 'SingleLine') | ForEach-Object {
                [PSCustomObject]@{
                    # Get href inside
                    href   = [regex]::Replace($_.Value, '.*href="([^"]*)".*', '$1', 'SingleLine')
                    # Get text between "download as " and "<"
                    Format = [regex]::Replace($_.Value, '.*download\s*as\s*([^<]*)<.*', '$1', 'SingleLine')
                }
            }

            # Check if format is available (the format may not be available for every song)
            foreach ($songDownloadLink in $songDownloadLinks) {
                if ($songDownloadLink.Format -eq $Format) {
                    $songsURL[$_] = $songDownloadLink.href
                    return
                }
            }

            # Fallback to MP3
            foreach ($songDownloadLink in $songDownloadLinks) {
                if ($songDownloadLink.Format -eq 'MP3') {
                    $songsURL[$_] = $songDownloadLink.href
                    # Prettify filename without extension from URL for warning
                    $filename = [uri]::UnescapeDataString(((Split-Path -LeafBase $songDownloadLink.href) | ConvertTo-ValidPath))
                    Write-WarningHelper "Format $Format not found for $filename, fallbacking to MP3"
                }
            }
            #endregion Get songs download URL - Job
        }
        $remainingChildJobs = $getSongsDownloadURLJob.ChildJobs
        $totalCount = $remainingChildJobs.Count
        while ($getSongsDownloadURLJob.State -eq 'Running') {
            $remainingChildJobs | Wait-Job -Any > $null
            $getSongsDownloadURLJob | Receive-Job

            # Check for any failed job
            if ($remainingChildJobs | Where-Object State -EQ 'Failed' | Select-Object -First 1) {
                return
            }

            $remainingChildJobs = $remainingChildJobs | Where-Object State -In 'NotStarted', 'Running'
            $doneCount = $totalCount - $remainingChildJobs.Count
            Write-ProgressHelper -Status "Converting each song page URL to download URL ($doneCount/$totalCount)" -PercentComplete (5 + [System.Math]::Floor($doneCount / $totalCount * 15))
        }
    } catch {
        # Necessary to exit script on all errors, otherwise some errors (notably from Invoke-WebRequest) continue after finally
        # The catch can be removed when/if https://github.com/PowerShell/PowerShell/issues/21345 is fixed
        throw $_
    }
    # Save current progress even on errors or Ctrl-C
    finally {
        # Script may have interrupted before creating the job
        if ($getSongsDownloadURLJob) {
            $getSongsDownloadURLJob | Stop-Job -PassThru | Receive-Job -AutoRemoveJob -Wait
        }
        if (-not $totalCount) {
            $totalCount = 1 # Avoids a division by zero
        }
        Write-ProgressHelper -Status 'Saving converted URLs' -PercentComplete (5 + [System.Math]::Floor($doneCount / $totalCount * 15))
        $tempFileTemp = "$tempFile.tmp"
        New-Item -ItemType File -Force $tempFileTemp > $null
        # $songsURL may have a mix of download URLs and page URLs if the script was interrupted
        Add-Content -LiteralPath $tempFileTemp -Value $songsURL
        Move-Item -Force -LiteralPath $tempFileTemp -Destination $tempFile
    }
} elseif ($Format -ne 'MP3') {
    Write-WarningHelper "All songs URL are present, format $Format will not be checked"
}
#endregion Get songs download URL

#region Prepare filenames
$sULength = $songsURL.Length
$songsFile = [string[]]::new($sULength)
$albumDirectory = Join-Path $PWD $albumName
for ($index = 0; $index -lt $sULength; $index++) {
    $songDownloadURL = $songsURL[$index]
    $filename = [uri]::UnescapeDataString(((Split-Path -Leaf $songDownloadURL) | ConvertTo-ValidPath))
    $filepath = Join-Path $albumDirectory $filename
    $songsFile[$index] = $filepath
}
#endregion Prepare filenames

#region Download songs
if (-not (Test-Path -PathType Container $albumDirectory)) {
    New-Item -ItemType Directory $albumDirectory > $null
}
for ($index = 0; $index -lt $sULength; $index++) {
    # Skip to the last downloaded file as it may only be partially downloaded
    if ($index + 1 -ne $sULength -and (Test-Path -PathType Leaf $songsFile[$index + 1])) {
        continue
    }
    Write-ProgressHelper -Status "Downloading each song ($index/$sULength)" -PercentComplete (20 + [System.Math]::Floor($index / $sULength * 80))

    $songDownloadURL = $songsURL[$index]
    $songFile = $songsFile[$index]
    try {
        Invoke-WebRequest -Resume -ErrorAction Stop -OutFile $songFile $songDownloadURL > $null
    } catch {
        # Necessary to exit on Invoke-WebRequest error, otherwise those errors don't end the script
        # The try-catch can be removed (keep only the Invoke-WebRequest command) when/if https://github.com/PowerShell/PowerShell/issues/21345 is fixed
        throw $_
    }
}
#endregion Download songs

#region Download cover art
if (-not $NoCoverArt) {
    Write-ProgressHelper -Status 'Downloading album cover art' -PercentComplete 99

    # Use first cover art found
    # Will silently fail (coverArtUrl set to empty string) if no cover art was found, although they seem to always have at least one
    # Get entire div of first albumImage (Match only gets first unlike Matches)
    $albumImageFirst = ([regex]::Match($mainPage, '<div[^>]*albumImage[^>]*>.*?</div>', 'SingleLine')).Value
    # Get href inside
    $coverArtUrl = [regex]::Replace($albumImageFirst, '.*href="([^"]*)".*', '$1', 'SingleLine')
    if ($coverArtUrl) {
        $fileExtension = Split-Path -Extension $coverArtUrl
        if (-not $fileExtension) {
            Write-WarningHelper 'Album cover art does not have a file extension, defaulting to .jpg'
            $fileExtension = '.jpg'
        }

        $filename = 'cover' + $fileExtension
        $coverArtFile = Join-Path $albumDirectory $filename
        Invoke-WebRequest -Resume -ErrorAction Stop -OutFile $coverArtFile $coverArtUrl > $null
    } else {
        Write-WarningHelper 'No album cover art found'
    }
}
#endregion Download cover art

#region Clean-up
Remove-Item -LiteralPath $mainPageFile, $tempFile
# [System.Environment]::UserInteractive is false if there is no user interface on Windows, always true on other OSs
if ([System.Environment]::UserInteractive -and $ProgressPreference -notin 'SilentlyContinue', 'Ignore') {
    Write-ProgressHelper -Status 'Done!' -PercentComplete 100 -WaitUpdate
    # Add a delay to show 100% complete bar for better UX
    Start-Sleep 1
}
Write-ProgressHelper -Completed
#endregion Clean-up