functions/Read-EBEpub.ps1
function Read-EBEpub { <# .SYNOPSIS Extract chapters from an epub-formatted ebook and convert them to markdown. .DESCRIPTION Extract chapters from an epub-formatted ebook and convert them to markdown. Markdown parsing strongly relies on the provided replacements file. .PARAMETER Path The path to the ebook to parse. .PARAMETER OutPath The folder to which to export the chapters. .PARAMETER Name Name of the book being parsed. Used in the output files' name. .PARAMETER ReplacementPath Path to a PowerShell data file (*.psd1) containing replacements to use with the reading effort. The file should contain a single hashtable with a single key: Values. This shall then contain a list of replacement definitions using regex. Example content: @{ Values = @( @{ What = '<p class="text">(.+?)</p>' With = '$1' Weight = 1 } ) } What: The pattern in the source file to match With: What to replace it with Weight: The processing order when specifying multiple replacements. Lower numbers go first. .PARAMETER StartIndex The number the first chapter starts with. Only affects the file name of the output. Defaults to:1 .EXAMPLE PS C:\> Read-EBEpub -Path '.\pirates.epub' -OutPath 'C:\ebooks\pirates\chapters' -ReplacementPath 'C:\ebooks\pirates\replacements.psd1' -Name Pirates Reads the "Pirates.epub" file, extracts the chapters to the specified output path, using the replacements provided inreplacements.psd1. #> [CmdletBinding()] param ( [Parameter(Mandatory = $true, ValueFromPipeline = $true, ValueFromPipelineByPropertyName = $true)] [Alias('FullName')] [string[]] $Path, [Parameter(Mandatory = $true)] [string] $OutPath, [string] $Name = 'unknown', [string] $ReplacementPath, [int] $StartIndex = 1 ) begin{ #region Functions function Convert-Chapter { [OutputType([string])] [CmdletBinding()] param ( [string] $Content, $Replacements ) $string = $Content -replace '</p>', "</p>`n" -replace '</{0,1}html[^>]{0,}>|</{0,1}body[^>]{0,}>|</{0,1}head[^>]{0,}>|</{0,1}link[^>]{0,}>|</{0,1}meta[^>]{0,}>|</{0,1}\?{0,1}xml[^>]{0,}>' $string = $string -split "\n" | ForEach-Object Trim | Remove-PSFNull | Join-String "`n`n" foreach ($item in $Replacements | Sort-Object Weight) { $string = $string -replace $item.What, $item.With } $string -replace '</{0,1}p[^>]{0,}>' -replace "(?s)\n{3,}", "`n`n" } #endregion Functions $tempFolder = Join-Path -Path (Get-PSFPath -Name Temp) -ChildPath "Ebook_Temp_$(Get-Random)" $null = New-Item -Path $tempFolder -ItemType Directory -Force $chapterIndex = $StartIndex $outputPath = Resolve-PSFPath -Path $OutPath -Provider FileSystem -SingleItem $replacements = @() if ($ReplacementPath) { $data = Import-PSFPowerShellDataFile -Path $ReplacementPath $replacements = foreach ($entry in $data.Values) { [PSCustomObject]$entry } } } process { foreach ($filePath in $Path) { foreach ($resolvedPath in Resolve-PSFPath -Path $filePath) { Expand-Archive -Path $resolvedPath -DestinationPath $tempFolder -Force foreach ($chapter in Get-ChildItem -Path "$tempFolder\OEBPS\sections") { $content = [System.IO.File]::ReadAllText($chapter.FullName) $newContent = Convert-Chapter -Content $content -Replacements $Replacements $newFileName = '{0}-{1:D4}.md' -f $Name, $chapterIndex [System.IO.File]::WriteAllText("$outputPath\$newFileName", $newContent) $chapterIndex++ } Remove-Item -Path "$tempFolder\*" -Force -Recurse } } } end { Remove-Item -Path $tempFolder -Force -Recurse -ErrorAction Ignore } } |