Private/Get-VolkskrantArticle.ps1
|
function Get-VolkskrantArticle {
    <#
    .SYNOPSIS
    Downloads a page, follows the article links found on it, and emits one
    object per linked article.

    .DESCRIPTION
    Fetches -Uri with curl, collects the href of every <a> element that has
    attributes, keeps the links accepted by -UrlFilter, and then downloads each
    remaining link to extract its publication date, title and body text.

    Requires the ConvertTo-HtmlDocument and Select-HtmlNode commands, Join-String,
    and a native curl executable to be available in the session.

    .PARAMETER Uri
    Address of the page whose links are harvested.

    .PARAMETER UrlFilter
    Script block evaluated once per href (as $_); links for which it returns a
    truthy value are kept. Defaults to links containing '/nieuws-achtergrond/'
    or '/cultuur-media/'.

    .OUTPUTS
    PSCustomObject with PSTypeName 'UncommonSense.Volkskrant.Article' and the
    properties Url, Date, DateText, Title and Body. Date is $null (and a warning
    is written) when the article has no parsable article:published_time value.
    #>
    param (
        [Parameter(Mandatory)]
        [string]$Uri,

        [ScriptBlock]$UrlFilter = { $_ -like '*/nieuws-achtergrond/*' -or $_ -like '*/cultuur-media/*' }
    )

    $DutchCulture = [System.Globalization.CultureInfo]::new('nl-NL')

    # NOTE(review): the -notlike filter below discards exactly the links that the
    # preceding -replace just turned into 'https://volkskrant.nl/...' URLs, so only
    # links that were not root-relative survive. Behaviour is kept as-is because
    # the intent cannot be determined from this file; confirm whether -like was meant.
    curl --silent --location $Uri |
        Join-String -Separator ' ' |
        ConvertTo-HtmlDocument |
        ForEach-Object { $_.DocumentNode.SelectNodes("//a") } |
        Where-Object { $_.HasAttributes } |
        ForEach-Object { $_.GetAttributeValue('href', '') } |
        Where-Object -FilterScript $UrlFilter |
        ForEach-Object { $_ -replace '^/', 'https://volkskrant.nl/' } |
        Where-Object { $_ -notlike 'https://volkskrant.nl/*' } |
        ForEach-Object {
            $Url = $_

            $Document = curl --silent --location $Url |
                Join-String -Separator ' ' |
                ConvertTo-HtmlDocument

            # Reset per article: this script block shares the function's scope, so
            # without the reset a failed lookup would silently reuse the values of
            # the previously processed article.
            $DateText = ''
            $Date = $null

            $PublishedNode = $Document |
                Select-HtmlNode -XPath '//meta[@property="article:published_time"]'

            if ($null -ne $PublishedNode) {
                # Keep only the date part that precedes the 'T' separator.
                $DateText = ($PublishedNode.GetAttributeValue("content", "") -split 'T')[0]
            }

            $ParsedDate = [DateTime]::MinValue
            $DateIsValid = [DateTime]::TryParseExact(
                $DateText,
                'yyyy\-MM\-dd',
                $DutchCulture,
                [System.Globalization.DateTimeStyles]::None,
                [ref]$ParsedDate
            )

            if ($DateIsValid) {
                $Date = $ParsedDate
            }
            else {
                Write-Warning "No parsable article:published_time found for $Url"
            }

            # The first Where-Object drops null/empty values so that .Trim() is
            # never invoked on $null when no matching node exists.
            $Title = (
                ($Document | Select-HtmlNode -CssSelector 'h1' -All).InnerText |
                    Where-Object { $_ } |
                    ForEach-Object { $_.Trim() } |
                    Where-Object { $_ }
            ) -join ' '

            # Paragraphs whose class attribute exactly equals one of these values
            # are excluded from the body text.
            $Paragraphs = $Document |
                Select-HtmlNode -CssSelector 'section' |
                Select-HtmlNode -CssSelector 'p' -All |
                Where-Object {
                    $_.GetAttributeValue('class', '') -notin 'artstyle__container__text', 'z3lfzo5 z3lfzo6 _1iobnq21'
                }

            $Body = (
                ($Paragraphs).InnerText |
                    Where-Object { $_ } |
                    ForEach-Object { $_.Trim() } |
                    Where-Object { $_ }
            ) -join ' '

            # Strip recurring boilerplate passages from the body text.
            $Body = $Body -replace 'Over de auteur.*?Magazine\.\s*', ''
            $Body = $Body -replace 'Columnisten hebben de vrijheid hun mening te geven en hoeven zich niet te houden aan de journalistieke regels voor objectiviteit\.\s*', ''
            $Body = $Body -replace 'Geselecteerd door de redactie\.?\s*', ''

            [PSCustomObject][Ordered]@{
                PSTypeName = 'UncommonSense.Volkskrant.Article'
                Url        = $Url
                Date       = $Date
                DateText   = $DateText
                Title      = $Title
                Body       = $Body
            }
        }
}