Split-Wikipedia

1.5

Split-Wikipedia.ps1

                                
<#PSScriptInfo

.VERSION 1.5

.GUID 6c8ec05e-4d42-465b-9a30-2bbdcec289d3

.AUTHOR Lee Holmes

#>

<# 

.DESCRIPTION 

Splits a Wikipedia XML database dump into text-only articles. Articles are placed

in an "Articles" directory, then again split into subdirectories with 5,000

articles each. 

.EXAMPLE

PS > Invoke-WebRequest https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -Outfile enwiki-latest-pages-articles.xml.bz2

PS > bzip2 -d enwiki-latest-pages-articles.xml.bz2

PS > Split-Wikipedia enwiki-latest-pages-articles.xml

.NOTES

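Processing of a dump reports progress every 1,000 pages. The percentage shown is only an estimate derived from the size of the input file.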

#> 

param(

    ## Path to the Wikipedia XML dump file to split. The script reads its
    ## size with Get-Item and opens it with System.Xml.XmlReader.
    ## NOTE(review): [CmdletBinding()] is conventionally written above the
    ## param keyword; here it is attached to the $Path parameter itself -
    ## confirm whether that placement is intended before moving it.
    [CmdletBinding()]

    $Path

)

function GetSafeFilename
{
    <#
    .SYNOPSIS
    Turns arbitrary text into a file or directory name that is valid on the
    filesystem and does not collide with an item already in $BasePath.

    .PARAMETER BasePath
    Directory in which the name must be unique. Defaults to the current
    location.

    .PARAMETER Text
    The text (for example, an article title) to derive the name from.

    .PARAMETER Extension
    Suffix appended to the generated name. Pass an empty string when the
    result will be used as a directory name.

    .OUTPUTS
    The generated name (without the base path) as a string.
    #>
    param(
        $BasePath = ".",
        $Text,
        $Extension = ".txt"
    )

    ## Remove invalid filesystem characters
    $invalidChars = [IO.Path]::GetInvalidFileNameChars()
    $invalidCharsRegex = "[" + (-join ($invalidChars | % { [Regex]::Escape($_) })) + "]"
    $baseFilename = [string] $Text -replace $invalidCharsRegex,'_'

    ## Avoid reserved device names. Windows also reserves these names when
    ## they are followed by an extension ("NUL.tar.gz" is still "NUL"), so the
    ## comparison is made against the part before the first dot.
    $reservedDeviceNames = -split "CON PRN AUX NUL COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9 LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9"
    $deviceNameCandidate = $baseFilename.Split('.')[0]
    if($deviceNameCandidate -in $reservedDeviceNames)
    {
        $baseFilename = "_" + $baseFilename
    }

    ## Avoid path length issues
    $baseFilename = $baseFilename.Substring(0, [Math]::Min(50, $baseFilename.Length))

    ## Avoid existing files. -LiteralPath is required here: names may contain
    ## '[' and ']', which the default (wildcard-aware) -Path parameter would
    ## interpret as a character-class pattern and so miss the existing file.
    $counter = 1
    $fileName = $baseFilename + $Extension
    while(Test-Path -LiteralPath (Join-Path $BasePath $fileName))
    {
        $fileName = $baseFilename + "_${counter}${Extension}"
        $counter++
    }

    # Emit the result
    $fileName
}

## Main script body.
##
## Streams the XML dump at $Path with an XmlReader. For every <page> it takes
## the <title> and <text> content, strips wiki markup down to plain text, and
## writes articles that pass the filters below to
## articles\<bucket directory>\<safe title>.txt. A new bucket directory is
## started each time the page counter reaches a multiple of 5,000.

## Root output directory. -Force lets a re-run against an existing "articles"
## directory proceed without raising an error.
$null = New-Item -Type Directory articles -Force

## Name of the current bucket directory under "articles"; empty / $null means
## "create a new one before writing the next article".
$basePath = ""

## Counts every page seen (including pages that end up being skipped)
$articleCounter = 1

$currentTitle = ''
$currentArticle = New-Object System.Text.StringBuilder

## Set while the reader is inside a <text> / <title> element respectively
$capturing = $false
$capturingTitle = $false

## Taken from enwiki-20160601, which had an average article size of 3456.7 bytes.
$estimatedArticleCount = (Get-Item $Path).Length / 3456.79917342699

## ProviderPath is the native filesystem path, which is what .NET expects
## (the PowerShell path may carry a PSDrive or provider prefix).
$xmlReader = [System.Xml.XmlReader]::Create( (Resolve-Path $Path).ProviderPath )

try
{
    while($xmlReader.Read())
    {
        switch ($xmlReader.NodeType)
        {
            'Element'
            {
                if($xmlReader.Name -eq 'Title')
                {
                    $capturingTitle = $true
                }
                elseif($xmlReader.Name -eq 'Text')
                {
                    $capturing = $true
                }
            }

            'Text'
            {
                if($capturingTitle)
                {
                    $currentTitle = $xmlReader.Value
                    $capturingTitle = $false
                }
                elseif($capturing)
                {
                    $null = $currentArticle.Append($xmlReader.Value)
                }
            }

            'EndElement'
            {
                if($xmlReader.Name -eq 'Page')
                {
                    if(($articleCounter % 1000) -eq 0)
                    {
                        ## The article count is only an estimate, so the raw
                        ## percentage can pass 100, which Write-Progress
                        ## rejects. Clamp it.
                        $percentComplete = 0
                        if($estimatedArticleCount -gt 0)
                        {
                            $percentComplete = [int] [Math]::Min(100.0, $articleCounter * 100 / $estimatedArticleCount)
                        }

                        Write-Progress "Processing article ${articleCounter}: $currentTitle" -PercentComplete $percentComplete
                    }

                    ## Start a new bucket directory every 5,000 pages
                    if(($articleCounter % 5000) -eq 0)
                    {
                        $basePath = $null
                    }

                    $output = $currentArticle.ToString()

                    ## Tables and templates nest, and the patterns only match
                    ## innermost instances, so repeat until nothing is left.
                    do
                    {
                        $foundmatch = $false

                        ## Remove tables
                        if($output -match "(?s){\|[^{}]+?\|}")
                        {
                            $foundmatch = $true
                            $output = $output -replace "(?s){\|[^{}]+?\|}",""
                        }

                        ## Remove {{cite ... }} and subheadings
                        if($output -match "(?s){{[^{}]+?}}")
                        {
                            $foundmatch = $true
                            $output = $output -replace "(?s){{[^{}]+?}}",""
                        }
                    } while($foundmatch)

                    ## Remove self-closing <ref ... /> tags first, confined to
                    ## a single tag. Otherwise the paired pattern below would
                    ## start at a self-closing tag and swallow all article
                    ## text up to the next </ref>.
                    $output = $output -replace "(?s)<ref[^>]*?/>",""

                    ## Remove <ref some article></ref>
                    $output = $output -replace "(?s)<ref.*?</ref>",""

                    ## Remove <!-- Some comment -->, up to the real terminator
                    ## rather than the first '>' inside the comment
                    $output = $output -replace "(?s)<!--.*?-->",""

                    ## Replace [[Article Reference|Description]] with Description
                    $output = $output -replace '(?s)\[\[([^\[\]]+)\|([^\[\]]+)\]\]','$2'

                    ## Replace [[Article Reference]] with Article Reference
                    $output = $output -replace '(?s)\[\[([^\[\|\]]+)\]\]','$1'

                    ## Remove [[File ... ]]
                    $output = $output -replace '(?s)\[\[File.*?\]\]',''

                    ## Remove everything after "References"
                    $output = $output -replace "(?s)==References.*",""

                    ## Normalize line endings, and remove extraneous extra
                    ## newlines
                    $output = $output -replace "\n","`r`n"
                    $output = $output -replace "(`r`n){3,}","`r`n"

                    ## Clean up sequences of single quotes like '''Quoted'''
                    $output = $output -replace "'{2,}",'"'

                    ## Final cleanup
                    $output = $output.Trim()

                    if(
                        ## Skip articles that just redirect to other articles
                        ($output -notmatch "^#REDIRECT") -and

                        ## Skip very small articles
                        ($output.Length -gt 500) -and

                        ## Skip file metadata articles
                        ($currentTitle -notmatch "^FILE:") -and

                        ## Skip Wikipedia metadata articles
                        ($currentTitle -notmatch "^Wikipedia:") -and

                        ## Skip "category", "template", or "draft" articles
                        ($currentTitle -notmatch "^CATEGORY:|^TEMPLATE:|^DRAFT") -and

                        ## Skip "articles for deletion"
                        ($currentTitle -notmatch "Articles for deletion") -and

                        ## Skip "Spam link reports"
                        ($currentTitle -notmatch "Spam/LinkReports")
                    )
                    {
                        ## The bucket directory is named after the first
                        ## article written into it
                        if(-not $basePath)
                        {
                            $basePath = GetSafeFileName -BasePath "articles" -Text $currentTitle -Extension ""
                            $null = New-Item -Type Directory -Path (Join-Path articles $basePath)
                        }

                        $articleDirectory = Join-Path articles $basePath
                        $outputFile = GetSafeFilename -BasePath $articleDirectory $currentTitle

                        ## Write as UTF-8: Windows PowerShell's default
                        ## Set-Content encoding is a legacy non-Unicode one
                        ## and would corrupt non-ASCII article text.
                        Set-Content -LiteralPath (Join-Path $articleDirectory $outputFile) -Value $output -Encoding UTF8
                    }

                    $null = $currentArticle.Clear()
                    $articleCounter++
                }
                elseif($xmlReader.Name -eq 'Text')
                {
                    $capturing = $false
                }
            }
        }
    }
}
finally
{
    ## Release the handle on the dump file even if processing is interrupted
    $xmlReader.Close()
}