# ExtractPDFData.psm1
function Export-PDFDataTextWithLayout {
    <#
    .SYNOPSIS
    Extract text with lay-out from a PDF document.

    .DESCRIPTION
    Extract text with lay-out from a PDF document. The output will be a plain text document which
    tries to keep the text on the same place as in the normal document by using spaces to position
    it. Do note that all text is the same size so larger and smaller text could look wrong due to
    them being resized.

    .PARAMETER Path
    The path of the PDF file. Relative paths are resolved against the current PowerShell location.

    .PARAMETER Page
    A page number or array of page numbers (1-based). When left empty all pages will be exported.

    .OUTPUTS
    The extracted text, one item per exported page, in document order.

    .EXAMPLE
    Extract the text from all pages.
    PS> Export-PDFDataTextWithLayout -Path .\file.pdf

    .EXAMPLE
    Extract page 1 and 2 from the file.
    PS> Export-PDFDataTextWithLayout -Path .\file.pdf -Page 1,2
    #>
    [CmdLetBinding()]
    [OutputType([System.Object[]])]
    Param (
        [Parameter(Mandatory = $true, Position = 1)]
        [String] $Path,

        [Parameter(Mandatory = $false, Position = 2)]
        [Int32[]] $Page
    )

    # Send telemetry
    Send-THEvent -ModuleName "ExtractPDFData" -EventName "Export-PDFDataTextWithLayout"

    # .NET resolves relative paths against the process working directory, which is not kept in
    # sync with the PowerShell location, so hand the library a fully resolved path.
    # This also fails early with a clear error when the file does not exist.
    $resolvedPath = (Resolve-Path -LiteralPath $Path -ErrorAction Stop).ProviderPath

    # Load the document
    $document = [Spire.Pdf.PdfDocument]::new($resolvedPath)
    try {
        # List all pages if not entered
        if ($null -eq $Page -or $Page.Count -eq 0) {
            $Page = 1..($document.Pages.Count)
        }

        # Convert every requested page; $pageindex tracks the 1-based page number
        $pageindex = 1
        $convertedfile = @(
            foreach ($pdfPage in $document.Pages) {
                if ($pageindex -in $Page) {
                    $pdfPage.ExtractText()
                }
                $pageindex++
            }
        )
    }
    finally {
        # Release the document (and the underlying file) even when extraction fails
        $document.Close()
    }

    return $convertedfile
}

# ===================================================================
# ================== INSTALL NUGET PACKAGE ==========================
# ===================================================================
$loaded = $false
$installed = $false
$package = Get-Package -Name FreeSpire.PDF -ErrorAction SilentlyContinue
if ($package) {
    $installed = $true
    if ([System.AppDomain]::CurrentDomain.GetAssemblies() | Where-Object { $_.FullName -like "Spire.PDF*" }) {
        $loaded = $true
    }
}

if (-not $installed) {
    # Install the package in CurrentUser scope to make sure no elevation is needed
    $null = Install-Package -Name FreeSpire.PDF -Source "https://www.nuget.org/api/v2" -SkipDependencies -Confirm:$false -Force -Scope CurrentUser
    $package = Get-Package -Name FreeSpire.PDF
}

if (-not $loaded) {
    # Open the package (a zip archive) and look for the right dll
    $zip = [System.IO.Compression.ZipFile]::Open($package.Source, "Read")
    try {
        $file = $zip.Entries |
            Where-Object { $_.FullName -like "*/net6.0/Spire.Pdf.dll" } |
            Select-Object -First 1
        if (-not $file) {
            throw "Could not find net6.0/Spire.Pdf.dll in package '$($package.Source)'."
        }

        # Read the file to memory
        $memStream = [System.IO.MemoryStream]::new()
        $entryStream = $file.Open()
        try {
            $entryStream.CopyTo($memStream)
            [byte[]]$bytes = $memStream.ToArray()
        }
        finally {
            # Close the file
            $entryStream.Dispose()
            $memStream.Dispose()
        }
    }
    finally {
        # Close the package
        $zip.Dispose()
    }

    # Load the assembly from memory
    $null = [System.Reflection.Assembly]::Load($bytes)
}

# ===================================================================
# ================== TELEMETRY ======================================
# ===================================================================
# Create env variables
# Use the invert of the default PowerShell telemetry setting: opt in unless the user opted out
# of PowerShell telemetry. Environment variables are strings, so compare against the opt-out
# values explicitly instead of relying on truthiness ("0" and "false" are non-empty strings).
$Env:EXTRACTPDFDATA_TELEMETRY_OPTIN = (-not ($Env:POWERSHELL_TELEMETRY_OPTOUT -in @("1", "true", "yes")))

# Set up the telemetry
Initialize-THTelemetry -ModuleName "ExtractPDFData"
Set-THTelemetryConfiguration -ModuleName "ExtractPDFData" -OptInVariableName "EXTRACTPDFDATA_TELEMETRY_OPTIN" -StripPersonallyIdentifiableInformation $true -Confirm:$false
Add-THAppInsightsConnectionString -ModuleName "ExtractPDFData" -ConnectionString "InstrumentationKey=df9757a1-873b-41c6-b4a2-2b93d15c9fb1;IngestionEndpoint=https://westeurope-5.in.applicationinsights.azure.com/;LiveEndpoint=https://westeurope.livediagnostics.monitor.azure.com/"

# Create a message about the telemetry
Write-Information ("Telemetry for ExtractPDFData module is $(if([string] $Env:EXTRACTPDFDATA_TELEMETRY_OPTIN -in ("no","false","0")){"NOT "})enabled. Change the behavior by setting the value of " + '$Env:EXTRACTPDFDATA_TELEMETRY_OPTIN') -InformationAction Continue

# Send a metric for the installation of the module
Send-THEvent -ModuleName "ExtractPDFData" -EventName "Import Module ExtractPDFData"