Documentarian.psm1
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

using namespace System.Management.Automation

#region Enums.Public

# Names of Markdown parser extensions.
# NOTE(review): these appear to mirror the Markdig extension names (Get-Document in this module
# configures Markdig with 'Advanced+Yaml') - confirm against the Markdig documentation.
enum MarkdownExtension {
    Advanced
    PipeTables
    GfmPipeTables
    EmphasisExtras
    ListExtras
    HardLineBreak
    Footnotes
    Footers
    Citations
    Attributes
    GridTables
    Abbreviations
    Emojis
    DefinitionLists
    CustomContainers
    Figures
    Mathematics
    Bootstrap
    MediaLinks
    SmartyPants
    AutoIdentifiers
    TaskLists
    Diagrams
    NoFollowLinks
    NoOpenerLinks
    NoReferrerLinks
    NoHtml
    Yaml
    NonAsciiNoEscape
    AutoLinks
    Globalization
}

# The syntactic form of a Markdown link. The member names are load-bearing: DocumentLink
# classifies links by matching on the 'Text'/'Image'/'Reference' prefixes and the
# 'Reference'/'SelfReference' suffixes of these names.
enum LinkKind {
    TextInline           # [<Text>](<Definition>)
    TextSelfReference    # [<Text>]
    TextUsingReference   # [<Text>][<Reference>]
    ImageInline          # ![<AltText>](<Definition>)
    ImageSelfReference   # ![<AltText>]
    ImageUsingReference  # ![<AltText>][<Reference>]
    ReferenceDefinition  # [<Name>]: <Definition>
}

#endregion Enums.Public

#region Classes.Public

# Holder for the regular expressions used to find Markdown links. Patterns are assembled from
# arrays of fragments (joined with no separator) so every fragment can carry its own comment.
[Diagnostics.CodeAnalysis.SuppressMessageAttribute(
    <#Category#>'PSUseConsistentIndentation',
    <#CheckId#>$null,
    Justification = 'Easier readability for regex strings from arrays'
)]
class ParsingPatterns {
    # Anything following this is the start of a valid inline/block for Markdown. The value can be
    # used to find code fence openings, link reference definitions, etc.
    static [string] $LineLead = @(
        '^'                            # Anchors to start of line
        '(?<Lead>'                     # Lead captures whitespace + block notation
        '(?<LeadingWhiteSpace>\s*)'    # May start with any amount of leading whitespace
        '(?<BlockNotation>'            # May be in a list or blockquote
        "(?'BlockQuoteBefore'>\s+)*"   # Blockquote, like '> ```', '> > ```', etc.
        '(?<ListNotation>'             # A list can follow a block quote
        '(?<OrderedList>\d+\. )'       # Ordered list, like '1. ```' or '15. ```'
        '|'                            # or
        '(?<UnorderedList>[-+*] )'     # Unordered list, like '- ```', '* ```', or '+ ```'
        ')?'                           # Close ListNotation; the list marker is optional
        "(?'BlockQuoteAfter'>\s+)?"    # Blockquotes can come after a list, too, but only once
        ')?'                           # Doesn't need to have a block
        ')'                            # Close lead capture group
    ) -join ''

    # Returns a pattern for finding everything inside square brackets using balance groups with an
    # optional name. To use the same pattern more than once in a regex, the balance groups need
    # unique names.
    #
    # $BalanceGroupName is appended to 'Open' to name the opening group; the closing group is
    # named 'Close' when the name is null/whitespace, otherwise it takes the name itself (so the
    # caller can wrap the result in a capture group of that same name).
    static [string] InSquareBrackets([string]$BalanceGroupName) {
        $OpenGroup = "Open$BalanceGroupName"
        $CloseGroup = [string]::IsNullOrWhiteSpace($BalanceGroupName) ? 'Close' : $BalanceGroupName

        return @(
            '(?:'                               # Open bracket group finder
            '(?:'                               # Start of the "opening bracket" alternative
            "(?<$OpenGroup>\[)"                 # Open balance group starts with [
            '(?:`` .* ``|`[^`]*`|[^\[\]])*'     # Anything inside inline code or not-[]
            ')+'                                # At least one
            '(?:'                               # Start of the "closing bracket" alternative
            "(?<$CloseGroup-$OpenGroup>\])"     # Push to stack on ]
            '(?:`` .* ``|`[^`]*`|[^\[\]])*?'    # Anything inside inline code or not-[]
            ')+?'                               # At least one
            ')+?'                               # Must match at least once
            "(?($OpenGroup)(?!))"               # If open exists (ie not-matching # of ]), fail
        ) -join ''
    }

    # This will return the whole thing, need to trim [] from start and end for
    # text. Need to reparse text for nested.
    # Parameterless overload: delegates to InSquareBrackets('') and so uses the default
    # 'Open'/'Close' balance group names.
    static [string] InSquareBrackets() {
        return [ParsingPatterns]::InSquareBrackets('')
    }

    # Double backticks are difficult - we can't reuse capture groups inside a
    # pattern to know that we're closing the right one. We'll just assume it's
    # always `` ... `` for now. Theoretically you could nest them but that
    # seems like a tiny edge case for most documents.
    static [string] $MultitickInlineCode = @(
        '(?<open>`{2,}) '     # Multi-backtick inline code opens with 2+ backticks and a space
        '(?<text>(?:'         # Capture everything until the code closes, don't capture sub-group
        '.(?!\k<open>))*.'    # Anything not followed by the code closer, then that character too
        ')'                   # Close the text capture group
        '\k<open>'            # The code is closed only by the same number of backticks it opened with.
    ) -join ''

    # Finds the opening for a codefence with leading components and the actual fence. This is useful
    # so we can effectively find the matching closing fence.
static [string] $OpenCodeFence = @(
        [ParsingPatterns]::LineLead    # Retrieves any leading whitespace/block syntax
        '(?<Fence>`{3,}|~{3,})'        # Fences can be backticks or tildes, don't care about after.
    ) -join ''

    # Finds any comment block, including unclosed comment blocks so we know if a multi-line comment
    # is starting. Used for ignoring otherwise valid syntax.
    #
    # The comment body may be empty, so an opener at the very end of a line ('<!--') and an empty
    # comment ('<!---->') are both recognized. CloseComment is unsuccessful for an unclosed comment.
    static [string] $HtmlCommentBlock = @(
        "(?'OpenComment'<!--)"
        "(?'InComments'(?:(?!-->).)*)"    # Every character up to (not including) the first '-->'
        "(?'CloseComment'-->)?"
    ) -join ''

    # Only used for discovering the closure point of a multi-line HTML comment so we can ignore
    # otherwise valid syntax that comes before the closure.
    #
    # InComments may be empty so that a line which begins with '-->' closes the comment.
    static [string] $ClosingMultiLineHtmlComment = @(
        '^'
        "(?'InComments'(?:(?!-->).)*)"    # Every character up to (not including) the first '-->'
        "(?'CloseComment'-->)"
        "(?'AfterComment'.*)"
    ) -join ''

    # Builds a pattern that matches $InnerPattern with a backtick somewhere before it and a
    # backtick somewhere after it, with no other backticks in between.
    # NOTE: the backticks it finds are not guaranteed to belong to the same code span - text that
    # sits between two separate code spans also matches.
    static [string] InsideSingleBacktick([string]$InnerPattern) {
        return @(
            '`[^`]*'         # Opening backtick followed by zero-or-more not-backticks
            $InnerPattern    # Inner regex between close and open
            '[^`]*`'         # zero-or-more not-backticks followed by closing backtick
        ) -join ''
    }

    # Helper method for discovering whether a string of text is inside any inline code blocks
    # for a given line of Markdown.
    #
    # Parameters:
    #   Line      - A single line of Markdown.
    #   FullMatch - The literal text (for example a link's Markdown) to look for.
    # Returns $false when some inline code span on the line contains FullMatch, otherwise $true.
    #
    # Code spans are enumerated left to right (multi-backtick spans first, then single-backtick
    # spans in what remains) and each span is checked on its own. This keeps text that merely sits
    # between two code spans, like the link in '`a` [x](y) `b`', from being reported as code.
    # The check is textual, not positional: if the same text appears both inside and outside a
    # code span on one line, it is reported as inside.
    static [bool] NotInsideInlineCode([string]$Line, [string]$FullMatch) {
        $MultitickPattern = [ParsingPatterns]::MultitickInlineCode
        $IgnoreCase = [System.StringComparison]::OrdinalIgnoreCase

        # If the text is inside a multitick codeblock, it's in a codeblock.
        foreach ($CodeSpan in [regex]::Matches($Line, $MultitickPattern)) {
            if ($CodeSpan.Value.IndexOf($FullMatch, $IgnoreCase) -ge 0) {
                return $false
            }
        }

        # Drop the multitick spans so their backticks can't pair up with single backticks, then
        # check each remaining single-backtick span.
        $Remainder = [regex]::Replace($Line, $MultitickPattern, ' ')
        foreach ($CodeSpan in [regex]::Matches($Remainder, '`[^`]*`')) {
            if ($CodeSpan.Value.IndexOf($FullMatch, $IgnoreCase) -ge 0) {
                return $false
            }
        }

        # The text wasn't inside any codeblocks
        return $true
    }

    # Needed to make it easier to read the combined pattern; also reused for reference definitions.
    static [string] $LinkDefinition = @(
        '(?<Destination>\S+)'    # The URL component, capture any non-whitespace
        '(?:\s+(?:'              # The title component, leads with non-captured whitespace
        "'(?<Title>[^']*)'"      # May be wrapped in non-captured single-quotes
        '|'                      # or
        '"(?<Title>[^"]*)"'      # May be wrapped in non-captured double-quotes
        '))?'                    # Make sure title is optional.
    ) -join ''

    # Finds a Markdown link in a given line. This pattern is likely hugely non-performant on a
    # non-split document. It's also not codeblock-aware. Only use it on a single line known not
    # to be inside a Markdown codeblock.
    static [string] $Link = @(
        '(?<IsImage>!)?'    # If the link has a ! prefix, it's an image
        "(?<Text>$(
            [ParsingPatterns]::InSquareBrackets('Text')    # Need to retrieve the text inside the brackets.
        ))"
        '(?!:)'             # Ignore if followed by colon - that's a ref def
        '(?:'               # Text can be followed by an inline def/ref/null
        "\($(
            [ParsingPatterns]::LinkDefinition    # Inline Definition, optional destination/title
        )\)"
        '|'                 # or
        "(?<ReferenceID>$(
            [ParsingPatterns]::InSquareBrackets('ReferenceID')    # Need to retrieve the text inside the brackets.
        ))"
        ')?'                # The definition and reference syntax is optional
    ) -join ''

    # Finds a link reference definition, which can be inside a block.
static [string] $LinkReferenceDefinition = @(
        [ParsingPatterns]::LineLead    # Retrieves any leading whitespace/block syntax
        "(?<ReferenceID>$(
            [ParsingPatterns]::InSquareBrackets()    # Need to retrieve the text inside the brackets as ID
        ))"
        ':\s+'                         # Must be followed by a colon and at least one space
        [ParsingPatterns]::LinkDefinition    # Inline Definition, optional destination/title
    ) -join ''
}

# Location of a piece of Markdown: an optional file plus a line number and start column.
class Position {
    [System.IO.FileInfo]$FileInfo
    [int]$LineNumber
    [int]$StartColumn

    # Formats the position as '<line>:<column>', prefixed with '<file>:' when FileInfo is set.
    [string] ToString() {
        $StringValue = "$($this.LineNumber):$($this.StartColumn)"
        if ($null -ne $this.FileInfo) {
            $StringValue = "$($this.FileInfo):$StringValue"
        }

        return $StringValue
    }
}

# A single Markdown link (or link reference definition) together with where it was found.
class DocumentLink {
    [LinkKind] $Kind
    [string]   $Text
    [uri]      $Destination
    [string]   $Title
    [string]   $ReferenceID
    [Position] $Position
    [string]   $Markdown

    # Shorthand method for determining if this link is for an image
    [bool] IsImage() {
        return $this.Kind.ToString() -match '^Image'
    }

    # Shorthand method for determining if this link is for text
    [bool] IsText() {
        return $this.Kind.ToString() -match '^Text'
    }

    # Shorthand method for determining if this link has a reference
    [bool] HasReference() {
        return $this.Kind.ToString() -match 'Reference$'
    }

    # Shorthand method for determining if this link uses its own text as the reference
    [bool] IsSelfReferential() {
        return $this.Kind.ToString() -match 'SelfReference$'
    }

    # Shorthand method for determining if this link is a reference
    [bool] IsReference() {
        return $this.Kind.ToString() -match '^Reference'
    }

    DocumentLink() {
        # Re-defined to support alternate constructors
    }

    # Generate a document link from a match group for [ParsingPatterns]::Link. The line number
    # defaults to 0.
    DocumentLink([System.Text.RegularExpressions.Group]$LinkMatch) {
        # PowerShell classes can't chain constructors, and calling [DocumentLink]::New() here would
        # only build (and throw away) a second object - so populate this instance directly.
        $this.Initialize($LinkMatch, 0)
    }

    # Generate a document link from a match group for [ParsingPatterns]::Link
    DocumentLink([System.Text.RegularExpressions.Group]$LinkMatch, [int]$LineNumber) {
        $this.Initialize($LinkMatch, $LineNumber)
    }

    # Shared constructor logic: fills every property of this instance from a match of
    # [ParsingPatterns]::Link. StartColumn is the match index plus one (1-based).
    hidden [void] Initialize([System.Text.RegularExpressions.Group]$LinkMatch, [int]$LineNumber) {
        $this.Position = [Position]@{
            FileInfo    = $null
            LineNumber  = $LineNumber
            StartColumn = $LinkMatch.Index + 1
        }
        $this.Text = [DocumentLink]::TrimSquareBrackets(
            $LinkMatch.Groups.Where({ $_.Name -eq 'Text' }).Value
        )
        $this.Destination = $LinkMatch.Groups.Where({ $_.Name -eq 'Destination' }).Value
        $this.Title = $LinkMatch.Groups.Where({ $_.Name -eq 'Title' }).Value
        $this.ReferenceID = [DocumentLink]::TrimSquareBrackets(
            $LinkMatch.Groups.Where({ $_.Name -eq 'ReferenceID' }).Value
        )
        $this.Markdown = $LinkMatch.Value

        $IsImage = $LinkMatch.Groups.Where({ $_.Name -eq 'IsImage' }).Value -eq '!'
        $IsInline = ![string]::IsNullOrWhiteSpace($this.Destination)
        $HasReference = ![string]::IsNullOrWhiteSpace($this.ReferenceID)

        # An inline destination wins over a reference; with neither, the link refers to itself.
        if ($IsImage -and $IsInline) {
            $this.Kind = [LinkKind]::ImageInline
        } elseif ($IsImage -and $HasReference) {
            $this.Kind = [LinkKind]::ImageUsingReference
        } elseif ($IsImage) {
            $this.Kind = [LinkKind]::ImageSelfReference
        } elseif ($IsInline) {
            $this.Kind = [LinkKind]::TextInline
        } elseif ($HasReference) {
            $this.Kind = [LinkKind]::TextUsingReference
        } else {
            $this.Kind = [LinkKind]::TextSelfReference
        }
    }

    # Trim square brackets when using balance groups, like to find text and reference IDs.
    # Returns the text between one leading '[' and one trailing ']', or the input unchanged when
    # it isn't wrapped in brackets.
    static [string] TrimSquareBrackets([string]$Text) {
        if ($Text -match '^\[(?<Inner>.*)\]$') {
            return $Matches.Inner
        }

        return $Text
    }

    # Parses a file's content for Markdown links, parsing one line at a time to support ignoring any
    # links in multiline codeblocks or comments, and ensuring the returned objects have the FileInfo
    # property defined with the input file's values.
    static [DocumentLink[]] Parse([System.IO.FileInfo]$FileInfo) {
        # FullName is an exact path, so read it literally; -Path would treat characters such as
        # '[' and ']' in a file name as wildcards.
        $Content = Get-Content -Raw -LiteralPath $FileInfo.FullName
        [DocumentLink[]]$Links = [DocumentLink]::Parse($Content) | ForEach-Object -Process {
            # Add the file info to each link
            $_.Position.FileInfo = $FileInfo
            # Emit the link for the list
            $_
        }

        return $Links
    }

    # Parses an arbitrary block of text for Markdown links, parsing one line at a time to support
    # ignoring any links in multiline codeblocks or comments.
static [DocumentLink[]] Parse([string]$Markdown) {
        [DocumentLink[]]$Links = @()
        [DocumentLink[]]$DiscoveredLinks = @()

        # Split on CRLF, LF, or a lone CR. (A backtick escape such as `r is not expanded inside a
        # single-quoted string; it would make the regex split on a literal backtick followed by
        # 'r', breaking lines in the middle of inline code.)
        $Lines = $Markdown -split '\r?\n|\r'
        $InCodeFence = $false     # This is set to true when a code fence opens to ignore lines til close
        $CodeFenceClose = $null   # This is defined when a code fence is found and nulled when closed
        $InCommentBlock = $false  # This is set to true when a comment block opens without closing

        for ($i = 1; $i -le $Lines.Count; $i++) {
            $CommentBlocks = @()        # This holds the enclosed comment blocks for a line
            $IgnoreAfterIndex = $null   # Points to a comment block that doesn't close on this line
            $IgnoreBeforeIndex = $null  # Points to closing of a multi-line comment block
            $LinkMatches = $null        # Holds discovered links on this line
            $Line = $Lines[$i - 1]      # Editors/humans use a 1-index array for file lines

            # Before we process anything else, check if we're in a code fence and closing it
            if ($InCodeFence) {
                if ($Line -eq $CodeFenceClose) {
                    $InCodeFence = $false
                    $CodeFenceClose = $null
                }
                # Regardless whether this line closes the code fence, no valid links can be here.
                continue
            } elseif ($InCommentBlock) {
                # If we're not in a code fence, we might be in a comment block and need to see if
                # it closes. A line that doesn't close the comment is entirely inside it, so it
                # can't hold valid links and is skipped.
                $CloseIndex = $Line.IndexOf('-->', [System.StringComparison]::Ordinal)
                if ($CloseIndex -lt 0) {
                    continue
                }
                # The comment closes here: ignore links that start before the end of the '-->'.
                $InCommentBlock = $false
                $IgnoreBeforeIndex = $CloseIndex + 3
            }

            # Look for new HTML comments. We need to capture fully enclosed comments and mark any
            # unclosed comments so we can ignore links in comments. We can have any number of
            # comments on a line.
            $HtmlCommentMatches = [regex]::Matches($Line, [ParsingPatterns]::HtmlCommentBlock)
            if ($HtmlCommentMatches.Count) {
                $CommentBlocks = $HtmlCommentMatches.Groups |
                    Where-Object { $_.Name -eq 'InComments' } |
                    Select-Object -ExpandProperty Value

                $UnclosedHtmlComment = $HtmlCommentMatches | Where-Object {
                    $_.Groups | Where-Object { $_.Name -eq 'CloseComment' -and (-not $_.Success) }
                } | Select-Object -First 1

                # An opener shown as inline code (like `<!--`) doesn't start a comment. Without
                # this guard, every following line would be skipped until a '-->' turned up.
                if ($UnclosedHtmlComment -and [ParsingPatterns]::NotInsideInlineCode($Line, '<!--')) {
                    $IgnoreAfterIndex = $UnclosedHtmlComment.Index
                    $InCommentBlock = $true
                }
            }

            # If the line opens a code fence, capture the closing pattern and continue
            if ($Line -match [ParsingPatterns]::OpenCodeFence) {
                $InCodeFence = $true
                # The closing fence is expected at the same lead, with list markers blanked out.
                $CodeFenceClose = @(
                    $Matches.Lead -replace '([0-9]|\.|-|\+|\*)', ' '
                    $Matches.Fence
                ) -join ''
                continue
            }

            # Check for link references first - less expensive and no valid links follow them.
            if ($Line -match [ParsingPatterns]::LinkReferenceDefinition) {
                $ReferenceMatchInfo = $Matches
                $FullMatch = $ReferenceMatchInfo[0]
                if ([ParsingPatterns]::NotInsideInlineCode($Line, $FullMatch)) {
                    $Properties = @{
                        Position    = [Position]@{
                            LineNumber  = $i
                            # NOTE(review): other links use a 1-based column (match index + 1);
                            # this is the 0-based offset of the definition - confirm the intent.
                            StartColumn = $ReferenceMatchInfo.Lead.Length
                        }
                        ReferenceID = [DocumentLink]::TrimSquareBrackets($ReferenceMatchInfo.ReferenceID)
                        Destination = $ReferenceMatchInfo.Destination
                        Title       = $ReferenceMatchInfo.Title
                        Markdown    = $FullMatch
                        Kind        = [LinkKind]::ReferenceDefinition
                    }
                    $DiscoveredLinks += [DocumentLink]$Properties
                }
                # Reset before next line
                $ReferenceMatchInfo = $null
                continue
            }

            # Find all links in the line, ignoring them if in comment blocks or code
            if ($LinkMatches = [regex]::Matches($Line, [ParsingPatterns]::Link)) {
                foreach ($LinkMatch in $LinkMatches) {
                    $FullMatch = $LinkMatch.Value
                    $Index = $LinkMatch.Index
                    $NotInsideComment = $true

                    # If there was an unclosed comment block on this line, ignore links after it
                    # started. Compare against $null: the comment may start at index 0.
                    if (($null -ne $IgnoreAfterIndex) -and ($Index -gt $IgnoreAfterIndex)) {
                        $NotInsideComment = $false
                    }
                    # If this line closed a multi-line comment block, ignore links before it
                    # closed. A link starting exactly where the '-->' ends is outside the comment.
                    if (($null -ne $IgnoreBeforeIndex) -and ($Index -lt $IgnoreBeforeIndex)) {
                        $NotInsideComment = $false
                    }
                    # If this line had closed comment blocks, ignore links inside them
                    foreach ($Block in $CommentBlocks) {
                        if ($Block -match [regex]::Escape($FullMatch)) {
                            $NotInsideComment = $false
                        }
                    }

                    $NotInsideInlineCode = [ParsingPatterns]::NotInsideInlineCode($Line, $FullMatch)

                    if ($NotInsideComment -and $NotInsideInlineCode) {
                        $Link = [DocumentLink]::New($LinkMatch, $i)
                        $DiscoveredLinks += $Link

                        # Look for nested links, setting their position relative to their parent
                        if (![string]::IsNullOrWhiteSpace($Link.Text)) {
                            if ($NestedLinks = [DocumentLink]::ParseNested($Link.Text, 1, 5)) {
                                foreach ($NestedLink in $NestedLinks) {
                                    $NestedLink.Position.LineNumber = $Link.Position.LineNumber
                                    $NestedLink.Position.StartColumn += $Link.Position.StartColumn
                                    $DiscoveredLinks += $NestedLink
                                }
                            }
                        }
                    }
                }
            }
        }

        # Need to discard self-reference links without a definition - they're technically
        # not links at all.
        $ReferenceDefinitions = $DiscoveredLinks | Where-Object -FilterScript { $_.IsReference() }
        foreach ($Link in $DiscoveredLinks) {
            if (!$Link.IsSelfReferential() -or ($Link.Text -in $ReferenceDefinitions.ReferenceID)) {
                $Links += $Link
            }
        }

        return $Links
    }

    # Finds links nested inside another link's text (for example an image used as link text).
    #
    # Parameters:
    #   LinkText - The text of the parent link, without its surrounding square brackets.
    #   Depth    - The current nesting level; the top-level caller passes 1.
    #   MaxDepth - Recursion stops (returning no links) once Depth exceeds this value.
    # Returned links have LineNumber 0 and a StartColumn relative to LinkText; the caller is
    # responsible for offsetting them by the parent's position.
    hidden static [DocumentLink[]] ParseNested([string]$LinkText, [int]$Depth, [int]$MaxDepth) {
        [DocumentLink[]]$Links = @()
        $CommentBlocks = @()

        if ($Depth -gt $MaxDepth) {
            return $Links
        }

        # Collect the contents of any HTML comments inside the text so links in them are ignored.
        $HtmlCommentMatches = [regex]::Matches($LinkText, [ParsingPatterns]::HtmlCommentBlock)
        if ($HtmlCommentMatches.Count) {
            $CommentBlocks = $HtmlCommentMatches.Groups |
                Where-Object { $_.Name -eq 'InComments' } |
                Select-Object -ExpandProperty Value
        }

        # Find all links in the text, ignoring them if in comment blocks or code
        if ($LinkMatches = [regex]::Matches($LinkText, [ParsingPatterns]::Link)) {
            foreach ($LinkMatch in $LinkMatches) {
                $FullMatch = $LinkMatch.Value
                $NotInsideComment = $true

                # If the text had comment blocks, ignore links inside them
                foreach ($Block in $CommentBlocks) {
                    if ($Block -match [regex]::Escape($FullMatch)) {
                        $NotInsideComment = $false
                    }
                }

                $NotInsideInlineCode = [ParsingPatterns]::NotInsideInlineCode($LinkText, $FullMatch)

                if ($NotInsideComment -and $NotInsideInlineCode) {
                    $Link = [DocumentLink]::New($LinkMatch, 0)
                    $Links += $Link

                    # Look for nested links, setting their position relative to their parent
                    if (![string]::IsNullOrWhiteSpace($Link.Text)) {
                        if ($NestedLinks = [DocumentLink]::ParseNested($Link.Text, ($Depth + 1), $MaxDepth)) {
                            foreach ($NestedLink in $NestedLinks) {
                                $NestedLink.Position.LineNumber = $Link.Position.LineNumber
                                $NestedLink.Position.StartColumn += $Link.Position.StartColumn
                                $Links += $NestedLink
                            }
                        }
                    }
                }
            }
        }

        return $Links
    }

    # Returns the links that neither use a reference nor are reference definitions.
    hidden static [DocumentLink[]] FilterForInlineLinks([DocumentLink[]]$Links) {
        return $Links.Where({ -not ($_.HasReference() -or $_.IsReference()) })
    }

    # Returns the links that use a reference, including self-referential links.
    hidden static [DocumentLink[]] FilterForReferenceLinks([DocumentLink[]]$Links) {
        return $Links.Where({ $_.HasReference() })
    }

    # Returns the links that use their own text as the reference.
    hidden static [DocumentLink[]] FilterForSelfReferentialLinks([DocumentLink[]]$Links) {
        return $Links.Where({ $_.IsSelfReferential() })
    }

    # Returns the link reference definitions.
    hidden static [DocumentLink[]] FilterForReferenceDefinitions([DocumentLink[]]$Links) {
        return $Links.Where({ $_.IsReference() })
    }

    # Returns both the links that use a reference and the link reference definitions.
    hidden static [DocumentLink[]] FilterForReferenceLinksAndDefinitions([DocumentLink[]]$Links) {
        return $Links.Where({ $_.HasReference() -or $_.IsReference() })
    }
hidden static [DocumentLink[]] FilterForUndefinedReferenceLinks([DocumentLink[]]$Links) {
        # Resolve the defined IDs once instead of once per link.
        $DefinedIDs = [DocumentLink]::FilterForReferenceDefinitions($Links).ReferenceID

        return [DocumentLink]::FilterForReferenceLinks($Links).Where({
                # A self-referential link uses its own text as the reference ID.
                $ReferenceID = $_.IsSelfReferential() ? $_.Text : $_.ReferenceID
                $ReferenceID -notin $DefinedIDs
            })
    }

    # Returns the reference definitions that no reference link and no self-referential link uses.
    hidden static [DocumentLink[]] FilterForUnusedReferenceLinkDefinitions([DocumentLink[]]$Links) {
        # Resolve the used IDs once instead of once per definition.
        $UsedReferenceIDs = [DocumentLink]::FilterForReferenceLinks($Links).ReferenceID
        $SelfReferenceTexts = [DocumentLink]::FilterForSelfReferentialLinks($Links).Text

        return [DocumentLink]::FilterForReferenceDefinitions($Links).Where({
                ($_.ReferenceID -notin $UsedReferenceIDs) -and
                ($_.ReferenceID -notin $SelfReferenceTexts)
            })
    }

    # Returns the reference links and definitions, minus undefined links and unused definitions.
    hidden static [DocumentLink[]] FilterForValidReferenceLinksAndDefinitions([DocumentLink[]]$Links) {
        $InvalidReferences = (
            [DocumentLink]::FilterForUndefinedReferenceLinks($Links) +
            [DocumentLink]::FilterForUnusedReferenceLinkDefinitions($Links)
        )

        return [DocumentLink]::FilterForReferenceLinksAndDefinitions($Links).Where({
                $_ -notin $InvalidReferences
            })
    }
}

# Argument transformation that lets LinkKind parameters accept enum values, exact enum names,
# or wildcard expressions that match one or more enum names.
class LinkKindTransformAttribute : ArgumentTransformationAttribute {
    # Converts each input item: LinkKind values pass through, strings resolve to the exact or
    # wildcard-matching enum names. Throws ArgumentTransformationMetadataException for a string
    # that matches nothing and for any other input type (including null).
    [object] Transform([EngineIntrinsics]$engineIntrinsics, [System.Object]$inputData) {
        $ValidEnums = [LinkKind].GetEnumNames()
        $outputData = switch ($inputData) {
            { $_ -is [LinkKind] } { $_ }

            { $_ -is [string] } {
                if ($_ -in $ValidEnums) {
                    $_
                } elseif ($Matching = $ValidEnums -like $_) {
                    $Matching
                } else {
                    $Message = @(
                        "Specified kind '$_' couldn't resolve to any LinkKind enums;"
                        'values must be a specific LinkKind or a wildcard expression'
                        "(containing '*', '?', or '[]') matching one or more LinkKind."
                        "Valid LinkKind enums are: $ValidEnums"
                    ) -join ' '
                    throw [ArgumentTransformationMetadataException]::New(
                        $Message
                    )
                }
            }

            default {
                # Null input has no type to report; calling GetType() on it would raise an
                # unrelated error instead of the intended exception.
                $TypeName = if ($null -eq $_) { 'null' } else { $_.GetType().FullName }
                $Message = @(
                    "Could not convert input ($_) of type '$TypeName' to a LinkKind."
                    "Specify a valid LinkKind or a wildcard expression (containing '*', '?', or '[]')"
                    "matching one or more LinkKind enums. Valid LinkKind enums are: $ValidEnums"
                ) -join ' '
                throw [ArgumentTransformationMetadataException]::New(
                    $Message
                )
            }
        }

        return $outputData
    }
}

# A Markdown file's raw content, parsed syntax tree, front matter, body, and discovered links.
class ParsedDocument {
    [System.IO.FileInfo]$FileInfo
    [string]$RawContent
    [Markdig.Syntax.MarkdownDocument]$ParsedMarkdown
    [System.Collections.Specialized.OrderedDictionary]$FrontMatter
    [string]$Body
    [DocumentLink[]]$Links
    hidden [bool]$HasParsedLinks

    ParsedDocument() {}

    # Parses the links out of Body, stamps each with this document's FileInfo, and caches them
    # in Links.
    hidden ParseLinksFromBody() {
        # Capture the property in a local: a bare $FileInfo is not defined inside a class method,
        # so the links would otherwise be stamped with $null.
        $File = $this.FileInfo
        $this.Links = [DocumentLink]::Parse($this.Body) | ForEach-Object -Process {
            # Add the file info to each link
            $_.Position.FileInfo = $File
            # Emit the link for the list
            $_
        }
        $this.HasParsedLinks = $true
    }

    # Returns the links, parsing them first if that hasn't happened yet.
    [DocumentLink[]] ParsedLinks() {
        if (!$this.HasParsedLinks) {
            $this.ParseLinksFromBody()
        }

        return $this.Links
    }

    # Returns the links, re-parsing them when $Force is set or they haven't been parsed yet.
    [DocumentLink[]] ParsedLinks([bool]$Force) {
        if (!$this.HasParsedLinks -or $Force) {
            $this.ParseLinksFromBody()
        }

        return $this.Links
    }

    # The methods below filter the cached Links property; they do not trigger parsing.
    [DocumentLink[]] InlineLinks() {
        return [DocumentLink]::FilterForInlineLinks($this.Links)
    }

    [DocumentLink[]] ReferenceLinks() {
        return [DocumentLink]::FilterForReferenceLinks($this.Links)
    }

    [DocumentLink[]] ReferenceDefinitions() {
        return [DocumentLink]::FilterForReferenceDefinitions($this.Links)
    }

    [DocumentLink[]] ReferenceLinksAndDefinitions() {
        return [DocumentLink]::FilterForReferenceLinksAndDefinitions($this.Links)
    }

    [DocumentLink[]] UndefinedReferenceLinks() {
        return [DocumentLink]::FilterForUndefinedReferenceLinks($this.Links)
    }

    [DocumentLink[]] UnusedReferenceLinkDefinitions() {
        return [DocumentLink]::FilterForUnusedReferenceLinkDefinitions($this.Links)
    }

    [DocumentLink[]] ValidReferenceLinksAndDefinitions() {
        return [DocumentLink]::FilterForValidReferenceLinksAndDefinitions($this.Links)
    }

    # Renders Body through ConvertFrom-Markdown as a VT100-encoded string.
    [string] ToDecoratedString() {
        return $this.Body |
            ConvertFrom-Markdown -AsVT100EncodedString |
            Select-Object -ExpandProperty VT100EncodedString
    }
}

#endregion Classes.Public

#region Functions.Private

# Emits the lines between the first two '---' fence lines of a file. When the file has fewer
# than two fence lines, every line of the file is emitted.
function Get-YamlHeader {
    [CmdletBinding()]
    param([string]$Path)

    # Force arrays: a one-line file or a single match would otherwise come back as a scalar, and
    # indexing a scalar string yields characters instead of lines.
    $doc = @(Get-Content -Path $Path -Encoding UTF8)
    $fences = @(Select-String -Pattern '^---$' -Path $Path)

    if ($fences.Count -lt 2) {
        # No complete fence pair: keep the fallback of emitting the whole document.
        return $doc
    }

    # LineNumber is 1-based, so the opening fence's LineNumber is the 0-based index of the first
    # header line, and the closing fence's LineNumber - 2 is the index of the last header line.
    $start = $fences[0].LineNumber
    $end = $fences[1].LineNumber - 2

    # An empty header gives $end < $start; a descending range would emit the fences themselves.
    if ($end -ge $start) {
        $doc[$start..$end]
    }
}

function hash2yaml {
    [CmdletBinding()]
    param([hashtable]$MetaHash)

    ### This is a naive implementation of a YAML serializer. It is not intended to be a complete
    ### implementation. It converts all members of the hashtable to single-line strings, and does
    ### not support any of the more complex YAML features. It is intended to be used to serialize
    ### the metadata hashtable that is passed to the Markdown template.

    # Emits the fence, one 'key: value' line per non-empty entry (sorted by key), and the fence.
    '---'
    foreach ($key in ($MetaHash.keys | Sort-Object)) {
        if ('' -ne $MetaHash.$key) {
            '{0}: {1}' -f $key, $MetaHash.$key
        }
    }
    '---'
}

# Builds a ParsedDocument from its parts and parses the links out of the body.
function New-ParsedDocument {
    [CmdletBinding()]
    [OutputType('ParsedDocument')]
    param(
        [Parameter(Mandatory)]
        [System.IO.FileInfo]$FileInfo,

        [Parameter(Mandatory)]
        [AllowEmptyString()]
        [string]$RawContent,

        [Parameter(Mandatory)]
        [AllowNull()]
        [Markdig.Syntax.MarkdownDocument]$ParsedMarkdown,

        [Parameter()]
        [System.Collections.Specialized.OrderedDictionary]$FrontMatter,

        [Parameter(Mandatory)]
        [AllowEmptyString()]
        [string]$Body
    )

    process {
        $Document = [ParsedDocument]::new()

        $Document.FileInfo = $FileInfo
        $Document.RawContent = $RawContent
        $Document.ParsedMarkdown = $ParsedMarkdown
        if ($FrontMatter) {
            $Document.FrontMatter = $FrontMatter
        }
        $Document.Body = $Body
        $Document.ParseLinksFromBody()

        $Document
    }
}

#endregion Functions.Private

#region Functions.Public

# Rewrites the inline and reference links of Markdown files as numbered reference links and
# appends the reference definitions to each file. With -PassThru, the per-link data is emitted
# and the files are left untouched.
function Convert-MDLinks {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory, Position = 0)]
        [string[]]$Path,

        [switch]$PassThru
    )

    $mdlinkpattern = '[\s\n]*(?<link>!?\[(?<label>[^\]]*)\]\((?<target>[^\)]+)\))[\s\n]?'
    $reflinkpattern = '[\s\n]*(?<link>!?\[(?<label>[^\]]*)\]\[(?<ref>[^\[\]]+)\])[\s\n]?'
    $refpattern = '^(?<refdef>\[(?<ref>[^\[\]]+)\]:\s(?<target>.+))$'

    $Path = Get-Item $Path # resolve wildcards

    foreach ($filename in $Path) {
        $mdfile = Get-Item $filename

        $mdlinks = Get-Content $mdfile -Raw | Select-String -Pattern $mdlinkpattern -AllMatches
        $reflinks = Get-Content $mdfile -Raw | Select-String -Pattern $reflinkpattern -AllMatches
        $refdefs = Select-String -Path $mdfile -Pattern $refpattern -AllMatches

        # The raw content is one input string, so count the matches rather than the match infos.
        Write-Verbose ('{0}/{1}: {2} links' -f $mdfile.Directory.Name, $mdfile.Name, @($mdlinks.Matches).Count)
        Write-Verbose ('{0}/{1}: {2} ref links' -f $mdfile.Directory.Name, $mdfile.Name, @($reflinks.Matches).Count)
        Write-Verbose ('{0}/{1}: {2} ref defs' -f $mdfile.Directory.Name, $mdfile.Name, @($refdefs).Count)

        # Emits one object per inline link and per reference link found in the file. Reads
        # $mdlinks and $reflinks from the enclosing scope.
        function GetMDLinks {
            foreach ($mdlink in $mdlinks.Matches) {
                # Skip INCLUDE and tab links
                if (-not $mdlink.Value.Trim().StartsWith('[!INCLUDE') -and
                    -not $mdlink.Value.Trim().Contains('#tab/')
                ) {
                    $linkitem = [pscustomobject]([ordered]@{
                            mdlink = ''
                            target = ''
                            ref    = ''
                            label  = ''
                        })
                    switch ($mdlink.Groups) {
                        { $_.Name -eq 'link' } { $linkitem.mdlink = $_.Value }
                        { $_.Name -eq 'target' } { $linkitem.target = $_.Value }
                        { $_.Name -eq 'label' } { $linkitem.label = $_.Value }
                    }
                    $linkitem
                }
            }

            foreach ($reflink in $reflinks.Matches) {
                if (-not $reflink.Value.Trim().StartsWith('[!INCLUDE')) {
                    $linkitem = [pscustomobject]([ordered]@{
                            mdlink = ''
                            target = ''
                            ref    = ''
                            label  = ''
                        })
                    switch ($reflink.Groups) {
                        { $_.Name -eq 'link' } { $linkitem.mdlink = $_.Value }
                        { $_.Name -eq 'label' } { $linkitem.label = $_.Value }
                        { $_.Name -eq 'ref' } { $linkitem.ref = $_.Value }
                    }
                    $linkitem
                }
            }
        }

        # Fills $RefTargets (from the enclosing scope) with the first definition of each reference.
        function GetRefTargets {
            foreach ($refdef in $refdefs.Matches) {
                $refitem = [pscustomobject]([ordered]@{
                        refdef = ''
                        target = ''
                        ref    = ''
                    })

                switch ($refdef.Groups) {
                    { $_.Name -eq 'refdef' } { $refitem.refdef = $_.Value }
                    { $_.Name -eq 'target' } { $refitem.target = $_.Value }
                    { $_.Name -eq 'ref' } { $refitem.ref = $_.Value }
                }
                if (!$RefTargets.ContainsKey($refitem.ref)) {
                    $RefTargets.Add(
                        $refitem.ref,
                        [pscustomobject]@{
                            target = $refitem.target
                            ref    = $refitem.ref
                            refdef = $refitem.refdef
                        }
                    )
                }
            }
        }

        # Force an array so a file with zero or one link can still be indexed and counted.
        $linkdata = @(GetMDLinks)
        $RefTargets = @{}
        GetRefTargets

        # map targets by reference
        if ($RefTargets.Count -gt 0) {
            foreach ($item in $linkdata) {
                if ($RefTargets.ContainsKey($item.ref)) {
                    $item.target = $RefTargets[$item.ref].target
                }
            }
        }

        # Calculate new links and references
        $newlinks = @()
        $index = 0
        foreach ($item in $linkdata) {
            if ($item.mdlink.StartsWith('!')) {
                $bang = '!'
            } else {
                $bang = ''
            }

            if ($item.target -match 'https://github.com/\w+/\w+/(pull|issues)/(?<linkid>\d+)$') {
                # Issues and pull requests are referenced by their number.
                $linkid = $matches.linkid
                $newlinks += '[{0}]: {1}' -f $linkid, $item.target
                # Keep the '!' so an image stays an image.
                $newlink = '{0}[{1}][{2}]' -f $bang, $item.label, $linkid
            } else {
                $index += 1
                $linkid = $index
                $newlinks += '[{0:d2}]: {1}' -f $linkid, $item.target
                $newlink = '{0}[{1}][{2:d2}]' -f $bang, $item.label, $linkid
            }

            $parms = @{
                InputObject = $item
                MemberType  = 'NoteProperty'
                Name        = 'newlink'
                Value       = $newlink
            }
            Add-Member @parms
        }

        # Force an array so the += below appends lines even for a one-line file.
        $mdtext = @(Get-Content $mdfile)
        foreach ($link in $linkdata) {
            # '$' is special in a regex replacement string; double it so labels such as '$_'
            # are written literally.
            $replacement = $link.newlink.Replace('$', '$$')
            $mdtext = @($mdtext -replace [regex]::Escape($link.mdlink), $replacement)
        }

        if ($PassThru) {
            $linkdata
        } else {
            $mdtext += '<!-- updated link references -->'
            $mdtext += $newlinks | Sort-Object -Unique
            Set-Content -Path $mdfile -Value $mdtext -Encoding utf8 -Force
        }
    }
}

function ConvertTo-Contraction {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory, Position = 0)]
        [SupportsWildcards()]
        [string[]]$Path,

        [switch]$Recurse
    )

    ### This function converts common word pairs to contractions. It doesn't handle all possible
    ### cases and it's not aware of code blocks.

    # Each key is a case-sensitive regex that captures the surrounding whitespace; each value
    # re-emits the first and last captured whitespace around the contraction.
    $contractions = @{
        lower = @{
            '([\s\n])are([\s\n])not([\s\n])'    = "`$1aren't`$3"
            '([\s\n])cannot([\s\n])'            = "`$1can't`$2"
            '([\s\n])could([\s\n])not([\s\n])'  = "`$1couldn't`$3"
            '([\s\n])did([\s\n])not([\s\n])'    = "`$1didn't`$3"
            '([\s\n])do([\s\n])not([\s\n])'     = "`$1don't`$3"
            '([\s\n])does([\s\n])not([\s\n])'   = "`$1doesn't`$3"
            '([\s\n])has([\s\n])not([\s\n])'    = "`$1hasn't`$3"
            '([\s\n])have([\s\n])not([\s\n])'   = "`$1haven't`$3"
            '([\s\n])is([\s\n])not([\s\n])'     = "`$1isn't`$3"
            '([\s\n])it([\s\n])is([\s\n])'      = "`$1it's`$3"
            '([\s\n])should([\s\n])not([\s\n])' = "`$1shouldn't`$3"
            '([\s\n])that([\s\n])is([\s\n])'    = "`$1that's`$3"
            '([\s\n])they([\s\n])are([\s\n])'   = "`$1they're`$3"
            '([\s\n])was([\s\n])not([\s\n])'    = "`$1wasn't`$3"
            '([\s\n])what([\s\n])is([\s\n])'    = "`$1what's`$3"
            '([\s\n])we([\s\n])are([\s\n])'     = "`$1we're`$3"
            '([\s\n])we([\s\n])have([\s\n])'    = "`$1we've`$3"
            '([\s\n])were([\s\n])not([\s\n])'   = "`$1weren't`$3"
        }
        upper = @{
            '([\s\n])Are([\s\n])not([\s\n])'    = "`$1Aren't`$3"
            '([\s\n])Cannot([\s\n])'            = "`$1Can't`$2"
            '([\s\n])Could([\s\n])not([\s\n])'  = "`$1Couldn't`$3"
            '([\s\n])Did([\s\n])not([\s\n])'    = "`$1Didn't`$3"
            '([\s\n])Do([\s\n])not([\s\n])'     = "`$1Don't`$3"
            '([\s\n])Does([\s\n])not([\s\n])'   = "`$1Doesn't`$3"
            '([\s\n])Has([\s\n])not([\s\n])'    = "`$1Hasn't`$3"
            '([\s\n])Have([\s\n])not([\s\n])'   = "`$1Haven't`$3"
            '([\s\n])Is([\s\n])not([\s\n])'     = "`$1Isn't`$3"
            '([\s\n])It([\s\n])is([\s\n])'      = "`$1It's`$3"
            '([\s\n])Should([\s\n])not([\s\n])' = "`$1Shouldn't`$3"
            '([\s\n])That([\s\n])is([\s\n])'    = "`$1That's`$3"
            '([\s\n])They([\s\n])are([\s\n])'   = "`$1They're`$3"
            '([\s\n])Was([\s\n])not([\s\n])'    = "`$1Wasn't`$3"
            # Capitalized like every other entry in this table (it previously produced "what's").
            '([\s\n])What([\s\n])is([\s\n])'    = "`$1What's`$3"
            '([\s\n])We([\s\n])are([\s\n])'     = "`$1We're`$3"
            '([\s\n])We([\s\n])have([\s\n])'    = "`$1We've`$3"
            '([\s\n])Were([\s\n])not([\s\n])'   = "`$1Weren't`$3"
        }
    }

    foreach ($filepath in $Path) {
        # -File: directories returned by a recursive listing can't be read as content.
        Get-ChildItem -Path $filepath -Recurse:$Recurse -File | ForEach-Object {
            Write-Host $_.name

            # Use the full path so files outside the current directory resolve correctly.
            $mdtext = Get-Content -LiteralPath $_.FullName -Raw
            foreach ($key in $contractions.lower.keys) {
                $mdtext = $mdtext -creplace $key, $contractions.lower[$key]
            }
            foreach ($key in $contractions.upper.keys) {
                $mdtext = $mdtext -creplace $key, $contractions.upper[$key]
            }
            Set-Content -LiteralPath $_.FullName -Value $mdtext -NoNewline -Encoding utf8 -Force
        }
    }
}

# Emits the lines of a file that follow its front matter, meaning everything after the second
# '---' fence line. When the file has fewer than two fence lines, every line is emitted.
function Get-ContentWithoutHeader {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory, Position = 0)]
        [string]$Path
    )

    # Force arrays so a one-line file or a single match isn't treated as a scalar.
    $doc = @(Get-Content -Path $Path -Encoding UTF8)
    $fences = @(Select-String -Pattern '^---$' -Path $Path)

    $start = 0
    if ($fences.Count -ge 2) {
        # The header ends at the second fence. Later '---' lines are horizontal rules in the
        # body and must not cut content off. LineNumber is 1-based, so it is also the 0-based
        # index of the line after the fence.
        $start = $fences[1].LineNumber
    }

    if ($start -lt $doc.Count) {
        $doc[$start..($doc.Count - 1)]
    }
}

# Reads Markdown files (or the '.md' files under folders) and emits a ParsedDocument for each,
# with the YAML front matter split from the body.
function Get-Document {
    [CmdletBinding()]
    [OutputType([ParsedDocument])]
    param(
        [string[]]$Path
    )

    begin {
        $Pipeline = New-Object -TypeName Markdig.MarkdownPipelineBuilder
        $Pipeline = [Markdig.MarkdownExtensions]::Configure($Pipeline, 'Advanced+Yaml')
    }

    process {
        $Files = Get-Item -Path $Path
        # Expand folders when any of the resolved items is a container.
        if ($Files.PSIsContainer -contains $true) {
            $Files = Get-ChildItem -Path $Path -Recurse |
                Where-Object -FilterScript { $_.Extension -eq '.md' }
        }

        $Files | ForEach-Object -Process {
            $File = $_
            if ($File.Extension -ne '.md') {
                # 'return' skips just this item. 'continue' is not scoped to a ForEach-Object
                # script block and would stop the whole pipeline (or a caller's loop).
                return
            }

            $ParsedDocumentParameters = @{
                FileInfo = $File
            }

            $ParsedDocumentParameters.RawContent = Get-Content -Path $File.FullName -Raw
            if ($ParsedDocumentParameters.RawContent.Length -gt 0) {
                $ParsedDocumentParameters.ParsedMarkdown = [Markdig.Parsers.MarkdownParser]::Parse(
                    $ParsedDocumentParameters.RawContent,
                    $Pipeline.Build()
                )
            } else {
                $ParsedDocumentParameters.ParsedMarkdown = $null
            }

            $FrontMatterToken = $ParsedDocumentParameters.ParsedMarkdown |
                Where-Object -FilterScript { $_.Parser -is [Markdig.Extensions.Yaml.YamlFrontMatterParser] }

            if ($FrontMatterToken) {
                $ParsedDocumentParameters.FrontMatter = $FrontMatterToken.Lines.ToString().Trim() |
                    ConvertFrom-Yaml -Ordered

                # Drop the text before the first '---' and the front matter itself, then restore
                # any '---' separators that belong to the body.
                $Body = $ParsedDocumentParameters.RawContent -split '---' |
                    Select-Object -Skip 2 |
                    Join-String -Separator '---'
                $ParsedDocumentParameters.Body = $Body.TrimStart()
            } else {
                $ParsedDocumentParameters.Body = $ParsedDocumentParameters.RawContent
            }

            New-ParsedDocument @ParsedDocumentParameters
        }
    }
}

function Get-DocumentLink {
    [CmdletBinding(DefaultParameterSetName = 'FilterByKind')]
    [OutputType([DocumentLink])]
    param(
        [Parameter(
            ParameterSetName = 'FilterByKind',
            ValueFromPipeline,
            ValueFromPipelineByPropertyName
        )]
        [Parameter(
            ParameterSetName = 'FilterByOnly',
            ValueFromPipeline,
            ValueFromPipelineByPropertyName
        )]
        [Alias('FullName')]
        [string[]]$Path,

        [Parameter(ParameterSetName = 'FilterByKind', ValueFromPipeline)]
        [Parameter(ParameterSetName = 'FilterByOnly', ValueFromPipeline)]
        [ParsedDocument[]]$Document,

        [Parameter(ParameterSetName = 'FilterByKind')]
        [SupportsWildcards()]
        [LinkKindTransformAttribute()]
        [LinkKind[]]$IncludeKind,

        [Parameter(ParameterSetName = 'FilterByKind')]
        [SupportsWildcards()]
        [LinkKindTransformAttribute()]
        [LinkKind[]]$ExcludeKind,

        [Parameter(ParameterSetName = 'FilterByOnly')]
        [ValidateSet(
            'Inline',
            'References',
            'UndefinedReferences',
            'UnusedReferences',
            'ValidReferences'
        )]
        [string]$Only,

        [SupportsWildcards()]
        [regex]$MatchMarkdown,

        [SupportsWildcards()]
        [regex]$MatchText,

        [SupportsWildcards()]
        [regex]$MatchDestination,

        [SupportsWildcards()]
        [regex]$MatchReferenceID,

        [SupportsWildcards()]
        [regex]$NotMatchMarkdown,

        [SupportsWildcards()]
        [regex]$NotMatchText,

        [SupportsWildcards()]
        [regex]$NotMatchDestination,

        [SupportsWildcards()]
        [regex]$NotMatchReferenceID
    )

    process {
        if ($Path) {
            $Document = Get-Document -Path $Path
        }

        $Document | ForEach-Object {
            $ParsedDocument = $_
            $Links = $ParsedDocument.Links

            switch ($Only) {
                'Inline' {
                    $Links = $ParsedDocument.InlineLinks()
                }
                'References' {
                    $Links = $ParsedDocument.ReferenceLinksAndDefinitions()
                }
                'UndefinedReferences' {
                    $Links = $ParsedDocument.UndefinedReferenceLinks()
                }
                'UnusedReferences' {
                    $Links = $ParsedDocument.UnusedReferenceLinkDefinitions()
                }
                'ValidReferences' {
                    $Links = $ParsedDocument.ValidReferenceLinksAndDefinitions()
                }
            }

            if ($IncludeKind.Count) {
                $Links
= $Links.Where({ $_.Kind -in $IncludeKind }) } if ($ExcludeKind.Count) { $Links = $Links.Where({ $_.Kind -notin $ExcludeKind }) } if ($MatchMarkdown) { $Links = $Links.Where({ $_.Markdown -match $MatchMarkdown }) } if ($MatchText) { $Links = $Links.Where({ $_.Text -match $MatchText }) } if ($MatchDestination) { $Links = $Links.Where({ $_.Destination -match $MatchDestination }) } if ($MatchReferenceID) { $Links = $Links.Where({ $_.ReferenceID -match $MatchReferenceID }) } if ($NotMatchMarkdown) { $Links = $Links.Where({ $_.Markdown -notmatch $NotMatchMarkdown }) } if ($NotMatchText) { $Links = $Links.Where({ $_.Text -notmatch $NotMatchText }) } if ($NotMatchDestination) { $Links = $Links.Where({ $_.Destination -notmatch $NotMatchDestination }) } if ($NotMatchReferenceID) { $Links = $Links.Where({ $_.ReferenceID -notmatch $NotMatchReferenceID }) } $Links } } } function Get-Metadata { [CmdletBinding(DefaultParameterSetName = 'AsHash')] param( [Parameter(ParameterSetName = 'AsHash', Mandatory, Position = 0)] [Parameter(ParameterSetName = 'AsObject', Mandatory, Position = 0)] [Parameter(ParameterSetName = 'AsYaml', Mandatory, Position = 0)] [SupportsWildcards()] [string]$Path, [Parameter(ParameterSetName = 'AsObject', Mandatory)] [switch]$AsObject, [Parameter(ParameterSetName = 'AsYaml', Mandatory)] [switch]$AsYaml, [Parameter(ParameterSetName = 'AsHash')] [Parameter(ParameterSetName = 'AsObject')] [Parameter(ParameterSetName = 'AsYaml')] [switch]$Recurse ) foreach ($file in (Get-ChildItem -Recurse:$Recurse -File -Path $Path)) { $ignorelist = 'keywords', 'helpviewer_keywords', 'ms.assetid' $lines = Get-YamlHeader $file if ($AsYaml) { $lines } else { $meta = @{} foreach ($line in $lines) { ### Parse the YAML block ### This is a naive implementation that only works for simple single-line ### YAML data types and has some special cases for the metadata we care ### about. It's not intended to be a general purpose solution. 
$i = $line.IndexOf(':') if ($i -ne -1) { $key = $line.Substring(0, $i) if (!$ignorelist.Contains($key)) { $value = $line.Substring($i + 1).replace('"', '') switch ($key) { 'title' { $value = $value.split('|')[0].Trim() } 'ms.date' { [datetime]$date = $value.Trim() $value = Get-Date $date -Format 'MM/dd/yyyy' } Default { $value = $value.Trim() } } $meta.Add($key, $value) } } } if ($AsObject) { $meta.Add('file', $file.FullName) [pscustomobject]$meta } else { $meta } } } } function Remove-Metadata { param( [Parameter(Mandatory, Position = 0)] [SupportsWildcards()] [string]$Path, [Parameter(Mandatory, Position = 1)] [string[]]$KeyName, [switch]$Recurse ) foreach ($file in (Get-ChildItem -Path $Path -Recurse:$Recurse)) { $file.name $metadata = Get-Metadata -Path $file $mdtext = Get-ContentWithoutHeader -Path $file foreach ($key in $KeyName) { if ($metadata.ContainsKey($key)) { $metadata.Remove($key) } } Set-Content -Value (hash2yaml $metadata) -Path $file -Force -Encoding utf8 Add-Content -Value $mdtext -Path $file -Encoding utf8 } } function Set-Metadata { param( [Parameter(Mandatory, Position = 0)] [SupportsWildcards()] [string]$Path, [Parameter(Mandatory, Position = 1)] [hashtable]$NewMetadata, [switch]$Recurse ) foreach ($file in (Get-ChildItem -Path $Path -Recurse:$Recurse)) { $file.Name $mdtext = Get-ContentWithoutHeader -Path $file Set-Content -Value (hash2yaml $NewMetadata) -Path $file -Force -Encoding utf8 Add-Content -Value $mdtext -Path $file -Encoding utf8 } } function Update-Metadata { param( [Parameter(Mandatory, Position = 0)] [SupportsWildcards()] [string]$Path, [Parameter(Mandatory, Position = 1)] [hashtable]$NewMetadata, [switch]$Recurse ) foreach ($file in (Get-ChildItem -Path $Path -Recurse:$Recurse)) { $file.name $OldMetadata = Get-Metadata -Path $file $mdtext = Get-ContentWithoutHeader -Path $file $update = $OldMetadata.Clone() foreach ($key in $NewMetadata.Keys) { if ($update.ContainsKey($key)) { $update[$key] = $NewMetadata[$key] } else { 
$update.Add($key, $NewMetadata[$key]) } } Set-Content -Value (hash2yaml $update) -Path $file -Force -Encoding utf8 Add-Content -Value $mdtext -Path $file -Encoding utf8 } } #endregion Functions.Public $ExportableFunctions = @( 'Convert-MDLinks' 'ConvertTo-Contraction' 'Get-ContentWithoutHeader' 'Get-Document' 'Get-DocumentLink' 'Get-Metadata' 'Remove-Metadata' 'Set-Metadata' 'Update-Metadata' ) Export-ModuleMember -Alias * -Function $ExportableFunctions |