classes/Tokenizer.ps1
|
# Tokenizer
#
# Pulls characters from an InputFileStack one at a time and turns them into a flat list of
# Token objects ($this.tokens). Every character that has been read is also buffered in
# $this.InputData so that token lexemes can be sliced out afterwards and so that characters
# can be "un-read" (UnGetChar) and read again.
#
# The character value 0 is used as the end-of-input sentinel throughout.
class Tokenizer {
    [InputFileStack]$fileStack                          # Source of characters (supports include files)
    [System.Collections.Generic.List[char]]$InputData   # Every character read so far, in read order
    [int]$cpos                                          # Index into InputData of the next character to read
    [int]$tokenStart                                    # Value of cpos when the current token started
    [int]$tline                                         # Line reported by the file stack at token start
    [int]$tcolumn                                       # Column reported by the file stack at token start
    [string]$tfile                                      # File reported by the file stack at token start
    [System.Collections.Generic.List[Object]]$tokens    # Apparently the <T> needs to be Object, if the type is custom - can be of custom type when the object is initialized
    [System.Collections.Generic.Stack[int]]$ScopeStack
    [MultiLevelCounter]$classCounter                    # Level 0: class nesting, level 1: curly-brace depth inside a class
    [bool]$sawQuestionMark                              # Set by a lone '?', makes the next ':' a TernaryColon
    [hashtable]$state                                   # Free-form boolean flags, see SetState/UnsetState/GetState
    [hashtable]$PendingDirective                        # @{Directive; Index} of an .include/.incdir awaiting its arguments
    hidden [bool]$lastReadWasEof                        # True when the most recent GetChar() returned the EOF sentinel

    # Creates the tokenizer and immediately tokenizes everything the file stack delivers.
    Tokenizer([InputFileStack]$fileStack) {
        $this.fileStack = $fileStack
        $this.InputData = [System.Collections.Generic.List[char]]::new()
        $this.cpos = 0
        $this.tokenStart = 0
        $this.tokens = [System.Collections.Generic.List[Token]]::new()
        $this.ScopeStack = [System.Collections.Generic.Stack[int]]::new()
        $this.classCounter = [MultiLevelCounter]::new(2)
        $this.sawQuestionMark = $false
        $this.PendingDirective = $null
        $this.state = @{}
        $this.Tokenize()
    }

    # Creates an uninitialized tokenizer; no property is set up and nothing is tokenized.
    Tokenizer() {}

    # Sets the named state flag to $true.
    [void] SetState([string]$key) {
        $this.state[$key] = $true
    }

    # Sets the named state flag to $false.
    [void] UnsetState([string]$key) {
        $this.state[$key] = $false
    }

    # Returns the named state flag; a flag that was never set reads as $false.
    [bool] GetState([string]$key) {
        return [bool]$this.state[$key]
    }

    # Returns the already-read characters that precede cpos, walking backwards until one of
    # the characters in $c (or the start of the buffer) is reached. The stop character itself
    # is not included in the result.
    [string] PeekCharsBackUntil([char[]]$c) {
        $sb = [System.Text.StringBuilder]::new()
        for ($pos = $this.cpos - 1; $pos -ge 0; $pos--) {
            $ch = $this.InputData[$pos]
            if ($c -contains $ch) { break }
            $sb.Insert(0, $ch) | Out-Null
        }
        return $sb.ToString()
    }

    # Returns the next character without consuming it, or 0 at end of input.
    [char] PeekChar() {
        if ($this.cpos -lt $this.InputData.Count) {
            # The character was read before and then un-read; serve it from the buffer.
            return $this.InputData[$this.cpos]
        }
        return $this.FileStack.PeekChar()
    }

    # Consumes the next character and discards it.
    [void] SkipChar() {
        [void]$this.GetChar()
    }

    # Consumes and returns the next character, or returns 0 at end of input (in which case
    # nothing is consumed and cpos is left unchanged).
    [char] GetChar() {
        # Always run ReadChar() to update counters in file context class
        $ch = $this.FileStack.ReadChar()
        if ($ch -eq 0) {
            # Remember that nothing was consumed so a following UnGetChar() does not back up
            # over a real character.
            $this.lastReadWasEof = $true
            return 0
        }
        $this.lastReadWasEof = $false
        if ($this.cpos -ge $this.InputData.Count) {
            $this.InputData.Add($ch)
        }
        return $this.InputData[$this.cpos++]
    }

    # Undoes the most recent GetChar(). If that GetChar() hit end of input it consumed
    # nothing, so there is nothing to undo and this call only clears the EOF marker.
    [void] UnGetChar() {
        if ($this.lastReadWasEof) {
            $this.lastReadWasEof = $false
            return
        }
        if ($this.cpos -gt 0) {
            $this.FileStack.UnReadChar()
            $this.cpos--
        }
    }

    # Builds a token of the given type from the characters between tokenStart and cpos,
    # stamped with the line/column/file captured when the token started.
    [Token] NewToken([TokenType]$tokenType) {
        $lexeme = ($this.InputData[$this.tokenStart..($this.cpos-1)] -join '')
        $length = $this.cpos - $this.tokenStart
        return [Token]::new(
            $tokenType,
            $lexeme,
            $this.tokenStart,
            $length,
            $this.tline,
            $this.tcolumn,
            $this.tfile
        )
    }

    # Finishes a line break whose first character ($c) has already been consumed. A CR
    # followed by LF is folded into a single NewLine token. A line break also ends the
    # argument list of a pending .include/.incdir directive.
    [token] ScanNewLine([char]$c) {
        if ($c -eq "`r" -and $this.PeekChar() -eq "`n") {
            $this.SkipChar()
        }
        $this.HandlePendingDirective()
        return $this.NewToken([TokenType]::NewLine)
    }

    # Scans a block comment. The first opener character ('/' or '<') has already been
    # consumed; this skips the second one and then reads up to and including the closer
    # ('*/' for CStyleBlockComment, '#>' for PSBlockComment).
    # Throws if the input ends before the closer is found.
    [Token] ScanBlockComment([TokenType]$tokenType) {
        $this.SkipChar()

        $closer = $null
        if ($tokenType -eq [TokenType]::CStyleBlockComment) {
            $closer = '*', '/'
        } elseif ($tokenType -eq [TokenType]::PSBlockComment) {
            $closer = '#', '>'
        }

        if ($null -ne $closer) {
            while ($true) {
                $ch = $this.GetChar()
                if ($ch -eq 0) {
                    # Without this check the loop would spin forever at end of input.
                    throw "Unterminated block comment starting at line $($this.tline), column $($this.tcolumn)"
                }
                if ($ch -eq $closer[0] -and $this.PeekChar() -eq $closer[1]) { break }
            }
        }

        # Consume the second character of the closer.
        $this.SkipChar()
        return $this.NewToken($tokenType)
    }

    # Scans a line comment up to, but not including, the line break (or end of input).
    [Token] ScanLineComment([TokenType]$tokenType) {
        while ($this.GetChar() -notin 0, "`r", "`n") {}
        $this.UnGetChar()
        return $this.NewToken($tokenType)
    }

    # Emits a Directive token for $str. .include and .incdir are remembered as pending so
    # their string arguments can be processed once the statement ends; .includeonce takes
    # effect immediately.
    [Token] ScanDirective($str) {
        switch -regex ($str) {
            '^\.include$'     { $this.PendingDirective = @{Directive=$str; Index=$this.tokens.Count } }
            '^\.incdir$'      { $this.PendingDirective = @{Directive=$str; Index=$this.tokens.Count } }
            '^\.includeonce$' { $this.FileStack.MarkCurrentFileIncludeOnce() }
        }
        return $this.NewToken([TokenType]::Directive)
    }

    # Applies a pending .include/.incdir directive, if there is one: every string token that
    # follows the directive token is trimmed of quotes, expanded, and then either pushed as a
    # new input file (.include) or registered as an include directory (.incdir).
    [void] HandlePendingDirective() {
        if (-not $this.PendingDirective) { return }

        $directive = $this.PendingDirective.Directive
        if ($directive -notin '.include', '.incdir') { return }

        for ($i = $this.PendingDirective.Index+1; $i -lt $this.tokens.Count; $i++) {
            if ($this.tokens[$i].Type -in [TokenType]::StringLiteral, [TokenType]::StringExpandable) {
                $file = & { $ExecutionContext.InvokeCommand.ExpandString($this.tokens[$i].Value.Trim('"''')) }
                if ($file -ne '') {
                    if ($directive -eq '.include') {
                        $this.FileStack.PushFile($file)
                    } else {
                        $this.FileStack.AddIncludeDir($file)
                    }
                }
            }
        }
        $this.PendingDirective = $null
    }

    # Scans a single-quoted string; the opening quote has already been consumed. A doubled
    # quote ('') and a backtick followed by a quote or another backtick do not end the string.
    # Throws if the input ends before the closing quote.
    [Token] ScanStringLiteral() {
        while ($true) {
            $c = $this.GetChar()
            if ($c -eq 0) {
                # Without this check the loop would spin forever at end of input.
                throw "Unterminated string literal starting at line $($this.tline), column $($this.tcolumn)"
            }
            if ($c -eq "'" -and $this.PeekChar() -eq "'") {
                $this.SkipChar()
                continue
            }
            if ($c -eq "``" -and $this.PeekChar() -in "'","``") {
                $this.SkipChar()
                continue
            }
            if ($c -eq "'") { break }
        }
        return $this.NewToken([TokenType]::StringLiteral)
    }

    # Well, this is obviously not expandable yet, so.... add to to-do list
    # Scans a double-quoted string; the opening quote has already been consumed. A doubled
    # quote ("") and a backtick followed by a quote or another backtick do not end the string.
    # Throws if the input ends before the closing quote.
    [Token] ScanStringExpandable() {
        while ($true) {
            $c = $this.GetChar()
            if ($c -eq 0) {
                # Without this check the loop would spin forever at end of input.
                throw "Unterminated string starting at line $($this.tline), column $($this.tcolumn)"
            }
            if ($c -eq '"' -and $this.PeekChar() -eq '"') {
                $this.SkipChar()
                continue
            }
            if ($c -eq "``" -and $this.PeekChar() -in '"',"``") {
                $this.SkipChar()
                continue
            }
            if ($c -eq '"') { break }
        }
        return $this.NewToken([TokenType]::StringExpandable)
    }

    # Scans the rest of a word whose first character has already been consumed and classifies
    # it, in this order: PSFunctionParameter (word: preceded by a Minus token), Label (word:),
    # PSClassMethod (any word at brace depth 1 inside a class), PS keyword, directive,
    # 6502 mnemonic, and finally plain Identifier.
    [Token] ScanIdentifier() {
        while ($this.GetChar() -match '^[_a-z0-9]') {}
        $this.UnGetChar()

        if ($this.PeekChar() -eq ':' -and $this.tokens[-1].Type -eq [TokenType]::Minus) {
            $this.SkipChar()
            return $this.NewToken([TokenType]::PSFunctionParameter)
        }
        if ($this.PeekChar() -eq ':') {
            $this.SkipChar()
            return $this.NewToken([TokenType]::Label)
        }

        if ($this.classCounter.Counters[0] -gt 0 -and $this.classCounter.Counters[1] -eq 1) {
            # We're in a class, only methods and properties allowed here.. not sure how to handle props yet ;-)
            # This is necessary to avoid macros being misinterpreted as methods, when used in classes
            # and classes can be nested, that's why the MultiLevelCounter class is used - 0: class level, 1: scope level - and methods only exist at scope level 1
            return $this.NewToken([TokenType]::PSClassMethod)
        }

        $str = $this.InputData[$this.tokenStart..($this.cpos-1)] -join ''
        if ($str -in $script:PSKeywords) {
            return $this.ScanPSKeyword()
        }
        if ($str -in $script:PSASMFunctions) {
            return $this.ScanDirective($str)
        }

        # A word preceded by a Minus token is never treated as a mnemonic.
        # The match runs on the token text only; it does not read past the end of the token.
        $isMnemonic = $str -match '^(ADC|AND|ASL|BCC|BCS|BEQ|BIT|BMI|BNE|BPL|BRK|BVC|BVS|CLC|CLD|CLI|CLV|CMP|CPX|CPY|DEC|DEX|DEY|EOR|INC|INX|INY|JMP|JSR|LDA|LDX|LDY|LSR|NOP|ORA|PHA|PHP|PLA|PLP|ROL|ROR|RTI|RTS|SBC|SEC|SED|SEI|STA|STX|STY|TAX|TAY|TSX|TXA|TXS|TYA)\b'
        if ($isMnemonic -and $this.tokens[-1].Type -ne [TokenType]::Minus) {
            return $this.NewToken([TokenType]::Mnemonic)
        }

        return $this.NewToken([TokenType]::Identifier)
    }

    # Emits a PSKeyword token for the current word. Seeing "class" bumps the class nesting
    # level so that the following braces are counted as class scope.
    [Token] ScanPSKeyword() {
        $word = $this.InputData[$this.tokenStart..($this.cpos-1)] -join ''
        if ($word -eq "class") {
            $this.classCounter.Inc(0)
        }
        return $this.NewToken([TokenType]::PSKeyword)
    }

    # Scans a numeric literal whose first digit has already been consumed.
    # Recognized forms: <digit>x<hex digits>, <digit>b<binary digits>, and decimal digits with
    # an optional fractional part. A '..' after the digits is left alone so that range
    # expressions such as 1..5 tokenize as number, DotDot, number.
    [Token] ScanNumber() {
        if ($this.PeekChar() -eq 'x') {
            $this.SkipChar()
            while ($this.GetChar() -in $script:CharsHex) {}
            $this.UnGetChar()
            return $this.NewToken([TokenType]::NumericLiteral)
        }
        if ($this.PeekChar() -eq 'b') {
            $this.SkipChar()
            while ($this.GetChar() -in '0','1') {}
            $this.UnGetChar()
            return $this.NewToken([TokenType]::NumericLiteral)
        }

        # Integer part. Consuming it before looking for '.' is what makes multi-digit
        # decimals such as 12.5 a single literal.
        while ($this.GetChar() -in $script:Chars0to9) {}
        $this.UnGetChar()

        if ($this.PeekChar() -eq '.') {
            $this.SkipChar()
            if ($this.PeekChar() -eq '.') {
                # '..' is the range operator, not a decimal point: give the first dot back.
                $this.UnGetChar()
                return $this.NewToken([TokenType]::NumericLiteral)
            }
            while ($this.GetChar() -in $script:Chars0to9) {}
            $this.UnGetChar()
        }
        return $this.NewToken([TokenType]::NumericLiteral)
    }

    # Scans the name part of a variable; the leading '$' (and possibly some name characters)
    # has already been consumed by the caller.
    [Token] ScanVariable() {
        while ($this.GetChar() -in $script:CharsIdentifier) {}
        $this.UnGetChar()
        return $this.NewToken([TokenType]::PSVariable)
    }

    # Scans a member access; the leading '.' has already been consumed. Two forms exist:
    # '.:' optionally followed by a run of '+' or '-', and '.' followed by identifier characters.
    [Token] ScanMember() {
        if ($this.GetChar() -eq ':') {
            $c = $this.PeekChar()
            if ($c -in '+','-') {
                while ($this.GetChar() -eq $c) {}
                $this.UnGetChar()
                # If the run is directly followed by a variable or identifier, give the last
                # sign character back so it is tokenized on its own.
                if ($this.PeekChar() -in ([char[]]'$' + $script:CharsIdentifier)) {
                    $this.UnGetChar()
                }
            }
        } else {
            while ($this.GetChar() -in $script:CharsIdentifier) {}
            $this.UnGetChar()
        }
        return $this.NewToken([TokenType]::Member)
    }

    # Reads one token starting at the current position. Records the start offset and the file
    # stack's line/column/file first, then dispatches on the first character.
    [Token] NextToken() {
        $this.tokenStart = $this.cpos
        $ctx = $this.FileStack.CurrentContext()
        $this.tline = $ctx.Line
        $this.tcolumn = $ctx.Column
        $this.tfile = $ctx.File

        [char]$c = $this.GetChar()
        switch ($c) {
            '/' {
                # Comment openers only; a plain '/' falls through to the Divide case further down.
                switch ($this.PeekChar()) {
                    '*' { return $this.ScanBlockComment([TokenType]::CStyleBlockComment) }
                    '/' { return $this.ScanLineComment([TokenType]::CStyleLineComment) }
                }
            }
            '<' {
                if ($this.PeekChar() -eq '#') {
                    return $this.ScanBlockComment([TokenType]::PSBlockComment)
                }
                return $this.NewToken([TokenType]::LAngle)
            }
            '#' {
                # Look back through the tokens of the current statement: the first '#' after a
                # mnemonic is a Hash token; a second '#', or a '#' with no mnemonic before it,
                # starts a line comment.
                $i = -1
                while ($this.tokens[$i].Type -notin $null, [TokenType]::SemiColon, [TokenType]::NewLine) {
                    if ($this.tokens[$i].Type -eq [TokenType]::Hash) {
                        return $this.ScanLineComment([TokenType]::PSLineComment)
                    }
                    if ($this.tokens[$i].Type -eq [TokenType]::Mnemonic) {
                        return $this.NewToken([TokenType]::Hash)
                    }
                    $i--
                }
                return $this.ScanLineComment([TokenType]::PSLineComment)
            }
            '>' { return $this.NewToken([TokenType]::RAngle) }
            '(' { return $this.NewToken([TokenType]::LParen) }
            ')' { return $this.NewToken([TokenType]::RParen) }
            '[' { return $this.NewToken([TokenType]::LBracket) }
            ']' { return $this.NewToken([TokenType]::RBracket) }
            '{' {
                if ($this.classCounter.Counters[0] -gt 0) {
                    $this.classCounter.Inc(1)
                }
                return $this.NewToken([TokenType]::LCurly)
            }
            '}' {
                if ($this.classCounter.Counters[0] -gt 0) {
                    $this.classCounter.Dec(1)
                    # Brace depth back at zero means the class body just closed.
                    if ($this.classCounter.Counters[1] -eq 0) {
                        $this.classCounter.Dec(0)
                    }
                }
                return $this.NewToken([TokenType]::RCurly)
            }
            '+' { return $this.NewToken([TokenType]::Plus) }
            '-' { return $this.NewToken([TokenType]::Minus) }
            '/' { return $this.NewToken([TokenType]::Divide) }
            '*' { return $this.NewToken([TokenType]::Asterisk) }
            '%' {
                $i = -1
                while ($this.tokens[$i].Type -in [TokenType]::WhiteSpace, [TokenType]::CStyleBlockComment, [TokenType]::PSBlockComment) {
                    $i--
                }
                $prev = $this.tokens[$i] # Previous non-whitespace, non-block-comment token
                if ($prev.Type -in $null, [TokenType]::NewLine, [TokenType]::SemiColon, [TokenType]::LCurly, [TokenType]::LParen, [TokenType]::LAngle, [TokenType]::RAngle, [TokenType]::LBracket, [TokenType]::Directive, [TokenType]::Mnemonic, [TokenType]::Hash, [TokenType]::Comma, [TokenType]::Divide, [TokenType]::Equals, [TokenType]::Minus, [TokenType]::Modulo, [TokenType]::Plus, [TokenType]::Asterisk, [TokenType]::TernaryColon, [TokenType]::QuestionMark) {
                    # Unary % operator, treat as start of binary number
                    $cnt = 0
                    while ($this.GetChar() -in $script:CharsBin) { $cnt++ }
                    $this.UnGetChar()
                    if ($cnt -gt 0 -and $this.PeekChar() -match '^\W') {
                        return $this.NewToken([TokenType]::NumericLiteral)
                    } else {
                        throw "Unexpected character '$($this.PeekChar())' after binary literal at line $($this.tline), column $($this.tcolumn)"
                    }
                } else {
                    # Else assume binary modulo operator or Foreach-Object alias, both represented by the same token type and disambiguated in the ps parser
                    return $this.NewToken([TokenType]::Modulo)
                }
            }
            '=' { return $this.NewToken([TokenType]::Equals) }
            ',' { return $this.NewToken([TokenType]::Comma) }
            '|' { return $this.NewToken([TokenType]::Pipe) }
            "'" { return $this.ScanStringLiteral() }
            '"' { return $this.ScanStringExpandable() }
            {$_ -in " ","`t","`f","`v"} { return $this.NewToken([TokenType]::WhiteSpace) }
            "`n" { return $this.ScanNewline($c) }
            "`r" { return $this.ScanNewline($c) }
            ';' {
                # A semicolon ends the statement, and with it the arguments of a pending directive.
                $this.HandlePendingDirective()
                return $this.NewToken([TokenType]::SemiColon)
            }
            '.' {
                if ($this.PeekChar() -match '^[_a-z:]') {
                    if ($this.tokens[-1].Type -in [TokenType]::WhiteSpace, [TokenType]::NewLine, [TokenType]::SemiColon, [TokenType]::LCurly, [TokenType]::LParen, $null) {
                        # Identifiers can start with a . and not all directives start with a . so ScanIdentifier is used to figure out if it's a directive or not
                        return $this.ScanIdentifier()
                    }
                    return $this.ScanMember()
                }
                if ($this.PeekChar() -eq '.') {
                    $this.SkipChar()
                    return $this.NewToken([TokenType]::DotDot)
                }
                return $this.NewToken([TokenType]::Dot)
            }
            ':' {
                if ($this.sawQuestionMark) {
                    $this.sawQuestionMark = $false
                    return $this.NewToken([TokenType]::TernaryColon)
                }
                $c1 = $this.PeekChar()
                if ($c1 -eq '+' -or $c1 -eq '-') {
                    while ($this.GetChar() -eq $c1) {}
                    $this.UnGetChar()
                    # If the run is directly followed by a variable or identifier, give the
                    # last sign character back so it is tokenized on its own.
                    if ($this.PeekChar() -in ([char[]]'$' + $script:CharsIdentifier)) {
                        $this.UnGetChar()
                    }
                    return $this.NewToken([TokenType]::AnonymousReference)
                }
                if ($c1 -eq ':') {
                    $this.SkipChar()
                    return $this.NewToken([TokenType]::ColonColon)
                }
                return $this.NewToken([TokenType]::AnonymousLabel)
            }
            {$_ -in $script:Chars0to9} { return $this.ScanNumber() }
            '$' {
                ### No good way in the tokenizer to detect all the valid places a $xxxx number could exist
                ### e.g. as parameters in macros, so for now just treat $xxxx anywhere as a hex number
                $cnt = 0
                while ($this.GetChar() -in $script:CharsHex) { $cnt++ }
                $this.UnGetChar()
                if ($cnt -gt 0 -and $this.PeekChar() -match '^\W') {
                    return $this.NewToken([TokenType]::NumericLiteral)
                }
                # Not a pure run of hex digits: keep scanning from here as a variable name.
                return $this.ScanVariable()
            }
            '?' {
                if ($this.PeekChar() -eq '?') {
                    $this.SkipChar()
                    return $this.NewToken([TokenType]::NullCoalesce)
                }
                if ($this.PeekChar() -eq '.') {
                    $this.SkipChar()
                    return $this.NewToken([TokenType]::NullConditionalProperty)
                }
                if ($this.PeekChar() -eq '[') {
                    $this.SkipChar()
                    return $this.NewToken([TokenType]::NullConditionalIndex)
                }
                $this.sawQuestionMark = $true
                return $this.NewToken([TokenType]::QuestionMark)
            }
            '@' { return $this.NewToken([TokenType]::AtSymbol) }
            {$_ -in [char[]]($script:Char_ + $script:CharsAtoZ)} { return $this.ScanIdentifier() }
            default { return $this.NewToken([TokenType]::Unknown) }
        }

        # Every case above returns, so reaching this point indicates a logic error.
        Write-Host "'$c' at $($this.cpos ) WHAT?! this should not happen."
        return $this.NewToken([TokenType]::Error)
    }

    # Reads tokens until the input is exhausted and then appends an EOF token.
    [void] Tokenize() {
        while ($this.PeekChar() -ne 0) {
            $this.tokens.Add($this.NextToken())
        }
        # cpos is moved one past tokenStart so that NewToken slices a single ascending index
        # (a zero-width token would produce a descending range in NewToken).
        $this.tokenStart = $this.cpos
        $this.cpos++
        $this.tokens.Add($this.NewToken([TokenType]::EOF))
    }
}