extends Object

const Constants = preload('res://addons/Wol/core/constants.gd')

const LINE_COMMENT : String = '//'
const FORWARD_SLASH : String = '/'
const LINE_SEPARATOR : String = '\n'

const BASE : String = 'base'
const DASH : String = '-'
const COMMAND : String = 'command'
const LINK : String = 'link'
const SHORTCUT : String = 'shortcut'
const TAG : String = 'tag'
const EXPRESSION : String = 'expression'
const ASSIGNMENT : String = 'assignment'
const OPTION : String = 'option'
const OR : String = 'or'
const DESTINATION : String = 'destination'

var WHITESPACE : String = '\\s*'

var _states : Dictionary = {}
var _defaultState : LexerState
var _currentState : LexerState

var _indentStack : Array = []
var _shouldTrackIndent : bool = false

func _init():
	create_states()

func create_states():
	var patterns : Dictionary = {}
	patterns[Constants.TokenType.Text] = '.*'
	patterns[Constants.TokenType.Number] = '\\-?[0-9]+(\\.[0-9]+)?'
	patterns[Constants.TokenType.Str] = '\'([^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\''
	patterns[Constants.TokenType.TagMarker] = '\\#'
	patterns[Constants.TokenType.LeftParen] = '\\('
	patterns[Constants.TokenType.RightParen] = '\\)'
	patterns[Constants.TokenType.EqualTo] = '(==|is(?!\\w)|eq(?!\\w))'
	patterns[Constants.TokenType.EqualToOrAssign] = '(=|to(?!\\w))'
	patterns[Constants.TokenType.NotEqualTo] = '(\\!=|neq(?!\\w))'
	patterns[Constants.TokenType.GreaterThanOrEqualTo] = '(\\>=|gte(?!\\w))'
	patterns[Constants.TokenType.GreaterThan] = '(\\>|gt(?!\\w))'
	patterns[Constants.TokenType.LessThanOrEqualTo] = '(\\<=|lte(?!\\w))'
	patterns[Constants.TokenType.LessThan] = '(\\<|lt(?!\\w))'
	patterns[Constants.TokenType.AddAssign] = '\\+='
	patterns[Constants.TokenType.MinusAssign] = '\\-='
	patterns[Constants.TokenType.MultiplyAssign] = '\\*='
	patterns[Constants.TokenType.DivideAssign] = '\\/='
	patterns[Constants.TokenType.Add] = '\\+'
	patterns[Constants.TokenType.Minus] = '\\-'
	patterns[Constants.TokenType.Multiply] = '\\*'
	patterns[Constants.TokenType.Divide] = '\\/'
	patterns[Constants.TokenType.Modulo] = '\\%'
	patterns[Constants.TokenType.And] = '(\\&\\&|and(?!\\w))'
	patterns[Constants.TokenType.Or] = '(\\|\\||or(?!\\w))'
	patterns[Constants.TokenType.Xor] = '(\\^|xor(?!\\w))'
	patterns[Constants.TokenType.Not] = '(\\!|not(?!\\w))'
	patterns[Constants.TokenType.Variable] = '\\$([A-Za-z0-9_\\.])+'
	patterns[Constants.TokenType.Comma] = '\\,'
	patterns[Constants.TokenType.TrueToken] = 'true(?!\\w)'
	patterns[Constants.TokenType.FalseToken] = 'false(?!\\w)'
	patterns[Constants.TokenType.NullToken] = 'null(?!\\w)'
	patterns[Constants.TokenType.BeginCommand] = '\\<\\<'
	patterns[Constants.TokenType.EndCommand] = '\\>\\>'
	patterns[Constants.TokenType.OptionStart] = '\\[\\['
	patterns[Constants.TokenType.OptionEnd] = '\\]\\]'
	patterns[Constants.TokenType.OptionDelimit] = '\\|'
	patterns[Constants.TokenType.Identifier] = '[a-zA-Z0-9_:\\.]+'
	patterns[Constants.TokenType.IfToken] = 'if(?!\\w)'
	patterns[Constants.TokenType.ElseToken] = 'else(?!\\w)'
	patterns[Constants.TokenType.ElseIf] = 'elseif(?!\\w)'
	patterns[Constants.TokenType.EndIf] = 'endif(?!\\w)'
	patterns[Constants.TokenType.Set] = 'set(?!\\w)'
	patterns[Constants.TokenType.ShortcutOption] = '\\-\\>\\s*'

	# Compound state names
	var shortcut_option : String = SHORTCUT + DASH + OPTION
	var shortcut_option_tag : String = shortcut_option + DASH + TAG
	var command_or_expression : String = COMMAND + DASH + OR + DASH + EXPRESSION
	var link_destination : String = LINK + DASH + DESTINATION
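	# The lexer is a small state machine. BASE handles plain dialogue lines and
	# dispatches into the other states on '<<' (commands), '[[' (links/options),
	# '->' (shortcut options) and '#' (tags). Each state below lists the token
	# rules that are valid while it is active and, optionally, the state to
	# switch to once a rule matches.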
	_states = {}

	_states[BASE] = LexerState.new(patterns)
	_states[BASE].add_transition(Constants.TokenType.BeginCommand, COMMAND, true)
	_states[BASE].add_transition(Constants.TokenType.OptionStart, LINK, true)
	_states[BASE].add_transition(Constants.TokenType.ShortcutOption, shortcut_option)
	_states[BASE].add_transition(Constants.TokenType.TagMarker, TAG, true)
	_states[BASE].add_text_rule(Constants.TokenType.Text)

	_states[TAG] = LexerState.new(patterns)
	_states[TAG].add_transition(Constants.TokenType.Identifier, BASE)

	_states[shortcut_option] = LexerState.new(patterns)
	_states[shortcut_option].track_indent = true
	_states[shortcut_option].add_transition(Constants.TokenType.BeginCommand, EXPRESSION, true)
	_states[shortcut_option].add_transition(Constants.TokenType.TagMarker, shortcut_option_tag, true)
	_states[shortcut_option].add_text_rule(Constants.TokenType.Text, BASE)

	_states[shortcut_option_tag] = LexerState.new(patterns)
	_states[shortcut_option_tag].add_transition(Constants.TokenType.Identifier, shortcut_option)

	_states[COMMAND] = LexerState.new(patterns)
	_states[COMMAND].add_transition(Constants.TokenType.IfToken, EXPRESSION)
	_states[COMMAND].add_transition(Constants.TokenType.ElseToken)
	_states[COMMAND].add_transition(Constants.TokenType.ElseIf, EXPRESSION)
	_states[COMMAND].add_transition(Constants.TokenType.EndIf)
	_states[COMMAND].add_transition(Constants.TokenType.Set, ASSIGNMENT)
	_states[COMMAND].add_transition(Constants.TokenType.EndCommand, BASE, true)
	_states[COMMAND].add_transition(Constants.TokenType.Identifier, command_or_expression)
	_states[COMMAND].add_text_rule(Constants.TokenType.Text)

	_states[command_or_expression] = LexerState.new(patterns)
	_states[command_or_expression].add_transition(Constants.TokenType.LeftParen, EXPRESSION)
	_states[command_or_expression].add_transition(Constants.TokenType.EndCommand, BASE, true)
	_states[command_or_expression].add_text_rule(Constants.TokenType.Text)

	_states[ASSIGNMENT] = LexerState.new(patterns)
	_states[ASSIGNMENT].add_transition(Constants.TokenType.Variable)
	_states[ASSIGNMENT].add_transition(Constants.TokenType.EqualToOrAssign, EXPRESSION)
	_states[ASSIGNMENT].add_transition(Constants.TokenType.AddAssign, EXPRESSION)
	_states[ASSIGNMENT].add_transition(Constants.TokenType.MinusAssign, EXPRESSION)
	_states[ASSIGNMENT].add_transition(Constants.TokenType.MultiplyAssign, EXPRESSION)
	_states[ASSIGNMENT].add_transition(Constants.TokenType.DivideAssign, EXPRESSION)
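	# EXPRESSION accepts any operator or operand token and only returns to BASE
	# when the closing '>>' (EndCommand) is reached.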
	_states[EXPRESSION] = LexerState.new(patterns)
	_states[EXPRESSION].add_transition(Constants.TokenType.EndCommand, BASE)
	_states[EXPRESSION].add_transition(Constants.TokenType.Number)
	_states[EXPRESSION].add_transition(Constants.TokenType.Str)
	_states[EXPRESSION].add_transition(Constants.TokenType.LeftParen)
	_states[EXPRESSION].add_transition(Constants.TokenType.RightParen)
	_states[EXPRESSION].add_transition(Constants.TokenType.EqualTo)
	_states[EXPRESSION].add_transition(Constants.TokenType.EqualToOrAssign)
	_states[EXPRESSION].add_transition(Constants.TokenType.NotEqualTo)
	_states[EXPRESSION].add_transition(Constants.TokenType.GreaterThanOrEqualTo)
	_states[EXPRESSION].add_transition(Constants.TokenType.GreaterThan)
	_states[EXPRESSION].add_transition(Constants.TokenType.LessThanOrEqualTo)
	_states[EXPRESSION].add_transition(Constants.TokenType.LessThan)
	_states[EXPRESSION].add_transition(Constants.TokenType.Add)
	_states[EXPRESSION].add_transition(Constants.TokenType.Minus)
	_states[EXPRESSION].add_transition(Constants.TokenType.Multiply)
	_states[EXPRESSION].add_transition(Constants.TokenType.Divide)
	_states[EXPRESSION].add_transition(Constants.TokenType.Modulo)
	_states[EXPRESSION].add_transition(Constants.TokenType.And)
	_states[EXPRESSION].add_transition(Constants.TokenType.Or)
	_states[EXPRESSION].add_transition(Constants.TokenType.Xor)
	_states[EXPRESSION].add_transition(Constants.TokenType.Not)
	_states[EXPRESSION].add_transition(Constants.TokenType.Variable)
	_states[EXPRESSION].add_transition(Constants.TokenType.Comma)
	_states[EXPRESSION].add_transition(Constants.TokenType.TrueToken)
	_states[EXPRESSION].add_transition(Constants.TokenType.FalseToken)
	_states[EXPRESSION].add_transition(Constants.TokenType.NullToken)
	_states[EXPRESSION].add_transition(Constants.TokenType.Identifier)

	_states[LINK] = LexerState.new(patterns)
	_states[LINK].add_transition(Constants.TokenType.OptionEnd, BASE, true)
	_states[LINK].add_transition(Constants.TokenType.OptionDelimit, link_destination, true)
	_states[LINK].add_text_rule(Constants.TokenType.Text)

	_states[link_destination] = LexerState.new(patterns)
	_states[link_destination].add_transition(Constants.TokenType.Identifier)
	_states[link_destination].add_transition(Constants.TokenType.OptionEnd, BASE)

	_defaultState = _states[BASE]

	for stateKey in _states.keys():
		_states[stateKey].stateName = stateKey

func tokenize(text: String) -> Array:
	_indentStack.clear()
	_indentStack.push_front(IntBoolPair.new(0, false))
	_shouldTrackIndent = false

	var tokens : Array = []

	_currentState = _defaultState

	var lines : PoolStringArray = text.split(LINE_SEPARATOR)
	lines.append('')

	var line_number : int = 1

	for line in lines:
		tokens += tokenize_line(line, line_number)
		line_number += 1

	var endOfInput : Token = Token.new(Constants.TokenType.EndOfInput, _currentState, line_number, 0)
	tokens.append(endOfInput)

	# print(tokens)
	return tokens

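# Tokenizes a single line: emits Indent/Dedent tokens when the indentation
# level changes while indent tracking is active (after a shortcut option),
# then repeatedly tries the rules of the current state at the current column
# until the line ends or a '//' comment is reached.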
func tokenize_line(line: String, line_number: int) -> Array:
	var tokenStack : Array = []

	var freshLine = line.replace('\t', ' ').replace('\r', '')

	# Record indentation
	var indentation = line_indentation(line)
	var prevIndentation : IntBoolPair = _indentStack.front()

	if _shouldTrackIndent && indentation > prevIndentation.key:
		# Add an Indent token to record the new indent level
		_indentStack.push_front(IntBoolPair.new(indentation, true))

		var indent : Token = Token.new(Constants.TokenType.Indent, _currentState, line_number, prevIndentation.key)
		indent.value = '%*s' % [indentation - prevIndentation.key, '']

		_shouldTrackIndent = false
		tokenStack.push_front(indent)
	elif indentation < prevIndentation.key:
		# De-indent, emitting a Dedent token for every tracked level we leave
		while indentation < _indentStack.front().key:
			var top : IntBoolPair = _indentStack.pop_front()
			if top.value:
				var deIndent : Token = Token.new(Constants.TokenType.Dedent, _currentState, line_number, 0)
				tokenStack.push_front(deIndent)

	var column : int = indentation

	var whitespace : RegEx = RegEx.new()
	var error = whitespace.compile(WHITESPACE)
	if error != OK:
		printerr('unable to compile regex WHITESPACE')
		return []

	while column < freshLine.length():
		if freshLine.substr(column).begins_with(LINE_COMMENT):
			break

		var matched : bool = false

		for rule in _currentState.rules:
			var found : RegExMatch = rule.regex.search(freshLine, column)
			if !found:
				continue

			var tokenText : String

			if rule.tokenType == Constants.TokenType.Text:
				# If this is text, back up to the most recent delimiting token
				# and treat everything from there as the text.
				var startIndex : int = indentation

				if tokenStack.size() > 0:
					while tokenStack.front().type == Constants.TokenType.Identifier:
						tokenStack.pop_front()

					var startDelimitToken : Token = tokenStack.front()
					startIndex = startDelimitToken.column

					if startDelimitToken.type == Constants.TokenType.Indent:
						startIndex += startDelimitToken.value.length()
					if startDelimitToken.type == Constants.TokenType.Dedent:
						startIndex = indentation
					# column = startIndex

				var endIndex : int = found.get_start() + found.get_string().length()
				tokenText = freshLine.substr(startIndex, endIndex - startIndex)
			else:
				tokenText = found.get_string()

			column += tokenText.length()

			# Pre-process strings: strip the quotes and unescape
			if rule.tokenType == Constants.TokenType.Str:
				tokenText = tokenText.substr(1, tokenText.length() - 2)
				tokenText = tokenText.replace('\\\\', '\\')
				tokenText = tokenText.replace('\\\'', '\'')

			var token : Token = Token.new(rule.tokenType, _currentState, line_number, column, tokenText)
			token.delimitsText = rule.delimitsText

			tokenStack.push_front(token)

			if rule.enterState != null && rule.enterState.length() > 0:
				if !_states.has(rule.enterState):
					printerr('State[%s] not known - line(%s) col(%s)' % [rule.enterState, line_number, column])
					return []

				enter_state(_states[rule.enterState])

				if _shouldTrackIndent:
					if _indentStack.front().key < indentation:
						_indentStack.append(IntBoolPair.new(indentation, false))

			matched = true
			break

		if !matched:
			# TODO: Send out some helpful messages
			printerr('expectedTokens [%s] - line(%s) col(%s)' % ['refineErrors.Lexer.tokenize_line', line_number, column])
			return []

		var lastWhiteSpace : RegExMatch = whitespace.search(line, column)
		if lastWhiteSpace:
			column += lastWhiteSpace.get_string().length()

	tokenStack.invert()

	return tokenStack

func line_indentation(line: String) -> int:
	var indentRegex : RegEx = RegEx.new()
	indentRegex.compile('^(\\s*)')

	var found : RegExMatch = indentRegex.search(line)

	if !found || found.get_string().length() <= 0:
		return 0

	return found.get_string().length()

func enter_state(state: LexerState):
	_currentState = state
	if _currentState.track_indent:
		_shouldTrackIndent = true

class Token:
	var type : int
	var value : String

	var line_number : int
	var column : int
	var text : String

	var delimitsText : bool = false
	var paramCount : int
	var lexerState : String

	func _init(type: int, state: LexerState, line_number: int = -1, column: int = -1, value: String = ''):
		self.type = type
		self.lexerState = state.stateName
		self.line_number = line_number
		self.column = column
		self.value = value

	func _to_string():
		return '%s (%s) at %s:%s (state: %s)' % [Constants.token_type_name(type), value, line_number, column, lexerState]

class LexerState:
	var stateName : String
	var patterns : Dictionary
	var rules : Array = []
	var track_indent : bool = false

	func _init(patterns):
		self.patterns = patterns

	func add_transition(type: int, state: String = '', delimitText: bool = false) -> Rule:
		var pattern = '\\G%s' % patterns[type]
		# print('pattern = %s' % pattern)
		var rule = Rule.new(type, pattern, state, delimitText)
		rules.append(rule)
		return rule

	func add_text_rule(type: int, state: String = '') -> Rule:
		if contains_text_rule():
			printerr('State already contains Text rule')
			return null

		var delimiters : Array = []
		for rule in rules:
			if rule.delimitsText:
				# Strip the leading '\G' anchor from the delimiter pattern
				delimiters.append('%s' % rule.regex.get_pattern().substr(2))

		var pattern = '\\G((?!%s).)*' % [PoolStringArray(delimiters).join('|')]
		var rule : Rule = add_transition(type, state)
		rule.regex = RegEx.new()
		rule.regex.compile(pattern)
		rule.isTextRule = true
		return rule

	func contains_text_rule() -> bool:
		for rule in rules:
			if rule.isTextRule:
				return true
		return false

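# Rule pairs a compiled regex with the state to enter when it matches;
# IntBoolPair records an indentation level and whether an Indent token was
# emitted for it.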
class Rule:
	var regex : RegEx
	var enterState : String
	var tokenType : int
	var isTextRule : bool
	var delimitsText : bool

	func _init(type: int, regex: String, enterState: String, delimitsText: bool):
		self.tokenType = type
		self.regex = RegEx.new()
		self.regex.compile(regex)
		self.enterState = enterState
		self.delimitsText = delimitsText

	func _to_string():
		return '[Rule : %s - %s]' % [Constants.token_type_name(tokenType), regex]

class IntBoolPair:
	var key : int
	var value : bool

	func _init(key: int, value: bool):
		self.key = key
		self.value = value
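# Example usage (a minimal sketch; the resource path below is an assumption,
# adjust it to wherever this script lives in your project):
#
#	var Lexer = load('res://addons/Wol/core/compiler/lexer.gd')
#	var lexer = Lexer.new()
#	for token in lexer.tokenize('<<if $visited>>\nHello there!\n<<endif>>'):
#		print(token)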