From 9f60bbfd230a0a53de5208e3fb24877678127486 Mon Sep 17 00:00:00 2001 From: Bram Dingelstad Date: Sun, 21 Nov 2021 15:03:56 +0100 Subject: [PATCH] cleaned up lexer --- addons/Wol/core/compiler/lexer.gd | 392 ++++++++++++++---------------- 1 file changed, 183 insertions(+), 209 deletions(-) diff --git a/addons/Wol/core/compiler/lexer.gd b/addons/Wol/core/compiler/lexer.gd index 405ee8a..16add19 100644 --- a/addons/Wol/core/compiler/lexer.gd +++ b/addons/Wol/core/compiler/lexer.gd @@ -21,26 +21,26 @@ const DESTINATION = 'destination' var WHITESPACE = '\\s*' -var _states = {} -var _defaultState -var _currentState - -var _indentStack = [] -var _shouldTrackIndent : bool = false - var filename = '' var title = '' var text = '' +var states = {} +var default_state +var current_state + +var indent_stack = [] +var should_track_indent = false + func _init(_filename, _title, _text): - create_states() + createstates() filename = _filename title = _title text = _text -func create_states(): - var patterns : Dictionary = {} +func createstates(): + var patterns = {} patterns[Constants.TokenType.Text] = ['.*', 'any text'] patterns[Constants.TokenType.Number] = ['\\-?[0-9]+(\\.[0-9+])?', 'any number'] @@ -86,249 +86,234 @@ func create_states(): patterns[Constants.TokenType.Set] = ['set(?!\\w)', '"set"'] patterns[Constants.TokenType.ShortcutOption] = ['\\-\\>\\s*', '"->"'] - #compound states - var shortcut_option : String = SHORTCUT + DASH + OPTION - var shortcut_option_tag : String = shortcut_option + DASH + TAG - var command_or_expression : String = COMMAND + DASH + OR + DASH + EXPRESSION - var link_destination : String = LINK + DASH + DESTINATION + var shortcut_option = SHORTCUT + DASH + OPTION + var shortcut_option_tag = shortcut_option + DASH + TAG + var command_or_expression = COMMAND + DASH + OR + DASH + EXPRESSION + var link_destination = LINK + DASH + DESTINATION - _states = {} + states = {} - _states[BASE] = LexerState.new(patterns) - _states[BASE].add_transition(Constants.TokenType.BeginCommand,COMMAND,true) - _states[BASE].add_transition(Constants.TokenType.OptionStart,LINK,true) - _states[BASE].add_transition(Constants.TokenType.ShortcutOption, shortcut_option) - _states[BASE].add_transition(Constants.TokenType.TagMarker,TAG,true) - _states[BASE].add_text_rule(Constants.TokenType.Text) + states[BASE] = LexerState.new(patterns) + states[BASE].add_transition(Constants.TokenType.BeginCommand, COMMAND, true) + states[BASE].add_transition(Constants.TokenType.OptionStart, LINK, true) + states[BASE].add_transition(Constants.TokenType.ShortcutOption, shortcut_option) + states[BASE].add_transition(Constants.TokenType.TagMarker, TAG, true) + states[BASE].add_text_rule(Constants.TokenType.Text) - _states[TAG] = LexerState.new(patterns) - _states[TAG].add_transition(Constants.TokenType.Identifier,BASE) + states[TAG] = LexerState.new(patterns) + states[TAG].add_transition(Constants.TokenType.Identifier, BASE) - _states[shortcut_option] = LexerState.new(patterns) - _states[shortcut_option].track_indent = true - _states[shortcut_option].add_transition(Constants.TokenType.BeginCommand,EXPRESSION,true) - _states[shortcut_option].add_transition(Constants.TokenType.TagMarker,shortcut_option_tag,true) - _states[shortcut_option].add_text_rule(Constants.TokenType.Text,BASE) + states[shortcut_option] = LexerState.new(patterns) + states[shortcut_option].track_indent = true + states[shortcut_option].add_transition(Constants.TokenType.BeginCommand, EXPRESSION, true) + states[shortcut_option].add_transition(Constants.TokenType.TagMarker, shortcut_option_tag, true) + states[shortcut_option].add_text_rule(Constants.TokenType.Text, BASE) - _states[shortcut_option_tag] = LexerState.new(patterns) - _states[shortcut_option_tag].add_transition(Constants.TokenType.Identifier,shortcut_option) + states[shortcut_option_tag] = LexerState.new(patterns) + states[shortcut_option_tag].add_transition(Constants.TokenType.Identifier, shortcut_option) - _states[COMMAND] = LexerState.new(patterns) - _states[COMMAND].add_transition(Constants.TokenType.IfToken,EXPRESSION) - _states[COMMAND].add_transition(Constants.TokenType.ElseToken) - _states[COMMAND].add_transition(Constants.TokenType.ElseIf,EXPRESSION) - _states[COMMAND].add_transition(Constants.TokenType.EndIf) - _states[COMMAND].add_transition(Constants.TokenType.Set, ASSIGNMENT) - _states[COMMAND].add_transition(Constants.TokenType.EndCommand,BASE,true) - _states[COMMAND].add_transition(Constants.TokenType.Identifier,command_or_expression) - _states[COMMAND].add_text_rule(Constants.TokenType.Text) + states[COMMAND] = LexerState.new(patterns) + states[COMMAND].add_transition(Constants.TokenType.IfToken, EXPRESSION) + states[COMMAND].add_transition(Constants.TokenType.ElseToken) + states[COMMAND].add_transition(Constants.TokenType.ElseIf, EXPRESSION) + states[COMMAND].add_transition(Constants.TokenType.EndIf) + states[COMMAND].add_transition(Constants.TokenType.Set, ASSIGNMENT) + states[COMMAND].add_transition(Constants.TokenType.EndCommand, BASE, true) + states[COMMAND].add_transition(Constants.TokenType.Identifier, command_or_expression) + states[COMMAND].add_text_rule(Constants.TokenType.Text) - _states[command_or_expression] = LexerState.new(patterns) - _states[command_or_expression].add_transition(Constants.TokenType.LeftParen,EXPRESSION) - _states[command_or_expression].add_transition(Constants.TokenType.EndCommand,BASE,true) - _states[command_or_expression].add_text_rule(Constants.TokenType.Text) + states[command_or_expression] = LexerState.new(patterns) + states[command_or_expression].add_transition(Constants.TokenType.LeftParen, EXPRESSION) + states[command_or_expression].add_transition(Constants.TokenType.EndCommand, BASE, true) + states[command_or_expression].add_text_rule(Constants.TokenType.Text) - _states[ASSIGNMENT] = LexerState.new(patterns) - _states[ASSIGNMENT].add_transition(Constants.TokenType.Variable) - _states[ASSIGNMENT].add_transition(Constants.TokenType.EqualToOrAssign, EXPRESSION) - _states[ASSIGNMENT].add_transition(Constants.TokenType.AddAssign, EXPRESSION) - _states[ASSIGNMENT].add_transition(Constants.TokenType.MinusAssign, EXPRESSION) - _states[ASSIGNMENT].add_transition(Constants.TokenType.MultiplyAssign, EXPRESSION) - _states[ASSIGNMENT].add_transition(Constants.TokenType.DivideAssign, EXPRESSION) + states[ASSIGNMENT] = LexerState.new(patterns) + states[ASSIGNMENT].add_transition(Constants.TokenType.Variable) + states[ASSIGNMENT].add_transition(Constants.TokenType.EqualToOrAssign, EXPRESSION) + states[ASSIGNMENT].add_transition(Constants.TokenType.AddAssign, EXPRESSION) + states[ASSIGNMENT].add_transition(Constants.TokenType.MinusAssign, EXPRESSION) + states[ASSIGNMENT].add_transition(Constants.TokenType.MultiplyAssign, EXPRESSION) + states[ASSIGNMENT].add_transition(Constants.TokenType.DivideAssign, EXPRESSION) - _states[EXPRESSION] = LexerState.new(patterns) - _states[EXPRESSION].add_transition(Constants.TokenType.EndCommand, BASE) - _states[EXPRESSION].add_transition(Constants.TokenType.Number) - _states[EXPRESSION].add_transition(Constants.TokenType.Str) - _states[EXPRESSION].add_transition(Constants.TokenType.LeftParen) - _states[EXPRESSION].add_transition(Constants.TokenType.RightParen) - _states[EXPRESSION].add_transition(Constants.TokenType.EqualTo) - _states[EXPRESSION].add_transition(Constants.TokenType.EqualToOrAssign) - _states[EXPRESSION].add_transition(Constants.TokenType.NotEqualTo) - _states[EXPRESSION].add_transition(Constants.TokenType.GreaterThanOrEqualTo) - _states[EXPRESSION].add_transition(Constants.TokenType.GreaterThan) - _states[EXPRESSION].add_transition(Constants.TokenType.LessThanOrEqualTo) - _states[EXPRESSION].add_transition(Constants.TokenType.LessThan) - _states[EXPRESSION].add_transition(Constants.TokenType.Add) - _states[EXPRESSION].add_transition(Constants.TokenType.Minus) - _states[EXPRESSION].add_transition(Constants.TokenType.Multiply) - _states[EXPRESSION].add_transition(Constants.TokenType.Divide) - _states[EXPRESSION].add_transition(Constants.TokenType.Modulo) - _states[EXPRESSION].add_transition(Constants.TokenType.And) - _states[EXPRESSION].add_transition(Constants.TokenType.Or) - _states[EXPRESSION].add_transition(Constants.TokenType.Xor) - _states[EXPRESSION].add_transition(Constants.TokenType.Not) - _states[EXPRESSION].add_transition(Constants.TokenType.Variable) - _states[EXPRESSION].add_transition(Constants.TokenType.Comma) - _states[EXPRESSION].add_transition(Constants.TokenType.TrueToken) - _states[EXPRESSION].add_transition(Constants.TokenType.FalseToken) - _states[EXPRESSION].add_transition(Constants.TokenType.NullToken) - _states[EXPRESSION].add_transition(Constants.TokenType.Identifier) + states[EXPRESSION] = LexerState.new(patterns) + states[EXPRESSION].add_transition(Constants.TokenType.EndCommand, BASE) + states[EXPRESSION].add_transition(Constants.TokenType.Number) + states[EXPRESSION].add_transition(Constants.TokenType.Str) + states[EXPRESSION].add_transition(Constants.TokenType.LeftParen) + states[EXPRESSION].add_transition(Constants.TokenType.RightParen) + states[EXPRESSION].add_transition(Constants.TokenType.EqualTo) + states[EXPRESSION].add_transition(Constants.TokenType.EqualToOrAssign) + states[EXPRESSION].add_transition(Constants.TokenType.NotEqualTo) + states[EXPRESSION].add_transition(Constants.TokenType.GreaterThanOrEqualTo) + states[EXPRESSION].add_transition(Constants.TokenType.GreaterThan) + states[EXPRESSION].add_transition(Constants.TokenType.LessThanOrEqualTo) + states[EXPRESSION].add_transition(Constants.TokenType.LessThan) + states[EXPRESSION].add_transition(Constants.TokenType.Add) + states[EXPRESSION].add_transition(Constants.TokenType.Minus) + states[EXPRESSION].add_transition(Constants.TokenType.Multiply) + states[EXPRESSION].add_transition(Constants.TokenType.Divide) + states[EXPRESSION].add_transition(Constants.TokenType.Modulo) + states[EXPRESSION].add_transition(Constants.TokenType.And) + states[EXPRESSION].add_transition(Constants.TokenType.Or) + states[EXPRESSION].add_transition(Constants.TokenType.Xor) + states[EXPRESSION].add_transition(Constants.TokenType.Not) + states[EXPRESSION].add_transition(Constants.TokenType.Variable) + states[EXPRESSION].add_transition(Constants.TokenType.Comma) + states[EXPRESSION].add_transition(Constants.TokenType.TrueToken) + states[EXPRESSION].add_transition(Constants.TokenType.FalseToken) + states[EXPRESSION].add_transition(Constants.TokenType.NullToken) + states[EXPRESSION].add_transition(Constants.TokenType.Identifier) - _states[LINK] = LexerState.new(patterns) - _states[LINK].add_transition(Constants.TokenType.OptionEnd, BASE, true) - _states[LINK].add_transition(Constants.TokenType.OptionDelimit, link_destination, true) - _states[LINK].add_text_rule(Constants.TokenType.Text) + states[LINK] = LexerState.new(patterns) + states[LINK].add_transition(Constants.TokenType.OptionEnd, BASE, true) + states[LINK].add_transition(Constants.TokenType.OptionDelimit, link_destination, true) + states[LINK].add_text_rule(Constants.TokenType.Text) - _states[link_destination] = LexerState.new(patterns) - _states[link_destination].add_transition(Constants.TokenType.Identifier) - _states[link_destination].add_transition(Constants.TokenType.OptionEnd, BASE) + states[link_destination] = LexerState.new(patterns) + states[link_destination].add_transition(Constants.TokenType.Identifier) + states[link_destination].add_transition(Constants.TokenType.OptionEnd, BASE) - _defaultState = _states[BASE] + default_state = states[BASE] - for stateKey in _states.keys(): - _states[stateKey].stateName = stateKey + for key in states.keys(): + states[key].name = key func tokenize(): - _indentStack.clear() - _indentStack.push_front(IntBoolPair.new(0, false)) - _shouldTrackIndent = false - var tokens = [] - _currentState = _defaultState + indent_stack.clear() + indent_stack.push_front([0, false]) + should_track_indent = false + current_state = default_state var lines = text.split(LINE_SEPARATOR) - lines.append('') + var line_number = 1 - var line_number : int = 1 + lines.append('') for line in lines: tokens += tokenize_line(line, line_number) line_number += 1 - var endOfInput = Token.new( + var end_of_input = Token.new( Constants.TokenType.EndOfInput, - _currentState, + current_state, line_number, 0 ) - tokens.append(endOfInput) + tokens.append(end_of_input) return tokens func tokenize_line(line, line_number): - var tokenStack : Array = [] + var token_stack = [] - var freshLine = line.replace('\t',' ').replace('\r','') + var fresh_line = line.replace('\t',' ').replace('\r','') - #record indentation var indentation = line_indentation(line) - var prevIndentation = _indentStack.front() + var previous_indentation = indent_stack.front()[0] - if _shouldTrackIndent && indentation > prevIndentation.key: - #we add an indenation token to record indent level - _indentStack.push_front(IntBoolPair.new(indentation,true)) + if should_track_indent && indentation > previous_indentation: + indent_stack.push_front([indentation, true]) - var indent : Token = Token.new( + var indent = Token.new( Constants.TokenType.Indent, - _currentState, + current_state, filename, line_number, - prevIndentation.key + previous_indentation ) - indent.value = '%*s' % [indentation - prevIndentation.key,''] + indent.value = '%*s' % [indentation - previous_indentation, ''] - _shouldTrackIndent = false - tokenStack.push_front(indent) + should_track_indent = false + token_stack.push_front(indent) - elif indentation < prevIndentation.key: - #de-indent and then emit indentaiton token - - while indentation < _indentStack.front().key: - var top : IntBoolPair = _indentStack.pop_front() - if top.value: - var deIndent : Token = Token.new(Constants.TokenType.Dedent,_currentState,line_number,0) - tokenStack.push_front(deIndent) + elif indentation < previous_indentation: + while indentation < indent_stack.front()[0]: + var top = indent_stack.pop_front()[1] + if top: + var deindent = Token.new(Constants.TokenType.Dedent, current_state, line_number, 0) + token_stack.push_front(deindent) - - var column : int = indentation + var column = indentation + var whitespace = RegEx.new() + whitespace.compile(WHITESPACE) - var whitespace : RegEx = RegEx.new() - var error = whitespace.compile(WHITESPACE) - if error != OK: - printerr('unable to compile regex WHITESPACE') - return [] - - while column < freshLine.length(): - - if freshLine.substr(column).begins_with(LINE_COMENT): + while column < fresh_line.length(): + if fresh_line.substr(column).begins_with(LINE_COMENT): break - var matched : bool = false + var matched = false - for rule in _currentState.rules: - var found = rule.regex.search(freshLine, column) + for rule in current_state.rules: + var found = rule.regex.search(fresh_line, column) if !found: continue - var tokenText : String + var token_text = '' + # NOTE: If this is text then we back up to the most recent delimiting token + # and treat everything from there as text. if rule.token_type == Constants.TokenType.Text: - #if this is text then we back up to the most recent - #delimiting token and treat everything from there as text. - var startIndex : int = indentation + var start_index = indentation - if tokenStack.size() > 0 : - while tokenStack.front().type == Constants.TokenType.Identifier: - tokenStack.pop_front() + if token_stack.size() > 0 : + while token_stack.front().type == Constants.TokenType.Identifier: + token_stack.pop_front() - var startDelimitToken : Token = tokenStack.front() - startIndex = startDelimitToken.column + var start_delimit_token = token_stack.front() + start_index = start_delimit_token.column - if startDelimitToken.type == Constants.TokenType.Indent: - startIndex += startDelimitToken.value.length() - if startDelimitToken.type == Constants.TokenType.Dedent: - startIndex = indentation - # + if start_delimit_token.type == Constants.TokenType.Indent: + start_index += start_delimit_token.value.length() + if start_delimit_token.type == Constants.TokenType.Dedent: + start_index = indentation - column = startIndex + column = start_index var end_index = found.get_start() + found.get_string().length() - tokenText = freshLine.substr(startIndex, end_index - startIndex) - + token_text = fresh_line.substr(start_index, end_index - start_index) else: - tokenText = found.get_string() + token_text = found.get_string() - column += tokenText.length() + column += token_text.length() - #pre-proccess string if rule.token_type == Constants.TokenType.Str: - tokenText = tokenText.substr(1, tokenText.length() - 2) - tokenText = tokenText.replace('\\\\', '\\') - tokenText = tokenText.replace('\\\'','\'') + token_text = token_text.substr(1, token_text.length() - 2) + token_text = token_text.replace('\\\\', '\\') + token_text = token_text.replace('\\\'','\'') var token = Token.new( rule.token_type, - _currentState, + current_state, filename, line_number, column, - tokenText + token_text ) token.delimits_text = rule.delimits_text - tokenStack.push_front(token) + token_stack.push_front(token) if rule.enter_state != null and rule.enter_state.length() > 0: - if not _states.has(rule.enter_state): + if not states.has(rule.enter_state): printerr('State[%s] not known - line(%s) col(%s)' % [rule.enter_state, line_number, column]) return [] - enter_state(_states[rule.enter_state]) + enter_state(states[rule.enter_state]) - if _shouldTrackIndent: - if _indentStack.front().key < indentation: - _indentStack.append(IntBoolPair.new(indentation, false)) + if should_track_indent: + if indent_stack.front()[0] < indentation: + indent_stack.append([indentation, false]) matched = true break if not matched: var rules = [] - for rule in _currentState.rules: + for rule in current_state.rules: rules.append('"%s" (%s)' % [Constants.token_type_name(rule.token_type), rule.human_readable_identifier]) var error_data = [ @@ -340,30 +325,30 @@ func tokenize_line(line, line_number): ] assert(false, 'Expected %s in file %s in node "%s" on line #%d (column #%d)' % error_data) - var lastWhiteSpace = whitespace.search(line, column) - if lastWhiteSpace: - column += lastWhiteSpace.get_string().length() + var last_whitespace = whitespace.search(line, column) + if last_whitespace: + column += last_whitespace.get_string().length() - tokenStack.invert() + token_stack.invert() - return tokenStack + return token_stack -func line_indentation(line:String)->int: - var indentRegex : RegEx = RegEx.new() - indentRegex.compile('^(\\s*)') +func line_indentation(line): + var indent_regex = RegEx.new() + indent_regex.compile('^(\\s*)') - var found : RegExMatch = indentRegex.search(line) + var found = indent_regex.search(line) - if !found || found.get_string().length() <= 0: + if !found or found.get_string().length() <= 0: return 0 return found.get_string().length() -func enter_state(state:LexerState): - _currentState = state; - if _currentState.track_indent: - _shouldTrackIndent = true +func enter_state(state): + current_state = state; + if current_state.track_indent: + should_track_indent = true class Token: var type = -1 @@ -375,38 +360,36 @@ class Token: var text = '' var delimits_text = false - var paramCount = -1 - var lexerState = '' + var parameter_count = -1 + var lexer_state = '' func _init(_type, _state, _filename, _line_number = -1, _column = -1, _value = ''): type = _type - lexerState = _state.stateName + lexer_state = _state.name filename = _filename line_number = _line_number column = _column value = _value func _to_string(): - return '%s (%s) at %s:%s (state: %s)' % [Constants.token_type_name(type),value,line_number,column,lexerState] + return '%s (%s) at %s:%s (state: %s)' % [Constants.token_type_name(type),value, line_number, column, lexer_state] class LexerState: - - var stateName : String - var patterns : Dictionary - var rules : Array = [] - var track_indent : bool = false + var name = '' + var patterns = {} + var rules = [] + var track_indent = false func _init(_patterns): patterns = _patterns - func add_transition(type : int, state : String = '',delimitText : bool = false)->Rule: + func add_transition(type, state = '', delimit_text = false): var pattern = '\\G%s' % patterns[type][0] - # print('pattern = %s' % pattern) - var rule = Rule.new(type, pattern, patterns[type][1], state, delimitText) + var rule = Rule.new(type, pattern, patterns[type][1], state, delimit_text) rules.append(rule) return rule - func add_text_rule(type : int, state : String = '')->Rule: + func add_text_rule(type, state = ''): if contains_text_rule() : printerr('State already contains Text rule') return null @@ -417,25 +400,25 @@ class LexerState: delimiters.append('%s' % rule.regex.get_pattern().substr(2)) var pattern = '\\G((?!%s).)*' % [PoolStringArray(delimiters).join('|')] - var rule : Rule = add_transition(type,state) + var rule = add_transition(type, state) rule.regex = RegEx.new() rule.regex.compile(pattern) rule.is_text_rule = true return rule - func contains_text_rule()->bool: + func contains_text_rule(): for rule in rules: if rule.is_text_rule: return true return false class Rule: - var regex : RegEx + var regex - var enter_state : String - var token_type : int - var is_text_rule : bool - var delimits_text : bool + var enter_state = '' + var token_type = -1 + var is_text_rule = false + var delimits_text = false var human_readable_identifier = '' func _init(_type, _regex, _human_readable_identifier, _enter_state, _delimits_text): @@ -450,12 +433,3 @@ class Rule: func _to_string(): return '[Rule : %s (%s) - %s]' % [Constants.token_type_name(token_type), human_readable_identifier, regex] - -class IntBoolPair: - var key = -1 - var value = false - - func _init(_key, _value): - key = _key - value = _value -