cleaned up lexer

This commit is contained in:
Bram Dingelstad 2021-11-21 15:03:56 +01:00
parent 83c4808d5a
commit 9f60bbfd23

View file

@ -21,26 +21,26 @@ const DESTINATION = 'destination'
# Regex fragment matching any run of whitespace (used to skip trailing space
# after each matched token in tokenize_line).
var WHITESPACE = '\\s*'

# Source information for the node currently being tokenized.
var filename = ''
var title = ''
var text = ''

# Lexer state machine: state name -> LexerState, built by createstates().
var states = {}
var default_state
var current_state

# Stack of [indentation_level: int, emitted_indent_token: bool] pairs;
# drives Indent/Dedent token emission in tokenize_line.
var indent_stack = []
var should_track_indent = false
func _init(_filename, _title, _text):
    # Stores the source info for tokenization and builds the state machine.
    # NOTE(review): the diff rendering shows `createstates`; this may be
    # `create_states` with an underscore lost by the renderer — confirm
    # against the repository.
    createstates()
    filename = _filename
    title = _title
    text = _text
func createstates():
    # Builds the lexer state machine: per-token regex patterns plus the
    # transition tables of every lexer state, stored into `states`.
    # Each pattern entry is [regex_source, human_readable_identifier].
    var patterns = {}
    patterns[Constants.TokenType.Text] = ['.*', 'any text']
    patterns[Constants.TokenType.Number] = ['\\-?[0-9]+(\\.[0-9+])?', 'any number']
    # NOTE(review): the pattern entries for the remaining token types
    # (original lines ~24-85) are omitted from this diff view and cannot be
    # reconstructed here — confirm against the repository.
    patterns[Constants.TokenType.Set] = ['set(?!\\w)', '"set"']
    patterns[Constants.TokenType.ShortcutOption] = ['\\-\\>\\s*', '"->"']

    # Compound state names.
    var shortcut_option = SHORTCUT + DASH + OPTION
    var shortcut_option_tag = shortcut_option + DASH + TAG
    var command_or_expression = COMMAND + DASH + OR + DASH + EXPRESSION
    var link_destination = LINK + DASH + DESTINATION

    states = {}

    states[BASE] = LexerState.new(patterns)
    states[BASE].add_transition(Constants.TokenType.BeginCommand, COMMAND, true)
    states[BASE].add_transition(Constants.TokenType.OptionStart, LINK, true)
    states[BASE].add_transition(Constants.TokenType.ShortcutOption, shortcut_option)
    states[BASE].add_transition(Constants.TokenType.TagMarker, TAG, true)
    states[BASE].add_text_rule(Constants.TokenType.Text)

    states[TAG] = LexerState.new(patterns)
    states[TAG].add_transition(Constants.TokenType.Identifier, BASE)

    # Shortcut options introduce nested blocks, so this state tracks indent.
    states[shortcut_option] = LexerState.new(patterns)
    states[shortcut_option].track_indent = true
    states[shortcut_option].add_transition(Constants.TokenType.BeginCommand, EXPRESSION, true)
    states[shortcut_option].add_transition(Constants.TokenType.TagMarker, shortcut_option_tag, true)
    states[shortcut_option].add_text_rule(Constants.TokenType.Text, BASE)

    states[shortcut_option_tag] = LexerState.new(patterns)
    states[shortcut_option_tag].add_transition(Constants.TokenType.Identifier, shortcut_option)

    states[COMMAND] = LexerState.new(patterns)
    states[COMMAND].add_transition(Constants.TokenType.IfToken, EXPRESSION)
    states[COMMAND].add_transition(Constants.TokenType.ElseToken)
    states[COMMAND].add_transition(Constants.TokenType.ElseIf, EXPRESSION)
    states[COMMAND].add_transition(Constants.TokenType.EndIf)
    states[COMMAND].add_transition(Constants.TokenType.Set, ASSIGNMENT)
    states[COMMAND].add_transition(Constants.TokenType.EndCommand, BASE, true)
    states[COMMAND].add_transition(Constants.TokenType.Identifier, command_or_expression)
    states[COMMAND].add_text_rule(Constants.TokenType.Text)

    states[command_or_expression] = LexerState.new(patterns)
    states[command_or_expression].add_transition(Constants.TokenType.LeftParen, EXPRESSION)
    states[command_or_expression].add_transition(Constants.TokenType.EndCommand, BASE, true)
    states[command_or_expression].add_text_rule(Constants.TokenType.Text)

    states[ASSIGNMENT] = LexerState.new(patterns)
    states[ASSIGNMENT].add_transition(Constants.TokenType.Variable)
    states[ASSIGNMENT].add_transition(Constants.TokenType.EqualToOrAssign, EXPRESSION)
    states[ASSIGNMENT].add_transition(Constants.TokenType.AddAssign, EXPRESSION)
    states[ASSIGNMENT].add_transition(Constants.TokenType.MinusAssign, EXPRESSION)
    states[ASSIGNMENT].add_transition(Constants.TokenType.MultiplyAssign, EXPRESSION)
    states[ASSIGNMENT].add_transition(Constants.TokenType.DivideAssign, EXPRESSION)

    states[EXPRESSION] = LexerState.new(patterns)
    states[EXPRESSION].add_transition(Constants.TokenType.EndCommand, BASE)
    states[EXPRESSION].add_transition(Constants.TokenType.Number)
    states[EXPRESSION].add_transition(Constants.TokenType.Str)
    states[EXPRESSION].add_transition(Constants.TokenType.LeftParen)
    states[EXPRESSION].add_transition(Constants.TokenType.RightParen)
    states[EXPRESSION].add_transition(Constants.TokenType.EqualTo)
    states[EXPRESSION].add_transition(Constants.TokenType.EqualToOrAssign)
    states[EXPRESSION].add_transition(Constants.TokenType.NotEqualTo)
    states[EXPRESSION].add_transition(Constants.TokenType.GreaterThanOrEqualTo)
    states[EXPRESSION].add_transition(Constants.TokenType.GreaterThan)
    states[EXPRESSION].add_transition(Constants.TokenType.LessThanOrEqualTo)
    states[EXPRESSION].add_transition(Constants.TokenType.LessThan)
    states[EXPRESSION].add_transition(Constants.TokenType.Add)
    states[EXPRESSION].add_transition(Constants.TokenType.Minus)
    states[EXPRESSION].add_transition(Constants.TokenType.Multiply)
    states[EXPRESSION].add_transition(Constants.TokenType.Divide)
    states[EXPRESSION].add_transition(Constants.TokenType.Modulo)
    states[EXPRESSION].add_transition(Constants.TokenType.And)
    states[EXPRESSION].add_transition(Constants.TokenType.Or)
    states[EXPRESSION].add_transition(Constants.TokenType.Xor)
    states[EXPRESSION].add_transition(Constants.TokenType.Not)
    states[EXPRESSION].add_transition(Constants.TokenType.Variable)
    states[EXPRESSION].add_transition(Constants.TokenType.Comma)
    states[EXPRESSION].add_transition(Constants.TokenType.TrueToken)
    states[EXPRESSION].add_transition(Constants.TokenType.FalseToken)
    states[EXPRESSION].add_transition(Constants.TokenType.NullToken)
    states[EXPRESSION].add_transition(Constants.TokenType.Identifier)

    states[LINK] = LexerState.new(patterns)
    states[LINK].add_transition(Constants.TokenType.OptionEnd, BASE, true)
    states[LINK].add_transition(Constants.TokenType.OptionDelimit, link_destination, true)
    states[LINK].add_text_rule(Constants.TokenType.Text)

    states[link_destination] = LexerState.new(patterns)
    states[link_destination].add_transition(Constants.TokenType.Identifier)
    states[link_destination].add_transition(Constants.TokenType.OptionEnd, BASE)

    default_state = states[BASE]

    # Let every state know its own name (used by Token to record lexer_state).
    for key in states.keys():
        states[key].name = key
func tokenize():
    # Tokenizes the whole source `text` line by line and returns the token
    # list, terminated by an EndOfInput token.
    var tokens = []

    indent_stack.clear()
    indent_stack.push_front([0, false])
    should_track_indent = false

    current_state = default_state

    var lines = text.split(LINE_SEPARATOR)
    var line_number = 1

    # Append a terminating empty line before tokenizing.
    lines.append('')

    for line in lines:
        tokens += tokenize_line(line, line_number)
        line_number += 1

    # BUGFIX: Token._init expects (type, state, filename, line_number, column);
    # the original call omitted `filename`, shifting line_number into the
    # filename slot and 0 into line_number.
    var end_of_input = Token.new(
        Constants.TokenType.EndOfInput,
        current_state,
        filename,
        line_number,
        0
    )
    tokens.append(end_of_input)

    return tokens
func tokenize_line(line, line_number):
    # Tokenizes a single line: emits Indent/Dedent tokens based on the indent
    # stack, then repeatedly matches the current state's rules against the
    # line. Returns the tokens in source order (stack is inverted at the end).
    var token_stack = []

    # Normalize tabs to spaces and strip carriage returns before matching.
    var fresh_line = line.replace('\t', '    ').replace('\r', '')

    # Record indentation; indent_stack entries are [level: int, emitted: bool].
    var indentation = line_indentation(line)
    var previous_indentation = indent_stack.front()[0]

    if should_track_indent && indentation > previous_indentation:
        # Push an indentation token to record the new indent level.
        indent_stack.push_front([indentation, true])

        var indent = Token.new(
            Constants.TokenType.Indent,
            current_state,
            filename,
            line_number,
            previous_indentation
        )
        indent.value = '%*s' % [indentation - previous_indentation, '']

        should_track_indent = false
        token_stack.push_front(indent)
    elif indentation < previous_indentation:
        # De-indent, emitting a Dedent for every level that emitted an Indent.
        while indentation < indent_stack.front()[0]:
            var top = indent_stack.pop_front()[1]
            if top:
                # BUGFIX: Token._init expects (type, state, filename,
                # line_number, column); the original call omitted `filename`.
                var deindent = Token.new(Constants.TokenType.Dedent, current_state, filename, line_number, 0)
                token_stack.push_front(deindent)

    var column = indentation

    var whitespace = RegEx.new()
    whitespace.compile(WHITESPACE)

    while column < fresh_line.length():
        # Stop at a line comment.
        if fresh_line.substr(column).begins_with(LINE_COMENT):
            break

        var matched = false

        for rule in current_state.rules:
            var found = rule.regex.search(fresh_line, column)
            if !found:
                continue

            var token_text = ''

            # NOTE: If this is text then we back up to the most recent
            # delimiting token and treat everything from there as text.
            if rule.token_type == Constants.TokenType.Text:
                var start_index = indentation

                if token_stack.size() > 0:
                    while token_stack.front().type == Constants.TokenType.Identifier:
                        token_stack.pop_front()

                    var start_delimit_token = token_stack.front()
                    start_index = start_delimit_token.column

                    if start_delimit_token.type == Constants.TokenType.Indent:
                        start_index += start_delimit_token.value.length()
                    if start_delimit_token.type == Constants.TokenType.Dedent:
                        start_index = indentation

                column = start_index
                var end_index = found.get_start() + found.get_string().length()
                token_text = fresh_line.substr(start_index, end_index - start_index)
            else:
                token_text = found.get_string()

            column += token_text.length()

            # Pre-process strings: strip quotes and unescape \\ and \'.
            if rule.token_type == Constants.TokenType.Str:
                token_text = token_text.substr(1, token_text.length() - 2)
                token_text = token_text.replace('\\\\', '\\')
                token_text = token_text.replace('\\\'', '\'')

            var token = Token.new(
                rule.token_type,
                current_state,
                filename,
                line_number,
                column,
                token_text
            )
            token.delimits_text = rule.delimits_text
            token_stack.push_front(token)

            if rule.enter_state != null and rule.enter_state.length() > 0:
                if not states.has(rule.enter_state):
                    printerr('State[%s] not known - line(%s) col(%s)' % [rule.enter_state, line_number, column])
                    return []

                enter_state(states[rule.enter_state])

                if should_track_indent:
                    if indent_stack.front()[0] < indentation:
                        indent_stack.append([indentation, false])

            matched = true
            break

        if not matched:
            # No rule matched: report which tokens were expected here.
            var rules = []
            for rule in current_state.rules:
                rules.append('"%s" (%s)' % [Constants.token_type_name(rule.token_type), rule.human_readable_identifier])

            var error_data = [
                # NOTE(review): these elements fall in a part of the file
                # omitted from this diff view; they are reconstructed from the
                # format string below — confirm against the repository.
                PoolStringArray(rules).join(' or '),
                filename,
                title,
                line_number,
                column
            ]
            assert(false, 'Expected %s in file %s in node "%s" on line #%d (column #%d)' % error_data)

        # Skip whitespace between tokens.
        var last_whitespace = whitespace.search(line, column)
        if last_whitespace:
            column += last_whitespace.get_string().length()

    token_stack.invert()

    return token_stack
func line_indentation(line):
    # Returns the number of leading whitespace characters on `line`
    # (0 when the line has none or the regex finds no match).
    var indent_regex = RegEx.new()
    indent_regex.compile('^(\\s*)')

    var found = indent_regex.search(line)
    if !found or found.get_string().length() <= 0:
        return 0

    return found.get_string().length()
func enter_state(state):
    # Makes `state` the active lexer state; enables indentation tracking
    # when the state requests it (see LexerState.track_indent).
    current_state = state
    if current_state.track_indent:
        should_track_indent = true
class Token:
    # A single lexed token: its type, source location, raw text/value and the
    # lexer state that produced it.
    var type = -1
    # NOTE(review): the declarations of the four fields below fall in a part
    # of the file omitted from this diff view; they are reconstructed from the
    # assignments in _init — confirm names and defaults against the repository.
    var filename = ''
    var line_number = -1
    var column = -1
    var value = ''
    var text = ''
    var delimits_text = false
    var parameter_count = -1
    var lexer_state = ''

    func _init(_type, _state, _filename, _line_number = -1, _column = -1, _value = ''):
        type = _type
        lexer_state = _state.name
        filename = _filename
        line_number = _line_number
        column = _column
        value = _value

    func _to_string():
        # Human-readable form for debugging/error messages.
        return '%s (%s) at %s:%s (state: %s)' % [Constants.token_type_name(type), value, line_number, column, lexer_state]
class LexerState:
    # One state of the lexer state machine: a set of rules tried in order
    # against the current line (see tokenize_line).
    var name = ''
    var patterns = {}
    var rules = []
    # When true, entering this state makes the lexer start tracking
    # indentation (see enter_state / should_track_indent).
    var track_indent = false

    func _init(_patterns):
        patterns = _patterns

    func add_transition(type, state = '', delimit_text = false):
        # Adds a rule matching `patterns[type]` anchored at the current
        # position (\G); `state` is the state to enter on a match.
        var pattern = '\\G%s' % patterns[type][0]
        var rule = Rule.new(type, pattern, patterns[type][1], state, delimit_text)
        rules.append(rule)
        return rule

    func add_text_rule(type, state = ''):
        # Adds the single catch-all Text rule: matches anything up to the
        # nearest delimiting rule's pattern. Only one per state is allowed.
        if contains_text_rule():
            printerr('State already contains Text rule')
            return null

        # NOTE(review): the lines declaring `delimiters` and looping over the
        # delimiting rules fall in a part of the file omitted from this diff
        # view; they are reconstructed from the visible append line below —
        # confirm against the repository.
        var delimiters = []
        for rule in rules:
            if rule.delimits_text:
                # substr(2) strips the leading \G anchor from the pattern.
                delimiters.append('%s' % rule.regex.get_pattern().substr(2))

        var pattern = '\\G((?!%s).)*' % [PoolStringArray(delimiters).join('|')]
        var rule = add_transition(type, state)
        rule.regex = RegEx.new()
        rule.regex.compile(pattern)
        rule.is_text_rule = true
        return rule

    func contains_text_rule():
        # True when one of this state's rules is the catch-all Text rule.
        for rule in rules:
            if rule.is_text_rule:
                return true
        return false
class Rule:
    # A single lexing rule: a compiled regex, the token type it produces,
    # and the state to enter after a match.
    var regex
    var enter_state = ''
    var token_type = -1
    var is_text_rule = false
    var delimits_text = false
    var human_readable_identifier = ''

    func _init(_type, _regex, _human_readable_identifier, _enter_state, _delimits_text):
        # NOTE(review): the body of _init falls in a part of the file omitted
        # from this diff view; the assignments below are reconstructed from
        # the parameter and field names — confirm against the repository.
        token_type = _type
        regex = RegEx.new()
        regex.compile(_regex)
        human_readable_identifier = _human_readable_identifier
        enter_state = _enter_state
        delimits_text = _delimits_text

    func _to_string():
        # Human-readable form for debugging/error messages.
        return '[Rule : %s (%s) - %s]' % [Constants.token_type_name(token_type), human_readable_identifier, regex]
# NOTE(review): this helper class is deleted by this commit — the new code
# stores plain [int, bool] arrays on indent_stack instead (see
# `indent_stack.push_front([0, false])` in tokenize). These are the removed
# (old-side) lines of the diff; their indentation was stripped by the renderer.
class IntBoolPair:
# Indentation level this entry records.
var key = -1
# Whether an Indent token was emitted for this level.
var value = false
func _init(_key, _value):
key = _key
value = _value