cleaned up lexer

This commit is contained in:
Bram Dingelstad 2021-11-21 15:03:56 +01:00
parent 83c4808d5a
commit 9f60bbfd23

View file

@ -21,26 +21,26 @@ const DESTINATION = 'destination'
# Script-level lexer state.
# NOTE(review): this listing is a commit diff shown without +/- markers; the
# underscore-prefixed / camelCase variables and their snake_case counterparts
# presumably never coexist in the real file -- confirm against the repository.

# Regex source that tokenize_line compiles and uses to skip whitespace after each token.
var WHITESPACE = '\\s*'
var _states = {}
var _defaultState
var _currentState
var _indentStack = []
var _shouldTrackIndent : bool = false
# Values handed to _init: the file name stamped on tokens, a title, and the text to lex.
var filename = ''
var title = ''
var text = ''
# LexerState objects keyed by state name.
var states = {}
# The state tokenize() starts from (set to states[BASE] when the states are built).
var default_state
# The state whose rules tokenize_line is currently trying; changed by enter_state().
var current_state
# Stack of [indentation, emitted_indent_token] entries; front() is the most recent one.
var indent_stack = []
# Set by enter_state() for states with track_indent; cleared once an Indent token is emitted.
var should_track_indent = false
# Builds the lexer state table, then stores the file name, title and text to lex.
# NOTE(review): both spellings of the state-building call appear because this is a
# diff view without +/- markers; presumably only the one matching the function's
# current name exists in the real file -- confirm.
func _init(_filename, _title, _text):
create_states()
createstates()
filename = _filename
title = _title
text = _text
func create_states():
var patterns : Dictionary = {}
func createstates():
var patterns = {}
patterns[Constants.TokenType.Text] = ['.*', 'any text']
patterns[Constants.TokenType.Number] = ['\\-?[0-9]+(\\.[0-9+])?', 'any number']
@ -86,249 +86,234 @@ func create_states():
patterns[Constants.TokenType.Set] = ['set(?!\\w)', '"set"']
patterns[Constants.TokenType.ShortcutOption] = ['\\-\\>\\s*', '"->"']
#compound states
var shortcut_option : String = SHORTCUT + DASH + OPTION
var shortcut_option_tag : String = shortcut_option + DASH + TAG
var command_or_expression : String = COMMAND + DASH + OR + DASH + EXPRESSION
var link_destination : String = LINK + DASH + DESTINATION
var shortcut_option = SHORTCUT + DASH + OPTION
var shortcut_option_tag = shortcut_option + DASH + TAG
var command_or_expression = COMMAND + DASH + OR + DASH + EXPRESSION
var link_destination = LINK + DASH + DESTINATION
_states = {}
states = {}
_states[BASE] = LexerState.new(patterns)
_states[BASE].add_transition(Constants.TokenType.BeginCommand,COMMAND,true)
_states[BASE].add_transition(Constants.TokenType.OptionStart,LINK,true)
_states[BASE].add_transition(Constants.TokenType.ShortcutOption, shortcut_option)
_states[BASE].add_transition(Constants.TokenType.TagMarker,TAG,true)
_states[BASE].add_text_rule(Constants.TokenType.Text)
states[BASE] = LexerState.new(patterns)
states[BASE].add_transition(Constants.TokenType.BeginCommand, COMMAND, true)
states[BASE].add_transition(Constants.TokenType.OptionStart, LINK, true)
states[BASE].add_transition(Constants.TokenType.ShortcutOption, shortcut_option)
states[BASE].add_transition(Constants.TokenType.TagMarker, TAG, true)
states[BASE].add_text_rule(Constants.TokenType.Text)
_states[TAG] = LexerState.new(patterns)
_states[TAG].add_transition(Constants.TokenType.Identifier,BASE)
states[TAG] = LexerState.new(patterns)
states[TAG].add_transition(Constants.TokenType.Identifier, BASE)
_states[shortcut_option] = LexerState.new(patterns)
_states[shortcut_option].track_indent = true
_states[shortcut_option].add_transition(Constants.TokenType.BeginCommand,EXPRESSION,true)
_states[shortcut_option].add_transition(Constants.TokenType.TagMarker,shortcut_option_tag,true)
_states[shortcut_option].add_text_rule(Constants.TokenType.Text,BASE)
states[shortcut_option] = LexerState.new(patterns)
states[shortcut_option].track_indent = true
states[shortcut_option].add_transition(Constants.TokenType.BeginCommand, EXPRESSION, true)
states[shortcut_option].add_transition(Constants.TokenType.TagMarker, shortcut_option_tag, true)
states[shortcut_option].add_text_rule(Constants.TokenType.Text, BASE)
_states[shortcut_option_tag] = LexerState.new(patterns)
_states[shortcut_option_tag].add_transition(Constants.TokenType.Identifier,shortcut_option)
states[shortcut_option_tag] = LexerState.new(patterns)
states[shortcut_option_tag].add_transition(Constants.TokenType.Identifier, shortcut_option)
_states[COMMAND] = LexerState.new(patterns)
_states[COMMAND].add_transition(Constants.TokenType.IfToken,EXPRESSION)
_states[COMMAND].add_transition(Constants.TokenType.ElseToken)
_states[COMMAND].add_transition(Constants.TokenType.ElseIf,EXPRESSION)
_states[COMMAND].add_transition(Constants.TokenType.EndIf)
_states[COMMAND].add_transition(Constants.TokenType.Set, ASSIGNMENT)
_states[COMMAND].add_transition(Constants.TokenType.EndCommand,BASE,true)
_states[COMMAND].add_transition(Constants.TokenType.Identifier,command_or_expression)
_states[COMMAND].add_text_rule(Constants.TokenType.Text)
states[COMMAND] = LexerState.new(patterns)
states[COMMAND].add_transition(Constants.TokenType.IfToken, EXPRESSION)
states[COMMAND].add_transition(Constants.TokenType.ElseToken)
states[COMMAND].add_transition(Constants.TokenType.ElseIf, EXPRESSION)
states[COMMAND].add_transition(Constants.TokenType.EndIf)
states[COMMAND].add_transition(Constants.TokenType.Set, ASSIGNMENT)
states[COMMAND].add_transition(Constants.TokenType.EndCommand, BASE, true)
states[COMMAND].add_transition(Constants.TokenType.Identifier, command_or_expression)
states[COMMAND].add_text_rule(Constants.TokenType.Text)
_states[command_or_expression] = LexerState.new(patterns)
_states[command_or_expression].add_transition(Constants.TokenType.LeftParen,EXPRESSION)
_states[command_or_expression].add_transition(Constants.TokenType.EndCommand,BASE,true)
_states[command_or_expression].add_text_rule(Constants.TokenType.Text)
states[command_or_expression] = LexerState.new(patterns)
states[command_or_expression].add_transition(Constants.TokenType.LeftParen, EXPRESSION)
states[command_or_expression].add_transition(Constants.TokenType.EndCommand, BASE, true)
states[command_or_expression].add_text_rule(Constants.TokenType.Text)
_states[ASSIGNMENT] = LexerState.new(patterns)
_states[ASSIGNMENT].add_transition(Constants.TokenType.Variable)
_states[ASSIGNMENT].add_transition(Constants.TokenType.EqualToOrAssign, EXPRESSION)
_states[ASSIGNMENT].add_transition(Constants.TokenType.AddAssign, EXPRESSION)
_states[ASSIGNMENT].add_transition(Constants.TokenType.MinusAssign, EXPRESSION)
_states[ASSIGNMENT].add_transition(Constants.TokenType.MultiplyAssign, EXPRESSION)
_states[ASSIGNMENT].add_transition(Constants.TokenType.DivideAssign, EXPRESSION)
states[ASSIGNMENT] = LexerState.new(patterns)
states[ASSIGNMENT].add_transition(Constants.TokenType.Variable)
states[ASSIGNMENT].add_transition(Constants.TokenType.EqualToOrAssign, EXPRESSION)
states[ASSIGNMENT].add_transition(Constants.TokenType.AddAssign, EXPRESSION)
states[ASSIGNMENT].add_transition(Constants.TokenType.MinusAssign, EXPRESSION)
states[ASSIGNMENT].add_transition(Constants.TokenType.MultiplyAssign, EXPRESSION)
states[ASSIGNMENT].add_transition(Constants.TokenType.DivideAssign, EXPRESSION)
_states[EXPRESSION] = LexerState.new(patterns)
_states[EXPRESSION].add_transition(Constants.TokenType.EndCommand, BASE)
_states[EXPRESSION].add_transition(Constants.TokenType.Number)
_states[EXPRESSION].add_transition(Constants.TokenType.Str)
_states[EXPRESSION].add_transition(Constants.TokenType.LeftParen)
_states[EXPRESSION].add_transition(Constants.TokenType.RightParen)
_states[EXPRESSION].add_transition(Constants.TokenType.EqualTo)
_states[EXPRESSION].add_transition(Constants.TokenType.EqualToOrAssign)
_states[EXPRESSION].add_transition(Constants.TokenType.NotEqualTo)
_states[EXPRESSION].add_transition(Constants.TokenType.GreaterThanOrEqualTo)
_states[EXPRESSION].add_transition(Constants.TokenType.GreaterThan)
_states[EXPRESSION].add_transition(Constants.TokenType.LessThanOrEqualTo)
_states[EXPRESSION].add_transition(Constants.TokenType.LessThan)
_states[EXPRESSION].add_transition(Constants.TokenType.Add)
_states[EXPRESSION].add_transition(Constants.TokenType.Minus)
_states[EXPRESSION].add_transition(Constants.TokenType.Multiply)
_states[EXPRESSION].add_transition(Constants.TokenType.Divide)
_states[EXPRESSION].add_transition(Constants.TokenType.Modulo)
_states[EXPRESSION].add_transition(Constants.TokenType.And)
_states[EXPRESSION].add_transition(Constants.TokenType.Or)
_states[EXPRESSION].add_transition(Constants.TokenType.Xor)
_states[EXPRESSION].add_transition(Constants.TokenType.Not)
_states[EXPRESSION].add_transition(Constants.TokenType.Variable)
_states[EXPRESSION].add_transition(Constants.TokenType.Comma)
_states[EXPRESSION].add_transition(Constants.TokenType.TrueToken)
_states[EXPRESSION].add_transition(Constants.TokenType.FalseToken)
_states[EXPRESSION].add_transition(Constants.TokenType.NullToken)
_states[EXPRESSION].add_transition(Constants.TokenType.Identifier)
states[EXPRESSION] = LexerState.new(patterns)
states[EXPRESSION].add_transition(Constants.TokenType.EndCommand, BASE)
states[EXPRESSION].add_transition(Constants.TokenType.Number)
states[EXPRESSION].add_transition(Constants.TokenType.Str)
states[EXPRESSION].add_transition(Constants.TokenType.LeftParen)
states[EXPRESSION].add_transition(Constants.TokenType.RightParen)
states[EXPRESSION].add_transition(Constants.TokenType.EqualTo)
states[EXPRESSION].add_transition(Constants.TokenType.EqualToOrAssign)
states[EXPRESSION].add_transition(Constants.TokenType.NotEqualTo)
states[EXPRESSION].add_transition(Constants.TokenType.GreaterThanOrEqualTo)
states[EXPRESSION].add_transition(Constants.TokenType.GreaterThan)
states[EXPRESSION].add_transition(Constants.TokenType.LessThanOrEqualTo)
states[EXPRESSION].add_transition(Constants.TokenType.LessThan)
states[EXPRESSION].add_transition(Constants.TokenType.Add)
states[EXPRESSION].add_transition(Constants.TokenType.Minus)
states[EXPRESSION].add_transition(Constants.TokenType.Multiply)
states[EXPRESSION].add_transition(Constants.TokenType.Divide)
states[EXPRESSION].add_transition(Constants.TokenType.Modulo)
states[EXPRESSION].add_transition(Constants.TokenType.And)
states[EXPRESSION].add_transition(Constants.TokenType.Or)
states[EXPRESSION].add_transition(Constants.TokenType.Xor)
states[EXPRESSION].add_transition(Constants.TokenType.Not)
states[EXPRESSION].add_transition(Constants.TokenType.Variable)
states[EXPRESSION].add_transition(Constants.TokenType.Comma)
states[EXPRESSION].add_transition(Constants.TokenType.TrueToken)
states[EXPRESSION].add_transition(Constants.TokenType.FalseToken)
states[EXPRESSION].add_transition(Constants.TokenType.NullToken)
states[EXPRESSION].add_transition(Constants.TokenType.Identifier)
_states[LINK] = LexerState.new(patterns)
_states[LINK].add_transition(Constants.TokenType.OptionEnd, BASE, true)
_states[LINK].add_transition(Constants.TokenType.OptionDelimit, link_destination, true)
_states[LINK].add_text_rule(Constants.TokenType.Text)
states[LINK] = LexerState.new(patterns)
states[LINK].add_transition(Constants.TokenType.OptionEnd, BASE, true)
states[LINK].add_transition(Constants.TokenType.OptionDelimit, link_destination, true)
states[LINK].add_text_rule(Constants.TokenType.Text)
_states[link_destination] = LexerState.new(patterns)
_states[link_destination].add_transition(Constants.TokenType.Identifier)
_states[link_destination].add_transition(Constants.TokenType.OptionEnd, BASE)
states[link_destination] = LexerState.new(patterns)
states[link_destination].add_transition(Constants.TokenType.Identifier)
states[link_destination].add_transition(Constants.TokenType.OptionEnd, BASE)
_defaultState = _states[BASE]
default_state = states[BASE]
for stateKey in _states.keys():
_states[stateKey].stateName = stateKey
for key in states.keys():
states[key].name = key
# Tokenizes the whole of `text`, one line at a time, and returns the tokens as an
# array that ends with an EndOfInput token.
# NOTE(review): this listing is a commit diff shown without +/- markers, so the
# camelCase / underscore-prefixed and the snake_case version of a statement sit
# next to each other; presumably only one of each pair exists in the real file --
# confirm against the repository.
func tokenize():
# Reset the indentation bookkeeping to a single base entry: indentation 0, no
# Indent token emitted for it.
_indentStack.clear()
_indentStack.push_front(IntBoolPair.new(0, false))
_shouldTrackIndent = false
var tokens = []
_currentState = _defaultState
indent_stack.clear()
indent_stack.push_front([0, false])
should_track_indent = false
# Lexing always starts from the default state.
current_state = default_state
var lines = text.split(LINE_SEPARATOR)
# A trailing empty line has indentation 0, so tokenize_line pops any deeper
# entries from the indent stack and emits their Dedent tokens.
lines.append('')
# Line numbers are 1-based.
var line_number = 1
var line_number : int = 1
lines.append('')
for line in lines:
tokens += tokenize_line(line, line_number)
line_number += 1
# NOTE(review): Token._init is declared as
# (_type, _state, _filename, _line_number, _column, _value), but the call below
# passes line_number in the _filename position and 0 as the line number; the
# Indent token in tokenize_line does pass `filename`. Looks like a missing
# argument -- confirm.
var endOfInput = Token.new(
var end_of_input = Token.new(
Constants.TokenType.EndOfInput,
_currentState,
current_state,
line_number,
0
)
tokens.append(endOfInput)
tokens.append(end_of_input)
return tokens
func tokenize_line(line, line_number):
var tokenStack : Array = []
var token_stack = []
var freshLine = line.replace('\t',' ').replace('\r','')
var fresh_line = line.replace('\t',' ').replace('\r','')
#record indentation
var indentation = line_indentation(line)
var prevIndentation = _indentStack.front()
var previous_indentation = indent_stack.front()[0]
if _shouldTrackIndent && indentation > prevIndentation.key:
#we add an indentation token to record indent level
_indentStack.push_front(IntBoolPair.new(indentation,true))
if should_track_indent && indentation > previous_indentation:
indent_stack.push_front([indentation, true])
var indent : Token = Token.new(
var indent = Token.new(
Constants.TokenType.Indent,
_currentState,
current_state,
filename,
line_number,
prevIndentation.key
previous_indentation
)
indent.value = '%*s' % [indentation - prevIndentation.key,'']
indent.value = '%*s' % [indentation - previous_indentation, '']
_shouldTrackIndent = false
tokenStack.push_front(indent)
should_track_indent = false
token_stack.push_front(indent)
elif indentation < prevIndentation.key:
#de-indent and then emit indentation token
while indentation < _indentStack.front().key:
var top : IntBoolPair = _indentStack.pop_front()
if top.value:
var deIndent : Token = Token.new(Constants.TokenType.Dedent,_currentState,line_number,0)
tokenStack.push_front(deIndent)
elif indentation < previous_indentation:
while indentation < indent_stack.front()[0]:
var top = indent_stack.pop_front()[1]
if top:
var deindent = Token.new(Constants.TokenType.Dedent, current_state, line_number, 0)
token_stack.push_front(deindent)
var column : int = indentation
var column = indentation
var whitespace = RegEx.new()
whitespace.compile(WHITESPACE)
var whitespace : RegEx = RegEx.new()
var error = whitespace.compile(WHITESPACE)
if error != OK:
printerr('unable to compile regex WHITESPACE')
return []
while column < freshLine.length():
if freshLine.substr(column).begins_with(LINE_COMENT):
while column < fresh_line.length():
if fresh_line.substr(column).begins_with(LINE_COMENT):
break
var matched : bool = false
var matched = false
for rule in _currentState.rules:
var found = rule.regex.search(freshLine, column)
for rule in current_state.rules:
var found = rule.regex.search(fresh_line, column)
if !found:
continue
var tokenText : String
var token_text = ''
# NOTE: If this is text then we back up to the most recent delimiting token
# and treat everything from there as text.
if rule.token_type == Constants.TokenType.Text:
#if this is text then we back up to the most recent
#delimiting token and treat everything from there as text.
var startIndex : int = indentation
var start_index = indentation
if tokenStack.size() > 0 :
while tokenStack.front().type == Constants.TokenType.Identifier:
tokenStack.pop_front()
if token_stack.size() > 0 :
while token_stack.front().type == Constants.TokenType.Identifier:
token_stack.pop_front()
var startDelimitToken : Token = tokenStack.front()
startIndex = startDelimitToken.column
var start_delimit_token = token_stack.front()
start_index = start_delimit_token.column
if startDelimitToken.type == Constants.TokenType.Indent:
startIndex += startDelimitToken.value.length()
if startDelimitToken.type == Constants.TokenType.Dedent:
startIndex = indentation
#
if start_delimit_token.type == Constants.TokenType.Indent:
start_index += start_delimit_token.value.length()
if start_delimit_token.type == Constants.TokenType.Dedent:
start_index = indentation
column = startIndex
column = start_index
var end_index = found.get_start() + found.get_string().length()
tokenText = freshLine.substr(startIndex, end_index - startIndex)
token_text = fresh_line.substr(start_index, end_index - start_index)
else:
tokenText = found.get_string()
token_text = found.get_string()
column += tokenText.length()
column += token_text.length()
#pre-process string
if rule.token_type == Constants.TokenType.Str:
tokenText = tokenText.substr(1, tokenText.length() - 2)
tokenText = tokenText.replace('\\\\', '\\')
tokenText = tokenText.replace('\\\'','\'')
token_text = token_text.substr(1, token_text.length() - 2)
token_text = token_text.replace('\\\\', '\\')
token_text = token_text.replace('\\\'','\'')
var token = Token.new(
rule.token_type,
_currentState,
current_state,
filename,
line_number,
column,
tokenText
token_text
)
token.delimits_text = rule.delimits_text
tokenStack.push_front(token)
token_stack.push_front(token)
if rule.enter_state != null and rule.enter_state.length() > 0:
if not _states.has(rule.enter_state):
if not states.has(rule.enter_state):
printerr('State[%s] not known - line(%s) col(%s)' % [rule.enter_state, line_number, column])
return []
enter_state(_states[rule.enter_state])
enter_state(states[rule.enter_state])
if _shouldTrackIndent:
if _indentStack.front().key < indentation:
_indentStack.append(IntBoolPair.new(indentation, false))
if should_track_indent:
if indent_stack.front()[0] < indentation:
indent_stack.append([indentation, false])
matched = true
break
if not matched:
var rules = []
for rule in _currentState.rules:
for rule in current_state.rules:
rules.append('"%s" (%s)' % [Constants.token_type_name(rule.token_type), rule.human_readable_identifier])
var error_data = [
@ -340,30 +325,30 @@ func tokenize_line(line, line_number):
]
assert(false, 'Expected %s in file %s in node "%s" on line #%d (column #%d)' % error_data)
var lastWhiteSpace = whitespace.search(line, column)
if lastWhiteSpace:
column += lastWhiteSpace.get_string().length()
var last_whitespace = whitespace.search(line, column)
if last_whitespace:
column += last_whitespace.get_string().length()
tokenStack.invert()
token_stack.invert()
return tokenStack
return token_stack
# Returns the number of leading whitespace characters of `line` (0 when there are none).
# A new RegEx is created and compiled on every call.
# NOTE(review): the typed and untyped variants of each statement appear together
# because this is a diff view without +/- markers -- confirm which one is current.
func line_indentation(line:String)->int:
var indentRegex : RegEx = RegEx.new()
indentRegex.compile('^(\\s*)')
func line_indentation(line):
var indent_regex = RegEx.new()
# Anchored at the start of the line; captures the run of leading whitespace.
indent_regex.compile('^(\\s*)')
var found : RegExMatch = indentRegex.search(line)
var found = indent_regex.search(line)
# No match, or an empty match, means the line is not indented.
if !found || found.get_string().length() <= 0:
if !found or found.get_string().length() <= 0:
return 0
return found.get_string().length()
# Makes `state` the current lexer state. If the state has track_indent set, the
# indent-tracking flag is raised; this function never lowers it (tokenize_line
# clears it after emitting an Indent token, and tokenize() resets it).
# NOTE(review): both variants of each statement appear because this is a diff
# view without +/- markers -- confirm which one is current.
func enter_state(state:LexerState):
_currentState = state;
if _currentState.track_indent:
_shouldTrackIndent = true
func enter_state(state):
current_state = state;
if current_state.track_indent:
should_track_indent = true
class Token:
var type = -1
@ -375,38 +360,36 @@ class Token:
var text = ''
var delimits_text = false
var paramCount = -1
var lexerState = ''
var parameter_count = -1
var lexer_state = ''
# Creates a token.
# _type:        token type (callers pass Constants.TokenType values).
# _state:       lexer state object active when the token was produced; only its
#               name is stored.
# _filename:    file name recorded on the token.
# _line_number: line of the token (default -1).
# _column:      column of the token (default -1).
# _value:       token value (default '').
# NOTE(review): the two lexer-state assignments are the two sides of a diff shown
# without +/- markers -- confirm which field name is current.
func _init(_type, _state, _filename, _line_number = -1, _column = -1, _value = ''):
type = _type
lexerState = _state.stateName
lexer_state = _state.name
filename = _filename
line_number = _line_number
column = _column
value = _value
# Returns a readable description of the token: type name, value, line:column and
# the name of the lexer state it was produced in.
# NOTE(review): the two return lines are the two sides of a diff shown without
# +/- markers -- confirm which one is current.
func _to_string():
return '%s (%s) at %s:%s (state: %s)' % [Constants.token_type_name(type),value,line_number,column,lexerState]
return '%s (%s) at %s:%s (state: %s)' % [Constants.token_type_name(type),value, line_number, column, lexer_state]
class LexerState:
var stateName : String
var patterns : Dictionary
var rules : Array = []
var track_indent : bool = false
var name = ''
var patterns = {}
var rules = []
var track_indent = false
# Stores the pattern table: a dictionary keyed by token type whose values are
# [regex_source, human_readable_description] pairs.
func _init(_patterns):
patterns = _patterns
# Appends a rule for token type `type` to this state and returns it.
# type:         key into `patterns`.
# state:        name of the state to enter after the rule matches ('' = none given).
# delimit_text: forwarded to the Rule as its delimits_text argument.
# NOTE(review): the typed and untyped signature / Rule.new lines are the two
# sides of a diff shown without +/- markers -- confirm which one is current.
func add_transition(type : int, state : String = '',delimitText : bool = false)->Rule:
func add_transition(type, state = '', delimit_text = false):
# Prefix the pattern with \G so that, presumably, a match must begin exactly at
# the offset passed to RegEx.search() -- confirm against the Godot RegEx docs.
var pattern = '\\G%s' % patterns[type][0]
# print('pattern = %s' % pattern)
var rule = Rule.new(type, pattern, patterns[type][1], state, delimitText)
var rule = Rule.new(type, pattern, patterns[type][1], state, delimit_text)
rules.append(rule)
return rule
func add_text_rule(type : int, state : String = '')->Rule:
func add_text_rule(type, state = ''):
if contains_text_rule() :
printerr('State already contains Text rule')
return null
@ -417,25 +400,25 @@ class LexerState:
delimiters.append('%s' % rule.regex.get_pattern().substr(2))
var pattern = '\\G((?!%s).)*' % [PoolStringArray(delimiters).join('|')]
var rule : Rule = add_transition(type,state)
var rule = add_transition(type, state)
rule.regex = RegEx.new()
rule.regex.compile(pattern)
rule.is_text_rule = true
return rule
# Returns true if any rule of this state is flagged as a text rule, false otherwise.
# NOTE(review): the two signatures are the two sides of a diff shown without
# +/- markers -- confirm which one is current.
func contains_text_rule()->bool:
func contains_text_rule():
for rule in rules:
if rule.is_text_rule:
return true
return false
class Rule:
var regex : RegEx
var regex
var enter_state : String
var token_type : int
var is_text_rule : bool
var delimits_text : bool
var enter_state = ''
var token_type = -1
var is_text_rule = false
var delimits_text = false
var human_readable_identifier = ''
func _init(_type, _regex, _human_readable_identifier, _enter_state, _delimits_text):
@ -450,12 +433,3 @@ class Rule:
# Returns a readable description of the rule: token type name, human-readable
# identifier and the regex.
func _to_string():
return '[Rule : %s (%s) - %s]' % [Constants.token_type_name(token_type), human_readable_identifier, regex]
# Simple (int key, bool value) pair. The camelCase lexer code uses it for
# indent-stack entries: key = indentation, value = whether an Indent token was
# emitted for that level. The snake_case code stores two-element arrays instead.
# NOTE(review): this class appears to be removed by the commit shown here -- confirm.
class IntBoolPair:
var key = -1
var value = false
func _init(_key, _value):
key = _key
value = _value