cleaned up lexer

2021-11-21 15:03:56 +01:00 · 2021-11-21 15:03:56 +01:00 · 9f60bbfd23
commit 9f60bbfd23
parent 83c4808d5a
1 changed files with 183 additions and 209 deletions
--- a/addons/Wol/core/compiler/lexer.gd
+++ b/addons/Wol/core/compiler/lexer.gd
@ -21,26 +21,26 @@ const DESTINATION = 'destination'

 var WHITESPACE = '\\s*'

-var _states = {}
-var _defaultState
-var _currentState
-
-var _indentStack = []
-var _shouldTrackIndent : bool = false
-
 var filename = ''
 var title = ''
 var text = ''

+var states = {}
+var default_state
+var current_state
+
+var indent_stack = []
+var should_track_indent = false
+
 func _init(_filename, _title, _text):
-	create_states()
+	createstates()

 	filename = _filename
 	title = _title
 	text = _text

-func create_states():
-	var patterns : Dictionary = {}
+func createstates():
+	var patterns = {}
 	patterns[Constants.TokenType.Text] = ['.*', 'any text']

 	patterns[Constants.TokenType.Number] = ['\\-?[0-9]+(\\.[0-9+])?', 'any number']
@ -86,249 +86,234 @@ func create_states():
 	patterns[Constants.TokenType.Set] = ['set(?!\\w)', '"set"']
 	patterns[Constants.TokenType.ShortcutOption] = ['\\-\\>\\s*', '"->"']

-	#compound states
-	var shortcut_option : String = SHORTCUT + DASH + OPTION
-	var shortcut_option_tag : String = shortcut_option + DASH + TAG
-	var command_or_expression : String = COMMAND + DASH + OR + DASH + EXPRESSION
-	var link_destination : String = LINK + DASH + DESTINATION
+	var shortcut_option = SHORTCUT + DASH + OPTION
+	var shortcut_option_tag = shortcut_option + DASH + TAG
+	var command_or_expression = COMMAND + DASH + OR + DASH + EXPRESSION
+	var link_destination = LINK + DASH + DESTINATION

-	_states = {}
+	states = {}

-	_states[BASE] = LexerState.new(patterns)
-	_states[BASE].add_transition(Constants.TokenType.BeginCommand,COMMAND,true)
-	_states[BASE].add_transition(Constants.TokenType.OptionStart,LINK,true)
-	_states[BASE].add_transition(Constants.TokenType.ShortcutOption, shortcut_option)
-	_states[BASE].add_transition(Constants.TokenType.TagMarker,TAG,true)
-	_states[BASE].add_text_rule(Constants.TokenType.Text)
+	states[BASE] = LexerState.new(patterns)
+	states[BASE].add_transition(Constants.TokenType.BeginCommand, COMMAND, true)
+	states[BASE].add_transition(Constants.TokenType.OptionStart, LINK, true)
+	states[BASE].add_transition(Constants.TokenType.ShortcutOption, shortcut_option)
+	states[BASE].add_transition(Constants.TokenType.TagMarker, TAG, true)
+	states[BASE].add_text_rule(Constants.TokenType.Text)

-	_states[TAG] = LexerState.new(patterns)
-	_states[TAG].add_transition(Constants.TokenType.Identifier,BASE)
+	states[TAG] = LexerState.new(patterns)
+	states[TAG].add_transition(Constants.TokenType.Identifier, BASE)

-	_states[shortcut_option] = LexerState.new(patterns)
-	_states[shortcut_option].track_indent = true
-	_states[shortcut_option].add_transition(Constants.TokenType.BeginCommand,EXPRESSION,true)
-	_states[shortcut_option].add_transition(Constants.TokenType.TagMarker,shortcut_option_tag,true)
-	_states[shortcut_option].add_text_rule(Constants.TokenType.Text,BASE)
+	states[shortcut_option] = LexerState.new(patterns)
+	states[shortcut_option].track_indent = true
+	states[shortcut_option].add_transition(Constants.TokenType.BeginCommand, EXPRESSION, true)
+	states[shortcut_option].add_transition(Constants.TokenType.TagMarker, shortcut_option_tag, true)
+	states[shortcut_option].add_text_rule(Constants.TokenType.Text, BASE)
 	
-	_states[shortcut_option_tag] = LexerState.new(patterns)
-	_states[shortcut_option_tag].add_transition(Constants.TokenType.Identifier,shortcut_option)
+	states[shortcut_option_tag] = LexerState.new(patterns)
+	states[shortcut_option_tag].add_transition(Constants.TokenType.Identifier, shortcut_option)

-	_states[COMMAND] = LexerState.new(patterns)
-	_states[COMMAND].add_transition(Constants.TokenType.IfToken,EXPRESSION)
-	_states[COMMAND].add_transition(Constants.TokenType.ElseToken)
-	_states[COMMAND].add_transition(Constants.TokenType.ElseIf,EXPRESSION)
-	_states[COMMAND].add_transition(Constants.TokenType.EndIf)
-	_states[COMMAND].add_transition(Constants.TokenType.Set, ASSIGNMENT)
-	_states[COMMAND].add_transition(Constants.TokenType.EndCommand,BASE,true)
-	_states[COMMAND].add_transition(Constants.TokenType.Identifier,command_or_expression)
-	_states[COMMAND].add_text_rule(Constants.TokenType.Text)
+	states[COMMAND] = LexerState.new(patterns)
+	states[COMMAND].add_transition(Constants.TokenType.IfToken, EXPRESSION)
+	states[COMMAND].add_transition(Constants.TokenType.ElseToken)
+	states[COMMAND].add_transition(Constants.TokenType.ElseIf, EXPRESSION)
+	states[COMMAND].add_transition(Constants.TokenType.EndIf)
+	states[COMMAND].add_transition(Constants.TokenType.Set, ASSIGNMENT)
+	states[COMMAND].add_transition(Constants.TokenType.EndCommand, BASE, true)
+	states[COMMAND].add_transition(Constants.TokenType.Identifier, command_or_expression)
+	states[COMMAND].add_text_rule(Constants.TokenType.Text)

-	_states[command_or_expression] = LexerState.new(patterns)
-	_states[command_or_expression].add_transition(Constants.TokenType.LeftParen,EXPRESSION)
-	_states[command_or_expression].add_transition(Constants.TokenType.EndCommand,BASE,true)
-	_states[command_or_expression].add_text_rule(Constants.TokenType.Text)
+	states[command_or_expression] = LexerState.new(patterns)
+	states[command_or_expression].add_transition(Constants.TokenType.LeftParen, EXPRESSION)
+	states[command_or_expression].add_transition(Constants.TokenType.EndCommand, BASE, true)
+	states[command_or_expression].add_text_rule(Constants.TokenType.Text)

-	_states[ASSIGNMENT] = LexerState.new(patterns)
-	_states[ASSIGNMENT].add_transition(Constants.TokenType.Variable)
-	_states[ASSIGNMENT].add_transition(Constants.TokenType.EqualToOrAssign, EXPRESSION)
-	_states[ASSIGNMENT].add_transition(Constants.TokenType.AddAssign, EXPRESSION)
-	_states[ASSIGNMENT].add_transition(Constants.TokenType.MinusAssign, EXPRESSION)
-	_states[ASSIGNMENT].add_transition(Constants.TokenType.MultiplyAssign, EXPRESSION)
-	_states[ASSIGNMENT].add_transition(Constants.TokenType.DivideAssign, EXPRESSION)
+	states[ASSIGNMENT] = LexerState.new(patterns)
+	states[ASSIGNMENT].add_transition(Constants.TokenType.Variable)
+	states[ASSIGNMENT].add_transition(Constants.TokenType.EqualToOrAssign, EXPRESSION)
+	states[ASSIGNMENT].add_transition(Constants.TokenType.AddAssign, EXPRESSION)
+	states[ASSIGNMENT].add_transition(Constants.TokenType.MinusAssign, EXPRESSION)
+	states[ASSIGNMENT].add_transition(Constants.TokenType.MultiplyAssign, EXPRESSION)
+	states[ASSIGNMENT].add_transition(Constants.TokenType.DivideAssign, EXPRESSION)

-	_states[EXPRESSION] = LexerState.new(patterns)
-	_states[EXPRESSION].add_transition(Constants.TokenType.EndCommand, BASE)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Number)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Str)
-	_states[EXPRESSION].add_transition(Constants.TokenType.LeftParen)
-	_states[EXPRESSION].add_transition(Constants.TokenType.RightParen)
-	_states[EXPRESSION].add_transition(Constants.TokenType.EqualTo)
-	_states[EXPRESSION].add_transition(Constants.TokenType.EqualToOrAssign)
-	_states[EXPRESSION].add_transition(Constants.TokenType.NotEqualTo)
-	_states[EXPRESSION].add_transition(Constants.TokenType.GreaterThanOrEqualTo)
-	_states[EXPRESSION].add_transition(Constants.TokenType.GreaterThan)
-	_states[EXPRESSION].add_transition(Constants.TokenType.LessThanOrEqualTo)
-	_states[EXPRESSION].add_transition(Constants.TokenType.LessThan)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Add)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Minus)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Multiply)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Divide)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Modulo)
-	_states[EXPRESSION].add_transition(Constants.TokenType.And)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Or)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Xor)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Not)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Variable)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Comma)
-	_states[EXPRESSION].add_transition(Constants.TokenType.TrueToken)
-	_states[EXPRESSION].add_transition(Constants.TokenType.FalseToken)
-	_states[EXPRESSION].add_transition(Constants.TokenType.NullToken)
-	_states[EXPRESSION].add_transition(Constants.TokenType.Identifier)
+	states[EXPRESSION] = LexerState.new(patterns)
+	states[EXPRESSION].add_transition(Constants.TokenType.EndCommand, BASE)
+	states[EXPRESSION].add_transition(Constants.TokenType.Number)
+	states[EXPRESSION].add_transition(Constants.TokenType.Str)
+	states[EXPRESSION].add_transition(Constants.TokenType.LeftParen)
+	states[EXPRESSION].add_transition(Constants.TokenType.RightParen)
+	states[EXPRESSION].add_transition(Constants.TokenType.EqualTo)
+	states[EXPRESSION].add_transition(Constants.TokenType.EqualToOrAssign)
+	states[EXPRESSION].add_transition(Constants.TokenType.NotEqualTo)
+	states[EXPRESSION].add_transition(Constants.TokenType.GreaterThanOrEqualTo)
+	states[EXPRESSION].add_transition(Constants.TokenType.GreaterThan)
+	states[EXPRESSION].add_transition(Constants.TokenType.LessThanOrEqualTo)
+	states[EXPRESSION].add_transition(Constants.TokenType.LessThan)
+	states[EXPRESSION].add_transition(Constants.TokenType.Add)
+	states[EXPRESSION].add_transition(Constants.TokenType.Minus)
+	states[EXPRESSION].add_transition(Constants.TokenType.Multiply)
+	states[EXPRESSION].add_transition(Constants.TokenType.Divide)
+	states[EXPRESSION].add_transition(Constants.TokenType.Modulo)
+	states[EXPRESSION].add_transition(Constants.TokenType.And)
+	states[EXPRESSION].add_transition(Constants.TokenType.Or)
+	states[EXPRESSION].add_transition(Constants.TokenType.Xor)
+	states[EXPRESSION].add_transition(Constants.TokenType.Not)
+	states[EXPRESSION].add_transition(Constants.TokenType.Variable)
+	states[EXPRESSION].add_transition(Constants.TokenType.Comma)
+	states[EXPRESSION].add_transition(Constants.TokenType.TrueToken)
+	states[EXPRESSION].add_transition(Constants.TokenType.FalseToken)
+	states[EXPRESSION].add_transition(Constants.TokenType.NullToken)
+	states[EXPRESSION].add_transition(Constants.TokenType.Identifier)

-	_states[LINK] = LexerState.new(patterns)
-	_states[LINK].add_transition(Constants.TokenType.OptionEnd, BASE, true)
-	_states[LINK].add_transition(Constants.TokenType.OptionDelimit, link_destination, true)
-	_states[LINK].add_text_rule(Constants.TokenType.Text)
+	states[LINK] = LexerState.new(patterns)
+	states[LINK].add_transition(Constants.TokenType.OptionEnd, BASE, true)
+	states[LINK].add_transition(Constants.TokenType.OptionDelimit, link_destination, true)
+	states[LINK].add_text_rule(Constants.TokenType.Text)

-	_states[link_destination] = LexerState.new(patterns)
-	_states[link_destination].add_transition(Constants.TokenType.Identifier)
-	_states[link_destination].add_transition(Constants.TokenType.OptionEnd, BASE)
+	states[link_destination] = LexerState.new(patterns)
+	states[link_destination].add_transition(Constants.TokenType.Identifier)
+	states[link_destination].add_transition(Constants.TokenType.OptionEnd, BASE)

-	_defaultState = _states[BASE]
+	default_state = states[BASE]

-	for stateKey in _states.keys():
-		_states[stateKey].stateName = stateKey
+	for key in states.keys():
+		states[key].name = key

 func tokenize():
-	_indentStack.clear()
-	_indentStack.push_front(IntBoolPair.new(0, false))
-	_shouldTrackIndent = false
-
+	# Lexes `text` one line at a time and returns every token produced,
+	# followed by a single EndOfInput token.
 	var tokens = []

-	_currentState = _defaultState
+	# Reset the lexer so that every call starts from the default (base) state
+	# with a single zero-width, non-indenting entry on the indent stack.
+	indent_stack.clear()
+	indent_stack.push_front([0, false])
+	should_track_indent = false
+	current_state = default_state

 	var lines = text.split(LINE_SEPARATOR)
-	lines.append('')
+	var line_number = 1

-	var line_number : int = 1
+	# An empty trailing line is always lexed last.
+	# NOTE(review): presumably so pending dedents are emitted - confirm.
+	lines.append('')

 	for line in lines:
 		tokens += tokenize_line(line, line_number)
 		line_number += 1

-	var endOfInput = Token.new(
+	# Token._init expects the filename before the line number; leaving it out
+	# shifts every following argument one slot to the left.
+	var end_of_input = Token.new(
 		Constants.TokenType.EndOfInput,
-		_currentState,
+		current_state,
+		filename,
 		line_number,
 		0
 	)
-	tokens.append(endOfInput)
+	tokens.append(end_of_input)

 	return tokens

 func tokenize_line(line, line_number):
-	var tokenStack : Array = []
+	var token_stack = []

-	var freshLine = line.replace('\t','    ').replace('\r','')
+	var fresh_line = line.replace('\t','    ').replace('\r','')

-	#record indentation
 	var indentation = line_indentation(line)
-	var prevIndentation = _indentStack.front()
+	var previous_indentation = indent_stack.front()[0]

-	if _shouldTrackIndent && indentation > prevIndentation.key:
-		#we add an indenation token to record indent level
-		_indentStack.push_front(IntBoolPair.new(indentation,true))
+	if should_track_indent && indentation > previous_indentation:
+		indent_stack.push_front([indentation, true])

-		var indent : Token = Token.new(
+		var indent = Token.new(
 			Constants.TokenType.Indent,
-			_currentState,
+			current_state,
 			filename,
 			line_number,
-			prevIndentation.key
+			previous_indentation
 		)
-		indent.value = '%*s' % [indentation - prevIndentation.key,'']
+		indent.value = '%*s' % [indentation - previous_indentation, '']

-		_shouldTrackIndent = false
-		tokenStack.push_front(indent)
+		should_track_indent = false
+		token_stack.push_front(indent)

-	elif indentation < prevIndentation.key:
-		#de-indent and then emit indentaiton token
-
-		while indentation < _indentStack.front().key:
-			var top : IntBoolPair = _indentStack.pop_front()
-			if top.value:
-				var deIndent : Token = Token.new(Constants.TokenType.Dedent,_currentState,line_number,0)
-				tokenStack.push_front(deIndent)
+	elif indentation < previous_indentation:
+		while indentation < indent_stack.front()[0]:
+			var top = indent_stack.pop_front()[1]
+			if top:
+				var deindent = Token.new(Constants.TokenType.Dedent, current_state, line_number, 0)
+				token_stack.push_front(deindent)
 	
-	
-	var column : int = indentation
+	var column = indentation
+	var whitespace = RegEx.new()
+	whitespace.compile(WHITESPACE)

-	var whitespace : RegEx = RegEx.new()
-	var error = whitespace.compile(WHITESPACE)
-	if error != OK:
-		printerr('unable to compile regex WHITESPACE')
-		return []
-	
-	while column < freshLine.length():
-
-		if freshLine.substr(column).begins_with(LINE_COMENT):
+	while column < fresh_line.length():
+		if fresh_line.substr(column).begins_with(LINE_COMENT):
 			break
 		
-		var matched : bool = false
+		var matched = false

-		for rule in _currentState.rules:
-			var found = rule.regex.search(freshLine, column)
+		for rule in current_state.rules:
+			var found = rule.regex.search(fresh_line, column)
 			
 			if !found:
 				continue

-			var tokenText : String
+			var token_text = ''

+			# NOTE: If this is text then we back up to the most recent delimiting token
+			#		and treat everything from there as text.
 			if rule.token_type == Constants.TokenType.Text:
-				#if this is text then we back up to the most recent
-				#delimiting token and treat everything from there as text.
 				
-				var startIndex : int = indentation
+				var start_index = indentation

-				if tokenStack.size() > 0 :
-					while tokenStack.front().type == Constants.TokenType.Identifier:
-						tokenStack.pop_front()
+				if token_stack.size() > 0 :
+					while token_stack.front().type == Constants.TokenType.Identifier:
+						token_stack.pop_front()
 					
-					var startDelimitToken : Token = tokenStack.front()
-					startIndex =  startDelimitToken.column
+					var start_delimit_token = token_stack.front()
+					start_index =  start_delimit_token.column

-					if startDelimitToken.type == Constants.TokenType.Indent:
-						startIndex += startDelimitToken.value.length()
-					if startDelimitToken.type == Constants.TokenType.Dedent:
-						startIndex = indentation
-				#
+					if start_delimit_token.type == Constants.TokenType.Indent:
+						start_index += start_delimit_token.value.length()
+					if start_delimit_token.type == Constants.TokenType.Dedent:
+						start_index = indentation
 				
-				column = startIndex
+				column = start_index
 				var end_index = found.get_start() + found.get_string().length()

-				tokenText = freshLine.substr(startIndex, end_index - startIndex)
-			
+				token_text = fresh_line.substr(start_index, end_index - start_index)
 			else:
-				tokenText = found.get_string()
+				token_text = found.get_string()

-			column += tokenText.length()
+			column += token_text.length()

-			#pre-proccess string
 			if rule.token_type == Constants.TokenType.Str:
-				tokenText = tokenText.substr(1, tokenText.length() - 2)
-				tokenText = tokenText.replace('\\\\', '\\')
-				tokenText = tokenText.replace('\\\'','\'')
+				token_text = token_text.substr(1, token_text.length() - 2)
+				token_text = token_text.replace('\\\\', '\\')
+				token_text = token_text.replace('\\\'','\'')

 			var token = Token.new(
 				rule.token_type,
-				_currentState,
+				current_state,
 				filename,
 				line_number,
 				column,
-				tokenText
+				token_text
 			)
 			token.delimits_text = rule.delimits_text

-			tokenStack.push_front(token)
+			token_stack.push_front(token)

 			if rule.enter_state != null and rule.enter_state.length() > 0:
-				if not _states.has(rule.enter_state):
+				if not states.has(rule.enter_state):
 					printerr('State[%s] not known - line(%s) col(%s)' % [rule.enter_state, line_number, column])
 					return []
 				
-				enter_state(_states[rule.enter_state])
+				enter_state(states[rule.enter_state])

-				if _shouldTrackIndent:
-					if _indentStack.front().key < indentation:
-						_indentStack.append(IntBoolPair.new(indentation, false))
+				if should_track_indent:
+					if indent_stack.front()[0] < indentation:
+						indent_stack.append([indentation, false])
 			
 			matched = true
 			break

 		if not matched:
 			var rules = []
-			for rule in _currentState.rules:
+			for rule in current_state.rules:
 				rules.append('"%s" (%s)' % [Constants.token_type_name(rule.token_type), rule.human_readable_identifier])

 			var error_data = [
@ -340,30 +325,30 @@ func tokenize_line(line, line_number):
 			]
 			assert(false, 'Expected %s in file %s in node "%s" on line #%d (column #%d)' % error_data)

-		var lastWhiteSpace = whitespace.search(line, column)
-		if lastWhiteSpace:
-			column += lastWhiteSpace.get_string().length()
+		var last_whitespace = whitespace.search(line, column)
+		if last_whitespace:
+			column += last_whitespace.get_string().length()
 		
 	
-	tokenStack.invert()
+	token_stack.invert()

-	return tokenStack
+	return token_stack

-func line_indentation(line:String)->int:
-	var indentRegex : RegEx = RegEx.new()
-	indentRegex.compile('^(\\s*)')
+func line_indentation(line):
+	var indent_regex = RegEx.new()
+	indent_regex.compile('^(\\s*)')

-	var found : RegExMatch = indentRegex.search(line)
+	var found = indent_regex.search(line)
 	
-	if !found || found.get_string().length() <= 0:
+	if !found or found.get_string().length() <= 0:
 		return 0

 	return found.get_string().length()

-func enter_state(state:LexerState):
-	_currentState = state;
-	if _currentState.track_indent:
-		_shouldTrackIndent = true
+# Makes `state` the active lexer state. Indent tracking is switched on when
+# the new state requests it; this function never switches it off.
+func enter_state(state):
+	current_state = state
+	if current_state.track_indent:
+		should_track_indent = true

 class Token:
 	var type = -1
@ -375,38 +360,36 @@ class Token:
 	var text = ''

 	var delimits_text = false
-	var paramCount = -1
-	var lexerState = ''
+	var parameter_count = -1
+	var lexer_state = ''

 	func _init(_type, _state, _filename, _line_number = -1, _column = -1, _value = ''):
 		type = _type
-		lexerState = _state.stateName
+		lexer_state = _state.name
 		filename = _filename
 		line_number = _line_number
 		column = _column
 		value = _value

 	func _to_string():
-		return '%s (%s) at %s:%s (state: %s)' % [Constants.token_type_name(type),value,line_number,column,lexerState]
+		return '%s (%s) at %s:%s (state: %s)' % [Constants.token_type_name(type),value, line_number, column, lexer_state]
 	
 class LexerState:
-
-	var stateName : String
-	var patterns : Dictionary
-	var rules : Array = []
-	var track_indent : bool = false
+	var name = ''
+	var patterns = {}
+	var rules = []
+	var track_indent = false

 	func _init(_patterns):
 		patterns = _patterns

-	func add_transition(type : int, state : String = '',delimitText : bool = false)->Rule:
+	func add_transition(type, state = '', delimit_text = false):
 		var pattern = '\\G%s' % patterns[type][0]
-		# print('pattern = %s' % pattern)
-		var rule = Rule.new(type, pattern, patterns[type][1], state, delimitText)
+		var rule = Rule.new(type, pattern, patterns[type][1], state, delimit_text)
 		rules.append(rule)
 		return rule
 	
-	func add_text_rule(type : int, state : String = '')->Rule:
+	func add_text_rule(type, state = ''):
 		if contains_text_rule() :
 			printerr('State already contains Text rule')
 			return null
@ -417,25 +400,25 @@ class LexerState:
 				delimiters.append('%s' % rule.regex.get_pattern().substr(2))

 		var pattern = '\\G((?!%s).)*' % [PoolStringArray(delimiters).join('|')]
-		var rule : Rule = add_transition(type,state)
+		var rule = add_transition(type, state)
 		rule.regex = RegEx.new()
 		rule.regex.compile(pattern)
 		rule.is_text_rule = true
 		return rule

-	func contains_text_rule()->bool:
+	func contains_text_rule():
 		for rule in rules:
 			if rule.is_text_rule:
 				return true
 		return false
 	
 class Rule:
-	var regex : RegEx
+	var regex

-	var enter_state : String
-	var token_type : int
-	var is_text_rule : bool
-	var delimits_text : bool
+	var enter_state = ''
+	var token_type = -1
+	var is_text_rule = false
+	var delimits_text = false
 	var human_readable_identifier = ''

 	func _init(_type, _regex, _human_readable_identifier, _enter_state, _delimits_text):
@ -450,12 +433,3 @@ class Rule:

 	func _to_string():
 		return '[Rule : %s (%s) - %s]' % [Constants.token_type_name(token_type), human_readable_identifier, regex]
-
-class IntBoolPair:
-	var key = -1
-	var value = false
-
-	func _init(_key, _value):
-		key = _key
-		value = _value
-