1 files changed, 260 insertions, 155 deletions
diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp
index a45a73a8d5..98a3a1268f 100644
--- a/modules/gdscript/gdscript_tokenizer.cpp
+++ b/modules/gdscript/gdscript_tokenizer.cpp
@@ -33,13 +33,14 @@
 #include "core/error/error_macros.h"
 #include "core/string/char_utils.h"
 
-#ifdef TOOLS_ENABLED
-#include "editor/editor_settings.h"
-#endif
 #ifdef DEBUG_ENABLED
 #include "servers/text_server.h"
 #endif
 
+#ifdef TOOLS_ENABLED
+#include "editor/editor_settings.h"
+#endif
+
 static const char *token_names[] = {
 	"Empty", // EMPTY,
 	// Basic
@@ -98,6 +99,7 @@ static const char *token_names[] = {
 	"pass", // PASS,
 	"return", // RETURN,
 	"match", // MATCH,
+	"when", // WHEN,
 	// Keywords
 	"as", // AS,
 	"assert", // ASSERT,
@@ -162,12 +164,31 @@ const char *GDScriptTokenizer::Token::get_name() const {
 	return token_names[type];
 }
 
+bool GDScriptTokenizer::Token::can_precede_bin_op() const { // Whether a `+`/`-` right after this token is a binary operator rather than a number's sign (checked by scan()).
+	switch (type) {
+		case IDENTIFIER:
+		case LITERAL:
+		case SELF:
+		case BRACKET_CLOSE: // Closing delimiters.
+		case BRACE_CLOSE:
+		case PARENTHESIS_CLOSE:
+		case CONST_PI: // Built-in constant tokens.
+		case CONST_TAU:
+		case CONST_INF:
+		case CONST_NAN:
+			return true;
+		default: // Any other token type: scan() lexes a following `+`/`-` plus digit as one signed number.
+			return false;
+	}
+}
+
 bool GDScriptTokenizer::Token::is_identifier() const {
 	// Note: Most keywords should not be recognized as identifiers.
 	// These are only exceptions for stuff that already is on the engine's API.
 	switch (type) {
 		case IDENTIFIER:
 		case MATCH: // Used in String.match().
+		case WHEN: // New keyword, avoid breaking existing code.
 		// Allow constants to be treated as regular identifiers.
 		case CONST_PI:
 		case CONST_INF:
@@ -222,6 +243,7 @@ bool GDScriptTokenizer::Token::is_node_name() const {
 		case VAR:
 		case VOID:
 		case WHILE:
+		case WHEN:
 		case YIELD:
 			return true;
 		default:
@@ -382,6 +404,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) {
 		}
 	}
 
+	last_token = token;
 	return token;
 }
 
@@ -511,6 +534,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
 	KEYWORD("void", Token::VOID)             \
 	KEYWORD_GROUP('w')                       \
 	KEYWORD("while", Token::WHILE)           \
+	KEYWORD("when", Token::WHEN)             \
 	KEYWORD_GROUP('y')                       \
 	KEYWORD("yield", Token::YIELD)           \
 	KEYWORD_GROUP('I')                       \
@@ -559,6 +583,24 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
 		return make_identifier(name);
 	}
 
+	if (!only_ascii) {
+		// Kept here in case the order with push_error matters.
+		Token id = make_identifier(name);
+
+#ifdef DEBUG_ENABLED
+		// Additional checks for identifiers but only in debug and if it's available in TextServer.
+		if (TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) {
+			int64_t confusable = TS->is_confusable(name, keyword_list);
+			if (confusable >= 0) {
+				push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable]));
+			}
+		}
+#endif // DEBUG_ENABLED
+
+		// Cannot be a keyword, as keywords are ASCII only.
+		return id;
+	}
+
 	// Define some helper macros for the switch case.
 #define KEYWORD_GROUP_CASE(char) \
 	break;                       \
@@ -594,19 +636,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
 	}
 
 	// Not a keyword, so must be an identifier.
-	Token id = make_identifier(name);
-
-#ifdef DEBUG_ENABLED
-	// Additional checks for identifiers but only in debug and if it's available in TextServer.
-	if (!only_ascii && TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) {
-		int64_t confusable = TS->is_confusable(name, keyword_list);
-		if (confusable >= 0) {
-			push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable]));
-		}
-	}
-#endif // DEBUG_ENABLED
-
-	return id;
+	return make_identifier(name);
 
 #undef KEYWORD_GROUP_CASE
 #undef KEYWORD
@@ -627,6 +657,7 @@ void GDScriptTokenizer::newline(bool p_make_token) {
 		newline.leftmost_column = newline.start_column;
 		newline.rightmost_column = newline.end_column;
 		pending_newline = true;
+		last_token = newline;
 		last_newline = newline;
 	}
 
@@ -643,6 +674,11 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 	bool has_error = false;
 	bool (*digit_check_func)(char32_t) = is_digit;
 
+	// Sign before hexadecimal or binary.
+	if ((_peek(-1) == '+' || _peek(-1) == '-') && _peek() == '0') {
+		_advance();
+	}
+
 	if (_peek(-1) == '.') {
 		has_decimal = true;
 	} else if (_peek(-1) == '0') {
@@ -659,12 +695,20 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 		}
 	}
 
-	// Allow '_' to be used in a number, for readability.
-	bool previous_was_underscore = false;
+	if (base != 10 && is_underscore(_peek())) { // Disallow `0x_` and `0b_`.
+		Token error = make_error(vformat(R"(Unexpected underscore after "0%c".)", _peek(-1)));
+		error.start_column = column;
+		error.leftmost_column = column;
+		error.end_column = column + 1;
+		error.rightmost_column = column + 1;
+		push_error(error);
+		has_error = true;
+	}
+	bool previous_was_underscore = false; // Allow `_` to be used in a number, for readability.
 	while (digit_check_func(_peek()) || is_underscore(_peek())) {
 		if (is_underscore(_peek())) {
 			if (previous_was_underscore) {
-				Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
+				Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
 				error.start_column = column;
 				error.leftmost_column = column;
 				error.end_column = column + 1;
@@ -711,7 +755,30 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 			_advance();
 
 			// Consume decimal digits.
+			if (is_underscore(_peek())) { // Disallow `10._`, but allow `10.`.
+				Token error = make_error(R"(Unexpected underscore after decimal point.)");
+				error.start_column = column;
+				error.leftmost_column = column;
+				error.end_column = column + 1;
+				error.rightmost_column = column + 1;
+				push_error(error);
+				has_error = true;
+			}
+			previous_was_underscore = false;
 			while (is_digit(_peek()) || is_underscore(_peek())) {
+				if (is_underscore(_peek())) {
+					if (previous_was_underscore) {
+						Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
+						error.start_column = column;
+						error.leftmost_column = column;
+						error.end_column = column + 1;
+						error.rightmost_column = column + 1;
+						push_error(error);
+					}
+					previous_was_underscore = true;
+				} else {
+					previous_was_underscore = false;
+				}
 				_advance();
 			}
 		}
@@ -737,7 +804,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 			while (is_digit(_peek()) || is_underscore(_peek())) {
 				if (is_underscore(_peek())) {
 					if (previous_was_underscore) {
-						Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
+						Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
 						error.start_column = column;
 						error.leftmost_column = column;
 						error.end_column = column + 1;
@@ -794,10 +861,14 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 		STRING_NODEPATH,
 	};
 
+	bool is_raw = false;
 	bool is_multiline = false;
 	StringType type = STRING_REGULAR;
 
-	if (_peek(-1) == '&') {
+	if (_peek(-1) == 'r') {
+		is_raw = true;
+		_advance();
+	} else if (_peek(-1) == '&') {
 		type = STRING_NAME;
 		_advance();
 	} else if (_peek(-1) == '^') {
@@ -827,7 +898,12 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 		char32_t ch = _peek();
 
 		if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) {
-			Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
+			Token error;
+			if (is_raw) {
+				error = make_error("Invisible text direction control character present in the string, use regular string literal instead of r-string.");
+			} else {
+				error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
+			}
 			error.start_column = column;
 			error.leftmost_column = error.start_column;
 			error.end_column = column + 1;
@@ -842,144 +918,164 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 				return make_error("Unterminated string.");
 			}
 
-			// Grab escape character.
-			char32_t code = _peek();
-			_advance();
-			if (_is_at_end()) {
-				return make_error("Unterminated string.");
-			}
+			if (is_raw) {
+				if (_peek() == quote_char) {
+					_advance();
+					if (_is_at_end()) {
+						return make_error("Unterminated string.");
+					}
+					result += '\\';
+					result += quote_char;
+				} else if (_peek() == '\\') { // For `\\\"`.
+					_advance();
+					if (_is_at_end()) {
+						return make_error("Unterminated string.");
+					}
+					result += '\\';
+					result += '\\';
+				} else {
+					result += '\\';
+				}
+			} else {
+				// Grab escape character.
+				char32_t code = _peek();
+				_advance();
+				if (_is_at_end()) {
+					return make_error("Unterminated string.");
+				}
 
-			char32_t escaped = 0;
-			bool valid_escape = true;
+				char32_t escaped = 0;
+				bool valid_escape = true;
 
-			switch (code) {
-				case 'a':
-					escaped = '\a';
-					break;
-				case 'b':
-					escaped = '\b';
-					break;
-				case 'f':
-					escaped = '\f';
-					break;
-				case 'n':
-					escaped = '\n';
-					break;
-				case 'r':
-					escaped = '\r';
-					break;
-				case 't':
-					escaped = '\t';
-					break;
-				case 'v':
-					escaped = '\v';
-					break;
-				case '\'':
-					escaped = '\'';
-					break;
-				case '\"':
-					escaped = '\"';
-					break;
-				case '\\':
-					escaped = '\\';
-					break;
-				case 'U':
-				case 'u': {
-					// Hexadecimal sequence.
-					int hex_len = (code == 'U') ? 6 : 4;
-					for (int j = 0; j < hex_len; j++) {
-						if (_is_at_end()) {
-							return make_error("Unterminated string.");
+				switch (code) {
+					case 'a':
+						escaped = '\a';
+						break;
+					case 'b':
+						escaped = '\b';
+						break;
+					case 'f':
+						escaped = '\f';
+						break;
+					case 'n':
+						escaped = '\n';
+						break;
+					case 'r':
+						escaped = '\r';
+						break;
+					case 't':
+						escaped = '\t';
+						break;
+					case 'v':
+						escaped = '\v';
+						break;
+					case '\'':
+						escaped = '\'';
+						break;
+					case '\"':
+						escaped = '\"';
+						break;
+					case '\\':
+						escaped = '\\';
+						break;
+					case 'U':
+					case 'u': {
+						// Hexadecimal sequence.
+						int hex_len = (code == 'U') ? 6 : 4;
+						for (int j = 0; j < hex_len; j++) {
+							if (_is_at_end()) {
+								return make_error("Unterminated string.");
+							}
+
+							char32_t digit = _peek();
+							char32_t value = 0;
+							if (is_digit(digit)) {
+								value = digit - '0';
+							} else if (digit >= 'a' && digit <= 'f') {
+								value = digit - 'a';
+								value += 10;
+							} else if (digit >= 'A' && digit <= 'F') {
+								value = digit - 'A';
+								value += 10;
+							} else {
+								// Make error, but keep parsing the string.
+								Token error = make_error("Invalid hexadecimal digit in unicode escape sequence.");
+								error.start_column = column;
+								error.leftmost_column = error.start_column;
+								error.end_column = column + 1;
+								error.rightmost_column = error.end_column;
+								push_error(error);
+								valid_escape = false;
+								break;
+							}
+
+							escaped <<= 4;
+							escaped |= value;
+
+							_advance();
 						}
-
-						char32_t digit = _peek();
-						char32_t value = 0;
-						if (is_digit(digit)) {
-							value = digit - '0';
-						} else if (digit >= 'a' && digit <= 'f') {
-							value = digit - 'a';
-							value += 10;
-						} else if (digit >= 'A' && digit <= 'F') {
-							value = digit - 'A';
-							value += 10;
-						} else {
-							// Make error, but keep parsing the string.
-							Token error = make_error("Invalid hexadecimal digit in unicode escape sequence.");
-							error.start_column = column;
-							error.leftmost_column = error.start_column;
-							error.end_column = column + 1;
-							error.rightmost_column = error.end_column;
-							push_error(error);
-							valid_escape = false;
+					} break;
+					case '\r':
+						if (_peek() != '\n') {
+							// Carriage return without newline in string. (???)
+							// Just add it to the string and keep going.
+							result += ch;
+							_advance();
 							break;
 						}
-
-						escaped <<= 4;
-						escaped |= value;
-
-						_advance();
-					}
-				} break;
-				case '\r':
-					if (_peek() != '\n') {
-						// Carriage return without newline in string. (???)
-						// Just add it to the string and keep going.
-						result += ch;
-						_advance();
+						[[fallthrough]];
+					case '\n':
+						// Escaping newline.
+						newline(false);
+						valid_escape = false; // Don't add to the string.
 						break;
-					}
-					[[fallthrough]];
-				case '\n':
-					// Escaping newline.
-					newline(false);
-					valid_escape = false; // Don't add to the string.
-					break;
-				default:
-					Token error = make_error("Invalid escape in string.");
-					error.start_column = column - 2;
-					error.leftmost_column = error.start_column;
-					push_error(error);
-					valid_escape = false;
-					break;
-			}
-			// Parse UTF-16 pair.
-			if (valid_escape) {
-				if ((escaped & 0xfffffc00) == 0xd800) {
-					if (prev == 0) {
-						prev = escaped;
-						prev_pos = column - 2;
-						continue;
-					} else {
-						Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+					default:
+						Token error = make_error("Invalid escape in string.");
 						error.start_column = column - 2;
 						error.leftmost_column = error.start_column;
 						push_error(error);
 						valid_escape = false;
-						prev = 0;
+						break;
+				}
+				// Parse UTF-16 pair.
+				if (valid_escape) {
+					if ((escaped & 0xfffffc00) == 0xd800) {
+						if (prev == 0) {
+							prev = escaped;
+							prev_pos = column - 2;
+							continue;
+						} else {
+							Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate.");
+							error.start_column = column - 2;
+							error.leftmost_column = error.start_column;
+							push_error(error);
+							valid_escape = false;
+							prev = 0;
+						}
+					} else if ((escaped & 0xfffffc00) == 0xdc00) {
+						if (prev == 0) {
+							Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate.");
+							error.start_column = column - 2;
+							error.leftmost_column = error.start_column;
+							push_error(error);
+							valid_escape = false;
+						} else {
+							escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
+							prev = 0;
+						}
 					}
-				} else if ((escaped & 0xfffffc00) == 0xdc00) {
-					if (prev == 0) {
-						Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
-						error.start_column = column - 2;
+					if (prev != 0) {
+						Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate.");
+						error.start_column = prev_pos;
 						error.leftmost_column = error.start_column;
 						push_error(error);
-						valid_escape = false;
-					} else {
-						escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
 						prev = 0;
 					}
 				}
-				if (prev != 0) {
-					Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
-					error.start_column = prev_pos;
-					error.leftmost_column = error.start_column;
-					push_error(error);
-					prev = 0;
-				}
-			}
 
-			if (valid_escape) {
-				result += escaped;
+				if (valid_escape) {
+					result += escaped;
+				}
 			}
 		} else if (ch == quote_char) {
 			if (prev != 0) {
@@ -1099,15 +1195,6 @@ void GDScriptTokenizer::check_indent() {
 			_advance();
 		}
 
-		if (mixed && !(line_continuation || multiline_mode)) {
-			Token error = make_error("Mixed use of tabs and spaces for indentation.");
-			error.start_line = line;
-			error.start_column = 1;
-			error.leftmost_column = 1;
-			error.rightmost_column = column;
-			push_error(error);
-		}
-
 		if (_is_at_end()) {
 			// Reached the end with an empty line, so just dedent as much as needed.
 			pending_indents -= indent_level();
@@ -1151,9 +1238,18 @@ void GDScriptTokenizer::check_indent() {
 			continue;
 		}
 
+		if (mixed && !line_continuation && !multiline_mode) {
+			Token error = make_error("Mixed use of tabs and spaces for indentation.");
+			error.start_line = line;
+			error.start_column = 1;
+			error.leftmost_column = 1;
+			error.rightmost_column = column;
+			push_error(error);
+		}
+
 		if (line_continuation || multiline_mode) {
 			// We cleared up all the whitespace at the beginning of the line.
-			// But if this is a continuation or multiline mode and we don't want any indentation change.
+			// If this is a line continuation or we're in multiline mode then we don't want any indentation changes.
 			return;
 		}
 
@@ -1353,6 +1449,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
 
 	if (is_digit(c)) {
 		return number();
+	} else if (c == 'r' && (_peek() == '"' || _peek() == '\'')) {
+		// Raw string literals.
+		return string();
 	} else if (is_unicode_identifier_start(c)) {
 		return potential_identifier();
 	}
@@ -1431,6 +1530,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
 			if (_peek() == '=') {
 				_advance();
 				return make_token(Token::PLUS_EQUAL);
+			} else if (is_digit(_peek()) && !last_token.can_precede_bin_op()) {
+				// Number starting with '+'.
+				return number();
 			} else {
 				return make_token(Token::PLUS);
 			}
@@ -1438,6 +1540,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
 			if (_peek() == '=') {
 				_advance();
 				return make_token(Token::MINUS_EQUAL);
+			} else if (is_digit(_peek()) && !last_token.can_precede_bin_op()) {
+				// Number starting with '-'.
+				return number();
 			} else if (_peek() == '>') {
 				_advance();
 				return make_token(Token::FORWARD_ARROW);
@@ -1547,9 +1652,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
 
 		default:
 			if (is_whitespace(c)) {
-				return make_error(vformat(R"(Invalid white space character "\\u%X".)", static_cast<int32_t>(c)));
+				return make_error(vformat(R"(Invalid white space character U+%04X.)", static_cast<int32_t>(c)));
 			} else {
-				return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1)));
+				return make_error(vformat(R"(Invalid character "%c" (U+%04X).)", c, static_cast<int32_t>(c)));
 			}
 	}
 }