summary | refs | log | tree | commit | diff | stats
path: root/modules/gdscript/gdscript_tokenizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'modules/gdscript/gdscript_tokenizer.cpp')
-rw-r--r-- modules/gdscript/gdscript_tokenizer.cpp 415
1 files changed, 260 insertions, 155 deletions
diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp
index a45a73a8d5..98a3a1268f 100644
--- a/modules/gdscript/gdscript_tokenizer.cpp
+++ b/modules/gdscript/gdscript_tokenizer.cpp
@@ -33,13 +33,14 @@
#include "core/error/error_macros.h"
#include "core/string/char_utils.h"
-#ifdef TOOLS_ENABLED
-#include "editor/editor_settings.h"
-#endif
#ifdef DEBUG_ENABLED
#include "servers/text_server.h"
#endif
+#ifdef TOOLS_ENABLED
+#include "editor/editor_settings.h"
+#endif
+
static const char *token_names[] = {
"Empty", // EMPTY,
// Basic
@@ -98,6 +99,7 @@ static const char *token_names[] = {
"pass", // PASS,
"return", // RETURN,
"match", // MATCH,
+ "when", // WHEN,
// Keywords
"as", // AS,
"assert", // ASSERT,
@@ -162,12 +164,31 @@ const char *GDScriptTokenizer::Token::get_name() const {
return token_names[type];
}
+bool GDScriptTokenizer::Token::can_precede_bin_op() const { // Reports whether this token's type is one of the operand-ending types listed below.
+ switch (type) { // scan() consults this on the last token: when it returns false, a '+'/'-' followed by a digit is lexed as part of a number.
+ case IDENTIFIER:
+ case LITERAL:
+ case SELF:
+ case BRACKET_CLOSE:
+ case BRACE_CLOSE:
+ case PARENTHESIS_CLOSE:
+ case CONST_PI:
+ case CONST_TAU:
+ case CONST_INF:
+ case CONST_NAN:
+ return true; // Names, literals, constants and closing delimiters: a following '+'/'-' stays an operator token.
+ default:
+ return false; // Every other token type.
+ }
+}
+
bool GDScriptTokenizer::Token::is_identifier() const {
// Note: Most keywords should not be recognized as identifiers.
// These are only exceptions for stuff that already is on the engine's API.
switch (type) {
case IDENTIFIER:
case MATCH: // Used in String.match().
+ case WHEN: // New keyword, avoid breaking existing code.
// Allow constants to be treated as regular identifiers.
case CONST_PI:
case CONST_INF:
@@ -222,6 +243,7 @@ bool GDScriptTokenizer::Token::is_node_name() const {
case VAR:
case VOID:
case WHILE:
+ case WHEN:
case YIELD:
return true;
default:
@@ -382,6 +404,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) {
}
}
+ last_token = token;
return token;
}
@@ -511,6 +534,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
KEYWORD("void", Token::VOID) \
KEYWORD_GROUP('w') \
KEYWORD("while", Token::WHILE) \
+ KEYWORD("when", Token::WHEN) \
KEYWORD_GROUP('y') \
KEYWORD("yield", Token::YIELD) \
KEYWORD_GROUP('I') \
@@ -559,6 +583,24 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
return make_identifier(name);
}
+ if (!only_ascii) {
+ // Kept here in case the order with push_error matters.
+ Token id = make_identifier(name);
+
+#ifdef DEBUG_ENABLED
+ // Additional checks for identifiers but only in debug and if it's available in TextServer.
+ if (TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) {
+ int64_t confusable = TS->is_confusable(name, keyword_list);
+ if (confusable >= 0) {
+ push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable]));
+ }
+ }
+#endif // DEBUG_ENABLED
+
+ // Cannot be a keyword, as keywords are ASCII only.
+ return id;
+ }
+
// Define some helper macros for the switch case.
#define KEYWORD_GROUP_CASE(char) \
break; \
@@ -594,19 +636,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
}
// Not a keyword, so must be an identifier.
- Token id = make_identifier(name);
-
-#ifdef DEBUG_ENABLED
- // Additional checks for identifiers but only in debug and if it's available in TextServer.
- if (!only_ascii && TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) {
- int64_t confusable = TS->is_confusable(name, keyword_list);
- if (confusable >= 0) {
- push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable]));
- }
- }
-#endif // DEBUG_ENABLED
-
- return id;
+ return make_identifier(name);
#undef KEYWORD_GROUP_CASE
#undef KEYWORD
@@ -627,6 +657,7 @@ void GDScriptTokenizer::newline(bool p_make_token) {
newline.leftmost_column = newline.start_column;
newline.rightmost_column = newline.end_column;
pending_newline = true;
+ last_token = newline;
last_newline = newline;
}
@@ -643,6 +674,11 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
bool has_error = false;
bool (*digit_check_func)(char32_t) = is_digit;
+ // Sign before hexadecimal or binary.
+ if ((_peek(-1) == '+' || _peek(-1) == '-') && _peek() == '0') {
+ _advance();
+ }
+
if (_peek(-1) == '.') {
has_decimal = true;
} else if (_peek(-1) == '0') {
@@ -659,12 +695,20 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
}
}
- // Allow '_' to be used in a number, for readability.
- bool previous_was_underscore = false;
+ if (base != 10 && is_underscore(_peek())) { // Disallow `0x_` and `0b_`.
+ Token error = make_error(vformat(R"(Unexpected underscore after "0%c".)", _peek(-1)));
+ error.start_column = column;
+ error.leftmost_column = column;
+ error.end_column = column + 1;
+ error.rightmost_column = column + 1;
+ push_error(error);
+ has_error = true;
+ }
+ bool previous_was_underscore = false; // Allow `_` to be used in a number, for readability.
while (digit_check_func(_peek()) || is_underscore(_peek())) {
if (is_underscore(_peek())) {
if (previous_was_underscore) {
- Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
+ Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
error.start_column = column;
error.leftmost_column = column;
error.end_column = column + 1;
@@ -711,7 +755,30 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
_advance();
// Consume decimal digits.
+ if (is_underscore(_peek())) { // Disallow `10._`, but allow `10.`.
+ Token error = make_error(R"(Unexpected underscore after decimal point.)");
+ error.start_column = column;
+ error.leftmost_column = column;
+ error.end_column = column + 1;
+ error.rightmost_column = column + 1;
+ push_error(error);
+ has_error = true;
+ }
+ previous_was_underscore = false;
while (is_digit(_peek()) || is_underscore(_peek())) {
+ if (is_underscore(_peek())) {
+ if (previous_was_underscore) {
+ Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
+ error.start_column = column;
+ error.leftmost_column = column;
+ error.end_column = column + 1;
+ error.rightmost_column = column + 1;
+ push_error(error);
+ }
+ previous_was_underscore = true;
+ } else {
+ previous_was_underscore = false;
+ }
_advance();
}
}
@@ -737,7 +804,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
while (is_digit(_peek()) || is_underscore(_peek())) {
if (is_underscore(_peek())) {
if (previous_was_underscore) {
- Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
+ Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
error.start_column = column;
error.leftmost_column = column;
error.end_column = column + 1;
@@ -794,10 +861,14 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
STRING_NODEPATH,
};
+ bool is_raw = false;
bool is_multiline = false;
StringType type = STRING_REGULAR;
- if (_peek(-1) == '&') {
+ if (_peek(-1) == 'r') {
+ is_raw = true;
+ _advance();
+ } else if (_peek(-1) == '&') {
type = STRING_NAME;
_advance();
} else if (_peek(-1) == '^') {
@@ -827,7 +898,12 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
char32_t ch = _peek();
if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) {
- Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
+ Token error;
+ if (is_raw) {
+ error = make_error("Invisible text direction control character present in the string, use regular string literal instead of r-string.");
+ } else {
+ error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
+ }
error.start_column = column;
error.leftmost_column = error.start_column;
error.end_column = column + 1;
@@ -842,144 +918,164 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
return make_error("Unterminated string.");
}
- // Grab escape character.
- char32_t code = _peek();
- _advance();
- if (_is_at_end()) {
- return make_error("Unterminated string.");
- }
+ if (is_raw) {
+ if (_peek() == quote_char) {
+ _advance();
+ if (_is_at_end()) {
+ return make_error("Unterminated string.");
+ }
+ result += '\\';
+ result += quote_char;
+ } else if (_peek() == '\\') { // For `\\\"`.
+ _advance();
+ if (_is_at_end()) {
+ return make_error("Unterminated string.");
+ }
+ result += '\\';
+ result += '\\';
+ } else {
+ result += '\\';
+ }
+ } else {
+ // Grab escape character.
+ char32_t code = _peek();
+ _advance();
+ if (_is_at_end()) {
+ return make_error("Unterminated string.");
+ }
- char32_t escaped = 0;
- bool valid_escape = true;
+ char32_t escaped = 0;
+ bool valid_escape = true;
- switch (code) {
- case 'a':
- escaped = '\a';
- break;
- case 'b':
- escaped = '\b';
- break;
- case 'f':
- escaped = '\f';
- break;
- case 'n':
- escaped = '\n';
- break;
- case 'r':
- escaped = '\r';
- break;
- case 't':
- escaped = '\t';
- break;
- case 'v':
- escaped = '\v';
- break;
- case '\'':
- escaped = '\'';
- break;
- case '\"':
- escaped = '\"';
- break;
- case '\\':
- escaped = '\\';
- break;
- case 'U':
- case 'u': {
- // Hexadecimal sequence.
- int hex_len = (code == 'U') ? 6 : 4;
- for (int j = 0; j < hex_len; j++) {
- if (_is_at_end()) {
- return make_error("Unterminated string.");
+ switch (code) {
+ case 'a':
+ escaped = '\a';
+ break;
+ case 'b':
+ escaped = '\b';
+ break;
+ case 'f':
+ escaped = '\f';
+ break;
+ case 'n':
+ escaped = '\n';
+ break;
+ case 'r':
+ escaped = '\r';
+ break;
+ case 't':
+ escaped = '\t';
+ break;
+ case 'v':
+ escaped = '\v';
+ break;
+ case '\'':
+ escaped = '\'';
+ break;
+ case '\"':
+ escaped = '\"';
+ break;
+ case '\\':
+ escaped = '\\';
+ break;
+ case 'U':
+ case 'u': {
+ // Hexadecimal sequence.
+ int hex_len = (code == 'U') ? 6 : 4;
+ for (int j = 0; j < hex_len; j++) {
+ if (_is_at_end()) {
+ return make_error("Unterminated string.");
+ }
+
+ char32_t digit = _peek();
+ char32_t value = 0;
+ if (is_digit(digit)) {
+ value = digit - '0';
+ } else if (digit >= 'a' && digit <= 'f') {
+ value = digit - 'a';
+ value += 10;
+ } else if (digit >= 'A' && digit <= 'F') {
+ value = digit - 'A';
+ value += 10;
+ } else {
+ // Make error, but keep parsing the string.
+ Token error = make_error("Invalid hexadecimal digit in unicode escape sequence.");
+ error.start_column = column;
+ error.leftmost_column = error.start_column;
+ error.end_column = column + 1;
+ error.rightmost_column = error.end_column;
+ push_error(error);
+ valid_escape = false;
+ break;
+ }
+
+ escaped <<= 4;
+ escaped |= value;
+
+ _advance();
}
-
- char32_t digit = _peek();
- char32_t value = 0;
- if (is_digit(digit)) {
- value = digit - '0';
- } else if (digit >= 'a' && digit <= 'f') {
- value = digit - 'a';
- value += 10;
- } else if (digit >= 'A' && digit <= 'F') {
- value = digit - 'A';
- value += 10;
- } else {
- // Make error, but keep parsing the string.
- Token error = make_error("Invalid hexadecimal digit in unicode escape sequence.");
- error.start_column = column;
- error.leftmost_column = error.start_column;
- error.end_column = column + 1;
- error.rightmost_column = error.end_column;
- push_error(error);
- valid_escape = false;
+ } break;
+ case '\r':
+ if (_peek() != '\n') {
+ // Carriage return without newline in string. (???)
+ // Just add it to the string and keep going.
+ result += ch;
+ _advance();
break;
}
-
- escaped <<= 4;
- escaped |= value;
-
- _advance();
- }
- } break;
- case '\r':
- if (_peek() != '\n') {
- // Carriage return without newline in string. (???)
- // Just add it to the string and keep going.
- result += ch;
- _advance();
+ [[fallthrough]];
+ case '\n':
+ // Escaping newline.
+ newline(false);
+ valid_escape = false; // Don't add to the string.
break;
- }
- [[fallthrough]];
- case '\n':
- // Escaping newline.
- newline(false);
- valid_escape = false; // Don't add to the string.
- break;
- default:
- Token error = make_error("Invalid escape in string.");
- error.start_column = column - 2;
- error.leftmost_column = error.start_column;
- push_error(error);
- valid_escape = false;
- break;
- }
- // Parse UTF-16 pair.
- if (valid_escape) {
- if ((escaped & 0xfffffc00) == 0xd800) {
- if (prev == 0) {
- prev = escaped;
- prev_pos = column - 2;
- continue;
- } else {
- Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ default:
+ Token error = make_error("Invalid escape in string.");
error.start_column = column - 2;
error.leftmost_column = error.start_column;
push_error(error);
valid_escape = false;
- prev = 0;
+ break;
+ }
+ // Parse UTF-16 pair.
+ if (valid_escape) {
+ if ((escaped & 0xfffffc00) == 0xd800) {
+ if (prev == 0) {
+ prev = escaped;
+ prev_pos = column - 2;
+ continue;
+ } else {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate.");
+ error.start_column = column - 2;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ valid_escape = false;
+ prev = 0;
+ }
+ } else if ((escaped & 0xfffffc00) == 0xdc00) {
+ if (prev == 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate.");
+ error.start_column = column - 2;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ valid_escape = false;
+ } else {
+ escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
+ prev = 0;
+ }
}
- } else if ((escaped & 0xfffffc00) == 0xdc00) {
- if (prev == 0) {
- Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
- error.start_column = column - 2;
+ if (prev != 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate.");
+ error.start_column = prev_pos;
error.leftmost_column = error.start_column;
push_error(error);
- valid_escape = false;
- } else {
- escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
prev = 0;
}
}
- if (prev != 0) {
- Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
- error.start_column = prev_pos;
- error.leftmost_column = error.start_column;
- push_error(error);
- prev = 0;
- }
- }
- if (valid_escape) {
- result += escaped;
+ if (valid_escape) {
+ result += escaped;
+ }
}
} else if (ch == quote_char) {
if (prev != 0) {
@@ -1099,15 +1195,6 @@ void GDScriptTokenizer::check_indent() {
_advance();
}
- if (mixed && !(line_continuation || multiline_mode)) {
- Token error = make_error("Mixed use of tabs and spaces for indentation.");
- error.start_line = line;
- error.start_column = 1;
- error.leftmost_column = 1;
- error.rightmost_column = column;
- push_error(error);
- }
-
if (_is_at_end()) {
// Reached the end with an empty line, so just dedent as much as needed.
pending_indents -= indent_level();
@@ -1151,9 +1238,18 @@ void GDScriptTokenizer::check_indent() {
continue;
}
+ if (mixed && !line_continuation && !multiline_mode) {
+ Token error = make_error("Mixed use of tabs and spaces for indentation.");
+ error.start_line = line;
+ error.start_column = 1;
+ error.leftmost_column = 1;
+ error.rightmost_column = column;
+ push_error(error);
+ }
+
if (line_continuation || multiline_mode) {
// We cleared up all the whitespace at the beginning of the line.
- // But if this is a continuation or multiline mode and we don't want any indentation change.
+ // If this is a line continuation or we're in multiline mode then we don't want any indentation changes.
return;
}
@@ -1353,6 +1449,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
if (is_digit(c)) {
return number();
+ } else if (c == 'r' && (_peek() == '"' || _peek() == '\'')) {
+ // Raw string literals.
+ return string();
} else if (is_unicode_identifier_start(c)) {
return potential_identifier();
}
@@ -1431,6 +1530,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
if (_peek() == '=') {
_advance();
return make_token(Token::PLUS_EQUAL);
+ } else if (is_digit(_peek()) && !last_token.can_precede_bin_op()) {
+ // Number starting with '+'.
+ return number();
} else {
return make_token(Token::PLUS);
}
@@ -1438,6 +1540,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
if (_peek() == '=') {
_advance();
return make_token(Token::MINUS_EQUAL);
+ } else if (is_digit(_peek()) && !last_token.can_precede_bin_op()) {
+ // Number starting with '-'.
+ return number();
} else if (_peek() == '>') {
_advance();
return make_token(Token::FORWARD_ARROW);
@@ -1547,9 +1652,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
default:
if (is_whitespace(c)) {
- return make_error(vformat(R"(Invalid white space character "\\u%X".)", static_cast<int32_t>(c)));
+ return make_error(vformat(R"(Invalid white space character U+%04X.)", static_cast<int32_t>(c)));
} else {
- return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1)));
+ return make_error(vformat(R"(Invalid character "%c" (U+%04X).)", c, static_cast<int32_t>(c)));
}
}
}