diff options
Diffstat (limited to 'modules/gdscript/gdscript_tokenizer_buffer.cpp')
-rw-r--r-- | modules/gdscript/gdscript_tokenizer_buffer.cpp | 457 |
1 file changed, 457 insertions, 0 deletions
diff --git a/modules/gdscript/gdscript_tokenizer_buffer.cpp b/modules/gdscript/gdscript_tokenizer_buffer.cpp new file mode 100644 index 0000000000..5b41c411d8 --- /dev/null +++ b/modules/gdscript/gdscript_tokenizer_buffer.cpp @@ -0,0 +1,457 @@ +/**************************************************************************/ +/* gdscript_tokenizer_buffer.cpp */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/**************************************************************************/ + +#include "gdscript_tokenizer_buffer.h" + +#include "core/io/marshalls.h" + +#define TOKENIZER_VERSION 100 + +int GDScriptTokenizerBuffer::_token_to_binary(const Token &p_token, Vector<uint8_t> &r_buffer, int p_start, HashMap<StringName, uint32_t> &r_identifiers_map, HashMap<Variant, uint32_t, VariantHasher, VariantComparator> &r_constants_map) { + int pos = p_start; + + int token_type = p_token.type & TOKEN_MASK; + + switch (p_token.type) { + case GDScriptTokenizer::Token::ANNOTATION: + case GDScriptTokenizer::Token::IDENTIFIER: { + // Add identifier to map. + int identifier_pos; + StringName id = p_token.get_identifier(); + if (r_identifiers_map.has(id)) { + identifier_pos = r_identifiers_map[id]; + } else { + identifier_pos = r_identifiers_map.size(); + r_identifiers_map[id] = identifier_pos; + } + token_type |= identifier_pos << TOKEN_BITS; + } break; + case GDScriptTokenizer::Token::ERROR: + case GDScriptTokenizer::Token::LITERAL: { + // Add literal to map. + int constant_pos; + if (r_constants_map.has(p_token.literal)) { + constant_pos = r_constants_map[p_token.literal]; + } else { + constant_pos = r_constants_map.size(); + r_constants_map[p_token.literal] = constant_pos; + } + token_type |= constant_pos << TOKEN_BITS; + } break; + default: + break; + } + + // Encode token. 
+ int token_len; + if (token_type & TOKEN_MASK) { + token_len = 8; + r_buffer.resize(pos + token_len); + encode_uint32(token_type | TOKEN_BYTE_MASK, &r_buffer.write[pos]); + pos += 4; + } else { + token_len = 5; + r_buffer.resize(pos + token_len); + r_buffer.write[pos] = token_type; + pos++; + } + encode_uint32(p_token.start_line, &r_buffer.write[pos]); + return token_len; +} + +GDScriptTokenizer::Token GDScriptTokenizerBuffer::_binary_to_token(const uint8_t *p_buffer) { + Token token; + const uint8_t *b = p_buffer; + + uint32_t token_type = decode_uint32(b); + token.type = (Token::Type)(token_type & TOKEN_MASK); + if (token_type & TOKEN_BYTE_MASK) { + b += 4; + } else { + b++; + } + token.start_line = decode_uint32(b); + token.end_line = token.start_line; + + token.literal = token.get_name(); + if (token.type == Token::CONST_NAN) { + token.literal = String("NAN"); // Special case since name and notation are different. + } + + switch (token.type) { + case GDScriptTokenizer::Token::ANNOTATION: + case GDScriptTokenizer::Token::IDENTIFIER: { + // Get name from map. + int identifier_pos = token_type >> TOKEN_BITS; + if (unlikely(identifier_pos >= identifiers.size())) { + Token error; + error.type = Token::ERROR; + error.literal = "Identifier index out of bounds."; + return error; + } + token.literal = identifiers[identifier_pos]; + } break; + case GDScriptTokenizer::Token::ERROR: + case GDScriptTokenizer::Token::LITERAL: { + // Get literal from map. 
+ int constant_pos = token_type >> TOKEN_BITS; + if (unlikely(constant_pos >= constants.size())) { + Token error; + error.type = Token::ERROR; + error.literal = "Constant index out of bounds."; + return error; + } + token.literal = constants[constant_pos]; + } break; + default: + break; + } + + return token; +} + +Error GDScriptTokenizerBuffer::set_code_buffer(const Vector<uint8_t> &p_buffer) { + const uint8_t *buf = p_buffer.ptr(); + int total_len = p_buffer.size(); + ERR_FAIL_COND_V(p_buffer.size() < 24 || p_buffer[0] != 'G' || p_buffer[1] != 'D' || p_buffer[2] != 'S' || p_buffer[3] != 'C', ERR_INVALID_DATA); + + int version = decode_uint32(&buf[4]); + ERR_FAIL_COND_V_MSG(version > TOKENIZER_VERSION, ERR_INVALID_DATA, "Binary GDScript is too recent! Please use a newer engine version."); + + uint32_t identifier_count = decode_uint32(&buf[8]); + uint32_t constant_count = decode_uint32(&buf[12]); + uint32_t token_line_count = decode_uint32(&buf[16]); + uint32_t token_count = decode_uint32(&buf[20]); + + const uint8_t *b = &buf[24]; + total_len -= 24; + + identifiers.resize(identifier_count); + for (uint32_t i = 0; i < identifier_count; i++) { + uint32_t len = decode_uint32(b); + total_len -= 4; + ERR_FAIL_COND_V((len * 4u) > (uint32_t)total_len, ERR_INVALID_DATA); + b += 4; + Vector<uint32_t> cs; + cs.resize(len); + for (uint32_t j = 0; j < len; j++) { + uint8_t tmp[4]; + for (uint32_t k = 0; k < 4; k++) { + tmp[k] = b[j * 4 + k] ^ 0xb6; + } + cs.write[j] = decode_uint32(tmp); + } + + String s(reinterpret_cast<const char32_t *>(cs.ptr()), len); + b += len * 4; + total_len -= len * 4; + identifiers.write[i] = s; + } + + constants.resize(constant_count); + for (uint32_t i = 0; i < constant_count; i++) { + Variant v; + int len; + Error err = decode_variant(v, b, total_len, &len, false); + if (err) { + return err; + } + b += len; + total_len -= len; + constants.write[i] = v; + } + + for (uint32_t i = 0; i < token_line_count; i++) { + ERR_FAIL_COND_V(total_len < 8, 
ERR_INVALID_DATA); + uint32_t token_index = decode_uint32(b); + b += 4; + uint32_t line = decode_uint32(b); + b += 4; + total_len -= 8; + token_lines[token_index] = line; + } + for (uint32_t i = 0; i < token_line_count; i++) { + ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA); + uint32_t token_index = decode_uint32(b); + b += 4; + uint32_t column = decode_uint32(b); + b += 4; + total_len -= 8; + token_columns[token_index] = column; + } + + tokens.resize(token_count); + for (uint32_t i = 0; i < token_count; i++) { + int token_len = 5; + if ((*b) & TOKEN_BYTE_MASK) { + token_len = 8; + } + ERR_FAIL_COND_V(total_len < token_len, ERR_INVALID_DATA); + Token token = _binary_to_token(b); + b += token_len; + ERR_FAIL_INDEX_V(token.type, Token::TK_MAX, ERR_INVALID_DATA); + tokens.write[i] = token; + total_len -= token_len; + } + + ERR_FAIL_COND_V(total_len > 0, ERR_INVALID_DATA); + + return OK; +} + +Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code) { + Vector<uint8_t> buf; + + HashMap<StringName, uint32_t> identifier_map; + HashMap<Variant, uint32_t, VariantHasher, VariantComparator> constant_map; + Vector<uint8_t> token_buffer; + HashMap<uint32_t, uint32_t> token_lines; + HashMap<uint32_t, uint32_t> token_columns; + + GDScriptTokenizerText tokenizer; + tokenizer.set_source_code(p_code); + tokenizer.set_multiline_mode(true); // Ignore whitespace tokens. + Token current = tokenizer.scan(); + int token_pos = 0; + int last_token_line = 0; + int token_counter = 0; + + while (current.type != Token::TK_EOF) { + int token_len = _token_to_binary(current, token_buffer, token_pos, identifier_map, constant_map); + token_pos += token_len; + if (token_counter > 0 && current.start_line > last_token_line) { + token_lines[token_counter] = current.start_line; + token_columns[token_counter] = current.start_column; + } + last_token_line = current.end_line; + + current = tokenizer.scan(); + token_counter++; + } + + // Reverse maps. 
+ Vector<StringName> rev_identifier_map; + rev_identifier_map.resize(identifier_map.size()); + for (const KeyValue<StringName, uint32_t> &E : identifier_map) { + rev_identifier_map.write[E.value] = E.key; + } + Vector<Variant> rev_constant_map; + rev_constant_map.resize(constant_map.size()); + for (const KeyValue<Variant, uint32_t> &E : constant_map) { + rev_constant_map.write[E.value] = E.key; + } + HashMap<uint32_t, uint32_t> rev_token_lines; + for (const KeyValue<uint32_t, uint32_t> &E : token_lines) { + rev_token_lines[E.value] = E.key; + } + + // Remove continuation lines from map. + for (int line : tokenizer.get_continuation_lines()) { + if (rev_token_lines.has(line + 1)) { + token_lines.erase(rev_token_lines[line + 1]); + token_columns.erase(rev_token_lines[line + 1]); + } + } + + // Save header. + buf.resize(24); + buf.write[0] = 'G'; + buf.write[1] = 'D'; + buf.write[2] = 'S'; + buf.write[3] = 'C'; + encode_uint32(TOKENIZER_VERSION, &buf.write[4]); + encode_uint32(identifier_map.size(), &buf.write[8]); + encode_uint32(constant_map.size(), &buf.write[12]); + encode_uint32(token_lines.size(), &buf.write[16]); + encode_uint32(token_counter, &buf.write[20]); + + int buf_pos = 24; + + // Save identifiers. + for (const StringName &id : rev_identifier_map) { + String s = id.operator String(); + int len = s.length(); + + buf.resize(buf_pos + (len + 1) * 4); + + encode_uint32(len, &buf.write[buf_pos]); + buf_pos += 4; + + for (int i = 0; i < len; i++) { + uint8_t tmp[4]; + encode_uint32(s[i], tmp); + + for (int b = 0; b < 4; b++) { + buf.write[buf_pos + b] = tmp[b] ^ 0xb6; + } + + buf_pos += 4; + } + } + + // Save constants. + for (const Variant &v : rev_constant_map) { + int len; + // Objects cannot be constant, never encode objects. 
+ Error err = encode_variant(v, nullptr, len, false); + ERR_FAIL_COND_V_MSG(err != OK, Vector<uint8_t>(), "Error when trying to encode Variant."); + buf.resize(buf_pos + len); + encode_variant(v, &buf.write[buf_pos], len, false); + buf_pos += len; + } + + // Save lines and columns. + buf.resize(buf_pos + token_lines.size() * 16); + for (const KeyValue<uint32_t, uint32_t> &e : token_lines) { + encode_uint32(e.key, &buf.write[buf_pos]); + buf_pos += 4; + encode_uint32(e.value, &buf.write[buf_pos]); + buf_pos += 4; + } + for (const KeyValue<uint32_t, uint32_t> &e : token_columns) { + encode_uint32(e.key, &buf.write[buf_pos]); + buf_pos += 4; + encode_uint32(e.value, &buf.write[buf_pos]); + buf_pos += 4; + } + + // Store tokens. + buf.append_array(token_buffer); + + return buf; +} + +int GDScriptTokenizerBuffer::get_cursor_line() const { + return 0; +} + +int GDScriptTokenizerBuffer::get_cursor_column() const { + return 0; +} + +void GDScriptTokenizerBuffer::set_cursor_position(int p_line, int p_column) { +} + +void GDScriptTokenizerBuffer::set_multiline_mode(bool p_state) { + multiline_mode = p_state; +} + +bool GDScriptTokenizerBuffer::is_past_cursor() const { + return false; +} + +void GDScriptTokenizerBuffer::push_expression_indented_block() { + indent_stack_stack.push_back(indent_stack); +} + +void GDScriptTokenizerBuffer::pop_expression_indented_block() { + ERR_FAIL_COND(indent_stack_stack.size() == 0); + indent_stack = indent_stack_stack.back()->get(); + indent_stack_stack.pop_back(); +} + +GDScriptTokenizer::Token GDScriptTokenizerBuffer::scan() { + // Add final newline. + if (current >= tokens.size() && !last_token_was_newline) { + Token newline; + newline.type = Token::NEWLINE; + newline.start_line = current_line; + newline.end_line = current_line; + last_token_was_newline = true; + return newline; + } + + // Resolve pending indentation change. 
+ if (pending_indents > 0) { + pending_indents--; + Token indent; + indent.type = Token::INDENT; + indent.start_line = current_line; + indent.end_line = current_line; + return indent; + } else if (pending_indents < 0) { + pending_indents++; + Token dedent; + dedent.type = Token::DEDENT; + dedent.start_line = current_line; + dedent.end_line = current_line; + return dedent; + } + + if (current >= tokens.size()) { + if (!indent_stack.is_empty()) { + pending_indents -= indent_stack.size(); + indent_stack.clear(); + return scan(); + } + Token eof; + eof.type = Token::TK_EOF; + return eof; + }; + + if (!last_token_was_newline && token_lines.has(current)) { + current_line = token_lines[current]; + uint32_t current_column = token_columns[current]; + + // Check if there's a need to indent/dedent. + if (!multiline_mode) { + uint32_t previous_indent = 0; + if (!indent_stack.is_empty()) { + previous_indent = indent_stack.back()->get(); + } + if (current_column - 1 > previous_indent) { + pending_indents++; + indent_stack.push_back(current_column - 1); + } else { + while (current_column - 1 < previous_indent) { + pending_indents--; + indent_stack.pop_back(); + if (indent_stack.is_empty()) { + break; + } + previous_indent = indent_stack.back()->get(); + } + } + + Token newline; + newline.type = Token::NEWLINE; + newline.start_line = current_line; + newline.end_line = current_line; + last_token_was_newline = true; + + return newline; + } + } + + last_token_was_newline = false; + + Token token = tokens[current++]; + return token; +} |