From b4d0a09f15c60c88bbf516d2f6dcdb451dcad9c7 Mon Sep 17 00:00:00 2001
From: George Marques
Date: Mon, 22 Jan 2024 11:31:55 -0300
Subject: GDScript: Reintroduce binary tokenization on export

This adds back a function available in 3.x: exporting the GDScript
files in a binary form by converting the tokens recognized by the
tokenizer into a data format. It is enabled by default on export but
can be manually disabled.

The format helps with loading times, since the tokens are easily
reconstructed, and with hiding the source code, since recovering it
would require a specialized tool. Code comments are not stored in this
format.

The `--test` command can also include a `--use-binary-tokens` flag,
which runs the GDScript tests with the binary format instead of the
regular source code by converting them in-memory before the tests run.
---
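Note for reviewers: in the encoding implemented below, each token is written
either as a single type byte followed by the 32-bit start line (5 bytes
total) or, when it carries an identifier/constant index in its upper bits, as
a little-endian 32-bit word whose low byte has the TOKEN_BYTE_MASK flag set,
again followed by the line (8 bytes total). The standalone sketch below
mirrors that packing; the constant values and helper names are assumptions
for illustration (the real definitions live in gdscript_tokenizer_buffer.h,
which is not part of this diff), not engine code.

#include <cstdint>
#include <cstdio>
#include <vector>

// Assumed values; the real constants are defined in gdscript_tokenizer_buffer.h.
constexpr uint32_t TOKEN_BYTE_MASK = 0x80; // High bit of the first byte flags the wide form.
constexpr uint32_t TOKEN_BITS = 8;         // Low byte holds the token type.

static void put_u32(std::vector<uint8_t> &r_buf, uint32_t p_value) {
	for (int i = 0; i < 4; i++) {
		r_buf.push_back((p_value >> (8 * i)) & 0xFF); // Little-endian, like encode_uint32().
	}
}

// Hypothetical helper mirroring _token_to_binary(): a plain token costs 5 bytes
// (type byte + start line), a token carrying an index costs 8 (flagged word + line).
static void put_token(std::vector<uint8_t> &r_buf, uint32_t p_type, uint32_t p_index, bool p_has_index, uint32_t p_line) {
	if (p_has_index) {
		put_u32(r_buf, p_type | (p_index << TOKEN_BITS) | TOKEN_BYTE_MASK);
	} else {
		r_buf.push_back(p_type); // Single byte; high bit stays clear.
	}
	put_u32(r_buf, p_line);
}

int main() {
	std::vector<uint8_t> buf;
	put_token(buf, 7, 0, false, 10); // Some keyword on line 10 -> 5 bytes.
	put_token(buf, 1, 42, true, 11); // Identifier #42 on line 11 -> 8 bytes.
	printf("%zu bytes\n", buf.size()); // Prints "13 bytes".
	return 0;
}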
 modules/gdscript/gdscript_tokenizer_buffer.cpp | 457 +++++++++++++++++++++++++
 1 file changed, 457 insertions(+)
 create mode 100644 modules/gdscript/gdscript_tokenizer_buffer.cpp

diff --git a/modules/gdscript/gdscript_tokenizer_buffer.cpp b/modules/gdscript/gdscript_tokenizer_buffer.cpp
new file mode 100644
index 0000000000..5b41c411d8
--- /dev/null
+++ b/modules/gdscript/gdscript_tokenizer_buffer.cpp
@@ -0,0 +1,457 @@
+/**************************************************************************/
+/*  gdscript_tokenizer_buffer.cpp                                         */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#include "gdscript_tokenizer_buffer.h"
+
+#include "core/io/marshalls.h"
+
+#define TOKENIZER_VERSION 100
+
+int GDScriptTokenizerBuffer::_token_to_binary(const Token &p_token, Vector<uint8_t> &r_buffer, int p_start, HashMap<StringName, uint32_t> &r_identifiers_map, HashMap<Variant, uint32_t, VariantHasher, VariantComparator> &r_constants_map) {
+	int pos = p_start;
+
+	int token_type = p_token.type & TOKEN_MASK;
+
+	switch (p_token.type) {
+		case GDScriptTokenizer::Token::ANNOTATION:
+		case GDScriptTokenizer::Token::IDENTIFIER: {
+			// Add identifier to map.
+			int identifier_pos;
+			StringName id = p_token.get_identifier();
+			if (r_identifiers_map.has(id)) {
+				identifier_pos = r_identifiers_map[id];
+			} else {
+				identifier_pos = r_identifiers_map.size();
+				r_identifiers_map[id] = identifier_pos;
+			}
+			token_type |= identifier_pos << TOKEN_BITS;
+		} break;
+		case GDScriptTokenizer::Token::ERROR:
+		case GDScriptTokenizer::Token::LITERAL: {
+			// Add literal to map.
+			int constant_pos;
+			if (r_constants_map.has(p_token.literal)) {
+				constant_pos = r_constants_map[p_token.literal];
+			} else {
+				constant_pos = r_constants_map.size();
+				r_constants_map[p_token.literal] = constant_pos;
+			}
+			token_type |= constant_pos << TOKEN_BITS;
+		} break;
+		default:
+			break;
+	}
+
+	// Encode token.
+	int token_len;
+	if (token_type & TOKEN_MASK) {
+		token_len = 8;
+		r_buffer.resize(pos + token_len);
+		encode_uint32(token_type | TOKEN_BYTE_MASK, &r_buffer.write[pos]);
+		pos += 4;
+	} else {
+		token_len = 5;
+		r_buffer.resize(pos + token_len);
+		r_buffer.write[pos] = token_type;
+		pos++;
+	}
+	encode_uint32(p_token.start_line, &r_buffer.write[pos]);
+	return token_len;
+}
+
+GDScriptTokenizer::Token GDScriptTokenizerBuffer::_binary_to_token(const uint8_t *p_buffer) {
+	Token token;
+	const uint8_t *b = p_buffer;
+
+	uint32_t token_type = decode_uint32(b);
+	token.type = (Token::Type)(token_type & TOKEN_MASK);
+	if (token_type & TOKEN_BYTE_MASK) {
+		b += 4;
+	} else {
+		b++;
+	}
+	token.start_line = decode_uint32(b);
+	token.end_line = token.start_line;
+
+	token.literal = token.get_name();
+	if (token.type == Token::CONST_NAN) {
+		token.literal = String("NAN"); // Special case since name and notation are different.
+	}
+
+	switch (token.type) {
+		case GDScriptTokenizer::Token::ANNOTATION:
+		case GDScriptTokenizer::Token::IDENTIFIER: {
+			// Get name from map.
+			int identifier_pos = token_type >> TOKEN_BITS;
+			if (unlikely(identifier_pos >= identifiers.size())) {
+				Token error;
+				error.type = Token::ERROR;
+				error.literal = "Identifier index out of bounds.";
+				return error;
+			}
+			token.literal = identifiers[identifier_pos];
+		} break;
+		case GDScriptTokenizer::Token::ERROR:
+		case GDScriptTokenizer::Token::LITERAL: {
+			// Get literal from map.
+			int constant_pos = token_type >> TOKEN_BITS;
+			if (unlikely(constant_pos >= constants.size())) {
+				Token error;
+				error.type = Token::ERROR;
+				error.literal = "Constant index out of bounds.";
+				return error;
+			}
+			token.literal = constants[constant_pos];
+		} break;
+		default:
+			break;
+	}
+
+	return token;
+}
+
+Error GDScriptTokenizerBuffer::set_code_buffer(const Vector<uint8_t> &p_buffer) {
+	const uint8_t *buf = p_buffer.ptr();
+	int total_len = p_buffer.size();
+	ERR_FAIL_COND_V(p_buffer.size() < 24 || p_buffer[0] != 'G' || p_buffer[1] != 'D' || p_buffer[2] != 'S' || p_buffer[3] != 'C', ERR_INVALID_DATA);
+
+	int version = decode_uint32(&buf[4]);
+	ERR_FAIL_COND_V_MSG(version > TOKENIZER_VERSION, ERR_INVALID_DATA, "Binary GDScript is too recent! Please use a newer engine version.");
+
+	uint32_t identifier_count = decode_uint32(&buf[8]);
+	uint32_t constant_count = decode_uint32(&buf[12]);
+	uint32_t token_line_count = decode_uint32(&buf[16]);
+	uint32_t token_count = decode_uint32(&buf[20]);
+
+	const uint8_t *b = &buf[24];
+	total_len -= 24;
+
+	identifiers.resize(identifier_count);
+	for (uint32_t i = 0; i < identifier_count; i++) {
+		uint32_t len = decode_uint32(b);
+		total_len -= 4;
+		ERR_FAIL_COND_V((len * 4u) > (uint32_t)total_len, ERR_INVALID_DATA);
+		b += 4;
+		Vector<uint32_t> cs;
+		cs.resize(len);
+		for (uint32_t j = 0; j < len; j++) {
+			uint8_t tmp[4];
+			for (uint32_t k = 0; k < 4; k++) {
+				tmp[k] = b[j * 4 + k] ^ 0xb6;
+			}
+			cs.write[j] = decode_uint32(tmp);
+		}
+
+		String s(reinterpret_cast<const char32_t *>(cs.ptr()), len);
+		b += len * 4;
+		total_len -= len * 4;
+		identifiers.write[i] = s;
+	}
+
+	constants.resize(constant_count);
+	for (uint32_t i = 0; i < constant_count; i++) {
+		Variant v;
+		int len;
+		Error err = decode_variant(v, b, total_len, &len, false);
+		if (err) {
+			return err;
+		}
+		b += len;
+		total_len -= len;
+		constants.write[i] = v;
+	}
+
+	for (uint32_t i = 0; i < token_line_count; i++) {
+		ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA);
+		uint32_t token_index = decode_uint32(b);
+		b += 4;
+		uint32_t line = decode_uint32(b);
+		b += 4;
+		total_len -= 8;
+		token_lines[token_index] = line;
+	}
+	for (uint32_t i = 0; i < token_line_count; i++) {
+		ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA);
+		uint32_t token_index = decode_uint32(b);
+		b += 4;
+		uint32_t column = decode_uint32(b);
+		b += 4;
+		total_len -= 8;
+		token_columns[token_index] = column;
+	}
+
+	tokens.resize(token_count);
+	for (uint32_t i = 0; i < token_count; i++) {
+		int token_len = 5;
+		if ((*b) & TOKEN_BYTE_MASK) {
+			token_len = 8;
+		}
+		ERR_FAIL_COND_V(total_len < token_len, ERR_INVALID_DATA);
+		Token token = _binary_to_token(b);
+		b += token_len;
+		ERR_FAIL_INDEX_V(token.type, Token::TK_MAX, ERR_INVALID_DATA);
+		tokens.write[i] = token;
+		total_len -= token_len;
+	}
+
+	ERR_FAIL_COND_V(total_len > 0, ERR_INVALID_DATA);
+
+	return OK;
+}
+
+Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code) {
+	Vector<uint8_t> buf;
+
+	HashMap<StringName, uint32_t> identifier_map;
+	HashMap<Variant, uint32_t, VariantHasher, VariantComparator> constant_map;
+	Vector<uint8_t> token_buffer;
+	HashMap<uint32_t, uint32_t> token_lines;
+	HashMap<uint32_t, uint32_t> token_columns;
+
+	GDScriptTokenizerText tokenizer;
+	tokenizer.set_source_code(p_code);
+	tokenizer.set_multiline_mode(true); // Ignore whitespace tokens.
+	Token current = tokenizer.scan();
+	int token_pos = 0;
+	int last_token_line = 0;
+	int token_counter = 0;
+
+	while (current.type != Token::TK_EOF) {
+		int token_len = _token_to_binary(current, token_buffer, token_pos, identifier_map, constant_map);
+		token_pos += token_len;
+		if (token_counter > 0 && current.start_line > last_token_line) {
+			token_lines[token_counter] = current.start_line;
+			token_columns[token_counter] = current.start_column;
+		}
+		last_token_line = current.end_line;
+
+		current = tokenizer.scan();
+		token_counter++;
+	}
+
+	// Reverse maps.
+	Vector<StringName> rev_identifier_map;
+	rev_identifier_map.resize(identifier_map.size());
+	for (const KeyValue<StringName, uint32_t> &E : identifier_map) {
+		rev_identifier_map.write[E.value] = E.key;
+	}
+	Vector<Variant> rev_constant_map;
+	rev_constant_map.resize(constant_map.size());
+	for (const KeyValue<Variant, uint32_t> &E : constant_map) {
+		rev_constant_map.write[E.value] = E.key;
+	}
+	HashMap<uint32_t, uint32_t> rev_token_lines;
+	for (const KeyValue<uint32_t, uint32_t> &E : token_lines) {
+		rev_token_lines[E.value] = E.key;
+	}
+
+	// Remove continuation lines from map.
+	for (int line : tokenizer.get_continuation_lines()) {
+		if (rev_token_lines.has(line + 1)) {
+			token_lines.erase(rev_token_lines[line + 1]);
+			token_columns.erase(rev_token_lines[line + 1]);
+		}
+	}
+
+	// Save header.
+	buf.resize(24);
+	buf.write[0] = 'G';
+	buf.write[1] = 'D';
+	buf.write[2] = 'S';
+	buf.write[3] = 'C';
+	encode_uint32(TOKENIZER_VERSION, &buf.write[4]);
+	encode_uint32(identifier_map.size(), &buf.write[8]);
+	encode_uint32(constant_map.size(), &buf.write[12]);
+	encode_uint32(token_lines.size(), &buf.write[16]);
+	encode_uint32(token_counter, &buf.write[20]);
+
+	int buf_pos = 24;
+
+	// Save identifiers.
+	for (const StringName &id : rev_identifier_map) {
+		String s = id.operator String();
+		int len = s.length();
+
+		buf.resize(buf_pos + (len + 1) * 4);
+
+		encode_uint32(len, &buf.write[buf_pos]);
+		buf_pos += 4;
+
+		for (int i = 0; i < len; i++) {
+			uint8_t tmp[4];
+			encode_uint32(s[i], tmp);
+
+			for (int b = 0; b < 4; b++) {
+				buf.write[buf_pos + b] = tmp[b] ^ 0xb6;
+			}
+
+			buf_pos += 4;
+		}
+	}
+
+	// Save constants.
+	for (const Variant &v : rev_constant_map) {
+		int len;
+		// Objects cannot be constant, never encode objects.
+		Error err = encode_variant(v, nullptr, len, false);
+		ERR_FAIL_COND_V_MSG(err != OK, Vector<uint8_t>(), "Error when trying to encode Variant.");
+		buf.resize(buf_pos + len);
+		encode_variant(v, &buf.write[buf_pos], len, false);
+		buf_pos += len;
+	}
+
+	// Save lines and columns.
+	buf.resize(buf_pos + token_lines.size() * 16);
+	for (const KeyValue<uint32_t, uint32_t> &e : token_lines) {
+		encode_uint32(e.key, &buf.write[buf_pos]);
+		buf_pos += 4;
+		encode_uint32(e.value, &buf.write[buf_pos]);
+		buf_pos += 4;
+	}
+	for (const KeyValue<uint32_t, uint32_t> &e : token_columns) {
+		encode_uint32(e.key, &buf.write[buf_pos]);
+		buf_pos += 4;
+		encode_uint32(e.value, &buf.write[buf_pos]);
+		buf_pos += 4;
+	}
+
+	// Store tokens.
+	buf.append_array(token_buffer);
+
+	return buf;
+}
+
+int GDScriptTokenizerBuffer::get_cursor_line() const {
+	return 0;
+}
+
+int GDScriptTokenizerBuffer::get_cursor_column() const {
+	return 0;
+}
+
+void GDScriptTokenizerBuffer::set_cursor_position(int p_line, int p_column) {
+}
+
+void GDScriptTokenizerBuffer::set_multiline_mode(bool p_state) {
+	multiline_mode = p_state;
+}
+
+bool GDScriptTokenizerBuffer::is_past_cursor() const {
+	return false;
+}
+
+void GDScriptTokenizerBuffer::push_expression_indented_block() {
+	indent_stack_stack.push_back(indent_stack);
+}
+
+void GDScriptTokenizerBuffer::pop_expression_indented_block() {
+	ERR_FAIL_COND(indent_stack_stack.size() == 0);
+	indent_stack = indent_stack_stack.back()->get();
+	indent_stack_stack.pop_back();
+}
+
+GDScriptTokenizer::Token GDScriptTokenizerBuffer::scan() {
+	// Add final newline.
+	if (current >= tokens.size() && !last_token_was_newline) {
+		Token newline;
+		newline.type = Token::NEWLINE;
+		newline.start_line = current_line;
+		newline.end_line = current_line;
+		last_token_was_newline = true;
+		return newline;
+	}
+
+	// Resolve pending indentation change.
+	if (pending_indents > 0) {
+		pending_indents--;
+		Token indent;
+		indent.type = Token::INDENT;
+		indent.start_line = current_line;
+		indent.end_line = current_line;
+		return indent;
+	} else if (pending_indents < 0) {
+		pending_indents++;
+		Token dedent;
+		dedent.type = Token::DEDENT;
+		dedent.start_line = current_line;
+		dedent.end_line = current_line;
+		return dedent;
+	}
+
+	if (current >= tokens.size()) {
+		if (!indent_stack.is_empty()) {
+			pending_indents -= indent_stack.size();
+			indent_stack.clear();
+			return scan();
+		}
+		Token eof;
+		eof.type = Token::TK_EOF;
+		return eof;
+	};
+
+	if (!last_token_was_newline && token_lines.has(current)) {
+		current_line = token_lines[current];
+		uint32_t current_column = token_columns[current];
+
+		// Check if there's a need to indent/dedent.
+		if (!multiline_mode) {
+			uint32_t previous_indent = 0;
+			if (!indent_stack.is_empty()) {
+				previous_indent = indent_stack.back()->get();
+			}
+			if (current_column - 1 > previous_indent) {
+				pending_indents++;
+				indent_stack.push_back(current_column - 1);
+			} else {
+				while (current_column - 1 < previous_indent) {
+					pending_indents--;
+					indent_stack.pop_back();
+					if (indent_stack.is_empty()) {
+						break;
+					}
+					previous_indent = indent_stack.back()->get();
+				}
+			}
+
+			Token newline;
+			newline.type = Token::NEWLINE;
+			newline.start_line = current_line;
+			newline.end_line = current_line;
+			last_token_was_newline = true;
+
+			return newline;
+		}
+	}
+
+	last_token_was_newline = false;
+
+	Token token = tokens[current++];
+	return token;
+}
-- 
cgit v1.2.3
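A quick reference for the container this first patch produces, before the
second patch below reworks it: a 24-byte header holding the magic "GDSC",
the format version, and the identifier/constant/token-line/token counts as
little-endian uint32 fields, followed by the XOR-obfuscated identifiers, the
encoded constants, the line/column tables, and the token stream. The sketch
below illustrates only the identifier obfuscation; it is standalone
illustration code with made-up names, not engine code, though the XOR round
trip is exactly the `^ 0xb6` visible in both parse_code_string() and
set_code_buffer() above.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Identifiers are stored as UTF-32 code units with every byte XOR'd with
// 0xb6. XOR is its own inverse, so one helper both encodes and decodes.
// This is obfuscation only, not encryption; the commit message itself says
// recovering the source merely "requires a specialized tool".
static void xor_obfuscate(uint8_t *p_data, size_t p_len) {
	for (size_t i = 0; i < p_len; i++) {
		p_data[i] ^= 0xb6;
	}
}

int main() {
	// One identifier character 'f' as little-endian UTF-32.
	uint8_t raw[4] = { 'f', 0x00, 0x00, 0x00 };
	xor_obfuscate(raw, 4); // As written to the buffer.
	xor_obfuscate(raw, 4); // As read back by set_code_buffer().
	uint32_t c = 0;
	memcpy(&c, raw, 4); // Mirrors decode_uint32() on a little-endian host.
	printf("round-trip: %c\n", (char)c); // Prints 'f'.
	return 0;
}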
From 72e5f8c31e9237a4581cf250aa9943a4290f4017 Mon Sep 17 00:00:00 2001
From: George Marques
Date: Fri, 26 Jan 2024 14:49:31 -0300
Subject: GDScript: Enable compression on export

Besides the regular option to export GDScript as binary tokens, this
also adds a compression option on top of it. The binary format needs
to encode some extra information, which generally makes it bigger than
the source text. This option reduces that difference by applying
Zstandard compression to the buffer.
---
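For context on the new COMPRESS_ZSTD branch below: the writer sizes the
destination for the worst case with Compression::get_max_compressed_buffer_size(),
compresses, then shrinks the buffer to the actual compressed size. The sketch
below shows the same pattern against plain libzstd instead of the engine
wrapper; the function name and the compression level are arbitrary choices
for illustration (build with -lzstd).

#include <cstdint>
#include <cstdio>
#include <vector>
#include <zstd.h>

// Compress a contents block the way the new COMPRESS_ZSTD branch does:
// size the output for the worst case, compress, then shrink to fit.
std::vector<uint8_t> zstd_pack(const std::vector<uint8_t> &contents) {
	std::vector<uint8_t> compressed(ZSTD_compressBound(contents.size()));
	size_t n = ZSTD_compress(compressed.data(), compressed.size(),
			contents.data(), contents.size(), /* level */ 3);
	if (ZSTD_isError(n)) {
		return {}; // The patch fails with ERR_FAIL_COND_V_MSG here instead.
	}
	compressed.resize(n);
	return compressed;
}

int main() {
	std::vector<uint8_t> contents(4096, 0xAB); // Highly repetitive test data.
	std::vector<uint8_t> packed = zstd_pack(contents);
	printf("%zu -> %zu bytes\n", contents.size(), packed.size());
	return 0;
}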
 modules/gdscript/gdscript_tokenizer_buffer.cpp | 106 +++++++++++++++++--------
 1 file changed, 71 insertions(+), 35 deletions(-)

diff --git a/modules/gdscript/gdscript_tokenizer_buffer.cpp b/modules/gdscript/gdscript_tokenizer_buffer.cpp
index 5b41c411d8..db523ea941 100644
--- a/modules/gdscript/gdscript_tokenizer_buffer.cpp
+++ b/modules/gdscript/gdscript_tokenizer_buffer.cpp
@@ -30,6 +30,7 @@
 
 #include "gdscript_tokenizer_buffer.h"
 
+#include "core/io/compression.h"
 #include "core/io/marshalls.h"
 
 #define TOKENIZER_VERSION 100
@@ -139,19 +140,31 @@ GDScriptTokenizer::Token GDScriptTokenizerBuffer::_binary_to_token(const uint8_t
 
 Error GDScriptTokenizerBuffer::set_code_buffer(const Vector<uint8_t> &p_buffer) {
 	const uint8_t *buf = p_buffer.ptr();
-	int total_len = p_buffer.size();
-	ERR_FAIL_COND_V(p_buffer.size() < 24 || p_buffer[0] != 'G' || p_buffer[1] != 'D' || p_buffer[2] != 'S' || p_buffer[3] != 'C', ERR_INVALID_DATA);
+	ERR_FAIL_COND_V(p_buffer.size() < 12 || p_buffer[0] != 'G' || p_buffer[1] != 'D' || p_buffer[2] != 'S' || p_buffer[3] != 'C', ERR_INVALID_DATA);
 
 	int version = decode_uint32(&buf[4]);
 	ERR_FAIL_COND_V_MSG(version > TOKENIZER_VERSION, ERR_INVALID_DATA, "Binary GDScript is too recent! Please use a newer engine version.");
 
-	uint32_t identifier_count = decode_uint32(&buf[8]);
-	uint32_t constant_count = decode_uint32(&buf[12]);
-	uint32_t token_line_count = decode_uint32(&buf[16]);
-	uint32_t token_count = decode_uint32(&buf[20]);
+	int decompressed_size = decode_uint32(&buf[8]);
 
-	const uint8_t *b = &buf[24];
-	total_len -= 24;
+	Vector<uint8_t> contents;
+	if (decompressed_size == 0) {
+		contents = p_buffer.slice(12);
+	} else {
+		contents.resize(decompressed_size);
+		int result = Compression::decompress(contents.ptrw(), contents.size(), &buf[12], p_buffer.size() - 12, Compression::MODE_ZSTD);
+		ERR_FAIL_COND_V_MSG(result != decompressed_size, ERR_INVALID_DATA, "Error decompressing GDScript tokenizer buffer.");
+	}
+
+	int total_len = contents.size();
+	buf = contents.ptr();
+	uint32_t identifier_count = decode_uint32(&buf[0]);
+	uint32_t constant_count = decode_uint32(&buf[4]);
+	uint32_t token_line_count = decode_uint32(&buf[8]);
+	uint32_t token_count = decode_uint32(&buf[16]);
+
+	const uint8_t *b = &buf[20];
+	total_len -= 20;
 
 	identifiers.resize(identifier_count);
 	for (uint32_t i = 0; i < identifier_count; i++) {
@@ -226,9 +239,7 @@ Error GDScriptTokenizerBuffer::set_code_buffer(const Vector<uint8_t> &p_buffer)
 	return OK;
 }
 
-Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code) {
-	Vector<uint8_t> buf;
-
+Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code, CompressMode p_compress_mode) {
 	HashMap<StringName, uint32_t> identifier_map;
 	HashMap<Variant, uint32_t, VariantHasher, VariantComparator> constant_map;
 	Vector<uint8_t> token_buffer;
@@ -280,28 +291,23 @@ Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code
 		}
 	}
 
-	// Save header.
-	buf.resize(24);
-	buf.write[0] = 'G';
-	buf.write[1] = 'D';
-	buf.write[2] = 'S';
-	buf.write[3] = 'C';
-	encode_uint32(TOKENIZER_VERSION, &buf.write[4]);
-	encode_uint32(identifier_map.size(), &buf.write[8]);
-	encode_uint32(constant_map.size(), &buf.write[12]);
-	encode_uint32(token_lines.size(), &buf.write[16]);
-	encode_uint32(token_counter, &buf.write[20]);
+	Vector<uint8_t> contents;
+	contents.resize(20);
+	encode_uint32(identifier_map.size(), &contents.write[0]);
+	encode_uint32(constant_map.size(), &contents.write[4]);
+	encode_uint32(token_lines.size(), &contents.write[8]);
+	encode_uint32(token_counter, &contents.write[16]);
 
-	int buf_pos = 24;
+	int buf_pos = 20;
 
 	// Save identifiers.
 	for (const StringName &id : rev_identifier_map) {
 		String s = id.operator String();
 		int len = s.length();
 
-		buf.resize(buf_pos + (len + 1) * 4);
+		contents.resize(buf_pos + (len + 1) * 4);
 
-		encode_uint32(len, &buf.write[buf_pos]);
+		encode_uint32(len, &contents.write[buf_pos]);
 		buf_pos += 4;
 
 		for (int i = 0; i < len; i++) {
@@ -309,7 +315,7 @@ Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code
 			encode_uint32(s[i], tmp);
 
 			for (int b = 0; b < 4; b++) {
-				buf.write[buf_pos + b] = tmp[b] ^ 0xb6;
+				contents.write[buf_pos + b] = tmp[b] ^ 0xb6;
 			}
 
 			buf_pos += 4;
@@ -322,28 +328,58 @@ Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code
 		// Objects cannot be constant, never encode objects.
 		Error err = encode_variant(v, nullptr, len, false);
 		ERR_FAIL_COND_V_MSG(err != OK, Vector<uint8_t>(), "Error when trying to encode Variant.");
-		buf.resize(buf_pos + len);
-		encode_variant(v, &buf.write[buf_pos], len, false);
+		contents.resize(buf_pos + len);
+		encode_variant(v, &contents.write[buf_pos], len, false);
 		buf_pos += len;
 	}
 
 	// Save lines and columns.
-	buf.resize(buf_pos + token_lines.size() * 16);
+	contents.resize(buf_pos + token_lines.size() * 16);
 	for (const KeyValue<uint32_t, uint32_t> &e : token_lines) {
-		encode_uint32(e.key, &buf.write[buf_pos]);
+		encode_uint32(e.key, &contents.write[buf_pos]);
 		buf_pos += 4;
-		encode_uint32(e.value, &buf.write[buf_pos]);
+		encode_uint32(e.value, &contents.write[buf_pos]);
 		buf_pos += 4;
 	}
 	for (const KeyValue<uint32_t, uint32_t> &e : token_columns) {
-		encode_uint32(e.key, &buf.write[buf_pos]);
+		encode_uint32(e.key, &contents.write[buf_pos]);
 		buf_pos += 4;
-		encode_uint32(e.value, &buf.write[buf_pos]);
+		encode_uint32(e.value, &contents.write[buf_pos]);
 		buf_pos += 4;
 	}
 
 	// Store tokens.
-	buf.append_array(token_buffer);
+	contents.append_array(token_buffer);
+
+	Vector<uint8_t> buf;
+
+	// Save header.
+	buf.resize(12);
+	buf.write[0] = 'G';
+	buf.write[1] = 'D';
+	buf.write[2] = 'S';
+	buf.write[3] = 'C';
+	encode_uint32(TOKENIZER_VERSION, &buf.write[4]);
+
+	switch (p_compress_mode) {
+		case COMPRESS_NONE:
+			encode_uint32(0u, &buf.write[8]);
+			buf.append_array(contents);
+			break;
+
+		case COMPRESS_ZSTD: {
+			encode_uint32(contents.size(), &buf.write[8]);
+			Vector<uint8_t> compressed;
+			int max_size = Compression::get_max_compressed_buffer_size(contents.size(), Compression::MODE_ZSTD);
+			compressed.resize(max_size);
+
+			int compressed_size = Compression::compress(compressed.ptrw(), contents.ptr(), contents.size(), Compression::MODE_ZSTD);
+			ERR_FAIL_COND_V_MSG(compressed_size < 0, Vector<uint8_t>(), "Error compressing GDScript tokenizer buffer.");
+			compressed.resize(compressed_size);
+
+			buf.append_array(compressed);
+		} break;
+	}
 
 	return buf;
 }
@@ -372,7 +408,7 @@ void GDScriptTokenizerBuffer::push_expression_indented_block() {
 }
 
 void GDScriptTokenizerBuffer::pop_expression_indented_block() {
-	ERR_FAIL_COND(indent_stack_stack.size() == 0);
+	ERR_FAIL_COND(indent_stack_stack.is_empty());
 	indent_stack = indent_stack_stack.back()->get();
 	indent_stack_stack.pop_back();
 }
-- 
cgit v1.2.3
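Putting the two patches together, a reader of the final container sees a
12-byte header ("GDSC", version, and a decompressed-size field where zero
means the contents are stored uncompressed) followed by either the raw
contents block or a Zstandard frame. Below is a hedged standalone sketch of
that read path, using libzstd directly rather than Godot's Compression
wrapper, with a hypothetical function name; it also skips the version check
that set_code_buffer() performs.

#include <cstdint>
#include <cstring>
#include <vector>
#include <zstd.h>

static uint32_t read_u32(const uint8_t *p) {
	return p[0] | (p[1] << 8) | (p[2] << 16) | ((uint32_t)p[3] << 24); // Little-endian.
}

// Returns the decoded contents block, or an empty vector on error
// (a sketch-level simplification; real code would report the error).
std::vector<uint8_t> read_gdc_contents(const std::vector<uint8_t> &file) {
	if (file.size() < 12 || memcmp(file.data(), "GDSC", 4) != 0) {
		return {};
	}
	uint32_t decompressed_size = read_u32(&file[8]);
	if (decompressed_size == 0) {
		// Stored uncompressed: contents follow the header directly.
		return std::vector<uint8_t>(file.begin() + 12, file.end());
	}
	std::vector<uint8_t> out(decompressed_size);
	size_t n = ZSTD_decompress(out.data(), out.size(), file.data() + 12, file.size() - 12);
	if (ZSTD_isError(n) || n != decompressed_size) {
		return {}; // Mirrors the patch's "result != decompressed_size" check.
	}
	return out;
}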