From b4d0a09f15c60c88bbf516d2f6dcdb451dcad9c7 Mon Sep 17 00:00:00 2001
From: George Marques <george@gmarqu.es>
Date: Mon, 22 Jan 2024 11:31:55 -0300
Subject: GDScript: Reintroduce binary tokenization on export

This adds back a function available in 3.x: exporting the GDScript
files in a binary form by converting the tokens recognized by the
tokenizer into a data format.

It is enabled by default on export but can be manually disabled. The
format helps with loading times since, the tokens are easily
reconstructed, and with hiding the source code, since recovering it
would require a specialized tool. Code comments are not stored in this
format.

The `--test` command can also include a `--use-binary-tokens` flag
which will run the GDScript tests with the binary format instead of the
regular source code by converting them in-memory before the test runs.
---
 modules/gdscript/gdscript_parser.cpp | 104 ++++++++++++++++++++++++++---------
 1 file changed, 77 insertions(+), 27 deletions(-)

(limited to 'modules/gdscript/gdscript_parser.cpp')

diff --git a/modules/gdscript/gdscript_parser.cpp b/modules/gdscript/gdscript_parser.cpp
index 129d62cabd..a0036d38d6 100644
--- a/modules/gdscript/gdscript_parser.cpp
+++ b/modules/gdscript/gdscript_parser.cpp
@@ -31,6 +31,7 @@
 #include "gdscript_parser.h"
 
 #include "gdscript.h"
+#include "gdscript_tokenizer_buffer.h"
 
 #include "core/config/project_settings.h"
 #include "core/io/file_access.h"
@@ -226,7 +227,7 @@ void GDScriptParser::make_completion_context(CompletionType p_type, Node *p_node
 	if (!for_completion || (!p_force && completion_context.type != COMPLETION_NONE)) {
 		return;
 	}
-	if (previous.cursor_place != GDScriptTokenizer::CURSOR_MIDDLE && previous.cursor_place != GDScriptTokenizer::CURSOR_END && current.cursor_place == GDScriptTokenizer::CURSOR_NONE) {
+	if (previous.cursor_place != GDScriptTokenizerText::CURSOR_MIDDLE && previous.cursor_place != GDScriptTokenizerText::CURSOR_END && current.cursor_place == GDScriptTokenizerText::CURSOR_NONE) {
 		return;
 	}
 	CompletionContext context;
@@ -234,7 +235,7 @@ void GDScriptParser::make_completion_context(CompletionType p_type, Node *p_node
 	context.current_class = current_class;
 	context.current_function = current_function;
 	context.current_suite = current_suite;
-	context.current_line = tokenizer.get_cursor_line();
+	context.current_line = tokenizer->get_cursor_line();
 	context.current_argument = p_argument;
 	context.node = p_node;
 	completion_context = context;
@@ -244,7 +245,7 @@ void GDScriptParser::make_completion_context(CompletionType p_type, Variant::Typ
 	if (!for_completion || (!p_force && completion_context.type != COMPLETION_NONE)) {
 		return;
 	}
-	if (previous.cursor_place != GDScriptTokenizer::CURSOR_MIDDLE && previous.cursor_place != GDScriptTokenizer::CURSOR_END && current.cursor_place == GDScriptTokenizer::CURSOR_NONE) {
+	if (previous.cursor_place != GDScriptTokenizerText::CURSOR_MIDDLE && previous.cursor_place != GDScriptTokenizerText::CURSOR_END && current.cursor_place == GDScriptTokenizerText::CURSOR_NONE) {
 		return;
 	}
 	CompletionContext context;
@@ -252,7 +253,7 @@ void GDScriptParser::make_completion_context(CompletionType p_type, Variant::Typ
 	context.current_class = current_class;
 	context.current_function = current_function;
 	context.current_suite = current_suite;
-	context.current_line = tokenizer.get_cursor_line();
+	context.current_line = tokenizer->get_cursor_line();
 	context.builtin_type = p_builtin_type;
 	completion_context = context;
 }
@@ -265,7 +266,7 @@ void GDScriptParser::push_completion_call(Node *p_call) {
 	call.call = p_call;
 	call.argument = 0;
 	completion_call_stack.push_back(call);
-	if (previous.cursor_place == GDScriptTokenizer::CURSOR_MIDDLE || previous.cursor_place == GDScriptTokenizer::CURSOR_END || current.cursor_place == GDScriptTokenizer::CURSOR_BEGINNING) {
+	if (previous.cursor_place == GDScriptTokenizerText::CURSOR_MIDDLE || previous.cursor_place == GDScriptTokenizerText::CURSOR_END || current.cursor_place == GDScriptTokenizerText::CURSOR_BEGINNING) {
 		completion_call = call;
 	}
 }
@@ -328,17 +329,21 @@ Error GDScriptParser::parse(const String &p_source_code, const String &p_script_
 		source = source.replace_first(String::chr(0xFFFF), String());
 	}
 
-	tokenizer.set_source_code(source);
-	tokenizer.set_cursor_position(cursor_line, cursor_column);
+	GDScriptTokenizerText *text_tokenizer = memnew(GDScriptTokenizerText);
+	text_tokenizer->set_source_code(source);
+
+	tokenizer = text_tokenizer;
+
+	tokenizer->set_cursor_position(cursor_line, cursor_column);
 	script_path = p_script_path.simplify_path();
-	current = tokenizer.scan();
+	current = tokenizer->scan();
 	// Avoid error or newline as the first token.
 	// The latter can mess with the parser when opening files filled exclusively with comments and newlines.
 	while (current.type == GDScriptTokenizer::Token::ERROR || current.type == GDScriptTokenizer::Token::NEWLINE) {
 		if (current.type == GDScriptTokenizer::Token::ERROR) {
 			push_error(current.literal);
 		}
-		current = tokenizer.scan();
+		current = tokenizer->scan();
 	}
 
 #ifdef DEBUG_ENABLED
@@ -359,6 +364,8 @@ Error GDScriptParser::parse(const String &p_source_code, const String &p_script_
 	parse_program();
 	pop_multiline();
 
+	memdelete(text_tokenizer);
+
 #ifdef DEBUG_ENABLED
 	if (multiline_stack.size() > 0) {
 		ERR_PRINT("Parser bug: Imbalanced multiline stack.");
@@ -372,6 +379,39 @@ Error GDScriptParser::parse(const String &p_source_code, const String &p_script_
 	}
 }
 
+Error GDScriptParser::parse_binary(const Vector<uint8_t> &p_binary, const String &p_script_path) {
+	GDScriptTokenizerBuffer *buffer_tokenizer = memnew(GDScriptTokenizerBuffer);
+	Error err = buffer_tokenizer->set_code_buffer(p_binary);
+
+	if (err) {
+		return err;
+	}
+
+	tokenizer = buffer_tokenizer;
+	script_path = p_script_path;
+	current = tokenizer->scan();
+	// Avoid error or newline as the first token.
+	// The latter can mess with the parser when opening files filled exclusively with comments and newlines.
+	while (current.type == GDScriptTokenizer::Token::ERROR || current.type == GDScriptTokenizer::Token::NEWLINE) {
+		if (current.type == GDScriptTokenizer::Token::ERROR) {
+			push_error(current.literal);
+		}
+		current = tokenizer->scan();
+	}
+
+	push_multiline(false); // Keep one for the whole parsing.
+	parse_program();
+	pop_multiline();
+
+	memdelete(buffer_tokenizer);
+
+	if (errors.is_empty()) {
+		return OK;
+	} else {
+		return ERR_PARSE_ERROR;
+	}
+}
+
 GDScriptTokenizer::Token GDScriptParser::advance() {
 	lambda_ended = false; // Empty marker since we're past the end in any case.
 
@@ -379,16 +419,16 @@ GDScriptTokenizer::Token GDScriptParser::advance() {
 		ERR_FAIL_COND_V_MSG(current.type == GDScriptTokenizer::Token::TK_EOF, current, "GDScript parser bug: Trying to advance past the end of stream.");
 	}
 	if (for_completion && !completion_call_stack.is_empty()) {
-		if (completion_call.call == nullptr && tokenizer.is_past_cursor()) {
+		if (completion_call.call == nullptr && tokenizer->is_past_cursor()) {
 			completion_call = completion_call_stack.back()->get();
 			passed_cursor = true;
 		}
 	}
 	previous = current;
-	current = tokenizer.scan();
+	current = tokenizer->scan();
 	while (current.type == GDScriptTokenizer::Token::ERROR) {
 		push_error(current.literal);
-		current = tokenizer.scan();
+		current = tokenizer->scan();
 	}
 	if (previous.type != GDScriptTokenizer::Token::DEDENT) { // `DEDENT` belongs to the next non-empty line.
 		for (Node *n : nodes_in_progress) {
@@ -457,11 +497,11 @@ void GDScriptParser::synchronize() {
 
 void GDScriptParser::push_multiline(bool p_state) {
 	multiline_stack.push_back(p_state);
-	tokenizer.set_multiline_mode(p_state);
+	tokenizer->set_multiline_mode(p_state);
 	if (p_state) {
 		// Consume potential whitespace tokens already waiting in line.
 		while (current.type == GDScriptTokenizer::Token::NEWLINE || current.type == GDScriptTokenizer::Token::INDENT || current.type == GDScriptTokenizer::Token::DEDENT) {
-			current = tokenizer.scan(); // Don't call advance() here, as we don't want to change the previous token.
+			current = tokenizer->scan(); // Don't call advance() here, as we don't want to change the previous token.
 		}
 	}
 }
@@ -469,7 +509,7 @@ void GDScriptParser::push_multiline(bool p_state) {
 void GDScriptParser::pop_multiline() {
 	ERR_FAIL_COND_MSG(multiline_stack.size() == 0, "Parser bug: trying to pop from multiline stack without available value.");
 	multiline_stack.pop_back();
-	tokenizer.set_multiline_mode(multiline_stack.size() > 0 ? multiline_stack.back()->get() : false);
+	tokenizer->set_multiline_mode(multiline_stack.size() > 0 ? multiline_stack.back()->get() : false);
 }
 
 bool GDScriptParser::is_statement_end_token() const {
@@ -588,7 +628,7 @@ void GDScriptParser::parse_program() {
 	complete_extents(head);
 
 #ifdef TOOLS_ENABLED
-	const HashMap<int, GDScriptTokenizer::CommentData> &comments = tokenizer.get_comments();
+	const HashMap<int, GDScriptTokenizer::CommentData> &comments = tokenizer->get_comments();
 	int line = MIN(max_script_doc_line, head->end_line);
 	while (line > 0) {
 		if (comments.has(line) && comments[line].new_line && comments[line].comment.begins_with("##")) {
@@ -597,6 +637,7 @@ void GDScriptParser::parse_program() {
 		}
 		line--;
 	}
+
 #endif // TOOLS_ENABLED
 
 	if (!check(GDScriptTokenizer::Token::TK_EOF)) {
@@ -793,7 +834,7 @@ void GDScriptParser::parse_class_member(T *(GDScriptParser::*p_parse_function)(b
 		if (has_comment(member->start_line, true)) {
 			// Inline doc comment.
 			member->doc_data = parse_class_doc_comment(member->start_line, true);
-		} else if (has_comment(doc_comment_line, true) && tokenizer.get_comments()[doc_comment_line].new_line) {
+		} else if (has_comment(doc_comment_line, true) && tokenizer->get_comments()[doc_comment_line].new_line) {
 			// Normal doc comment. Don't check `min_member_doc_line` because a class ends parsing after its members.
 			// This may not work correctly for cases like `var a; class B`, but it doesn't matter in practice.
 			member->doc_data = parse_class_doc_comment(doc_comment_line);
@@ -802,7 +843,7 @@ void GDScriptParser::parse_class_member(T *(GDScriptParser::*p_parse_function)(b
 		if (has_comment(member->start_line, true)) {
 			// Inline doc comment.
 			member->doc_data = parse_doc_comment(member->start_line, true);
-		} else if (doc_comment_line >= min_member_doc_line && has_comment(doc_comment_line, true) && tokenizer.get_comments()[doc_comment_line].new_line) {
+		} else if (doc_comment_line >= min_member_doc_line && has_comment(doc_comment_line, true) && tokenizer->get_comments()[doc_comment_line].new_line) {
 			// Normal doc comment.
 			member->doc_data = parse_doc_comment(doc_comment_line);
 		}
@@ -1357,7 +1398,7 @@ GDScriptParser::EnumNode *GDScriptParser::parse_enum(bool p_is_static) {
 			if (i == enum_node->values.size() - 1 || enum_node->values[i + 1].line > enum_value_line) {
 				doc_data = parse_doc_comment(enum_value_line, true);
 			}
-		} else if (doc_comment_line >= min_enum_value_doc_line && has_comment(doc_comment_line, true) && tokenizer.get_comments()[doc_comment_line].new_line) {
+		} else if (doc_comment_line >= min_enum_value_doc_line && has_comment(doc_comment_line, true) && tokenizer->get_comments()[doc_comment_line].new_line) {
 			// Normal doc comment.
 			doc_data = parse_doc_comment(doc_comment_line);
 		}
@@ -2346,6 +2387,9 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_identifier(ExpressionNode
 	IdentifierNode *identifier = alloc_node<IdentifierNode>();
 	complete_extents(identifier);
 	identifier->name = previous.get_identifier();
+	if (identifier->name.operator String().is_empty()) {
+		print_line("Empty identifier found.");
+	}
 	identifier->suite = current_suite;
 
 	if (current_suite != nullptr && current_suite->has_local(identifier->name)) {
@@ -3050,7 +3094,7 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_call(ExpressionNode *p_pre
 			// Allow for trailing comma.
 			break;
 		}
-		bool use_identifier_completion = current.cursor_place == GDScriptTokenizer::CURSOR_END || current.cursor_place == GDScriptTokenizer::CURSOR_MIDDLE;
+		bool use_identifier_completion = current.cursor_place == GDScriptTokenizerText::CURSOR_END || current.cursor_place == GDScriptTokenizerText::CURSOR_MIDDLE;
 		ExpressionNode *argument = parse_expression(false);
 		if (argument == nullptr) {
 			push_error(R"(Expected expression as the function argument.)");
@@ -3220,7 +3264,7 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_lambda(ExpressionNode *p_p
 	// Reset the multiline stack since we don't want the multiline mode one in the lambda body.
 	push_multiline(false);
 	if (multiline_context) {
-		tokenizer.push_expression_indented_block();
+		tokenizer->push_expression_indented_block();
 	}
 
 	push_multiline(true); // For the parameters.
@@ -3267,9 +3311,9 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_lambda(ExpressionNode *p_p
 	if (multiline_context) {
 		// If we're in multiline mode, we want to skip the spurious DEDENT and NEWLINE tokens.
 		while (check(GDScriptTokenizer::Token::DEDENT) || check(GDScriptTokenizer::Token::INDENT) || check(GDScriptTokenizer::Token::NEWLINE)) {
-			current = tokenizer.scan(); // Not advance() since we don't want to change the previous token.
+			current = tokenizer->scan(); // Not advance() since we don't want to change the previous token.
 		}
-		tokenizer.pop_expression_indented_block();
+		tokenizer->pop_expression_indented_block();
 	}
 
 	current_function = previous_function;
@@ -3518,20 +3562,20 @@ static String _process_doc_line(const String &p_line, const String &p_text, cons
 }
 
 bool GDScriptParser::has_comment(int p_line, bool p_must_be_doc) {
-	bool has_comment = tokenizer.get_comments().has(p_line);
+	bool has_comment = tokenizer->get_comments().has(p_line);
 	// If there are no comments or if we don't care whether the comment
 	// is a docstring, we have our result.
 	if (!p_must_be_doc || !has_comment) {
 		return has_comment;
 	}
 
-	return tokenizer.get_comments()[p_line].comment.begins_with("##");
+	return tokenizer->get_comments()[p_line].comment.begins_with("##");
 }
 
 GDScriptParser::MemberDocData GDScriptParser::parse_doc_comment(int p_line, bool p_single_line) {
 	ERR_FAIL_COND_V(!has_comment(p_line, true), MemberDocData());
 
-	const HashMap<int, GDScriptTokenizer::CommentData> &comments = tokenizer.get_comments();
+	const HashMap<int, GDScriptTokenizer::CommentData> &comments = tokenizer->get_comments();
 	int line = p_line;
 
 	if (!p_single_line) {
@@ -3580,7 +3624,7 @@ GDScriptParser::MemberDocData GDScriptParser::parse_doc_comment(int p_line, bool
 GDScriptParser::ClassDocData GDScriptParser::parse_class_doc_comment(int p_line, bool p_single_line) {
 	ERR_FAIL_COND_V(!has_comment(p_line, true), ClassDocData());
 
-	const HashMap<int, GDScriptTokenizer::CommentData> &comments = tokenizer.get_comments();
+	const HashMap<int, GDScriptTokenizer::CommentData> &comments = tokenizer->get_comments();
 	int line = p_line;
 
 	if (!p_single_line) {
@@ -5027,6 +5071,9 @@ void GDScriptParser::TreePrinter::print_function(FunctionNode *p_function, const
 	for (const AnnotationNode *E : p_function->annotations) {
 		print_annotation(E);
 	}
+	if (p_function->is_static) {
+		push_text("Static ");
+	}
 	push_text(p_context);
 	push_text(" ");
 	if (p_function->identifier) {
@@ -5371,6 +5418,9 @@ void GDScriptParser::TreePrinter::print_variable(VariableNode *p_variable) {
 		print_annotation(E);
 	}
 
+	if (p_variable->is_static) {
+		push_text("Static ");
+	}
 	push_text("Variable ");
 	print_identifier(p_variable->identifier);
 
-- 
cgit v1.2.3


From 72e5f8c31e9237a4581cf250aa9943a4290f4017 Mon Sep 17 00:00:00 2001
From: George Marques <george@gmarqu.es>
Date: Fri, 26 Jan 2024 14:49:31 -0300
Subject: GDScript: Enable compression on export

Besides the regular option to export GDScript as binary tokens, this
also includes a compression option on top of it. The binary format
needs to encode some information which generally makes it bigger than
the source text. This option reduces that difference by using Zstandard
compression on the buffer.
---
 modules/gdscript/gdscript_parser.cpp | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'modules/gdscript/gdscript_parser.cpp')

diff --git a/modules/gdscript/gdscript_parser.cpp b/modules/gdscript/gdscript_parser.cpp
index a0036d38d6..3ba6e4d160 100644
--- a/modules/gdscript/gdscript_parser.cpp
+++ b/modules/gdscript/gdscript_parser.cpp
@@ -365,6 +365,7 @@ Error GDScriptParser::parse(const String &p_source_code, const String &p_script_
 	pop_multiline();
 
 	memdelete(text_tokenizer);
+	tokenizer = nullptr;
 
 #ifdef DEBUG_ENABLED
 	if (multiline_stack.size() > 0) {
@@ -384,6 +385,7 @@ Error GDScriptParser::parse_binary(const Vector<uint8_t> &p_binary, const String
 	Error err = buffer_tokenizer->set_code_buffer(p_binary);
 
 	if (err) {
+		memdelete(buffer_tokenizer);
 		return err;
 	}
 
@@ -404,6 +406,7 @@ Error GDScriptParser::parse_binary(const Vector<uint8_t> &p_binary, const String
 	pop_multiline();
 
 	memdelete(buffer_tokenizer);
+	tokenizer = nullptr;
 
 	if (errors.is_empty()) {
 		return OK;
-- 
cgit v1.2.3