27 files changed, 817 insertions, 168 deletions
diff --git a/core/config/project_settings.cpp b/core/config/project_settings.cpp
index e59f79fcc8..37a2608c10 100644
--- a/core/config/project_settings.cpp
+++ b/core/config/project_settings.cpp
@@ -1491,7 +1491,7 @@ ProjectSettings::ProjectSettings() {
 
 	PackedStringArray extensions;
 	extensions.push_back("gd");
-	if (Engine::get_singleton()->has_singleton("GodotSharp")) {
+	if (ClassDB::class_exists("CSharpScript")) {
 		extensions.push_back("cs");
 	}
 	extensions.push_back("gdshader");
diff --git a/core/debugger/remote_debugger.cpp b/core/debugger/remote_debugger.cpp
index bd30da3047..e2ed7245a2 100644
--- a/core/debugger/remote_debugger.cpp
+++ b/core/debugger/remote_debugger.cpp
@@ -39,6 +39,7 @@
 #include "core/io/resource_loader.h"
 #include "core/object/script_language.h"
 #include "core/os/os.h"
+#include "servers/display_server.h"
 
 class RemoteDebugger::PerformanceProfiler : public EngineProfiler {
 	Object *performance = nullptr;
@@ -539,7 +540,7 @@ void RemoteDebugger::debug(bool p_can_continue, bool p_is_error_breakpoint) {
 			OS::get_singleton()->delay_usec(10000);
 			if (Thread::get_caller_id() == Thread::get_main_id()) {
 				// If this is a busy loop on the main thread, events still need to be processed.
-				OS::get_singleton()->process_and_drop_events();
+				DisplayServer::get_singleton()->force_process_and_drop_events();
 			}
 		}
 	}
diff --git a/core/error/error_macros.cpp b/core/error/error_macros.cpp
index 8376c0aaf8..813ee7684f 100644
--- a/core/error/error_macros.cpp
+++ b/core/error/error_macros.cpp
@@ -34,6 +34,12 @@
 #include "core/os/os.h"
 #include "core/string/ustring.h"
 
+// Optional physics interpolation warnings try to include the path to the relevant node.
+#if defined(DEBUG_ENABLED) && defined(TOOLS_ENABLED)
+#include "core/config/project_settings.h"
+#include "scene/main/node.h"
+#endif
+
 static ErrorHandlerList *error_handler_list = nullptr;
 
 void add_error_handler(ErrorHandlerList *p_handler) {
@@ -128,3 +134,48 @@ void _err_print_index_error(const char *p_function, const char *p_file, int p_li
 void _err_flush_stdout() {
 	fflush(stdout);
 }
+
+// Rate-limited physics interpolation warning; does nothing unless both DEBUG_ENABLED and TOOLS_ENABLED are defined.
+void _physics_interpolation_warning(const char *p_function, const char *p_file, int p_line, ObjectID p_id, const char *p_warn_string) {
+#if defined(DEBUG_ENABLED) && defined(TOOLS_ENABLED)
+	const uint32_t warn_max = 2048; // Number of calls counted down before a warning may be printed.
+	const uint32_t warn_timeout_seconds = 15; // Added to the current time to get the next allowed warning time.
+
+	static uint32_t warn_count = warn_max; // Function-local static, so the countdown is shared by every caller.
+	static uint32_t warn_timeout = warn_timeout_seconds; // Earliest time (in whole seconds of ticks) for the next warning.
+
+	uint32_t time_now = UINT32_MAX;
+
+	if (warn_count) {
+		warn_count--;
+	}
+
+	if (!warn_count) { // The clock is only queried once the countdown has reached zero.
+		time_now = OS::get_singleton()->get_ticks_msec() / 1000;
+	}
+
+	if ((warn_count == 0) && (time_now >= warn_timeout)) {
+		warn_count = warn_max; // Both limits are re-armed even if the project setting below disables printing.
+		warn_timeout = time_now + warn_timeout_seconds;
+
+		if (GLOBAL_GET("debug/settings/physics_interpolation/enable_warnings")) {
+			// UINT64_MAX means no object was supplied, so the message is printed without a node path.
+			if (p_id.operator uint64_t() == UINT64_MAX) {
+				_err_print_error(p_function, p_file, p_line, "[Physics interpolation] " + String(p_warn_string) + " (possibly benign).", false, ERR_HANDLER_WARNING);
+			} else {
+				String node_name; // Stays empty when p_id is not valid.
+				if (p_id.is_valid()) {
+					Node *node = Object::cast_to<Node>(ObjectDB::get_instance(p_id));
+					if (node && node->is_inside_tree()) {
+						node_name = "\"" + String(node->get_path()) + "\"";
+					} else { // No Node could be resolved, or it is not inside the tree.
+						node_name = "\"unknown\"";
+					}
+				}
+
+				_err_print_error(p_function, p_file, p_line, "[Physics interpolation] " + String(p_warn_string) + ": " + node_name + " (possibly benign).", false, ERR_HANDLER_WARNING);
+			}
+		}
+	}
+#endif
+}
diff --git a/core/error/error_macros.h b/core/error/error_macros.h
index ab7dbcbd44..d31adb72be 100644
--- a/core/error/error_macros.h
+++ b/core/error/error_macros.h
@@ -31,6 +31,7 @@
 #ifndef ERROR_MACROS_H
 #define ERROR_MACROS_H
 
+#include "core/object/object_id.h"
 #include "core/typedefs.h"
 
 #include <atomic> // We'd normally use safe_refcount.h, but that would cause circular includes.
@@ -71,6 +72,8 @@ void _err_print_index_error(const char *p_function, const char *p_file, int p_li
 void _err_print_index_error(const char *p_function, const char *p_file, int p_line, int64_t p_index, int64_t p_size, const char *p_index_str, const char *p_size_str, const String &p_message, bool p_editor_notify = false, bool fatal = false);
 void _err_flush_stdout();
 
+void _physics_interpolation_warning(const char *p_function, const char *p_file, int p_line, ObjectID p_id, const char *p_warn_string);
+
 #ifdef __GNUC__
 //#define FUNCTION_STR __PRETTY_FUNCTION__ - too annoying
 #define FUNCTION_STR __FUNCTION__
@@ -832,4 +835,14 @@ void _err_flush_stdout();
 #define DEV_CHECK_ONCE(m_cond)
 #endif
 
+/**
+ * Physics interpolation warnings, rate-limited by `_physics_interpolation_warning()` to prevent error spam.
+ * The NODE variant takes the ObjectID of the node to report; the plain variant passes UINT64_MAX (no object).
+ */
+#define PHYSICS_INTERPOLATION_NODE_WARNING(m_object_id, m_string) \
+	_physics_interpolation_warning(FUNCTION_STR, __FILE__, __LINE__, m_object_id, m_string)
+
+#define PHYSICS_INTERPOLATION_WARNING(m_string) \
+	_physics_interpolation_warning(FUNCTION_STR, __FILE__, __LINE__, UINT64_MAX, m_string)
+
 #endif // ERROR_MACROS_H
diff --git a/core/extension/gdextension.cpp b/core/extension/gdextension.cpp
index 47628e4ea0..8e2366fc95 100644
--- a/core/extension/gdextension.cpp
+++ b/core/extension/gdextension.cpp
@@ -795,7 +795,7 @@ Error GDExtension::open_library(const String &p_path, const String &p_entry_symb
 		// because that's what we want to check to see if it's changed.
 		library_path = actual_lib_path.get_base_dir().path_join(p_path.get_file());
 	} else {
-		library_path = p_path;
+		library_path = actual_lib_path;
 	}
 
 	ERR_FAIL_COND_V_MSG(err == ERR_FILE_NOT_FOUND, err, "GDExtension dynamic library not found: " + abs_path);
diff --git a/core/extension/gdextension_interface.h b/core/extension/gdextension_interface.h
index d6c1df9c00..fce377f967 100644
--- a/core/extension/gdextension_interface.h
+++ b/core/extension/gdextension_interface.h
@@ -2800,12 +2800,16 @@ typedef void (*GDExtensionInterfaceClassdbRegisterExtensionClassVirtualMethod)(G
  *
  * Registers an integer constant on an extension class in the ClassDB.
  *
+ * Note about registering bitfield values (if p_is_bitfield is true): even though p_constant_value is signed, language bindings are
+ * advised to treat bitfields as uint64_t, since this is generally clearer and can prevent mistakes like using -1 for setting all bits.
+ * Language APIs should thus provide an abstraction that registers bitfields (uint64_t) separately from regular constants (int64_t).
+ *
  * @param p_library A pointer the library received by the GDExtension's entry point function.
  * @param p_class_name A pointer to a StringName with the class name.
  * @param p_enum_name A pointer to a StringName with the enum name.
  * @param p_constant_name A pointer to a StringName with the constant name.
  * @param p_constant_value The constant value.
- * @param p_is_bitfield Whether or not this is a bit field.
+ * @param p_is_bitfield Whether or not this constant is part of a bitfield.
  */
 typedef void (*GDExtensionInterfaceClassdbRegisterExtensionClassIntegerConstant)(GDExtensionClassLibraryPtr p_library, GDExtensionConstStringNamePtr p_class_name, GDExtensionConstStringNamePtr p_enum_name, GDExtensionConstStringNamePtr p_constant_name, GDExtensionInt p_constant_value, GDExtensionBool p_is_bitfield);
 
diff --git a/core/input/input.cpp b/core/input/input.cpp
index ec0303df06..91378591b0 100644
--- a/core/input/input.cpp
+++ b/core/input/input.cpp
@@ -890,8 +890,9 @@ void Input::action_press(const StringName &p_action, float p_strength) {
 	// Create or retrieve existing action.
 	ActionState &action_state = action_states[p_action];
 
+	// As input may come in part way through a physics tick, the earliest we can react to it is the next physics tick.
 	if (!action_state.cache.pressed) {
-		action_state.pressed_physics_frame = Engine::get_singleton()->get_physics_frames();
+		action_state.pressed_physics_frame = Engine::get_singleton()->get_physics_frames() + 1;
 		action_state.pressed_process_frame = Engine::get_singleton()->get_process_frames();
 	}
 	action_state.exact = true;
@@ -908,7 +909,8 @@ void Input::action_release(const StringName &p_action) {
 	action_state.cache.pressed = 0;
 	action_state.cache.strength = 0.0;
 	action_state.cache.raw_strength = 0.0;
-	action_state.released_physics_frame = Engine::get_singleton()->get_physics_frames();
+	// As input may come in part way through a physics tick, the earliest we can react to it is the next physics tick.
+	action_state.released_physics_frame = Engine::get_singleton()->get_physics_frames() + 1;
 	action_state.released_process_frame = Engine::get_singleton()->get_process_frames();
 	action_state.device_states.clear();
 	action_state.exact = true;
@@ -1023,7 +1025,7 @@ void Input::parse_input_event(const Ref<InputEvent> &p_event) {
 		if (buffered_events.is_empty() || !buffered_events.back()->get()->accumulate(p_event)) {
 			buffered_events.push_back(p_event);
 		}
-	} else if (use_input_buffering) {
+	} else if (agile_input_event_flushing) {
 		buffered_events.push_back(p_event);
 	} else {
 		_parse_input_event_impl(p_event, false);
@@ -1054,12 +1056,12 @@ void Input::flush_buffered_events() {
 	}
 }
 
-bool Input::is_using_input_buffering() {
-	return use_input_buffering;
+bool Input::is_agile_input_event_flushing() {
+	return agile_input_event_flushing; // When true, parse_input_event() may queue events in buffered_events instead of parsing them immediately.
 }
 
-void Input::set_use_input_buffering(bool p_enable) {
-	use_input_buffering = p_enable;
+void Input::set_agile_input_event_flushing(bool p_enable) {
+	agile_input_event_flushing = p_enable; // Only stores the flag; it is read by parse_input_event() on later events.
 }
 
 void Input::set_use_accumulated_input(bool p_enable) {
diff --git a/core/input/input.h b/core/input/input.h
index 4daea0c9e8..89e48f53d7 100644
--- a/core/input/input.h
+++ b/core/input/input.h
@@ -128,7 +128,7 @@ private:
 
 	bool emulate_touch_from_mouse = false;
 	bool emulate_mouse_from_touch = false;
-	bool use_input_buffering = false;
+	bool agile_input_event_flushing = false;
 	bool use_accumulated_input = true;
 
 	int mouse_from_touch_index = -1;
@@ -367,8 +367,8 @@ public:
 	void flush_frame_parsed_events();
 #endif
 	void flush_buffered_events();
-	bool is_using_input_buffering();
-	void set_use_input_buffering(bool p_enable);
+	bool is_agile_input_event_flushing();
+	void set_agile_input_event_flushing(bool p_enable);
 	void set_use_accumulated_input(bool p_enable);
 	bool is_using_accumulated_input();
 
diff --git a/core/io/file_access_pack.cpp b/core/io/file_access_pack.cpp
index 991b94db38..02bf0a6039 100644
--- a/core/io/file_access_pack.cpp
+++ b/core/io/file_access_pack.cpp
@@ -119,6 +119,10 @@ void PackedData::_free_packed_dirs(PackedDir *p_dir) {
 }
 
 PackedData::~PackedData() {
+	if (singleton == this) {
+		singleton = nullptr;
+	}
+
 	for (int i = 0; i < sources.size(); i++) {
 		memdelete(sources[i]);
 	}
diff --git a/core/io/image.cpp b/core/io/image.cpp
index 4b1188ad47..b35d405662 100644
--- a/core/io/image.cpp
+++ b/core/io/image.cpp
@@ -300,10 +300,10 @@ int Image::get_format_block_size(Format p_format) {
 	return 1;
 }
 
-void Image::_get_mipmap_offset_and_size(int p_mipmap, int &r_offset, int &r_width, int &r_height) const {
+void Image::_get_mipmap_offset_and_size(int p_mipmap, int64_t &r_offset, int &r_width, int &r_height) const {
 	int w = width;
 	int h = height;
-	int ofs = 0;
+	int64_t ofs = 0;
 
 	int pixel_size = get_format_pixel_size(format);
 	int pixel_rshift = get_format_pixel_rshift(format);
@@ -315,7 +315,7 @@ void Image::_get_mipmap_offset_and_size(int p_mipmap, int &r_offset, int &r_widt
 		int bw = w % block != 0 ? w + (block - w % block) : w;
 		int bh = h % block != 0 ? h + (block - h % block) : h;
 
-		int s = bw * bh;
+		int64_t s = bw * bh;
 
 		s *= pixel_size;
 		s >>= pixel_rshift;
@@ -329,37 +329,30 @@ void Image::_get_mipmap_offset_and_size(int p_mipmap, int &r_offset, int &r_widt
 	r_height = h;
 }
 
-int Image::get_mipmap_offset(int p_mipmap) const {
+int64_t Image::get_mipmap_offset(int p_mipmap) const { // Returns where mipmap level p_mipmap begins in data; the index check below yields -1 on failure.
 	ERR_FAIL_INDEX_V(p_mipmap, get_mipmap_count() + 1, -1);
 
-	int ofs, w, h;
+	int64_t ofs;
+	int w, h; // Required by the helper's signature; not used here.
 	_get_mipmap_offset_and_size(p_mipmap, ofs, w, h);
 	return ofs;
 }
 
-int Image::get_mipmap_byte_size(int p_mipmap) const {
-	ERR_FAIL_INDEX_V(p_mipmap, get_mipmap_count() + 1, -1);
-
-	int ofs, w, h;
-	_get_mipmap_offset_and_size(p_mipmap, ofs, w, h);
-	int ofs2;
-	_get_mipmap_offset_and_size(p_mipmap + 1, ofs2, w, h);
-	return ofs2 - ofs;
-}
-
-void Image::get_mipmap_offset_and_size(int p_mipmap, int &r_ofs, int &r_size) const {
-	int ofs, w, h;
+void Image::get_mipmap_offset_and_size(int p_mipmap, int64_t &r_ofs, int64_t &r_size) const { // Outputs the offset of level p_mipmap and its size; p_mipmap is not range-checked here.
+	int64_t ofs;
+	int w, h; // Required by the helper's signature; not used here.
 	_get_mipmap_offset_and_size(p_mipmap, ofs, w, h);
-	int ofs2;
+	int64_t ofs2; // Offset of the next level; the size is the distance between the two offsets.
 	_get_mipmap_offset_and_size(p_mipmap + 1, ofs2, w, h);
 	r_ofs = ofs;
 	r_size = ofs2 - ofs;
 }
 
-void Image::get_mipmap_offset_size_and_dimensions(int p_mipmap, int &r_ofs, int &r_size, int &w, int &h) const {
-	int ofs;
+void Image::get_mipmap_offset_size_and_dimensions(int p_mipmap, int64_t &r_ofs, int64_t &r_size, int &w, int &h) const {
+	int64_t ofs;
 	_get_mipmap_offset_and_size(p_mipmap, ofs, w, h);
-	int ofs2, w2, h2;
+	int64_t ofs2;
+	int w2, h2;
 	_get_mipmap_offset_and_size(p_mipmap + 1, ofs2, w2, h2);
 	r_ofs = ofs;
 	r_size = ofs2 - ofs;
@@ -508,6 +501,38 @@ static void _convert(int p_width, int p_height, const uint8_t *p_src, uint8_t *p
 	}
 }
 
+template <typename T, uint32_t read_channels, uint32_t write_channels, T def_zero, T def_one>
+static void _convert_fast(int p_width, int p_height, const T *p_src, T *p_dst) { // Copies pixels between layouts that share component type T but differ in channel count.
+	uint32_t dst_count = 0; // Element (not byte) offset of the current pixel in p_dst.
+	uint32_t src_count = 0; // Element (not byte) offset of the current pixel in p_src.
+
+	const int resolution = p_width * p_height;
+
+	for (int i = 0; i < resolution; i++) {
+		memcpy(p_dst + dst_count, p_src + src_count, MIN(read_channels, write_channels) * sizeof(T)); // Copy the channels both layouts have; extra source channels are dropped.
+
+		if constexpr (write_channels > read_channels) {
+			const T def_value[4] = { def_zero, def_zero, def_zero, def_one }; // Per-channel defaults; only the fourth channel uses def_one. Assumes write_channels <= 4.
+			memcpy(p_dst + dst_count + read_channels, &def_value[read_channels], (write_channels - read_channels) * sizeof(T)); // Fill the channels the source lacks.
+		}
+
+		dst_count += write_channels;
+		src_count += read_channels;
+	}
+}
+
+static bool _are_formats_compatible(Image::Format p_format0, Image::Format p_format1) { // Tells Image::convert() whether both formats lie in the same one of three enum ranges.
+	if (p_format0 <= Image::FORMAT_RGBA8 && p_format1 <= Image::FORMAT_RGBA8) { // Both at or below FORMAT_RGBA8.
+		return true;
+	} else if (p_format0 <= Image::FORMAT_RGBAH && p_format0 >= Image::FORMAT_RH && p_format1 <= Image::FORMAT_RGBAH && p_format1 >= Image::FORMAT_RH) { // Both within FORMAT_RH..FORMAT_RGBAH.
+		return true;
+	} else if (p_format0 <= Image::FORMAT_RGBAF && p_format0 >= Image::FORMAT_RF && p_format1 <= Image::FORMAT_RGBAF && p_format1 >= Image::FORMAT_RF) { // Both within FORMAT_RF..FORMAT_RGBAF.
+		return true;
+	}
+
+	return false; // Different ranges: Image::convert() then uses its slower per-pixel path.
+}
+
 void Image::convert(Format p_new_format) {
 	ERR_FAIL_INDEX_MSG(p_new_format, FORMAT_MAX, "The Image format specified (" + itos(p_new_format) + ") is out of range. See Image's Format enum.");
 	if (data.size() == 0) {
@@ -524,7 +549,7 @@ void Image::convert(Format p_new_format) {
 	if (Image::is_format_compressed(format) || Image::is_format_compressed(p_new_format)) {
 		ERR_FAIL_MSG("Cannot convert to <-> from compressed formats. Use compress() and decompress() instead.");
 
-	} else if (format > FORMAT_RGBA8 || p_new_format > FORMAT_RGBA8) {
+	} else if (!_are_formats_compatible(format, p_new_format)) {
 		//use put/set pixel which is slower but works with non byte formats
 		Image new_img(width, height, mipmaps, p_new_format);
 
@@ -538,8 +563,8 @@ void Image::convert(Format p_new_format) {
 				}
 			}
 
-			int mip_offset = 0;
-			int mip_size = 0;
+			int64_t mip_offset = 0;
+			int64_t mip_size = 0;
 			new_img.get_mipmap_offset_and_size(mip, mip_offset, mip_size);
 
 			memcpy(new_img.data.ptrw() + mip_offset, new_mip->data.ptr(), mip_size);
@@ -555,8 +580,8 @@ void Image::convert(Format p_new_format) {
 	int conversion_type = format | p_new_format << 8;
 
 	for (int mip = 0; mip < mipmap_count; mip++) {
-		int mip_offset = 0;
-		int mip_size = 0;
+		int64_t mip_offset = 0;
+		int64_t mip_size = 0;
 		int mip_width = 0;
 		int mip_height = 0;
 		get_mipmap_offset_size_and_dimensions(mip, mip_offset, mip_size, mip_width, mip_height);
@@ -655,6 +680,78 @@ void Image::convert(Format p_new_format) {
 			case FORMAT_RGBA8 | (FORMAT_RGB8 << 8):
 				_convert<3, true, 3, false, false, false>(mip_width, mip_height, rptr, wptr);
 				break;
+			case FORMAT_RH | (FORMAT_RGH << 8):
+				_convert_fast<uint16_t, 1, 2, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RH | (FORMAT_RGBH << 8):
+				_convert_fast<uint16_t, 1, 3, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RH | (FORMAT_RGBAH << 8):
+				_convert_fast<uint16_t, 1, 4, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGH | (FORMAT_RH << 8):
+				_convert_fast<uint16_t, 2, 1, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGH | (FORMAT_RGBH << 8):
+				_convert_fast<uint16_t, 2, 3, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGH | (FORMAT_RGBAH << 8):
+				_convert_fast<uint16_t, 2, 4, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBH | (FORMAT_RH << 8):
+				_convert_fast<uint16_t, 3, 1, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBH | (FORMAT_RGH << 8):
+				_convert_fast<uint16_t, 3, 2, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBH | (FORMAT_RGBAH << 8):
+				_convert_fast<uint16_t, 3, 4, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBAH | (FORMAT_RH << 8):
+				_convert_fast<uint16_t, 4, 1, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBAH | (FORMAT_RGH << 8):
+				_convert_fast<uint16_t, 4, 2, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBAH | (FORMAT_RGBH << 8):
+				_convert_fast<uint16_t, 4, 3, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RF | (FORMAT_RGF << 8):
+				_convert_fast<uint32_t, 1, 2, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RF | (FORMAT_RGBF << 8):
+				_convert_fast<uint32_t, 1, 3, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RF | (FORMAT_RGBAF << 8):
+				_convert_fast<uint32_t, 1, 4, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGF | (FORMAT_RF << 8):
+				_convert_fast<uint32_t, 2, 1, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGF | (FORMAT_RGBF << 8):
+				_convert_fast<uint32_t, 2, 3, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGF | (FORMAT_RGBAF << 8):
+				_convert_fast<uint32_t, 2, 4, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBF | (FORMAT_RF << 8):
+				_convert_fast<uint32_t, 3, 1, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBF | (FORMAT_RGF << 8):
+				_convert_fast<uint32_t, 3, 2, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBF | (FORMAT_RGBAF << 8):
+				_convert_fast<uint32_t, 3, 4, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBAF | (FORMAT_RF << 8):
+				_convert_fast<uint32_t, 4, 1, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBAF | (FORMAT_RGF << 8):
+				_convert_fast<uint32_t, 4, 2, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBAF | (FORMAT_RGBF << 8):
+				_convert_fast<uint32_t, 4, 3, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
 		}
 	}
 
@@ -1151,7 +1248,7 @@ void Image::resize(int p_width, int p_height, Interpolation p_interpolation) {
 					if (i == 0) {
 						// Read from the first mipmap that will be interpolated
 						// (if both levels are the same, we will not interpolate, but at least we'll sample from the right level)
-						int offs;
+						int64_t offs;
 						_get_mipmap_offset_and_size(mip1, offs, src_width, src_height);
 						src_ptr = r_ptr + offs;
 					} else if (!interpolate_mipmaps) {
@@ -1159,7 +1256,7 @@ void Image::resize(int p_width, int p_height, Interpolation p_interpolation) {
 						break;
 					} else {
 						// Switch to read from the second mipmap that will be interpolated
-						int offs;
+						int64_t offs;
 						_get_mipmap_offset_and_size(mip2, offs, src_width, src_height);
 						src_ptr = r_ptr + offs;
 						// Switch to write to the second destination image
@@ -1599,9 +1696,9 @@ void Image::flip_x() {
 }
 
 /// Get mipmap size and offset.
-int Image::_get_dst_image_size(int p_width, int p_height, Format p_format, int &r_mipmaps, int p_mipmaps, int *r_mm_width, int *r_mm_height) {
+int64_t Image::_get_dst_image_size(int p_width, int p_height, Format p_format, int &r_mipmaps, int p_mipmaps, int *r_mm_width, int *r_mm_height) {
 	// Data offset in mipmaps (including the original texture).
-	int size = 0;
+	int64_t size = 0;
 
 	int w = p_width;
 	int h = p_height;
@@ -1623,7 +1720,7 @@ int Image::_get_dst_image_size(int p_width, int p_height, Format p_format, int &
 		int bw = w % block != 0 ? w + (block - w % block) : w;
 		int bh = h % block != 0 ? h + (block - h % block) : h;
 
-		int s = bw * bh;
+		int64_t s = bw * bh;
 
 		s *= pixsize;
 		s >>= pixshift;
@@ -1837,7 +1934,8 @@ Error Image::generate_mipmaps(bool p_renormalize) {
 	int prev_w = width;
 
 	for (int i = 1; i <= mmcount; i++) {
-		int ofs, w, h;
+		int64_t ofs;
+		int w, h;
 		_get_mipmap_offset_and_size(i, ofs, w, h);
 
 		switch (format) {
@@ -1993,7 +2091,8 @@ Error Image::generate_mipmap_roughness(RoughnessChannel p_roughness_channel, con
 	uint8_t *base_ptr = data.ptrw();
 
 	for (int i = 1; i <= mmcount; i++) {
-		int ofs, w, h;
+		int64_t ofs;
+		int w, h;
 		_get_mipmap_offset_and_size(i, ofs, w, h);
 		uint8_t *ptr = &base_ptr[ofs];
 
@@ -2102,21 +2201,6 @@ Error Image::generate_mipmap_roughness(RoughnessChannel p_roughness_channel, con
 				_set_color_at_ofs(ptr, pixel_ofs, c);
 			}
 		}
-#if 0
-		{
-			int size = get_mipmap_byte_size(i);
-			print_line("size for mimpap " + itos(i) + ": " + itos(size));
-			Vector<uint8_t> imgdata;
-			imgdata.resize(size);
-
-
-			uint8_t* wr = imgdata.ptrw();
-			memcpy(wr.ptr(), ptr, size);
-			wr = uint8_t*();
-			Ref<Image> im = Image::create_from_data(w, h, false, format, imgdata);
-			im->save_png("res://mipmap_" + itos(i) + ".png");
-		}
-#endif
 	}
 
 	return OK;
@@ -2131,7 +2215,8 @@ void Image::clear_mipmaps() {
 		return;
 	}
 
-	int ofs, w, h;
+	int64_t ofs;
+	int w, h;
 	_get_mipmap_offset_and_size(1, ofs, w, h);
 	data.resize(ofs);
 
@@ -2176,7 +2261,7 @@ void Image::initialize_data(int p_width, int p_height, bool p_use_mipmaps, Forma
 	ERR_FAIL_INDEX_MSG(p_format, FORMAT_MAX, "The Image format specified (" + itos(p_format) + ") is out of range. See Image's Format enum.");
 
 	int mm = 0;
-	int size = _get_dst_image_size(p_width, p_height, p_format, mm, p_use_mipmaps ? -1 : 0);
+	int64_t size = _get_dst_image_size(p_width, p_height, p_format, mm, p_use_mipmaps ? -1 : 0);
 	data.resize(size);
 
 	{
@@ -2202,7 +2287,7 @@ void Image::initialize_data(int p_width, int p_height, bool p_use_mipmaps, Forma
 	ERR_FAIL_INDEX_MSG(p_format, FORMAT_MAX, "The Image format specified (" + itos(p_format) + ") is out of range. See Image's Format enum.");
 
 	int mm;
-	int size = _get_dst_image_size(p_width, p_height, p_format, mm, p_use_mipmaps ? -1 : 0);
+	int64_t size = _get_dst_image_size(p_width, p_height, p_format, mm, p_use_mipmaps ? -1 : 0);
 
 	if (unlikely(p_data.size() != size)) {
 		String description_mipmaps = get_format_name(p_format) + " ";
@@ -2405,7 +2490,7 @@ bool Image::is_invisible() const {
 		return false;
 	}
 
-	int len = data.size();
+	int64_t len = data.size();
 
 	if (len == 0) {
 		return true;
@@ -2445,7 +2530,7 @@ bool Image::is_invisible() const {
 }
 
 Image::AlphaMode Image::detect_alpha() const {
-	int len = data.size();
+	int64_t len = data.size();
 
 	if (len == 0) {
 		return ALPHA_NONE;
@@ -2579,7 +2664,7 @@ Vector<uint8_t> Image::save_webp_to_buffer(const bool p_lossy, const float p_qua
 	return save_webp_buffer_func(Ref<Image>((Image *)this), p_lossy, p_quality);
 }
 
-int Image::get_image_data_size(int p_width, int p_height, Format p_format, bool p_mipmaps) {
+int64_t Image::get_image_data_size(int p_width, int p_height, Format p_format, bool p_mipmaps) { // Thin wrapper over _get_dst_image_size(); the mipmap count it reports (mm) is discarded.
 	int mm;
 	return _get_dst_image_size(p_width, p_height, p_format, mm, p_mipmaps ? -1 : 0);
 }
@@ -2597,7 +2682,7 @@ Size2i Image::get_image_mipmap_size(int p_width, int p_height, Format p_format,
 	return ret;
 }
 
-int Image::get_image_mipmap_offset(int p_width, int p_height, Format p_format, int p_mipmap) {
+int64_t Image::get_image_mipmap_offset(int p_width, int p_height, Format p_format, int p_mipmap) {
 	if (p_mipmap <= 0) {
 		return 0;
 	}
@@ -2605,7 +2690,7 @@ int Image::get_image_mipmap_offset(int p_width, int p_height, Format p_format, i
 	return _get_dst_image_size(p_width, p_height, p_format, mm, p_mipmap - 1);
 }
 
-int Image::get_image_mipmap_offset_and_dimensions(int p_width, int p_height, Format p_format, int p_mipmap, int &r_w, int &r_h) {
+int64_t Image::get_image_mipmap_offset_and_dimensions(int p_width, int p_height, Format p_format, int p_mipmap, int &r_w, int &r_h) {
 	if (p_mipmap <= 0) {
 		r_w = p_width;
 		r_h = p_height;
@@ -3642,9 +3727,10 @@ Ref<Image> Image::rgbe_to_srgb() {
 	return new_image;
 }
 
-Ref<Image> Image::get_image_from_mipmap(int p_mipamp) const {
-	int ofs, size, w, h;
-	get_mipmap_offset_size_and_dimensions(p_mipamp, ofs, size, w, h);
+Ref<Image> Image::get_image_from_mipmap(int p_mipmap) const {
+	int64_t ofs, size;
+	int w, h;
+	get_mipmap_offset_size_and_dimensions(p_mipmap, ofs, size, w, h);
 
 	Vector<uint8_t> new_data;
 	new_data.resize(size);
diff --git a/core/io/image.h b/core/io/image.h
index d3ae99954f..d55cc39dbb 100644
--- a/core/io/image.h
+++ b/core/io/image.h
@@ -195,9 +195,9 @@ private:
 		data = p_image.data;
 	}
 
-	_FORCE_INLINE_ void _get_mipmap_offset_and_size(int p_mipmap, int &r_offset, int &r_width, int &r_height) const; //get where the mipmap begins in data
+	_FORCE_INLINE_ void _get_mipmap_offset_and_size(int p_mipmap, int64_t &r_offset, int &r_width, int &r_height) const; //get where the mipmap begins in data
 
-	static int _get_dst_image_size(int p_width, int p_height, Format p_format, int &r_mipmaps, int p_mipmaps = -1, int *r_mm_width = nullptr, int *r_mm_height = nullptr);
+	static int64_t _get_dst_image_size(int p_width, int p_height, Format p_format, int &r_mipmaps, int p_mipmaps = -1, int *r_mm_width = nullptr, int *r_mm_height = nullptr);
 	bool _can_modify(Format p_format) const;
 
 	_FORCE_INLINE_ void _get_clipped_src_and_dest_rects(const Ref<Image> &p_src, const Rect2i &p_src_rect, const Point2i &p_dest, Rect2i &r_clipped_src_rect, Rect2i &r_clipped_dest_rect) const;
@@ -238,10 +238,12 @@ public:
 	 */
 	Format get_format() const;
 
-	int get_mipmap_byte_size(int p_mipmap) const; //get where the mipmap begins in data
-	int get_mipmap_offset(int p_mipmap) const; //get where the mipmap begins in data
-	void get_mipmap_offset_and_size(int p_mipmap, int &r_ofs, int &r_size) const; //get where the mipmap begins in data
-	void get_mipmap_offset_size_and_dimensions(int p_mipmap, int &r_ofs, int &r_size, int &w, int &h) const; //get where the mipmap begins in data
+	/**
+	 * Get where the mipmap begins in data.
+	 */
+	int64_t get_mipmap_offset(int p_mipmap) const;
+	void get_mipmap_offset_and_size(int p_mipmap, int64_t &r_ofs, int64_t &r_size) const;
+	void get_mipmap_offset_size_and_dimensions(int p_mipmap, int64_t &r_ofs, int64_t &r_size, int &w, int &h) const;
 
 	enum Image3DValidateError {
 		VALIDATE_3D_OK,
@@ -351,11 +353,11 @@ public:
 	static int get_format_block_size(Format p_format);
 	static void get_format_min_pixel_size(Format p_format, int &r_w, int &r_h);
 
-	static int get_image_data_size(int p_width, int p_height, Format p_format, bool p_mipmaps = false);
+	static int64_t get_image_data_size(int p_width, int p_height, Format p_format, bool p_mipmaps = false);
 	static int get_image_required_mipmaps(int p_width, int p_height, Format p_format);
 	static Size2i get_image_mipmap_size(int p_width, int p_height, Format p_format, int p_mipmap);
-	static int get_image_mipmap_offset(int p_width, int p_height, Format p_format, int p_mipmap);
-	static int get_image_mipmap_offset_and_dimensions(int p_width, int p_height, Format p_format, int p_mipmap, int &r_w, int &r_h);
+	static int64_t get_image_mipmap_offset(int p_width, int p_height, Format p_format, int p_mipmap);
+	static int64_t get_image_mipmap_offset_and_dimensions(int p_width, int p_height, Format p_format, int p_mipmap, int &r_w, int &r_h);
 
 	enum CompressMode {
 		COMPRESS_S3TC,
@@ -383,7 +385,7 @@ public:
 	void srgb_to_linear();
 	void normal_map_to_xy();
 	Ref<Image> rgbe_to_srgb();
-	Ref<Image> get_image_from_mipmap(int p_mipamp) const;
+	Ref<Image> get_image_from_mipmap(int p_mipmap) const;
 	void bump_map_to_normal_map(float bump_scale = 1.0);
 
 	void blit_rect(const Ref<Image> &p_src, const Rect2i &p_src_rect, const Point2i &p_dest);
diff --git a/core/io/marshalls.cpp b/core/io/marshalls.cpp
index c0d18d0120..67469de5cc 100644
--- a/core/io/marshalls.cpp
+++ b/core/io/marshalls.cpp
@@ -1315,10 +1315,12 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 			if (array.is_typed()) {
 				Ref<Script> script = array.get_typed_script();
 				if (script.is_valid()) {
-					header |= HEADER_DATA_FIELD_TYPED_ARRAY_SCRIPT;
+					header |= p_full_objects ? HEADER_DATA_FIELD_TYPED_ARRAY_SCRIPT : HEADER_DATA_FIELD_TYPED_ARRAY_CLASS_NAME;
 				} else if (array.get_typed_class_name() != StringName()) {
 					header |= HEADER_DATA_FIELD_TYPED_ARRAY_CLASS_NAME;
 				} else {
+					// No need to check `p_full_objects` since for `Variant::OBJECT`
+					// `array.get_typed_class_name()` should be non-empty.
 					header |= HEADER_DATA_FIELD_TYPED_ARRAY_BUILTIN;
 				}
 			}
@@ -1783,12 +1785,18 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				Variant variant = array.get_typed_script();
 				Ref<Script> script = variant;
 				if (script.is_valid()) {
-					String path = script->get_path();
-					ERR_FAIL_COND_V_MSG(path.is_empty() || !path.begins_with("res://"), ERR_UNAVAILABLE, "Failed to encode a path to a custom script for an array type.");
-					_encode_string(path, buf, r_len);
+					if (p_full_objects) {
+						String path = script->get_path();
+						ERR_FAIL_COND_V_MSG(path.is_empty() || !path.begins_with("res://"), ERR_UNAVAILABLE, "Failed to encode a path to a custom script for an array type.");
+						_encode_string(path, buf, r_len);
+					} else {
+						_encode_string(EncodedObjectAsID::get_class_static(), buf, r_len);
+					}
 				} else if (array.get_typed_class_name() != StringName()) {
-					_encode_string(array.get_typed_class_name(), buf, r_len);
+					_encode_string(p_full_objects ? array.get_typed_class_name().operator String() : EncodedObjectAsID::get_class_static(), buf, r_len);
 				} else {
+					// No need to check `p_full_objects` since for `Variant::OBJECT`
+					// `array.get_typed_class_name()` should be non-empty.
 					if (buf) {
 						encode_uint32(array.get_typed_builtin(), buf);
 						buf += 4;
diff --git a/core/io/resource.cpp b/core/io/resource.cpp
index c045c0fc74..432adb88da 100644
--- a/core/io/resource.cpp
+++ b/core/io/resource.cpp
@@ -40,7 +40,12 @@
 #include <stdio.h>
 
 void Resource::emit_changed() {
-	emit_signal(CoreStringName(changed));
+	if (ResourceLoader::is_within_load() && MessageQueue::get_main_singleton() != MessageQueue::get_singleton() && !MessageQueue::get_singleton()->is_flushing()) {
+		// Let the emission happen on the call queue, later, since signals are not thread-safe.
+		call_deferred("emit_signal", CoreStringName(changed));
+	} else {
+		emit_signal(CoreStringName(changed));
+	}
 }
 
 void Resource::_resource_path_changed() {
@@ -161,12 +166,22 @@ bool Resource::editor_can_reload_from_file() {
 }
 
 void Resource::connect_changed(const Callable &p_callable, uint32_t p_flags) {
+	if (ResourceLoader::is_within_load() && MessageQueue::get_main_singleton() != MessageQueue::get_singleton() && !MessageQueue::get_singleton()->is_flushing()) {
+		// Let the check and connection happen on the call queue, later, since signals are not thread-safe.
+		callable_mp(this, &Resource::connect_changed).call_deferred(p_callable, p_flags);
+		return;
+	}
 	if (!is_connected(CoreStringName(changed), p_callable) || p_flags & CONNECT_REFERENCE_COUNTED) {
 		connect(CoreStringName(changed), p_callable, p_flags);
 	}
 }
 
 void Resource::disconnect_changed(const Callable &p_callable) {
+	if (ResourceLoader::is_within_load() && MessageQueue::get_main_singleton() != MessageQueue::get_singleton() && !MessageQueue::get_singleton()->is_flushing()) {
+		// Let the check and disconnection happen on the call queue, later, since signals are not thread-safe.
+		callable_mp(this, &Resource::disconnect_changed).call_deferred(p_callable);
+		return;
+	}
 	if (is_connected(CoreStringName(changed), p_callable)) {
 		disconnect(CoreStringName(changed), p_callable);
 	}
diff --git a/core/io/resource_loader.cpp b/core/io/resource_loader.cpp
index 20dd192da1..928bb95de3 100644
--- a/core/io/resource_loader.cpp
+++ b/core/io/resource_loader.cpp
@@ -304,31 +304,24 @@ void ResourceLoader::_thread_load_function(void *p_userdata) {
 	thread_load_mutex.unlock();
 
 	// Thread-safe either if it's the current thread or a brand new one.
-	thread_local bool mq_override_present = false;
 	CallQueue *own_mq_override = nullptr;
 	if (load_nesting == 0) {
-		mq_override_present = false;
 		load_paths_stack = memnew(Vector<String>);
 
-		if (!load_task.dependent_path.is_empty()) {
-			load_paths_stack->push_back(load_task.dependent_path);
-		}
 		if (!Thread::is_main_thread()) {
 			// Let the caller thread use its own, for added flexibility. Provide one otherwise.
 			if (MessageQueue::get_singleton() == MessageQueue::get_main_singleton()) {
 				own_mq_override = memnew(CallQueue);
 				MessageQueue::set_thread_singleton_override(own_mq_override);
 			}
-			mq_override_present = true;
 			set_current_thread_safe_for_nodes(true);
 		}
-	} else {
-		DEV_ASSERT(load_task.dependent_path.is_empty());
 	}
 	// --
 
-	Ref<Resource> res = _load(load_task.remapped_path, load_task.remapped_path != load_task.local_path ? load_task.local_path : String(), load_task.type_hint, load_task.cache_mode, &load_task.error, load_task.use_sub_threads, &load_task.progress);
-	if (mq_override_present) {
+	Error load_err = OK;
+	Ref<Resource> res = _load(load_task.remapped_path, load_task.remapped_path != load_task.local_path ? load_task.local_path : String(), load_task.type_hint, load_task.cache_mode, &load_err, load_task.use_sub_threads, &load_task.progress);
+	if (MessageQueue::get_singleton() != MessageQueue::get_main_singleton()) {
 		MessageQueue::get_singleton()->flush();
 	}
 
@@ -336,7 +329,8 @@ void ResourceLoader::_thread_load_function(void *p_userdata) {
 
 	load_task.resource = res;
 
-	load_task.progress = 1.0; //it was fully loaded at this point, so force progress to 1.0
+	load_task.progress = 1.0; // It was fully loaded at this point, so force progress to 1.0.
+	load_task.error = load_err;
 	if (load_task.error != OK) {
 		load_task.status = THREAD_LOAD_FAILED;
 	} else {
@@ -351,7 +345,14 @@ void ResourceLoader::_thread_load_function(void *p_userdata) {
 
 	bool ignoring = load_task.cache_mode == ResourceFormatLoader::CACHE_MODE_IGNORE || load_task.cache_mode == ResourceFormatLoader::CACHE_MODE_IGNORE_DEEP;
 	bool replacing = load_task.cache_mode == ResourceFormatLoader::CACHE_MODE_REPLACE || load_task.cache_mode == ResourceFormatLoader::CACHE_MODE_REPLACE_DEEP;
+	bool unlock_pending = true;
 	if (load_task.resource.is_valid()) {
+		// From now on, no critical section needed as no one will write to the task anymore.
+		// Moreover, the mutex being unlocked is a requirement if some of the calls below
+		// that set the resource up invoke code that in turn requests resource loading.
+		thread_load_mutex.unlock();
+		unlock_pending = false;
+
 		if (!ignoring) {
 			if (replacing) {
 				Ref<Resource> old_res = ResourceCache::get_ref(load_task.local_path);
@@ -389,20 +390,28 @@ void ResourceLoader::_thread_load_function(void *p_userdata) {
 			load_task.status = THREAD_LOAD_LOADED;
 			load_task.progress = 1.0;
 
+			thread_load_mutex.unlock();
+			unlock_pending = false;
+
 			if (_loaded_callback) {
 				_loaded_callback(load_task.resource, load_task.local_path);
 			}
 		}
 	}
 
-	thread_load_mutex.unlock();
+	if (unlock_pending) {
+		thread_load_mutex.unlock();
+	}
 
 	if (load_nesting == 0) {
 		if (own_mq_override) {
 			MessageQueue::set_thread_singleton_override(nullptr);
 			memdelete(own_mq_override);
 		}
-		memdelete(load_paths_stack);
+		if (load_paths_stack) {
+			memdelete(load_paths_stack);
+			load_paths_stack = nullptr;
+		}
 	}
 }
 
@@ -465,6 +474,7 @@ Ref<ResourceLoader::LoadToken> ResourceLoader::_load_start(const String &p_path,
 	bool ignoring_cache = p_cache_mode == ResourceFormatLoader::CACHE_MODE_IGNORE || p_cache_mode == ResourceFormatLoader::CACHE_MODE_IGNORE_DEEP;
 
 	Ref<LoadToken> load_token;
+	bool must_not_register = false;
 	ThreadLoadTask unregistered_load_task; // Once set, must be valid up to the call to do the load.
 	ThreadLoadTask *load_task_ptr = nullptr;
 	bool run_on_current_thread = false;
@@ -473,12 +483,13 @@ Ref<ResourceLoader::LoadToken> ResourceLoader::_load_start(const String &p_path,
 
 		if (!ignoring_cache && thread_load_tasks.has(local_path)) {
 			load_token = Ref<LoadToken>(thread_load_tasks[local_path].load_token);
-			if (!load_token.is_valid()) {
+			if (load_token.is_valid()) {
+				return load_token;
+			} else {
 				// The token is dying (reached 0 on another thread).
 				// Ensure it's killed now so the path can be safely reused right away.
 				thread_load_tasks[local_path].load_token->clear();
 			}
-			return load_token;
 		}
 
 		load_token.instantiate();
@@ -506,8 +517,9 @@ Ref<ResourceLoader::LoadToken> ResourceLoader::_load_start(const String &p_path,
 				}
 			}
 
-			// Cache-ignoring tasks aren't registered in the map and so must finish within scope.
-			if (ignoring_cache) {
+			// If we want to ignore cache, but there's another task loading it, we can't add this one to the map and we also have to finish within scope.
+			must_not_register = ignoring_cache && thread_load_tasks.has(local_path);
+			if (must_not_register) {
 				load_token->local_path.clear();
 				unregistered_load_task = load_task;
 				load_task_ptr = &unregistered_load_task;
@@ -518,7 +530,7 @@ Ref<ResourceLoader::LoadToken> ResourceLoader::_load_start(const String &p_path,
 			}
 		}
 
-		run_on_current_thread = ignoring_cache || p_thread_mode == LOAD_THREAD_FROM_CURRENT;
+		run_on_current_thread = must_not_register || p_thread_mode == LOAD_THREAD_FROM_CURRENT;
 
 		if (run_on_current_thread) {
 			load_task_ptr->thread_id = Thread::get_caller_id();
@@ -529,7 +541,7 @@ Ref<ResourceLoader::LoadToken> ResourceLoader::_load_start(const String &p_path,
 
 	if (run_on_current_thread) {
 		_thread_load_function(load_task_ptr);
-		if (ignoring_cache) {
+		if (must_not_register) {
 			load_token->res_if_unregistered = load_task_ptr->resource;
 		}
 	}
@@ -560,39 +572,46 @@ float ResourceLoader::_dependency_get_progress(const String &p_path) {
 }
 
 ResourceLoader::ThreadLoadStatus ResourceLoader::load_threaded_get_status(const String &p_path, float *r_progress) {
-	MutexLock thread_load_lock(thread_load_mutex);
+	bool ensure_progress = false;
+	ThreadLoadStatus status = THREAD_LOAD_IN_PROGRESS;
+	{
+		MutexLock thread_load_lock(thread_load_mutex);
 
-	if (!user_load_tokens.has(p_path)) {
-		print_verbose("load_threaded_get_status(): No threaded load for resource path '" + p_path + "' has been initiated or its result has already been collected.");
-		return THREAD_LOAD_INVALID_RESOURCE;
-	}
+		if (!user_load_tokens.has(p_path)) {
+			print_verbose("load_threaded_get_status(): No threaded load for resource path '" + p_path + "' has been initiated or its result has already been collected.");
+			return THREAD_LOAD_INVALID_RESOURCE;
+		}
 
-	String local_path = _validate_local_path(p_path);
-	if (!thread_load_tasks.has(local_path)) {
+		String local_path = _validate_local_path(p_path);
+		if (!thread_load_tasks.has(local_path)) {
 #ifdef DEV_ENABLED
-		CRASH_NOW();
+			CRASH_NOW();
 #endif
-		// On non-dev, be defensive and at least avoid crashing (at this point at least).
-		return THREAD_LOAD_INVALID_RESOURCE;
-	}
+			// On non-dev, be defensive and at least avoid crashing (at this point at least).
+			return THREAD_LOAD_INVALID_RESOURCE;
+		}
 
-	ThreadLoadTask &load_task = thread_load_tasks[local_path];
-	ThreadLoadStatus status;
-	status = load_task.status;
-	if (r_progress) {
-		*r_progress = _dependency_get_progress(local_path);
-	}
+		ThreadLoadTask &load_task = thread_load_tasks[local_path];
+		status = load_task.status;
+		if (r_progress) {
+			*r_progress = _dependency_get_progress(local_path);
+		}
 
-	// Support userland polling in a loop on the main thread.
-	if (Thread::is_main_thread() && status == THREAD_LOAD_IN_PROGRESS) {
-		uint64_t frame = Engine::get_singleton()->get_process_frames();
-		if (frame == load_task.last_progress_check_main_thread_frame) {
-			_ensure_load_progress();
-		} else {
-			load_task.last_progress_check_main_thread_frame = frame;
+		// Support userland polling in a loop on the main thread.
+		if (Thread::is_main_thread() && status == THREAD_LOAD_IN_PROGRESS) {
+			uint64_t frame = Engine::get_singleton()->get_process_frames();
+			if (frame == load_task.last_progress_check_main_thread_frame) {
+				ensure_progress = true;
+			} else {
+				load_task.last_progress_check_main_thread_frame = frame;
+			}
 		}
 	}
 
+	if (ensure_progress) {
+		_ensure_load_progress();
+	}
+
 	return status;
 }
 
@@ -626,13 +645,13 @@ Ref<Resource> ResourceLoader::load_threaded_get(const String &p_path, Error *r_e
 		if (Thread::is_main_thread() && !load_token->local_path.is_empty()) {
 			const ThreadLoadTask &load_task = thread_load_tasks[load_token->local_path];
 			while (load_task.status == THREAD_LOAD_IN_PROGRESS) {
-				if (!_ensure_load_progress()) {
-					// This local poll loop is not needed.
-					break;
-				}
 				thread_load_lock.~MutexLock();
+				bool exit = !_ensure_load_progress();
 				OS::get_singleton()->delay_usec(1000);
 				new (&thread_load_lock) MutexLock(thread_load_mutex);
+				if (exit) {
+					break;
+				}
 			}
 		}
 
@@ -1290,7 +1309,7 @@ bool ResourceLoader::timestamp_on_load = false;
 
 thread_local int ResourceLoader::load_nesting = 0;
 thread_local WorkerThreadPool::TaskID ResourceLoader::caller_task_id = 0;
-thread_local Vector<String> *ResourceLoader::load_paths_stack;
+thread_local Vector<String> *ResourceLoader::load_paths_stack = nullptr;
 thread_local HashMap<int, HashMap<String, Ref<Resource>>> ResourceLoader::res_ref_overrides;
 
 template <>
diff --git a/core/io/resource_loader.h b/core/io/resource_loader.h
index 46df79ea22..5f1831f0d9 100644
--- a/core/io/resource_loader.h
+++ b/core/io/resource_loader.h
@@ -170,7 +170,6 @@ private:
 		LoadToken *load_token = nullptr;
 		String local_path;
 		String remapped_path;
-		String dependent_path;
 		String type_hint;
 		float progress = 0.0f;
 		float max_reported_progress = 0.0f;
diff --git a/core/math/math_funcs.h b/core/math/math_funcs.h
index 3060f31970..fd53ed28fd 100644
--- a/core/math/math_funcs.h
+++ b/core/math/math_funcs.h
@@ -447,14 +447,22 @@ public:
 
 	static _ALWAYS_INLINE_ double smoothstep(double p_from, double p_to, double p_s) {
 		if (is_equal_approx(p_from, p_to)) {
-			return p_from;
+			if (likely(p_from <= p_to)) {
+				return p_s <= p_from ? 0.0 : 1.0;
+			} else {
+				return p_s <= p_to ? 1.0 : 0.0;
+			}
 		}
 		double s = CLAMP((p_s - p_from) / (p_to - p_from), 0.0, 1.0);
 		return s * s * (3.0 - 2.0 * s);
 	}
 	static _ALWAYS_INLINE_ float smoothstep(float p_from, float p_to, float p_s) {
 		if (is_equal_approx(p_from, p_to)) {
-			return p_from;
+			if (likely(p_from <= p_to)) {
+				return p_s <= p_from ? 0.0f : 1.0f;
+			} else {
+				return p_s <= p_to ? 1.0f : 0.0f;
+			}
 		}
 		float s = CLAMP((p_s - p_from) / (p_to - p_from), 0.0f, 1.0f);
 		return s * s * (3.0f - 2.0f * s);
diff --git a/core/math/transform_interpolator.cpp b/core/math/transform_interpolator.cpp
index 6a564b0ca7..1cd35b3d1a 100644
--- a/core/math/transform_interpolator.cpp
+++ b/core/math/transform_interpolator.cpp
@@ -31,6 +31,7 @@
 #include "transform_interpolator.h"
 
 #include "core/math/transform_2d.h"
+#include "core/math/transform_3d.h"
 
 void TransformInterpolator::interpolate_transform_2d(const Transform2D &p_prev, const Transform2D &p_curr, Transform2D &r_result, real_t p_fraction) {
 	// Special case for physics interpolation, if flipping, don't interpolate basis.
@@ -44,3 +45,340 @@ void TransformInterpolator::interpolate_transform_2d(const Transform2D &p_prev,
 
 	r_result = p_prev.interpolate_with(p_curr, p_fraction);
 }
+
+void TransformInterpolator::interpolate_transform_3d(const Transform3D &p_prev, const Transform3D &p_curr, Transform3D &r_result, real_t p_fraction) {
+	// Generic path: the basis interpolation method is detected on every call, then the pre-determined-method routine is reused.
+	interpolate_transform_3d_via_method(p_prev, p_curr, r_result, p_fraction, find_method(p_prev.basis, p_curr.basis));
+}
+
+void TransformInterpolator::interpolate_basis(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction) {
+	// Pick the interpolation method for this pair of bases, then apply it straight away.
+	interpolate_basis_via_method(p_prev, p_curr, r_result, p_fraction, find_method(p_prev, p_curr));
+}
+
+void TransformInterpolator::interpolate_transform_3d_via_method(const Transform3D &p_prev, const Transform3D &p_curr, Transform3D &r_result, real_t p_fraction, Method p_method) {
+	interpolate_basis_via_method(p_prev.basis, p_curr.basis, r_result.basis, p_fraction, p_method); // Basis: caller-chosen method.
+	r_result.origin = ((p_curr.origin - p_prev.origin) * p_fraction) + p_prev.origin; // Origin: always a plain lerp.
+}
+
+void TransformInterpolator::interpolate_basis_via_method(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction, Method p_method) {
+	// Dispatch on the pre-determined method; anything other than the two slerp variants uses the linear blend.
+	if (p_method == INTERP_SLERP) {
+		r_result = _basis_slerp_unchecked(p_prev, p_curr, p_fraction);
+		return;
+	}
+	if (p_method == INTERP_SCALED_SLERP) {
+		interpolate_basis_scaled_slerp(p_prev, p_curr, r_result, p_fraction);
+		return;
+	}
+	// INTERP_LERP, and any value not handled above.
+	interpolate_basis_linear(p_prev, p_curr, r_result, p_fraction);
+}
+
+Quaternion TransformInterpolator::_basis_to_quat_unchecked(const Basis &p_basis) {
+	// Converts a basis to a quaternion with no validation of the input (hence "unchecked").
+	const Basis &m = p_basis;
+	const real_t trace = m.rows[0][0] + m.rows[1][1] + m.rows[2][2];
+	real_t q[4]; // Stored as x, y, z, w.
+	if (trace > 0.0) {
+		const real_t root = Math::sqrt(trace + 1.0f);
+		const real_t inv = 0.5f / root;
+		q[3] = root * 0.5f;
+		q[0] = (m.rows[2][1] - m.rows[1][2]) * inv;
+		q[1] = (m.rows[0][2] - m.rows[2][0]) * inv;
+		q[2] = (m.rows[1][0] - m.rows[0][1]) * inv;
+	} else {
+		// Start from the largest diagonal element (index i); j and k are the two other axes in cyclic order.
+		int i = 0;
+		if (m.rows[0][0] < m.rows[1][1]) {
+			i = m.rows[1][1] < m.rows[2][2] ? 2 : 1;
+		} else if (m.rows[0][0] < m.rows[2][2]) {
+			i = 2;
+		}
+		const int j = (i + 1) % 3;
+		const int k = (i + 2) % 3;
+		const real_t root = Math::sqrt(m.rows[i][i] - m.rows[j][j] - m.rows[k][k] + 1.0f);
+		const real_t inv = 0.5f / root;
+		q[i] = root * 0.5f;
+		q[3] = (m.rows[k][j] - m.rows[j][k]) * inv;
+		q[j] = (m.rows[j][i] + m.rows[i][j]) * inv;
+		q[k] = (m.rows[k][i] + m.rows[i][k]) * inv;
+	}
+	return Quaternion(q[0], q[1], q[2], q[3]);
+}
+
+Quaternion TransformInterpolator::_quat_slerp_unchecked(const Quaternion &p_from, const Quaternion &p_to, real_t p_fraction) {
+	// Slerps between two quaternions without verifying that either of them is normalized.
+	// Falls back to a plain linear blend of the components when they are almost identical.
+	// p_fraction is used as given (it is not clamped here).
+
+	// Calculate cosine.
+	real_t cos_angle = p_from.dot(p_to);
+
+	// Adjust signs (if necessary): when the cosine is negative, every component
+	// of the target is negated and the cosine itself is flipped, so the code
+	// below always works with a non-negative cosine.
+	const bool flip = cos_angle < 0.0f;
+	if (flip) {
+		cos_angle = -cos_angle;
+	}
+	const Quaternion target = flip
+			? Quaternion(-p_to.x, -p_to.y, -p_to.z, -p_to.w)
+			: Quaternion(p_to.x, p_to.y, p_to.z, p_to.w);
+
+	// Calculate coefficients.
+
+	// Start from the linear weights, which are the final ones when "from" and "to"
+	// are very close.
+	real_t weight_from = 1.0f - p_fraction;
+	real_t weight_to = p_fraction;
+
+	// This check could possibly be removed as we dealt with this
+	// case in the find_method() function, but is left for safety, it probably
+	// isn't a bottleneck.
+	if ((1.0f - cos_angle) > (real_t)CMP_EPSILON) {
+		// Standard case (slerp).
+		const real_t angle = Math::acos(cos_angle);
+		const real_t sin_angle = Math::sin(angle);
+		weight_from = Math::sin((1.0f - p_fraction) * angle) / sin_angle;
+		weight_to = Math::sin(p_fraction * angle) / sin_angle;
+	}
+
+	// Calculate final values:
+	// a component-wise weighted sum of "from" and the (possibly negated) target.
+	return Quaternion(
+			weight_from * p_from.x + weight_to * target.x,
+			weight_from * p_from.y + weight_to * target.y,
+			weight_from * p_from.z + weight_to * target.z,
+			weight_from * p_from.w + weight_to * target.w);
+}
+
+Basis TransformInterpolator::_basis_slerp_unchecked(Basis p_from, Basis p_to, real_t p_fraction) {
+	// Convert both bases to quaternions without validation, slerp those, and rebuild a basis from the result.
+	const Quaternion quat_from = _basis_to_quat_unchecked(p_from);
+	const Quaternion quat_to = _basis_to_quat_unchecked(p_to);
+	const Quaternion blended = _quat_slerp_unchecked(quat_from, quat_to, p_fraction);
+	return Basis(blended);
+}
+
+void TransformInterpolator::interpolate_basis_scaled_slerp(Basis p_prev, Basis p_curr, Basis &r_result, real_t p_fraction) {
+	// The by-value copies are orthonormalized in place; the axis lengths that were removed are kept for re-scaling.
+	const Vector3 scale_from = _basis_orthonormalize(p_prev);
+	const Vector3 scale_to = _basis_orthonormalize(p_curr);
+
+	// Slerp the now unit length bases.
+	r_result = _basis_slerp_unchecked(p_prev, p_curr, p_fraction);
+	// The result is a unit length basis, so lerp the axis lengths separately and scale by them.
+	const Vector3 scale_blend = scale_from + ((scale_to - scale_from) * p_fraction);
+
+	// Keep a note that the column / row order of the basis is weird,
+	// so keep an eye for bugs with this.
+	for (int row = 0; row < 3; row++) {
+		r_result[row] *= scale_blend;
+	}
+}
+
+void TransformInterpolator::interpolate_basis_linear(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction) {
+	// Interpolate basis.
+	r_result = p_prev.lerp(p_curr, p_fraction);
+
+	// It turns out we need to guard against zero scale basis.
+	// This is kind of silly, as we should probably fix the bugs elsewhere in Godot that can't deal with
+	// zero scale, but until that time...
+	const real_t smallest = 0.0001f;
+	const real_t smallest_squared = smallest * smallest;
+	for (int axis_idx = 0; axis_idx < 3; axis_idx++) {
+		Vector3 &axis = r_result[axis_idx];
+		if (!(axis.length_squared() < smallest_squared)) {
+			continue;
+		}
+		// Not ok, this could cause errors due to bugs elsewhere,
+		// so we will bodge set this to a small value.
+		// Setting a different component to the smallest
+		// helps prevent the situation where all the axes are pointing in the same direction,
+		// which could be a problem for e.g. cross products...
+		axis[axis_idx] = smallest;
+	}
+}
+
+// Normalizes p_vec in place and returns the length it had before normalization.
+real_t TransformInterpolator::_vec3_normalize(Vector3 &p_vec) {
+	const real_t len_sq = p_vec.length_squared();
+	if (len_sq != 0.0f) {
+		const real_t len = Math::sqrt(len_sq);
+		p_vec.x /= len;
+		p_vec.y /= len;
+		p_vec.z /= len;
+		return len;
+	}
+	p_vec.x = p_vec.y = p_vec.z = 0.0f; // Zero length: nothing to normalize.
+	return 0.0f;
+}
+
+// Orthonormalizes r_basis in place and returns the length each axis had when it was normalized.
+Vector3 TransformInterpolator::_basis_orthonormalize(Basis &r_basis) {
+	// Gram-Schmidt Process, working on copies of the three columns.
+	Vector3 axis_x = r_basis.get_column(0);
+	Vector3 axis_y = r_basis.get_column(1);
+	Vector3 axis_z = r_basis.get_column(2);
+
+	// The first axis is only normalized.
+	const real_t len_x = _vec3_normalize(axis_x);
+	r_basis.set_column(0, axis_x);
+
+	// The second axis loses its component along the first one before being normalized.
+	axis_y = (axis_y - axis_x * (axis_x.dot(axis_y)));
+	const real_t len_y = _vec3_normalize(axis_y);
+	r_basis.set_column(1, axis_y);
+
+	// The third axis loses its components along both previous axes before being normalized.
+	axis_z = (axis_z - axis_x * (axis_x.dot(axis_z)) - axis_y * (axis_y.dot(axis_z)));
+	const real_t len_z = _vec3_normalize(axis_z);
+	r_basis.set_column(2, axis_z);
+	return Vector3(len_x, len_y, len_z);
+}
+
+TransformInterpolator::Method TransformInterpolator::_test_basis(Basis p_basis, bool r_needed_normalize, Quaternion &r_quat) {
+	// Squared axis lengths; they are only square-rooted further down, when normalization is needed.
+	Vector3 al = Vector3(p_basis.get_column(0).length_squared(),
+			p_basis.get_column(1).length_squared(),
+			p_basis.get_column(2).length_squared());
+
+	// Non unit scale?
+	if (r_needed_normalize || !_vec3_is_equal_approx(al, Vector3(1.0, 1.0, 1.0), (real_t)0.001f)) {
+		// If the basis is not normalized (at least approximately), it will fail the checks needed for slerp.
+		// So we try to detect a scaled (but not sheared) basis, which we *can* slerp by normalizing first,
+		// and lerping the scales separately.
+
+		// If any of the axes are really small, it is unlikely to be a valid rotation, or is scaled too small to deal with float error.
+		const real_t sl_epsilon = 0.00001f; // Compared against the squared lengths.
+		if ((al.x < sl_epsilon) ||
+				(al.y < sl_epsilon) ||
+				(al.z < sl_epsilon)) {
+			return INTERP_LERP;
+		}
+
+		// Normalize the basis.
+		Basis norm_basis = p_basis;
+
+		al.x = Math::sqrt(al.x);
+		al.y = Math::sqrt(al.y);
+		al.z = Math::sqrt(al.z);
+
+		norm_basis.set_column(0, norm_basis.get_column(0) / al.x);
+		norm_basis.set_column(1, norm_basis.get_column(1) / al.y);
+		norm_basis.set_column(2, norm_basis.get_column(2) / al.z);
+
+		// This doesn't appear necessary, as the later checks will catch it.
+		// if (!_basis_is_orthogonal_any_scale(norm_basis)) {
+		// return INTERP_LERP;
+		// }
+
+		p_basis = norm_basis;
+
+		// Orthonormalize not necessary as normal normalization(!) works if the
+		// axes are orthonormal.
+		// p_basis.orthonormalize();
+
+		// If we needed to normalize one of the two bases, both need to be normalized so that the pair is
+		// reported as INTERP_SCALED_SLERP. NOTE(review): `r_needed_normalize` is taken by value, so this
+		// write only affects the return value of this call; it is not visible to the caller.
+		r_needed_normalize = true;
+	}
+
+	// Apply less stringent tests than the built in slerp, the standard Godot slerp
+	// is too susceptible to float error to be useful.
+	real_t det = p_basis.determinant();
+	if (!Math::is_equal_approx(det, 1, (real_t)0.01f)) {
+		return INTERP_LERP;
+	}
+
+	if (!_basis_is_orthogonal(p_basis)) {
+		return INTERP_LERP;
+	}
+
+	// TODO: This could possibly be less stringent too, check this.
+	r_quat = _basis_to_quat_unchecked(p_basis);
+	if (!r_quat.is_normalized()) {
+		return INTERP_LERP;
+	}
+
+	return r_needed_normalize ? INTERP_SCALED_SLERP : INTERP_SLERP; // Scaled variant whenever normalization was applied or requested.
+}
+
+// This check doesn't seem to be needed but is preserved in case of bugs.
+bool TransformInterpolator::_basis_is_orthogonal_any_scale(const Basis &p_basis) {
+	// Two column pairs are tested in turn: (0, 1) against column 2, then (1, 2) against column 0.
+	// For each pair, the normalized cross product of the two columns is dotted with the remaining
+	// column, and the result has to reach the threshold below.
+	const real_t epsilon = 0.9995f;
+	const real_t min_cross_length = 0.001f;
+
+	for (int first = 0; first < 2; first++) {
+		const int second = first + 1;
+		const int remaining = (first + 2) % 3;
+
+		Vector3 cross = p_basis.get_column(first).cross(p_basis.get_column(second));
+
+		// Too small numbers, revert to lerp.
+		// (The length is the one the cross product had before being normalized.)
+		if (_vec3_normalize(cross) < min_cross_length) {
+			return false;
+		}
+
+		// The remaining column is used as stored, it is not normalized first.
+		if (cross.dot(p_basis.get_column(remaining)) < epsilon) {
+			return false;
+		}
+	}
+
+	// Both pairs passed.
+
+	return true;
+}
+
+bool TransformInterpolator::_basis_is_orthogonal(const Basis &p_basis, real_t p_epsilon) {
+	// Compare the basis multiplied by its transpose against a default-constructed basis, row by row.
+	Basis product = p_basis * p_basis.transposed();
+	Basis identity;
+
+	// Less stringent tests than the standard Godot slerp.
+	return _vec3_is_equal_approx(product[0], identity[0], p_epsilon) &&
+			_vec3_is_equal_approx(product[1], identity[1], p_epsilon) &&
+			_vec3_is_equal_approx(product[2], identity[2], p_epsilon);
+}
+
+real_t TransformInterpolator::checksum_transform_3d(const Transform3D &p_transform) {
+	// Just a really basic checksum (alternating-sign sum of the component totals), this can probably be improved.
+	const Basis &basis = p_transform.basis;
+	const real_t origin_total = _vec3_sum(p_transform.origin);
+	const real_t row0_total = _vec3_sum(basis.rows[0]);
+	const real_t row1_total = _vec3_sum(basis.rows[1]);
+	return ((origin_total - row0_total) + row1_total) - _vec3_sum(basis.rows[2]);
+}
+
+TransformInterpolator::Method TransformInterpolator::find_method(const Basis &p_a, const Basis &p_b) {
+	// Returns how to interpolate from p_a to p_b. `_test_basis()` takes its normalize flag by value, so it is forwarded by hand.
+	Quaternion q0;
+	Method method = _test_basis(p_a, false, q0);
+	if (method == INTERP_LERP) {
+		return method;
+	}
+
+	// If `p_a` needed normalizing, `p_b` must take the scaled path as well, so the pair never ends up as plain INTERP_SLERP.
+	Quaternion q1;
+	method = _test_basis(p_b, method == INTERP_SCALED_SLERP, q1);
+	if (method == INTERP_LERP) {
+		return method;
+	}
+
+	// Are they close together?
+	// Apply the same test that will revert to lerp as is present in the slerp routine.
+	// Calculate cosine.
+	real_t cosom = Math::abs(q0.dot(q1));
+	if ((1.0f - cosom) <= (real_t)CMP_EPSILON) {
+		return INTERP_LERP;
+	}
+
+	return method;
+}
diff --git a/core/math/transform_interpolator.h b/core/math/transform_interpolator.h
index a9bce2bd7f..cc556707e4 100644
--- a/core/math/transform_interpolator.h
+++ b/core/math/transform_interpolator.h
@@ -32,15 +32,64 @@
 #define TRANSFORM_INTERPOLATOR_H
 
 #include "core/math/math_defs.h"
+#include "core/math/vector3.h"
+
+// Keep all the functions for fixed timestep interpolation together.
+// There are two stages involved:
+// Finding a method, for determining the interpolation method between two
+// keyframes (which are physics ticks).
+// And applying that pre-determined method.
+
+// Pre-determining the method makes sense because it is expensive and often
+// several frames may occur between each physics tick, which will make it cheaper
+// than performing every frame.
 
 struct Transform2D;
+struct Transform3D;
+struct Basis;
+struct Quaternion;
 
 class TransformInterpolator {
+public:
+	enum Method {
+		INTERP_LERP, // Linear blend of the basis; also used for any value not handled explicitly.
+		INTERP_SLERP, // Quaternion slerp of the two bases as they are.
+		INTERP_SCALED_SLERP, // Orthonormalize, slerp, then re-apply linearly interpolated axis lengths.
+	};
+
 private:
-	static bool _sign(real_t p_val) { return p_val >= 0; }
+	_FORCE_INLINE_ static bool _sign(real_t p_val) { return p_val >= 0; }
+	static real_t _vec3_sum(const Vector3 &p_pt) { return p_pt.x + p_pt.y + p_pt.z; }
+	static real_t _vec3_normalize(Vector3 &p_vec);
+	_FORCE_INLINE_ static bool _vec3_is_equal_approx(const Vector3 &p_a, const Vector3 &p_b, real_t p_tolerance) {
+		return Math::is_equal_approx(p_a.x, p_b.x, p_tolerance) && Math::is_equal_approx(p_a.y, p_b.y, p_tolerance) && Math::is_equal_approx(p_a.z, p_b.z, p_tolerance);
+	}
+	static Vector3 _basis_orthonormalize(Basis &r_basis);
+	static Method _test_basis(Basis p_basis, bool r_needed_normalize, Quaternion &r_quat); // NOTE(review): `r_needed_normalize` is passed by value despite the `r_` prefix.
+	static Basis _basis_slerp_unchecked(Basis p_from, Basis p_to, real_t p_fraction);
+	static Quaternion _quat_slerp_unchecked(const Quaternion &p_from, const Quaternion &p_to, real_t p_fraction);
+	static Quaternion _basis_to_quat_unchecked(const Basis &p_basis);
+	static bool _basis_is_orthogonal(const Basis &p_basis, real_t p_epsilon = 0.01f);
+	static bool _basis_is_orthogonal_any_scale(const Basis &p_basis);
+
+	static void interpolate_basis_linear(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction);
+	static void interpolate_basis_scaled_slerp(Basis p_prev, Basis p_curr, Basis &r_result, real_t p_fraction);
 
 public:
 	static void interpolate_transform_2d(const Transform2D &p_prev, const Transform2D &p_curr, Transform2D &r_result, real_t p_fraction);
+
+	// Generic functions, use when you don't know what method should be used, e.g. from GDScript.
+	// These will be slower.
+	static void interpolate_transform_3d(const Transform3D &p_prev, const Transform3D &p_curr, Transform3D &r_result, real_t p_fraction);
+	static void interpolate_basis(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction);
+
+	// Optimized function when you know ahead of time the method.
+	static void interpolate_transform_3d_via_method(const Transform3D &p_prev, const Transform3D &p_curr, Transform3D &r_result, real_t p_fraction, Method p_method);
+	static void interpolate_basis_via_method(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction, Method p_method);
+
+	static real_t checksum_transform_3d(const Transform3D &p_transform);
+
+	static Method find_method(const Basis &p_a, const Basis &p_b); // Result is meant to be passed to the *_via_method() functions.
 };
 
 #endif // TRANSFORM_INTERPOLATOR_H
diff --git a/core/object/object.cpp b/core/object/object.cpp
index 97a3a405b9..e4d1a8fc9a 100644
--- a/core/object/object.cpp
+++ b/core/object/object.cpp
@@ -763,7 +763,7 @@ Variant Object::callp(const StringName &p_method, const Variant **p_args, int p_
 		}
 		if (is_ref_counted()) {
 			r_error.error = Callable::CallError::CALL_ERROR_INVALID_METHOD;
-			ERR_FAIL_V_MSG(Variant(), "Can't 'free' a reference.");
+			ERR_FAIL_V_MSG(Variant(), "Can't free a RefCounted object.");
 		}
 
 		if (_lock_index.get() > 1) {
diff --git a/core/object/script_language.cpp b/core/object/script_language.cpp
index 0b528e908a..cdc56e5ec5 100644
--- a/core/object/script_language.cpp
+++ b/core/object/script_language.cpp
@@ -491,10 +491,6 @@ void ScriptServer::save_global_classes() {
 	ProjectSettings::get_singleton()->store_global_class_list(gcarr);
 }
 
-String ScriptServer::get_global_class_cache_file_path() {
-	return ProjectSettings::get_singleton()->get_global_class_list_path();
-}
-
 ////////////////////
 
 ScriptCodeCompletionCache *ScriptCodeCompletionCache::singleton = nullptr;
diff --git a/core/object/script_language.h b/core/object/script_language.h
index 223f114150..59a43a7b29 100644
--- a/core/object/script_language.h
+++ b/core/object/script_language.h
@@ -97,7 +97,6 @@ public:
 	static void get_global_class_list(List<StringName> *r_global_classes);
 	static void get_inheriters_list(const StringName &p_base_type, List<StringName> *r_classes);
 	static void save_global_classes();
-	static String get_global_class_cache_file_path();
 
 	static void init_languages();
 	static void finish_languages();
diff --git a/core/object/worker_thread_pool.cpp b/core/object/worker_thread_pool.cpp
index caf4ed3835..56b9fa8475 100644
--- a/core/object/worker_thread_pool.cpp
+++ b/core/object/worker_thread_pool.cpp
@@ -59,8 +59,9 @@ void WorkerThreadPool::_process_task(Task *p_task) {
 	CallQueue *call_queue_backup = MessageQueue::get_singleton() != MessageQueue::get_main_singleton() ? MessageQueue::get_singleton() : nullptr;
 
 	{
-		// Tasks must start with this unset. They are free to set-and-forget otherwise.
+		// Tasks must start with these at default values. They are free to set-and-forget otherwise.
 		set_current_thread_safe_for_nodes(false);
+		MessageQueue::set_thread_singleton_override(nullptr); // No MessageQueue override at task start.
 		// Since the WorkerThreadPool is started before the script server,
 		// its pre-created threads can't have ScriptServer::thread_enter() called on them early.
 		// Therefore, we do it late at the first opportunity, so in case the task
@@ -82,6 +83,10 @@ void WorkerThreadPool::_process_task(Task *p_task) {
 	}
 #endif
 
+#ifdef THREADS_ENABLED
+	bool low_priority = p_task->low_priority; // Copied up front; the bookkeeping at the end of this function tests this copy instead of p_task. NOTE(review): presumably p_task may no longer be valid by then - confirm.
+#endif
+
 	if (p_task->group) {
 		// Handling a group
 		bool do_post = false;
@@ -158,7 +163,7 @@ void WorkerThreadPool::_process_task(Task *p_task) {
 #ifdef THREADS_ENABLED
 	{
 		curr_thread.current_task = prev_task;
-		if (p_task->low_priority) {
+		if (low_priority) { // Uses the value captured before the task was processed.
 			low_priority_threads_used--;
 
 			if (_try_promote_low_priority_task()) {
@@ -397,16 +402,17 @@ Error WorkerThreadPool::wait_for_task_completion(TaskID p_task_id) {
 		task->waiting_user++;
 	}
 
-	task_mutex.unlock();
-
 	if (caller_pool_thread) {
+		task_mutex.unlock(); // Not held during the collaborative wait.
 		_wait_collaboratively(caller_pool_thread, task);
+		task_mutex.lock(); // Re-acquired before updating the wait counter and possibly freeing the task.
 		task->waiting_pool--;
 		if (task->waiting_pool == 0 && task->waiting_user == 0) {
 			tasks.erase(p_task_id);
 			task_allocator.free(task);
 		}
 	} else {
+		task_mutex.unlock(); // Not held while blocked on the semaphore.
 		task->done_semaphore.wait();
 		task_mutex.lock();
 		task->waiting_user--;
@@ -414,9 +420,9 @@ Error WorkerThreadPool::wait_for_task_completion(TaskID p_task_id) {
 			tasks.erase(p_task_id);
 			task_allocator.free(task);
 		}
-		task_mutex.unlock();
 	}
 
+	task_mutex.unlock(); // Both branches above end with the mutex locked; release it once here.
 	return OK;
 }
 
@@ -670,7 +676,7 @@ uint32_t WorkerThreadPool::thread_enter_unlock_allowance_zone(BinaryMutex *p_mut
 
 uint32_t WorkerThreadPool::_thread_enter_unlock_allowance_zone(void *p_mutex, bool p_is_binary) {
 	for (uint32_t i = 0; i < MAX_UNLOCKABLE_MUTEXES; i++) {
-		if (unlikely(unlockable_mutexes[i] == (uintptr_t)p_mutex)) {
+		if (unlikely((unlockable_mutexes[i] & ~1) == (uintptr_t)p_mutex)) { // Bit 0 of the stored entry is ignored when comparing. NOTE(review): presumably it carries the p_is_binary tag - confirm.
 			// Already registered in the current thread.
 			return UINT32_MAX;
 		}
diff --git a/core/os/main_loop.h b/core/os/main_loop.h
index e48541d074..9c22cbaf3c 100644
--- a/core/os/main_loop.h
+++ b/core/os/main_loop.h
@@ -64,6 +64,7 @@ public:
 	virtual void initialize();
 	virtual void iteration_prepare() {}
 	virtual bool physics_process(double p_time);
+	virtual void iteration_end() {} // Does nothing unless overridden.
 	virtual bool process(double p_time);
 	virtual void finalize();
 
diff --git a/core/os/os.h b/core/os/os.h
index 63cc6ed50e..91e0ce9379 100644
--- a/core/os/os.h
+++ b/core/os/os.h
@@ -328,8 +328,6 @@ public:
 	virtual void benchmark_end_measure(const String &p_context, const String &p_what);
 	virtual void benchmark_dump();
 
-	virtual void process_and_drop_events() {}
-
 	virtual Error setup_remote_filesystem(const String &p_server_host, int p_port, const String &p_password, String &r_project_path);
 
 	enum PreferredTextureFormat {
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index 3d37e17ef8..c41ae1ccd8 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -1639,13 +1639,43 @@ Vector<int> String::split_ints_mk(const Vector<String> &p_splitters, bool p_allo
 }
 
 String String::join(const Vector<String> &parts) const {
+	if (parts.is_empty()) {
+		return String(); // Nothing to join.
+	} else if (parts.size() == 1) {
+		return parts[0]; // A single part needs no separator.
+	}
+
+	const int this_length = length(); // This string is the separator placed between consecutive parts.
+
+	int new_size = (parts.size() - 1) * this_length; // One separator between each pair of parts...
+	for (const String &part : parts) {
+		new_size += part.length(); // ...plus every part...
+	}
+	new_size += 1; // ...plus one slot for the terminating 0.
+
 	String ret;
-	for (int i = 0; i < parts.size(); ++i) {
-		if (i > 0) {
-			ret += *this;
+	ret.resize(new_size); // Sized once; the loop below fills it through the raw write pointer.
+	char32_t *ret_ptrw = ret.ptrw();
+	const char32_t *this_ptr = ptr();
+
+	bool first = true; // No separator is written before the first part.
+	for (const String &part : parts) {
+		if (first) {
+			first = false;
+		} else if (this_length) { // An empty separator copies nothing.
+			memcpy(ret_ptrw, this_ptr, this_length * sizeof(char32_t));
+			ret_ptrw += this_length;
+		}
+
+		const int part_length = part.length();
+		if (part_length) { // Empty parts copy nothing.
+			memcpy(ret_ptrw, part.ptr(), part_length * sizeof(char32_t));
+			ret_ptrw += part_length;
 		}
-		ret += parts[i];
 	}
+
+	*ret_ptrw = 0; // Write the terminator into the last slot.
+
 	return ret;
 }
 
@@ -3149,7 +3179,7 @@ Vector<uint8_t> String::sha256_buffer() const {
 }
 
 String String::insert(int p_at_pos, const String &p_string) const {
-	if (p_at_pos < 0) {
+	if (p_string.is_empty() || p_at_pos < 0) { // Nothing to insert, or a negative position: return this string unchanged.
 		return *this;
 	}
 
@@ -3157,17 +3187,27 @@ String String::insert(int p_at_pos, const String &p_string) const {
 		p_at_pos = length();
 	}
 
-	String pre;
+	String ret;
+	ret.resize(length() + p_string.length() + 1); // Both lengths plus one slot for the terminating 0.
+	char32_t *ret_ptrw = ret.ptrw();
+	const char32_t *this_ptr = ptr();
+
 	if (p_at_pos > 0) {
-		pre = substr(0, p_at_pos);
+		memcpy(ret_ptrw, this_ptr, p_at_pos * sizeof(char32_t)); // Characters of this string before the insertion point.
+		ret_ptrw += p_at_pos;
 	}
 
-	String post;
+	memcpy(ret_ptrw, p_string.ptr(), p_string.length() * sizeof(char32_t)); // The inserted string.
+	ret_ptrw += p_string.length();
+
 	if (p_at_pos < length()) {
-		post = substr(p_at_pos, length() - p_at_pos);
+		memcpy(ret_ptrw, this_ptr + p_at_pos, (length() - p_at_pos) * sizeof(char32_t)); // Remainder of this string from the insertion point on.
+		ret_ptrw += length() - p_at_pos;
 	}
 
-	return pre + p_string + post;
+	*ret_ptrw = 0; // Write the terminator into the last slot.
+
+	return ret;
 }
 
 String String::erase(int p_pos, int p_chars) const {
diff --git a/core/templates/command_queue_mt.h b/core/templates/command_queue_mt.h
index 0748e9cb83..1e6c6e42a9 100644
--- a/core/templates/command_queue_mt.h
+++ b/core/templates/command_queue_mt.h
@@ -370,15 +370,19 @@ class CommandQueueMT {
 			flush_read_ptr += 8;
 			CommandBase *cmd = reinterpret_cast<CommandBase *>(&command_mem[flush_read_ptr]);
 			cmd->call();
+
+			// The command (or the unlock allowance) may have reallocated the buffer, so re-derive the pointer before reading cmd->sync.
+			cmd = reinterpret_cast<CommandBase *>(&command_mem[flush_read_ptr]);
+
 			if (unlikely(cmd->sync)) {
 				sync_head++;
 				unlock(); // Give an opportunity to awaiters right away.
 				sync_cond_var.notify_all();
 				lock();
+				// The buffer may also have been reallocated while unlocked; re-derive the pointer before the destructor call below.
+				cmd = reinterpret_cast<CommandBase *>(&command_mem[flush_read_ptr]);
 			}
 
-			// If the command involved reallocating the buffer, the address may have changed.
-			cmd = reinterpret_cast<CommandBase *>(&command_mem[flush_read_ptr]);
 			cmd->~CommandBase();
 
 			flush_read_ptr += size;
diff --git a/core/variant/variant_construct.h b/core/variant/variant_construct.h
index b824044b82..5afdb884f6 100644
--- a/core/variant/variant_construct.h
+++ b/core/variant/variant_construct.h
@@ -153,11 +153,14 @@ public:
 class VariantConstructorObject {
 public:
 	static void construct(Variant &r_ret, const Variant **p_args, Callable::CallError &r_error) {
-		VariantInternal::clear(&r_ret);
 		if (p_args[0]->get_type() == Variant::NIL) {
+			VariantInternal::clear(&r_ret); // Cleared only once the argument type has been accepted.
+			VariantTypeChanger<Object *>::change(&r_ret); // NOTE(review): presumably switches r_ret to the Object type before the assignment below - confirm.
 			VariantInternal::object_assign_null(&r_ret);
 			r_error.error = Callable::CallError::CALL_OK;
 		} else if (p_args[0]->get_type() == Variant::OBJECT) {
+			VariantInternal::clear(&r_ret); // Same clear-then-change sequence as the NIL branch.
+			VariantTypeChanger<Object *>::change(&r_ret);
 			VariantInternal::object_assign(&r_ret, p_args[0]);
 			r_error.error = Callable::CallError::CALL_OK;
 		} else {
@@ -169,6 +172,7 @@ public:
 
 	static inline void validated_construct(Variant *r_ret, const Variant **p_args) {
 		VariantInternal::clear(r_ret);
+		VariantTypeChanger<Object *>::change(r_ret); // Same type change as construct() performs before assigning.
 		VariantInternal::object_assign(r_ret, p_args[0]);
 	}
 	static void ptr_construct(void *base, const void **p_args) {
@@ -198,11 +202,13 @@ public:
 		}
 
 		VariantInternal::clear(&r_ret);
+		VariantTypeChanger<Object *>::change(&r_ret); // NOTE(review): presumably switches r_ret to the Object type before the null assignment below - confirm.
 		VariantInternal::object_assign_null(&r_ret);
 	}
 
 	static inline void validated_construct(Variant *r_ret, const Variant **p_args) {
 		VariantInternal::clear(r_ret);
+		VariantTypeChanger<Object *>::change(r_ret); // Same type change as in the non-validated path above.
 		VariantInternal::object_assign_null(r_ret);
 	}
 	static void ptr_construct(void *base, const void **p_args) {