12 files changed, 651 insertions, 42 deletions
diff --git a/core/error/error_macros.cpp b/core/error/error_macros.cpp
index 8376c0aaf8..813ee7684f 100644
--- a/core/error/error_macros.cpp
+++ b/core/error/error_macros.cpp
@@ -34,6 +34,12 @@
 #include "core/os/os.h"
 #include "core/string/ustring.h"
 
+// Optional physics interpolation warnings try to include the path to the relevant node.
+#if defined(DEBUG_ENABLED) && defined(TOOLS_ENABLED)
+#include "core/config/project_settings.h"
+#include "scene/main/node.h"
+#endif
+
 static ErrorHandlerList *error_handler_list = nullptr;
 
 void add_error_handler(ErrorHandlerList *p_handler) {
@@ -128,3 +134,48 @@ void _err_print_index_error(const char *p_function, const char *p_file, int p_li
 void _err_flush_stdout() {
 	fflush(stdout);
 }
+
+// Emits a "[Physics interpolation]" warning, rate-limited to prevent error spam.
+void _physics_interpolation_warning(const char *p_function, const char *p_file, int p_line, ObjectID p_id, const char *p_warn_string) {
+#if defined(DEBUG_ENABLED) && defined(TOOLS_ENABLED)
+	// A warning is printed at most once per `warn_max` calls and once per `warn_timeout_seconds`.
+	const uint32_t warn_max = 2048;
+	const uint32_t warn_timeout_seconds = 15;
+
+	static uint32_t warn_count = warn_max;
+	static uint32_t warn_timeout = warn_timeout_seconds;
+
+	if (warn_count) {
+		warn_count--;
+	}
+	if (warn_count) {
+		return;
+	}
+
+	// The call budget is used up: from here on only the timeout holds the warning back.
+	const uint32_t time_now = OS::get_singleton()->get_ticks_msec() / 1000;
+	if (time_now < warn_timeout) {
+		return;
+	}
+	warn_count = warn_max;
+	warn_timeout = time_now + warn_timeout_seconds;
+
+	if (!GLOBAL_GET("debug/settings/physics_interpolation/enable_warnings")) {
+		return;
+	}
+
+	// UINT64_MAX means unused.
+	if (p_id.operator uint64_t() == UINT64_MAX) {
+		_err_print_error(p_function, p_file, p_line, "[Physics interpolation] " + String(p_warn_string) + " (possibly benign).", false, ERR_HANDLER_WARNING);
+		return;
+	}
+
+	// Default to "unknown" so that a null ID does not leave the message with an empty name.
+	String node_name = "\"unknown\"";
+	Node *node = p_id.is_valid() ? Object::cast_to<Node>(ObjectDB::get_instance(p_id)) : nullptr;
+	if (node && node->is_inside_tree()) {
+		node_name = "\"" + String(node->get_path()) + "\"";
+	}
+	_err_print_error(p_function, p_file, p_line, "[Physics interpolation] " + String(p_warn_string) + ": " + node_name + " (possibly benign).", false, ERR_HANDLER_WARNING);
+#endif
+}
diff --git a/core/error/error_macros.h b/core/error/error_macros.h
index ab7dbcbd44..d31adb72be 100644
--- a/core/error/error_macros.h
+++ b/core/error/error_macros.h
@@ -31,6 +31,7 @@
 #ifndef ERROR_MACROS_H
 #define ERROR_MACROS_H
 
+#include "core/object/object_id.h"
 #include "core/typedefs.h"
 
 #include <atomic> // We'd normally use safe_refcount.h, but that would cause circular includes.
@@ -71,6 +72,8 @@ void _err_print_index_error(const char *p_function, const char *p_file, int p_li
 void _err_print_index_error(const char *p_function, const char *p_file, int p_line, int64_t p_index, int64_t p_size, const char *p_index_str, const char *p_size_str, const String &p_message, bool p_editor_notify = false, bool fatal = false);
 void _err_flush_stdout();
 
+void _physics_interpolation_warning(const char *p_function, const char *p_file, int p_line, ObjectID p_id, const char *p_warn_string);
+
 #ifdef __GNUC__
 //#define FUNCTION_STR __PRETTY_FUNCTION__ - too annoying
 #define FUNCTION_STR __FUNCTION__
@@ -832,4 +835,14 @@ void _err_flush_stdout();
 #define DEV_CHECK_ONCE(m_cond)
 #endif
 
+/**
+ * Physics Interpolation warnings, spam protected by _physics_interpolation_warning().
+ * The node-less variant passes ObjectID(UINT64_MAX), which that function treats as "unused".
+ */
+#define PHYSICS_INTERPOLATION_NODE_WARNING(m_object_id, m_string) \
+	_physics_interpolation_warning(FUNCTION_STR, __FILE__, __LINE__, m_object_id, m_string)
+
+#define PHYSICS_INTERPOLATION_WARNING(m_string) \
+	_physics_interpolation_warning(FUNCTION_STR, __FILE__, __LINE__, ObjectID(UINT64_MAX), m_string)
+
 #endif // ERROR_MACROS_H
diff --git a/core/extension/gdextension.cpp b/core/extension/gdextension.cpp
index 8e2366fc95..cb6832ea39 100644
--- a/core/extension/gdextension.cpp
+++ b/core/extension/gdextension.cpp
@@ -781,23 +781,14 @@ Error GDExtension::open_library(const String &p_path, const String &p_entry_symb
 		}
 	}
 
-	String actual_lib_path;
 	OS::GDExtensionData data = {
 		true, // also_set_library_path
-		&actual_lib_path, // r_resolved_path
+		&library_path, // r_resolved_path
 		Engine::get_singleton()->is_editor_hint(), // generate_temp_files
 		&abs_dependencies_paths, // library_dependencies
 	};
 	Error err = OS::get_singleton()->open_dynamic_library(abs_path, library, &data);
 
-	if (actual_lib_path.get_file() != abs_path.get_file()) {
-		// If temporary files are generated, let's change the library path to point at the original,
-		// because that's what we want to check to see if it's changed.
-		library_path = actual_lib_path.get_base_dir().path_join(p_path.get_file());
-	} else {
-		library_path = actual_lib_path;
-	}
-
 	ERR_FAIL_COND_V_MSG(err == ERR_FILE_NOT_FOUND, err, "GDExtension dynamic library not found: " + abs_path);
 	ERR_FAIL_COND_V_MSG(err != OK, err, "Can't open GDExtension dynamic library: " + abs_path);
 
diff --git a/core/io/file_access_pack.cpp b/core/io/file_access_pack.cpp
index 991b94db38..02bf0a6039 100644
--- a/core/io/file_access_pack.cpp
+++ b/core/io/file_access_pack.cpp
@@ -119,6 +119,10 @@ void PackedData::_free_packed_dirs(PackedDir *p_dir) {
 }
 
 PackedData::~PackedData() {
+	if (singleton == this) {
+		singleton = nullptr;
+	}
+
 	for (int i = 0; i < sources.size(); i++) {
 		memdelete(sources[i]);
 	}
diff --git a/core/io/image.cpp b/core/io/image.cpp
index d0598e4dc6..b35d405662 100644
--- a/core/io/image.cpp
+++ b/core/io/image.cpp
@@ -501,6 +501,38 @@ static void _convert(int p_width, int p_height, const uint8_t *p_src, uint8_t *p
 	}
 }
 
+// Repacks pixels made of `read_channels` values of type T into pixels of `write_channels` values.
+template <typename T, uint32_t read_channels, uint32_t write_channels, T def_zero, T def_one>
+static void _convert_fast(int p_width, int p_height, const T *p_src, T *p_dst) {
+	const T *src_pixel = p_src;
+	T *dst_pixel = p_dst;
+
+	const int pixel_count = p_width * p_height;
+	for (int px = 0; px < pixel_count; px++) {
+		// Channels present on both sides are copied as is; extra source channels are dropped.
+		memcpy(dst_pixel, src_pixel, MIN(read_channels, write_channels) * sizeof(T));
+		if constexpr (write_channels > read_channels) {
+			// Defaults are indexed by channel: zero for the first three, one for the fourth.
+			const T fill[4] = { def_zero, def_zero, def_zero, def_one };
+			memcpy(dst_pixel + read_channels, &fill[read_channels], (write_channels - read_channels) * sizeof(T));
+		}
+		src_pixel += read_channels;
+		dst_pixel += write_channels;
+	}
+}
+
+// Whether Image::convert() may skip its slower put/set pixel path for this pair of formats.
+static bool _are_formats_compatible(Image::Format p_format0, Image::Format p_format1) {
+	// True when both formats lie inside the same inclusive range of the Format enum.
+	const auto both_between = [p_format0, p_format1](Image::Format p_first, Image::Format p_last) {
+		return p_format0 <= p_last && p_format0 >= p_first && p_format1 <= p_last && p_format1 >= p_first;
+	};
+
+	// The first family has no lower bound check: everything up to RGBA8 qualifies.
+	const bool both_up_to_rgba8 = p_format0 <= Image::FORMAT_RGBA8 && p_format1 <= Image::FORMAT_RGBA8;
+	return both_up_to_rgba8 || both_between(Image::FORMAT_RH, Image::FORMAT_RGBAH) || both_between(Image::FORMAT_RF, Image::FORMAT_RGBAF);
+}
+
 void Image::convert(Format p_new_format) {
 	ERR_FAIL_INDEX_MSG(p_new_format, FORMAT_MAX, "The Image format specified (" + itos(p_new_format) + ") is out of range. See Image's Format enum.");
 	if (data.size() == 0) {
@@ -517,7 +549,7 @@ void Image::convert(Format p_new_format) {
 	if (Image::is_format_compressed(format) || Image::is_format_compressed(p_new_format)) {
 		ERR_FAIL_MSG("Cannot convert to <-> from compressed formats. Use compress() and decompress() instead.");
 
-	} else if (format > FORMAT_RGBA8 || p_new_format > FORMAT_RGBA8) {
+	} else if (!_are_formats_compatible(format, p_new_format)) {
 		//use put/set pixel which is slower but works with non byte formats
 		Image new_img(width, height, mipmaps, p_new_format);
 
@@ -648,6 +680,78 @@ void Image::convert(Format p_new_format) {
 			case FORMAT_RGBA8 | (FORMAT_RGB8 << 8):
 				_convert<3, true, 3, false, false, false>(mip_width, mip_height, rptr, wptr);
 				break;
+			case FORMAT_RH | (FORMAT_RGH << 8):
+				_convert_fast<uint16_t, 1, 2, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RH | (FORMAT_RGBH << 8):
+				_convert_fast<uint16_t, 1, 3, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RH | (FORMAT_RGBAH << 8):
+				_convert_fast<uint16_t, 1, 4, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGH | (FORMAT_RH << 8):
+				_convert_fast<uint16_t, 2, 1, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGH | (FORMAT_RGBH << 8):
+				_convert_fast<uint16_t, 2, 3, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGH | (FORMAT_RGBAH << 8):
+				_convert_fast<uint16_t, 2, 4, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBH | (FORMAT_RH << 8):
+				_convert_fast<uint16_t, 3, 1, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBH | (FORMAT_RGH << 8):
+				_convert_fast<uint16_t, 3, 2, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBH | (FORMAT_RGBAH << 8):
+				_convert_fast<uint16_t, 3, 4, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBAH | (FORMAT_RH << 8):
+				_convert_fast<uint16_t, 4, 1, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBAH | (FORMAT_RGH << 8):
+				_convert_fast<uint16_t, 4, 2, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RGBAH | (FORMAT_RGBH << 8):
+				_convert_fast<uint16_t, 4, 3, 0x0000, 0x3C00>(mip_width, mip_height, (const uint16_t *)rptr, (uint16_t *)wptr);
+				break;
+			case FORMAT_RF | (FORMAT_RGF << 8):
+				_convert_fast<uint32_t, 1, 2, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RF | (FORMAT_RGBF << 8):
+				_convert_fast<uint32_t, 1, 3, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RF | (FORMAT_RGBAF << 8):
+				_convert_fast<uint32_t, 1, 4, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGF | (FORMAT_RF << 8):
+				_convert_fast<uint32_t, 2, 1, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGF | (FORMAT_RGBF << 8):
+				_convert_fast<uint32_t, 2, 3, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGF | (FORMAT_RGBAF << 8):
+				_convert_fast<uint32_t, 2, 4, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBF | (FORMAT_RF << 8):
+				_convert_fast<uint32_t, 3, 1, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBF | (FORMAT_RGF << 8):
+				_convert_fast<uint32_t, 3, 2, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBF | (FORMAT_RGBAF << 8):
+				_convert_fast<uint32_t, 3, 4, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBAF | (FORMAT_RF << 8):
+				_convert_fast<uint32_t, 4, 1, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBAF | (FORMAT_RGF << 8):
+				_convert_fast<uint32_t, 4, 2, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
+			case FORMAT_RGBAF | (FORMAT_RGBF << 8):
+				_convert_fast<uint32_t, 4, 3, 0x00000000, 0x3F800000>(mip_width, mip_height, (const uint32_t *)rptr, (uint32_t *)wptr);
+				break;
 		}
 	}
 
diff --git a/core/io/resource_loader.cpp b/core/io/resource_loader.cpp
index c5582ad231..928bb95de3 100644
--- a/core/io/resource_loader.cpp
+++ b/core/io/resource_loader.cpp
@@ -474,6 +474,7 @@ Ref<ResourceLoader::LoadToken> ResourceLoader::_load_start(const String &p_path,
 	bool ignoring_cache = p_cache_mode == ResourceFormatLoader::CACHE_MODE_IGNORE || p_cache_mode == ResourceFormatLoader::CACHE_MODE_IGNORE_DEEP;
 
 	Ref<LoadToken> load_token;
+	bool must_not_register = false;
 	ThreadLoadTask unregistered_load_task; // Once set, must be valid up to the call to do the load.
 	ThreadLoadTask *load_task_ptr = nullptr;
 	bool run_on_current_thread = false;
@@ -516,8 +517,9 @@ Ref<ResourceLoader::LoadToken> ResourceLoader::_load_start(const String &p_path,
 				}
 			}
 
-			// Cache-ignoring tasks aren't registered in the map and so must finish within scope.
-			if (ignoring_cache) {
+			// If we want to ignore cache, but there's another task loading it, we can't add this one to the map and we also have to finish within scope.
+			must_not_register = ignoring_cache && thread_load_tasks.has(local_path);
+			if (must_not_register) {
 				load_token->local_path.clear();
 				unregistered_load_task = load_task;
 				load_task_ptr = &unregistered_load_task;
@@ -528,7 +530,7 @@ Ref<ResourceLoader::LoadToken> ResourceLoader::_load_start(const String &p_path,
 			}
 		}
 
-		run_on_current_thread = ignoring_cache || p_thread_mode == LOAD_THREAD_FROM_CURRENT;
+		run_on_current_thread = must_not_register || p_thread_mode == LOAD_THREAD_FROM_CURRENT;
 
 		if (run_on_current_thread) {
 			load_task_ptr->thread_id = Thread::get_caller_id();
@@ -539,7 +541,7 @@ Ref<ResourceLoader::LoadToken> ResourceLoader::_load_start(const String &p_path,
 
 	if (run_on_current_thread) {
 		_thread_load_function(load_task_ptr);
-		if (ignoring_cache) {
+		if (must_not_register) {
 			load_token->res_if_unregistered = load_task_ptr->resource;
 		}
 	}
diff --git a/core/math/math_funcs.h b/core/math/math_funcs.h
index 3060f31970..fd53ed28fd 100644
--- a/core/math/math_funcs.h
+++ b/core/math/math_funcs.h
@@ -447,14 +447,22 @@ public:
 
 	static _ALWAYS_INLINE_ double smoothstep(double p_from, double p_to, double p_s) {
 		if (is_equal_approx(p_from, p_to)) {
-			return p_from;
+			if (likely(p_from <= p_to)) { // Edges (almost) coincide: act as a step function instead of dividing.
+				return p_s <= p_from ? 0.0 : 1.0;
+			} else {
+				return p_s <= p_to ? 1.0 : 0.0; // Descending edges: the step is inverted.
+			}
 		}
 		double s = CLAMP((p_s - p_from) / (p_to - p_from), 0.0, 1.0);
 		return s * s * (3.0 - 2.0 * s);
 	}
 	static _ALWAYS_INLINE_ float smoothstep(float p_from, float p_to, float p_s) {
 		if (is_equal_approx(p_from, p_to)) {
-			return p_from;
+			if (likely(p_from <= p_to)) {
+				return p_s <= p_from ? 0.0f : 1.0f;
+			} else {
+				return p_s <= p_to ? 1.0f : 0.0f;
+			}
 		}
 		float s = CLAMP((p_s - p_from) / (p_to - p_from), 0.0f, 1.0f);
 		return s * s * (3.0f - 2.0f * s);
diff --git a/core/math/transform_interpolator.cpp b/core/math/transform_interpolator.cpp
index 6a564b0ca7..1cd35b3d1a 100644
--- a/core/math/transform_interpolator.cpp
+++ b/core/math/transform_interpolator.cpp
@@ -31,6 +31,7 @@
 #include "transform_interpolator.h"
 
 #include "core/math/transform_2d.h"
+#include "core/math/transform_3d.h"
 
 void TransformInterpolator::interpolate_transform_2d(const Transform2D &p_prev, const Transform2D &p_curr, Transform2D &r_result, real_t p_fraction) {
 	// Special case for physics interpolation, if flipping, don't interpolate basis.
@@ -44,3 +45,340 @@ void TransformInterpolator::interpolate_transform_2d(const Transform2D &p_prev,
 
 	r_result = p_prev.interpolate_with(p_curr, p_fraction);
 }
+
+void TransformInterpolator::interpolate_transform_3d(const Transform3D &p_prev, const Transform3D &p_curr, Transform3D &r_result, real_t p_fraction) {
+	interpolate_basis(p_prev.basis, p_curr.basis, r_result.basis, p_fraction); // Method is looked up on every call.
+	r_result.origin = p_prev.origin + ((p_curr.origin - p_prev.origin) * p_fraction); // Origin is always lerped.
+}
+
+void TransformInterpolator::interpolate_basis(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction) {
+	// Generic path: the interpolation method is re-derived from the two bases on every call.
+	interpolate_basis_via_method(p_prev, p_curr, r_result, p_fraction, find_method(p_prev, p_curr));
+}
+
+void TransformInterpolator::interpolate_transform_3d_via_method(const Transform3D &p_prev, const Transform3D &p_curr, Transform3D &r_result, real_t p_fraction, Method p_method) {
+	interpolate_basis_via_method(p_prev.basis, p_curr.basis, r_result.basis, p_fraction, p_method); // Method supplied by the caller.
+	r_result.origin = p_prev.origin + ((p_curr.origin - p_prev.origin) * p_fraction); // Origin is always lerped.
+}
+
+// Interpolates between two bases using a method that was decided ahead of time
+// (see find_method()), so that the detection cost is not paid on every call.
+// Any value other than the two slerp methods takes the linear path.
+void TransformInterpolator::interpolate_basis_via_method(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction, Method p_method) {
+	if (p_method == INTERP_SLERP) {
+		r_result = _basis_slerp_unchecked(p_prev, p_curr, p_fraction);
+	} else if (p_method == INTERP_SCALED_SLERP) {
+		interpolate_basis_scaled_slerp(p_prev, p_curr, r_result, p_fraction);
+	} else {
+		// INTERP_LERP, plus anything unexpected.
+		interpolate_basis_linear(p_prev, p_curr, r_result, p_fraction);
+	}
+}
+
+// Converts a basis to a quaternion. "Unchecked": nothing here validates p_basis first.
+Quaternion TransformInterpolator::_basis_to_quat_unchecked(const Basis &p_basis) {
+	const Basis &m = p_basis;
+	const real_t trace = m.rows[0][0] + m.rows[1][1] + m.rows[2][2];
+	real_t q[4]; // Stored as x, y, z, w.
+
+	if (trace > 0.0) {
+		// Positive trace: w comes from the trace, x, y, z from the off-diagonal differences.
+		const real_t root = Math::sqrt(trace + 1.0f);
+		const real_t scale = 0.5f / root;
+		q[3] = (root * 0.5f);
+		q[0] = ((m.rows[2][1] - m.rows[1][2]) * scale);
+		q[1] = ((m.rows[0][2] - m.rows[2][0]) * scale);
+		q[2] = ((m.rows[1][0] - m.rows[0][1]) * scale);
+	} else {
+		// Otherwise start from the largest diagonal element (index i); j and k are the other two.
+		int i = (m.rows[0][0] < m.rows[1][1]) ? 1 : 0;
+		if (m.rows[i][i] < m.rows[2][2]) {
+			i = 2;
+		}
+		const int j = (i + 1) % 3;
+		const int k = (i + 2) % 3;
+		const real_t root = Math::sqrt(m.rows[i][i] - m.rows[j][j] - m.rows[k][k] + 1.0f);
+		const real_t scale = 0.5f / root;
+		q[i] = root * 0.5f;
+		q[3] = (m.rows[k][j] - m.rows[j][k]) * scale;
+		q[j] = (m.rows[j][i] + m.rows[i][j]) * scale;
+		q[k] = (m.rows[k][i] + m.rows[i][k]) * scale;
+	}
+	return Quaternion(q[0], q[1], q[2], q[3]);
+}
+
+// Spherical linear interpolation between two quaternions.
+// "Unchecked": the inputs are used as given, without validation.
+Quaternion TransformInterpolator::_quat_slerp_unchecked(const Quaternion &p_from, const Quaternion &p_to, real_t p_fraction) {
+	// Calculate cosine.
+	real_t cosom = p_from.dot(p_to);
+
+	// Adjust signs (if necessary): with a negative dot product, interpolate
+	// towards the negated target instead.
+	Quaternion target = p_to;
+	if (cosom < 0.0f) {
+		cosom = -cosom;
+		target = Quaternion(
+				-p_to.x,
+				-p_to.y,
+				-p_to.z,
+				-p_to.w);
+	}
+
+	// Calculate coefficients.
+
+	// This check could possibly be removed as we dealt with this
+	// case in the find_method() function, but is left for safety, it probably
+	// isn't a bottleneck.
+	real_t scale_from;
+	real_t scale_to;
+	if ((1.0f - cosom) > (real_t)CMP_EPSILON) {
+		// Standard case (slerp).
+		const real_t omega = Math::acos(cosom);
+		const real_t sinom = Math::sin(omega);
+		scale_from = Math::sin((1.0f - p_fraction) * omega) / sinom;
+		scale_to = Math::sin(p_fraction * omega) / sinom;
+	} else {
+		// "from" and "to" quaternions are very close,
+		// so we can do a linear interpolation.
+		scale_from = 1.0f - p_fraction;
+		scale_to = p_fraction;
+	}
+
+	// Calculate final values.
+	return Quaternion(
+			scale_from * p_from.x + scale_to * target.x,
+			scale_from * p_from.y + scale_to * target.y,
+			scale_from * p_from.z + scale_to * target.z,
+			scale_from * p_from.w + scale_to * target.w);
+}
+
+Basis TransformInterpolator::_basis_slerp_unchecked(Basis p_from, Basis p_to, real_t p_fraction) {
+	// Convert both bases to quaternions, slerp those, then convert the result back to a basis.
+	const Quaternion quat_from = _basis_to_quat_unchecked(p_from);
+	const Quaternion quat_to = _basis_to_quat_unchecked(p_to);
+	const Quaternion quat_result = _quat_slerp_unchecked(quat_from, quat_to, p_fraction);
+	return Basis(quat_result);
+}
+
+void TransformInterpolator::interpolate_basis_scaled_slerp(Basis p_prev, Basis p_curr, Basis &r_result, real_t p_fraction) {
+	// Normalize both (they are by-value copies) and find lengths.
+	const Vector3 scale_prev = _basis_orthonormalize(p_prev);
+	const Vector3 scale_curr = _basis_orthonormalize(p_curr);
+
+	// Slerp the normalized bases; the lengths are lerped separately.
+	r_result = _basis_slerp_unchecked(p_prev, p_curr, p_fraction);
+	const Vector3 scale = scale_prev + ((scale_curr - scale_prev) * p_fraction);
+
+	// Now the result is unit length basis, apply the lerped lengths to r_result[0..2].
+	// Keep a note that the column / row order of the basis is weird,
+	// so keep an eye for bugs with this.
+	for (int n = 0; n < 3; n++) {
+		r_result[n] *= scale;
+	}
+}
+
+void TransformInterpolator::interpolate_basis_linear(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction) {
+	// Axes shorter than `smallest` are treated as having zero scale.
+	const real_t smallest = 0.0001f;
+	const real_t smallest_squared = smallest * smallest;
+
+	// Interpolate basis.
+	r_result = p_prev.lerp(p_curr, p_fraction);
+
+	// It turns out we need to guard against zero scale basis.
+	// This is kind of silly, as we should probably fix the bugs elsewhere in Godot that can't deal with
+	// zero scale, but until that time...
+	for (int n = 0; n < 3; n++) {
+		Vector3 &axis = r_result[n];
+		if (axis.length_squared() < smallest_squared) {
+			// Not ok, this could cause errors due to bugs elsewhere,
+			// so we will bodge set this to a small value.
+			// Setting a different component (n) on each axis helps prevent the situation where all
+			// the axes point in the same direction, which could be a problem for e.g. cross products...
+			axis[n] = smallest;
+		}
+	}
+}
+
+// Normalizes p_vec in place and returns the length it had (0 for a zero length vector).
+real_t TransformInterpolator::_vec3_normalize(Vector3 &p_vec) {
+	const real_t length_squared = p_vec.length_squared();
+	if (length_squared != 0.0f) {
+		const real_t length = Math::sqrt(length_squared);
+		p_vec.x /= length;
+		p_vec.y /= length;
+		p_vec.z /= length;
+		return length;
+	}
+	p_vec.x = p_vec.y = p_vec.z = 0.0f;
+	return 0.0f;
+}
+
+// Gram-Schmidt process on the columns of r_basis (in place). Returns the length each column had when normalized.
+Vector3 TransformInterpolator::_basis_orthonormalize(Basis &r_basis) {
+	Vector3 col_x = r_basis.get_column(0);
+	Vector3 col_y = r_basis.get_column(1);
+	Vector3 col_z = r_basis.get_column(2);
+
+	Vector3 lengths;
+	// The first column is only normalized.
+	lengths.x = _vec3_normalize(col_x);
+	r_basis.set_column(0, col_x);
+
+	// The second column loses its component along the first one.
+	col_y = (col_y - col_x * (col_x.dot(col_y)));
+	lengths.y = _vec3_normalize(col_y);
+	r_basis.set_column(1, col_y);
+
+	// The third column loses its components along the first two.
+	col_z = (col_z - col_x * (col_x.dot(col_z)) - col_y * (col_y.dot(col_z)));
+	lengths.z = _vec3_normalize(col_z);
+	r_basis.set_column(2, col_z);
+	return lengths;
+}
+
+TransformInterpolator::Method TransformInterpolator::_test_basis(Basis p_basis, bool r_needed_normalize, Quaternion &r_quat) {
+	// Squared axis (column) lengths; the square roots are only taken if normalization is needed.
+	Vector3 al = Vector3(p_basis.get_column(0).length_squared(),
+			p_basis.get_column(1).length_squared(),
+			p_basis.get_column(2).length_squared());
+
+	// Non unit scale? (Or normalization requested by the caller.)
+	if (r_needed_normalize || !_vec3_is_equal_approx(al, Vector3(1.0, 1.0, 1.0), (real_t)0.001f)) {
+		// If the basis is not normalized (at least approximately), it will fail the checks needed for slerp.
+		// So we try to detect a scaled (but not sheared) basis, which we *can* slerp by normalizing first,
+		// and lerping the scales separately.
+
+		// If any of the axes are really small, it is unlikely to be a valid rotation, or is scaled too small to deal with float error.
+		const real_t sl_epsilon = 0.00001f;
+		if ((al.x < sl_epsilon) ||
+				(al.y < sl_epsilon) ||
+				(al.z < sl_epsilon)) {
+			return INTERP_LERP;
+		}
+
+		// Normalize the basis (each column is divided by its own length).
+		Basis norm_basis = p_basis;
+
+		al.x = Math::sqrt(al.x);
+		al.y = Math::sqrt(al.y);
+		al.z = Math::sqrt(al.z);
+
+		norm_basis.set_column(0, norm_basis.get_column(0) / al.x);
+		norm_basis.set_column(1, norm_basis.get_column(1) / al.y);
+		norm_basis.set_column(2, norm_basis.get_column(2) / al.z);
+
+		// This doesn't appear necessary, as the later checks will catch it.
+		// if (!_basis_is_orthogonal_any_scale(norm_basis)) {
+		// return INTERP_LERP;
+		// }
+
+		p_basis = norm_basis;
+
+		// Orthonormalize not necessary as normal normalization(!) works if the
+		// axes are orthonormal.
+		// p_basis.orthonormalize();
+
+		// If we needed to normalize one of the two bases, we will need to normalize both,
+		// regardless of whether the 2nd needs it, just to make sure it takes the path to return
+		// INTERP_SCALED_SLERP on the 2nd call of _test_basis.
+		r_needed_normalize = true; // NOTE(review): the parameter is taken by value, so the caller's flag is not updated.
+	}
+
+	// Apply less stringent tests than the built in slerp, the standard Godot slerp
+	// is too susceptible to float error to be useful.
+	real_t det = p_basis.determinant();
+	if (!Math::is_equal_approx(det, 1, (real_t)0.01f)) {
+		return INTERP_LERP;
+	}
+
+	if (!_basis_is_orthogonal(p_basis)) {
+		return INTERP_LERP;
+	}
+
+	// TODO: This could possibly be less stringent too, check this.
+	r_quat = _basis_to_quat_unchecked(p_basis);
+	if (!r_quat.is_normalized()) {
+		return INTERP_LERP;
+	}
+
+	return r_needed_normalize ? INTERP_SCALED_SLERP : INTERP_SLERP;
+}
+
+// This check doesn't seem to be needed but is preserved in case of bugs.
+bool TransformInterpolator::_basis_is_orthogonal_any_scale(const Basis &p_basis) {
+	// Tests one pair of columns: their normalized cross product must line up with p_expected.
+	const auto cross_matches = [](const Vector3 &p_a, const Vector3 &p_b, const Vector3 &p_expected) {
+		Vector3 cross = p_a.cross(p_b);
+		// Too small numbers, revert to lerp.
+		if (_vec3_normalize(cross) < 0.001f) {
+			return false;
+		}
+
+		const real_t epsilon = 0.9995f;
+		return !(cross.dot(p_expected) < epsilon);
+	};
+
+	const Vector3 col_x = p_basis.get_column(0);
+	const Vector3 col_y = p_basis.get_column(1);
+	const Vector3 col_z = p_basis.get_column(2);
+
+	// Column 0 cross column 1 is compared against column 2...
+	if (!cross_matches(col_x, col_y, col_z)) {
+		return false;
+	}
+
+	// ...and column 1 cross column 2 against column 0.
+	if (!cross_matches(col_y, col_z, col_x)) {
+		return false;
+	}
+
+	return true;
+}
+
+bool TransformInterpolator::_basis_is_orthogonal(const Basis &p_basis, real_t p_epsilon) {
+	// Compare p_basis * transpose against identity, with less stringent tests than the standard Godot slerp.
+	Basis identity;
+	const Basis product = p_basis * p_basis.transposed();
+	bool orthogonal = true;
+	for (int n = 0; n < 3; n++) {
+		orthogonal = orthogonal && _vec3_is_equal_approx(product[n], identity[n], p_epsilon);
+	}
+	return orthogonal;
+}
+
+real_t TransformInterpolator::checksum_transform_3d(const Transform3D &p_transform) {
+	// Just a really basic checksum (this can probably be improved): the component sums of
+	// the origin and of the three basis rows, combined with alternating signs.
+	const Basis &basis = p_transform.basis;
+	real_t sum = _vec3_sum(p_transform.origin) - _vec3_sum(basis.rows[0]);
+	sum = sum + _vec3_sum(basis.rows[1]) - _vec3_sum(basis.rows[2]);
+	return sum;
+}
+
+TransformInterpolator::Method TransformInterpolator::find_method(const Basis &p_a, const Basis &p_b) {
+	bool needed_normalize = false;
+	Quaternion q0;
+	Method method = _test_basis(p_a, needed_normalize, q0);
+	if (method == INTERP_LERP) {
+		return method;
+	}
+	// _test_basis() receives the flag by value, so derive it from the first result instead: if the first
+	// basis needed normalizing, the second must take the scaled path too (else a scaled basis gets plain slerp).
+	needed_normalize = (method == INTERP_SCALED_SLERP);
+	Quaternion q1;
+	method = _test_basis(p_b, needed_normalize, q1);
+	if (method == INTERP_LERP) {
+		return method;
+	}
+
+	// Are they close together?
+	// Apply the same test that will revert to lerp as is present in the slerp routine.
+	const real_t cosom = Math::abs(q0.dot(q1));
+	if ((1.0f - cosom) <= (real_t)CMP_EPSILON) {
+		return INTERP_LERP;
+	}
+
+	return method;
+}
diff --git a/core/math/transform_interpolator.h b/core/math/transform_interpolator.h
index a9bce2bd7f..cc556707e4 100644
--- a/core/math/transform_interpolator.h
+++ b/core/math/transform_interpolator.h
@@ -32,15 +32,64 @@
 #define TRANSFORM_INTERPOLATOR_H
 
 #include "core/math/math_defs.h"
+#include "core/math/vector3.h"
+
+// Keep all the functions for fixed timestep interpolation together.
+// There are two stages involved:
+// 1. Finding a method: determining which interpolation method to use between
+//    two keyframes (which are physics ticks).
+// 2. Applying that pre-determined method.
+
+// Pre-determining the method makes sense because finding it is expensive, and
+// several frames often occur between physics ticks, so doing it once per tick
+// is cheaper than doing it every frame.
 
 struct Transform2D;
+struct Transform3D;
+struct Basis;
+struct Quaternion;
 
 class TransformInterpolator {
+public:
+	enum Method {
+		INTERP_LERP, // Linear interpolation of the basis; also the fallback method.
+		INTERP_SLERP, // Spherical interpolation of the bases via quaternions.
+		INTERP_SCALED_SLERP, // Slerp of the normalized bases, with the axis lengths lerped separately.
+	};
+
 private:
-	static bool _sign(real_t p_val) { return p_val >= 0; }
+	_FORCE_INLINE_ static bool _sign(real_t p_val) { return p_val >= 0; }
+	static real_t _vec3_sum(const Vector3 &p_pt) { return p_pt.x + p_pt.y + p_pt.z; } // Sum of the three components.
+	static real_t _vec3_normalize(Vector3 &p_vec); // Normalizes in place, returns the previous length.
+	_FORCE_INLINE_ static bool _vec3_is_equal_approx(const Vector3 &p_a, const Vector3 &p_b, real_t p_tolerance) {
+		return Math::is_equal_approx(p_a.x, p_b.x, p_tolerance) && Math::is_equal_approx(p_a.y, p_b.y, p_tolerance) && Math::is_equal_approx(p_a.z, p_b.z, p_tolerance);
+	}
+	static Vector3 _basis_orthonormalize(Basis &r_basis); // Orthonormalizes in place, returns the column lengths.
+	static Method _test_basis(Basis p_basis, bool r_needed_normalize, Quaternion &r_quat); // NOTE(review): r_needed_normalize is passed by value despite its r_ prefix.
+	static Basis _basis_slerp_unchecked(Basis p_from, Basis p_to, real_t p_fraction);
+	static Quaternion _quat_slerp_unchecked(const Quaternion &p_from, const Quaternion &p_to, real_t p_fraction);
+	static Quaternion _basis_to_quat_unchecked(const Basis &p_basis);
+	static bool _basis_is_orthogonal(const Basis &p_basis, real_t p_epsilon = 0.01f);
+	static bool _basis_is_orthogonal_any_scale(const Basis &p_basis);
+
+	static void interpolate_basis_linear(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction);
+	static void interpolate_basis_scaled_slerp(Basis p_prev, Basis p_curr, Basis &r_result, real_t p_fraction);
 
 public:
 	static void interpolate_transform_2d(const Transform2D &p_prev, const Transform2D &p_curr, Transform2D &r_result, real_t p_fraction);
+
+	// Generic functions, use when you don't know what method should be used, e.g. from GDScript.
+	// These will be slower, as they call find_method() on every call.
+	static void interpolate_transform_3d(const Transform3D &p_prev, const Transform3D &p_curr, Transform3D &r_result, real_t p_fraction);
+	static void interpolate_basis(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction);
+
+	// Optimized function when you know ahead of time the method (as returned by find_method()).
+	static void interpolate_transform_3d_via_method(const Transform3D &p_prev, const Transform3D &p_curr, Transform3D &r_result, real_t p_fraction, Method p_method);
+	static void interpolate_basis_via_method(const Basis &p_prev, const Basis &p_curr, Basis &r_result, real_t p_fraction, Method p_method);
+
+	static real_t checksum_transform_3d(const Transform3D &p_transform); // Basic sum-based checksum of origin and basis.
+
+	static Method find_method(const Basis &p_a, const Basis &p_b); // Chooses the method for interpolating between two bases.
 };
 
 #endif // TRANSFORM_INTERPOLATOR_H
diff --git a/core/os/main_loop.h b/core/os/main_loop.h
index e48541d074..9c22cbaf3c 100644
--- a/core/os/main_loop.h
+++ b/core/os/main_loop.h
@@ -64,6 +64,7 @@ public:
 	virtual void initialize();
 	virtual void iteration_prepare() {}
 	virtual bool physics_process(double p_time);
+	virtual void iteration_end() {} // Does nothing by default.
 	virtual bool process(double p_time);
 	virtual void finalize();
 
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index 3d37e17ef8..cf19a1d48f 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -1639,13 +1639,43 @@ Vector<int> String::split_ints_mk(const Vector<String> &p_splitters, bool p_allo
 }
 
 String String::join(const Vector<String> &parts) const {
+	if (parts.is_empty()) {
+		return String();
+	} else if (parts.size() == 1) {
+		return parts[0]; // A single part is returned as is, no separator involved.
+	}
+
+	const int this_length = length(); // Length of the separator (this string).
+
+	int new_size = (parts.size() - 1) * this_length; // One separator between each pair of parts.
+	for (const String &part : parts) {
+		new_size += part.length();
+	}
+	new_size += 1; // Room for the terminating zero written at the end.
+
 	String ret;
-	for (int i = 0; i < parts.size(); ++i) {
-		if (i > 0) {
-			ret += *this;
+	ret.resize(new_size);
+	char32_t *ret_ptrw = ret.ptrw(); // Write cursor into the result buffer.
+	const char32_t *this_ptr = ptr();
+
+	bool first = true;
+	for (const String &part : parts) {
+		if (first) {
+			first = false; // No separator before the first part.
+		} else if (this_length) {
+			memcpy(ret_ptrw, this_ptr, this_length * sizeof(char32_t));
+			ret_ptrw += this_length;
+		}
+
+		const int part_length = part.length();
+		if (part_length) { // Empty parts contribute nothing and are not copied.
+			memcpy(ret_ptrw, part.ptr(), part_length * sizeof(char32_t));
+			ret_ptrw += part_length;
 		}
-		ret += parts[i];
 	}
+
+	*ret_ptrw = 0;
+
 	return ret;
 }
 
@@ -3149,7 +3179,7 @@ Vector<uint8_t> String::sha256_buffer() const {
 }
 
 String String::insert(int p_at_pos, const String &p_string) const {
-	if (p_at_pos < 0) {
+	if (p_string.is_empty() || p_at_pos < 0) { // Nothing to insert, or a negative position: return this string unchanged.
 		return *this;
 	}
 
@@ -3157,17 +3187,27 @@ String String::insert(int p_at_pos, const String &p_string) const {
 		p_at_pos = length();
 	}
 
-	String pre;
+	String ret;
+	ret.resize(length() + p_string.length() + 1); // Both lengths plus one slot for the terminating zero.
+	char32_t *ret_ptrw = ret.ptrw();
+	const char32_t *this_ptr = ptr();
+
 	if (p_at_pos > 0) {
-		pre = substr(0, p_at_pos);
+		memcpy(ret_ptrw, this_ptr, p_at_pos * sizeof(char32_t)); // Characters before the insertion point.
+		ret_ptrw += p_at_pos;
 	}
 
-	String post;
+	memcpy(ret_ptrw, p_string.ptr(), p_string.length() * sizeof(char32_t)); // The inserted string itself.
+	ret_ptrw += p_string.length();
+
 	if (p_at_pos < length()) {
-		post = substr(p_at_pos, length() - p_at_pos);
+		memcpy(ret_ptrw, this_ptr + p_at_pos, (length() - p_at_pos) * sizeof(char32_t)); // Characters from the insertion point onward.
+		ret_ptrw += length() - p_at_pos;
 	}
 
-	return pre + p_string + post;
+	*ret_ptrw = 0; // Terminating zero after the last copied character.
+
+	return ret;
 }
 
 String String::erase(int p_pos, int p_chars) const {
@@ -5321,6 +5361,11 @@ String String::lpad(int min_length, const String &character) const {
 //   "fish %s %d pie" % ["frog", 12]
 // In case of an error, the string returned is the error description and "error" is true.
 String String::sprintf(const Array &values, bool *error) const {
+	static const String ZERO("0");
+	static const String SPACE(" ");
+	static const String MINUS("-");
+	static const String PLUS("+");
+
 	String formatted;
 	char32_t *self = (char32_t *)get_data();
 	bool in_format = false;
@@ -5343,7 +5388,7 @@ String String::sprintf(const Array &values, bool *error) const {
 		if (in_format) { // We have % - let's see what else we get.
 			switch (c) {
 				case '%': { // Replace %% with %
-					formatted += chr(c);
+					formatted += c;
 					in_format = false;
 					break;
 				}
@@ -5393,7 +5438,7 @@ String String::sprintf(const Array &values, bool *error) const {
 
 					// Padding.
 					int pad_chars_count = (negative || show_sign) ? min_chars - 1 : min_chars;
-					String pad_char = pad_with_zeros ? String("0") : String(" ");
+					const String &pad_char = pad_with_zeros ? ZERO : SPACE;
 					if (left_justified) {
 						str = str.rpad(pad_chars_count, pad_char);
 					} else {
@@ -5402,7 +5447,7 @@ String String::sprintf(const Array &values, bool *error) const {
 
 					// Sign.
 					if (show_sign || negative) {
-						String sign_char = negative ? "-" : "+";
+						const String &sign_char = negative ? MINUS : PLUS;
 						if (left_justified) {
 							str = str.insert(0, sign_char);
 						} else {
@@ -5439,7 +5484,7 @@ String String::sprintf(const Array &values, bool *error) const {
 
 					// Padding. Leave room for sign later if required.
 					int pad_chars_count = (is_negative || show_sign) ? min_chars - 1 : min_chars;
-					String pad_char = (pad_with_zeros && is_finite) ? String("0") : String(" "); // Never pad NaN or inf with zeros
+					const String &pad_char = (pad_with_zeros && is_finite) ? ZERO : SPACE; // Never pad NaN or inf with zeros
 					if (left_justified) {
 						str = str.rpad(pad_chars_count, pad_char);
 					} else {
@@ -5448,7 +5493,7 @@ String String::sprintf(const Array &values, bool *error) const {
 
 					// Add sign if needed.
 					if (show_sign || is_negative) {
-						String sign_char = is_negative ? "-" : "+";
+						const String &sign_char = is_negative ? MINUS : PLUS;
 						if (left_justified) {
 							str = str.insert(0, sign_char);
 						} else {
@@ -5501,7 +5546,7 @@ String String::sprintf(const Array &values, bool *error) const {
 
 						// Padding. Leave room for sign later if required.
 						int pad_chars_count = val < 0 ? min_chars - 1 : min_chars;
-						String pad_char = (pad_with_zeros && is_finite) ? String("0") : String(" "); // Never pad NaN or inf with zeros
+						const String &pad_char = (pad_with_zeros && is_finite) ? ZERO : SPACE; // Never pad NaN or inf with zeros
 						if (left_justified) {
 							number_str = number_str.rpad(pad_chars_count, pad_char);
 						} else {
@@ -5511,9 +5556,9 @@ String String::sprintf(const Array &values, bool *error) const {
 						// Add sign if needed.
 						if (val < 0) {
 							if (left_justified) {
-								number_str = number_str.insert(0, "-");
+								number_str = number_str.insert(0, MINUS);
 							} else {
-								number_str = number_str.insert(pad_with_zeros ? 0 : number_str.length() - initial_len, "-");
+								number_str = number_str.insert(pad_with_zeros ? 0 : number_str.length() - initial_len, MINUS);
 							}
 						}
 
@@ -5678,7 +5723,7 @@ String String::sprintf(const Array &values, bool *error) const {
 					in_decimals = false;
 					break;
 				default:
-					formatted += chr(c);
+					formatted += c;
 			}
 		}
 	}
diff --git a/core/templates/cowdata.h b/core/templates/cowdata.h
index f22ae1f1d3..6f818956ea 100644
--- a/core/templates/cowdata.h
+++ b/core/templates/cowdata.h
@@ -222,12 +222,15 @@ public:
 	}
 
 	Error insert(Size p_pos, const T &p_val) {
-		ERR_FAIL_INDEX_V(p_pos, size() + 1, ERR_INVALID_PARAMETER);
-		resize(size() + 1);
-		for (Size i = (size() - 1); i > p_pos; i--) {
-			set(i, get(i - 1));
+		Size new_size = size() + 1; // Element count after the insertion.
+		ERR_FAIL_INDEX_V(p_pos, new_size, ERR_INVALID_PARAMETER);
+		Error err = resize(new_size);
+		ERR_FAIL_COND_V(err, err); // Return the resize error instead of ignoring it.
+		T *p = ptrw(); // Write through the raw pointer rather than per-element set()/get().
+		for (Size i = new_size - 1; i > p_pos; i--) {
+			p[i] = p[i - 1]; // Shift elements after p_pos up by one slot, back to front.
 		}
-		set(p_pos, p_val);
+		p[p_pos] = p_val;
 
 		return OK;
 	}