386 files changed, 16400 insertions, 7716 deletions
diff --git a/.github/workflows/static_checks.yml b/.github/workflows/static_checks.yml
index 74935bb618..0ed7432833 100644
--- a/.github/workflows/static_checks.yml
+++ b/.github/workflows/static_checks.yml
@@ -32,7 +32,7 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           if [ "${{ github.event_name }}" == "pull_request" ]; then
-            files=$(git diff-tree --no-commit-id --name-only -r HEAD~${{ github.event.pull_request.commits }}..HEAD 2> /dev/null || true)
+            files=$(git diff-tree --no-commit-id --name-only -r HEAD^1..HEAD 2> /dev/null || true)
           elif [ "${{ github.event_name }}" == "push" -a "${{ github.event.forced }}" == "false" -a "${{ github.event.created }}" == "false" ]; then
             files=$(git diff-tree --no-commit-id --name-only -r ${{ github.event.before }}..${{ github.event.after }} 2> /dev/null || true)
           fi
diff --git a/.gitignore b/.gitignore
index d40f4d3948..46dcf84b43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,6 +35,10 @@ bin
 compile_commands.json
 platform/windows/godot_res.res
 
+# Ninja build files
+build.ninja
+.ninja
+
 # Generated by Godot binary
 .import/
 /gdextension_interface.h
diff --git a/SConstruct b/SConstruct
index 753cea40e3..73ef420a0d 100644
--- a/SConstruct
+++ b/SConstruct
@@ -203,6 +203,7 @@ opts.Add(BoolVariable("custom_modules_recursive", "Detect custom modules recursi
 opts.Add(BoolVariable("dev_mode", "Alias for dev options: verbose=yes warnings=extra werror=yes tests=yes", False))
 opts.Add(BoolVariable("tests", "Build the unit tests", False))
 opts.Add(BoolVariable("fast_unsafe", "Enable unsafe options for faster rebuilds", False))
+opts.Add(BoolVariable("ninja", "Use the ninja backend for faster rebuilds", False))
 opts.Add(BoolVariable("compiledb", "Generate compilation DB (`compile_commands.json`) for external tools", False))
 opts.Add(BoolVariable("verbose", "Enable verbose output for the compilation", False))
 opts.Add(BoolVariable("progress", "Show a progress indicator during compilation", True))
@@ -956,7 +957,8 @@ if selected_platform in platform_list:
         env.vs_incs = []
         env.vs_srcs = []
 
-    # CompileDB
+    # CompileDB and Ninja are only available with certain SCons versions which
+    # not everybody might have yet, so we have to check.
     from SCons import __version__ as scons_raw_version
 
     scons_ver = env._get_major_minor_revision(scons_raw_version)
@@ -968,6 +970,20 @@ if selected_platform in platform_list:
         env.Tool("compilation_db")
         env.Alias("compiledb", env.CompilationDatabase())
 
+    if env["ninja"]:
+        if scons_ver < (4, 2, 0):
+            print("The `ninja=yes` option requires SCons 4.2 or later, but your version is %s." % scons_raw_version)
+            Exit(255)
+
+        SetOption("experimental", "ninja")
+
+        # By setting this we allow the user to run ninja by themselves with all
+        # the flags they need, as apparently automatically running from scons
+        # is way slower.
+        SetOption("disable_execute_ninja", True)
+
+        env.Tool("ninja")
+
     # Threads
     if env["threads"]:
         env.Append(CPPDEFINES=["THREADS_ENABLED"])
@@ -1041,9 +1057,10 @@ atexit.register(print_elapsed_time)
 
 
 def purge_flaky_files():
+    paths_to_keep = ["build.ninja"]  # The generated Ninja build file must survive a failed build.
     for build_failure in GetBuildFailures():
-        path = build_failure.node.abspath
-        if os.path.isfile(path):
+        path = build_failure.node.path
+        if os.path.isfile(path) and path not in paths_to_keep:
             os.remove(path)
 
 
diff --git a/core/SCsub b/core/SCsub
index 7edf8ea88d..f7b733a221 100644
--- a/core/SCsub
+++ b/core/SCsub
@@ -3,6 +3,7 @@
 Import("env")
 
 import core_builders
+import methods
 
 env.core_sources = []
 
@@ -35,10 +36,12 @@ if "SCRIPT_AES256_ENCRYPTION_KEY" in os.environ:
         )
         Exit(255)
 
-# NOTE: It is safe to generate this file here, since this is still executed serially
-with open("script_encryption_key.gen.cpp", "w", encoding="utf-8", newline="\n") as f:
-    f.write('#include "core/config/project_settings.h"\nuint8_t script_encryption_key[32]={' + txt + "};\n")
 
+script_encryption_key_contents = (
+    '#include "core/config/project_settings.h"\nuint8_t script_encryption_key[32]={' + txt + "};\n"
+)
+
+methods.write_file_if_needed("script_encryption_key.gen.cpp", script_encryption_key_contents)
 
 # Add required thirdparty code.
 
diff --git a/core/config/project_settings.cpp b/core/config/project_settings.cpp
index ce7fa1074b..14023c5c75 100644
--- a/core/config/project_settings.cpp
+++ b/core/config/project_settings.cpp
@@ -1523,6 +1523,8 @@ ProjectSettings::ProjectSettings() {
 	GLOBAL_DEF("debug/settings/crash_handler/message.editor",
 			String("Please include this when reporting the bug on: https://github.com/godotengine/godot/issues"));
 	GLOBAL_DEF_RST(PropertyInfo(Variant::INT, "rendering/occlusion_culling/bvh_build_quality", PROPERTY_HINT_ENUM, "Low,Medium,High"), 2);
+	GLOBAL_DEF_RST("rendering/occlusion_culling/jitter_projection", true);
+
 	GLOBAL_DEF_RST("internationalization/rendering/force_right_to_left_layout_direction", false);
 	GLOBAL_DEF_BASIC(PropertyInfo(Variant::INT, "internationalization/rendering/root_node_layout_direction", PROPERTY_HINT_ENUM, "Based on Application Locale,Left-to-Right,Right-to-Left,Based on System Locale"), 0);
 
diff --git a/core/io/image.cpp b/core/io/image.cpp
index c454f06d67..6096211cff 100644
--- a/core/io/image.cpp
+++ b/core/io/image.cpp
@@ -4114,7 +4114,7 @@ Dictionary Image::compute_image_metrics(const Ref<Image> p_compared_image, bool
 			continue;
 		}
 
-		image_metric_max = MAX(image_metric_max, i);
+		image_metric_max = i;
 
 		double x = i * hist[i];
 
diff --git a/core/io/marshalls.cpp b/core/io/marshalls.cpp
index bc2493d360..b25fcccd7f 100644
--- a/core/io/marshalls.cpp
+++ b/core/io/marshalls.cpp
@@ -30,7 +30,10 @@
 
 #include "marshalls.h"
 
+#include "core/core_string_names.h"
+#include "core/io/resource_loader.h"
 #include "core/object/ref_counted.h"
+#include "core/object/script_language.h"
 #include "core/os/keyboard.h"
 #include "core/string/print_string.h"
 
@@ -55,9 +58,22 @@ ObjectID EncodedObjectAsID::get_object_id() const {
 #define ERR_FAIL_ADD_OF(a, b, err) ERR_FAIL_COND_V(((int32_t)(b)) < 0 || ((int32_t)(a)) < 0 || ((int32_t)(a)) > INT_MAX - ((int32_t)(b)), err)
 #define ERR_FAIL_MUL_OF(a, b, err) ERR_FAIL_COND_V(((int32_t)(a)) < 0 || ((int32_t)(b)) <= 0 || ((int32_t)(a)) > INT_MAX / ((int32_t)(b)), err)
 
-#define ENCODE_MASK 0xFF
-#define ENCODE_FLAG_64 1 << 16
-#define ENCODE_FLAG_OBJECT_AS_ID 1 << 16
+// Byte 0: `Variant::Type`, byte 1: unused, bytes 2 and 3: additional data.
+#define HEADER_TYPE_MASK 0xFF
+
+// For `Variant::INT`, `Variant::FLOAT` and other math types.
+#define HEADER_DATA_FLAG_64 (1 << 16)
+
+// For `Variant::OBJECT`.
+#define HEADER_DATA_FLAG_OBJECT_AS_ID (1 << 16)
+
+// For `Variant::ARRAY`.
+// Occupies bits 16 and 17.
+#define HEADER_DATA_FIELD_TYPED_ARRAY_MASK (0b11 << 16)
+#define HEADER_DATA_FIELD_TYPED_ARRAY_NONE (0b00 << 16)
+#define HEADER_DATA_FIELD_TYPED_ARRAY_BUILTIN (0b01 << 16)
+#define HEADER_DATA_FIELD_TYPED_ARRAY_CLASS_NAME (0b10 << 16)
+#define HEADER_DATA_FIELD_TYPED_ARRAY_SCRIPT (0b11 << 16)
 
 static Error _decode_string(const uint8_t *&buf, int &len, int *r_len, String &r_string) {
 	ERR_FAIL_COND_V(len < 4, ERR_INVALID_DATA);
@@ -101,9 +117,9 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 
 	ERR_FAIL_COND_V(len < 4, ERR_INVALID_DATA);
 
-	uint32_t type = decode_uint32(buf);
+	uint32_t header = decode_uint32(buf);
 
-	ERR_FAIL_COND_V((type & ENCODE_MASK) >= Variant::VARIANT_MAX, ERR_INVALID_DATA);
+	ERR_FAIL_COND_V((header & HEADER_TYPE_MASK) >= Variant::VARIANT_MAX, ERR_INVALID_DATA);
 
 	buf += 4;
 	len -= 4;
@@ -114,7 +130,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 	// Note: We cannot use sizeof(real_t) for decoding, in case a different size is encoded.
 	// Decoding math types always checks for the encoded size, while encoding always uses compilation setting.
 	// This does lead to some code duplication for decoding, but compatibility is the priority.
-	switch (type & ENCODE_MASK) {
+	switch (header & HEADER_TYPE_MASK) {
 		case Variant::NIL: {
 			r_variant = Variant();
 		} break;
@@ -127,7 +143,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 			}
 		} break;
 		case Variant::INT: {
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V(len < 8, ERR_INVALID_DATA);
 				int64_t val = decode_uint64(buf);
 				r_variant = val;
@@ -146,7 +162,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 
 		} break;
 		case Variant::FLOAT: {
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double), ERR_INVALID_DATA);
 				double val = decode_double(buf);
 				r_variant = val;
@@ -176,7 +192,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		// math types
 		case Variant::VECTOR2: {
 			Vector2 val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 2, ERR_INVALID_DATA);
 				val.x = decode_double(&buf[0]);
 				val.y = decode_double(&buf[sizeof(double)]);
@@ -210,7 +226,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		} break;
 		case Variant::RECT2: {
 			Rect2 val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 4, ERR_INVALID_DATA);
 				val.position.x = decode_double(&buf[0]);
 				val.position.y = decode_double(&buf[sizeof(double)]);
@@ -250,7 +266,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		} break;
 		case Variant::VECTOR3: {
 			Vector3 val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 3, ERR_INVALID_DATA);
 				val.x = decode_double(&buf[0]);
 				val.y = decode_double(&buf[sizeof(double)]);
@@ -287,7 +303,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		} break;
 		case Variant::VECTOR4: {
 			Vector4 val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 4, ERR_INVALID_DATA);
 				val.x = decode_double(&buf[0]);
 				val.y = decode_double(&buf[sizeof(double)]);
@@ -327,7 +343,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		} break;
 		case Variant::TRANSFORM2D: {
 			Transform2D val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 6, ERR_INVALID_DATA);
 				for (int i = 0; i < 3; i++) {
 					for (int j = 0; j < 2; j++) {
@@ -355,7 +371,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		} break;
 		case Variant::PLANE: {
 			Plane val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 4, ERR_INVALID_DATA);
 				val.normal.x = decode_double(&buf[0]);
 				val.normal.y = decode_double(&buf[sizeof(double)]);
@@ -381,7 +397,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		} break;
 		case Variant::QUATERNION: {
 			Quaternion val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 4, ERR_INVALID_DATA);
 				val.x = decode_double(&buf[0]);
 				val.y = decode_double(&buf[sizeof(double)]);
@@ -407,7 +423,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		} break;
 		case Variant::AABB: {
 			AABB val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 6, ERR_INVALID_DATA);
 				val.position.x = decode_double(&buf[0]);
 				val.position.y = decode_double(&buf[sizeof(double)]);
@@ -437,7 +453,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		} break;
 		case Variant::BASIS: {
 			Basis val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 9, ERR_INVALID_DATA);
 				for (int i = 0; i < 3; i++) {
 					for (int j = 0; j < 3; j++) {
@@ -465,7 +481,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		} break;
 		case Variant::TRANSFORM3D: {
 			Transform3D val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 12, ERR_INVALID_DATA);
 				for (int i = 0; i < 3; i++) {
 					for (int j = 0; j < 3; j++) {
@@ -499,7 +515,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 		} break;
 		case Variant::PROJECTION: {
 			Projection val;
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_COND_V((size_t)len < sizeof(double) * 16, ERR_INVALID_DATA);
 				for (int i = 0; i < 4; i++) {
 					for (int j = 0; j < 4; j++) {
@@ -560,12 +576,12 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 
 				uint32_t namecount = strlen &= 0x7FFFFFFF;
 				uint32_t subnamecount = decode_uint32(buf + 4);
-				uint32_t flags = decode_uint32(buf + 8);
+				uint32_t np_flags = decode_uint32(buf + 8);
 
 				len -= 12;
 				buf += 12;
 
-				if (flags & 2) { // Obsolete format with property separate from subpath
+				if (np_flags & 2) { // Obsolete format with property separate from subpath.
 					subnamecount++;
 				}
 
@@ -589,7 +605,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 					}
 				}
 
-				r_variant = NodePath(names, subnames, flags & 1);
+				r_variant = NodePath(names, subnames, np_flags & 1);
 
 			} else {
 				//old format, just a string
@@ -608,8 +624,8 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 			r_variant = RID::from_uint64(id);
 		} break;
 		case Variant::OBJECT: {
-			if (type & ENCODE_FLAG_OBJECT_AS_ID) {
-				//this _is_ allowed
+			if (header & HEADER_DATA_FLAG_OBJECT_AS_ID) {
+				// This _is_ allowed.
 				ERR_FAIL_COND_V(len < 8, ERR_INVALID_DATA);
 				ObjectID val = ObjectID(decode_uint64(buf));
 				if (r_len) {
@@ -625,7 +641,6 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 
 					r_variant = obj_as_id;
 				}
-
 			} else {
 				ERR_FAIL_COND_V(!p_allow_objects, ERR_UNAUTHORIZED);
 
@@ -672,7 +687,16 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 							(*r_len) += used;
 						}
 
-						obj->set(str, value);
+						if (str == "script") {
+							ERR_FAIL_COND_V_MSG(value.get_type() != Variant::STRING, ERR_INVALID_DATA, "Invalid value for \"script\" property, expected script path as String.");
+							String path = value;
+							ERR_FAIL_COND_V_MSG(path.is_empty() || !path.begins_with("res://") || !ResourceLoader::exists(path, "Script"), ERR_INVALID_DATA, "Invalid script path: '" + path + "'.");
+							Ref<Script> script = ResourceLoader::load(path, "Script");
+							ERR_FAIL_COND_V_MSG(script.is_null(), ERR_INVALID_DATA, "Can't load script at path: '" + path + "'.");
+							obj->set_script(script);
+						} else {
+							obj->set(str, value);
+						}
 					}
 
 					if (Object::cast_to<RefCounted>(obj)) {
@@ -747,7 +771,60 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 
 		} break;
 		case Variant::ARRAY: {
+			Variant::Type builtin_type = Variant::VARIANT_MAX;
+			StringName class_name;
+			Ref<Script> script;
+
+			switch (header & HEADER_DATA_FIELD_TYPED_ARRAY_MASK) {
+				case HEADER_DATA_FIELD_TYPED_ARRAY_NONE:
+					break; // Untyped array.
+				case HEADER_DATA_FIELD_TYPED_ARRAY_BUILTIN: {
+					ERR_FAIL_COND_V(len < 4, ERR_INVALID_DATA);
+
+					int32_t bt = decode_uint32(buf);
+					buf += 4;
+					len -= 4;
+					if (r_len) {
+						(*r_len) += 4;
+					}
+
+					ERR_FAIL_INDEX_V(bt, Variant::VARIANT_MAX, ERR_INVALID_DATA);
+					builtin_type = (Variant::Type)bt;
+					ERR_FAIL_COND_V(!p_allow_objects && builtin_type == Variant::OBJECT, ERR_UNAUTHORIZED);
+				} break;
+				case HEADER_DATA_FIELD_TYPED_ARRAY_CLASS_NAME: {
+					ERR_FAIL_COND_V(!p_allow_objects, ERR_UNAUTHORIZED);
+
+					String str;
+					Error err = _decode_string(buf, len, r_len, str);
+					if (err) {
+						return err;
+					}
+
+					builtin_type = Variant::OBJECT;
+					class_name = str;
+				} break;
+				case HEADER_DATA_FIELD_TYPED_ARRAY_SCRIPT: {
+					ERR_FAIL_COND_V(!p_allow_objects, ERR_UNAUTHORIZED);
+
+					String path;
+					Error err = _decode_string(buf, len, r_len, path);
+					if (err) {
+						return err;
+					}
+					ERR_FAIL_COND_V_MSG(path.is_empty() || !path.begins_with("res://") || !ResourceLoader::exists(path, "Script"), ERR_INVALID_DATA, "Invalid script path: '" + path + "'.");
+					script = ResourceLoader::load(path, "Script");
+					ERR_FAIL_COND_V_MSG(script.is_null(), ERR_INVALID_DATA, "Can't load script at path: '" + path + "'.");
+
+					builtin_type = Variant::OBJECT;
+					class_name = script->get_instance_base_type();
+				} break;
+				default:
+					ERR_FAIL_V(ERR_INVALID_DATA); // Future proofing.
+			}
+
 			ERR_FAIL_COND_V(len < 4, ERR_INVALID_DATA);
+
 			int32_t count = decode_uint32(buf);
 			//  bool shared = count&0x80000000;
 			count &= 0x7FFFFFFF;
@@ -760,6 +837,9 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 			}
 
 			Array varr;
+			if (builtin_type != Variant::VARIANT_MAX) {
+				varr.set_typed(builtin_type, class_name, script);
+			}
 
 			for (int i = 0; i < count; i++) {
 				int used = 0;
@@ -936,7 +1016,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 
 			Vector<Vector2> varray;
 
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_MUL_OF(count, sizeof(double) * 2, ERR_INVALID_DATA);
 				ERR_FAIL_COND_V(count < 0 || count * sizeof(double) * 2 > (size_t)len, ERR_INVALID_DATA);
 
@@ -996,7 +1076,7 @@ Error decode_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int
 
 			Vector<Vector3> varray;
 
-			if (type & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				ERR_FAIL_MUL_OF(count, sizeof(double) * 3, ERR_INVALID_DATA);
 				ERR_FAIL_COND_V(count < 0 || count * sizeof(double) * 3 > (size_t)len, ERR_INVALID_DATA);
 
@@ -1122,20 +1202,20 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 
 	r_len = 0;
 
-	uint32_t flags = 0;
+	uint32_t header = p_variant.get_type();
 
 	switch (p_variant.get_type()) {
 		case Variant::INT: {
 			int64_t val = p_variant;
 			if (val > (int64_t)INT_MAX || val < (int64_t)INT_MIN) {
-				flags |= ENCODE_FLAG_64;
+				header |= HEADER_DATA_FLAG_64;
 			}
 		} break;
 		case Variant::FLOAT: {
 			double d = p_variant;
 			float f = d;
 			if (double(f) != d) {
-				flags |= ENCODE_FLAG_64;
+				header |= HEADER_DATA_FLAG_64;
 			}
 		} break;
 		case Variant::OBJECT: {
@@ -1151,7 +1231,23 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 			}
 
 			if (!p_full_objects) {
-				flags |= ENCODE_FLAG_OBJECT_AS_ID;
+				header |= HEADER_DATA_FLAG_OBJECT_AS_ID;
+			}
+		} break;
+		case Variant::ARRAY: {
+			Array array = p_variant;
+			if (array.is_typed()) {
+				Ref<Script> script = array.get_typed_script();
+				if (script.is_valid()) {
+					ERR_FAIL_COND_V(!p_full_objects, ERR_UNAVAILABLE);
+					header |= HEADER_DATA_FIELD_TYPED_ARRAY_SCRIPT;
+				} else if (array.get_typed_class_name() != StringName()) {
+					ERR_FAIL_COND_V(!p_full_objects, ERR_UNAVAILABLE);
+					header |= HEADER_DATA_FIELD_TYPED_ARRAY_CLASS_NAME;
+				} else {
+					ERR_FAIL_COND_V(!p_full_objects && array.get_typed_builtin() == Variant::OBJECT, ERR_UNAVAILABLE);
+					header |= HEADER_DATA_FIELD_TYPED_ARRAY_BUILTIN;
+				}
 			}
 		} break;
 #ifdef REAL_T_IS_DOUBLE
@@ -1168,7 +1264,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 		case Variant::BASIS:
 		case Variant::RECT2:
 		case Variant::AABB: {
-			flags |= ENCODE_FLAG_64;
+			header |= HEADER_DATA_FLAG_64;
 		} break;
 #endif // REAL_T_IS_DOUBLE
 		default: {
@@ -1176,7 +1272,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 	}
 
 	if (buf) {
-		encode_uint32(p_variant.get_type() | flags, buf);
+		encode_uint32(header, buf);
 		buf += 4;
 	}
 	r_len += 4;
@@ -1194,7 +1290,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 
 		} break;
 		case Variant::INT: {
-			if (flags & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				//64 bits
 				if (buf) {
 					encode_uint64(p_variant.operator int64_t(), buf);
@@ -1210,7 +1306,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 			}
 		} break;
 		case Variant::FLOAT: {
-			if (flags & ENCODE_FLAG_64) {
+			if (header & HEADER_DATA_FLAG_64) {
 				if (buf) {
 					encode_double(p_variant.operator double(), buf);
 				}
@@ -1523,8 +1619,21 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 
 						_encode_string(E.name, buf, r_len);
 
+						Variant value;
+
+						if (E.name == CoreStringNames::get_singleton()->_script) {
+							Ref<Script> script = obj->get_script();
+							if (script.is_valid()) {
+								String path = script->get_path();
+								ERR_FAIL_COND_V_MSG(path.is_empty() || !path.begins_with("res://"), ERR_UNAVAILABLE, "Failed to encode a path to a custom script.");
+								value = path;
+							}
+						} else {
+							value = obj->get(E.name);
+						}
+
 						int len;
-						Error err = encode_variant(obj->get(E.name), buf, len, p_full_objects, p_depth + 1);
+						Error err = encode_variant(value, buf, len, p_full_objects, p_depth + 1);
 						ERR_FAIL_COND_V(err, err);
 						ERR_FAIL_COND_V(len % 4, ERR_BUG);
 						r_len += len;
@@ -1594,24 +1703,41 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 
 		} break;
 		case Variant::ARRAY: {
-			Array v = p_variant;
+			Array array = p_variant;
+
+			if (array.is_typed()) {
+				Variant variant = array.get_typed_script();
+				Ref<Script> script = variant;
+				if (script.is_valid()) {
+					String path = script->get_path();
+					ERR_FAIL_COND_V_MSG(path.is_empty() || !path.begins_with("res://"), ERR_UNAVAILABLE, "Failed to encode a path to a custom script for an array type.");
+					_encode_string(path, buf, r_len);
+				} else if (array.get_typed_class_name() != StringName()) {
+					_encode_string(array.get_typed_class_name(), buf, r_len);
+				} else {
+					if (buf) {
+						encode_uint32(array.get_typed_builtin(), buf);
+						buf += 4;
+					}
+					r_len += 4;
+				}
+			}
 
 			if (buf) {
-				encode_uint32(uint32_t(v.size()), buf);
+				encode_uint32(uint32_t(array.size()), buf);
 				buf += 4;
 			}
-
 			r_len += 4;
 
-			for (int i = 0; i < v.size(); i++) {
+			for (int i = 0; i < array.size(); i++) {
 				int len;
-				Error err = encode_variant(v.get(i), buf, len, p_full_objects, p_depth + 1);
+				Error err = encode_variant(array.get(i), buf, len, p_full_objects, p_depth + 1);
 				ERR_FAIL_COND_V(err, err);
 				ERR_FAIL_COND_V(len % 4, ERR_BUG);
-				r_len += len;
 				if (buf) {
 					buf += len;
 				}
+				r_len += len;
 			}
 
 		} break;
diff --git a/core/math/a_star.compat.inc b/core/math/a_star.compat.inc
new file mode 100644
index 0000000000..664d7ffd5e
--- /dev/null
+++ b/core/math/a_star.compat.inc
@@ -0,0 +1,59 @@
+/**************************************************************************/
+/*  a_star.compat.inc                                                     */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#ifndef DISABLE_DEPRECATED
+
+Vector<int64_t> AStar3D::_get_id_path_bind_compat_88047(int64_t p_from_id, int64_t p_to_id) {
+	return get_id_path(p_from_id, p_to_id, false); // Two-argument form: `p_allow_partial_path` is always false.
+}
+
+Vector<Vector3> AStar3D::_get_point_path_bind_compat_88047(int64_t p_from_id, int64_t p_to_id) {
+	return get_point_path(p_from_id, p_to_id, false); // Two-argument form: `p_allow_partial_path` is always false.
+}
+
+void AStar3D::_bind_compatibility_methods() { // Binds the two-argument ("from_id", "to_id") variants as compatibility methods.
+	ClassDB::bind_compatibility_method(D_METHOD("get_id_path", "from_id", "to_id"), &AStar3D::_get_id_path_bind_compat_88047);
+	ClassDB::bind_compatibility_method(D_METHOD("get_point_path", "from_id", "to_id"), &AStar3D::_get_point_path_bind_compat_88047);
+}
+
+Vector<int64_t> AStar2D::_get_id_path_bind_compat_88047(int64_t p_from_id, int64_t p_to_id) {
+	return get_id_path(p_from_id, p_to_id, false); // Two-argument form: `p_allow_partial_path` is always false.
+}
+
+Vector<Vector2> AStar2D::_get_point_path_bind_compat_88047(int64_t p_from_id, int64_t p_to_id) {
+	return get_point_path(p_from_id, p_to_id, false); // Two-argument form: `p_allow_partial_path` is always false.
+}
+
+void AStar2D::_bind_compatibility_methods() { // Binds the two-argument ("from_id", "to_id") variants as compatibility methods.
+	ClassDB::bind_compatibility_method(D_METHOD("get_id_path", "from_id", "to_id"), &AStar2D::_get_id_path_bind_compat_88047);
+	ClassDB::bind_compatibility_method(D_METHOD("get_point_path", "from_id", "to_id"), &AStar2D::_get_point_path_bind_compat_88047);
+}
+
+#endif // DISABLE_DEPRECATED
diff --git a/core/math/a_star.cpp b/core/math/a_star.cpp
index fb54058bd9..4497604947 100644
--- a/core/math/a_star.cpp
+++ b/core/math/a_star.cpp
@@ -29,6 +29,7 @@
 /**************************************************************************/
 
 #include "a_star.h"
+#include "a_star.compat.inc"
 
 #include "core/math/geometry_3d.h"
 #include "core/object/script_language.h"
@@ -319,6 +320,7 @@ Vector3 AStar3D::get_closest_position_in_segment(const Vector3 &p_point) const {
 }
 
 bool AStar3D::_solve(Point *begin_point, Point *end_point) {
+	last_closest_point = nullptr;
 	pass++;
 
 	if (!end_point->enabled) {
@@ -332,11 +334,18 @@ bool AStar3D::_solve(Point *begin_point, Point *end_point) {
 
 	begin_point->g_score = 0;
 	begin_point->f_score = _estimate_cost(begin_point->id, end_point->id);
+	begin_point->abs_g_score = 0;
+	begin_point->abs_f_score = _estimate_cost(begin_point->id, end_point->id);
 	open_list.push_back(begin_point);
 
 	while (!open_list.is_empty()) {
 		Point *p = open_list[0]; // The currently processed point.
 
+		// Find point closer to end_point, or same distance to end_point but closer to begin_point.
+		if (last_closest_point == nullptr || last_closest_point->abs_f_score > p->abs_f_score || (last_closest_point->abs_f_score >= p->abs_f_score && last_closest_point->abs_g_score > p->abs_g_score)) {
+			last_closest_point = p;
+		}
+
 		if (p == end_point) {
 			found_route = true;
 			break;
@@ -368,6 +377,8 @@ bool AStar3D::_solve(Point *begin_point, Point *end_point) {
 			e->prev_point = p;
 			e->g_score = tentative_g_score;
 			e->f_score = e->g_score + _estimate_cost(e->id, end_point->id);
+			e->abs_g_score = tentative_g_score;
+			e->abs_f_score = e->f_score - e->g_score;
 
 			if (new_point) { // The position of the new points is already known.
 				sorter.push_heap(0, open_list.size() - 1, 0, e, open_list.ptr());
@@ -414,7 +425,7 @@ real_t AStar3D::_compute_cost(int64_t p_from_id, int64_t p_to_id) {
 	return from_point->pos.distance_to(to_point->pos);
 }
 
-Vector<Vector3> AStar3D::get_point_path(int64_t p_from_id, int64_t p_to_id) {
+Vector<Vector3> AStar3D::get_point_path(int64_t p_from_id, int64_t p_to_id, bool p_allow_partial_path) {
 	Point *a = nullptr;
 	bool from_exists = points.lookup(p_from_id, a);
 	ERR_FAIL_COND_V_MSG(!from_exists, Vector<Vector3>(), vformat("Can't get point path. Point with id: %d doesn't exist.", p_from_id));
@@ -434,7 +445,12 @@ Vector<Vector3> AStar3D::get_point_path(int64_t p_from_id, int64_t p_to_id) {
 
 	bool found_route = _solve(begin_point, end_point);
 	if (!found_route) {
-		return Vector<Vector3>();
+		if (!p_allow_partial_path || last_closest_point == nullptr) {
+			return Vector<Vector3>();
+		}
+
+		// Use closest point instead.
+		end_point = last_closest_point;
 	}
 
 	Point *p = end_point;
@@ -463,7 +479,7 @@ Vector<Vector3> AStar3D::get_point_path(int64_t p_from_id, int64_t p_to_id) {
 	return path;
 }
 
-Vector<int64_t> AStar3D::get_id_path(int64_t p_from_id, int64_t p_to_id) {
+Vector<int64_t> AStar3D::get_id_path(int64_t p_from_id, int64_t p_to_id, bool p_allow_partial_path) {
 	Point *a = nullptr;
 	bool from_exists = points.lookup(p_from_id, a);
 	ERR_FAIL_COND_V_MSG(!from_exists, Vector<int64_t>(), vformat("Can't get id path. Point with id: %d doesn't exist.", p_from_id));
@@ -483,7 +499,12 @@ Vector<int64_t> AStar3D::get_id_path(int64_t p_from_id, int64_t p_to_id) {
 
 	bool found_route = _solve(begin_point, end_point);
 	if (!found_route) {
-		return Vector<int64_t>();
+		if (!p_allow_partial_path || last_closest_point == nullptr) {
+			return Vector<int64_t>();
+		}
+
+		// Use closest point instead.
+		end_point = last_closest_point;
 	}
 
 	Point *p = end_point;
@@ -555,8 +576,8 @@ void AStar3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_closest_point", "to_position", "include_disabled"), &AStar3D::get_closest_point, DEFVAL(false));
 	ClassDB::bind_method(D_METHOD("get_closest_position_in_segment", "to_position"), &AStar3D::get_closest_position_in_segment);
 
-	ClassDB::bind_method(D_METHOD("get_point_path", "from_id", "to_id"), &AStar3D::get_point_path);
-	ClassDB::bind_method(D_METHOD("get_id_path", "from_id", "to_id"), &AStar3D::get_id_path);
+	ClassDB::bind_method(D_METHOD("get_point_path", "from_id", "to_id", "allow_partial_path"), &AStar3D::get_point_path, DEFVAL(false));
+	ClassDB::bind_method(D_METHOD("get_id_path", "from_id", "to_id", "allow_partial_path"), &AStar3D::get_id_path, DEFVAL(false));
 
 	GDVIRTUAL_BIND(_estimate_cost, "from_id", "to_id")
 	GDVIRTUAL_BIND(_compute_cost, "from_id", "to_id")
@@ -688,7 +709,7 @@ real_t AStar2D::_compute_cost(int64_t p_from_id, int64_t p_to_id) {
 	return from_point->pos.distance_to(to_point->pos);
 }
 
-Vector<Vector2> AStar2D::get_point_path(int64_t p_from_id, int64_t p_to_id) {
+Vector<Vector2> AStar2D::get_point_path(int64_t p_from_id, int64_t p_to_id, bool p_allow_partial_path) {
 	AStar3D::Point *a = nullptr;
 	bool from_exists = astar.points.lookup(p_from_id, a);
 	ERR_FAIL_COND_V_MSG(!from_exists, Vector<Vector2>(), vformat("Can't get point path. Point with id: %d doesn't exist.", p_from_id));
@@ -707,7 +728,12 @@ Vector<Vector2> AStar2D::get_point_path(int64_t p_from_id, int64_t p_to_id) {
 
 	bool found_route = _solve(begin_point, end_point);
 	if (!found_route) {
-		return Vector<Vector2>();
+		if (!p_allow_partial_path || astar.last_closest_point == nullptr) {
+			return Vector<Vector2>();
+		}
+
+		// Use closest point instead.
+		end_point = astar.last_closest_point;
 	}
 
 	AStar3D::Point *p = end_point;
@@ -736,7 +762,7 @@ Vector<Vector2> AStar2D::get_point_path(int64_t p_from_id, int64_t p_to_id) {
 	return path;
 }
 
-Vector<int64_t> AStar2D::get_id_path(int64_t p_from_id, int64_t p_to_id) {
+Vector<int64_t> AStar2D::get_id_path(int64_t p_from_id, int64_t p_to_id, bool p_allow_partial_path) {
 	AStar3D::Point *a = nullptr;
 	bool from_exists = astar.points.lookup(p_from_id, a);
 	ERR_FAIL_COND_V_MSG(!from_exists, Vector<int64_t>(), vformat("Can't get id path. Point with id: %d doesn't exist.", p_from_id));
@@ -756,7 +782,12 @@ Vector<int64_t> AStar2D::get_id_path(int64_t p_from_id, int64_t p_to_id) {
 
 	bool found_route = _solve(begin_point, end_point);
 	if (!found_route) {
-		return Vector<int64_t>();
+		if (!p_allow_partial_path || astar.last_closest_point == nullptr) {
+			return Vector<int64_t>();
+		}
+
+		// Use closest point instead.
+		end_point = astar.last_closest_point;
 	}
 
 	AStar3D::Point *p = end_point;
@@ -786,6 +817,7 @@ Vector<int64_t> AStar2D::get_id_path(int64_t p_from_id, int64_t p_to_id) {
 }
 
 bool AStar2D::_solve(AStar3D::Point *begin_point, AStar3D::Point *end_point) {
+	astar.last_closest_point = nullptr;
 	astar.pass++;
 
 	if (!end_point->enabled) {
@@ -799,11 +831,18 @@ bool AStar2D::_solve(AStar3D::Point *begin_point, AStar3D::Point *end_point) {
 
 	begin_point->g_score = 0;
 	begin_point->f_score = _estimate_cost(begin_point->id, end_point->id);
+	begin_point->abs_g_score = 0;
+	begin_point->abs_f_score = _estimate_cost(begin_point->id, end_point->id);
 	open_list.push_back(begin_point);
 
 	while (!open_list.is_empty()) {
 		AStar3D::Point *p = open_list[0]; // The currently processed point.
 
+		// Find point closer to end_point, or same distance to end_point but closer to begin_point.
+		if (astar.last_closest_point == nullptr || astar.last_closest_point->abs_f_score > p->abs_f_score || (astar.last_closest_point->abs_f_score >= p->abs_f_score && astar.last_closest_point->abs_g_score > p->abs_g_score)) {
+			astar.last_closest_point = p;
+		}
+
 		if (p == end_point) {
 			found_route = true;
 			break;
@@ -835,6 +874,8 @@ bool AStar2D::_solve(AStar3D::Point *begin_point, AStar3D::Point *end_point) {
 			e->prev_point = p;
 			e->g_score = tentative_g_score;
 			e->f_score = e->g_score + _estimate_cost(e->id, end_point->id);
+			e->abs_g_score = tentative_g_score;
+			e->abs_f_score = e->f_score - e->g_score;
 
 			if (new_point) { // The position of the new points is already known.
 				sorter.push_heap(0, open_list.size() - 1, 0, e, open_list.ptr());
@@ -874,8 +915,8 @@ void AStar2D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_closest_point", "to_position", "include_disabled"), &AStar2D::get_closest_point, DEFVAL(false));
 	ClassDB::bind_method(D_METHOD("get_closest_position_in_segment", "to_position"), &AStar2D::get_closest_position_in_segment);
 
-	ClassDB::bind_method(D_METHOD("get_point_path", "from_id", "to_id"), &AStar2D::get_point_path);
-	ClassDB::bind_method(D_METHOD("get_id_path", "from_id", "to_id"), &AStar2D::get_id_path);
+	ClassDB::bind_method(D_METHOD("get_point_path", "from_id", "to_id", "allow_partial_path"), &AStar2D::get_point_path, DEFVAL(false));
+	ClassDB::bind_method(D_METHOD("get_id_path", "from_id", "to_id", "allow_partial_path"), &AStar2D::get_id_path, DEFVAL(false));
 
 	GDVIRTUAL_BIND(_estimate_cost, "from_id", "to_id")
 	GDVIRTUAL_BIND(_compute_cost, "from_id", "to_id")
diff --git a/core/math/a_star.h b/core/math/a_star.h
index 0758500c8a..8e054c4789 100644
--- a/core/math/a_star.h
+++ b/core/math/a_star.h
@@ -60,6 +60,10 @@ class AStar3D : public RefCounted {
 		real_t f_score = 0;
 		uint64_t open_pass = 0;
 		uint64_t closed_pass = 0;
+
+		// Used for getting last_closest_point.
+		real_t abs_g_score = 0;
+		real_t abs_f_score = 0;
 	};
 
 	struct SortPoints {
@@ -109,6 +113,7 @@ class AStar3D : public RefCounted {
 
 	OAHashMap<int64_t, Point *> points;
 	HashSet<Segment, Segment> segments;
+	Point *last_closest_point = nullptr;
 
 	bool _solve(Point *begin_point, Point *end_point);
 
@@ -121,6 +126,12 @@ protected:
 	GDVIRTUAL2RC(real_t, _estimate_cost, int64_t, int64_t)
 	GDVIRTUAL2RC(real_t, _compute_cost, int64_t, int64_t)
 
+#ifndef DISABLE_DEPRECATED
+	Vector<int64_t> _get_id_path_bind_compat_88047(int64_t p_from_id, int64_t p_to_id);
+	Vector<Vector3> _get_point_path_bind_compat_88047(int64_t p_from_id, int64_t p_to_id);
+	static void _bind_compatibility_methods();
+#endif
+
 public:
 	int64_t get_available_point_id() const;
 
@@ -149,8 +160,8 @@ public:
 	int64_t get_closest_point(const Vector3 &p_point, bool p_include_disabled = false) const;
 	Vector3 get_closest_position_in_segment(const Vector3 &p_point) const;
 
-	Vector<Vector3> get_point_path(int64_t p_from_id, int64_t p_to_id);
-	Vector<int64_t> get_id_path(int64_t p_from_id, int64_t p_to_id);
+	Vector<Vector3> get_point_path(int64_t p_from_id, int64_t p_to_id, bool p_allow_partial_path = false);
+	Vector<int64_t> get_id_path(int64_t p_from_id, int64_t p_to_id, bool p_allow_partial_path = false);
 
 	AStar3D() {}
 	~AStar3D();
@@ -171,6 +182,12 @@ protected:
 	GDVIRTUAL2RC(real_t, _estimate_cost, int64_t, int64_t)
 	GDVIRTUAL2RC(real_t, _compute_cost, int64_t, int64_t)
 
+#ifndef DISABLE_DEPRECATED
+	Vector<int64_t> _get_id_path_bind_compat_88047(int64_t p_from_id, int64_t p_to_id);
+	Vector<Vector2> _get_point_path_bind_compat_88047(int64_t p_from_id, int64_t p_to_id);
+	static void _bind_compatibility_methods();
+#endif
+
 public:
 	int64_t get_available_point_id() const;
 
@@ -199,8 +216,8 @@ public:
 	int64_t get_closest_point(const Vector2 &p_point, bool p_include_disabled = false) const;
 	Vector2 get_closest_position_in_segment(const Vector2 &p_point) const;
 
-	Vector<Vector2> get_point_path(int64_t p_from_id, int64_t p_to_id);
-	Vector<int64_t> get_id_path(int64_t p_from_id, int64_t p_to_id);
+	Vector<Vector2> get_point_path(int64_t p_from_id, int64_t p_to_id, bool p_allow_partial_path = false);
+	Vector<int64_t> get_id_path(int64_t p_from_id, int64_t p_to_id, bool p_allow_partial_path = false);
 
 	AStar2D() {}
 	~AStar2D() {}
diff --git a/core/math/a_star_grid_2d.compat.inc b/core/math/a_star_grid_2d.compat.inc
new file mode 100644
index 0000000000..e7124c2477
--- /dev/null
+++ b/core/math/a_star_grid_2d.compat.inc
@@ -0,0 +1,48 @@
+/**************************************************************************/
+/*  a_star_grid_2d.compat.inc                                             */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#ifndef DISABLE_DEPRECATED
+
+#include "core/variant/typed_array.h"
+
+TypedArray<Vector2i> AStarGrid2D::_get_id_path_bind_compat_88047(const Vector2i &p_from_id, const Vector2i &p_to_id) { // Two-argument form of get_id_path(), registered in _bind_compatibility_methods().
+	return get_id_path(p_from_id, p_to_id, false); // Forwards with p_allow_partial_path = false, so an empty array is returned when no full route is found.
+}
+
+Vector<Vector2> AStarGrid2D::_get_point_path_bind_compat_88047(const Vector2i &p_from_id, const Vector2i &p_to_id) { // Two-argument form of get_point_path(), registered in _bind_compatibility_methods().
+	return get_point_path(p_from_id, p_to_id, false); // Forwards with p_allow_partial_path = false, so an empty path is returned when no full route is found.
+}
+
+void AStarGrid2D::_bind_compatibility_methods() { // Registers the two-argument get_id_path/get_point_path signatures as compatibility methods.
+	ClassDB::bind_compatibility_method(D_METHOD("get_id_path", "from_id", "to_id"), &AStarGrid2D::_get_id_path_bind_compat_88047);
+	ClassDB::bind_compatibility_method(D_METHOD("get_point_path", "from_id", "to_id"), &AStarGrid2D::_get_point_path_bind_compat_88047);
+}
+
+#endif // DISABLE_DEPRECATED
diff --git a/core/math/a_star_grid_2d.cpp b/core/math/a_star_grid_2d.cpp
index d17f465ab8..f272407869 100644
--- a/core/math/a_star_grid_2d.cpp
+++ b/core/math/a_star_grid_2d.cpp
@@ -29,6 +29,7 @@
 /**************************************************************************/
 
 #include "a_star_grid_2d.h"
+#include "a_star_grid_2d.compat.inc"
 
 #include "core/variant/typed_array.h"
 
@@ -446,6 +447,7 @@ void AStarGrid2D::_get_nbors(Point *p_point, LocalVector<Point *> &r_nbors) {
 }
 
 bool AStarGrid2D::_solve(Point *p_begin_point, Point *p_end_point) {
+	last_closest_point = nullptr;
 	pass++;
 
 	if (p_end_point->solid) {
@@ -459,12 +461,19 @@ bool AStarGrid2D::_solve(Point *p_begin_point, Point *p_end_point) {
 
 	p_begin_point->g_score = 0;
 	p_begin_point->f_score = _estimate_cost(p_begin_point->id, p_end_point->id);
+	p_begin_point->abs_g_score = 0;
+	p_begin_point->abs_f_score = _estimate_cost(p_begin_point->id, p_end_point->id);
 	open_list.push_back(p_begin_point);
 	end = p_end_point;
 
 	while (!open_list.is_empty()) {
 		Point *p = open_list[0]; // The currently processed point.
 
+		// Find point closer to end_point, or same distance to end_point but closer to begin_point.
+		if (last_closest_point == nullptr || last_closest_point->abs_f_score > p->abs_f_score || (last_closest_point->abs_f_score >= p->abs_f_score && last_closest_point->abs_g_score > p->abs_g_score)) {
+			last_closest_point = p;
+		}
+
 		if (p == p_end_point) {
 			found_route = true;
 			break;
@@ -508,6 +517,9 @@ bool AStarGrid2D::_solve(Point *p_begin_point, Point *p_end_point) {
 			e->g_score = tentative_g_score;
 			e->f_score = e->g_score + _estimate_cost(e->id, p_end_point->id);
 
+			e->abs_g_score = tentative_g_score;
+			e->abs_f_score = e->f_score - e->g_score;
+
 			if (new_point) { // The position of the new points is already known.
 				sorter.push_heap(0, open_list.size() - 1, 0, e, open_list.ptr());
 			} else {
@@ -546,7 +558,7 @@ Vector2 AStarGrid2D::get_point_position(const Vector2i &p_id) const {
 	return _get_point_unchecked(p_id)->pos;
 }
 
-Vector<Vector2> AStarGrid2D::get_point_path(const Vector2i &p_from_id, const Vector2i &p_to_id) {
+Vector<Vector2> AStarGrid2D::get_point_path(const Vector2i &p_from_id, const Vector2i &p_to_id, bool p_allow_partial_path) {
 	ERR_FAIL_COND_V_MSG(dirty, Vector<Vector2>(), "Grid is not initialized. Call the update method.");
 	ERR_FAIL_COND_V_MSG(!is_in_boundsv(p_from_id), Vector<Vector2>(), vformat("Can't get id path. Point %s out of bounds %s.", p_from_id, region));
 	ERR_FAIL_COND_V_MSG(!is_in_boundsv(p_to_id), Vector<Vector2>(), vformat("Can't get id path. Point %s out of bounds %s.", p_to_id, region));
@@ -565,7 +577,12 @@ Vector<Vector2> AStarGrid2D::get_point_path(const Vector2i &p_from_id, const Vec
 
 	bool found_route = _solve(begin_point, end_point);
 	if (!found_route) {
-		return Vector<Vector2>();
+		if (!p_allow_partial_path || last_closest_point == nullptr) {
+			return Vector<Vector2>();
+		}
+
+		// Use closest point instead.
+		end_point = last_closest_point;
 	}
 
 	Point *p = end_point;
@@ -594,7 +611,7 @@ Vector<Vector2> AStarGrid2D::get_point_path(const Vector2i &p_from_id, const Vec
 	return path;
 }
 
-TypedArray<Vector2i> AStarGrid2D::get_id_path(const Vector2i &p_from_id, const Vector2i &p_to_id) {
+TypedArray<Vector2i> AStarGrid2D::get_id_path(const Vector2i &p_from_id, const Vector2i &p_to_id, bool p_allow_partial_path) {
 	ERR_FAIL_COND_V_MSG(dirty, TypedArray<Vector2i>(), "Grid is not initialized. Call the update method.");
 	ERR_FAIL_COND_V_MSG(!is_in_boundsv(p_from_id), TypedArray<Vector2i>(), vformat("Can't get id path. Point %s out of bounds %s.", p_from_id, region));
 	ERR_FAIL_COND_V_MSG(!is_in_boundsv(p_to_id), TypedArray<Vector2i>(), vformat("Can't get id path. Point %s out of bounds %s.", p_to_id, region));
@@ -613,7 +630,12 @@ TypedArray<Vector2i> AStarGrid2D::get_id_path(const Vector2i &p_from_id, const V
 
 	bool found_route = _solve(begin_point, end_point);
 	if (!found_route) {
-		return TypedArray<Vector2i>();
+		if (!p_allow_partial_path || last_closest_point == nullptr) {
+			return TypedArray<Vector2i>();
+		}
+
+		// Use closest point instead.
+		end_point = last_closest_point;
 	}
 
 	Point *p = end_point;
@@ -672,8 +694,8 @@ void AStarGrid2D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("clear"), &AStarGrid2D::clear);
 
 	ClassDB::bind_method(D_METHOD("get_point_position", "id"), &AStarGrid2D::get_point_position);
-	ClassDB::bind_method(D_METHOD("get_point_path", "from_id", "to_id"), &AStarGrid2D::get_point_path);
-	ClassDB::bind_method(D_METHOD("get_id_path", "from_id", "to_id"), &AStarGrid2D::get_id_path);
+	ClassDB::bind_method(D_METHOD("get_point_path", "from_id", "to_id", "allow_partial_path"), &AStarGrid2D::get_point_path, DEFVAL(false));
+	ClassDB::bind_method(D_METHOD("get_id_path", "from_id", "to_id", "allow_partial_path"), &AStarGrid2D::get_id_path, DEFVAL(false));
 
 	GDVIRTUAL_BIND(_estimate_cost, "from_id", "to_id")
 	GDVIRTUAL_BIND(_compute_cost, "from_id", "to_id")
diff --git a/core/math/a_star_grid_2d.h b/core/math/a_star_grid_2d.h
index 69cb77dd3e..1a9f6dcc11 100644
--- a/core/math/a_star_grid_2d.h
+++ b/core/math/a_star_grid_2d.h
@@ -89,6 +89,10 @@ private:
 		uint64_t open_pass = 0;
 		uint64_t closed_pass = 0;
 
+		// Used for getting last_closest_point.
+		real_t abs_g_score = 0;
+		real_t abs_f_score = 0;
+
 		Point() {}
 
 		Point(const Vector2i &p_id, const Vector2 &p_pos) :
@@ -109,6 +113,7 @@ private:
 
 	LocalVector<LocalVector<Point>> points;
 	Point *end = nullptr;
+	Point *last_closest_point = nullptr;
 
 	uint64_t pass = 1;
 
@@ -152,6 +157,12 @@ protected:
 	GDVIRTUAL2RC(real_t, _estimate_cost, Vector2i, Vector2i)
 	GDVIRTUAL2RC(real_t, _compute_cost, Vector2i, Vector2i)
 
+#ifndef DISABLE_DEPRECATED
+	TypedArray<Vector2i> _get_id_path_bind_compat_88047(const Vector2i &p_from, const Vector2i &p_to);
+	Vector<Vector2> _get_point_path_bind_compat_88047(const Vector2i &p_from, const Vector2i &p_to);
+	static void _bind_compatibility_methods();
+#endif
+
 public:
 	void set_region(const Rect2i &p_region);
 	Rect2i get_region() const;
@@ -198,8 +209,8 @@ public:
 	void clear();
 
 	Vector2 get_point_position(const Vector2i &p_id) const;
-	Vector<Vector2> get_point_path(const Vector2i &p_from, const Vector2i &p_to);
-	TypedArray<Vector2i> get_id_path(const Vector2i &p_from, const Vector2i &p_to);
+	Vector<Vector2> get_point_path(const Vector2i &p_from, const Vector2i &p_to, bool p_allow_partial_path = false);
+	TypedArray<Vector2i> get_id_path(const Vector2i &p_from, const Vector2i &p_to, bool p_allow_partial_path = false);
 };
 
 VARIANT_ENUM_CAST(AStarGrid2D::DiagonalMode);
diff --git a/core/math/projection.cpp b/core/math/projection.cpp
index 9d5dc8b4d6..d3cf11f91a 100644
--- a/core/math/projection.cpp
+++ b/core/math/projection.cpp
@@ -719,7 +719,8 @@ Projection Projection::operator*(const Projection &p_matrix) const {
 	return new_matrix;
 }
 
-void Projection::set_depth_correction(bool p_flip_y) {
+void Projection::set_depth_correction(bool p_flip_y, bool p_reverse_z, bool p_remap_z) {
+	// p_remap_z is used to convert from OpenGL-style clip space (-1 to 1) to Vulkan-style clip space (0 to 1).
 	real_t *m = &columns[0][0];
 
 	m[0] = 1;
@@ -732,11 +733,11 @@ void Projection::set_depth_correction(bool p_flip_y) {
 	m[7] = 0.0;
 	m[8] = 0.0;
 	m[9] = 0.0;
-	m[10] = 0.5;
+	m[10] = p_remap_z ? (p_reverse_z ? -0.5 : 0.5) : (p_reverse_z ? -1.0 : 1.0);
 	m[11] = 0.0;
 	m[12] = 0.0;
 	m[13] = 0.0;
-	m[14] = 0.5;
+	m[14] = p_remap_z ? 0.5 : 0.0;
 	m[15] = 1.0;
 }
 
diff --git a/core/math/projection.h b/core/math/projection.h
index b98f636344..7bba9b337e 100644
--- a/core/math/projection.h
+++ b/core/math/projection.h
@@ -69,7 +69,7 @@ struct _NO_DISCARD_ Projection {
 	void set_identity();
 	void set_zero();
 	void set_light_bias();
-	void set_depth_correction(bool p_flip_y = true);
+	void set_depth_correction(bool p_flip_y = true, bool p_reverse_z = true, bool p_remap_z = true);
 
 	void set_light_atlas_rect(const Rect2 &p_rect);
 	void set_perspective(real_t p_fovy_degrees, real_t p_aspect, real_t p_z_near, real_t p_z_far, bool p_flip_fov = false);
diff --git a/core/string/translation.cpp b/core/string/translation.cpp
index 0a0052d6cb..613edd11cd 100644
--- a/core/string/translation.cpp
+++ b/core/string/translation.cpp
@@ -776,9 +776,9 @@ void TranslationServer::set_property_translation(const Ref<Translation> &p_trans
 	property_translation = p_translation;
 }
 
-StringName TranslationServer::property_translate(const StringName &p_message) const {
+StringName TranslationServer::property_translate(const StringName &p_message, const StringName &p_context) const {
 	if (property_translation.is_valid()) {
-		StringName r = property_translation->get_message(p_message);
+		StringName r = property_translation->get_message(p_message, p_context);
 		if (r) {
 			return r;
 		}
diff --git a/core/string/translation.h b/core/string/translation.h
index 470ba88232..78d6721347 100644
--- a/core/string/translation.h
+++ b/core/string/translation.h
@@ -183,7 +183,7 @@ public:
 	StringName tool_translate(const StringName &p_message, const StringName &p_context = "") const;
 	StringName tool_translate_plural(const StringName &p_message, const StringName &p_message_plural, int p_n, const StringName &p_context = "") const;
 	void set_property_translation(const Ref<Translation> &p_translation);
-	StringName property_translate(const StringName &p_message) const;
+	StringName property_translate(const StringName &p_message, const StringName &p_context = "") const;
 	void set_doc_translation(const Ref<Translation> &p_translation);
 	StringName doc_translate(const StringName &p_message, const StringName &p_context = "") const;
 	StringName doc_translate_plural(const StringName &p_message, const StringName &p_message_plural, int p_n, const StringName &p_context = "") const;
diff --git a/doc/classes/AStar2D.xml b/doc/classes/AStar2D.xml
index f10e80e048..c877b3de78 100644
--- a/doc/classes/AStar2D.xml
+++ b/doc/classes/AStar2D.xml
@@ -139,8 +139,10 @@
 			<return type="PackedInt64Array" />
 			<param index="0" name="from_id" type="int" />
 			<param index="1" name="to_id" type="int" />
+			<param index="2" name="allow_partial_path" type="bool" default="false" />
 			<description>
 				Returns an array with the IDs of the points that form the path found by AStar2D between the given points. The array is ordered from the starting point to the ending point of the path.
+				If there is no valid path to the target, and [param allow_partial_path] is [code]true[/code], returns a path to the point closest to the target that can be reached.
 				[codeblocks]
 				[gdscript]
 				var astar = AStar2D.new()
@@ -228,8 +230,10 @@
 			<return type="PackedVector2Array" />
 			<param index="0" name="from_id" type="int" />
 			<param index="1" name="to_id" type="int" />
+			<param index="2" name="allow_partial_path" type="bool" default="false" />
 			<description>
 				Returns an array with the points that are in the path found by AStar2D between the given points. The array is ordered from the starting point to the ending point of the path.
+				If there is no valid path to the target, and [param allow_partial_path] is [code]true[/code], returns a path to the point closest to the target that can be reached.
 				[b]Note:[/b] This method is not thread-safe. If called from a [Thread], it will return an empty [PackedVector2Array] and will print an error message.
 			</description>
 		</method>
diff --git a/doc/classes/AStar3D.xml b/doc/classes/AStar3D.xml
index e2afeef377..c8a80a9012 100644
--- a/doc/classes/AStar3D.xml
+++ b/doc/classes/AStar3D.xml
@@ -168,8 +168,10 @@
 			<return type="PackedInt64Array" />
 			<param index="0" name="from_id" type="int" />
 			<param index="1" name="to_id" type="int" />
+			<param index="2" name="allow_partial_path" type="bool" default="false" />
 			<description>
 				Returns an array with the IDs of the points that form the path found by AStar3D between the given points. The array is ordered from the starting point to the ending point of the path.
+				If there is no valid path to the target, and [param allow_partial_path] is [code]true[/code], returns a path to the point closest to the target that can be reached.
 				[codeblocks]
 				[gdscript]
 				var astar = AStar3D.new()
@@ -255,8 +257,10 @@
 			<return type="PackedVector3Array" />
 			<param index="0" name="from_id" type="int" />
 			<param index="1" name="to_id" type="int" />
+			<param index="2" name="allow_partial_path" type="bool" default="false" />
 			<description>
 				Returns an array with the points that are in the path found by AStar3D between the given points. The array is ordered from the starting point to the ending point of the path.
+				If there is no valid path to the target, and [param allow_partial_path] is [code]true[/code], returns a path to the point closest to the target that can be reached.
 				[b]Note:[/b] This method is not thread-safe. If called from a [Thread], it will return an empty [PackedVector3Array] and will print an error message.
 			</description>
 		</method>
diff --git a/doc/classes/AStarGrid2D.xml b/doc/classes/AStarGrid2D.xml
index 4501bec314..794e40fde9 100644
--- a/doc/classes/AStarGrid2D.xml
+++ b/doc/classes/AStarGrid2D.xml
@@ -75,16 +75,20 @@
 			<return type="Vector2i[]" />
 			<param index="0" name="from_id" type="Vector2i" />
 			<param index="1" name="to_id" type="Vector2i" />
+			<param index="2" name="allow_partial_path" type="bool" default="false" />
 			<description>
 				Returns an array with the IDs of the points that form the path found by AStar2D between the given points. The array is ordered from the starting point to the ending point of the path.
+				If there is no valid path to the target, and [param allow_partial_path] is [code]true[/code], returns a path to the point closest to the target that can be reached.
 			</description>
 		</method>
 		<method name="get_point_path">
 			<return type="PackedVector2Array" />
 			<param index="0" name="from_id" type="Vector2i" />
 			<param index="1" name="to_id" type="Vector2i" />
+			<param index="2" name="allow_partial_path" type="bool" default="false" />
 			<description>
 				Returns an array with the points that are in the path found by [AStarGrid2D] between the given points. The array is ordered from the starting point to the ending point of the path.
+				If there is no valid path to the target, and [param allow_partial_path] is [code]true[/code], returns a path to the point closest to the target that can be reached.
 				[b]Note:[/b] This method is not thread-safe. If called from a [Thread], it will return an empty [PackedVector3Array] and will print an error message.
 			</description>
 		</method>
diff --git a/doc/classes/CollisionObject3D.xml b/doc/classes/CollisionObject3D.xml
index b45a5d8c56..063c27c76a 100644
--- a/doc/classes/CollisionObject3D.xml
+++ b/doc/classes/CollisionObject3D.xml
@@ -14,11 +14,11 @@
 			<return type="void" />
 			<param index="0" name="camera" type="Camera3D" />
 			<param index="1" name="event" type="InputEvent" />
-			<param index="2" name="position" type="Vector3" />
+			<param index="2" name="event_position" type="Vector3" />
 			<param index="3" name="normal" type="Vector3" />
 			<param index="4" name="shape_idx" type="int" />
 			<description>
-				Receives unhandled [InputEvent]s. [param position] is the location in world space of the mouse pointer on the surface of the shape with index [param shape_idx] and [param normal] is the normal vector of the surface at that point. Connect to the [signal input_event] signal to easily pick up these events.
+				Receives unhandled [InputEvent]s. [param event_position] is the location in world space of the mouse pointer on the surface of the shape with index [param shape_idx] and [param normal] is the normal vector of the surface at that point. Connect to the [signal input_event] signal to easily pick up these events.
 				[b]Note:[/b] [method _input_event] requires [member input_ray_pickable] to be [code]true[/code] and at least one [member collision_layer] bit to be set.
 			</description>
 		</method>
@@ -207,11 +207,11 @@
 		<signal name="input_event">
 			<param index="0" name="camera" type="Node" />
 			<param index="1" name="event" type="InputEvent" />
-			<param index="2" name="position" type="Vector3" />
+			<param index="2" name="event_position" type="Vector3" />
 			<param index="3" name="normal" type="Vector3" />
 			<param index="4" name="shape_idx" type="int" />
 			<description>
-				Emitted when the object receives an unhandled [InputEvent]. [param position] is the location in world space of the mouse pointer on the surface of the shape with index [param shape_idx] and [param normal] is the normal vector of the surface at that point.
+				Emitted when the object receives an unhandled [InputEvent]. [param event_position] is the location in world space of the mouse pointer on the surface of the shape with index [param shape_idx] and [param normal] is the normal vector of the surface at that point.
 			</description>
 		</signal>
 		<signal name="mouse_entered">
diff --git a/doc/classes/EditorFileDialog.xml b/doc/classes/EditorFileDialog.xml
index b51341dc24..4befcf5e69 100644
--- a/doc/classes/EditorFileDialog.xml
+++ b/doc/classes/EditorFileDialog.xml
@@ -19,6 +19,16 @@
 				For example, a [param filter] of [code]"*.tscn, *.scn"[/code] and a [param description] of [code]"Scenes"[/code] results in filter text "Scenes (*.tscn, *.scn)".
 			</description>
 		</method>
+		<method name="add_option">
+			<return type="void" />
+			<param index="0" name="name" type="String" />
+			<param index="1" name="values" type="PackedStringArray" />
+			<param index="2" name="default_value_index" type="int" />
+			<description>
+				Adds an additional [OptionButton] to the file dialog. If [param values] is empty, a [CheckBox] is added instead.
+				[param default_value_index] should be the index of a value in [param values]. If [param values] is empty, it should be either [code]1[/code] (checked) or [code]0[/code] (unchecked).
+			</description>
+		</method>
 		<method name="add_side_menu">
 			<return type="void" />
 			<param index="0" name="menu" type="Control" />
@@ -40,6 +50,33 @@
 				[b]Warning:[/b] This is a required internal node, removing and freeing it may cause a crash. If you wish to hide it or any of its children, use their [member CanvasItem.visible] property.
 			</description>
 		</method>
+		<method name="get_option_default" qualifiers="const">
+			<return type="int" />
+			<param index="0" name="option" type="int" />
+			<description>
+				Returns the default value index of the [OptionButton] or [CheckBox] with index [param option].
+			</description>
+		</method>
+		<method name="get_option_name" qualifiers="const">
+			<return type="String" />
+			<param index="0" name="option" type="int" />
+			<description>
+				Returns the name of the [OptionButton] or [CheckBox] with index [param option].
+			</description>
+		</method>
+		<method name="get_option_values" qualifiers="const">
+			<return type="PackedStringArray" />
+			<param index="0" name="option" type="int" />
+			<description>
+				Returns an array of values of the [OptionButton] with index [param option].
+			</description>
+		</method>
+		<method name="get_selected_options" qualifiers="const">
+			<return type="Dictionary" />
+			<description>
+				Returns a [Dictionary] with the selected values of the additional [OptionButton]s and/or [CheckBox]es. [Dictionary] keys are names and values are selected value indices.
+			</description>
+		</method>
 		<method name="get_vbox">
 			<return type="VBoxContainer" />
 			<description>
@@ -53,6 +90,30 @@
 				Notify the [EditorFileDialog] that its view of the data is no longer accurate. Updates the view contents on next view update.
 			</description>
 		</method>
+		<method name="set_option_default">
+			<return type="void" />
+			<param index="0" name="option" type="int" />
+			<param index="1" name="default_value_index" type="int" />
+			<description>
+				Sets the default value index of the [OptionButton] or [CheckBox] with index [param option].
+			</description>
+		</method>
+		<method name="set_option_name">
+			<return type="void" />
+			<param index="0" name="option" type="int" />
+			<param index="1" name="name" type="String" />
+			<description>
+				Sets the name of the [OptionButton] or [CheckBox] with index [param option].
+			</description>
+		</method>
+		<method name="set_option_values">
+			<return type="void" />
+			<param index="0" name="option" type="int" />
+			<param index="1" name="values" type="PackedStringArray" />
+			<description>
+				Sets the option values of the [OptionButton] with index [param option].
+			</description>
+		</method>
 	</methods>
 	<members>
 		<member name="access" type="int" setter="set_access" getter="get_access" enum="EditorFileDialog.Access" default="0">
@@ -80,6 +141,9 @@
 		<member name="filters" type="PackedStringArray" setter="set_filters" getter="get_filters" default="PackedStringArray()">
 			The available file type filters. For example, this shows only [code].png[/code] and [code].gd[/code] files: [code]set_filters(PackedStringArray(["*.png ; PNG Images","*.gd ; GDScript Files"]))[/code]. Multiple file types can also be specified in a single filter. [code]"*.png, *.jpg, *.jpeg ; Supported Images"[/code] will show both PNG and JPEG files when selected.
 		</member>
+		<member name="option_count" type="int" setter="set_option_count" getter="get_option_count" default="0">
+			The number of additional [OptionButton]s and [CheckBox]es in the dialog.
+		</member>
 		<member name="show_hidden_files" type="bool" setter="set_show_hidden_files" getter="is_showing_hidden_files" default="false">
 			If [code]true[/code], hidden files and directories will be visible in the [EditorFileDialog]. This property is synchronized with [member EditorSettings.filesystem/file_dialog/show_hidden_files].
 		</member>
diff --git a/doc/classes/EditorSettings.xml b/doc/classes/EditorSettings.xml
index 48ed191db1..573171b7e1 100644
--- a/doc/classes/EditorSettings.xml
+++ b/doc/classes/EditorSettings.xml
@@ -448,6 +448,9 @@
 			The color to use for the TileMap editor's grid.
 			[b]Note:[/b] Only effective if [member editors/tiles_editor/display_grid] is [code]true[/code].
 		</member>
+		<member name="editors/tiles_editor/highlight_selected_layer" type="bool" setter="" getter="">
+			Highlight the currently selected TileMapLayer by dimming the other ones in the scene.
+		</member>
 		<member name="editors/visual_editors/category_colors/color_color" type="Color" setter="" getter="">
 			The color of a graph node's header when it belongs to the "Color" category.
 		</member>
@@ -714,6 +717,9 @@
 			If [code]true[/code], editor main menu is using embedded [MenuBar] instead of system global menu.
 			Specific to the macOS platform.
 		</member>
+		<member name="interface/editor/use_native_file_dialogs" type="bool" setter="" getter="">
+			If [code]true[/code], editor UI uses OS native file/directory selection dialogs.
+		</member>
 		<member name="interface/editor/vsync_mode" type="int" setter="" getter="">
 			Sets the V-Sync mode for the editor. Does not affect the project when run from the editor (this is controlled by [member ProjectSettings.display/window/vsync/vsync_mode]).
 			Depending on the platform and used renderer, the engine will fall back to [b]Enabled[/b] if the desired mode is not supported.
diff --git a/doc/classes/FileDialog.xml b/doc/classes/FileDialog.xml
index 9065adc0e0..dec3160ffe 100644
--- a/doc/classes/FileDialog.xml
+++ b/doc/classes/FileDialog.xml
@@ -23,9 +23,10 @@
 			<return type="void" />
 			<param index="0" name="name" type="String" />
 			<param index="1" name="values" type="PackedStringArray" />
-			<param index="2" name="index" type="int" />
+			<param index="2" name="default_value_index" type="int" />
 			<description>
 				Adds an additional [OptionButton] to the file dialog. If [param values] is empty, a [CheckBox] is added instead.
+				[param default_value_index] should be an index of the value in the [param values]. If [param values] is empty, it should be either [code]1[/code] (checked), or [code]0[/code] (unchecked).
 			</description>
 		</method>
 		<method name="clear_filters">
@@ -90,7 +91,7 @@
 		<method name="set_option_default">
 			<return type="void" />
 			<param index="0" name="option" type="int" />
-			<param index="1" name="index" type="int" />
+			<param index="1" name="default_value_index" type="int" />
 			<description>
 				Sets the default value index of the [OptionButton] or [CheckBox] with index [param option].
 			</description>
diff --git a/doc/classes/GeometryInstance3D.xml b/doc/classes/GeometryInstance3D.xml
index a93f77e324..e52a3d7683 100644
--- a/doc/classes/GeometryInstance3D.xml
+++ b/doc/classes/GeometryInstance3D.xml
@@ -102,13 +102,13 @@
 			In other words, the actual mesh will not be visible, only the shadows casted from the mesh will be.
 		</constant>
 		<constant name="GI_MODE_DISABLED" value="0" enum="GIMode">
-			Disabled global illumination mode. Use for dynamic objects that do not contribute to global illumination (such as characters). When using [VoxelGI] and SDFGI, the geometry will [i]receive[/i] indirect lighting and reflections but the geometry will not be considered in GI baking. When using [LightmapGI], the object will receive indirect lighting using lightmap probes instead of using the baked lightmap texture.
+			Disabled global illumination mode. Use for dynamic objects that do not contribute to global illumination (such as characters). When using [VoxelGI] and SDFGI, the geometry will [i]receive[/i] indirect lighting and reflections but the geometry will not be considered in GI baking.
 		</constant>
 		<constant name="GI_MODE_STATIC" value="1" enum="GIMode">
 			Baked global illumination mode. Use for static objects that contribute to global illumination (such as level geometry). This GI mode is effective when using [VoxelGI], SDFGI and [LightmapGI].
 		</constant>
 		<constant name="GI_MODE_DYNAMIC" value="2" enum="GIMode">
-			Dynamic global illumination mode. Use for dynamic objects that contribute to global illumination. This GI mode is only effective when using [VoxelGI], but it has a higher performance impact than [constant GI_MODE_STATIC]. When using other GI methods, this will act the same as [constant GI_MODE_DISABLED].
+			Dynamic global illumination mode. Use for dynamic objects that contribute to global illumination. This GI mode is only effective when using [VoxelGI], but it has a higher performance impact than [constant GI_MODE_STATIC]. When using other GI methods, this will act the same as [constant GI_MODE_DISABLED]. When using [LightmapGI], the object will receive indirect lighting using lightmap probes instead of using the baked lightmap texture.
 		</constant>
 		<constant name="LIGHTMAP_SCALE_1X" value="0" enum="LightmapScale">
 			The standard texel density for lightmapping with [LightmapGI].
diff --git a/doc/classes/GraphEdit.xml b/doc/classes/GraphEdit.xml
index a9ac47d8df..18469b00a8 100644
--- a/doc/classes/GraphEdit.xml
+++ b/doc/classes/GraphEdit.xml
@@ -413,9 +413,9 @@
 			</description>
 		</signal>
 		<signal name="popup_request">
-			<param index="0" name="position" type="Vector2" />
+			<param index="0" name="at_position" type="Vector2" />
 			<description>
-				Emitted when a popup is requested. Happens on right-clicking in the GraphEdit. [param position] is the position of the mouse pointer when the signal is sent.
+				Emitted when a popup is requested. Happens on right-clicking in the GraphEdit. [param at_position] is the position of the mouse pointer when the signal is sent.
 			</description>
 		</signal>
 		<signal name="scroll_offset_changed">
diff --git a/doc/classes/HeightMapShape3D.xml b/doc/classes/HeightMapShape3D.xml
index ba79cbc89a..7e3055b34e 100644
--- a/doc/classes/HeightMapShape3D.xml
+++ b/doc/classes/HeightMapShape3D.xml
@@ -6,6 +6,19 @@
 	<description>
 		A 3D heightmap shape, intended for use in physics. Usually used to provide a shape for a [CollisionShape3D]. This is useful for terrain, but it is limited as overhangs (such as caves) cannot be stored. Holes in a [HeightMapShape3D] are created by assigning very low values to points in the desired area.
 		[b]Performance:[/b] [HeightMapShape3D] is faster to check collisions against than [ConcavePolygonShape3D], but it is significantly slower than primitive shapes like [BoxShape3D].
+		A heightmap collision shape can also be built by using an [Image] reference:
+		[codeblocks]
+		[gdscript]
+		var heightmap_texture: Texture = ResourceLoader.load("res://heightmap_image.exr")
+		var heightmap_image: Image = heightmap_texture.get_image()
+		heightmap_image.convert(Image.FORMAT_RF)
+
+		var height_min: float = 0.0
+		var height_max: float = 10.0
+
+		update_map_data_from_image(heightmap_image, height_min, height_max)
+		[/gdscript]
+		[/codeblocks]
 	</description>
 	<tutorials>
 	</tutorials>
@@ -22,6 +35,17 @@
 				Returns the smallest height value found in [member map_data]. Recalculates only when [member map_data] changes.
 			</description>
 		</method>
+		<method name="update_map_data_from_image">
+			<return type="void" />
+			<param index="0" name="image" type="Image" />
+			<param index="1" name="height_min" type="float" />
+			<param index="2" name="height_max" type="float" />
+			<description>
+				Updates [member map_data] with data read from an [Image] reference. Automatically resizes heightmap [member map_width] and [member map_depth] to fit the full image width and height.
+				The image needs to be in either [constant Image.FORMAT_RF] (32 bit), [constant Image.FORMAT_RH] (16 bit), or [constant Image.FORMAT_R8] (8 bit).
+				Each image pixel is read in as a float on the range from [code]0.0[/code] (black pixel) to [code]1.0[/code] (white pixel). This range value gets remapped to [param height_min] and [param height_max] to form the final height value.
+			</description>
+		</method>
 	</methods>
 	<members>
 		<member name="map_data" type="PackedFloat32Array" setter="set_map_data" getter="get_map_data" default="PackedFloat32Array(0, 0, 0, 0)">
diff --git a/doc/classes/Node.xml b/doc/classes/Node.xml
index e6fdd229bf..ae6cd9596c 100644
--- a/doc/classes/Node.xml
+++ b/doc/classes/Node.xml
@@ -795,8 +795,9 @@
 			<return type="void" />
 			<param index="0" name="node" type="Node" />
 			<param index="1" name="keep_groups" type="bool" default="false" />
+			<param index="2" name="keep_children" type="bool" default="true" />
 			<description>
-				Replaces this node by the given [param node]. All children of this node are moved to [param node].
+				Replaces this node by the given [param node]. If [param keep_children] is [code]true[/code], all children of this node are moved to [param node].
 				If [param keep_groups] is [code]true[/code], the [param node] is added to the same groups that the replaced node is in (see [method add_to_group]).
 				[b]Warning:[/b] The replaced node is removed from the tree, but it is [b]not[/b] deleted. To prevent memory leaks, store a reference to the node in a variable, or use [method Object.free].
 			</description>
diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml
index 48ac96a25f..4d3e838bb1 100644
--- a/doc/classes/ProjectSettings.xml
+++ b/doc/classes/ProjectSettings.xml
@@ -2643,6 +2643,9 @@
 			The [url=https://en.wikipedia.org/wiki/Bounding_volume_hierarchy]Bounding Volume Hierarchy[/url] quality to use when rendering the occlusion culling buffer. Higher values will result in more accurate occlusion culling, at the cost of higher CPU usage. See also [member rendering/occlusion_culling/occlusion_rays_per_thread].
 			[b]Note:[/b] This property is only read when the project starts. To adjust the BVH build quality at runtime, use [method RenderingServer.viewport_set_occlusion_culling_build_quality].
 		</member>
+		<member name="rendering/occlusion_culling/jitter_projection" type="bool" setter="" getter="" default="true">
+			If [code]true[/code], the projection used for rendering the occlusion buffer will be jittered. This can help prevent objects being incorrectly culled when visible through small gaps.
+		</member>
 		<member name="rendering/occlusion_culling/occlusion_rays_per_thread" type="int" setter="" getter="" default="512">
 			The number of occlusion rays traced per CPU thread. Higher values will result in more accurate occlusion culling, at the cost of higher CPU usage. The occlusion culling buffer's pixel count is roughly equal to [code]occlusion_rays_per_thread * number_of_logical_cpu_cores[/code], so it will depend on the system's CPU. Therefore, CPUs with fewer cores will use a lower resolution to attempt keeping performance costs even across devices. See also [member rendering/occlusion_culling/bvh_build_quality].
 			[b]Note:[/b] This property is only read when the project starts. To adjust the number of occlusion rays traced per thread at runtime, use [method RenderingServer.viewport_set_occlusion_rays_per_thread].
diff --git a/doc/classes/StatusIndicator.xml b/doc/classes/StatusIndicator.xml
index 6b015c3d15..e1fcc35ad7 100644
--- a/doc/classes/StatusIndicator.xml
+++ b/doc/classes/StatusIndicator.xml
@@ -22,7 +22,7 @@
 	<signals>
 		<signal name="pressed">
 			<param index="0" name="mouse_button" type="int" />
-			<param index="1" name="position" type="Vector2i" />
+			<param index="1" name="mouse_position" type="Vector2i" />
 			<description>
 				Emitted when the status indicator is pressed.
 			</description>
diff --git a/doc/classes/TileMap.xml b/doc/classes/TileMap.xml
index fc19e1de49..bc8a1d7bf1 100644
--- a/doc/classes/TileMap.xml
+++ b/doc/classes/TileMap.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8" ?>
-<class name="TileMap" inherits="TileMapLayerGroup" keywords="gridmap" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="../class.xsd">
+<class name="TileMap" inherits="Node2D" deprecated="Use multiple [TileMapLayer] nodes instead." keywords="gridmap" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="../class.xsd">
 	<brief_description>
 		Node for 2D tile-based maps.
 	</brief_description>
@@ -89,7 +89,8 @@
 			<param index="1" name="coords" type="Vector2i" />
 			<param index="2" name="use_proxies" type="bool" default="false" />
 			<description>
-				Returns the tile alternative ID of the cell on layer [param layer] at [param coords]. If [param use_proxies] is [code]false[/code], ignores the [TileSet]'s tile proxies, returning the raw alternative identifier. See [method TileSet.map_tile_proxy].
+				Returns the tile alternative ID of the cell on layer [param layer] at [param coords].
+				If [param use_proxies] is [code]false[/code], ignores the [TileSet]'s tile proxies, returning the raw alternative identifier. See [method TileSet.map_tile_proxy].
 				If [param layer] is negative, the layers are accessed from the last one.
 			</description>
 		</method>
@@ -100,7 +101,7 @@
 			<param index="2" name="use_proxies" type="bool" default="false" />
 			<description>
 				Returns the tile atlas coordinates ID of the cell on layer [param layer] at coordinates [param coords]. Returns [code]Vector2i(-1, -1)[/code] if the cell does not exist.
-				If [param use_proxies] is [code]false[/code], ignores the [TileSet]'s tile proxies, returning the raw alternative identifier. See [method TileSet.map_tile_proxy].
+				If [param use_proxies] is [code]false[/code], ignores the [TileSet]'s tile proxies, returning the raw atlas coordinate identifier. See [method TileSet.map_tile_proxy].
 				If [param layer] is negative, the layers are accessed from the last one.
 			</description>
 		</method>
@@ -111,7 +112,7 @@
 			<param index="2" name="use_proxies" type="bool" default="false" />
 			<description>
 				Returns the tile source ID of the cell on layer [param layer] at coordinates [param coords]. Returns [code]-1[/code] if the cell does not exist.
-				If [param use_proxies] is [code]false[/code], ignores the [TileSet]'s tile proxies, returning the raw alternative identifier. See [method TileSet.map_tile_proxy].
+				If [param use_proxies] is [code]false[/code], ignores the [TileSet]'s tile proxies, returning the raw source identifier. See [method TileSet.map_tile_proxy].
 				If [param layer] is negative, the layers are accessed from the last one.
 			</description>
 		</method>
@@ -123,7 +124,6 @@
 			<description>
 				Returns the [TileData] object associated with the given cell, or [code]null[/code] if the cell does not exist or is not a [TileSetAtlasSource].
 				If [param layer] is negative, the layers are accessed from the last one.
-				If [param use_proxies] is [code]false[/code], ignores the [TileSet]'s tile proxies, returning the raw alternative identifier. See [method TileSet.map_tile_proxy].
 				[codeblock]
 				func get_clicked_tile_power():
 				    var clicked_cell = tile_map.local_to_map(tile_map.get_local_mouse_position())
@@ -133,6 +133,7 @@
 				    else:
 				        return 0
 				[/codeblock]
+				If [param use_proxies] is [code]false[/code], ignores the [TileSet]'s tile proxies. See [method TileSet.map_tile_proxy].
 			</description>
 		</method>
 		<method name="get_coords_for_body_rid">
@@ -489,6 +490,9 @@
 			The quadrant size does not apply on Y-sorted layers, as tiles are be grouped by Y position instead in that case.
 			[b]Note:[/b] As quadrants are created according to the map's coordinate system, the quadrant's "square shape" might not look like square in the TileMap's local coordinate system.
 		</member>
+		<member name="tile_set" type="TileSet" setter="set_tileset" getter="get_tileset">
+			The [TileSet] used by this [TileMap]. The textures, collisions, and additional behavior of all available tiles are stored here.
+		</member>
 	</members>
 	<signals>
 		<signal name="changed">
diff --git a/doc/classes/TileMapLayer.xml b/doc/classes/TileMapLayer.xml
new file mode 100644
index 0000000000..bc8e259599
--- /dev/null
+++ b/doc/classes/TileMapLayer.xml
@@ -0,0 +1,303 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<class name="TileMapLayer" inherits="Node2D" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="../class.xsd">
+	<brief_description>
+		Node for 2D tile-based maps.
+	</brief_description>
+	<description>
+		Node for 2D tile-based maps. A [TileMapLayer] uses a [TileSet] which contains a list of tiles which are used to create grid-based maps. Unlike the [TileMap] node, which is deprecated, [TileMapLayer] has only one layer of tiles. You can use several [TileMapLayer]s to achieve the same result as a [TileMap] node.
+		For performance reasons, all TileMap updates are batched at the end of a frame. Notably, this means that scene tiles from a [TileSetScenesCollectionSource] may be initialized after their parent. This is only queued when inside the scene tree.
+		To force an update earlier on, call [method update_internals].
+	</description>
+	<tutorials>
+	</tutorials>
+	<methods>
+		<method name="_tile_data_runtime_update" qualifiers="virtual">
+			<return type="void" />
+			<param index="0" name="coords" type="Vector2i" />
+			<param index="1" name="tile_data" type="TileData" />
+			<description>
+				Called with a [TileData] object about to be used internally by the [TileMapLayer], allowing its modification at runtime.
+				This method is only called if [method _use_tile_data_runtime_update] is implemented and returns [code]true[/code] for the given tile [param coords].
+				[b]Warning:[/b] The [param tile_data] object's sub-resources are the same as the one in the TileSet. Modifying them might impact the whole TileSet. Instead, make sure to duplicate those resources.
+				[b]Note:[/b] If the properties of [param tile_data] object should change over time, use [method notify_runtime_tile_data_update] to notify the [TileMapLayer] it needs an update.
+			</description>
+		</method>
+		<method name="_use_tile_data_runtime_update" qualifiers="virtual">
+			<return type="bool" />
+			<param index="0" name="coords" type="Vector2i" />
+			<description>
+				Should return [code]true[/code] if the tile at coordinates [param coords] requires a runtime update.
+				[b]Warning:[/b] Make sure this function only returns [code]true[/code] when needed. Any tile processed at runtime without a need for it will imply a significant performance penalty.
+				[b]Note:[/b] If the result of this function should change, use [method notify_runtime_tile_data_update] to notify the [TileMapLayer] it needs an update.
+			</description>
+		</method>
+		<method name="clear">
+			<return type="void" />
+			<description>
+				Clears all cells.
+			</description>
+		</method>
+		<method name="erase_cell">
+			<return type="void" />
+			<param index="0" name="coords" type="Vector2i" />
+			<description>
+				Erases the cell at coordinates [param coords].
+			</description>
+		</method>
+		<method name="fix_invalid_tiles">
+			<return type="void" />
+			<description>
+				Clears cells containing tiles that do not exist in the [member tile_set].
+			</description>
+		</method>
+		<method name="get_cell_alternative_tile" qualifiers="const">
+			<return type="int" />
+			<param index="0" name="coords" type="Vector2i" />
+			<description>
+				Returns the tile alternative ID of the cell at coordinates [param coords].
+			</description>
+		</method>
+		<method name="get_cell_atlas_coords" qualifiers="const">
+			<return type="Vector2i" />
+			<param index="0" name="coords" type="Vector2i" />
+			<description>
+				Returns the tile atlas coordinates ID of the cell at coordinates [param coords]. Returns [code]Vector2i(-1, -1)[/code] if the cell does not exist.
+			</description>
+		</method>
+		<method name="get_cell_source_id" qualifiers="const">
+			<return type="int" />
+			<param index="0" name="coords" type="Vector2i" />
+			<description>
+				Returns the tile source ID of the cell at coordinates [param coords]. Returns [code]-1[/code] if the cell does not exist.
+			</description>
+		</method>
+		<method name="get_cell_tile_data" qualifiers="const">
+			<return type="TileData" />
+			<param index="0" name="coords" type="Vector2i" />
+			<description>
+				Returns the [TileData] object associated with the given cell, or [code]null[/code] if the cell does not exist or is not a [TileSetAtlasSource].
+				[codeblock]
+				func get_clicked_tile_power():
+				    var clicked_cell = tile_map_layer.local_to_map(tile_map_layer.get_local_mouse_position())
+				    var data = tile_map_layer.get_cell_tile_data(clicked_cell)
+				    if data:
+				        return data.get_custom_data("power")
+				    else:
+				        return 0
+				[/codeblock]
+			</description>
+		</method>
+		<method name="get_coords_for_body_rid" qualifiers="const">
+			<return type="Vector2i" />
+			<param index="0" name="body" type="RID" />
+			<description>
+				Returns the coordinates of the tile for given physics body [RID]. Such an [RID] can be retrieved from [method KinematicCollision2D.get_collider_rid], when colliding with a tile.
+			</description>
+		</method>
+		<method name="get_navigation_map" qualifiers="const">
+			<return type="RID" />
+			<description>
+				Returns the [RID] of the [NavigationServer2D] navigation map used by this [TileMapLayer].
+				By default this returns the default [World2D] navigation map, unless a custom map was provided using [method set_navigation_map].
+			</description>
+		</method>
+		<method name="get_neighbor_cell" qualifiers="const">
+			<return type="Vector2i" />
+			<param index="0" name="coords" type="Vector2i" />
+			<param index="1" name="neighbor" type="int" enum="TileSet.CellNeighbor" />
+			<description>
+				Returns the neighboring cell to the one at coordinates [param coords], identified by the [param neighbor] direction. This method takes into account the different layouts a TileMap can take.
+			</description>
+		</method>
+		<method name="get_pattern">
+			<return type="TileMapPattern" />
+			<param index="0" name="coords_array" type="Vector2i[]" />
+			<description>
+				Creates and returns a new [TileMapPattern] from the given array of cells. See also [method set_pattern].
+			</description>
+		</method>
+		<method name="get_surrounding_cells">
+			<return type="Vector2i[]" />
+			<param index="0" name="coords" type="Vector2i" />
+			<description>
+				Returns the list of all neighboring cells to the one at [param coords].
+			</description>
+		</method>
+		<method name="get_used_cells" qualifiers="const">
+			<return type="Vector2i[]" />
+			<description>
+				Returns a [Vector2i] array with the positions of all cells containing a tile. A cell is considered empty if its source identifier equals [code]-1[/code], its atlas coordinate identifier is [code]Vector2i(-1, -1)[/code] and its alternative identifier is [code]-1[/code].
+			</description>
+		</method>
+		<method name="get_used_cells_by_id" qualifiers="const">
+			<return type="Vector2i[]" />
+			<param index="0" name="source_id" type="int" default="-1" />
+			<param index="1" name="atlas_coords" type="Vector2i" default="Vector2i(-1, -1)" />
+			<param index="2" name="alternative_tile" type="int" default="-1" />
+			<description>
+				Returns a [Vector2i] array with the positions of all cells containing a tile. Tiles may be filtered according to their source ([param source_id]), their atlas coordinates ([param atlas_coords]), or alternative id ([param alternative_tile]).
+				If a parameter has its value set to the default one, this parameter is not used to filter a cell. Thus, if all parameters have their respective default values, this method returns the same result as [method get_used_cells].
+				A cell is considered empty if its source identifier equals [code]-1[/code], its atlas coordinate identifier is [code]Vector2i(-1, -1)[/code] and its alternative identifier is [code]-1[/code].
+			</description>
+		</method>
+		<method name="get_used_rect" qualifiers="const">
+			<return type="Rect2i" />
+			<description>
+				Returns a rectangle enclosing the used (non-empty) tiles of the map.
+			</description>
+		</method>
+		<method name="has_body_rid" qualifiers="const">
+			<return type="bool" />
+			<param index="0" name="body" type="RID" />
+			<description>
+				Returns whether the provided [param body] [RID] belongs to one of this [TileMapLayer]'s cells.
+			</description>
+		</method>
+		<method name="local_to_map" qualifiers="const">
+			<return type="Vector2i" />
+			<param index="0" name="local_position" type="Vector2" />
+			<description>
+				Returns the map coordinates of the cell containing the given [param local_position]. If [param local_position] is in global coordinates, consider using [method Node2D.to_local] before passing it to this method. See also [method map_to_local].
+			</description>
+		</method>
+		<method name="map_pattern">
+			<return type="Vector2i" />
+			<param index="0" name="position_in_tilemap" type="Vector2i" />
+			<param index="1" name="coords_in_pattern" type="Vector2i" />
+			<param index="2" name="pattern" type="TileMapPattern" />
+			<description>
+				Returns for the given coordinates [param coords_in_pattern] in a [TileMapPattern] the corresponding cell coordinates if the pattern was pasted at the [param position_in_tilemap] coordinates (see [method set_pattern]). This mapping is required as in half-offset tile shapes, the mapping might not work by calculating [code]position_in_tilemap + coords_in_pattern[/code].
+			</description>
+		</method>
+		<method name="map_to_local" qualifiers="const">
+			<return type="Vector2" />
+			<param index="0" name="map_position" type="Vector2i" />
+			<description>
+				Returns the centered position of a cell in the [TileMapLayer]'s local coordinate space. To convert the returned value into global coordinates, use [method Node2D.to_global]. See also [method local_to_map].
+				[b]Note:[/b] This may not correspond to the visual position of the tile, i.e. it ignores the [member TileData.texture_origin] property of individual tiles.
+			</description>
+		</method>
+		<method name="notify_runtime_tile_data_update">
+			<return type="void" />
+			<description>
+				Notifies the [TileMapLayer] node that calls to [method _use_tile_data_runtime_update] or [method _tile_data_runtime_update] will lead to different results. This will thus trigger a [TileMapLayer] update.
+				[b]Warning:[/b] Updating the [TileMapLayer] is computationally expensive and may impact performance. Try to limit the number of calls to this function to avoid unnecessary updates.
+				[b]Note:[/b] This does not trigger a direct update of the [TileMapLayer], the update will be done at the end of the frame as usual (unless you call [method update_internals]).
+			</description>
+		</method>
+		<method name="set_cell">
+			<return type="void" />
+			<param index="0" name="coords" type="Vector2i" />
+			<param index="1" name="source_id" type="int" default="-1" />
+			<param index="2" name="atlas_coords" type="Vector2i" default="Vector2i(-1, -1)" />
+			<param index="3" name="alternative_tile" type="int" default="0" />
+			<description>
+				Sets the tile identifiers for the cell at coordinates [param coords]. Each tile of the [TileSet] is identified using three parts:
+				- The source identifier [param source_id] identifies a [TileSetSource] identifier. See [method TileSet.set_source_id],
+				- The atlas coordinate identifier [param atlas_coords] identifies the tile coordinates in the atlas (if the source is a [TileSetAtlasSource]). For [TileSetScenesCollectionSource] it should always be [code]Vector2i(0, 0)[/code],
+				- The alternative tile identifier [param alternative_tile] identifies a tile alternative in the atlas (if the source is a [TileSetAtlasSource]), and the scene for a [TileSetScenesCollectionSource].
+				If [param source_id] is set to [code]-1[/code], [param atlas_coords] to [code]Vector2i(-1, -1)[/code], or [param alternative_tile] to [code]-1[/code], the cell will be erased. An erased cell gets [b]all[/b] its identifiers automatically set to their respective invalid values, namely [code]-1[/code], [code]Vector2i(-1, -1)[/code] and [code]-1[/code].
+			</description>
+		</method>
+		<method name="set_cells_terrain_connect">
+			<return type="void" />
+			<param index="0" name="cells" type="Vector2i[]" />
+			<param index="1" name="terrain_set" type="int" />
+			<param index="2" name="terrain" type="int" />
+			<param index="3" name="ignore_empty_terrains" type="bool" default="true" />
+			<description>
+				Update all the cells in the [param cells] coordinates array so that they use the given [param terrain] for the given [param terrain_set]. If an updated cell has the same terrain as one of its neighboring cells, this function tries to join the two. This function might update neighboring tiles if needed to create correct terrain transitions.
+				If [param ignore_empty_terrains] is [code]true[/code], empty terrains will be ignored when trying to find the best fitting tile for the given terrain constraints.
+				[b]Note:[/b] To work correctly, this method requires the [TileMapLayer]'s TileSet to have terrains set up with all required terrain combinations. Otherwise, it may produce unexpected results.
+			</description>
+		</method>
+		<method name="set_cells_terrain_path">
+			<return type="void" />
+			<param index="0" name="path" type="Vector2i[]" />
+			<param index="1" name="terrain_set" type="int" />
+			<param index="2" name="terrain" type="int" />
+			<param index="3" name="ignore_empty_terrains" type="bool" default="true" />
+			<description>
+				Update all the cells in the [param path] coordinates array so that they use the given [param terrain] for the given [param terrain_set]. The function will also connect two successive cells in the path with the same terrain. This function might update neighboring tiles if needed to create correct terrain transitions.
+				If [param ignore_empty_terrains] is [code]true[/code], empty terrains will be ignored when trying to find the best fitting tile for the given terrain constraints.
+				[b]Note:[/b] To work correctly, this method requires the [TileMapLayer]'s TileSet to have terrains set up with all required terrain combinations. Otherwise, it may produce unexpected results.
+			</description>
+		</method>
+		<method name="set_navigation_map">
+			<return type="void" />
+			<param index="0" name="map" type="RID" />
+			<description>
+				Sets a custom [param map] as a [NavigationServer2D] navigation map. If not set, uses the default [World2D] navigation map instead.
+			</description>
+		</method>
+		<method name="set_pattern">
+			<return type="void" />
+			<param index="0" name="position" type="Vector2i" />
+			<param index="1" name="pattern" type="TileMapPattern" />
+			<description>
+				Pastes the [TileMapPattern] at the given [param position] in the tile map. See also [method get_pattern].
+			</description>
+		</method>
+		<method name="update_internals">
+			<return type="void" />
+			<description>
+				Triggers a direct update of the [TileMapLayer]. Usually, calling this function is not needed, as [TileMapLayer] node updates automatically when one of its properties or cells is modified.
+				However, for performance reasons, those updates are batched and delayed to the end of the frame. Calling this function will force the [TileMapLayer] to update right away instead.
+				[b]Warning:[/b] Updating the [TileMapLayer] is computationally expensive and may impact performance. Try to limit the number of updates and how many tiles they impact.
+			</description>
+		</method>
+	</methods>
+	<members>
+		<member name="collision_enabled" type="bool" setter="set_collision_enabled" getter="is_collision_enabled" default="true">
+			Enable or disable collisions.
+		</member>
+		<member name="collision_visibility_mode" type="int" setter="set_collision_visibility_mode" getter="get_collision_visibility_mode" enum="TileMapLayer.DebugVisibilityMode" default="0">
+			Show or hide the [TileMapLayer]'s collision shapes. If set to [constant DEBUG_VISIBILITY_MODE_DEFAULT], this depends on the show collision debug settings.
+		</member>
+		<member name="enabled" type="bool" setter="set_enabled" getter="is_enabled" default="true">
+			If [code]false[/code], disables this [TileMapLayer] completely (rendering, collision, navigation, scene tiles, etc.).
+		</member>
+		<member name="navigation_enabled" type="bool" setter="set_navigation_enabled" getter="is_navigation_enabled" default="true">
+			If [code]true[/code], navigation regions are enabled.
+		</member>
+		<member name="navigation_visibility_mode" type="int" setter="set_navigation_visibility_mode" getter="get_navigation_visibility_mode" enum="TileMapLayer.DebugVisibilityMode" default="0">
+			Show or hide the [TileMapLayer]'s navigation meshes. If set to [constant DEBUG_VISIBILITY_MODE_DEFAULT], this depends on the show navigation debug settings.
+		</member>
+		<member name="rendering_quadrant_size" type="int" setter="set_rendering_quadrant_size" getter="get_rendering_quadrant_size" default="16">
+			The [TileMapLayer]'s quadrant size. A quadrant is a group of tiles to be drawn together on a single canvas item, for optimization purposes. [member rendering_quadrant_size] defines the length of a square's side, in the map's coordinate system, that forms the quadrant. Thus, the default quadrant size groups together [code]16 * 16 = 256[/code] tiles.
+			The quadrant size does not apply on a Y-sorted [TileMapLayer], as tiles are grouped by Y position instead in that case.
+			[b]Note:[/b] As quadrants are created according to the map's coordinate system, the quadrant's "square shape" might not look like a square in the [TileMapLayer]'s local coordinate system.
+		</member>
+		<member name="tile_map_data" type="PackedByteArray" setter="set_tile_map_data_from_array" getter="get_tile_map_data_as_array" default="PackedByteArray(0, 0)">
+			The raw tile map data as a byte array.
+		</member>
+		<member name="tile_set" type="TileSet" setter="set_tile_set" getter="get_tile_set">
+			The [TileSet] used by this layer. The textures, collisions, and additional behavior of all available tiles are stored here.
+		</member>
+		<member name="use_kinematic_bodies" type="bool" setter="set_use_kinematic_bodies" getter="is_using_kinematic_bodies" default="false">
+			If [code]true[/code], this [TileMapLayer]'s collision shapes will be instantiated as kinematic bodies. This can be needed for moving [TileMapLayer] nodes (e.g. moving platforms).
+		</member>
+		<member name="y_sort_origin" type="int" setter="set_y_sort_origin" getter="get_y_sort_origin" default="0">
+			This Y-sort origin value is added to each tile's Y-sort origin value. This allows, for example, to fake a different height level. This can be useful for top-down view games.
+		</member>
+	</members>
+	<signals>
+		<signal name="changed">
+			<description>
+				Emitted when this [TileMapLayer]'s properties change. This includes modified cells, properties, or changes made to its assigned [TileSet].
+				[b]Note:[/b] This signal may be emitted very often when batch-modifying a [TileMapLayer]. Avoid executing complex processing in a connected function, and consider delaying it to the end of the frame instead (i.e. calling [method Object.call_deferred]).
+			</description>
+		</signal>
+	</signals>
+	<constants>
+		<constant name="DEBUG_VISIBILITY_MODE_DEFAULT" value="0" enum="DebugVisibilityMode">
+			Hide the collisions or navigation debug shapes in the editor, and use the debug settings to determine their visibility in game (i.e. [member SceneTree.debug_collisions_hint] or [member SceneTree.debug_navigation_hint]).
+		</constant>
+		<constant name="DEBUG_VISIBILITY_MODE_FORCE_HIDE" value="2" enum="DebugVisibilityMode">
+			Always hide the collisions or navigation debug shapes.
+		</constant>
+		<constant name="DEBUG_VISIBILITY_MODE_FORCE_SHOW" value="1" enum="DebugVisibilityMode">
+			Always show the collisions or navigation debug shapes.
+		</constant>
+	</constants>
+</class>
diff --git a/doc/classes/TileMapLayerGroup.xml b/doc/classes/TileMapLayerGroup.xml
deleted file mode 100644
index 3787d3bb17..0000000000
--- a/doc/classes/TileMapLayerGroup.xml
+++ /dev/null
@@ -1,17 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<class name="TileMapLayerGroup" inherits="Node2D" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="../class.xsd">
-	<brief_description>
-		Groups a set of tile map layers together, allowing them to share a provided [TileSet].
-	</brief_description>
-	<description>
-		Groups together tile map layers as part or the same map, replacing the [TileMap] node. Child layers will use this node's [member tile_set].
-		The editor also uses [TileMapLayerGroup] as a way to store which layers are selected in a given group. This allows highlighting the currently selected layers.
-	</description>
-	<tutorials>
-	</tutorials>
-	<members>
-		<member name="tile_set" type="TileSet" setter="set_tileset" getter="get_tileset">
-			The assigned [TileSet]. This TileSet will be applied to all child layers.
-		</member>
-	</members>
-</class>
diff --git a/doc/classes/Tree.xml b/doc/classes/Tree.xml
index 0f318efbd1..d95492479c 100644
--- a/doc/classes/Tree.xml
+++ b/doc/classes/Tree.xml
@@ -414,7 +414,7 @@
 			</description>
 		</signal>
 		<signal name="empty_clicked">
-			<param index="0" name="position" type="Vector2" />
+			<param index="0" name="click_position" type="Vector2" />
 			<param index="1" name="mouse_button_index" type="int" />
 			<description>
 				Emitted when a mouse button is clicked in the empty space of the tree.
@@ -442,7 +442,7 @@
 			</description>
 		</signal>
 		<signal name="item_mouse_selected">
-			<param index="0" name="position" type="Vector2" />
+			<param index="0" name="mouse_position" type="Vector2" />
 			<param index="1" name="mouse_button_index" type="int" />
 			<description>
 				Emitted when an item is selected with a mouse button.
diff --git a/drivers/gles3/rasterizer_scene_gles3.cpp b/drivers/gles3/rasterizer_scene_gles3.cpp
index 29cfa251d6..b8cc3928eb 100644
--- a/drivers/gles3/rasterizer_scene_gles3.cpp
+++ b/drivers/gles3/rasterizer_scene_gles3.cpp
@@ -823,6 +823,11 @@ void RasterizerSceneGLES3::_draw_sky(RID p_env, const Projection &p_projection,
 	} else {
 		camera = p_projection;
 	}
+
+	Projection correction;
+	correction.set_depth_correction(false, true, false);
+	camera = correction * camera;
+
 	Basis sky_transform = environment_get_sky_orientation(p_env);
 	sky_transform.invert();
 	sky_transform = sky_transform * p_transform.basis;
@@ -933,7 +938,7 @@ void RasterizerSceneGLES3::_update_sky_radiance(RID p_env, const Projection &p_p
 		Projection cm;
 		cm.set_perspective(90, 1, 0.01, 10.0);
 		Projection correction;
-		correction.columns[1][1] = -1.0;
+		correction.set_depth_correction(true, true, false);
 		cm = correction * cm;
 
 		bool success = material_storage->shaders.sky_shader.version_bind_shader(shader_data->version, SkyShaderGLES3::MODE_CUBEMAP);
@@ -1546,7 +1551,7 @@ void RasterizerSceneGLES3::_fill_render_list(RenderListType p_render_list, const
 // Needs to be called after _setup_lights so that directional_light_count is accurate.
 void RasterizerSceneGLES3::_setup_environment(const RenderDataGLES3 *p_render_data, bool p_no_fog, const Size2i &p_screen_size, bool p_flip_y, const Color &p_default_bg_color, bool p_pancake_shadows, float p_shadow_bias) {
 	Projection correction;
-	correction.columns[1][1] = p_flip_y ? -1.0 : 1.0;
+	correction.set_depth_correction(p_flip_y, true, false);
 	Projection projection = correction * p_render_data->cam_projection;
 	//store camera into ubo
 	GLES3::MaterialStorage::store_camera(projection, scene_state.ubo.projection_matrix);
@@ -1801,7 +1806,9 @@ void RasterizerSceneGLES3::_setup_lights(const RenderDataGLES3 *p_render_data, b
 					shadow_data.blend_splits = uint32_t((shadow_mode != RS::LIGHT_DIRECTIONAL_SHADOW_ORTHOGONAL) && light_storage->light_directional_get_blend_splits(base));
 					for (int j = 0; j < 4; j++) {
 						Rect2 atlas_rect = li->shadow_transform[j].atlas_rect;
-						Projection matrix = li->shadow_transform[j].camera;
+						Projection correction;
+						correction.set_depth_correction(false, true, false);
+						Projection matrix = correction * li->shadow_transform[j].camera;
 						float split = li->shadow_transform[MIN(limit, j)].split;
 
 						Projection bias;
@@ -2027,7 +2034,9 @@ void RasterizerSceneGLES3::_setup_lights(const RenderDataGLES3 *p_render_data, b
 				Projection bias;
 				bias.set_light_bias();
 
-				Projection cm = li->shadow_transform[0].camera;
+				Projection correction;
+				correction.set_depth_correction(false, true, false);
+				Projection cm = correction * li->shadow_transform[0].camera;
 				Projection shadow_mtx = bias * cm * modelview;
 				GLES3::MaterialStorage::store_camera(shadow_mtx, shadow_data.shadow_matrix);
 			}
@@ -2274,11 +2283,11 @@ void RasterizerSceneGLES3::_render_shadow_pass(RID p_light, RID p_shadow_atlas,
 	scene_state.reset_gl_state();
 	scene_state.enable_gl_depth_test(true);
 	scene_state.enable_gl_depth_draw(true);
-	glDepthFunc(GL_LESS);
+	glDepthFunc(GL_GREATER);
 
 	glColorMask(0, 0, 0, 0);
 	glDrawBuffers(0, nullptr);
-	RasterizerGLES3::clear_depth(1.0);
+	RasterizerGLES3::clear_depth(0.0);
 	if (needs_clear) {
 		glClear(GL_DEPTH_BUFFER_BIT);
 	}
@@ -2515,7 +2524,7 @@ void RasterizerSceneGLES3::render_scene(const Ref<RenderSceneBuffers> &p_render_
 			Projection projection = render_data.cam_projection;
 			if (render_data.reflection_probe.is_valid()) {
 				Projection correction;
-				correction.columns[1][1] = -1.0;
+				correction.set_depth_correction(true, true, false);
 				projection = correction * render_data.cam_projection;
 			}
 
@@ -2554,11 +2563,11 @@ void RasterizerSceneGLES3::render_scene(const Ref<RenderSceneBuffers> &p_render_
 		scene_state.enable_gl_depth_test(true);
 		scene_state.enable_gl_depth_draw(true);
 		scene_state.enable_gl_blend(false);
-		glDepthFunc(GL_LEQUAL);
+		glDepthFunc(GL_GEQUAL);
 		scene_state.enable_gl_scissor_test(false);
 
 		glColorMask(0, 0, 0, 0);
-		RasterizerGLES3::clear_depth(1.0);
+		RasterizerGLES3::clear_depth(0.0);
 		glClear(GL_DEPTH_BUFFER_BIT);
 		glDrawBuffers(0, nullptr);
 
@@ -2590,7 +2599,7 @@ void RasterizerSceneGLES3::render_scene(const Ref<RenderSceneBuffers> &p_render_
 	scene_state.enable_gl_scissor_test(false);
 	scene_state.enable_gl_depth_test(true);
 	scene_state.enable_gl_depth_draw(true);
-	glDepthFunc(GL_LEQUAL);
+	glDepthFunc(GL_GEQUAL);
 
 	{
 		GLuint db = GL_COLOR_ATTACHMENT0;
@@ -2598,7 +2607,7 @@ void RasterizerSceneGLES3::render_scene(const Ref<RenderSceneBuffers> &p_render_
 	}
 
 	if (!fb_cleared) {
-		RasterizerGLES3::clear_depth(1.0);
+		RasterizerGLES3::clear_depth(0.0);
 		glClear(GL_DEPTH_BUFFER_BIT);
 	}
 
@@ -3559,12 +3568,12 @@ void RasterizerSceneGLES3::render_particle_collider_heightfield(RID p_collider,
 	scene_state.reset_gl_state();
 	scene_state.enable_gl_depth_test(true);
 	scene_state.enable_gl_depth_draw(true);
-	glDepthFunc(GL_LESS);
+	glDepthFunc(GL_GREATER);
 
 	glDrawBuffers(0, nullptr);
 
 	glColorMask(0, 0, 0, 0);
-	RasterizerGLES3::clear_depth(1.0);
+	RasterizerGLES3::clear_depth(0.0);
 
 	glClear(GL_DEPTH_BUFFER_BIT);
 
@@ -3605,7 +3614,7 @@ void RasterizerSceneGLES3::_render_uv2(const PagedArray<RenderGeometryInstance *
 		scene_state.reset_gl_state();
 		scene_state.enable_gl_depth_test(true);
 		scene_state.enable_gl_depth_draw(true);
-		glDepthFunc(GL_LESS);
+		glDepthFunc(GL_GREATER);
 
 		TightLocalVector<GLenum> draw_buffers;
 		draw_buffers.push_back(GL_COLOR_ATTACHMENT0);
@@ -3738,7 +3747,7 @@ void RasterizerSceneGLES3::_render_buffers_debug_draw(Ref<RenderSceneBuffersGLES
 						copy_effects->copy_cube_to_rect(atlas_uv_rect);
 
 						glTexParameteri(GL_TEXTURE_CUBE_MAP, GL_TEXTURE_COMPARE_MODE, GL_COMPARE_REF_TO_TEXTURE);
-						glTexParameteri(GL_TEXTURE_CUBE_MAP, GL_TEXTURE_COMPARE_FUNC, GL_LESS);
+						glTexParameteri(GL_TEXTURE_CUBE_MAP, GL_TEXTURE_COMPARE_FUNC, GL_GREATER);
 					} else {
 						glBindTexture(GL_TEXTURE_2D, shadow_tex);
 						glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_MODE, GL_NONE);
@@ -3746,7 +3755,7 @@ void RasterizerSceneGLES3::_render_buffers_debug_draw(Ref<RenderSceneBuffersGLES
 						copy_effects->copy_to_rect(atlas_uv_rect);
 
 						glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_MODE, GL_COMPARE_REF_TO_TEXTURE);
-						glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_FUNC, GL_LESS);
+						glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_FUNC, GL_GREATER);
 					}
 				}
 			}
@@ -3782,7 +3791,7 @@ void RasterizerSceneGLES3::_render_buffers_debug_draw(Ref<RenderSceneBuffersGLES
 			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_B, GL_BLUE);
 			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_A, GL_ALPHA);
 			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_MODE, GL_COMPARE_REF_TO_TEXTURE);
-			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_FUNC, GL_LESS);
+			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_FUNC, GL_GREATER);
 			glBindTexture(GL_TEXTURE_2D, 0);
 		}
 	}
diff --git a/drivers/gles3/shaders/canvas.glsl b/drivers/gles3/shaders/canvas.glsl
index 8da7d7dc80..efddbe9ad2 100644
--- a/drivers/gles3/shaders/canvas.glsl
+++ b/drivers/gles3/shaders/canvas.glsl
@@ -160,15 +160,18 @@ void main() {
 	if (gl_VertexID % 3 == 0) {
 		vertex = read_draw_data_point_a;
 		uv = read_draw_data_uv_a;
-		color = vec4(unpackHalf2x16(read_draw_data_color_a_rg), unpackHalf2x16(read_draw_data_color_a_ba));
+		color.xy = unpackHalf2x16(read_draw_data_color_a_rg);
+		color.zw = unpackHalf2x16(read_draw_data_color_a_ba);
 	} else if (gl_VertexID % 3 == 1) {
 		vertex = read_draw_data_point_b;
 		uv = read_draw_data_uv_b;
-		color = vec4(unpackHalf2x16(read_draw_data_color_b_rg), unpackHalf2x16(read_draw_data_color_b_ba));
+		color.xy = unpackHalf2x16(read_draw_data_color_b_rg);
+		color.zw = unpackHalf2x16(read_draw_data_color_b_ba);
 	} else {
 		vertex = read_draw_data_point_c;
 		uv = read_draw_data_uv_c;
-		color = vec4(unpackHalf2x16(read_draw_data_color_c_rg), unpackHalf2x16(read_draw_data_color_c_ba));
+		color.xy = unpackHalf2x16(read_draw_data_color_c_rg);
+		color.zw = unpackHalf2x16(read_draw_data_color_c_ba);
 	}
 
 #elif defined(USE_ATTRIBUTES)
@@ -178,11 +181,14 @@ void main() {
 
 #ifdef USE_INSTANCING
 	if (bool(read_draw_data_flags & FLAGS_INSTANCING_HAS_COLORS)) {
-		vec4 instance_color = vec4(unpackHalf2x16(instance_color_custom_data.x), unpackHalf2x16(instance_color_custom_data.y));
+		vec4 instance_color;
+		instance_color.xy = unpackHalf2x16(uint(instance_color_custom_data.x));
+		instance_color.zw = unpackHalf2x16(uint(instance_color_custom_data.y));
 		color *= instance_color;
 	}
 	if (bool(read_draw_data_flags & FLAGS_INSTANCING_HAS_CUSTOM_DATA)) {
-		instance_custom = vec4(unpackHalf2x16(instance_color_custom_data.z), unpackHalf2x16(instance_color_custom_data.w));
+		instance_custom.xy = unpackHalf2x16(instance_color_custom_data.z);
+		instance_custom.zw = unpackHalf2x16(instance_color_custom_data.w);
 	}
 #endif // !USE_INSTANCING
 
diff --git a/drivers/gles3/shaders/cube_to_dp.glsl b/drivers/gles3/shaders/cube_to_dp.glsl
index 2384529a89..ec1982738a 100644
--- a/drivers/gles3/shaders/cube_to_dp.glsl
+++ b/drivers/gles3/shaders/cube_to_dp.glsl
@@ -95,6 +95,6 @@ void main() {
 	float depth_fix = 1.0 / dot(normal, unorm);
 
 	depth = 2.0 * depth - 1.0;
-	float linear_depth = 2.0 * z_near * z_far / (z_far + z_near - depth * (z_far - z_near));
-	gl_FragDepth = (linear_depth * depth_fix + bias) / z_far;
+	float linear_depth = 2.0 * z_near * z_far / (z_far + z_near + depth * (z_far - z_near));
+	gl_FragDepth = (z_far - (linear_depth * depth_fix + bias)) / z_far;
 }
diff --git a/drivers/gles3/shaders/particles.glsl b/drivers/gles3/shaders/particles.glsl
index d95f7f4309..096f0a57ae 100644
--- a/drivers/gles3/shaders/particles.glsl
+++ b/drivers/gles3/shaders/particles.glsl
@@ -339,7 +339,8 @@ void main() {
 				amount = max(0.0, 1.0 - d);
 			} else if (attractors[i].type == ATTRACTOR_TYPE_VECTOR_FIELD) {
 			}
-			amount = pow(amount, attractors[i].attenuation);
+			mediump float attractor_attenuation = attractors[i].attenuation;
+			amount = pow(amount, attractor_attenuation);
 			dir = safe_normalize(mix(dir, attractors[i].transform[2].xyz, attractors[i].directionality));
 			attractor_force -= amount * dir * attractors[i].strength;
 		}
diff --git a/drivers/gles3/shaders/particles_copy.glsl b/drivers/gles3/shaders/particles_copy.glsl
index 0bb8efc52d..55b5e6d7ce 100644
--- a/drivers/gles3/shaders/particles_copy.glsl
+++ b/drivers/gles3/shaders/particles_copy.glsl
@@ -57,45 +57,39 @@ void main() {
 		txform = transpose(mat4(xform_1, xform_2, vec4(0.0, 0.0, 1.0, 0.0), vec4(0.0, 0.0, 0.0, 1.0)));
 #endif
 
-		switch (align_mode) {
-			case TRANSFORM_ALIGN_DISABLED: {
-			} break; //nothing
-			case TRANSFORM_ALIGN_Z_BILLBOARD: {
-				mat3 local = mat3(normalize(cross(align_up, sort_direction)), align_up, sort_direction);
-				local = local * mat3(txform);
-				txform[0].xyz = local[0];
-				txform[1].xyz = local[1];
-				txform[2].xyz = local[2];
-
-			} break;
-			case TRANSFORM_ALIGN_Y_TO_VELOCITY: {
-				vec3 v = velocity_flags.xyz;
-				float s = (length(txform[0]) + length(txform[1]) + length(txform[2])) / 3.0;
-				if (length(v) > 0.0) {
-					txform[1].xyz = normalize(v);
-				} else {
-					txform[1].xyz = normalize(txform[1].xyz);
-				}
-
-				txform[0].xyz = normalize(cross(txform[1].xyz, txform[2].xyz));
-				txform[2].xyz = vec3(0.0, 0.0, 1.0) * s;
-				txform[0].xyz *= s;
-				txform[1].xyz *= s;
-			} break;
-			case TRANSFORM_ALIGN_Z_BILLBOARD_Y_TO_VELOCITY: {
-				vec3 sv = velocity_flags.xyz - sort_direction * dot(sort_direction, velocity_flags.xyz); //screen velocity
-
-				if (length(sv) == 0.0) {
-					sv = align_up;
-				}
-
-				sv = normalize(sv);
-
-				txform[0].xyz = normalize(cross(sv, sort_direction)) * length(txform[0]);
-				txform[1].xyz = sv * length(txform[1]);
-				txform[2].xyz = sort_direction * length(txform[2]);
-
-			} break;
+		if (align_mode == TRANSFORM_ALIGN_DISABLED) {
+			// nothing
+		} else if (align_mode == TRANSFORM_ALIGN_Z_BILLBOARD) {
+			mat3 local = mat3(normalize(cross(align_up, sort_direction)), align_up, sort_direction);
+			local = local * mat3(txform);
+			txform[0].xyz = local[0];
+			txform[1].xyz = local[1];
+			txform[2].xyz = local[2];
+		} else if (align_mode == TRANSFORM_ALIGN_Y_TO_VELOCITY) {
+			vec3 v = velocity_flags.xyz;
+			float s = (length(txform[0]) + length(txform[1]) + length(txform[2])) / 3.0;
+			if (length(v) > 0.0) {
+				txform[1].xyz = normalize(v);
+			} else {
+				txform[1].xyz = normalize(txform[1].xyz);
+			}
+
+			txform[0].xyz = normalize(cross(txform[1].xyz, txform[2].xyz));
+			txform[2].xyz = vec3(0.0, 0.0, 1.0) * s;
+			txform[0].xyz *= s;
+			txform[1].xyz *= s;
+		} else if (align_mode == TRANSFORM_ALIGN_Z_BILLBOARD_Y_TO_VELOCITY) {
+			vec3 sv = velocity_flags.xyz - sort_direction * dot(sort_direction, velocity_flags.xyz); //screen velocity
+
+			if (length(sv) == 0.0) {
+				sv = align_up;
+			}
+
+			sv = normalize(sv);
+
+			txform[0].xyz = normalize(cross(sv, sort_direction)) * length(txform[0]);
+			txform[1].xyz = sv * length(txform[1]);
+			txform[2].xyz = sort_direction * length(txform[2]);
 		}
 
 		txform[3].xyz += velocity_flags.xyz * frame_remainder;
@@ -108,7 +102,10 @@ void main() {
 	}
 	txform = transpose(txform);
 
-	instance_color_custom_data = uvec4(packHalf2x16(color.xy), packHalf2x16(color.zw), packHalf2x16(custom.xy), packHalf2x16(custom.zw));
+	instance_color_custom_data.x = packHalf2x16(color.xy);
+	instance_color_custom_data.y = packHalf2x16(color.zw);
+	instance_color_custom_data.z = packHalf2x16(custom.xy);
+	instance_color_custom_data.w = packHalf2x16(custom.zw);
 	out_xform_1 = txform[0];
 	out_xform_2 = txform[1];
 #ifdef MODE_3D
diff --git a/drivers/gles3/shaders/scene.glsl b/drivers/gles3/shaders/scene.glsl
index d73407d674..8bf844991d 100644
--- a/drivers/gles3/shaders/scene.glsl
+++ b/drivers/gles3/shaders/scene.glsl
@@ -366,7 +366,9 @@ void main() {
 #if defined(COLOR_USED)
 	color_interp = color_attrib;
 #ifdef USE_INSTANCING
-	vec4 instance_color = vec4(unpackHalf2x16(instance_color_custom_data.x), unpackHalf2x16(instance_color_custom_data.y));
+	vec4 instance_color;
+	instance_color.xy = unpackHalf2x16(instance_color_custom_data.x);
+	instance_color.zw = unpackHalf2x16(instance_color_custom_data.y);
 	color_interp *= instance_color;
 #endif
 #endif
@@ -403,7 +405,9 @@ void main() {
 #endif //USE_MULTIVIEW
 
 #ifdef USE_INSTANCING
-	vec4 instance_custom = vec4(unpackHalf2x16(instance_color_custom_data.z), unpackHalf2x16(instance_color_custom_data.w));
+	vec4 instance_custom;
+	instance_custom.xy = unpackHalf2x16(instance_color_custom_data.z);
+	instance_custom.zw = unpackHalf2x16(instance_color_custom_data.w);
 #else
 	vec4 instance_custom = vec4(0.0);
 #endif
@@ -1710,7 +1714,7 @@ void main() {
 #ifdef MODE_RENDER_DEPTH
 #ifdef RENDER_SHADOWS_LINEAR
 	// Linearize the depth buffer if rendering cubemap shadows.
-	gl_FragDepth = (length(vertex) + scene_data.shadow_bias) / scene_data.z_far;
+	gl_FragDepth = (scene_data.z_far - (length(vertex) + scene_data.shadow_bias)) / scene_data.z_far;
 #endif
 
 // Nothing happens, so a tree-ssa optimizer will result in no fragment shader :)
@@ -1747,7 +1751,8 @@ void main() {
 #endif //!MODE_UNSHADED
 
 #ifndef FOG_DISABLED
-	fog = vec4(unpackHalf2x16(fog_rg), unpackHalf2x16(fog_ba));
+	fog.xy = unpackHalf2x16(fog_rg);
+	fog.zw = unpackHalf2x16(fog_ba);
 
 #ifndef DISABLE_FOG
 	if (scene_data.fog_enabled) {
@@ -1914,7 +1919,7 @@ void main() {
 	float omni_shadow = 1.0f;
 #ifndef SHADOWS_DISABLED
 	vec3 light_ray = ((positional_shadows[positional_shadow_index].shadow_matrix * vec4(shadow_coord.xyz, 1.0))).xyz;
-	omni_shadow = texture(omni_shadow_texture, vec4(light_ray, length(light_ray) * omni_lights[omni_light_index].inv_radius));
+	omni_shadow = texture(omni_shadow_texture, vec4(light_ray, 1.0 - length(light_ray) * omni_lights[omni_light_index].inv_radius));
 	omni_shadow = mix(1.0, omni_shadow, omni_lights[omni_light_index].shadow_opacity);
 #endif // SHADOWS_DISABLED
 	light_process_omni(omni_light_index, vertex, view, normal, f0, roughness, metallic, omni_shadow, albedo, alpha,
@@ -1964,7 +1969,8 @@ void main() {
 	vec3 additive_light_color = diffuse_light + specular_light;
 
 #ifndef FOG_DISABLED
-	fog = vec4(unpackHalf2x16(fog_rg), unpackHalf2x16(fog_ba));
+	fog.xy = unpackHalf2x16(fog_rg);
+	fog.zw = unpackHalf2x16(fog_ba);
 
 #ifndef DISABLE_FOG
 	if (scene_data.fog_enabled) {
diff --git a/drivers/gles3/shaders/sky.glsl b/drivers/gles3/shaders/sky.glsl
index b10ea12e6e..6c33bf7123 100644
--- a/drivers/gles3/shaders/sky.glsl
+++ b/drivers/gles3/shaders/sky.glsl
@@ -28,7 +28,7 @@ void main() {
 	// We're doing clockwise culling so flip the order
 	uv_interp = vec2(vertex_attrib.x, vertex_attrib.y * -1.0);
 #endif
-	gl_Position = vec4(uv_interp, 1.0, 1.0);
+	gl_Position = vec4(uv_interp, -1.0, 1.0);
 }
 
 /* clang-format off */
@@ -139,9 +139,11 @@ void main() {
 	vec3 cube_normal;
 #ifdef USE_MULTIVIEW
 	// In multiview our projection matrices will contain positional and rotational offsets that we need to properly unproject.
-	vec4 unproject = vec4(uv_interp.x, uv_interp.y, 1.0, 1.0);
+	vec4 unproject = vec4(uv_interp.xy, -1.0, 1.0); // unproject at the far plane
 	vec4 unprojected = multiview_data.inv_projection_matrix_view[ViewIndex] * unproject;
 	cube_normal = unprojected.xyz / unprojected.w;
+
+	// Unproject will give us the position between the eyes, need to re-offset.
 	cube_normal += multiview_data.eye_offset[ViewIndex].xyz;
 #else
 	cube_normal.z = -1.0;
diff --git a/drivers/gles3/shaders/stdlib_inc.glsl b/drivers/gles3/shaders/stdlib_inc.glsl
index 92bf2d87e4..029084c34c 100644
--- a/drivers/gles3/shaders/stdlib_inc.glsl
+++ b/drivers/gles3/shaders/stdlib_inc.glsl
@@ -1,5 +1,12 @@
 
-#ifdef USE_GLES_OVER_GL
+// Compatibility renames. These are exposed with the "godot_" prefix
+// to work around two distinct Adreno bugs:
+// 1. Some Adreno devices expose ES310 functions in ES300 shaders.
+//    Internally, we must use the "godot_" prefix, but user shaders
+//    will be mapped automatically.
+// 2. Adreno 3XX devices have poor implementations of the other packing
+//    functions, so we just use our own everywhere to keep it simple.
+
 // Floating point pack/unpack functions are part of the GLSL ES 300 specification used by web and mobile.
 uint float2half(uint f) {
 	uint e = f & uint(0x7f800000);
@@ -17,40 +24,34 @@ uint half2float(uint h) {
 	return ((h & uint(0x8000)) << uint(16)) | uint((h_e >> uint(10)) != uint(0)) * (((h_e + uint(0x1c000)) << uint(13)) | ((h & uint(0x03ff)) << uint(13)));
 }
 
-uint packHalf2x16(vec2 v) {
+uint godot_packHalf2x16(vec2 v) {
 	return float2half(floatBitsToUint(v.x)) | float2half(floatBitsToUint(v.y)) << uint(16);
 }
 
-vec2 unpackHalf2x16(uint v) {
+vec2 godot_unpackHalf2x16(uint v) {
 	return vec2(uintBitsToFloat(half2float(v & uint(0xffff))),
 			uintBitsToFloat(half2float(v >> uint(16))));
 }
 
-uint packUnorm2x16(vec2 v) {
+uint godot_packUnorm2x16(vec2 v) {
 	uvec2 uv = uvec2(round(clamp(v, vec2(0.0), vec2(1.0)) * 65535.0));
 	return uv.x | uv.y << uint(16);
 }
 
-vec2 unpackUnorm2x16(uint p) {
+vec2 godot_unpackUnorm2x16(uint p) {
 	return vec2(float(p & uint(0xffff)), float(p >> uint(16))) * 0.000015259021; // 1.0 / 65535.0 optimization
 }
 
-uint packSnorm2x16(vec2 v) {
+uint godot_packSnorm2x16(vec2 v) {
 	uvec2 uv = uvec2(round(clamp(v, vec2(-1.0), vec2(1.0)) * 32767.0) + 32767.0);
 	return uv.x | uv.y << uint(16);
 }
 
-vec2 unpackSnorm2x16(uint p) {
+vec2 godot_unpackSnorm2x16(uint p) {
 	vec2 v = vec2(float(p & uint(0xffff)), float(p >> uint(16)));
 	return clamp((v - 32767.0) * vec2(0.00003051851), vec2(-1.0), vec2(1.0));
 }
 
-#endif
-
-// Compatibility renames. These are exposed with the "godot_" prefix
-// to work around an Adreno bug which was exposing these ES310 functions
-// in ES300 shaders. Internally, we must use the "godot_" prefix, but user shaders
-// will be mapped automatically.
 uint godot_packUnorm4x8(vec4 v) {
 	uvec4 uv = uvec4(round(clamp(v, vec4(0.0), vec4(1.0)) * 255.0));
 	return uv.x | (uv.y << uint(8)) | (uv.z << uint(16)) | (uv.w << uint(24));
@@ -74,3 +75,9 @@ vec4 godot_unpackSnorm4x8(uint p) {
 #define unpackUnorm4x8 godot_unpackUnorm4x8
 #define packSnorm4x8 godot_packSnorm4x8
 #define unpackSnorm4x8 godot_unpackSnorm4x8
+#define packHalf2x16 godot_packHalf2x16
+#define unpackHalf2x16 godot_unpackHalf2x16
+#define packUnorm2x16 godot_packUnorm2x16
+#define unpackUnorm2x16 godot_unpackUnorm2x16
+#define packSnorm2x16 godot_packSnorm2x16
+#define unpackSnorm2x16 godot_unpackSnorm2x16
diff --git a/drivers/gles3/storage/light_storage.cpp b/drivers/gles3/storage/light_storage.cpp
index f5d1f8dabd..d7d77c6b8f 100644
--- a/drivers/gles3/storage/light_storage.cpp
+++ b/drivers/gles3/storage/light_storage.cpp
@@ -1020,7 +1020,7 @@ bool LightStorage::_shadow_atlas_find_shadow(ShadowAtlas *shadow_atlas, int *p_i
 				glTexParameteri(GL_TEXTURE_CUBE_MAP, GL_TEXTURE_WRAP_R, GL_CLAMP_TO_EDGE);
 
 				glTexParameteri(GL_TEXTURE_CUBE_MAP, GL_TEXTURE_COMPARE_MODE, GL_COMPARE_REF_TO_TEXTURE);
-				glTexParameteri(GL_TEXTURE_CUBE_MAP, GL_TEXTURE_COMPARE_FUNC, GL_LESS);
+				glTexParameteri(GL_TEXTURE_CUBE_MAP, GL_TEXTURE_COMPARE_FUNC, GL_GREATER);
 
 				glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_CUBE_MAP_POSITIVE_X, texture_id, 0);
 
@@ -1042,7 +1042,7 @@ bool LightStorage::_shadow_atlas_find_shadow(ShadowAtlas *shadow_atlas, int *p_i
 				glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
 
 				glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_MODE, GL_COMPARE_REF_TO_TEXTURE);
-				glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_FUNC, GL_LESS);
+				glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_FUNC, GL_GREATER);
 
 				glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, texture_id, 0);
 
@@ -1128,14 +1128,14 @@ void LightStorage::update_directional_shadow_atlas() {
 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
 
 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_MODE, GL_COMPARE_REF_TO_TEXTURE);
-		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_FUNC, GL_LESS);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_FUNC, GL_GREATER);
 
 		glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, directional_shadow.depth, 0);
 	}
 	glUseProgram(0);
 	glDepthMask(GL_TRUE);
 	glBindFramebuffer(GL_FRAMEBUFFER, directional_shadow.fbo);
-	RasterizerGLES3::clear_depth(1.0);
+	RasterizerGLES3::clear_depth(0.0);
 	glClear(GL_DEPTH_BUFFER_BIT);
 
 	glBindTexture(GL_TEXTURE_2D, 0);
diff --git a/drivers/gles3/storage/particles_storage.cpp b/drivers/gles3/storage/particles_storage.cpp
index b72b4eaf8d..09878aaee4 100644
--- a/drivers/gles3/storage/particles_storage.cpp
+++ b/drivers/gles3/storage/particles_storage.cpp
@@ -31,6 +31,8 @@
 #ifdef GLES3_ENABLED
 
 #include "particles_storage.h"
+
+#include "config.h"
 #include "material_storage.h"
 #include "mesh_storage.h"
 #include "texture_storage.h"
@@ -120,6 +122,8 @@ void ParticlesStorage::particles_set_mode(RID p_particles, RS::ParticlesMode p_m
 }
 
 void ParticlesStorage::particles_set_emitting(RID p_particles, bool p_emitting) {
+	ERR_FAIL_COND_MSG(GLES3::Config::get_singleton()->adreno_3xx_compatibility, "Due to driver bugs, GPUParticles are not supported on Adreno 3XX devices. Please use CPUParticles instead.");
+
 	Particles *particles = particles_owner.get_or_null(p_particles);
 	ERR_FAIL_NULL(particles);
 
@@ -127,6 +131,10 @@ void ParticlesStorage::particles_set_emitting(RID p_particles, bool p_emitting)
 }
 
 bool ParticlesStorage::particles_get_emitting(RID p_particles) {
+	if (GLES3::Config::get_singleton()->adreno_3xx_compatibility) {
+		return false;
+	}
+
 	ERR_FAIL_COND_V_MSG(RSG::threaded, false, "This function should never be used with threaded rendering, as it stalls the renderer.");
 	Particles *particles = particles_owner.get_or_null(p_particles);
 	ERR_FAIL_NULL_V(particles, false);
@@ -1003,6 +1011,12 @@ void ParticlesStorage::_particles_update_instance_buffer(Particles *particles, c
 }
 
 void ParticlesStorage::update_particles() {
+	if (!particle_update_list.first()) {
+		// Return early to avoid unnecessary state changes.
+		return;
+	}
+
+	RENDER_TIMESTAMP("Update GPUParticles");
 	glEnable(GL_RASTERIZER_DISCARD);
 	glBindFramebuffer(GL_FRAMEBUFFER, GLES3::TextureStorage::system_fbo);
 
diff --git a/drivers/gles3/storage/texture_storage.cpp b/drivers/gles3/storage/texture_storage.cpp
index c955b3f708..6f32e4d49d 100644
--- a/drivers/gles3/storage/texture_storage.cpp
+++ b/drivers/gles3/storage/texture_storage.cpp
@@ -1077,7 +1077,7 @@ Ref<Image> TextureStorage::texture_2d_get(RID p_texture) const {
 		glDisable(GL_DEPTH_TEST);
 		glDisable(GL_CULL_FACE);
 		glDisable(GL_BLEND);
-		glDepthFunc(GL_LEQUAL);
+		glDepthFunc(GL_GEQUAL);
 		glColorMask(1, 1, 1, 1);
 		glActiveTexture(GL_TEXTURE0);
 		glBindTexture(GL_TEXTURE_2D, texture->tex_id);
diff --git a/editor/SCsub b/editor/SCsub
index f4d30b68b1..e3b17b83f8 100644
--- a/editor/SCsub
+++ b/editor/SCsub
@@ -7,19 +7,24 @@ env.editor_sources = []
 import os
 import glob
 import editor_builders
+import methods
 
 
 def _make_doc_data_class_path(to_path):
-    # NOTE: It is safe to generate this file here, since this is still executed serially
-    with open(os.path.join(to_path, "doc_data_class_path.gen.h"), "w", encoding="utf-8", newline="\n") as g:
-        g.write("static const int _doc_data_class_path_count = " + str(len(env.doc_class_path)) + ";\n")
-        g.write("struct _DocDataClassPath { const char* name; const char* path; };\n")
+    file_path = os.path.join(to_path, "doc_data_class_path.gen.h")
 
-        g.write("static const _DocDataClassPath _doc_data_class_paths[" + str(len(env.doc_class_path) + 1) + "] = {\n")
-        for c in sorted(env.doc_class_path):
-            g.write('\t{"' + c + '", "' + env.doc_class_path[c] + '"},\n')
-        g.write("\t{nullptr, nullptr}\n")
-        g.write("};\n")
+    # Collect the header fragments in order and join them once before writing.
+    fragments = [
+        "static const int _doc_data_class_path_count = " + str(len(env.doc_class_path)) + ";\n",
+        "struct _DocDataClassPath { const char* name; const char* path; };\n",
+        "static const _DocDataClassPath _doc_data_class_paths[" + str(len(env.doc_class_path) + 1) + "] = {\n",
+    ]
+    for class_name in sorted(env.doc_class_path):
+        fragments.append('\t{"' + class_name + '", "' + env.doc_class_path[class_name] + '"},\n')
+    fragments.append("\t{nullptr, nullptr}\n")
+    fragments.append("};\n")
+
+    methods.write_file_if_needed(file_path, "".join(fragments))
 
 
 if env.editor_build:
@@ -38,10 +43,7 @@ if env.editor_build:
         reg_exporters += "\tregister_" + e + "_exporter_types();\n"
     reg_exporters += "}\n"
 
-    # NOTE: It is safe to generate this file here, since this is still executed serially
-    with open("register_exporters.gen.cpp", "w", encoding="utf-8", newline="\n") as f:
-        f.write(reg_exporters_inc)
-        f.write(reg_exporters)
+    methods.write_file_if_needed("register_exporters.gen.cpp", reg_exporters_inc + reg_exporters)
 
     # Core API documentation.
     docs = []
diff --git a/editor/animation_track_editor.cpp b/editor/animation_track_editor.cpp
index 99ba35a6d0..bec95d40c6 100644
--- a/editor/animation_track_editor.cpp
+++ b/editor/animation_track_editor.cpp
@@ -5006,12 +5006,17 @@ void AnimationTrackEditor::_new_track_node_selected(NodePath p_path) {
 }
 
 void AnimationTrackEditor::_add_track(int p_type) {
-	if (!root) {
+	AnimationPlayer *ap = AnimationPlayerEditor::get_singleton()->get_player();
+	if (!ap) {
+		ERR_FAIL_EDMSG("No AnimationPlayer is currently being edited.");
+	}
+	Node *root_node = ap->get_node_or_null(ap->get_root_node());
+	if (!root_node) {
 		EditorNode::get_singleton()->show_warning(TTR("Not possible to add a new track without a root"));
 		return;
 	}
 	adding_track_type = p_type;
-	pick_track->popup_scenetree_dialog();
+	pick_track->popup_scenetree_dialog(nullptr, root_node);
 	pick_track->get_filter_line_edit()->clear();
 	pick_track->get_filter_line_edit()->grab_focus();
 }
diff --git a/editor/connections_dialog.cpp b/editor/connections_dialog.cpp
index 22a3326fc5..2db5b02d63 100644
--- a/editor/connections_dialog.cpp
+++ b/editor/connections_dialog.cpp
@@ -616,7 +616,7 @@ void ConnectDialog::init(const ConnectionData &p_cd, const PackedStringArray &p_
 	signal_args = p_signal_args;
 
 	tree->set_selected(nullptr);
-	tree->set_marked(source, true);
+	tree->set_marked(source);
 
 	if (p_cd.target) {
 		set_dst_node(static_cast<Node *>(p_cd.target));
diff --git a/editor/editor_build_profile.cpp b/editor/editor_build_profile.cpp
index 65bca1a935..67bc55fdd6 100644
--- a/editor/editor_build_profile.cpp
+++ b/editor/editor_build_profile.cpp
@@ -383,7 +383,7 @@ void EditorBuildProfileManager::_profile_action(int p_action) {
 
 	switch (p_action) {
 		case ACTION_RESET: {
-			confirm_dialog->set_text("Reset the edited profile?");
+			confirm_dialog->set_text(TTR("Reset the edited profile?"));
 			confirm_dialog->popup_centered();
 		} break;
 		case ACTION_LOAD: {
@@ -404,11 +404,11 @@ void EditorBuildProfileManager::_profile_action(int p_action) {
 			export_profile->set_current_file(profile_path->get_text());
 		} break;
 		case ACTION_NEW: {
-			confirm_dialog->set_text("Create a new profile?");
+			confirm_dialog->set_text(TTR("Create a new profile?"));
 			confirm_dialog->popup_centered();
 		} break;
 		case ACTION_DETECT: {
-			confirm_dialog->set_text("This will scan all files in the current project to detect used classes.");
+			confirm_dialog->set_text(TTR("This will scan all files in the current project to detect used classes."));
 			confirm_dialog->popup_centered();
 		} break;
 		case ACTION_MAX: {
diff --git a/editor/editor_data.cpp b/editor/editor_data.cpp
index 4b68a21cb9..6ab29c1850 100644
--- a/editor/editor_data.cpp
+++ b/editor/editor_data.cpp
@@ -722,15 +722,8 @@ bool EditorData::check_and_update_scene(int p_idx) {
 
 		new_scene->set_scene_file_path(edited_scene[p_idx].root->get_scene_file_path());
 		Node *old_root = edited_scene[p_idx].root;
-		for (int i = 0; i < old_root->get_child_count(); i++) {
-			memdelete(old_root->get_child(i));
-		}
-		old_root->replace_by(new_scene);
+		old_root->replace_by(new_scene, false, false);
 		memdelete(old_root);
-		edited_scene.write[p_idx].root = new_scene;
-		if (!new_scene->get_scene_file_path().is_empty()) {
-			edited_scene.write[p_idx].path = new_scene->get_scene_file_path();
-		}
 		edited_scene.write[p_idx].selection = new_selection;
 
 		return true;
diff --git a/editor/editor_dock_manager.cpp b/editor/editor_dock_manager.cpp
index 3fba07f686..b6250671ee 100644
--- a/editor/editor_dock_manager.cpp
+++ b/editor/editor_dock_manager.cpp
@@ -149,17 +149,53 @@ void EditorDockManager::_update_layout() {
 	}
 	EditorNode::get_singleton()->edit_current();
 	dock_context_popup->docks_updated();
+	_update_docks_menu();
 	EditorNode::get_singleton()->save_editor_layout_delayed();
 }
 
+void EditorDockManager::_update_docks_menu() {
+	// Rebuild the menu from scratch; item ids double as indices into docks_menu_docks.
+	docks_menu->clear();
+	docks_menu->reset_size();
+	docks_menu_docks.clear();
+
+	const Ref<Texture2D> dock_icon = docks_menu->get_editor_theme_icon(SNAME("Window"));
+	const Color closed_modulate = Color(1, 1, 1, 0.5);
+	int item_id = 0;
+	for (const KeyValue<Control *, DockInfo> &E : all_docks) {
+		const DockInfo &info = E.value;
+		if (!info.shortcut.is_valid()) {
+			docks_menu->add_item(info.title, item_id);
+		} else {
+			docks_menu->add_shortcut(info.shortcut, item_id);
+			docks_menu->set_item_text(item_id, info.title);
+		}
+		docks_menu->set_item_icon(item_id, dock_icon);
+		if (!info.open) {
+			docks_menu->set_item_icon_modulate(item_id, closed_modulate);
+		}
+		docks_menu->set_item_disabled(item_id, !info.enabled);
+		docks_menu_docks.push_back(E.key);
+		item_id++;
+	}
+}
+
+void EditorDockManager::_docks_menu_option(int p_id) {
+	focus_dock(docks_menu_docks[p_id]);
+}
+
 void EditorDockManager::_window_close_request(WindowWrapper *p_wrapper) {
 	// Give the dock back to the original owner.
 	Control *dock = _close_window(p_wrapper);
 	ERR_FAIL_COND(!all_docks.has(dock));
 
-	all_docks[dock].open = false;
-	open_dock(dock);
-	focus_dock(dock);
+	if (all_docks[dock].previous_at_bottom || all_docks[dock].dock_slot_index != DOCK_SLOT_NONE) {
+		all_docks[dock].open = false;
+		open_dock(dock);
+		focus_dock(dock);
+	} else {
+		close_dock(dock);
+	}
 }
 
 Control *EditorDockManager::_close_window(WindowWrapper *p_wrapper) {
@@ -174,7 +210,7 @@ Control *EditorDockManager::_close_window(WindowWrapper *p_wrapper) {
 	return dock;
 }
 
-void EditorDockManager::_open_dock_in_window(Control *p_dock, bool p_show_window) {
+void EditorDockManager::_open_dock_in_window(Control *p_dock, bool p_show_window, bool p_reset_size) {
 	ERR_FAIL_NULL(p_dock);
 
 	Size2 borders = Size2(4, 4) * EDSCALE;
@@ -201,6 +237,12 @@ void EditorDockManager::_open_dock_in_window(Control *p_dock, bool p_show_window
 	if (p_show_window) {
 		wrapper->restore_window(Rect2i(dock_screen_pos, dock_size), EditorNode::get_singleton()->get_gui_base()->get_window()->get_current_screen());
 		_update_layout();
+		if (p_reset_size) {
+			// Use a default size of one third the current window size.
+			Size2i popup_size = EditorNode::get_singleton()->get_window()->get_size() / 3.0;
+			p_dock->get_window()->set_size(popup_size);
+			p_dock->get_window()->move_to_center();
+		}
 		p_dock->get_window()->grab_focus();
 	}
 }
@@ -338,6 +380,10 @@ void EditorDockManager::save_docks_to_config(Ref<ConfigFile> p_layout, const Str
 			p_layout->set_value(p_section, "dock_" + itos(i + 1) + "_selected_tab_idx", selected_tab_idx);
 		}
 	}
+	if (p_layout->has_section_key(p_section, "dock_0")) {
+		// Clear the keys where the dock has no slot so it is overridden.
+		p_layout->erase_section_key(p_section, "dock_0");
+	}
 
 	// Save docks in windows.
 	Dictionary floating_docks_dump;
@@ -425,8 +471,8 @@ void EditorDockManager::load_docks_from_config(Ref<ConfigFile> p_layout, const S
 		dock_map[dock.key->get_name()] = dock.key;
 	}
 
-	// Load docks by slot.
-	for (int i = 0; i < DOCK_SLOT_MAX; i++) {
+	// Load docks by slot. Index -1 (stored under the "dock_0" key) is for docks that have no slot.
+	for (int i = -1; i < DOCK_SLOT_MAX; i++) {
 		if (!p_layout->has_section_key(p_section, "dock_" + itos(i + 1))) {
 			continue;
 		}
@@ -450,7 +496,7 @@ void EditorDockManager::load_docks_from_config(Ref<ConfigFile> p_layout, const S
 				_restore_dock_to_saved_window(dock, floating_docks_dump[name]);
 			} else if (dock_bottom.has(name)) {
 				_dock_move_to_bottom(dock);
-			} else {
+			} else if (i >= 0) {
 				_move_dock(dock, dock_slot[i], 0);
 			}
 
@@ -465,7 +511,7 @@ void EditorDockManager::load_docks_from_config(Ref<ConfigFile> p_layout, const S
 			}
 
 			all_docks[dock].dock_slot_index = i;
-			all_docks[dock].previous_tab_index = j;
+			all_docks[dock].previous_tab_index = i >= 0 ? j : 0;
 		}
 	}
 
@@ -500,6 +546,8 @@ void EditorDockManager::load_docks_from_config(Ref<ConfigFile> p_layout, const S
 	}
 
 	FileSystemDock::get_singleton()->load_layout_from_config(p_layout, p_section);
+
+	_update_docks_menu();
 }
 
 void EditorDockManager::bottom_dock_show_placement_popup(const Rect2i &p_position, Control *p_dock) {
@@ -564,13 +612,16 @@ void EditorDockManager::open_dock(Control *p_dock, bool p_set_current) {
 	// Open dock to its previous location.
 	if (all_docks[p_dock].previous_at_bottom) {
 		_dock_move_to_bottom(p_dock);
-	} else {
+	} else if (all_docks[p_dock].dock_slot_index != DOCK_SLOT_NONE) {
 		TabContainer *slot = dock_slot[all_docks[p_dock].dock_slot_index];
 		int tab_index = all_docks[p_dock].previous_tab_index;
 		if (tab_index < 0) {
 			tab_index = slot->get_tab_count();
 		}
 		_move_dock(p_dock, slot, tab_index, p_set_current);
+	} else {
+		_open_dock_in_window(p_dock, true, true);
+		return;
 	}
 
 	_update_layout();
@@ -615,9 +666,8 @@ void EditorDockManager::focus_dock(Control *p_dock) {
 	tab_container->set_current_tab(tab_index);
 }
 
-void EditorDockManager::add_control_to_dock(DockSlot p_slot, Control *p_dock, const String &p_title, const Ref<Shortcut> &p_shortcut) {
+void EditorDockManager::add_dock(Control *p_dock, const String &p_title, DockSlot p_slot, const Ref<Shortcut> &p_shortcut) {
 	ERR_FAIL_NULL(p_dock);
-	ERR_FAIL_INDEX(p_slot, DOCK_SLOT_MAX);
 	ERR_FAIL_COND_MSG(all_docks.has(p_dock), vformat("Cannot add dock '%s', already added.", p_dock->get_name()));
 
 	DockInfo dock_info;
@@ -626,10 +676,17 @@ void EditorDockManager::add_control_to_dock(DockSlot p_slot, Control *p_dock, co
 	dock_info.shortcut = p_shortcut;
 	all_docks[p_dock] = dock_info;
 
-	open_dock(p_dock, false);
+	if (p_slot != DOCK_SLOT_NONE) {
+		ERR_FAIL_INDEX(p_slot, DOCK_SLOT_MAX);
+		open_dock(p_dock, false);
+	} else {
+		closed_dock_parent->add_child(p_dock);
+		p_dock->hide();
+		_update_layout();
+	}
 }
 
-void EditorDockManager::remove_control_from_dock(Control *p_dock) {
+void EditorDockManager::remove_dock(Control *p_dock) {
 	ERR_FAIL_NULL(p_dock);
 	ERR_FAIL_COND_MSG(!all_docks.has(p_dock), vformat("Cannot remove unknown dock '%s'.", p_dock->get_name()));
 
@@ -688,6 +745,10 @@ int EditorDockManager::get_vsplit_count() const {
 	return vsplits.size();
 }
 
+PopupMenu *EditorDockManager::get_docks_menu() {
+	return docks_menu;
+}
+
 EditorDockManager::EditorDockManager() {
 	singleton = this;
 
@@ -695,6 +756,10 @@ EditorDockManager::EditorDockManager() {
 
 	dock_context_popup = memnew(DockContextPopup);
 	EditorNode::get_singleton()->get_gui_base()->add_child(dock_context_popup);
+
+	docks_menu = memnew(PopupMenu);
+	docks_menu->connect("id_pressed", callable_mp(this, &EditorDockManager::_docks_menu_option));
+	EditorNode::get_singleton()->get_gui_base()->connect("theme_changed", callable_mp(this, &EditorDockManager::_update_docks_menu));
 }
 
 void DockContextPopup::_notification(int p_what) {
@@ -715,6 +780,7 @@ void DockContextPopup::_notification(int p_what) {
 				tab_move_right_button->set_tooltip_text(TTR("Move this dock right one tab."));
 			}
 			dock_to_bottom_button->set_icon(get_editor_theme_icon(SNAME("ControlAlignBottomWide")));
+			close_button->set_icon(get_editor_theme_icon(SNAME("Close")));
 		} break;
 	}
 }
@@ -741,6 +807,11 @@ void DockContextPopup::_tab_move_right() {
 	dock_select->queue_redraw();
 }
 
+void DockContextPopup::_close_dock() {
+	hide();
+	dock_manager->close_dock(context_dock);
+}
+
 void DockContextPopup::_float_dock() {
 	hide();
 	dock_manager->_open_dock_in_window(context_dock);
@@ -975,4 +1046,12 @@ DockContextPopup::DockContextPopup() {
 	dock_to_bottom_button->connect("pressed", callable_mp(this, &DockContextPopup::_move_dock_to_bottom));
 	dock_to_bottom_button->hide();
 	dock_select_popup_vb->add_child(dock_to_bottom_button);
+
+	close_button = memnew(Button);
+	close_button->set_text(TTR("Close"));
+	close_button->set_tooltip_text(TTR("Close this dock."));
+	close_button->set_focus_mode(Control::FOCUS_NONE);
+	close_button->set_h_size_flags(Control::SIZE_EXPAND_FILL);
+	close_button->connect("pressed", callable_mp(this, &DockContextPopup::_close_dock));
+	dock_select_popup_vb->add_child(close_button);
 }
diff --git a/editor/editor_dock_manager.h b/editor/editor_dock_manager.h
index 370c149967..cbb076c809 100644
--- a/editor/editor_dock_manager.h
+++ b/editor/editor_dock_manager.h
@@ -62,6 +62,7 @@ class EditorDockManager : public Object {
 
 public:
 	enum DockSlot {
+		DOCK_SLOT_NONE = -1,
 		DOCK_SLOT_LEFT_UL,
 		DOCK_SLOT_LEFT_BL,
 		DOCK_SLOT_LEFT_UR,
@@ -84,7 +85,7 @@ private:
 		int previous_tab_index = -1;
 		bool previous_at_bottom = false;
 		WindowWrapper *dock_window = nullptr;
-		int dock_slot_index = DOCK_SLOT_LEFT_UL;
+		int dock_slot_index = DOCK_SLOT_NONE;
 		Ref<Shortcut> shortcut;
 	};
 
@@ -100,6 +101,8 @@ private:
 	bool docks_visible = true;
 
 	DockContextPopup *dock_context_popup = nullptr;
+	PopupMenu *docks_menu = nullptr;
+	Vector<Control *> docks_menu_docks;
 	Control *closed_dock_parent = nullptr;
 
 	void _dock_split_dragged(int p_offset);
@@ -108,9 +111,12 @@ private:
 	void _dock_container_update_visibility(TabContainer *p_dock_container);
 	void _update_layout();
 
+	void _update_docks_menu();
+	void _docks_menu_option(int p_id);
+
 	void _window_close_request(WindowWrapper *p_wrapper);
 	Control *_close_window(WindowWrapper *p_wrapper);
-	void _open_dock_in_window(Control *p_dock, bool p_show_window = true);
+	void _open_dock_in_window(Control *p_dock, bool p_show_window = true, bool p_reset_size = false);
 	void _restore_dock_to_saved_window(Control *p_dock, const Dictionary &p_window_dump);
 
 	void _dock_move_to_bottom(Control *p_dock);
@@ -127,6 +133,7 @@ public:
 	void add_hsplit(DockSplitContainer *p_split);
 	void register_dock_slot(DockSlot p_dock_slot, TabContainer *p_tab_container);
 	int get_vsplit_count() const;
+	PopupMenu *get_docks_menu();
 
 	void save_docks_to_config(Ref<ConfigFile> p_layout, const String &p_section) const;
 	void load_docks_from_config(Ref<ConfigFile> p_layout, const String &p_section);
@@ -143,8 +150,8 @@ public:
 	void set_docks_visible(bool p_show);
 	bool are_docks_visible() const;
 
-	void add_control_to_dock(DockSlot p_slot, Control *p_dock, const String &p_title = "", const Ref<Shortcut> &p_shortcut = nullptr);
-	void remove_control_from_dock(Control *p_dock);
+	void add_dock(Control *p_dock, const String &p_title = "", DockSlot p_slot = DOCK_SLOT_NONE, const Ref<Shortcut> &p_shortcut = nullptr);
+	void remove_dock(Control *p_dock);
 
 	EditorDockManager();
 };
@@ -157,6 +164,7 @@ class DockContextPopup : public PopupPanel {
 	Button *make_float_button = nullptr;
 	Button *tab_move_left_button = nullptr;
 	Button *tab_move_right_button = nullptr;
+	Button *close_button = nullptr;
 	Button *dock_to_bottom_button = nullptr;
 
 	Control *dock_select = nullptr;
@@ -169,6 +177,7 @@ class DockContextPopup : public PopupPanel {
 
 	void _tab_move_left();
 	void _tab_move_right();
+	void _close_dock();
 	void _float_dock();
 	void _move_dock_to_bottom();
 
diff --git a/editor/editor_feature_profile.cpp b/editor/editor_feature_profile.cpp
index 5236f9e254..541bcd5e02 100644
--- a/editor/editor_feature_profile.cpp
+++ b/editor/editor_feature_profile.cpp
@@ -619,8 +619,8 @@ void EditorFeatureProfileManager::_class_list_item_selected() {
 			if (!(E.usage & PROPERTY_USAGE_EDITOR)) {
 				continue;
 			}
-			const String text = EditorPropertyNameProcessor::get_singleton()->process_name(name, text_style);
-			const String tooltip = EditorPropertyNameProcessor::get_singleton()->process_name(name, tooltip_style);
+			const String text = EditorPropertyNameProcessor::get_singleton()->process_name(name, text_style, name, class_name);
+			const String tooltip = EditorPropertyNameProcessor::get_singleton()->process_name(name, tooltip_style, name, class_name);
 
 			TreeItem *property = property_list->create_item(properties);
 			property->set_cell_mode(0, TreeItem::CELL_MODE_CHECK);
diff --git a/editor/editor_inspector.cpp b/editor/editor_inspector.cpp
index c5ce815d6b..7919d61f26 100644
--- a/editor/editor_inspector.cpp
+++ b/editor/editor_inspector.cpp
@@ -57,7 +57,7 @@ bool EditorInspector::_property_path_matches(const String &p_property_path, cons
 
 	const Vector<String> prop_sections = p_property_path.split("/");
 	for (int i = 0; i < prop_sections.size(); i++) {
-		if (p_filter.is_subsequence_ofn(EditorPropertyNameProcessor::get_singleton()->process_name(prop_sections[i], p_style))) {
+		if (p_filter.is_subsequence_ofn(EditorPropertyNameProcessor::get_singleton()->process_name(prop_sections[i], p_style, p_property_path))) {
 			return true;
 		}
 	}
@@ -3012,7 +3012,7 @@ void EditorInspector::update_tree() {
 		if ((p.usage & PROPERTY_USAGE_SCRIPT_VARIABLE) && name_style == EditorPropertyNameProcessor::STYLE_LOCALIZED) {
 			name_style = EditorPropertyNameProcessor::STYLE_CAPITALIZED;
 		}
-		const String property_label_string = EditorPropertyNameProcessor::get_singleton()->process_name(name_override, name_style) + feature_tag;
+		const String property_label_string = EditorPropertyNameProcessor::get_singleton()->process_name(name_override, name_style, p.name, doc_name) + feature_tag;
 
 		// Remove the property from the path.
 		int idx = path.rfind("/");
@@ -3081,8 +3081,8 @@ void EditorInspector::update_tree() {
 						tooltip = EditorPropertyNameProcessor::get_singleton()->translate_group_name(component);
 					}
 				} else {
-					label = EditorPropertyNameProcessor::get_singleton()->process_name(component, section_name_style);
-					tooltip = EditorPropertyNameProcessor::get_singleton()->process_name(component, EditorPropertyNameProcessor::get_tooltip_style(section_name_style));
+					label = EditorPropertyNameProcessor::get_singleton()->process_name(component, section_name_style, p.name, doc_name);
+					tooltip = EditorPropertyNameProcessor::get_singleton()->process_name(component, EditorPropertyNameProcessor::get_tooltip_style(section_name_style), p.name, doc_name);
 				}
 
 				Color c = sscolor;
@@ -3145,7 +3145,7 @@ void EditorInspector::update_tree() {
 				editor_inspector_array = memnew(EditorInspectorArray(all_read_only));
 
 				String array_label = path.contains("/") ? path.substr(path.rfind("/") + 1) : path;
-				array_label = EditorPropertyNameProcessor::get_singleton()->process_name(property_label_string, property_name_style);
+				array_label = EditorPropertyNameProcessor::get_singleton()->process_name(property_label_string, property_name_style, p.name, doc_name);
 				int page = per_array_page.has(array_element_prefix) ? per_array_page[array_element_prefix] : 0;
 				editor_inspector_array->setup_with_move_element_function(object, array_label, array_element_prefix, page, c, use_folding);
 				editor_inspector_array->connect("page_change_request", callable_mp(this, &EditorInspector::_page_change_request).bind(array_element_prefix));
diff --git a/editor/editor_locale_dialog.cpp b/editor/editor_locale_dialog.cpp
index 1318b59685..e97e4ac777 100644
--- a/editor/editor_locale_dialog.cpp
+++ b/editor/editor_locale_dialog.cpp
@@ -322,7 +322,7 @@ void EditorLocaleDialog::_update_tree() {
 
 	if (!is_edit_mode) {
 		TreeItem *t = cnt_list->create_item(c_root);
-		t->set_text(0, "[Default]");
+		t->set_text(0, TTR("[Default]"));
 		t->set_metadata(0, "");
 	}
 
diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp
index edb6fee75c..616c5667dd 100644
--- a/editor/editor_node.cpp
+++ b/editor/editor_node.cpp
@@ -2066,8 +2066,9 @@ void EditorNode::_dialog_action(String p_file) {
 		} break;
 
 		case FILE_EXPORT_MESH_LIBRARY: {
-			bool merge_with_existing_library = file_export_lib_merge->is_pressed();
-			bool apply_mesh_instance_transforms = file_export_lib_apply_xforms->is_pressed();
+			const Dictionary &fd_options = file_export_lib->get_selected_options();
+			bool merge_with_existing_library = fd_options.get(TTR("Merge With Existing"), true);
+			bool apply_mesh_instance_transforms = fd_options.get(TTR("Apply MeshInstance Transforms"), false);
 
 			Ref<MeshLibrary> ml;
 			if (merge_with_existing_library && FileAccess::exists(p_file)) {
@@ -2506,8 +2507,8 @@ void EditorNode::_edit_current(bool p_skip_foreign, bool p_skip_inspector_update
 			editable_info,
 			info_is_warning);
 
-	Object *editor_owner = is_node ? (Object *)SceneTreeDock::get_singleton() : is_resource ? (Object *)InspectorDock::get_inspector_singleton()
-																							: (Object *)this;
+	Object *editor_owner = (is_node || current_obj->is_class("MultiNodeEdit")) ? (Object *)SceneTreeDock::get_singleton() : is_resource ? (Object *)InspectorDock::get_inspector_singleton()
+																																		: (Object *)this;
 
 	// Take care of the main editor plugin.
 
@@ -2723,8 +2724,8 @@ void EditorNode::_menu_option_confirm(int p_option, bool p_confirmed) {
 				root_name = EditorNode::adjust_scene_name_casing(root_name);
 				file->set_current_path(root_name + "." + extensions.front()->get().to_lower());
 			}
-			file->popup_file_dialog();
 			file->set_title(TTR("Save Scene As..."));
+			file->popup_file_dialog();
 
 		} break;
 
@@ -3238,8 +3239,8 @@ void EditorNode::_export_as_menu_option(int p_idx) {
 			file_export_lib->add_filter("*." + E);
 		}
 
-		file_export_lib->popup_file_dialog();
 		file_export_lib->set_title(TTR("Export Mesh Library"));
+		file_export_lib->popup_file_dialog();
 	} else { // Custom menu options added by plugins
 		if (export_as_menu->get_item_submenu(p_idx).is_empty()) { // If not a submenu
 			Callable callback = export_as_menu->get_item_metadata(p_idx);
@@ -3818,6 +3819,14 @@ void EditorNode::_set_current_scene_nocheck(int p_idx) {
 		}
 	}
 
+	if (editor_data.check_and_update_scene(p_idx)) {
+		if (!editor_data.get_scene_path(p_idx).is_empty()) {
+			editor_folding.load_scene_folding(editor_data.get_edited_scene_root(p_idx), editor_data.get_scene_path(p_idx));
+		}
+
+		EditorUndoRedoManager::get_singleton()->clear_history(false, editor_data.get_scene_history_id(p_idx));
+	}
+
 	Dictionary state = editor_data.restore_edited_scene_state(editor_selection, &editor_history);
 	_edit_current(true);
 
@@ -3827,14 +3836,6 @@ void EditorNode::_set_current_scene_nocheck(int p_idx) {
 	if (tabs_to_close.is_empty()) {
 		callable_mp(this, &EditorNode::_set_main_scene_state).call_deferred(state, get_edited_scene()); // Do after everything else is done setting up.
 	}
-
-	if (editor_data.check_and_update_scene(p_idx)) {
-		if (!editor_data.get_scene_path(p_idx).is_empty()) {
-			editor_folding.load_scene_folding(editor_data.get_edited_scene_root(p_idx), editor_data.get_scene_path(p_idx));
-		}
-
-		EditorUndoRedoManager::get_singleton()->clear_history(false, editor_data.get_scene_history_id(p_idx));
-	}
 }
 
 void EditorNode::setup_color_picker(ColorPicker *p_picker) {
@@ -5279,6 +5280,12 @@ void EditorNode::_scene_tab_closed(int p_tab) {
 	scene_tabs->update_scene_tabs();
 }
 
+void EditorNode::_cancel_close_scene_tab() {
+	if (_is_closing_editor()) {
+		tab_closing_menu_option = -1;
+	}
+}
+
 void EditorNode::_toggle_distraction_free_mode() {
 	if (EDITOR_GET("interface/editor/separate_distraction_mode")) {
 		int screen = -1;
@@ -6829,6 +6836,8 @@ EditorNode::EditorNode() {
 	settings_menu->add_shortcut(ED_SHORTCUT("editor/command_palette", TTR("Command Palette..."), KeyModifierMask::CMD_OR_CTRL | KeyModifierMask::SHIFT | Key::P), HELP_COMMAND_PALETTE);
 	settings_menu->add_separator();
 
+	settings_menu->add_submenu_node_item(TTR("Editor Docks"), editor_dock_manager->get_docks_menu());
+
 	editor_layouts = memnew(PopupMenu);
 	editor_layouts->set_auto_translate_mode(AUTO_TRANSLATE_MODE_DISABLED);
 	settings_menu->add_submenu_node_item(TTR("Editor Layout"), editor_layouts);
@@ -6998,22 +7007,22 @@ EditorNode::EditorNode() {
 	history_dock = memnew(HistoryDock);
 
 	// Scene: Top left.
-	editor_dock_manager->add_control_to_dock(EditorDockManager::DOCK_SLOT_LEFT_UR, SceneTreeDock::get_singleton(), TTR("Scene"));
+	editor_dock_manager->add_dock(SceneTreeDock::get_singleton(), TTR("Scene"), EditorDockManager::DOCK_SLOT_LEFT_UR);
 
 	// Import: Top left, behind Scene.
-	editor_dock_manager->add_control_to_dock(EditorDockManager::DOCK_SLOT_LEFT_UR, ImportDock::get_singleton(), TTR("Import"));
+	editor_dock_manager->add_dock(ImportDock::get_singleton(), TTR("Import"), EditorDockManager::DOCK_SLOT_LEFT_UR);
 
 	// FileSystem: Bottom left.
-	editor_dock_manager->add_control_to_dock(EditorDockManager::DOCK_SLOT_LEFT_BR, FileSystemDock::get_singleton(), TTR("FileSystem"), ED_SHORTCUT_AND_COMMAND("bottom_panels/toggle_filesystem_bottom_panel", TTR("Toggle FileSystem Bottom Panel"), KeyModifierMask::ALT | Key::F));
+	editor_dock_manager->add_dock(FileSystemDock::get_singleton(), TTR("FileSystem"), EditorDockManager::DOCK_SLOT_LEFT_BR, ED_SHORTCUT_AND_COMMAND("bottom_panels/toggle_filesystem_bottom_panel", TTR("Toggle FileSystem Bottom Panel"), KeyModifierMask::ALT | Key::F));
 
 	// Inspector: Full height right.
-	editor_dock_manager->add_control_to_dock(EditorDockManager::DOCK_SLOT_RIGHT_UL, InspectorDock::get_singleton(), TTR("Inspector"));
+	editor_dock_manager->add_dock(InspectorDock::get_singleton(), TTR("Inspector"), EditorDockManager::DOCK_SLOT_RIGHT_UL);
 
 	// Node: Full height right, behind Inspector.
-	editor_dock_manager->add_control_to_dock(EditorDockManager::DOCK_SLOT_RIGHT_UL, NodeDock::get_singleton(), TTR("Node"));
+	editor_dock_manager->add_dock(NodeDock::get_singleton(), TTR("Node"), EditorDockManager::DOCK_SLOT_RIGHT_UL);
 
 	// History: Full height right, behind Node.
-	editor_dock_manager->add_control_to_dock(EditorDockManager::DOCK_SLOT_RIGHT_UL, history_dock, TTR("History"));
+	editor_dock_manager->add_dock(history_dock, TTR("History"), EditorDockManager::DOCK_SLOT_RIGHT_UL);
 
 	// Add some offsets to left_r and main hsplits to make LEFT_R and RIGHT_L docks wider than minsize.
 	left_r_hsplit->set_split_offset(270 * EDSCALE);
@@ -7070,6 +7079,7 @@ EditorNode::EditorNode() {
 	save_confirmation->set_min_size(Vector2(450.0 * EDSCALE, 0));
 	save_confirmation->connect("confirmed", callable_mp(this, &EditorNode::_menu_confirm_current));
 	save_confirmation->connect("custom_action", callable_mp(this, &EditorNode::_discard_changes));
+	save_confirmation->connect("canceled", callable_mp(this, &EditorNode::_cancel_close_scene_tab));
 
 	gradle_build_manage_templates = memnew(ConfirmationDialog);
 	gradle_build_manage_templates->set_text(TTR("Android build template is missing, please install relevant templates."));
@@ -7128,16 +7138,8 @@ EditorNode::EditorNode() {
 	file_export_lib->set_title(TTR("Export Library"));
 	file_export_lib->set_file_mode(EditorFileDialog::FILE_MODE_SAVE_FILE);
 	file_export_lib->connect("file_selected", callable_mp(this, &EditorNode::_dialog_action));
-	file_export_lib_merge = memnew(CheckBox);
-	file_export_lib_merge->set_text(TTR("Merge With Existing"));
-	file_export_lib_merge->set_h_size_flags(Control::SIZE_SHRINK_CENTER);
-	file_export_lib_merge->set_pressed(true);
-	file_export_lib->get_vbox()->add_child(file_export_lib_merge);
-	file_export_lib_apply_xforms = memnew(CheckBox);
-	file_export_lib_apply_xforms->set_text(TTR("Apply MeshInstance Transforms"));
-	file_export_lib_apply_xforms->set_h_size_flags(Control::SIZE_SHRINK_CENTER);
-	file_export_lib_apply_xforms->set_pressed(false);
-	file_export_lib->get_vbox()->add_child(file_export_lib_apply_xforms);
+	file_export_lib->add_option(TTR("Merge With Existing"), Vector<String>(), true);
+	file_export_lib->add_option(TTR("Apply MeshInstance Transforms"), Vector<String>(), false);
 	gui_base->add_child(file_export_lib);
 
 	file_script = memnew(EditorFileDialog);
diff --git a/editor/editor_node.h b/editor/editor_node.h
index b479626648..8bcf29811d 100644
--- a/editor/editor_node.h
+++ b/editor/editor_node.h
@@ -399,8 +399,6 @@ private:
 	EditorFileDialog *file_export_lib = nullptr;
 	EditorFileDialog *file_script = nullptr;
 	EditorFileDialog *file_android_build_source = nullptr;
-	CheckBox *file_export_lib_merge = nullptr;
-	CheckBox *file_export_lib_apply_xforms = nullptr;
 	String current_path;
 	MenuButton *update_spinner = nullptr;
 
@@ -572,6 +570,7 @@ private:
 	int _next_unsaved_scene(bool p_valid_filename, int p_start = 0);
 	void _discard_changes(const String &p_str = String());
 	void _scene_tab_closed(int p_tab);
+	void _cancel_close_scene_tab();
 
 	void _inherit_request(String p_file);
 	void _instantiate_request(const Vector<String> &p_files);
diff --git a/editor/editor_plugin.cpp b/editor/editor_plugin.cpp
index 67fe0c2e28..f42a1555a2 100644
--- a/editor/editor_plugin.cpp
+++ b/editor/editor_plugin.cpp
@@ -87,12 +87,12 @@ Button *EditorPlugin::add_control_to_bottom_panel(Control *p_control, const Stri
 
 void EditorPlugin::add_control_to_dock(DockSlot p_slot, Control *p_control, const Ref<Shortcut> &p_shortcut) {
 	ERR_FAIL_NULL(p_control);
-	EditorDockManager::get_singleton()->add_control_to_dock(EditorDockManager::DockSlot(p_slot), p_control, String(), p_shortcut);
+	EditorDockManager::get_singleton()->add_dock(p_control, String(), EditorDockManager::DockSlot(p_slot), p_shortcut);
 }
 
 void EditorPlugin::remove_control_from_docks(Control *p_control) {
 	ERR_FAIL_NULL(p_control);
-	EditorDockManager::get_singleton()->remove_control_from_dock(p_control);
+	EditorDockManager::get_singleton()->remove_dock(p_control);
 }
 
 void EditorPlugin::remove_control_from_bottom_panel(Control *p_control) {
diff --git a/editor/editor_properties.cpp b/editor/editor_properties.cpp
index b7380c9fc2..4858fcf78f 100644
--- a/editor/editor_properties.cpp
+++ b/editor/editor_properties.cpp
@@ -2159,7 +2159,7 @@ EditorPropertyQuaternion::EditorPropertyQuaternion() {
 	warning_dialog->set_text(TTR("Temporary Euler will not be stored in the object with the original value. Instead, it will be stored as Quaternion with irreversible conversion.\nThis is due to the fact that the result of Euler->Quaternion can be determined uniquely, but the result of Quaternion->Euler can be multi-existent."));
 
 	euler_label = memnew(Label);
-	euler_label->set_text("Temporary Euler");
+	euler_label->set_text(TTR("Temporary Euler"));
 
 	edit_custom_bc->add_child(warning);
 	edit_custom_bc->add_child(edit_custom_layout);
@@ -2751,7 +2751,15 @@ void EditorPropertyNodePath::_node_assign() {
 		add_child(scene_tree);
 		scene_tree->connect("selected", callable_mp(this, &EditorPropertyNodePath::_node_selected));
 	}
-	scene_tree->popup_scenetree_dialog();
+
+	Variant val = get_edited_property_value();
+	Node *n = nullptr;
+	if (val.get_type() == Variant::Type::NODE_PATH) {
+		n = get_base_node()->get_node_or_null(val);
+	} else {
+		n = Object::cast_to<Node>(val);
+	}
+	scene_tree->popup_scenetree_dialog(n, get_base_node());
 }
 
 void EditorPropertyNodePath::_update_menu() {
@@ -3184,7 +3192,6 @@ void EditorPropertyResource::_resource_changed(const Ref<Resource> &p_resource)
 			add_child(scene_tree);
 			scene_tree->connect("selected", callable_mp(this, &EditorPropertyResource::_viewport_selected));
 		}
-
 		scene_tree->popup_scenetree_dialog();
 	}
 }
diff --git a/editor/editor_property_name_processor.cpp b/editor/editor_property_name_processor.cpp
index a892ea0f85..1318b84d60 100644
--- a/editor/editor_property_name_processor.cpp
+++ b/editor/editor_property_name_processor.cpp
@@ -91,7 +91,27 @@ String EditorPropertyNameProcessor::_capitalize_name(const String &p_name) const
 	return capitalized;
 }
 
-String EditorPropertyNameProcessor::process_name(const String &p_name, Style p_style) const {
+StringName EditorPropertyNameProcessor::_get_context(const String &p_name, const String &p_property, const StringName &p_class) const {
+	// Without a property path or a class there is nothing to disambiguate the name with.
+	const bool has_class = p_class != StringName();
+	if (p_property.is_empty() && !has_class) {
+		return StringName();
+	}
+	const HashMap<String, StringName> *usages = translation_contexts.getptr(p_name);
+	if (!usages) {
+		return StringName();
+	}
+	// The plain property path is tried first, as it is expected to be enough to tell usages apart.
+	// The `Class::path` key is only consulted as a fallback, and only when a class was supplied.
+	const StringName *found = usages->getptr(p_property);
+	if (!found && has_class) {
+		found = usages->getptr(String(p_class) + "::" + p_property);
+	}
+	// An empty StringName is returned when no context is registered for this usage.
+	return found ? *found : StringName();
+}
+
+String EditorPropertyNameProcessor::process_name(const String &p_name, Style p_style, const String &p_property, const StringName &p_class) const {
 	switch (p_style) {
 		case STYLE_RAW: {
 			return p_name;
@@ -104,7 +124,7 @@ String EditorPropertyNameProcessor::process_name(const String &p_name, Style p_s
 		case STYLE_LOCALIZED: {
 			const String capitalized = _capitalize_name(p_name);
 			if (TranslationServer::get_singleton()) {
-				return TranslationServer::get_singleton()->property_translate(capitalized);
+				return TranslationServer::get_singleton()->property_translate(capitalized, _get_context(p_name, p_property, p_class));
 			}
 			return capitalized;
 		} break;
@@ -320,6 +340,25 @@ EditorPropertyNameProcessor::EditorPropertyNameProcessor() {
 			"then",
 			"to",
 	});
+
+	// Translation context associated with a name.
+	// The second key is either:
+	// - `full/property/path`
+	// - `Class::full/property/path`
+	// In case a class name is needed to distinguish between usages, all usages should use the second format.
+	//
+	// The following initialization is parsed in `editor/translations/scripts/common.py` with a regex.
+	// The map name and value definition format should be kept synced with the regex.
+	translation_contexts["force"]["constant_force"] = "Physics";
+	translation_contexts["force"]["force/8_bit"] = "Enforce";
+	translation_contexts["force"]["force/mono"] = "Enforce";
+	translation_contexts["force"]["force/max_rate"] = "Enforce";
+	translation_contexts["force"]["force/max_rate_hz"] = "Enforce";
+	translation_contexts["normal"]["theme_override_styles/normal"] = "Ordinary";
+	translation_contexts["normal"]["TextureButton::texture_normal"] = "Ordinary";
+	translation_contexts["normal"]["Decal::texture_normal"] = "Geometry";
+	translation_contexts["normal"]["detail_normal"] = "Geometry";
+	translation_contexts["normal"]["normal"] = "Geometry";
 }
 
 EditorPropertyNameProcessor::~EditorPropertyNameProcessor() {
diff --git a/editor/editor_property_name_processor.h b/editor/editor_property_name_processor.h
index 8e3cecb45b..2c68423c84 100644
--- a/editor/editor_property_name_processor.h
+++ b/editor/editor_property_name_processor.h
@@ -42,9 +42,14 @@ class EditorPropertyNameProcessor : public Node {
 	HashMap<String, String> capitalize_string_remaps;
 	LocalVector<String> stop_words; // Exceptions that shouldn't be capitalized.
 
+	HashMap<String, HashMap<String, StringName>> translation_contexts;
+
 	// Capitalizes property path segments.
 	String _capitalize_name(const String &p_name) const;
 
+	// Returns the translation context for the given name.
+	StringName _get_context(const String &p_name, const String &p_property, const StringName &p_class) const;
+
 public:
 	// Matches `interface/inspector/capitalize_properties` editor setting.
 	enum Style {
@@ -62,7 +67,8 @@ public:
 	static bool is_localization_available();
 
 	// Turns property path segment into the given style.
-	String process_name(const String &p_name, Style p_style) const;
+	// `p_class` and `p_property` are only used for `STYLE_LOCALIZED`, associating the name with a translation context.
+	String process_name(const String &p_name, Style p_style, const String &p_property = "", const StringName &p_class = "") const;
 
 	// Translate plain text group names.
 	String translate_group_name(const String &p_name) const;
diff --git a/editor/editor_sectioned_inspector.cpp b/editor/editor_sectioned_inspector.cpp
index 8e8908faaf..f13af8e4ca 100644
--- a/editor/editor_sectioned_inspector.cpp
+++ b/editor/editor_sectioned_inspector.cpp
@@ -42,7 +42,7 @@ static bool _property_path_matches(const String &p_property_path, const String &
 
 	const Vector<String> sections = p_property_path.split("/");
 	for (int i = 0; i < sections.size(); i++) {
-		if (p_filter.is_subsequence_ofn(EditorPropertyNameProcessor::get_singleton()->process_name(sections[i], p_style))) {
+		if (p_filter.is_subsequence_ofn(EditorPropertyNameProcessor::get_singleton()->process_name(sections[i], p_style, p_property_path))) {
 			return true;
 		}
 	}
@@ -278,8 +278,8 @@ void SectionedInspector::update_category_list() {
 				TreeItem *ms = sections->create_item(parent);
 				section_map[metasection] = ms;
 
-				const String text = EditorPropertyNameProcessor::get_singleton()->process_name(sectionarr[i], name_style);
-				const String tooltip = EditorPropertyNameProcessor::get_singleton()->process_name(sectionarr[i], tooltip_style);
+				const String text = EditorPropertyNameProcessor::get_singleton()->process_name(sectionarr[i], name_style, pi.name);
+				const String tooltip = EditorPropertyNameProcessor::get_singleton()->process_name(sectionarr[i], tooltip_style, pi.name);
 
 				ms->set_text(0, text);
 				ms->set_tooltip_text(0, tooltip);
diff --git a/editor/editor_settings.cpp b/editor/editor_settings.cpp
index ee33e171e3..32d581a26e 100644
--- a/editor/editor_settings.cpp
+++ b/editor/editor_settings.cpp
@@ -417,6 +417,7 @@ void EditorSettings::_load_defaults(Ref<ConfigFile> p_extra_config) {
 	set_restart_if_changed("interface/editor/debug/enable_pseudolocalization", true);
 	// Use pseudolocalization in editor.
 	EDITOR_SETTING_USAGE(Variant::BOOL, PROPERTY_HINT_NONE, "interface/editor/use_embedded_menu", false, "", PROPERTY_USAGE_DEFAULT | PROPERTY_USAGE_RESTART_IF_CHANGED)
+	EDITOR_SETTING_USAGE(Variant::BOOL, PROPERTY_HINT_NONE, "interface/editor/use_native_file_dialogs", false, "", PROPERTY_USAGE_DEFAULT)
 	EDITOR_SETTING_USAGE(Variant::BOOL, PROPERTY_HINT_NONE, "interface/editor/expand_to_title", true, "", PROPERTY_USAGE_DEFAULT | PROPERTY_USAGE_RESTART_IF_CHANGED)
 
 	EDITOR_SETTING(Variant::INT, PROPERTY_HINT_RANGE, "interface/editor/main_font_size", 14, "8,48,1")
@@ -755,6 +756,7 @@ void EditorSettings::_load_defaults(Ref<ConfigFile> p_extra_config) {
 
 	// Tiles editor
 	_initial_set("editors/tiles_editor/display_grid", true);
+	_initial_set("editors/tiles_editor/highlight_selected_layer", true);
 	_initial_set("editors/tiles_editor/grid_color", Color(1.0, 0.5, 0.2, 0.5));
 
 	// Polygon editor
diff --git a/editor/editor_settings_dialog.cpp b/editor/editor_settings_dialog.cpp
index d6946afbc1..7faab03a35 100644
--- a/editor/editor_settings_dialog.cpp
+++ b/editor/editor_settings_dialog.cpp
@@ -452,8 +452,8 @@ void EditorSettingsDialog::_update_shortcuts() {
 
 		TreeItem *section = shortcuts->create_item(root);
 
-		const String item_name = EditorPropertyNameProcessor::get_singleton()->process_name(section_name, name_style);
-		const String tooltip = EditorPropertyNameProcessor::get_singleton()->process_name(section_name, tooltip_style);
+		const String item_name = EditorPropertyNameProcessor::get_singleton()->process_name(section_name, name_style, E);
+		const String tooltip = EditorPropertyNameProcessor::get_singleton()->process_name(section_name, tooltip_style, E);
 
 		section->set_text(0, item_name);
 		section->set_tooltip_text(0, tooltip);
diff --git a/editor/export/project_export.cpp b/editor/export/project_export.cpp
index 536e7a0f04..038e357ce2 100644
--- a/editor/export/project_export.cpp
+++ b/editor/export/project_export.cpp
@@ -1035,10 +1035,13 @@ void ProjectExportDialog::_export_pck_zip_selected(const String &p_path) {
 	Ref<EditorExportPlatform> platform = current->get_platform();
 	ERR_FAIL_COND(platform.is_null());
 
+	const Dictionary &fd_option = export_pck_zip->get_selected_options();
+	bool export_debug = fd_option.get(TTR("Export With Debug"), true);
+
 	if (p_path.ends_with(".zip")) {
-		platform->export_zip(current, export_pck_zip_debug->is_pressed(), p_path);
+		platform->export_zip(current, export_debug, p_path);
 	} else if (p_path.ends_with(".pck")) {
-		platform->export_pack(current, export_pck_zip_debug->is_pressed(), p_path);
+		platform->export_pack(current, export_debug, p_path);
 	} else {
 		ERR_FAIL_MSG("Path must end with .pck or .zip");
 	}
@@ -1123,7 +1126,10 @@ void ProjectExportDialog::_export_project_to_path(const String &p_path) {
 
 	platform->clear_messages();
 	current->update_value_overrides();
-	Error err = platform->export_project(current, export_debug->is_pressed(), current->get_export_path(), 0);
+	Dictionary fd_option = export_project->get_selected_options();
+	bool export_debug = fd_option.get(TTR("Export With Debug"), true);
+
+	Error err = platform->export_project(current, export_debug, current->get_export_path(), 0);
 	result_dialog_log->clear();
 	if (err != ERR_SKIP) {
 		if (platform->fill_log_messages(result_dialog_log, err)) {
@@ -1552,17 +1558,8 @@ ProjectExportDialog::ProjectExportDialog() {
 	export_project->connect("file_selected", callable_mp(this, &ProjectExportDialog::_export_project_to_path));
 	export_project->get_line_edit()->connect("text_changed", callable_mp(this, &ProjectExportDialog::_validate_export_path));
 
-	export_debug = memnew(CheckBox);
-	export_debug->set_text(TTR("Export With Debug"));
-	export_debug->set_pressed(true);
-	export_debug->set_h_size_flags(Control::SIZE_SHRINK_CENTER);
-	export_project->get_vbox()->add_child(export_debug);
-
-	export_pck_zip_debug = memnew(CheckBox);
-	export_pck_zip_debug->set_text(TTR("Export With Debug"));
-	export_pck_zip_debug->set_pressed(true);
-	export_pck_zip_debug->set_h_size_flags(Control::SIZE_SHRINK_CENTER);
-	export_pck_zip->get_vbox()->add_child(export_pck_zip_debug);
+	export_project->add_option(TTR("Export With Debug"), Vector<String>(), true);
+	export_pck_zip->add_option(TTR("Export With Debug"), Vector<String>(), true);
 
 	set_hide_on_ok(false);
 
diff --git a/editor/export/project_export.h b/editor/export/project_export.h
index bcab05cebb..c3499177f3 100644
--- a/editor/export/project_export.h
+++ b/editor/export/project_export.h
@@ -154,8 +154,6 @@ class ProjectExportDialog : public ConfirmationDialog {
 
 	EditorFileDialog *export_pck_zip = nullptr;
 	EditorFileDialog *export_project = nullptr;
-	CheckBox *export_debug = nullptr;
-	CheckBox *export_pck_zip_debug = nullptr;
 
 	CheckButton *enc_pck = nullptr;
 	CheckButton *enc_directory = nullptr;
diff --git a/editor/groups_editor.cpp b/editor/groups_editor.cpp
index 902e166fd9..bec13b710d 100644
--- a/editor/groups_editor.cpp
+++ b/editor/groups_editor.cpp
@@ -196,7 +196,7 @@ void GroupsEditor::_update_tree() {
 	TreeItem *root = tree->create_item();
 
 	TreeItem *local_root = tree->create_item(root);
-	local_root->set_text(0, "Scene Groups");
+	local_root->set_text(0, TTR("Scene Groups"));
 	local_root->set_icon(0, get_editor_theme_icon(SNAME("PackedScene")));
 	local_root->set_custom_bg_color(0, get_theme_color(SNAME("prop_subsection"), SNAME("Editor")));
 	local_root->set_selectable(0, false);
@@ -233,7 +233,7 @@ void GroupsEditor::_update_tree() {
 	keys.sort_custom<NoCaseComparator>();
 
 	TreeItem *global_root = tree->create_item(root);
-	global_root->set_text(0, "Global Groups");
+	global_root->set_text(0, TTR("Global Groups"));
 	global_root->set_icon(0, get_editor_theme_icon(SNAME("Environment")));
 	global_root->set_custom_bg_color(0, get_theme_color(SNAME("prop_subsection"), SNAME("Editor")));
 	global_root->set_selectable(0, false);
diff --git a/editor/gui/editor_file_dialog.cpp b/editor/gui/editor_file_dialog.cpp
index df1f026f78..25c8610ff4 100644
--- a/editor/gui/editor_file_dialog.cpp
+++ b/editor/gui/editor_file_dialog.cpp
@@ -42,6 +42,8 @@
 #include "editor/filesystem_dock.h"
 #include "editor/themes/editor_scale.h"
 #include "scene/gui/center_container.h"
+#include "scene/gui/check_box.h"
+#include "scene/gui/grid_container.h"
 #include "scene/gui/label.h"
 #include "scene/gui/margin_container.h"
 #include "scene/gui/option_button.h"
@@ -56,6 +58,98 @@ EditorFileDialog::GetIconFunc EditorFileDialog::get_thumbnail_func = nullptr;
 EditorFileDialog::RegisterFunc EditorFileDialog::register_func = nullptr;
 EditorFileDialog::RegisterFunc EditorFileDialog::unregister_func = nullptr;
 
+void EditorFileDialog::_native_popup() {
+	// Show native dialog directly. Its result is delivered to `_native_dialog_cb()`.
+	String root; // Left empty unless access is ACCESS_RESOURCES or ACCESS_USERDATA.
+	if (access == ACCESS_RESOURCES) {
+		root = ProjectSettings::get_singleton()->get_resource_path();
+	} else if (access == ACCESS_USERDATA) {
+		root = OS::get_singleton()->get_user_data_dir();
+	}
+	DisplayServer::get_singleton()->file_dialog_with_options_show(get_title(), ProjectSettings::get_singleton()->globalize_path(dir->get_text()), root, file->get_text().get_file(), show_hidden_files, DisplayServer::FileDialogMode(mode), filters, _get_options(), callable_mp(this, &EditorFileDialog::_native_dialog_cb));
+}
+
+void EditorFileDialog::popup(const Rect2i &p_rect) {
+	_update_option_controls();
+	// Native dialogs require platform support, and are used when enabled in the editor settings or when sandboxed.
+	const bool native_dlg = DisplayServer::get_singleton()->has_feature(DisplayServer::FEATURE_NATIVE_DIALOG_FILE) && (bool(EDITOR_GET("interface/editor/use_native_file_dialogs")) || OS::get_singleton()->is_sandboxed());
+	if (native_dlg && !side_vbox) {
+		_native_popup();
+		return;
+	}
+	// Show custom file dialog (full dialog or side menu only).
+	_update_side_menu_visibility(native_dlg);
+	ConfirmationDialog::popup(p_rect);
+}
+
+void EditorFileDialog::set_visible(bool p_visible) {
+	if (!p_visible) { // Hiding involves no choice between the native and the custom dialog.
+		ConfirmationDialog::set_visible(p_visible);
+		return;
+	}
+	const bool native_dlg = DisplayServer::get_singleton()->has_feature(DisplayServer::FEATURE_NATIVE_DIALOG_FILE) && (bool(EDITOR_GET("interface/editor/use_native_file_dialogs")) || OS::get_singleton()->is_sandboxed());
+	_update_option_controls();
+	if (native_dlg && !side_vbox) {
+		_native_popup();
+		return;
+	}
+	// Show custom file dialog (full dialog or side menu only).
+	_update_side_menu_visibility(native_dlg);
+	ConfirmationDialog::set_visible(p_visible);
+}
+
+void EditorFileDialog::_native_dialog_cb(bool p_ok, const Vector<String> &p_files, int p_filter, const Dictionary &p_selected_options) {
+	if (!p_ok) { // The native dialog was closed without confirming a selection.
+		file->set_text("");
+		emit_signal(SNAME("canceled"));
+		return;
+	}
+
+	if (p_files.is_empty()) {
+		return;
+	}
+	// Convert the returned paths with `localize_path()` unless the dialog has unrestricted filesystem access.
+	Vector<String> files = p_files;
+	if (access != ACCESS_FILESYSTEM) {
+		for (String &file_name : files) {
+			file_name = ProjectSettings::get_singleton()->localize_path(file_name);
+		}
+	}
+	selected_options = p_selected_options; // Store before emitting, so signal handlers calling `get_selected_options()` see this dialog's choices.
+	String f = files[0];
+	if (mode == FILE_MODE_OPEN_FILES) {
+		emit_signal(SNAME("files_selected"), files);
+	} else {
+		if (mode == FILE_MODE_SAVE_FILE) {
+			if (p_filter >= 0 && p_filter < filters.size()) {
+				bool valid = false;
+				String flt = filters[p_filter].get_slice(";", 0);
+				int filter_slice_count = flt.get_slice_count(",");
+				for (int j = 0; j < filter_slice_count; j++) {
+					String str = (flt.get_slice(",", j).strip_edges());
+					if (f.match(str)) {
+						valid = true;
+						break;
+					}
+				}
+				// No pattern of the chosen filter matched: append the first pattern minus its leading character (e.g. `*.png` -> `.png`).
+				if (!valid && filter_slice_count > 0) {
+					String str = (flt.get_slice(",", 0).strip_edges());
+					f += str.substr(1, str.length() - 1);
+				}
+			}
+			emit_signal(SNAME("file_selected"), f);
+		} else if ((mode == FILE_MODE_OPEN_ANY || mode == FILE_MODE_OPEN_FILE) && dir_access->file_exists(f)) {
+			emit_signal(SNAME("file_selected"), f);
+		} else if (mode == FILE_MODE_OPEN_ANY || mode == FILE_MODE_OPEN_DIR) {
+			emit_signal(SNAME("dir_selected"), f);
+		}
+	}
+	file->set_text(f);
+	dir->set_text(f.get_base_dir());
+	filter->select(p_filter);
+}
+
 void EditorFileDialog::popup_file_dialog() {
 	popup_centered_clamped(Size2(1050, 700) * EDSCALE, 0.8);
 	_focus_file_text();
@@ -385,6 +479,15 @@ void EditorFileDialog::_request_single_thumbnail(const String &p_path) {
 }
 
 void EditorFileDialog::_action_pressed() {
+	// Accept side menu properties and show native dialog.
+	if (side_vbox && DisplayServer::get_singleton()->has_feature(DisplayServer::FEATURE_NATIVE_DIALOG_FILE) && (bool(EDITOR_GET("interface/editor/use_native_file_dialogs")) || OS::get_singleton()->is_sandboxed())) {
+		hide();
+		_native_popup();
+
+		return;
+	}
+
+	// Accept selection in the custom dialog.
 	if (mode == FILE_MODE_OPEN_FILES) {
 		String fbase = dir_access->get_current_dir();
 
@@ -1627,6 +1730,165 @@ EditorFileDialog::DisplayMode EditorFileDialog::get_display_mode() const {
 	return display_mode;
 }
 
+TypedArray<Dictionary> EditorFileDialog::_get_options() const {
+	TypedArray<Dictionary> described; // One { "name", "values", "default" } dictionary per option, passed to the native dialog by `_native_popup()`.
+	for (int i = 0; i < options.size(); i++) {
+		Dictionary entry;
+		entry["name"] = options[i].name;
+		entry["values"] = options[i].values;
+		entry["default"] = (int)selected_options.get(options[i].name, options[i].default_idx); // A value already selected takes precedence over the declared default.
+		described.push_back(entry);
+	}
+	return described;
+}
+
+void EditorFileDialog::_option_changed_checkbox_toggled(bool p_pressed, const String &p_name) {
+	if (selected_options.has(p_name)) { // Only options already tracked in `selected_options` are updated; other names are ignored.
+		selected_options[p_name] = p_pressed;
+	}
+}
+
+void EditorFileDialog::_option_changed_item_selected(int p_idx, const String &p_name) {
+	if (selected_options.has(p_name)) { // Only options already tracked in `selected_options` are updated; other names are ignored.
+		selected_options[p_name] = p_idx; // Stores the index of the selected item, not its text.
+	}
+}
+
+void EditorFileDialog::_update_option_controls() {
+	if (!options_dirty) { // Nothing changed since the controls were last built.
+		return;
+	}
+	options_dirty = false;
+	// Discard the previously generated controls, and the values that were selected through them.
+	while (grid_options->get_child_count() > 0) {
+		Node *child = grid_options->get_child(0);
+		grid_options->remove_child(child);
+		child->queue_free();
+	}
+	selected_options.clear();
+	// Each option adds two children to `grid_options`: a label, followed by its editing control.
+	for (const EditorFileDialog::Option &opt : options) {
+		Label *lbl = memnew(Label);
+		lbl->set_text(opt.name);
+		grid_options->add_child(lbl);
+		if (opt.values.is_empty()) { // No value list: the option is a checkbox, and its selected value is a bool.
+			CheckBox *cb = memnew(CheckBox);
+			cb->set_pressed(opt.default_idx);
+			grid_options->add_child(cb);
+			cb->connect("toggled", callable_mp(this, &EditorFileDialog::_option_changed_checkbox_toggled).bind(opt.name));
+			selected_options[opt.name] = (bool)opt.default_idx;
+		} else { // Value list: the option is a dropdown, and its selected value is an item index.
+			OptionButton *ob = memnew(OptionButton);
+			for (const String &val : opt.values) {
+				ob->add_item(val);
+			}
+			ob->select(opt.default_idx);
+			grid_options->add_child(ob);
+			ob->connect("item_selected", callable_mp(this, &EditorFileDialog::_option_changed_item_selected).bind(opt.name));
+			selected_options[opt.name] = opt.default_idx;
+		}
+	}
+}
+
+Dictionary EditorFileDialog::get_selected_options() const {
+	return selected_options; // Keyed by option name; filled from the option controls or from the native dialog's result.
+}
+
+String EditorFileDialog::get_option_name(int p_option) const {
+	ERR_FAIL_INDEX_V(p_option, options.size(), String()); // An out-of-range index fails with an empty name.
+	return options[p_option].name;
+}
+
+Vector<String> EditorFileDialog::get_option_values(int p_option) const {
+	ERR_FAIL_INDEX_V(p_option, options.size(), Vector<String>()); // An out-of-range index fails with an empty list.
+	return options[p_option].values; // An empty list means the option is shown as a checkbox.
+}
+
+int EditorFileDialog::get_option_default(int p_option) const {
+	ERR_FAIL_INDEX_V(p_option, options.size(), -1); // An out-of-range index fails with -1.
+	return options[p_option].default_idx; // Index into the value list, or 0/1 for a checkbox option.
+}
+
+void EditorFileDialog::set_option_name(int p_option, const String &p_name) {
+	if (p_option < 0) { // Negative indices count back from the end of the option list.
+		p_option += get_option_count();
+	}
+	ERR_FAIL_INDEX(p_option, options.size());
+	options.write[p_option].name = p_name;
+	options_dirty = true; // `_update_option_controls()` rebuilds the controls: right away if visible, otherwise when the dialog is shown.
+	if (is_visible()) {
+		_update_option_controls();
+	}
+}
+
+void EditorFileDialog::set_option_values(int p_option, const Vector<String> &p_values) {
+	if (p_option < 0) { // Negative indices count back from the end of the option list.
+		p_option += get_option_count();
+	}
+	ERR_FAIL_INDEX(p_option, options.size());
+	options.write[p_option].values = p_values;
+	if (p_values.is_empty()) { // Without values the option is a checkbox: keep the existing default within 0..1.
+		options.write[p_option].default_idx = CLAMP(options[p_option].default_idx, 0, 1);
+	} else { // Otherwise keep the existing default within the bounds of the new value list.
+		options.write[p_option].default_idx = CLAMP(options[p_option].default_idx, 0, options[p_option].values.size() - 1);
+	}
+	options_dirty = true; // `_update_option_controls()` rebuilds the controls: right away if visible, otherwise when the dialog is shown.
+	if (is_visible()) {
+		_update_option_controls();
+	}
+}
+
+void EditorFileDialog::set_option_default(int p_option, int p_index) {
+	if (p_option < 0) { // Negative indices count back from the end of the option list.
+		p_option += get_option_count();
+	}
+	ERR_FAIL_INDEX(p_option, options.size());
+	if (options[p_option].values.is_empty()) { // Checkbox option: the default is limited to 0 or 1.
+		options.write[p_option].default_idx = CLAMP(p_index, 0, 1);
+	} else { // Dropdown option: the default is limited to valid indices of the value list.
+		options.write[p_option].default_idx = CLAMP(p_index, 0, options[p_option].values.size() - 1);
+	}
+	options_dirty = true; // `_update_option_controls()` rebuilds the controls: right away if visible, otherwise when the dialog is shown.
+	if (is_visible()) {
+		_update_option_controls();
+	}
+}
+
+void EditorFileDialog::add_option(const String &p_name, const Vector<String> &p_values, int p_index) {
+	// An option without values is shown as a checkbox, so its default is limited to 0 or 1.
+	// Otherwise the default has to be a valid index into the value list.
+	Option added;
+	added.name = p_name;
+	added.values = p_values;
+	added.default_idx = p_values.is_empty() ? CLAMP(p_index, 0, 1) : CLAMP(p_index, 0, p_values.size() - 1);
+	options.push_back(added);
+
+	// Rebuild the controls right away only when shown; otherwise they are regenerated when the dialog is shown.
+	options_dirty = true;
+	if (is_visible()) {
+		_update_option_controls();
+	}
+}
+
+void EditorFileDialog::set_option_count(int p_count) {
+	ERR_FAIL_COND(p_count < 0);
+	// Resizes the option list to `p_count` entries; an unchanged count is a no-op.
+	if (options.size() == p_count) {
+		return;
+	}
+	options.resize(p_count);
+
+	options_dirty = true;
+	notify_property_list_changed(); // The property list is generated from `options.size()`.
+	if (is_visible()) {
+		_update_option_controls();
+	}
+}
+
+int EditorFileDialog::get_option_count() const {
+	return options.size(); // Number of options registered through `add_option()` / `set_option_count()`.
+}
+
 void EditorFileDialog::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("_cancel_pressed"), &EditorFileDialog::_cancel_pressed);
 
@@ -1634,6 +1896,16 @@ void EditorFileDialog::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("add_filter", "filter", "description"), &EditorFileDialog::add_filter, DEFVAL(""));
 	ClassDB::bind_method(D_METHOD("set_filters", "filters"), &EditorFileDialog::set_filters);
 	ClassDB::bind_method(D_METHOD("get_filters"), &EditorFileDialog::get_filters);
+	ClassDB::bind_method(D_METHOD("get_option_name", "option"), &EditorFileDialog::get_option_name);
+	ClassDB::bind_method(D_METHOD("get_option_values", "option"), &EditorFileDialog::get_option_values);
+	ClassDB::bind_method(D_METHOD("get_option_default", "option"), &EditorFileDialog::get_option_default);
+	ClassDB::bind_method(D_METHOD("set_option_name", "option", "name"), &EditorFileDialog::set_option_name);
+	ClassDB::bind_method(D_METHOD("set_option_values", "option", "values"), &EditorFileDialog::set_option_values);
+	ClassDB::bind_method(D_METHOD("set_option_default", "option", "default_value_index"), &EditorFileDialog::set_option_default);
+	ClassDB::bind_method(D_METHOD("set_option_count", "count"), &EditorFileDialog::set_option_count);
+	ClassDB::bind_method(D_METHOD("get_option_count"), &EditorFileDialog::get_option_count);
+	ClassDB::bind_method(D_METHOD("add_option", "name", "values", "default_value_index"), &EditorFileDialog::add_option);
+	ClassDB::bind_method(D_METHOD("get_selected_options"), &EditorFileDialog::get_selected_options);
 	ClassDB::bind_method(D_METHOD("get_current_dir"), &EditorFileDialog::get_current_dir);
 	ClassDB::bind_method(D_METHOD("get_current_file"), &EditorFileDialog::get_current_file);
 	ClassDB::bind_method(D_METHOD("get_current_path"), &EditorFileDialog::get_current_path);
@@ -1669,6 +1941,7 @@ void EditorFileDialog::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::STRING, "current_file", PROPERTY_HINT_FILE, "*", PROPERTY_USAGE_NONE), "set_current_file", "get_current_file");
 	ADD_PROPERTY(PropertyInfo(Variant::STRING, "current_path", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE), "set_current_path", "get_current_path");
 	ADD_PROPERTY(PropertyInfo(Variant::PACKED_STRING_ARRAY, "filters"), "set_filters", "get_filters");
+	ADD_ARRAY_COUNT("Options", "option_count", "set_option_count", "get_option_count", "option_");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "show_hidden_files"), "set_show_hidden_files", "is_showing_hidden_files");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "disable_overwrite_warning"), "set_disable_overwrite_warning", "is_overwrite_warning_disabled");
 
@@ -1684,6 +1957,13 @@ void EditorFileDialog::_bind_methods() {
 
 	BIND_ENUM_CONSTANT(DISPLAY_THUMBNAILS);
 	BIND_ENUM_CONSTANT(DISPLAY_LIST);
+
+	Option defaults;
+
+	base_property_helper.set_prefix("option_");
+	base_property_helper.register_property(PropertyInfo(Variant::STRING, "name"), defaults.name, &EditorFileDialog::set_option_name, &EditorFileDialog::get_option_name);
+	base_property_helper.register_property(PropertyInfo(Variant::PACKED_STRING_ARRAY, "values"), defaults.values, &EditorFileDialog::set_option_values, &EditorFileDialog::get_option_values);
+	base_property_helper.register_property(PropertyInfo(Variant::INT, "default"), defaults.default_idx, &EditorFileDialog::set_option_default, &EditorFileDialog::get_option_default);
 }
 
 void EditorFileDialog::set_show_hidden_files(bool p_show) {
@@ -1752,7 +2032,7 @@ void EditorFileDialog::add_side_menu(Control *p_menu, const String &p_title) {
 	// HSplitContainer has 3 children at maximum capacity, 1 of them is the SplitContainerDragger.
 	ERR_FAIL_COND_MSG(body_hsplit->get_child_count() > 2, "EditorFileDialog: Only one side menu can be added.");
 	// Everything for the side menu goes inside of a VBoxContainer.
-	VBoxContainer *side_vbox = memnew(VBoxContainer);
+	side_vbox = memnew(VBoxContainer);
 	side_vbox->set_h_size_flags(Control::SIZE_EXPAND_FILL);
 	side_vbox->set_stretch_ratio(0.5);
 	body_hsplit->add_child(side_vbox);
@@ -1767,10 +2047,23 @@ void EditorFileDialog::add_side_menu(Control *p_menu, const String &p_title) {
 	side_vbox->add_child(p_menu);
 }
 
+void EditorFileDialog::_update_side_menu_visibility(bool p_native_dlg) {
+	// With a native dialog doing the file browsing, this window is only shown for
+	// its side menu, so the path bar, the option grid and the file list are hidden.
+	// Without a native dialog, all of them are shown again as part of the full
+	// custom dialog.
+	const bool show_browser = !p_native_dlg;
+
+	pathhb->set_visible(show_browser);
+	grid_options->set_visible(show_browser);
+	list_hb->set_visible(show_browser);
+}
+
 EditorFileDialog::EditorFileDialog() {
 	show_hidden_files = default_show_hidden_files;
 	display_mode = default_display_mode;
-	VBoxContainer *vbc = memnew(VBoxContainer);
+
+	vbc = memnew(VBoxContainer);
 	add_child(vbc);
 
 	set_title(TTR("Save a File"));
@@ -1797,7 +2090,7 @@ EditorFileDialog::EditorFileDialog() {
 		ED_SHORTCUT_OVERRIDE("file_dialog/toggle_mode", "macos", KeyModifierMask::META | KeyModifierMask::CTRL | Key::V);
 	}
 
-	HBoxContainer *pathhb = memnew(HBoxContainer);
+	pathhb = memnew(HBoxContainer);
 	vbc->add_child(pathhb);
 
 	dir_prev = memnew(Button);
@@ -1893,6 +2186,11 @@ EditorFileDialog::EditorFileDialog() {
 	body_hsplit->set_v_size_flags(Control::SIZE_EXPAND_FILL);
 	vbc->add_child(body_hsplit);
 
+	grid_options = memnew(GridContainer);
+	grid_options->set_h_size_flags(Control::SIZE_SHRINK_CENTER);
+	grid_options->set_columns(2);
+	vbc->add_child(grid_options);
+
 	list_hb = memnew(HSplitContainer);
 	list_hb->set_h_size_flags(Control::SIZE_EXPAND_FILL);
 	body_hsplit->add_child(list_hb);
@@ -2044,6 +2342,8 @@ EditorFileDialog::EditorFileDialog() {
 	if (register_func) {
 		register_func(this);
 	}
+
+	property_helper.setup_for_instance(base_property_helper, this);
 }
 
 EditorFileDialog::~EditorFileDialog() {
diff --git a/editor/gui/editor_file_dialog.h b/editor/gui/editor_file_dialog.h
index 1e1c99bc76..7d6fa19a44 100644
--- a/editor/gui/editor_file_dialog.h
+++ b/editor/gui/editor_file_dialog.h
@@ -33,7 +33,9 @@
 
 #include "core/io/dir_access.h"
 #include "scene/gui/dialogs.h"
+#include "scene/property_list_helper.h"
 
+class GridContainer;
 class DependencyRemoveDialog;
 class HSplitContainer;
 class ItemList;
@@ -87,6 +89,7 @@ private:
 	Button *makedir = nullptr;
 	Access access = ACCESS_RESOURCES;
 
+	GridContainer *grid_options = nullptr;
 	VBoxContainer *vbox = nullptr;
 	FileMode mode = FILE_MODE_SAVE_FILE;
 	bool can_create_dir = false;
@@ -113,6 +116,9 @@ private:
 	ConfirmationDialog *confirm_save = nullptr;
 	DependencyRemoveDialog *dep_remove_dialog = nullptr;
 	ConfirmationDialog *global_remove_dialog = nullptr;
+	VBoxContainer *side_vbox = nullptr;
+	VBoxContainer *vbc = nullptr;
+	HBoxContainer *pathhb = nullptr;
 
 	Button *mode_thumbnails = nullptr;
 	Button *mode_list = nullptr;
@@ -174,6 +180,19 @@ private:
 		Ref<Texture2D> progress[8]{};
 	} theme_cache;
 
+	struct Option {
+		String name;
+		Vector<String> values;
+		int default_idx = 0;
+	};
+
+	static inline PropertyListHelper base_property_helper;
+	PropertyListHelper property_helper;
+
+	Vector<Option> options;
+	Dictionary selected_options;
+	bool options_dirty = false;
+
 	void update_dir();
 	void update_file_name();
 	void update_file_list();
@@ -233,15 +252,33 @@ private:
 
 	bool _is_open_should_be_disabled();
 
+	void _update_side_menu_visibility(bool p_native_dlg);
+
+	void _native_popup();
+	void _native_dialog_cb(bool p_ok, const Vector<String> &p_files, int p_filter, const Dictionary &p_selected_options);
+
+	TypedArray<Dictionary> _get_options() const;
+	void _update_option_controls();
+	void _option_changed_checkbox_toggled(bool p_pressed, const String &p_name);
+	void _option_changed_item_selected(int p_idx, const String &p_name);
+
 protected:
 	virtual void _update_theme_item_cache() override;
 
 	void _notification(int p_what);
+	bool _set(const StringName &p_name, const Variant &p_value) { return property_helper.property_set_value(p_name, p_value); }
+	bool _get(const StringName &p_name, Variant &r_ret) const { return property_helper.property_get_value(p_name, r_ret); }
+	void _get_property_list(List<PropertyInfo> *p_list) const { property_helper.get_property_list(p_list, options.size()); }
+	bool _property_can_revert(const StringName &p_name) const { return property_helper.property_can_revert(p_name); }
+	bool _property_get_revert(const StringName &p_name, Variant &r_property) const { return property_helper.property_get_revert(p_name, r_property); }
 	static void _bind_methods();
 
 public:
 	Color get_dir_icon_color(const String &p_dir_path);
 
+	virtual void set_visible(bool p_visible) override;
+	virtual void popup(const Rect2i &p_rect = Rect2i()) override;
+
 	// Public for use with callable_mp.
 	void _file_submitted(const String &p_file);
 
@@ -261,6 +298,20 @@ public:
 	void set_current_file(const String &p_file);
 	void set_current_path(const String &p_path);
 
+	String get_option_name(int p_option) const;
+	Vector<String> get_option_values(int p_option) const;
+	int get_option_default(int p_option) const;
+	void set_option_name(int p_option, const String &p_name);
+	void set_option_values(int p_option, const Vector<String> &p_values);
+	void set_option_default(int p_option, int p_index);
+
+	void add_option(const String &p_name, const Vector<String> &p_values, int p_index);
+
+	void set_option_count(int p_count);
+	int get_option_count() const;
+
+	Dictionary get_selected_options() const;
+
 	void set_display_mode(DisplayMode p_mode);
 	DisplayMode get_display_mode() const;
 
diff --git a/editor/gui/scene_tree_editor.cpp b/editor/gui/scene_tree_editor.cpp
index 835f953603..0dd75ea033 100644
--- a/editor/gui/scene_tree_editor.cpp
+++ b/editor/gui/scene_tree_editor.cpp
@@ -925,6 +925,27 @@ void SceneTreeEditor::_notification(int p_what) {
 
 			_update_tree();
 		} break;
+
+		case NOTIFICATION_VISIBILITY_CHANGED: {
+			if (is_visible()) {
+				TreeItem *item = nullptr;
+				if (selected) {
+					// Scroll to selected node.
+					item = _find(tree->get_root(), selected->get_path());
+				} else if (marked.size() == 1) {
+					// Scroll to a single marked node.
+					Node *marked_node = *marked.begin();
+					if (marked_node) {
+						item = _find(tree->get_root(), marked_node->get_path());
+					}
+				}
+
+				if (item) {
+					// Must wait until tree is properly sized before scrolling.
+					callable_mp(tree, &Tree::scroll_to_item).call_deferred(item, true);
+				}
+			}
+		} break;
 	}
 }
 
@@ -1567,7 +1588,9 @@ SceneTreeEditor::~SceneTreeEditor() {
 
 /******** DIALOG *********/
 
-void SceneTreeDialog::popup_scenetree_dialog() {
+void SceneTreeDialog::popup_scenetree_dialog(Node *p_selected_node, Node *p_marked_node, bool p_marked_node_selectable, bool p_marked_node_children_selectable) { // Applies the marked and selected nodes to the embedded scene tree, then pops the dialog up.
+	get_scene_tree()->set_marked(p_marked_node, p_marked_node_selectable, p_marked_node_children_selectable); // Marking is applied before the selection.
+	get_scene_tree()->set_selected(p_selected_node); // p_emit_selected is left at its declared default.
 	popup_centered_clamped(Size2(350, 700) * EDSCALE);
 }
 
diff --git a/editor/gui/scene_tree_editor.h b/editor/gui/scene_tree_editor.h
index a869e867b8..c1abdcef8b 100644
--- a/editor/gui/scene_tree_editor.h
+++ b/editor/gui/scene_tree_editor.h
@@ -157,8 +157,8 @@ public:
 	void set_as_scene_tree_dock();
 	void set_display_foreign_nodes(bool p_display);
 
-	void set_marked(const HashSet<Node *> &p_marked, bool p_selectable = false, bool p_children_selectable = true);
-	void set_marked(Node *p_marked, bool p_selectable = false, bool p_children_selectable = true);
+	void set_marked(const HashSet<Node *> &p_marked, bool p_selectable = true, bool p_children_selectable = true);
+	void set_marked(Node *p_marked, bool p_selectable = true, bool p_children_selectable = true);
 	void set_selected(Node *p_node, bool p_emit_selected = true);
 	Node *get_selected();
 	void set_can_rename(bool p_can_rename) { can_rename = p_can_rename; }
@@ -201,7 +201,7 @@ protected:
 	static void _bind_methods();
 
 public:
-	void popup_scenetree_dialog();
+	void popup_scenetree_dialog(Node *p_selected_node = nullptr, Node *p_marked_node = nullptr, bool p_marked_node_selectable = true, bool p_marked_node_children_selectable = true);
 	void set_valid_types(const Vector<StringName> &p_valid);
 
 	SceneTreeEditor *get_scene_tree() { return tree; }
diff --git a/editor/icons/CurveTilt.svg b/editor/icons/CurveTilt.svg
new file mode 100644
index 0000000000..4cf7c5bff3
--- /dev/null
+++ b/editor/icons/CurveTilt.svg
@@ -0,0 +1 @@
+<svg enable-background="new 0 0 16 16" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg"><circle cx="8" cy="8" fill="none" r="5" stroke="#ff6" stroke-miterlimit="10" stroke-width="2"/><path d="m8 8v-5" fill="none" stroke="#ff6" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2"/><path d="m8 1c-1.1 0-2 .9-2 2s.9 2 2 2 2-.9 2-2-.9-2-2-2z" fill="#fff"/></svg>
+\ No newline at end of file
diff --git a/editor/icons/TileMapLayer.svg b/editor/icons/TileMapLayer.svg
index 1903a87e3b..90664dee03 100644
--- a/editor/icons/TileMapLayer.svg
+++ b/editor/icons/TileMapLayer.svg
@@ -1 +1 @@
-<svg height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m1 7v2h2v-2zm3 0v2h2v-2zm3 0v2h2v-2zm3 0v2h2v-2zm3 0v2h2v-2z" fill="#8da5f3"/></svg>
-\ No newline at end of file
+<svg width="16" height="16" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg"><path d="M8 2 6.25 3.375 8 4.75l1.75-1.375zm2.624 2.062-1.75 1.375 1.75 1.375 1.75-1.375Zm2.626 2.063L11.5 7.5l1.75 1.375L15 7.5ZM5.376 4.062l-1.75 1.375 1.75 1.375 1.75-1.375zM8 6.124 6.25 7.499 8 8.874l1.75-1.375zm2.626 2.063-1.75 1.375 1.75 1.375 1.75-1.375ZM2.75 6.125 1 7.5l1.75 1.375L4.5 7.5Zm2.624 2.062-1.75 1.375 1.75 1.375 1.75-1.375ZM8 10.25l-1.75 1.375L8 13l1.75-1.375z" fill="#8da5f3"/></svg>
+\ No newline at end of file
diff --git a/editor/plugins/asset_library_editor_plugin.cpp b/editor/plugins/asset_library_editor_plugin.cpp
index e742053a07..559befe5a4 100644
--- a/editor/plugins/asset_library_editor_plugin.cpp
+++ b/editor/plugins/asset_library_editor_plugin.cpp
@@ -55,7 +55,9 @@ static inline void setup_http_request(HTTPRequest *request) {
 }
 
 void EditorAssetLibraryItem::configure(const String &p_title, int p_asset_id, const String &p_category, int p_category_id, const String &p_author, int p_author_id, const String &p_cost) {
-	title->set_text(p_title);
+	title_text = p_title;
+	title->set_text(title_text);
+	title->set_tooltip_text(title_text);
 	asset_id = p_asset_id;
 	category->set_text(p_category);
 	category_id = p_category_id;
@@ -66,16 +68,15 @@ void EditorAssetLibraryItem::configure(const String &p_title, int p_asset_id, co
 
 // TODO: Refactor this method to use the TextServer.
 void EditorAssetLibraryItem::clamp_width(int p_max_width) {
-	int text_pixel_width = title->get_button_font().ptr()->get_string_size(title->get_text()).x * EDSCALE;
-
-	String full_text = title->get_text();
-	title->set_tooltip_text(full_text);
+	int text_pixel_width = title->get_button_font()->get_string_size(title_text).x * EDSCALE; // Measure the stored full title (title_text), not the button's current, possibly truncated, text.
 
 	if (text_pixel_width > p_max_width) {
 		// Truncate title text to within the current column width.
-		int max_length = p_max_width / (text_pixel_width / full_text.length());
-		String truncated_text = full_text.left(max_length - 3) + "...";
+		int max_length = p_max_width / MAX(1, text_pixel_width / MAX(1, title_text.length())); // MAX(1, ...) keeps both integer divisions away from a zero divisor (empty title with a negative p_max_width).
+		String truncated_text = title_text.left(MAX(0, max_length - 3)) + "..."; // Clamp to 0: a negative length makes String::left() strip characters from the right instead of truncating.
 		title->set_text(truncated_text);
+	} else {
+		title->set_text(title_text); // Title fits: show it in full again (undoes an earlier truncation).
 	}
 }
 
@@ -1526,7 +1527,15 @@ void EditorAssetLibrary::_update_asset_items_columns() {
 		asset_items->set_columns(new_columns);
 	}
 
-	asset_items_column_width = (get_size().x / new_columns) - (100 * EDSCALE);
+	asset_items_column_width = (get_size().x / new_columns) - (120 * EDSCALE);
+
+	for (int i = 0; i < asset_items->get_child_count(); i++) {
+		EditorAssetLibraryItem *item = Object::cast_to<EditorAssetLibraryItem>(asset_items->get_child(i));
+		if (!item || !item->is_visible()) {
+			continue;
+		}
+		item->clamp_width(asset_items_column_width);
+	}
 }
 
 void EditorAssetLibrary::_set_library_message(const String &p_message) {
diff --git a/editor/plugins/asset_library_editor_plugin.h b/editor/plugins/asset_library_editor_plugin.h
index d4a1411c18..16b784d629 100644
--- a/editor/plugins/asset_library_editor_plugin.h
+++ b/editor/plugins/asset_library_editor_plugin.h
@@ -62,6 +62,7 @@ class EditorAssetLibraryItem : public PanelContainer {
 	LinkButton *author = nullptr;
 	Label *price = nullptr;
 
+	String title_text;
 	int asset_id = 0;
 	int category_id = 0;
 	int author_id = 0;
diff --git a/editor/plugins/cpu_particles_2d_editor_plugin.cpp b/editor/plugins/cpu_particles_2d_editor_plugin.cpp
index cc78117f01..dfc8323fc0 100644
--- a/editor/plugins/cpu_particles_2d_editor_plugin.cpp
+++ b/editor/plugins/cpu_particles_2d_editor_plugin.cpp
@@ -251,7 +251,7 @@ void CPUParticles2DEditorPlugin::_notification(int p_what) {
 	switch (p_what) {
 		case NOTIFICATION_ENTER_TREE: {
 			menu->get_popup()->connect("id_pressed", callable_mp(this, &CPUParticles2DEditorPlugin::_menu_callback));
-			menu->set_icon(epoints->get_editor_theme_icon(SNAME("CPUParticles2D")));
+			menu->set_icon(file->get_editor_theme_icon(SNAME("CPUParticles2D")));
 			file->connect("file_selected", callable_mp(this, &CPUParticles2DEditorPlugin::_file_selected));
 		} break;
 	}
@@ -284,13 +284,6 @@ CPUParticles2DEditorPlugin::CPUParticles2DEditorPlugin() {
 	file->set_file_mode(EditorFileDialog::FILE_MODE_OPEN_FILE);
 	toolbar->add_child(file);
 
-	epoints = memnew(SpinBox);
-	epoints->set_min(1);
-	epoints->set_max(8192);
-	epoints->set_step(1);
-	epoints->set_value(512);
-	file->get_vbox()->add_margin_child(TTR("Generated Point Count:"), epoints);
-
 	emission_mask = memnew(ConfirmationDialog);
 	emission_mask->set_title(TTR("Load Emission Mask"));
 	VBoxContainer *emvb = memnew(VBoxContainer);
diff --git a/editor/plugins/cpu_particles_2d_editor_plugin.h b/editor/plugins/cpu_particles_2d_editor_plugin.h
index ffcf53af66..a408f771eb 100644
--- a/editor/plugins/cpu_particles_2d_editor_plugin.h
+++ b/editor/plugins/cpu_particles_2d_editor_plugin.h
@@ -66,8 +66,6 @@ class CPUParticles2DEditorPlugin : public EditorPlugin {
 	HBoxContainer *toolbar = nullptr;
 	MenuButton *menu = nullptr;
 
-	SpinBox *epoints = nullptr;
-
 	ConfirmationDialog *emission_mask = nullptr;
 	OptionButton *emission_mask_mode = nullptr;
 	CheckBox *emission_mask_centered = nullptr;
diff --git a/editor/plugins/cpu_particles_3d_editor_plugin.cpp b/editor/plugins/cpu_particles_3d_editor_plugin.cpp
index bf427f733b..b5e3f102cf 100644
--- a/editor/plugins/cpu_particles_3d_editor_plugin.cpp
+++ b/editor/plugins/cpu_particles_3d_editor_plugin.cpp
@@ -56,7 +56,6 @@ void CPUParticles3DEditor::_menu_option(int p_option) {
 	switch (p_option) {
 		case MENU_OPTION_CREATE_EMISSION_VOLUME_FROM_NODE: {
 			emission_tree_dialog->popup_scenetree_dialog();
-
 		} break;
 
 		case MENU_OPTION_RESTART: {
diff --git a/editor/plugins/gpu_particles_2d_editor_plugin.cpp b/editor/plugins/gpu_particles_2d_editor_plugin.cpp
index 5b363056a3..e9f1b07c34 100644
--- a/editor/plugins/gpu_particles_2d_editor_plugin.cpp
+++ b/editor/plugins/gpu_particles_2d_editor_plugin.cpp
@@ -388,13 +388,6 @@ GPUParticles2DEditorPlugin::GPUParticles2DEditorPlugin() {
 	file->set_file_mode(EditorFileDialog::FILE_MODE_OPEN_FILE);
 	toolbar->add_child(file);
 
-	epoints = memnew(SpinBox);
-	epoints->set_min(1);
-	epoints->set_max(8192);
-	epoints->set_step(1);
-	epoints->set_value(512);
-	file->get_vbox()->add_margin_child(TTR("Generated Point Count:"), epoints);
-
 	generate_visibility_rect = memnew(ConfirmationDialog);
 	generate_visibility_rect->set_title(TTR("Generate Visibility Rect"));
 	VBoxContainer *genvb = memnew(VBoxContainer);
diff --git a/editor/plugins/gpu_particles_2d_editor_plugin.h b/editor/plugins/gpu_particles_2d_editor_plugin.h
index f202179eff..aad623ee60 100644
--- a/editor/plugins/gpu_particles_2d_editor_plugin.h
+++ b/editor/plugins/gpu_particles_2d_editor_plugin.h
@@ -68,8 +68,6 @@ class GPUParticles2DEditorPlugin : public EditorPlugin {
 	HBoxContainer *toolbar = nullptr;
 	MenuButton *menu = nullptr;
 
-	SpinBox *epoints = nullptr;
-
 	ConfirmationDialog *generate_visibility_rect = nullptr;
 	SpinBox *generate_seconds = nullptr;
 
diff --git a/editor/plugins/multimesh_editor_plugin.cpp b/editor/plugins/multimesh_editor_plugin.cpp
index 086abc0859..9fec263af3 100644
--- a/editor/plugins/multimesh_editor_plugin.cpp
+++ b/editor/plugins/multimesh_editor_plugin.cpp
@@ -255,13 +255,15 @@ void MultiMeshEditor::edit(MultiMeshInstance3D *p_multimesh) {
 
 void MultiMeshEditor::_browse(bool p_source) {
 	browsing_source = p_source;
-	std->get_scene_tree()->set_marked(node, false);
-	std->popup_scenetree_dialog();
+	Node *browsed_node = nullptr; // Node looked up from the current path text; handed to the dialog as the node to select.
 	if (p_source) {
+		browsed_node = node->get_node_or_null(mesh_source->get_text()); // Path is taken from the mesh source field, looked up relative to `node`.
 		std->set_title(TTR("Select a Source Mesh:"));
 	} else {
+		browsed_node = node->get_node_or_null(surface_source->get_text()); // Path is taken from the surface source field, looked up relative to `node`.
 		std->set_title(TTR("Select a Target Surface:"));
 	}
+	std->popup_scenetree_dialog(browsed_node); // Popped up only after the title has been set.
 }
 
 void MultiMeshEditor::_bind_methods() {
diff --git a/editor/plugins/node_3d_editor_plugin.cpp b/editor/plugins/node_3d_editor_plugin.cpp
index 468d7fb051..7e271bb68e 100644
--- a/editor/plugins/node_3d_editor_plugin.cpp
+++ b/editor/plugins/node_3d_editor_plugin.cpp
@@ -7798,6 +7798,10 @@ Vector<int> Node3DEditor::get_subgizmo_selection() {
 	return ret;
 }
 
+void Node3DEditor::clear_subgizmo_selection(Object *p_obj) { // Thin wrapper that forwards p_obj to _clear_subgizmo_selection().
+	_clear_subgizmo_selection(p_obj);
+}
+
 void Node3DEditor::add_control_to_menu_panel(Control *p_control) {
 	ERR_FAIL_NULL(p_control);
 	ERR_FAIL_COND(p_control->get_parent());
diff --git a/editor/plugins/node_3d_editor_plugin.h b/editor/plugins/node_3d_editor_plugin.h
index 7ddbb74006..66fa932f7c 100644
--- a/editor/plugins/node_3d_editor_plugin.h
+++ b/editor/plugins/node_3d_editor_plugin.h
@@ -895,6 +895,7 @@ public:
 	bool is_current_selected_gizmo(const EditorNode3DGizmo *p_gizmo);
 	bool is_subgizmo_selected(int p_id);
 	Vector<int> get_subgizmo_selection();
+	void clear_subgizmo_selection(Object *p_obj = nullptr);
 
 	Ref<EditorNode3DGizmo> get_current_hover_gizmo() const { return current_hover_gizmo; }
 	void set_current_hover_gizmo(Ref<EditorNode3DGizmo> p_gizmo) { current_hover_gizmo = p_gizmo; }
diff --git a/editor/plugins/path_3d_editor_plugin.cpp b/editor/plugins/path_3d_editor_plugin.cpp
index e4522f5a03..1cffdb6454 100644
--- a/editor/plugins/path_3d_editor_plugin.cpp
+++ b/editor/plugins/path_3d_editor_plugin.cpp
@@ -38,6 +38,7 @@
 #include "editor/editor_string_names.h"
 #include "editor/editor_undo_redo_manager.h"
 #include "node_3d_editor_plugin.h"
+#include "scene/gui/dialogs.h"
 #include "scene/gui/menu_button.h"
 #include "scene/resources/curve.h"
 
@@ -111,7 +112,7 @@ void Path3DGizmo::set_handle(int p_id, bool p_secondary, Camera3D *p_camera, con
 	// Primary handles: position.
 	if (!p_secondary) {
 		Vector3 inters;
-		// Special cas for primary handle, the handle id equals control point id.
+		// Special case for primary handle, the handle id equals control point id.
 		const int idx = p_id;
 		if (p.intersects_ray(ray_from, ray_dir, &inters)) {
 			if (Node3DEditor::get_singleton()->is_snap_enabled()) {
@@ -200,6 +201,22 @@ void Path3DGizmo::commit_handle(int p_id, bool p_secondary, const Variant &p_res
 
 	EditorUndoRedoManager *ur = EditorUndoRedoManager::get_singleton();
 
+	// Primary handles: position.
+	if (!p_secondary && !Path3DEditorPlugin::singleton->curve_edit->is_pressed()) {
+		// Special case for primary handle, the handle id equals control point id.
+		const int idx = p_id;
+		if (p_cancel) {
+			c->set_point_position(idx, p_restore);
+			return;
+		}
+		ur->create_action(TTR("Set Curve Point Position"));
+		ur->add_do_method(c.ptr(), "set_point_position", idx, c->get_point_position(idx));
+		ur->add_undo_method(c.ptr(), "set_point_position", idx, p_restore);
+		ur->commit_action();
+
+		return;
+	}
+
 	// Secondary handles: in, out, tilt.
 	const HandleInfo info = _secondary_handles_info[p_id];
 	const int idx = info.point_idx;
@@ -263,6 +280,7 @@ void Path3DGizmo::redraw() {
 	Ref<StandardMaterial3D> path_thin_material = gizmo_plugin->get_material("path_thin_material", this);
 	Ref<StandardMaterial3D> path_tilt_material = gizmo_plugin->get_material("path_tilt_material", this);
 	Ref<StandardMaterial3D> path_tilt_muted_material = gizmo_plugin->get_material("path_tilt_muted_material", this);
+	Ref<StandardMaterial3D> handles_material = gizmo_plugin->get_material("handles");
 	Ref<StandardMaterial3D> sec_handles_material = gizmo_plugin->get_material("sec_handles");
 
 	Ref<Curve3D> c = path->get_curve();
@@ -340,56 +358,50 @@ void Path3DGizmo::redraw() {
 	if (Path3DEditorPlugin::singleton->get_edited_path() == path) {
 		PackedVector3Array handle_lines;
 		PackedVector3Array tilt_handle_lines;
+		PackedVector3Array primary_handle_points;
 		PackedVector3Array secondary_handle_points;
 		PackedInt32Array collected_secondary_handle_ids; // Avoid shadowing member on Node3DEditorGizmo.
 
 		_secondary_handles_info.resize(c->get_point_count() * 3);
 
 		for (int idx = 0; idx < c->get_point_count(); idx++) {
+			// Collect primary-handles.
 			const Vector3 pos = c->get_point_position(idx);
-			bool is_current_point_selected = is_subgizmo_selected(idx);
-			bool is_previous_point_selected = is_subgizmo_selected(idx - 1);
-			bool is_following_point_selected = is_subgizmo_selected(idx + 1);
+			primary_handle_points.append(pos);
 
 			HandleInfo info;
 			info.point_idx = idx;
 
 			// Collect in-handles except for the first point.
-			if (idx > 0 && (is_current_point_selected || is_previous_point_selected)) {
+			if (idx > 0 && Path3DEditorPlugin::singleton->curve_edit_curve->is_pressed()) {
 				const Vector3 in = c->get_point_in(idx);
 
-				// Display in-handles only when they are "initialized".
-				if (in.length_squared() > 0) {
-					info.type = HandleType::HANDLE_TYPE_IN;
-					const int handle_idx = idx * 3 + 0;
-					collected_secondary_handle_ids.append(handle_idx);
-					_secondary_handles_info.write[handle_idx] = info;
+				info.type = HandleType::HANDLE_TYPE_IN;
+				const int handle_idx = idx * 3 + 0;
+				collected_secondary_handle_ids.append(handle_idx);
+				_secondary_handles_info.write[handle_idx] = info;
 
-					secondary_handle_points.append(pos + in);
-					handle_lines.append(pos);
-					handle_lines.append(pos + in);
-				}
+				secondary_handle_points.append(pos + in);
+				handle_lines.append(pos);
+				handle_lines.append(pos + in);
 			}
 
 			// Collect out-handles except for the last point.
-			if (idx < c->get_point_count() - 1 && (is_current_point_selected || is_following_point_selected)) {
+			if (idx < c->get_point_count() - 1 && Path3DEditorPlugin::singleton->curve_edit_curve->is_pressed()) {
 				const Vector3 out = c->get_point_out(idx);
 
-				// Display out-handles only when they are "initialized".
-				if (out.length_squared() > 0) {
-					info.type = HandleType::HANDLE_TYPE_OUT;
-					const int handle_idx = idx * 3 + 1;
-					collected_secondary_handle_ids.append(handle_idx);
-					_secondary_handles_info.write[handle_idx] = info;
+				info.type = HandleType::HANDLE_TYPE_OUT;
+				const int handle_idx = idx * 3 + 1;
+				collected_secondary_handle_ids.append(handle_idx);
+				_secondary_handles_info.write[handle_idx] = info;
 
-					secondary_handle_points.append(pos + out);
-					handle_lines.append(pos);
-					handle_lines.append(pos + out);
-				}
+				secondary_handle_points.append(pos + out);
+				handle_lines.append(pos);
+				handle_lines.append(pos + out);
 			}
 
 			// Collect tilt-handles.
-			if (is_current_point_selected || is_previous_point_selected || is_following_point_selected) {
+			if (Path3DEditorPlugin::singleton->curve_edit_tilt->is_pressed()) {
 				// Tilt handle.
 				{
 					info.type = HandleType::HANDLE_TYPE_TILT;
@@ -419,7 +431,7 @@ void Path3DGizmo::redraw() {
 						const Vector3 edge = sin(a) * side + cos(a) * up;
 						disk.append(pos + edge * disk_size);
 					}
-					add_vertices(disk, is_current_point_selected ? path_tilt_material : path_tilt_muted_material, Mesh::PRIMITIVE_LINE_STRIP);
+					add_vertices(disk, path_tilt_material, Mesh::PRIMITIVE_LINE_STRIP);
 				}
 			}
 		}
@@ -432,6 +444,9 @@ void Path3DGizmo::redraw() {
 			add_lines(tilt_handle_lines, path_tilt_material);
 		}
 
+		if (!Path3DEditorPlugin::singleton->curve_edit->is_pressed() && primary_handle_points.size()) {
+			add_handles(primary_handle_points, handles_material);
+		}
 		if (secondary_handle_points.size()) {
 			add_handles(secondary_handle_points, sec_handles_material, collected_secondary_handle_ids, false, true);
 		}
@@ -453,6 +468,12 @@ Path3DGizmo::Path3DGizmo(Path3D *p_path, float p_disk_size) {
 
 	// Connecting to a signal once, rather than plaguing the implementation with calls to `Node3DEditor::update_transform_gizmo`.
 	path->connect("curve_changed", callable_mp(this, &Path3DGizmo::_update_transform_gizmo));
+
+	Path3DEditorPlugin::singleton->curve_edit->connect("pressed", callable_mp(this, &Path3DGizmo::redraw));
+	Path3DEditorPlugin::singleton->curve_edit_curve->connect("pressed", callable_mp(this, &Path3DGizmo::redraw));
+	Path3DEditorPlugin::singleton->curve_create->connect("pressed", callable_mp(this, &Path3DGizmo::redraw));
+	Path3DEditorPlugin::singleton->curve_del->connect("pressed", callable_mp(this, &Path3DGizmo::redraw));
+	Path3DEditorPlugin::singleton->curve_close->connect("pressed", callable_mp(this, &Path3DGizmo::redraw));
 }
 
 EditorPlugin::AfterGUIInput Path3DEditorPlugin::forward_3d_gui_input(Camera3D *p_camera, const Ref<InputEvent> &p_event) {
@@ -623,9 +644,6 @@ EditorPlugin::AfterGUIInput Path3DEditorPlugin::forward_3d_gui_input(Camera3D *p
 				}
 			}
 		}
-		if (curve_edit_curve->is_pressed()) {
-			mb->set_shift_pressed(true);
-		}
 	}
 
 	return EditorPlugin::AFTER_GUI_INPUT_PASS;
@@ -674,8 +692,11 @@ void Path3DEditorPlugin::make_visible(bool p_visible) {
 void Path3DEditorPlugin::_mode_changed(int p_mode) {
 	curve_create->set_pressed(p_mode == MODE_CREATE);
 	curve_edit_curve->set_pressed(p_mode == MODE_EDIT_CURVE);
+	curve_edit_tilt->set_pressed(p_mode == MODE_EDIT_TILT); // Pressed only while p_mode is MODE_EDIT_TILT, like the other mode buttons.
 	curve_edit->set_pressed(p_mode == MODE_EDIT);
 	curve_del->set_pressed(p_mode == MODE_DELETE);
+
+	Node3DEditor::get_singleton()->clear_subgizmo_selection(); // Runs on every mode change, with no explicit object argument.
 }
 
 void Path3DEditorPlugin::_close_curve() {
@@ -715,14 +736,57 @@ void Path3DEditorPlugin::_handle_option_pressed(int p_option) {
 	}
 }
 
+void Path3DEditorPlugin::_confirm_clear_points() { // Connected to the curve_clear_points button; shows the confirmation dialog.
+	if (!path || path->get_curve().is_null() || path->get_curve()->get_point_count() == 0) {
+		return; // No path, no curve, or no points: nothing to clear, so the dialog is not shown.
+	}
+	clear_points_dialog->reset_size();
+	clear_points_dialog->popup_centered();
+}
+
+void Path3DEditorPlugin::_clear_points() { // Connected to clear_points_dialog's "confirmed" signal; registers the clear as an undo/redo action.
+	EditorUndoRedoManager *undo_redo = EditorUndoRedoManager::get_singleton();
+	PackedVector3Array points = path->get_curve()->get_points().duplicate(); // NOTE(review): path and its curve are dereferenced without a null check; presumably validated by _confirm_clear_points() before the dialog opens — confirm.
+
+	undo_redo->create_action(TTR("Clear Curve Points"));
+	undo_redo->add_do_method(this, "_clear_curve_points"); // Method is referenced by name.
+	undo_redo->add_undo_method(this, "_restore_curve_points", points); // Undo receives the snapshot taken above.
+	undo_redo->commit_action();
+}
+
+void Path3DEditorPlugin::_clear_curve_points() { // "Do" method of the "Clear Curve Points" undo/redo action.
+	if (!path || path->get_curve().is_null() || path->get_curve()->get_point_count() == 0) {
+		return; // Nothing to clear.
+	}
+	Ref<Curve3D> curve = path->get_curve();
+	curve->clear_points();
+}
+
+void Path3DEditorPlugin::_restore_curve_points(const PackedVector3Array &p_points) { // Undo method of the "Clear Curve Points" action: rebuilds the curve from p_points.
+	if (!path || path->get_curve().is_null()) {
+		return; // No curve to restore into.
+	}
+	Ref<Curve3D> curve = path->get_curve();
+
+	if (curve->get_point_count() > 0) {
+		curve->clear_points(); // Start from an empty curve so restored points are not appended to existing ones.
+	}
+
+	for (int i = 0; i + 2 < p_points.size(); i += 3) { // Consume complete triples only; `i < size` would index past the end when the length is not a multiple of 3.
+		curve->add_point(p_points[i + 2], p_points[i], p_points[i + 1]); // Each triple is read as (in, out, position) and passed as add_point(position, in, out).
+	} // NOTE(review): only position/in/out are re-added here; per-point tilt does not appear to be restored — confirm.
+}
+
 void Path3DEditorPlugin::_update_theme() {
 	// TODO: Split the EditorPlugin instance from the UI instance and connect this properly.
 	// See the 2D path editor for inspiration.
 	curve_edit->set_icon(EditorNode::get_singleton()->get_editor_theme()->get_icon(SNAME("CurveEdit"), EditorStringName(EditorIcons)));
 	curve_edit_curve->set_icon(EditorNode::get_singleton()->get_editor_theme()->get_icon(SNAME("CurveCurve"), EditorStringName(EditorIcons)));
+	curve_edit_tilt->set_icon(EditorNode::get_singleton()->get_editor_theme()->get_icon(SNAME("CurveTilt"), EditorStringName(EditorIcons))); // Icon for the tilt-mode toggle button.
 	curve_create->set_icon(EditorNode::get_singleton()->get_editor_theme()->get_icon(SNAME("CurveCreate"), EditorStringName(EditorIcons)));
 	curve_del->set_icon(EditorNode::get_singleton()->get_editor_theme()->get_icon(SNAME("CurveDelete"), EditorStringName(EditorIcons)));
 	curve_close->set_icon(EditorNode::get_singleton()->get_editor_theme()->get_icon(SNAME("CurveClose"), EditorStringName(EditorIcons)));
+	curve_clear_points->set_icon(EditorNode::get_singleton()->get_editor_theme()->get_icon(SNAME("Clear"), EditorStringName(EditorIcons))); // Icon for the "Clear Points" button.
 }
 
 void Path3DEditorPlugin::_notification(int p_what) {
@@ -730,6 +794,7 @@ void Path3DEditorPlugin::_notification(int p_what) {
 		case NOTIFICATION_ENTER_TREE: {
 			curve_create->connect("pressed", callable_mp(this, &Path3DEditorPlugin::_mode_changed).bind(MODE_CREATE));
 			curve_edit_curve->connect("pressed", callable_mp(this, &Path3DEditorPlugin::_mode_changed).bind(MODE_EDIT_CURVE));
+			curve_edit_tilt->connect("pressed", callable_mp(this, &Path3DEditorPlugin::_mode_changed).bind(MODE_EDIT_TILT));
 			curve_edit->connect("pressed", callable_mp(this, &Path3DEditorPlugin::_mode_changed).bind(MODE_EDIT));
 			curve_del->connect("pressed", callable_mp(this, &Path3DEditorPlugin::_mode_changed).bind(MODE_DELETE));
 			curve_close->connect("pressed", callable_mp(this, &Path3DEditorPlugin::_close_curve));
@@ -747,6 +812,8 @@ void Path3DEditorPlugin::_notification(int p_what) {
 }
 
 void Path3DEditorPlugin::_bind_methods() {
+	ClassDB::bind_method(D_METHOD("_clear_curve_points"), &Path3DEditorPlugin::_clear_curve_points); // Registered because the undo/redo action in _clear_points() refers to it by name.
+	ClassDB::bind_method(D_METHOD("_restore_curve_points"), &Path3DEditorPlugin::_restore_curve_points); // Registered for the same reason; used as the undo method.
 }
 
 Path3DEditorPlugin *Path3DEditorPlugin::singleton = nullptr;
@@ -761,6 +828,7 @@ Path3DEditorPlugin::Path3DEditorPlugin() {
 
 	Ref<Path3DGizmoPlugin> gizmo_plugin = memnew(Path3DGizmoPlugin(disk_size));
 	Node3DEditor::get_singleton()->add_gizmo_plugin(gizmo_plugin);
+	path_3d_gizmo_plugin = gizmo_plugin;
 
 	topmenu_bar = memnew(HBoxContainer);
 	topmenu_bar->hide();
@@ -770,16 +838,23 @@ Path3DEditorPlugin::Path3DEditorPlugin() {
 	curve_edit->set_theme_type_variation("FlatButton");
 	curve_edit->set_toggle_mode(true);
 	curve_edit->set_focus_mode(Control::FOCUS_NONE);
-	curve_edit->set_tooltip_text(TTR("Select Points") + "\n" + TTR("Shift+Drag: Select Control Points") + "\n" + keycode_get_string((Key)KeyModifierMask::CMD_OR_CTRL) + TTR("Click: Add Point") + "\n" + TTR("Right Click: Delete Point"));
+	curve_edit->set_tooltip_text(TTR("Select Points") + "\n" + TTR("Shift+Click: Select multiple Points") + "\n" + keycode_get_string((Key)KeyModifierMask::CMD_OR_CTRL) + TTR("Click: Add Point") + "\n" + TTR("Right Click: Delete Point"));
 	topmenu_bar->add_child(curve_edit);
 
 	curve_edit_curve = memnew(Button);
 	curve_edit_curve->set_theme_type_variation("FlatButton");
 	curve_edit_curve->set_toggle_mode(true);
 	curve_edit_curve->set_focus_mode(Control::FOCUS_NONE);
-	curve_edit_curve->set_tooltip_text(TTR("Select Control Points (Shift+Drag)"));
+	curve_edit_curve->set_tooltip_text(TTR("Select Control Points") + "\n" + TTR("Shift+Click: Drag out Control Points"));
 	topmenu_bar->add_child(curve_edit_curve);
 
+	curve_edit_tilt = memnew(Button);
+	curve_edit_tilt->set_theme_type_variation("FlatButton");
+	curve_edit_tilt->set_toggle_mode(true);
+	curve_edit_tilt->set_focus_mode(Control::FOCUS_NONE);
+	curve_edit_tilt->set_tooltip_text(TTR("Select Tilt Handles"));
+	topmenu_bar->add_child(curve_edit_tilt);
+
 	curve_create = memnew(Button);
 	curve_create->set_theme_type_variation("FlatButton");
 	curve_create->set_toggle_mode(true);
@@ -800,7 +875,18 @@ Path3DEditorPlugin::Path3DEditorPlugin() {
 	curve_close->set_tooltip_text(TTR("Close Curve"));
 	topmenu_bar->add_child(curve_close);
 
-	PopupMenu *menu;
+	curve_clear_points = memnew(Button);
+	curve_clear_points->set_theme_type_variation("FlatButton");
+	curve_clear_points->set_focus_mode(Control::FOCUS_NONE);
+	curve_clear_points->set_tooltip_text(TTR("Clear Points"));
+	curve_clear_points->connect("pressed", callable_mp(this, &Path3DEditorPlugin::_confirm_clear_points));
+	topmenu_bar->add_child(curve_clear_points);
+
+	clear_points_dialog = memnew(ConfirmationDialog);
+	clear_points_dialog->set_title(TTR("Please Confirm..."));
+	clear_points_dialog->set_text(TTR("Remove all curve points?"));
+	clear_points_dialog->connect("confirmed", callable_mp(this, &Path3DEditorPlugin::_clear_points));
+	topmenu_bar->add_child(clear_points_dialog);
 
 	handle_menu = memnew(MenuButton);
 	handle_menu->set_flat(false);
@@ -808,6 +894,7 @@ Path3DEditorPlugin::Path3DEditorPlugin() {
 	handle_menu->set_text(TTR("Options"));
 	topmenu_bar->add_child(handle_menu);
 
+	PopupMenu *menu;
 	menu = handle_menu->get_popup();
 	menu->add_check_item(TTR("Mirror Handle Angles"));
 	menu->set_item_checked(HANDLE_OPTION_ANGLE, mirror_handle_angle);
@@ -849,11 +936,13 @@ void Path3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) {
 	Ref<StandardMaterial3D> handle_material = get_material("handles", p_gizmo);
 	PackedVector3Array handles;
 
-	for (int idx = 0; idx < curve->get_point_count(); ++idx) {
-		// Collect handles.
-		const Vector3 pos = curve->get_point_position(idx);
+	if (Path3DEditorPlugin::singleton->curve_edit->is_pressed()) {
+		for (int idx = 0; idx < curve->get_point_count(); ++idx) {
+			// Collect handles.
+			const Vector3 pos = curve->get_point_position(idx);
 
-		handles.append(pos);
+			handles.append(pos);
+		}
 	}
 
 	if (handles.size()) {
@@ -867,10 +956,12 @@ int Path3DGizmoPlugin::subgizmos_intersect_ray(const EditorNode3DGizmo *p_gizmo,
 	Ref<Curve3D> curve = path->get_curve();
 	ERR_FAIL_COND_V(curve.is_null(), -1);
 
-	for (int idx = 0; idx < curve->get_point_count(); ++idx) {
-		Vector3 pos = path->get_global_transform().xform(curve->get_point_position(idx));
-		if (p_camera->unproject_position(pos).distance_to(p_point) < 20) {
-			return idx;
+	if (Path3DEditorPlugin::singleton->curve_edit->is_pressed()) {
+		for (int idx = 0; idx < curve->get_point_count(); ++idx) {
+			Vector3 pos = path->get_global_transform().xform(curve->get_point_position(idx));
+			if (p_camera->unproject_position(pos).distance_to(p_point) < 20) {
+				return idx;
+			}
 		}
 	}
 	return -1;
@@ -884,18 +975,20 @@ Vector<int> Path3DGizmoPlugin::subgizmos_intersect_frustum(const EditorNode3DGiz
 	Ref<Curve3D> curve = path->get_curve();
 	ERR_FAIL_COND_V(curve.is_null(), contained_points);
 
-	for (int idx = 0; idx < curve->get_point_count(); ++idx) {
-		Vector3 pos = path->get_global_transform().xform(curve->get_point_position(idx));
-		bool is_contained_in_frustum = true;
-		for (int i = 0; i < p_frustum.size(); ++i) {
-			if (p_frustum[i].distance_to(pos) > 0) {
-				is_contained_in_frustum = false;
-				break;
+	if (Path3DEditorPlugin::singleton->curve_edit->is_pressed()) {
+		for (int idx = 0; idx < curve->get_point_count(); ++idx) {
+			Vector3 pos = path->get_global_transform().xform(curve->get_point_position(idx));
+			bool is_contained_in_frustum = true;
+			for (int i = 0; i < p_frustum.size(); ++i) {
+				if (p_frustum[i].distance_to(pos) > 0) {
+					is_contained_in_frustum = false;
+					break;
+				}
 			}
-		}
 
-		if (is_contained_in_frustum) {
-			contained_points.push_back(idx);
+			if (is_contained_in_frustum) {
+				contained_points.push_back(idx);
+			}
 		}
 	}
 
diff --git a/editor/plugins/path_3d_editor_plugin.h b/editor/plugins/path_3d_editor_plugin.h
index 931ac7128c..6a933a419f 100644
--- a/editor/plugins/path_3d_editor_plugin.h
+++ b/editor/plugins/path_3d_editor_plugin.h
@@ -38,6 +38,7 @@
 
 class HBoxContainer;
 class MenuButton;
+class ConfirmationDialog;
 
 class Path3DGizmo : public EditorNode3DGizmo {
 	GDCLASS(Path3DGizmo, EditorNode3DGizmo);
@@ -106,20 +107,30 @@ public:
 class Path3DEditorPlugin : public EditorPlugin {
 	GDCLASS(Path3DEditorPlugin, EditorPlugin);
 
+	friend class Path3DGizmo;
+	friend class Path3DGizmoPlugin;
+
+	Ref<Path3DGizmoPlugin> path_3d_gizmo_plugin;
+
 	HBoxContainer *topmenu_bar = nullptr;
 	Button *curve_create = nullptr;
 	Button *curve_edit = nullptr;
 	Button *curve_edit_curve = nullptr;
+	Button *curve_edit_tilt = nullptr;
 	Button *curve_del = nullptr;
 	Button *curve_close = nullptr;
+	Button *curve_clear_points = nullptr;
 	MenuButton *handle_menu = nullptr;
 
+	ConfirmationDialog *clear_points_dialog = nullptr;
+
 	float disk_size = 0.8;
 
 	enum Mode {
 		MODE_CREATE,
 		MODE_EDIT,
 		MODE_EDIT_CURVE,
+		MODE_EDIT_TILT,
 		MODE_DELETE,
 		ACTION_CLOSE
 	};
@@ -135,6 +146,11 @@ class Path3DEditorPlugin : public EditorPlugin {
 	bool mirror_handle_angle;
 	bool mirror_handle_length;
 
+	void _confirm_clear_points();
+	void _clear_points();
+	void _clear_curve_points();
+	void _restore_curve_points(const PackedVector3Array &p_points);
+
 	enum HandleOption {
 		HANDLE_OPTION_ANGLE,
 		HANDLE_OPTION_LENGTH
diff --git a/editor/plugins/script_editor_plugin.cpp b/editor/plugins/script_editor_plugin.cpp
index 6f1eef62de..eeb84cb79e 100644
--- a/editor/plugins/script_editor_plugin.cpp
+++ b/editor/plugins/script_editor_plugin.cpp
@@ -1224,8 +1224,8 @@ void ScriptEditor::_menu_option(int p_option) {
 			for (const String &E : textfile_extensions) {
 				file_dialog->add_filter("*." + E, E.to_upper());
 			}
-			file_dialog->popup_file_dialog();
 			file_dialog->set_title(TTR("New Text File..."));
+			file_dialog->popup_file_dialog();
 			open_textfile_after_create = true;
 		} break;
 		case FILE_OPEN: {
@@ -1244,8 +1244,8 @@ void ScriptEditor::_menu_option(int p_option) {
 				file_dialog->add_filter("*." + E, E.to_upper());
 			}
 
-			file_dialog->popup_file_dialog();
 			file_dialog->set_title(TTR("Open File"));
+			file_dialog->popup_file_dialog();
 			return;
 		} break;
 		case FILE_REOPEN_CLOSED: {
@@ -1368,8 +1368,8 @@ void ScriptEditor::_menu_option(int p_option) {
 					file_dialog->clear_filters();
 					file_dialog->set_current_dir(text_file->get_path().get_base_dir());
 					file_dialog->set_current_file(text_file->get_path().get_file());
-					file_dialog->popup_file_dialog();
 					file_dialog->set_title(TTR("Save File As..."));
+					file_dialog->popup_file_dialog();
 					break;
 				}
 
@@ -1543,8 +1543,8 @@ void ScriptEditor::_theme_option(int p_option) {
 			file_dialog_option = THEME_IMPORT;
 			file_dialog->clear_filters();
 			file_dialog->add_filter("*.tet");
-			file_dialog->popup_file_dialog();
 			file_dialog->set_title(TTR("Import Theme"));
+			file_dialog->popup_file_dialog();
 		} break;
 		case THEME_RELOAD: {
 			EditorSettings::get_singleton()->load_text_editor_theme();
@@ -1569,8 +1569,8 @@ void ScriptEditor::_show_save_theme_as_dialog() {
 	file_dialog->clear_filters();
 	file_dialog->add_filter("*.tet");
 	file_dialog->set_current_path(EditorPaths::get_singleton()->get_text_editor_themes_dir().path_join(EDITOR_GET("text_editor/theme/color_theme")));
-	file_dialog->popup_file_dialog();
 	file_dialog->set_title(TTR("Save Theme As..."));
+	file_dialog->popup_file_dialog();
 }
 
 bool ScriptEditor::_has_docs_tab() const {
diff --git a/editor/plugins/tiles/tile_data_editors.cpp b/editor/plugins/tiles/tile_data_editors.cpp
index 597fd7393f..f047e4ff16 100644
--- a/editor/plugins/tiles/tile_data_editors.cpp
+++ b/editor/plugins/tiles/tile_data_editors.cpp
@@ -1280,7 +1280,7 @@ void TileDataDefaultEditor::setup_property_editor(Variant::Type p_type, const St
 	property_editor = EditorInspectorDefaultPlugin::get_editor_for_property(dummy_object, p_type, p_property, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_DEFAULT);
 	property_editor->set_object_and_property(dummy_object, p_property);
 	if (p_label.is_empty()) {
-		property_editor->set_label(EditorPropertyNameProcessor::get_singleton()->process_name(p_property, EditorPropertyNameProcessor::get_default_inspector_style()));
+		property_editor->set_label(EditorPropertyNameProcessor::get_singleton()->process_name(p_property, EditorPropertyNameProcessor::get_default_inspector_style(), p_property));
 	} else {
 		property_editor->set_label(p_label);
 	}
diff --git a/editor/plugins/tiles/tile_map_layer_editor.cpp b/editor/plugins/tiles/tile_map_layer_editor.cpp
index ccadc0643b..ac4708aa06 100644
--- a/editor/plugins/tiles/tile_map_layer_editor.cpp
+++ b/editor/plugins/tiles/tile_map_layer_editor.cpp
@@ -36,9 +36,9 @@
 #include "editor/editor_resource_preview.h"
 #include "editor/editor_settings.h"
 #include "editor/editor_undo_redo_manager.h"
+#include "editor/multi_node_edit.h"
 #include "editor/plugins/canvas_item_editor_plugin.h"
 #include "editor/themes/editor_scale.h"
-
 #include "scene/2d/camera_2d.h"
 #include "scene/2d/tile_map_layer.h"
 #include "scene/gui/center_container.h"
@@ -107,7 +107,7 @@ void TileMapLayerEditorTilesPlugin::_update_transform_buttons() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null() || selection_pattern.is_null()) {
 		return;
 	}
@@ -171,7 +171,7 @@ void TileMapLayerEditorTilesPlugin::_update_tile_set_sources_list() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -256,7 +256,7 @@ void TileMapLayerEditorTilesPlugin::_update_source_display() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -302,7 +302,7 @@ void TileMapLayerEditorTilesPlugin::_patterns_item_list_gui_input(const Ref<Inpu
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -348,7 +348,7 @@ void TileMapLayerEditorTilesPlugin::_update_patterns_list() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -380,7 +380,7 @@ void TileMapLayerEditorTilesPlugin::_update_atlas_view() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -401,7 +401,7 @@ void TileMapLayerEditorTilesPlugin::_update_scenes_collection_view() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -459,7 +459,7 @@ void TileMapLayerEditorTilesPlugin::_scenes_list_multi_selected(int p_index, boo
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -538,7 +538,7 @@ bool TileMapLayerEditorTilesPlugin::forward_canvas_gui_input(const Ref<InputEven
 		return false;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return false;
 	}
@@ -784,7 +784,7 @@ void TileMapLayerEditorTilesPlugin::forward_canvas_draw_over_viewport(Control *p
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -1021,7 +1021,7 @@ TileMapCell TileMapLayerEditorTilesPlugin::_pick_random_tile(Ref<TileMapPattern>
 		return TileMapCell();
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return TileMapCell();
 	}
@@ -1073,7 +1073,7 @@ HashMap<Vector2i, TileMapCell> TileMapLayerEditorTilesPlugin::_draw_line(Vector2
 		return HashMap<Vector2i, TileMapCell>();
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return HashMap<Vector2i, TileMapCell>();
 	}
@@ -1122,7 +1122,7 @@ HashMap<Vector2i, TileMapCell> TileMapLayerEditorTilesPlugin::_draw_rect(Vector2
 		return HashMap<Vector2i, TileMapCell>();
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return HashMap<Vector2i, TileMapCell>();
 	}
@@ -1181,7 +1181,7 @@ HashMap<Vector2i, TileMapCell> TileMapLayerEditorTilesPlugin::_draw_bucket_fill(
 		return HashMap<Vector2i, TileMapCell>();
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return HashMap<Vector2i, TileMapCell>();
 	}
@@ -1293,7 +1293,7 @@ void TileMapLayerEditorTilesPlugin::_stop_dragging() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -1601,7 +1601,7 @@ void TileMapLayerEditorTilesPlugin::_update_fix_selected_and_hovered() {
 		selection_pattern.instantiate();
 		return;
 	}
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		hovered_tile.source_id = TileSet::INVALID_SOURCE;
 		hovered_tile.set_atlas_coords(TileSetSource::INVALID_ATLAS_COORDS);
@@ -1688,7 +1688,7 @@ void TileMapLayerEditorTilesPlugin::_update_selection_pattern_from_tilemap_selec
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -1709,7 +1709,7 @@ void TileMapLayerEditorTilesPlugin::_update_selection_pattern_from_tileset_tiles
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -1783,7 +1783,7 @@ void TileMapLayerEditorTilesPlugin::_update_selection_pattern_from_tileset_patte
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -1822,7 +1822,7 @@ void TileMapLayerEditorTilesPlugin::_tile_atlas_control_draw() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -1901,7 +1901,7 @@ void TileMapLayerEditorTilesPlugin::_tile_atlas_control_gui_input(const Ref<Inpu
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -2007,7 +2007,7 @@ void TileMapLayerEditorTilesPlugin::_tile_alternatives_control_draw() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -2059,7 +2059,7 @@ void TileMapLayerEditorTilesPlugin::_tile_alternatives_control_gui_input(const R
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -2157,16 +2157,21 @@ void TileMapLayerEditorTilesPlugin::edit(ObjectID p_tile_map_layer_id) {
 
 	// Disable sort button if the tileset is read-only
 	TileMapLayer *edited_layer = _get_edited_layer();
+	Ref<TileSet> tile_set;
 	if (edited_layer) {
-		Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+		tile_set = edited_layer->get_tile_set();
 		if (tile_set.is_valid()) {
 			source_sort_button->set_disabled(EditorNode::get_singleton()->is_resource_read_only(tile_set));
 		}
 	}
 
-	if (edited_tile_map_layer_id != p_tile_map_layer_id) {
-		edited_tile_map_layer_id = p_tile_map_layer_id;
+	TileMapLayer *new_tile_map_layer = Object::cast_to<TileMapLayer>(ObjectDB::get_instance(edited_tile_map_layer_id));
+	Ref<TileSet> new_tile_set;
+	if (new_tile_map_layer) {
+		new_tile_set = new_tile_map_layer->get_tile_set();
+	}
 
+	if (tile_set.is_valid() && tile_set != new_tile_set) {
 		// Clear the selection.
 		tile_set_selection.clear();
 		patterns_item_list->deselect_all();
@@ -2531,7 +2536,7 @@ HashMap<Vector2i, TileMapCell> TileMapLayerEditorTerrainsPlugin::_draw_terrain_p
 		return HashMap<Vector2i, TileMapCell>();
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return HashMap<Vector2i, TileMapCell>();
 	}
@@ -2583,7 +2588,7 @@ HashMap<Vector2i, TileMapCell> TileMapLayerEditorTerrainsPlugin::_draw_terrain_p
 		return HashMap<Vector2i, TileMapCell>();
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return HashMap<Vector2i, TileMapCell>();
 	}
@@ -2630,7 +2635,7 @@ HashMap<Vector2i, TileMapCell> TileMapLayerEditorTerrainsPlugin::_draw_line(Vect
 		return HashMap<Vector2i, TileMapCell>();
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return HashMap<Vector2i, TileMapCell>();
 	}
@@ -2654,7 +2659,7 @@ HashMap<Vector2i, TileMapCell> TileMapLayerEditorTerrainsPlugin::_draw_rect(Vect
 		return HashMap<Vector2i, TileMapCell>();
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return HashMap<Vector2i, TileMapCell>();
 	}
@@ -2688,7 +2693,7 @@ RBSet<Vector2i> TileMapLayerEditorTerrainsPlugin::_get_cells_for_bucket_fill(Vec
 		return RBSet<Vector2i>();
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return RBSet<Vector2i>();
 	}
@@ -2799,7 +2804,7 @@ HashMap<Vector2i, TileMapCell> TileMapLayerEditorTerrainsPlugin::_draw_bucket_fi
 		return HashMap<Vector2i, TileMapCell>();
 	}
 
-	const Ref<TileSet> &tile_set = edited_layer->get_effective_tile_set();
+	const Ref<TileSet> &tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return HashMap<Vector2i, TileMapCell>();
 	}
@@ -2827,7 +2832,7 @@ void TileMapLayerEditorTerrainsPlugin::_stop_dragging() {
 		return;
 	}
 
-	const Ref<TileSet> &tile_set = edited_layer->get_effective_tile_set();
+	const Ref<TileSet> &tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -2961,7 +2966,7 @@ void TileMapLayerEditorTerrainsPlugin::_update_selection() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -3013,7 +3018,7 @@ bool TileMapLayerEditorTerrainsPlugin::forward_canvas_gui_input(const Ref<InputE
 		return false;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return false;
 	}
@@ -3150,7 +3155,7 @@ void TileMapLayerEditorTerrainsPlugin::forward_canvas_draw_over_viewport(Control
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -3275,7 +3280,7 @@ void TileMapLayerEditorTerrainsPlugin::_update_terrains_cache() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -3342,7 +3347,7 @@ void TileMapLayerEditorTerrainsPlugin::_update_terrains_tree() {
 	const TileMapLayer *edited_layer = _get_edited_layer();
 	ERR_FAIL_NULL(edited_layer);
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -3389,7 +3394,7 @@ void TileMapLayerEditorTerrainsPlugin::_update_tiles_list() {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -3627,57 +3632,258 @@ TileMapLayer *TileMapLayerEditor::_get_edited_layer() const {
 	return Object::cast_to<TileMapLayer>(ObjectDB::get_instance(edited_tile_map_layer_id));
 }
 
+void TileMapLayerEditor::_find_tile_map_layers_in_scene(Node *p_current, const Node *p_owner, Vector<TileMapLayer *> &r_list) const {
+	ERR_FAIL_COND(!p_current || !p_owner);
+	// Only visit p_owner itself and nodes whose owner is p_owner; anything else prunes its whole subtree.
+	const bool in_scene = p_current == p_owner || p_current->get_owner() == p_owner;
+	if (!in_scene) {
+		return;
+	}
+	if (TileMapLayer *found = Object::cast_to<TileMapLayer>(p_current)) {
+		r_list.append(found);
+	}
+	for (int idx = 0; idx < p_current->get_child_count(); idx++) { // Pre-order walk: a node is listed before its children.
+		_find_tile_map_layers_in_scene(p_current->get_child(idx), p_owner, r_list);
+	}
+}
+
+void TileMapLayerEditor::_update_tile_map_layers_in_scene_list_cache() {
+	// Lazy rebuild: nothing to do while the cached list is not flagged as stale.
+	if (!layers_in_scene_list_cache_needs_update) {
+		return;
+	}
+	Node *scene_root = EditorNode::get_singleton()->get_edited_scene();
+	if (scene_root == nullptr) {
+		return; // No edited scene: the list and the stale flag are both left untouched.
+	}
+	// Rebuild the list by walking the edited scene from its root, then mark it as current.
+	tile_map_layers_in_scene_cache.clear();
+	_find_tile_map_layers_in_scene(scene_root, scene_root, tile_map_layers_in_scene_cache);
+	layers_in_scene_list_cache_needs_update = false;
+}
+
+void TileMapLayerEditor::_node_change(Node *p_node) {
+	// Connected to the scene tree's "node_added" and "node_removed" signals.
+	// Flags the cached layer list as stale when a TileMapLayer of the edited scene is involved; once set, the flag stays set.
+	layers_in_scene_list_cache_needs_update = layers_in_scene_list_cache_needs_update || (p_node->is_part_of_edited_scene() && Object::cast_to<TileMapLayer>(p_node) != nullptr);
+}
+
 void TileMapLayerEditor::_notification(int p_what) {
 	switch (p_what) {
+		case NOTIFICATION_READY: {
+			get_tree()->connect("node_added", callable_mp(this, &TileMapLayerEditor::_node_change));
+			get_tree()->connect("node_removed", callable_mp(this, &TileMapLayerEditor::_node_change));
+		} break;
 		case NOTIFICATION_THEME_CHANGED: {
 			missing_tile_texture = get_editor_theme_icon(SNAME("StatusWarning"));
 			warning_pattern_texture = get_editor_theme_icon(SNAME("WarningPattern"));
 			advanced_menu_button->set_icon(get_editor_theme_icon(SNAME("Tools")));
+			select_previous_layer->set_icon(get_editor_theme_icon(SNAME("MoveUp")));
+			select_next_layer->set_icon(get_editor_theme_icon(SNAME("MoveDown")));
+			select_all_layers->set_icon(get_editor_theme_icon(SNAME("FileList")));
 			toggle_grid_button->set_icon(get_editor_theme_icon(SNAME("Grid")));
-			toggle_grid_button->set_pressed(EDITOR_GET("editors/tiles_editor/display_grid"));
 			toggle_highlight_selected_layer_button->set_icon(get_editor_theme_icon(SNAME("TileMapHighlightSelected")));
 		} break;
 
 		case NOTIFICATION_INTERNAL_PROCESS: {
-			if (is_visible_in_tree() && tileset_changed_needs_update) {
+			if (is_visible_in_tree() && tile_map_layer_changed_needs_update) {
 				_update_bottom_panel();
-				update_layers_selector();
-				_update_highlighting_toggle();
+				_update_layers_selector();
 				tabs_plugins[tabs_bar->get_current_tab()]->tile_set_changed();
 				CanvasItemEditor::get_singleton()->update_viewport();
-				tileset_changed_needs_update = false;
+				tile_map_layer_changed_needs_update = false;
 			}
 		} break;
 
 		case EditorSettings::NOTIFICATION_EDITOR_SETTINGS_CHANGED: {
 			if (EditorSettings::get_singleton()->check_changed_settings_in_group("editors/tiles_editor")) {
 				toggle_grid_button->set_pressed_no_signal(EDITOR_GET("editors/tiles_editor/display_grid"));
+				toggle_highlight_selected_layer_button->set_pressed_no_signal(EDITOR_GET("editors/tiles_editor/highlight_selected_layer"));
 			}
 		} break;
 	}
 }
 
-void TileMapLayerEditor::_bind_methods() {
-	ADD_SIGNAL(MethodInfo("change_selected_layer_request", PropertyInfo(Variant::STRING_NAME, "layer_name")));
-}
-
 void TileMapLayerEditor::_on_grid_toggled(bool p_pressed) {
 	EditorSettings::get_singleton()->set("editors/tiles_editor/display_grid", p_pressed);
 	CanvasItemEditor::get_singleton()->update_viewport();
 }
 
-void TileMapLayerEditor::_layers_selection_item_selected(int p_index) {
-	emit_signal("change_selected_layer_request", layers_selection_button->get_item_metadata(p_index));
+void TileMapLayerEditor::_select_previous_layer_pressed() {
+	_layers_select_next_or_previous(false);
 }
 
-void TileMapLayerEditor::_highlight_selected_layer_button_toggled(bool p_pressed) {
+void TileMapLayerEditor::_select_next_layer_pressed() {
+	_layers_select_next_or_previous(true);
+}
+
+void TileMapLayerEditor::_select_all_layers_pressed() {
+	EditorNode *en = EditorNode::get_singleton();
+	Node *edited_scene_root = en->get_edited_scene();
+	ERR_FAIL_NULL(edited_scene_root);
+	// Refresh the scene layer cache before inspecting it, so the single-layer shortcut below never acts on a stale list.
+	_update_tile_map_layers_in_scene_list_cache();
+	en->get_editor_selection()->clear();
+	if (tile_map_layers_in_scene_cache.size() == 1) {
+		en->edit_node(tile_map_layers_in_scene_cache[0]); // A single layer is edited directly instead of through a MultiNodeEdit.
+		en->get_editor_selection()->add_node(tile_map_layers_in_scene_cache[0]);
+	} else {
+		Ref<MultiNodeEdit> multi_node_edit = memnew(MultiNodeEdit);
+		for (TileMapLayer *layer : tile_map_layers_in_scene_cache) {
+			multi_node_edit->add_node(edited_scene_root->get_path_to(layer));
+			en->get_editor_selection()->add_node(layer);
+		}
+		en->push_item(multi_node_edit.ptr());
+	}
+}
+
+void TileMapLayerEditor::_layers_selection_item_selected(int p_index) {
 	TileMapLayer *edited_layer = _get_edited_layer();
 	ERR_FAIL_NULL(edited_layer);
 
-	TileMapLayerGroup *tile_map_layer_group = Object::cast_to<TileMapLayerGroup>(edited_layer->get_parent());
-	ERR_FAIL_NULL(tile_map_layer_group);
+	TileMap *tile_map = Object::cast_to<TileMap>(edited_layer->get_parent());
+	ERR_FAIL_NULL(tile_map);
+
+	TileMapLayer *new_edited = Object::cast_to<TileMapLayer>(tile_map->get_child(p_index));
+	edit(new_edited);
+}
+
+void TileMapLayerEditor::_update_layers_selector() {
+	const TileMapLayer *edited_layer = _get_edited_layer();
+
+	// Reset the controls to the defaults kept for a layer that is not part of a TileMap; the branches below override them.
+	layers_selection_button->clear();
+	layers_selection_button->hide();
+	select_all_layers->show();
+	select_next_layer->set_disabled(false);
+	select_previous_layer->set_disabled(false);
+	advanced_menu_button->get_popup()->set_item_disabled(ADVANCED_MENU_EXTRACT_TILE_MAP_LAYERS, true);
+	if (edited_layer) {
+		TileMap *tile_map = Object::cast_to<TileMap>(edited_layer->get_parent());
+		if (tile_map && edited_layer->get_index_in_tile_map() >= 0) {
+			// Build the list of layers, selecting the entry that matches the edited layer.
+			for (int i = 0; i < tile_map->get_layers_count(); i++) {
+				const TileMapLayer *layer = Object::cast_to<TileMapLayer>(tile_map->get_child(i));
+				if (layer) {
+					int index = layers_selection_button->get_item_count();
+					layers_selection_button->add_item(layer->get_name());
+					layers_selection_button->set_item_metadata(index, layer->get_name());
+					if (edited_layer == layer) {
+						layers_selection_button->select(index);
+					}
+				}
+			}
+
+			// Disable the selector if there's no layer to select.
+			layers_selection_button->set_disabled(false);
+			if (layers_selection_button->get_item_count() == 0) {
+				layers_selection_button->set_disabled(true);
+				layers_selection_button->set_text(TTR("No Layers"));
+			}
+
+			// Disable next/previous if there is at most one layer.
+			if (layers_selection_button->get_item_count() <= 1) {
+				select_next_layer->set_disabled(true);
+				select_previous_layer->set_disabled(true);
+			}
+			layers_selection_button->show();
+			select_all_layers->hide();
+
+			// Enable the "extract as TileMapLayer" option only if we are editing a TileMap.
+			advanced_menu_button->get_popup()->set_item_disabled(ADVANCED_MENU_EXTRACT_TILE_MAP_LAYERS, false);
+		}
+	} else {
+		select_all_layers->hide();
+		select_next_layer->set_disabled(true);
+		select_previous_layer->set_disabled(true);
+	}
+}
+
+void TileMapLayerEditor::_clear_all_layers_highlighting() {
+	// Note: This function might be removed if we remove the TileMap node at some point.
+	// All processing could be done in _update_all_layers_highlighting otherwise.
+	TileMapLayer *edited_layer = _get_edited_layer();
+
+	// Reset the layers related to the edited one to HIGHLIGHT_MODE_DEFAULT.
+	if (edited_layer && edited_layer->get_index_in_tile_map() >= 0) {
+		// The edited layer is part of a TileMap: reset the layers of that TileMap.
+		TileMap *tile_map = Object::cast_to<TileMap>(edited_layer->get_parent());
+		if (tile_map) {
+			for (int i = 0; i < tile_map->get_layers_count(); i++) {
+				TileMapLayer *layer = Object::cast_to<TileMapLayer>(tile_map->get_child(i)); // NOTE(review): dereferenced unchecked below; confirm the first get_layers_count() children are always TileMapLayers.
+				layer->set_highlight_mode(TileMapLayer::HIGHLIGHT_MODE_DEFAULT);
+			}
+		}
+	} else {
+		// Layer outside a TileMap, or nothing edited: reset every TileMapLayer cached for the edited scene.
+		_update_tile_map_layers_in_scene_list_cache();
+		for (TileMapLayer *layer : tile_map_layers_in_scene_cache) {
+			layer->set_highlight_mode(TileMapLayer::HIGHLIGHT_MODE_DEFAULT);
+		}
+	}
+}
+
+void TileMapLayerEditor::_update_all_layers_highlighting() {
+	// Applies the "highlight selected layer" editor setting: marks the other layers as above/below the edited one, or resets them when it is off.
+	EditorNode *en = EditorNode::get_singleton();
+	Node *edited_scene_root = en->get_edited_scene();
+	if (!edited_scene_root) {
+		return;
+	}
+
+	// Get selected layer.
+	TileMapLayer *edited_layer = _get_edited_layer();
+
+	bool highlight_selected_layer = EDITOR_GET("editors/tiles_editor/highlight_selected_layer");
+	if (edited_layer && highlight_selected_layer) {
+		int edited_z_index = edited_layer->get_z_index();
+
+		if (edited_layer->get_index_in_tile_map() >= 0) {
+			// For the TileMap node.
+			TileMap *tile_map = Object::cast_to<TileMap>(edited_layer->get_parent());
+			ERR_FAIL_NULL(tile_map);
+
+			bool passed = false;
+			for (int i = 0; i < tile_map->get_layers_count(); i++) {
+				TileMapLayer *layer = Object::cast_to<TileMapLayer>(tile_map->get_child(i));
+				if (layer == edited_layer) {
+					passed = true;
+					layer->set_highlight_mode(TileMapLayer::HIGHLIGHT_MODE_DEFAULT);
+				} else if (passed || layer->get_z_index() > edited_z_index) {
+					layer->set_highlight_mode(TileMapLayer::HIGHLIGHT_MODE_ABOVE);
+				} else {
+					layer->set_highlight_mode(TileMapLayer::HIGHLIGHT_MODE_BELOW);
+				}
+			}
+		} else {
+			// Update highlight mode for independent layers.
+			_update_tile_map_layers_in_scene_list_cache();
+			bool passed = false;
+			for (TileMapLayer *layer : tile_map_layers_in_scene_cache) {
+				if (layer == edited_layer) {
+					passed = true;
+					layer->set_highlight_mode(TileMapLayer::HIGHLIGHT_MODE_DEFAULT);
+				} else if (passed || layer->get_z_index() > edited_z_index) {
+					layer->set_highlight_mode(TileMapLayer::HIGHLIGHT_MODE_ABOVE);
+				} else {
+					layer->set_highlight_mode(TileMapLayer::HIGHLIGHT_MODE_BELOW);
+				}
+			}
+		}
+	} else {
+		// Highlighting is off (or nothing is edited): reset modes set by an earlier call, e.g. right after the toggle is released.
+		_clear_all_layers_highlighting();
+	}
+}
+
+void TileMapLayerEditor::_highlight_selected_layer_button_toggled(bool p_pressed) {
+	TileMapLayer *edited_layer = _get_edited_layer();
+	if (!edited_layer) {
+		return;
+	}
 
-	tile_map_layer_group->set_highlight_selected_layer(p_pressed);
+	EditorSettings::get_singleton()->set("editors/tiles_editor/highlight_selected_layer", p_pressed);
+	_update_all_layers_highlighting();
 }
 
 void TileMapLayerEditor::_advanced_menu_button_id_pressed(int p_id) {
@@ -3686,12 +3892,12 @@ void TileMapLayerEditor::_advanced_menu_button_id_pressed(int p_id) {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
 
-	if (p_id == 0) { // Replace Tile Proxies
+	if (p_id == ADVANCED_MENU_REPLACE_WITH_PROXIES) { // Replace Tile Proxies
 		EditorUndoRedoManager *undo_redo = EditorUndoRedoManager::get_singleton();
 		undo_redo->create_action(TTR("Replace Tiles with Proxies"));
 		TypedArray<Vector2i> used_cells = edited_layer->get_used_cells();
@@ -3710,31 +3916,74 @@ void TileMapLayerEditor::_advanced_menu_button_id_pressed(int p_id) {
 		}
 
 		undo_redo->commit_action();
+	} else if (p_id == ADVANCED_MENU_EXTRACT_TILE_MAP_LAYERS) { // Transform internal TileMap layers into TileMapLayers.
+		ERR_FAIL_COND(edited_layer->get_index_in_tile_map() < 0);
+
+		EditorNode *en = EditorNode::get_singleton();
+		Node *edited_scene_root = en->get_edited_scene();
+		ERR_FAIL_NULL(edited_scene_root);
+
+		EditorUndoRedoManager *undo_redo = EditorUndoRedoManager::get_singleton();
+		undo_redo->create_action(TTR("Extract TileMap layers as individual TileMapLayer nodes"));
+
+		TileMap *tile_map = Object::cast_to<TileMap>(edited_layer->get_parent());
+		for (int i = 0; i < tile_map->get_layers_count(); i++) {
+			undo_redo->add_do_method(tile_map, "remove_layer", 0);
+		}
+
+		for (int i = 0; i < tile_map->get_layers_count(); i++) {
+			TileMapLayer *new_layer = tile_map->duplicate_layer_from_internal(i);
+			undo_redo->add_do_method(tile_map, "add_child", new_layer);
+			undo_redo->add_do_method(new_layer, "set_owner", edited_scene_root);
+			undo_redo->add_do_property(new_layer, "tile_set", tile_map->get_tileset()); // Workaround for a bug: #89947.
+			undo_redo->add_undo_method(tile_map, "remove_child", new_layer);
+			undo_redo->add_do_reference(new_layer);
+		}
+
+		List<PropertyInfo> prop_list;
+		tile_map->get_property_list(&prop_list);
+		for (PropertyInfo &prop : prop_list) {
+			undo_redo->add_undo_property(tile_map, prop.name, tile_map->get(prop.name));
+		}
+		undo_redo->commit_action();
 	}
 }
 
 void TileMapLayerEditor::_update_bottom_panel() {
 	const TileMapLayer *edited_layer = _get_edited_layer();
-	if (!edited_layer) {
-		return;
+	Ref<TileSet> tile_set;
+	if (edited_layer) {
+		tile_set = edited_layer->get_tile_set();
+	}
+
+	// Update state labels.
+	if (is_multi_node_edit) {
+		cant_edit_label->set_text(TTR("Can't edit multiple layers at once."));
+		cant_edit_label->show();
+	} else if (!edited_layer) {
+		cant_edit_label->set_text(TTR("The selected TileMap has no layer to edit."));
+		cant_edit_label->show();
+	} else if (!edited_layer->is_enabled() || !edited_layer->is_visible_in_tree()) {
+		cant_edit_label->set_text(TTR("The edited layer is disabled or invisible"));
+		cant_edit_label->show();
+	} else if (tile_set.is_null()) {
+		cant_edit_label->set_text(TTR("The edited TileMap or TileMapLayer node has no TileSet resource.\nCreate or load a TileSet resource in the Tile Set property in the inspector."));
+		cant_edit_label->show();
+	} else {
+		cant_edit_label->hide();
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
-
-	// Update the visibility of controls.
-	missing_tileset_label->set_visible(tile_set.is_null());
+	// Update tabs visibility.
 	for (TileMapLayerSubEditorPlugin::TabData &tab_data : tabs_data) {
 		tab_data.panel->hide();
 	}
-	if (tile_set.is_valid()) {
-		tabs_data[tabs_bar->get_current_tab()].panel->show();
-	}
+	tabs_data[tabs_bar->get_current_tab()].panel->set_visible(!cant_edit_label->is_visible());
 }
 
 Vector<Vector2i> TileMapLayerEditor::get_line(const TileMapLayer *p_tile_map_layer, Vector2i p_from_cell, Vector2i p_to_cell) {
 	ERR_FAIL_NULL_V(p_tile_map_layer, Vector<Vector2i>());
 
-	Ref<TileSet> tile_set = p_tile_map_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = p_tile_map_layer->get_tile_set();
 	ERR_FAIL_COND_V(tile_set.is_null(), Vector<Vector2i>());
 
 	if (tile_set->get_tile_shape() == TileSet::TILE_SHAPE_SQUARE) {
@@ -3805,7 +4054,7 @@ Vector<Vector2i> TileMapLayerEditor::get_line(const TileMapLayer *p_tile_map_lay
 }
 
 void TileMapLayerEditor::_tile_map_layer_changed() {
-	tileset_changed_needs_update = true;
+	tile_map_layer_changed_needs_update = true;
 }
 
 void TileMapLayerEditor::_tab_changed(int p_tab_id) {
@@ -3825,7 +4074,7 @@ void TileMapLayerEditor::_tab_changed(int p_tab_id) {
 
 	TileMapLayer *tile_map_layer = _get_edited_layer();
 	if (tile_map_layer) {
-		if (tile_map_layer->get_effective_tile_set().is_valid()) {
+		if (tile_map_layer->get_tile_set().is_valid()) {
 			tabs_data[tabs_bar->get_current_tab()].panel->show();
 		}
 	}
@@ -3841,36 +4090,38 @@ void TileMapLayerEditor::_layers_select_next_or_previous(bool p_next) {
 		return;
 	}
 
-	TileMapLayerGroup *tile_map_layer_group = Object::cast_to<TileMapLayerGroup>(edited_layer->get_parent());
-	if (!tile_map_layer_group) {
-		return;
-	}
+	EditorNode *en = EditorNode::get_singleton();
+	Node *edited_scene_root = en->get_edited_scene();
+	ERR_FAIL_NULL(edited_scene_root);
 
+	TileMapLayer *new_selected_layer = nullptr;
 	int inc = p_next ? 1 : -1;
-	int index = Math::posmod(edited_layer->get_index() + inc, tile_map_layer_group->get_child_count());
-	const TileMapLayer *new_selected_layer = Object::cast_to<TileMapLayer>(tile_map_layer_group->get_child(index));
-	while (new_selected_layer != edited_layer) {
-		if (new_selected_layer && new_selected_layer->is_enabled()) {
-			break;
+	if (edited_layer->get_index_in_tile_map() >= 0) {
+		// Part of a TileMap.
+		TileMap *tile_map = Object::cast_to<TileMap>(edited_layer->get_parent());
+		new_selected_layer = Object::cast_to<TileMapLayer>(tile_map->get_child(Math::posmod(edited_layer->get_index_in_tile_map() + inc, tile_map->get_layers_count())));
+	} else {
+		// Individual layer.
+		_update_tile_map_layers_in_scene_list_cache();
+		int edited_index = -1;
+		for (int i = 0; i < tile_map_layers_in_scene_cache.size(); i++) {
+			if (tile_map_layers_in_scene_cache[i] == edited_layer) {
+				edited_index = i;
+				break;
+			}
 		}
-		index = Math::posmod((index + inc), tile_map_layer_group->get_child_count());
-		new_selected_layer = Object::cast_to<TileMapLayer>(tile_map_layer_group->get_child(index));
+		new_selected_layer = tile_map_layers_in_scene_cache[Math::posmod(edited_index + inc, tile_map_layers_in_scene_cache.size())];
 	}
 
-	if (new_selected_layer != edited_layer) {
-		emit_signal("change_selected_layer_request", new_selected_layer->get_name());
-	}
-}
+	ERR_FAIL_NULL(new_selected_layer);
 
-void TileMapLayerEditor::_update_highlighting_toggle() {
-	const TileMapLayer *edited_layer = _get_edited_layer();
-	if (!edited_layer) {
-		return;
-	}
-
-	TileMapLayerGroup *tile_map_layer_group = Object::cast_to<TileMapLayerGroup>(edited_layer->get_parent());
-	if (tile_map_layer_group) {
-		toggle_highlight_selected_layer_button->set_pressed(tile_map_layer_group->is_highlighting_selected_layer());
+	if (edited_layer->get_index_in_tile_map() < 0) {
+		// Only if not part of a TileMap.
+		en->edit_node(new_selected_layer);
+		en->get_editor_selection()->clear();
+		en->get_editor_selection()->add_node(new_selected_layer);
+	} else {
+		edit(new_selected_layer);
 	}
 }
 
@@ -3966,7 +4217,7 @@ void TileMapLayerEditor::forward_canvas_draw_over_viewport(Control *p_overlay) {
 		return;
 	}
 
-	Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = edited_layer->get_tile_set();
 	if (tile_set.is_null()) {
 		return;
 	}
@@ -3994,29 +4245,25 @@ void TileMapLayerEditor::forward_canvas_draw_over_viewport(Control *p_overlay) {
 			}
 
 			if (!source || !source->has_tile(tile_atlas_coords) || !source->has_alternative_tile(tile_atlas_coords, tile_alternative_tile)) {
-				// Generate a random color from the hashed values of the tiles.
-				Array a = tile_set->map_tile_proxy(tile_source_id, tile_atlas_coords, tile_alternative_tile);
-				if (int(a[0]) == tile_source_id && Vector2i(a[1]) == tile_atlas_coords && int(a[2]) == tile_alternative_tile) {
-					// Only display the pattern if we have no proxy tile.
-					Array to_hash;
-					to_hash.push_back(tile_source_id);
-					to_hash.push_back(tile_atlas_coords);
-					to_hash.push_back(tile_alternative_tile);
-					uint32_t hash = RandomPCG(to_hash.hash()).rand();
-
-					Color color;
-					color = color.from_hsv(
-							(float)((hash >> 24) & 0xFF) / 256.0,
-							Math::lerp(0.5, 1.0, (float)((hash >> 16) & 0xFF) / 256.0),
-							Math::lerp(0.5, 1.0, (float)((hash >> 8) & 0xFF) / 256.0),
-							0.8);
-
-					// Draw the scaled tile.
-					Transform2D tile_xform;
-					tile_xform.set_origin(tile_set->map_to_local(coords));
-					tile_xform.set_scale(tile_shape_size);
-					tile_set->draw_tile_shape(p_overlay, xform * tile_xform, color, true, warning_pattern_texture);
-				}
+				// Generate a random color from the hashed identifier of the tiles.
+				Array to_hash;
+				to_hash.push_back(tile_source_id);
+				to_hash.push_back(tile_atlas_coords);
+				to_hash.push_back(tile_alternative_tile);
+				uint32_t hash = RandomPCG(to_hash.hash()).rand();
+
+				Color color;
+				color = color.from_hsv(
+						(float)((hash >> 24) & 0xFF) / 256.0,
+						Math::lerp(0.5, 1.0, (float)((hash >> 16) & 0xFF) / 256.0),
+						Math::lerp(0.5, 1.0, (float)((hash >> 8) & 0xFF) / 256.0),
+						0.8);
+
+				// Display the warning pattern.
+				Transform2D tile_xform;
+				tile_xform.set_origin(tile_set->map_to_local(coords));
+				tile_xform.set_scale(tile_shape_size);
+				tile_set->draw_tile_shape(p_overlay, xform * tile_xform, color, true, warning_pattern_texture);
 
 				// Draw the warning icon.
 				Vector2::Axis min_axis = missing_tile_texture->get_size().min_axis_index();
@@ -4092,75 +4339,50 @@ void TileMapLayerEditor::forward_canvas_draw_over_viewport(Control *p_overlay) {
 	tabs_plugins[tabs_bar->get_current_tab()]->forward_canvas_draw_over_viewport(p_overlay);
 }
 
-void TileMapLayerEditor::edit(TileMapLayer *p_tile_map_layer) {
-	if (p_tile_map_layer && p_tile_map_layer->get_instance_id() == edited_tile_map_layer_id) {
+void TileMapLayerEditor::edit(Object *p_edited) {
+	if (p_edited && p_edited->get_instance_id() == edited_tile_map_layer_id) {
 		return;
 	}
 
+	_clear_all_layers_highlighting();
+
 	// Disconnect to changes.
 	TileMapLayer *tile_map_layer = _get_edited_layer();
 	if (tile_map_layer) {
 		tile_map_layer->disconnect("changed", callable_mp(this, &TileMapLayerEditor::_tile_map_layer_changed));
+		tile_map_layer->disconnect("visibility_changed", callable_mp(this, &TileMapLayerEditor::_tile_map_layer_changed));
 	}
 
 	// Update the edited layer.
-	if (p_tile_map_layer) {
+	TileMapLayer *layer_to_edit = Object::cast_to<TileMapLayer>(p_edited);
+	if (layer_to_edit) {
 		// Change the edited object.
-		edited_tile_map_layer_id = p_tile_map_layer->get_instance_id();
+		edited_tile_map_layer_id = layer_to_edit->get_instance_id();
 
 		tile_map_layer = _get_edited_layer();
 		// Connect to changes.
 		if (!tile_map_layer->is_connected("changed", callable_mp(this, &TileMapLayerEditor::_tile_map_layer_changed))) {
 			tile_map_layer->connect("changed", callable_mp(this, &TileMapLayerEditor::_tile_map_layer_changed));
+			tile_map_layer->connect("visibility_changed", callable_mp(this, &TileMapLayerEditor::_tile_map_layer_changed));
 		}
 	} else {
 		edited_tile_map_layer_id = ObjectID();
 	}
 
-	update_layers_selector();
-	_update_highlighting_toggle();
+	// Remember whether a MultiNodeEdit was passed instead of a single layer.
+	is_multi_node_edit = Object::cast_to<MultiNodeEdit>(p_edited) != nullptr;
 
-	// Call the plugins.
+	// Forward the new target to the current tab's plugin, then refresh the selector and highlighting.
 	tabs_plugins[tabs_bar->get_current_tab()]->edit(edited_tile_map_layer_id);
+	_update_layers_selector();
+	_update_all_layers_highlighting();
 
 	_tile_map_layer_changed();
 }
 
-void TileMapLayerEditor::update_layers_selector() {
-	const TileMapLayer *edited_layer = _get_edited_layer();
-	if (!edited_layer) {
-		return;
-	}
-
-	TileMapLayerGroup *tile_map_layer_group = Object::cast_to<TileMapLayerGroup>(edited_layer->get_parent());
-	if (tile_map_layer_group) {
-		// Update the selector
-		layers_selection_button->show();
-		layers_selection_button->clear();
-
-		// Build the list of layers.
-		for (int i = 0; i < tile_map_layer_group->get_child_count(); i++) {
-			const TileMapLayer *layer = Object::cast_to<TileMapLayer>(tile_map_layer_group->get_child(i));
-			if (layer) {
-				int index = layers_selection_button->get_item_count();
-				layers_selection_button->add_item(layer->get_name());
-				layers_selection_button->set_item_disabled(index, !layer->is_enabled());
-				layers_selection_button->set_item_metadata(index, layer->get_name());
-				if (edited_layer == layer) {
-					layers_selection_button->select(index);
-				}
-			}
-		}
-
-		// Disable the button if there's no layer to select.
-		layers_selection_button->set_disabled(false);
-		if (layers_selection_button->get_item_count() == 0) {
-			layers_selection_button->set_disabled(true);
-			layers_selection_button->set_text(TTR("No Layers"));
-		}
-	} else {
-		layers_selection_button->hide();
-	}
+void TileMapLayerEditor::set_show_layer_selector(bool p_show_layer_selector) {
+	show_layers_selector = p_show_layer_selector; // NOTE(review): presumably read by _update_layers_selector() below - confirm.
+	_update_layers_selector();
 }
 
 TileMapLayerEditor::TileMapLayerEditor() {
@@ -4210,13 +4432,36 @@ TileMapLayerEditor::TileMapLayerEditor() {
 	tile_map_toolbar->add_child(c);
 
 	// Layer selector.
+	layer_selection_hbox = memnew(HBoxContainer);
+	tile_map_toolbar->add_child(layer_selection_hbox);
+
 	layers_selection_button = memnew(OptionButton);
 	layers_selection_button->set_custom_minimum_size(Size2(200, 0));
 	layers_selection_button->set_text_overrun_behavior(TextServer::OVERRUN_TRIM_ELLIPSIS);
 	layers_selection_button->set_tooltip_text(TTR("TileMap Layers"));
 	layers_selection_button->connect("item_selected", callable_mp(this, &TileMapLayerEditor::_layers_selection_item_selected));
-	tile_map_toolbar->add_child(layers_selection_button);
-
+	layer_selection_hbox->add_child(layers_selection_button);
+
+	select_previous_layer = memnew(Button);
+	select_previous_layer->set_theme_type_variation("FlatButton");
+	select_previous_layer->set_tooltip_text(TTR("Select previous layer"));
+	select_previous_layer->connect("pressed", callable_mp(this, &TileMapLayerEditor::_select_previous_layer_pressed));
+	layer_selection_hbox->add_child(select_previous_layer);
+
+	select_next_layer = memnew(Button);
+	select_next_layer->set_theme_type_variation("FlatButton");
+	select_next_layer->set_tooltip_text(TTR("Select next layer"));
+	select_next_layer->connect("pressed", callable_mp(this, &TileMapLayerEditor::_select_next_layer_pressed));
+	layer_selection_hbox->add_child(select_next_layer);
+
+	select_all_layers = memnew(Button);
+	select_all_layers->set_theme_type_variation("FlatButton");
+	select_all_layers->set_text(TTR("Select all layers"));
+	select_all_layers->connect("pressed", callable_mp(this, &TileMapLayerEditor::_select_all_layers_pressed));
+	select_all_layers->set_tooltip_text(TTR("Select all TileMapLayers in scene"));
+	layer_selection_hbox->add_child(select_all_layers);
+
+	// Highlighting selected layer.
 	toggle_highlight_selected_layer_button = memnew(Button);
 	toggle_highlight_selected_layer_button->set_theme_type_variation("FlatButton");
 	toggle_highlight_selected_layer_button->set_toggle_mode(true);
@@ -4239,18 +4484,19 @@ TileMapLayerEditor::TileMapLayerEditor() {
 	advanced_menu_button = memnew(MenuButton);
 	advanced_menu_button->set_flat(false);
 	advanced_menu_button->set_theme_type_variation("FlatButton");
-	advanced_menu_button->get_popup()->add_item(TTR("Automatically Replace Tiles with Proxies"));
+	advanced_menu_button->get_popup()->add_item(TTR("Automatically Replace Tiles with Proxies"), ADVANCED_MENU_REPLACE_WITH_PROXIES);
+	advanced_menu_button->get_popup()->add_item(TTR("Extract TileMap layers as individual TileMapLayer nodes"), ADVANCED_MENU_EXTRACT_TILE_MAP_LAYERS);
 	advanced_menu_button->get_popup()->connect("id_pressed", callable_mp(this, &TileMapLayerEditor::_advanced_menu_button_id_pressed));
 	tile_map_toolbar->add_child(advanced_menu_button);
 
-	missing_tileset_label = memnew(Label);
-	missing_tileset_label->set_text(TTR("The edited TileMap node has no TileSet resource.\nCreate or load a TileSet resource in the Tile Set property in the inspector."));
-	missing_tileset_label->set_h_size_flags(SIZE_EXPAND_FILL);
-	missing_tileset_label->set_v_size_flags(SIZE_EXPAND_FILL);
-	missing_tileset_label->set_horizontal_alignment(HORIZONTAL_ALIGNMENT_CENTER);
-	missing_tileset_label->set_vertical_alignment(VERTICAL_ALIGNMENT_CENTER);
-	missing_tileset_label->hide();
-	add_child(missing_tileset_label);
+	// A label for editing errors.
+	cant_edit_label = memnew(Label);
+	cant_edit_label->set_h_size_flags(SIZE_EXPAND_FILL);
+	cant_edit_label->set_v_size_flags(SIZE_EXPAND_FILL);
+	cant_edit_label->set_horizontal_alignment(HORIZONTAL_ALIGNMENT_CENTER);
+	cant_edit_label->set_vertical_alignment(VERTICAL_ALIGNMENT_CENTER);
+	cant_edit_label->hide();
+	add_child(cant_edit_label);
 
 	for (unsigned int tab_index = 0; tab_index < tabs_data.size(); tab_index++) {
 		add_child(tabs_data[tab_index].panel);
diff --git a/editor/plugins/tiles/tile_map_layer_editor.h b/editor/plugins/tiles/tile_map_layer_editor.h
index a7fea2abcf..2603261449 100644
--- a/editor/plugins/tiles/tile_map_layer_editor.h
+++ b/editor/plugins/tiles/tile_map_layer_editor.h
@@ -339,10 +339,16 @@ class TileMapLayerEditor : public VBoxContainer {
 	GDCLASS(TileMapLayerEditor, VBoxContainer);
 
 private:
-	bool tileset_changed_needs_update = false;
+	bool tile_map_layer_changed_needs_update = false;
 
 	ObjectID edited_tile_map_layer_id;
+	bool is_multi_node_edit = false;
+	Vector<TileMapLayer *> tile_map_layers_in_scene_cache;
+	bool layers_in_scene_list_cache_needs_update = false;
 	TileMapLayer *_get_edited_layer() const;
+	void _find_tile_map_layers_in_scene(Node *p_current, const Node *p_owner, Vector<TileMapLayer *> &r_list) const;
+	void _update_tile_map_layers_in_scene_list_cache();
+	void _node_change(Node *p_node);
 
 	// Vector to keep plugins.
 	Vector<TileMapLayerSubEditorPlugin *> tile_map_editor_plugins;
@@ -350,20 +356,36 @@ private:
 	// Toolbar.
 	HFlowContainer *tile_map_toolbar = nullptr;
 
+	bool show_layers_selector = false;
+
+	HBoxContainer *layer_selection_hbox = nullptr;
+	Button *select_previous_layer = nullptr;
+	void _select_previous_layer_pressed();
+	Button *select_next_layer = nullptr;
+	void _select_next_layer_pressed();
+	Button *select_all_layers = nullptr;
+	void _select_all_layers_pressed();
 	OptionButton *layers_selection_button = nullptr;
 	void _layers_selection_item_selected(int p_index);
+	void _update_layers_selector();
 
 	Button *toggle_highlight_selected_layer_button = nullptr;
+	void _clear_all_layers_highlighting();
+	void _update_all_layers_highlighting();
 	void _highlight_selected_layer_button_toggled(bool p_pressed);
 
 	Button *toggle_grid_button = nullptr;
 	void _on_grid_toggled(bool p_pressed);
 
+	enum {
+		ADVANCED_MENU_REPLACE_WITH_PROXIES,
+		ADVANCED_MENU_EXTRACT_TILE_MAP_LAYERS,
+	};
 	MenuButton *advanced_menu_button = nullptr;
 	void _advanced_menu_button_id_pressed(int p_id);
 
 	// Bottom panel.
-	Label *missing_tileset_label = nullptr;
+	Label *cant_edit_label = nullptr;
 	TabBar *tabs_bar = nullptr;
 	LocalVector<TileMapLayerSubEditorPlugin::TabData> tabs_data;
 	LocalVector<TileMapLayerSubEditorPlugin *> tabs_plugins;
@@ -379,22 +401,20 @@ private:
 
 	// Updates.
 	void _layers_select_next_or_previous(bool p_next);
-	void _update_highlighting_toggle();
 
 	// Inspector undo/redo callback.
 	void _move_tile_map_array_element(Object *p_undo_redo, Object *p_edited, const String &p_array_prefix, int p_from_index, int p_to_pos);
 
 protected:
 	void _notification(int p_what);
-	static void _bind_methods();
 	void _draw_shape(Control *p_control, Rect2 p_region, TileSet::TileShape p_shape, TileSet::TileOffsetAxis p_offset_axis, Color p_color);
 
 public:
 	bool forward_canvas_gui_input(const Ref<InputEvent> &p_event);
 	void forward_canvas_draw_over_viewport(Control *p_overlay);
 
-	void edit(TileMapLayer *p_tile_map_layer);
-	void update_layers_selector();
+	void edit(Object *p_tile_map_layer);
+	void set_show_layer_selector(bool p_show_layer_selector);
 
 	TileMapLayerEditor();
 	~TileMapLayerEditor();
diff --git a/editor/plugins/tiles/tiles_editor_plugin.cpp b/editor/plugins/tiles/tiles_editor_plugin.cpp
index ed21a29487..c3141beb1a 100644
--- a/editor/plugins/tiles/tiles_editor_plugin.cpp
+++ b/editor/plugins/tiles/tiles_editor_plugin.cpp
@@ -40,6 +40,7 @@
 #include "editor/editor_settings.h"
 #include "editor/editor_string_names.h"
 #include "editor/gui/editor_bottom_panel.h"
+#include "editor/multi_node_edit.h"
 #include "editor/plugins/canvas_item_editor_plugin.h"
 #include "editor/themes/editor_scale.h"
 #include "scene/2d/tile_map.h"
@@ -336,7 +337,7 @@ void TileMapEditorPlugin::_tile_map_layer_removed() {
 void TileMapEditorPlugin::_update_tile_map() {
 	TileMapLayer *edited_layer = Object::cast_to<TileMapLayer>(ObjectDB::get_instance(tile_map_layer_id));
 	if (edited_layer) {
-		Ref<TileSet> tile_set = edited_layer->get_effective_tile_set();
+		Ref<TileSet> tile_set = edited_layer->get_tile_set();
 		if (tile_set.is_valid() && tile_set_id != tile_set->get_instance_id()) {
 			tile_set_plugin_singleton->edit(tile_set.ptr());
 			tile_set_plugin_singleton->make_visible(true);
@@ -355,38 +356,25 @@ void TileMapEditorPlugin::_select_layer(const StringName &p_name) {
 	ERR_FAIL_NULL(edited_layer);
 
 	Node *parent = edited_layer->get_parent();
-	ERR_FAIL_NULL(parent);
-
-	TileMapLayer *new_layer = Object::cast_to<TileMapLayer>(parent->get_node_or_null(String(p_name)));
-	edit(new_layer);
+	if (parent) {
+		TileMapLayer *new_layer = Object::cast_to<TileMapLayer>(parent->get_node_or_null(String(p_name)));
+		edit(new_layer);
+	}
 }
 
-void TileMapEditorPlugin::_edit_tile_map_layer(TileMapLayer *p_tile_map_layer) {
+void TileMapEditorPlugin::_edit_tile_map_layer(TileMapLayer *p_tile_map_layer, bool p_show_layer_selector) {
 	ERR_FAIL_NULL(p_tile_map_layer);
 
 	editor->edit(p_tile_map_layer);
-
-	// Update the selected layers in the TileMapLayerGroup parent node.
-	TileMapLayerGroup *tile_map_layer_group = Object::cast_to<TileMapLayerGroup>(p_tile_map_layer->get_parent());
-	if (tile_map_layer_group) {
-		Vector<StringName> selected;
-		selected.push_back(p_tile_map_layer->get_name());
-		tile_map_layer_group->set_selected_layers(selected);
-	}
+	editor->set_show_layer_selector(p_show_layer_selector);
 
 	// Update the object IDs.
 	tile_map_layer_id = p_tile_map_layer->get_instance_id();
 	p_tile_map_layer->connect("changed", callable_mp(this, &TileMapEditorPlugin::_tile_map_layer_changed));
 	p_tile_map_layer->connect("tree_exited", callable_mp(this, &TileMapEditorPlugin::_tile_map_layer_removed));
-	if (tile_map_layer_group) {
-		tile_map_group_id = tile_map_layer_group->get_instance_id();
-		tile_map_layer_group->connect("child_entered_tree", callable_mp(editor, &TileMapLayerEditor::update_layers_selector).unbind(1));
-		tile_map_layer_group->connect("child_exiting_tree", callable_mp(editor, &TileMapLayerEditor::update_layers_selector).unbind(1));
-		tile_map_layer_group->connect("child_order_changed", callable_mp(editor, &TileMapLayerEditor::update_layers_selector));
-	}
 
 	// Update the edited tileset.
-	Ref<TileSet> tile_set = p_tile_map_layer->get_effective_tile_set();
+	Ref<TileSet> tile_set = p_tile_map_layer->get_tile_set();
 	if (tile_set.is_valid()) {
 		tile_set_plugin_singleton->edit(tile_set.ptr());
 		tile_set_plugin_singleton->make_visible(true);
@@ -397,30 +385,15 @@ void TileMapEditorPlugin::_edit_tile_map_layer(TileMapLayer *p_tile_map_layer) {
 	}
 }
 
-void TileMapEditorPlugin::_edit_tile_map_layer_group(TileMapLayerGroup *p_tile_map_layer_group) {
-	ERR_FAIL_NULL(p_tile_map_layer_group);
-
-	Vector<StringName> selected_layers = p_tile_map_layer_group->get_selected_layers();
-
-	TileMapLayer *selected_layer = nullptr;
-	if (selected_layers.size() > 0) {
-		// Edit the selected layer.
-		selected_layer = Object::cast_to<TileMapLayer>(p_tile_map_layer_group->get_node_or_null(String(selected_layers[0])));
-	}
-	if (!selected_layer) {
-		// Edit the first layer found.
-		for (int i = 0; i < p_tile_map_layer_group->get_child_count(); i++) {
-			selected_layer = Object::cast_to<TileMapLayer>(p_tile_map_layer_group->get_child(i));
-			if (selected_layer) {
-				break;
-			}
-		}
-	}
+void TileMapEditorPlugin::_edit_tile_map(TileMap *p_tile_map) {
+	ERR_FAIL_NULL(p_tile_map);
 
-	if (selected_layer) {
-		_edit_tile_map_layer(selected_layer);
+	if (p_tile_map->get_layers_count() > 0) {
+		TileMapLayer *selected_layer = Object::cast_to<TileMapLayer>(p_tile_map->get_child(0)); // NOTE(review): assumes child 0 is the first layer - confirm; a failed cast makes the call below bail out on ERR_FAIL_NULL.
+		_edit_tile_map_layer(selected_layer, true);
 	} else {
 		editor->edit(nullptr);
+		editor->set_show_layer_selector(false);
 	}
 }
 
@@ -437,36 +410,38 @@ void TileMapEditorPlugin::edit(Object *p_object) {
 		edited_layer->disconnect("tree_exited", callable_mp(this, &TileMapEditorPlugin::_tile_map_layer_removed));
 	}
 
-	TileMapLayerGroup *tile_map_group = Object::cast_to<TileMapLayerGroup>(ObjectDB::get_instance(tile_map_group_id));
-	if (tile_map_group) {
-		tile_map_group->disconnect("child_entered_tree", callable_mp(editor, &TileMapLayerEditor::update_layers_selector).unbind(1));
-		tile_map_group->disconnect("child_exiting_tree", callable_mp(editor, &TileMapLayerEditor::update_layers_selector).unbind(1));
-		tile_map_group->disconnect("child_order_changed", callable_mp(editor, &TileMapLayerEditor::update_layers_selector));
-	}
-
 	tile_map_group_id = ObjectID();
 	tile_map_layer_id = ObjectID();
 	tile_set_id = ObjectID();
 
-	TileMapLayerGroup *tile_map_layer_group = Object::cast_to<TileMap>(p_object);
+	TileMap *tile_map = Object::cast_to<TileMap>(p_object);
 	TileMapLayer *tile_map_layer = Object::cast_to<TileMapLayer>(p_object);
-	if (tile_map_layer_group) {
-		_edit_tile_map_layer_group(tile_map_layer_group);
+	MultiNodeEdit *multi_node_edit = Object::cast_to<MultiNodeEdit>(p_object);
+	if (tile_map) {
+		_edit_tile_map(tile_map);
 	} else if (tile_map_layer) {
-		_edit_tile_map_layer(tile_map_layer);
+		_edit_tile_map_layer(tile_map_layer, false);
+	} else if (multi_node_edit) {
+		editor->edit(multi_node_edit);
 	} else {
-		// Deselect the layer in the group.
-		if (edited_layer) {
-			tile_map_layer_group = Object::cast_to<TileMapLayerGroup>(edited_layer->get_parent());
-			if (tile_map_layer_group) {
-				tile_map_layer_group->set_selected_layers(Vector<StringName>());
-			}
-		}
+		editor->edit(nullptr);
 	}
 }
 
 bool TileMapEditorPlugin::handles(Object *p_object) const {
-	return Object::cast_to<TileMapLayer>(p_object) != nullptr || Object::cast_to<TileMapLayerGroup>(p_object) != nullptr;
+	// A multi-selection is handled only when every selected node is a TileMapLayer.
+	MultiNodeEdit *multi_node_edit = Object::cast_to<MultiNodeEdit>(p_object);
+	Node *edited_scene = EditorNode::get_singleton()->get_edited_scene();
+	if (multi_node_edit && edited_scene) {
+		for (int i = 0; i < multi_node_edit->get_node_count(); i++) {
+			// get_node_or_null(): a path that no longer resolves just means "not handled", without logging an error.
+			if (!Object::cast_to<TileMapLayer>(edited_scene->get_node_or_null(multi_node_edit->get_node(i)))) {
+				return false;
+			}
+		}
+		return true;
+	}
+	return Object::cast_to<TileMapLayer>(p_object) != nullptr || Object::cast_to<TileMap>(p_object) != nullptr;
 }
 
 void TileMapEditorPlugin::make_visible(bool p_visible) {
@@ -509,7 +484,6 @@ TileMapEditorPlugin::TileMapEditorPlugin() {
 	editor->set_h_size_flags(Control::SIZE_EXPAND_FILL);
 	editor->set_v_size_flags(Control::SIZE_EXPAND_FILL);
 	editor->set_custom_minimum_size(Size2(0, 200) * EDSCALE);
-	editor->connect("change_selected_layer_request", callable_mp(this, &TileMapEditorPlugin::_select_layer));
 	editor->hide();
 
 	button = EditorNode::get_bottom_panel()->add_item(TTR("TileMap"), editor, ED_SHORTCUT_AND_COMMAND("bottom_panels/toggle_tile_map_bottom_panel", TTR("Toggle TileMap Bottom Panel")));
diff --git a/editor/plugins/tiles/tiles_editor_plugin.h b/editor/plugins/tiles/tiles_editor_plugin.h
index ac3e8986d6..23a6a52a5c 100644
--- a/editor/plugins/tiles/tiles_editor_plugin.h
+++ b/editor/plugins/tiles/tiles_editor_plugin.h
@@ -126,8 +126,8 @@ class TileMapEditorPlugin : public EditorPlugin {
 	void _update_tile_map();
 	void _select_layer(const StringName &p_name);
 
-	void _edit_tile_map_layer(TileMapLayer *p_tile_map_layer);
-	void _edit_tile_map_layer_group(TileMapLayerGroup *p_tile_map_layer_group);
+	void _edit_tile_map_layer(TileMapLayer *p_tile_map_layer, bool p_show_layer_selector);
+	void _edit_tile_map(TileMap *p_tile_map);
 
 protected:
 	void _notification(int p_notification);
diff --git a/editor/plugins/version_control_editor_plugin.cpp b/editor/plugins/version_control_editor_plugin.cpp
index 1a602568fe..9173838471 100644
--- a/editor/plugins/version_control_editor_plugin.cpp
+++ b/editor/plugins/version_control_editor_plugin.cpp
@@ -911,7 +911,7 @@ void VersionControlEditorPlugin::fetch_available_vcs_plugin_names() {
 }
 
 void VersionControlEditorPlugin::register_editor() {
-	EditorDockManager::get_singleton()->add_control_to_dock(EditorDockManager::DOCK_SLOT_RIGHT_UL, version_commit_dock);
+	EditorDockManager::get_singleton()->add_dock(version_commit_dock, "", EditorDockManager::DOCK_SLOT_RIGHT_UL);
 
 	version_control_dock_button = EditorNode::get_bottom_panel()->add_item(TTR("Version Control"), version_control_dock, ED_SHORTCUT_AND_COMMAND("bottom_panels/toggle_version_control_bottom_panel", TTR("Toggle Version Control Bottom Panel")));
 
@@ -931,7 +931,7 @@ void VersionControlEditorPlugin::shut_down() {
 	memdelete(EditorVCSInterface::get_singleton());
 	EditorVCSInterface::set_singleton(nullptr);
 
-	EditorDockManager::get_singleton()->remove_control_from_dock(version_commit_dock);
+	EditorDockManager::get_singleton()->remove_dock(version_commit_dock);
 	EditorNode::get_bottom_panel()->remove_item(version_control_dock);
 
 	_set_vcs_ui_state(false);
diff --git a/editor/reparent_dialog.cpp b/editor/reparent_dialog.cpp
index ec5208b549..a139d180f0 100644
--- a/editor/reparent_dialog.cpp
+++ b/editor/reparent_dialog.cpp
@@ -60,7 +60,7 @@ void ReparentDialog::_reparent() {
 
 void ReparentDialog::set_current(const HashSet<Node *> &p_selection) {
 	tree->set_marked(p_selection, false, false);
-	//tree->set_selected(p_node->get_parent());
+	tree->set_selected(nullptr); // NOTE(review): presumably resets a selection left over from a previous use - confirm.
 }
 
 void ReparentDialog::_bind_methods() {
@@ -74,7 +74,6 @@ ReparentDialog::ReparentDialog() {
 
 	VBoxContainer *vbc = memnew(VBoxContainer);
 	add_child(vbc);
-	//set_child_rect(vbc);
 
 	tree = memnew(SceneTreeEditor(false));
 	tree->set_show_enabled_subscene(true);
@@ -86,10 +85,6 @@ ReparentDialog::ReparentDialog() {
 	keep_transform->set_pressed(true);
 	vbc->add_child(keep_transform);
 
-	//vbc->add_margin_child("Options:",node_only);
-
-	//cancel->connect("pressed", this,"_cancel");
-
 	set_ok_button_text(TTR("Reparent"));
 }
 
diff --git a/editor/scene_tree_dock.cpp b/editor/scene_tree_dock.cpp
index ded6fc2a35..26ddec6603 100644
--- a/editor/scene_tree_dock.cpp
+++ b/editor/scene_tree_dock.cpp
@@ -3156,7 +3156,7 @@ void SceneTreeDock::_files_dropped(const Vector<String> &p_files, NodePath p_to,
 			const EditorPropertyNameProcessor::Style style = InspectorDock::get_singleton()->get_property_name_style();
 			menu_properties->clear();
 			for (const String &p : valid_properties) {
-				menu_properties->add_item(EditorPropertyNameProcessor::get_singleton()->process_name(p, style));
+				menu_properties->add_item(EditorPropertyNameProcessor::get_singleton()->process_name(p, style, p, node->get_class_name()));
 				menu_properties->set_item_metadata(-1, p);
 			}
 
diff --git a/methods.py b/methods.py
index c0d129f93e..7af621befc 100644
--- a/methods.py
+++ b/methods.py
@@ -228,14 +228,22 @@ def get_version_info(module_version_string="", silent=False):
     return version_info
 
 
+def write_file_if_needed(path, string):
+    """Write `string` to `path` unless the file already holds exactly that text."""
+    try:
+        with open(path, "r", encoding="utf-8", newline="\n") as f:
+            unchanged = f.read() == string
+    except FileNotFoundError:
+        unchanged = False
+    if not unchanged:
+        with open(path, "w", encoding="utf-8", newline="\n") as f:
+            f.write(string)
+
+
 def generate_version_header(module_version_string=""):
     version_info = get_version_info(module_version_string)
 
-    # NOTE: It is safe to generate these files here, since this is still executed serially.
-
-    with open("core/version_generated.gen.h", "w", encoding="utf-8", newline="\n") as f:
-        f.write(
-            """\
+    version_info_header = """\
 /* THIS FILE IS GENERATED DO NOT EDIT */
 #ifndef VERSION_GENERATED_GEN_H
 #define VERSION_GENERATED_GEN_H
@@ -252,21 +260,20 @@ def generate_version_header(module_version_string=""):
 #define VERSION_DOCS_URL "https://docs.godotengine.org/en/" VERSION_DOCS_BRANCH
 #endif // VERSION_GENERATED_GEN_H
 """.format(
-                **version_info
-            )
-        )
+        **version_info
+    )
 
-    with open("core/version_hash.gen.cpp", "w", encoding="utf-8", newline="\n") as fhash:
-        fhash.write(
-            """\
+    version_hash_data = """\
 /* THIS FILE IS GENERATED DO NOT EDIT */
 #include "core/version.h"
 const char *const VERSION_HASH = "{git_hash}";
 const uint64_t VERSION_TIMESTAMP = {git_timestamp};
 """.format(
-                **version_info
-            )
-        )
+        **version_info
+    )
+
+    write_file_if_needed("core/version_generated.gen.h", version_info_header)
+    write_file_if_needed("core/version_hash.gen.cpp", version_hash_data)
 
 
 def parse_cg_file(fname, uniforms, sizes, conditionals):
@@ -385,15 +392,18 @@ def is_module(path):
 
 
 def write_disabled_classes(class_list):
-    with open("core/disabled_classes.gen.h", "w", encoding="utf-8", newline="\n") as f:
-        f.write("/* THIS FILE IS GENERATED DO NOT EDIT */\n")
-        f.write("#ifndef DISABLED_CLASSES_GEN_H\n")
-        f.write("#define DISABLED_CLASSES_GEN_H\n\n")
-        for c in class_list:
-            cs = c.strip()
-            if cs != "":
-                f.write("#define ClassDB_Disable_" + cs + " 1\n")
-        f.write("\n#endif\n")
+    """Generate core/disabled_classes.gen.h with one ClassDB_Disable_* define per listed class."""
+    lines = [
+        "/* THIS FILE IS GENERATED DO NOT EDIT */\n",
+        "#ifndef DISABLED_CLASSES_GEN_H\n",
+        "#define DISABLED_CLASSES_GEN_H\n\n",
+    ]
+    # Entries that are empty after stripping whitespace are skipped.
+    names = (c.strip() for c in class_list)
+    lines.extend("#define ClassDB_Disable_" + name + " 1\n" for name in names if name)
+    lines.append("\n#endif\n")
+
+    write_file_if_needed("core/disabled_classes.gen.h", "".join(lines))
 
 
 def write_modules(modules):
@@ -435,9 +445,7 @@ void uninitialize_modules(ModuleInitializationLevel p_level) {
         uninitialize_cpp,
     )
 
-    # NOTE: It is safe to generate this file here, since this is still executed serially
-    with open("modules/register_module_types.gen.cpp", "w", encoding="utf-8", newline="\n") as f:
-        f.write(modules_cpp)
+    write_file_if_needed("modules/register_module_types.gen.cpp", modules_cpp)
 
 
 def convert_custom_modules_path(path):
diff --git a/misc/extension_api_validation/4.2-stable.expected b/misc/extension_api_validation/4.2-stable.expected
index 8623e8eb25..53a29fa660 100644
--- a/misc/extension_api_validation/4.2-stable.expected
+++ b/misc/extension_api_validation/4.2-stable.expected
@@ -99,15 +99,6 @@ Validate extension JSON: Error: Field 'classes/GLTFBufferView/methods/get_indice
 Change AudioStreamPlayer* is_autoplay_enabled and GLTFBufferView getters to be const.
 
 
-GH-87379
---------
-Validate extension JSON: API was removed: classes/TileMap/methods/get_tileset
-Validate extension JSON: API was removed: classes/TileMap/methods/set_tileset
-Validate extension JSON: API was removed: classes/TileMap/properties/tile_set
-
-Moved to the parent TileMapLayerGroup class. No change should be necessary.
-
-
 GH-87340
 --------
 Validate extension JSON: JSON file: Field was added in a way that breaks compatibility 'classes/RenderingDevice/methods/screen_get_framebuffer_format': arguments
@@ -248,3 +239,23 @@ Validate extension JSON: Error: Field 'classes/AcceptDialog/methods/register_tex
 Validate extension JSON: Error: Field 'classes/AcceptDialog/methods/remove_button/arguments/0': type changed value in new API, from "Control" to "Button".
 
 Changed argument type to the more specific one actually expected by the method. Compatibility method registered.
+
+
+GH-89992
+--------
+Validate extension JSON: Error: Field 'classes/Node/methods/replace_by/arguments': size changed value in new API, from 2 to 3.
+
+Added optional argument to prevent children from being reparented during replace_by. Compatibility method registered.
+
+
+GH-88047
+--------
+Validate extension JSON: Error: Field 'classes/AStar2D/methods/get_id_path/arguments': size changed value in new API, from 2 to 3.
+Validate extension JSON: Error: Field 'classes/AStar2D/methods/get_point_path/arguments': size changed value in new API, from 2 to 3.
+Validate extension JSON: Error: Field 'classes/AStar3D/methods/get_id_path/arguments': size changed value in new API, from 2 to 3.
+Validate extension JSON: Error: Field 'classes/AStar3D/methods/get_point_path/arguments': size changed value in new API, from 2 to 3.
+Validate extension JSON: Error: Field 'classes/AStarGrid2D/methods/get_id_path/arguments': size changed value in new API, from 2 to 3.
+Validate extension JSON: Error: Field 'classes/AStarGrid2D/methods/get_point_path/arguments': size changed value in new API, from 2 to 3.
+
+Added optional "allow_partial_path" argument to get_id_path and get_point_path methods in AStar classes.
+Compatibility methods registered.
diff --git a/modules/mono/csharp_script.cpp b/modules/mono/csharp_script.cpp
index 858d1d3e4e..0dd1dc7c12 100644
--- a/modules/mono/csharp_script.cpp
+++ b/modules/mono/csharp_script.cpp
@@ -1408,7 +1408,11 @@ GDExtensionBool CSharpLanguage::_instance_binding_reference_callback(void *p_tok
 }
 
 void *CSharpLanguage::get_instance_binding(Object *p_object) {
-	void *binding = p_object->get_instance_binding(get_singleton(), &_instance_binding_callbacks);
+	return p_object->get_instance_binding(get_singleton(), &_instance_binding_callbacks);
+}
+
+void *CSharpLanguage::get_instance_binding_with_setup(Object *p_object) {
+	void *binding = get_instance_binding(p_object);
 
 	// Initially this was in `_instance_binding_create_callback`. However, after the new instance
 	// binding re-write it was resulting in a deadlock in `_instance_binding_reference`, as
@@ -1433,11 +1437,7 @@ void *CSharpLanguage::get_existing_instance_binding(Object *p_object) {
 #ifdef DEBUG_ENABLED
 	CRASH_COND(p_object->has_instance_binding(p_object));
 #endif
-	return p_object->get_instance_binding(get_singleton(), &_instance_binding_callbacks);
-}
-
-void CSharpLanguage::set_instance_binding(Object *p_object, void *p_binding) {
-	p_object->set_instance_binding(get_singleton(), p_binding, &_instance_binding_callbacks);
+	return get_instance_binding(p_object);
 }
 
 bool CSharpLanguage::has_instance_binding(Object *p_object) {
@@ -1464,13 +1464,6 @@ void CSharpLanguage::tie_native_managed_to_unmanaged(GCHandleIntPtr p_gchandle_i
 	// Another reason for doing this is that this instance could outlive CSharpLanguage, which would
 	// be problematic when using a script. See: https://github.com/godotengine/godot/issues/25621
 
-	CSharpScriptBinding script_binding;
-
-	script_binding.inited = true;
-	script_binding.type_name = *p_native_name;
-	script_binding.gchandle = gchandle;
-	script_binding.owner = p_unmanaged;
-
 	if (p_ref_counted) {
 		// Unsafe refcount increment. The managed instance also counts as a reference.
 		// This way if the unmanaged world has no references to our owner
@@ -1486,14 +1479,13 @@ void CSharpLanguage::tie_native_managed_to_unmanaged(GCHandleIntPtr p_gchandle_i
 	// The object was just created, no script instance binding should have been attached
 	CRASH_COND(CSharpLanguage::has_instance_binding(p_unmanaged));
 
-	void *data;
-	{
-		MutexLock lock(CSharpLanguage::get_singleton()->get_language_bind_mutex());
-		data = (void *)CSharpLanguage::get_singleton()->insert_script_binding(p_unmanaged, script_binding);
-	}
+	void *binding = CSharpLanguage::get_singleton()->get_instance_binding(p_unmanaged);
 
-	// Should be thread safe because the object was just created and nothing else should be referencing it
-	CSharpLanguage::set_instance_binding(p_unmanaged, data);
+	CSharpScriptBinding &script_binding = ((RBMap<Object *, CSharpScriptBinding>::Element *)binding)->value();
+	script_binding.inited = true;
+	script_binding.type_name = *p_native_name;
+	script_binding.gchandle = gchandle;
+	script_binding.owner = p_unmanaged;
 }
 
 void CSharpLanguage::tie_user_managed_to_unmanaged(GCHandleIntPtr p_gchandle_intptr, Object *p_unmanaged, Ref<CSharpScript> *p_script, bool p_ref_counted) {
@@ -2092,7 +2084,7 @@ CSharpInstance::~CSharpInstance() {
 		bool die = _unreference_owner_unsafe();
 		CRASH_COND(die); // `owner_keep_alive` holds a reference, so it can't die
 
-		void *data = CSharpLanguage::get_instance_binding(owner);
+		void *data = CSharpLanguage::get_instance_binding_with_setup(owner);
 		CRASH_COND(data == nullptr);
 		CSharpScriptBinding &script_binding = ((RBMap<Object *, CSharpScriptBinding>::Element *)data)->get();
 		CRASH_COND(!script_binding.inited);
diff --git a/modules/mono/csharp_script.h b/modules/mono/csharp_script.h
index 06d526f494..e3f39c50f4 100644
--- a/modules/mono/csharp_script.h
+++ b/modules/mono/csharp_script.h
@@ -442,7 +442,7 @@ class CSharpLanguage : public ScriptLanguage {
 public:
 	static void *get_instance_binding(Object *p_object);
 	static void *get_existing_instance_binding(Object *p_object);
-	static void set_instance_binding(Object *p_object, void *p_binding);
+	static void *get_instance_binding_with_setup(Object *p_object);
 	static bool has_instance_binding(Object *p_object);
 
 	const Mutex &get_language_bind_mutex() {
diff --git a/modules/mono/glue/runtime_interop.cpp b/modules/mono/glue/runtime_interop.cpp
index 0089e9c2a2..4bb324c0ee 100644
--- a/modules/mono/glue/runtime_interop.cpp
+++ b/modules/mono/glue/runtime_interop.cpp
@@ -239,7 +239,7 @@ GCHandleIntPtr godotsharp_internal_unmanaged_get_instance_binding_managed(Object
 	CRASH_COND(!p_unmanaged);
 #endif
 
-	void *data = CSharpLanguage::get_instance_binding(p_unmanaged);
+	void *data = CSharpLanguage::get_instance_binding_with_setup(p_unmanaged);
 	ERR_FAIL_NULL_V(data, { nullptr });
 	CSharpScriptBinding &script_binding = ((RBMap<Object *, CSharpScriptBinding>::Element *)data)->value();
 	ERR_FAIL_COND_V(!script_binding.inited, { nullptr });
@@ -252,7 +252,7 @@ GCHandleIntPtr godotsharp_internal_unmanaged_instance_binding_create_managed(Obj
 	CRASH_COND(!p_unmanaged);
 #endif
 
-	void *data = CSharpLanguage::get_instance_binding(p_unmanaged);
+	void *data = CSharpLanguage::get_instance_binding_with_setup(p_unmanaged);
 	ERR_FAIL_NULL_V(data, { nullptr });
 	CSharpScriptBinding &script_binding = ((RBMap<Object *, CSharpScriptBinding>::Element *)data)->value();
 	ERR_FAIL_COND_V(!script_binding.inited, { nullptr });
diff --git a/modules/multiplayer/editor/replication_editor.cpp b/modules/multiplayer/editor/replication_editor.cpp
index 58803124cf..8453a41473 100644
--- a/modules/multiplayer/editor/replication_editor.cpp
+++ b/modules/multiplayer/editor/replication_editor.cpp
@@ -131,7 +131,7 @@ void ReplicationEditor::_pick_new_property() {
 		EditorNode::get_singleton()->show_warning(TTR("Not possible to add a new property to synchronize without a root."));
 		return;
 	}
-	pick_node->popup_scenetree_dialog();
+	pick_node->popup_scenetree_dialog(nullptr, current);
 	pick_node->get_filter_line_edit()->clear();
 	pick_node->get_filter_line_edit()->grab_focus();
 }
@@ -600,7 +600,7 @@ void ReplicationEditor::_add_property(const NodePath &p_property, bool p_spawn,
 	item->set_text_alignment(2, HORIZONTAL_ALIGNMENT_CENTER);
 	item->set_cell_mode(2, TreeItem::CELL_MODE_RANGE);
 	item->set_range_config(2, 0, 2, 1);
-	item->set_text(2, "Never,Always,On Change");
+	item->set_text(2, TTR("Never", "Replication Mode") + "," + TTR("Always", "Replication Mode") + "," + TTR("On Change", "Replication Mode"));
 	item->set_range(2, (int)p_mode);
 	item->set_editable(2, true);
 }
diff --git a/modules/openxr/editor/openxr_action_set_editor.cpp b/modules/openxr/editor/openxr_action_set_editor.cpp
index a9fc6c4db6..8b4a0e989c 100644
--- a/modules/openxr/editor/openxr_action_set_editor.cpp
+++ b/modules/openxr/editor/openxr_action_set_editor.cpp
@@ -261,13 +261,13 @@ OpenXRActionSetEditor::OpenXRActionSetEditor(Ref<OpenXRActionMap> p_action_map,
 	action_set_hb->add_child(action_set_priority);
 
 	add_action = memnew(Button);
-	add_action->set_tooltip_text("Add Action.");
+	add_action->set_tooltip_text(TTR("Add action."));
 	add_action->connect("pressed", callable_mp(this, &OpenXRActionSetEditor::_on_add_action));
 	add_action->set_flat(true);
 	action_set_hb->add_child(add_action);
 
 	rem_action_set = memnew(Button);
-	rem_action_set->set_tooltip_text("Remove Action Set.");
+	rem_action_set->set_tooltip_text(TTR("Remove action set."));
 	rem_action_set->connect("pressed", callable_mp(this, &OpenXRActionSetEditor::_on_remove_action_set));
 	rem_action_set->set_flat(true);
 	action_set_hb->add_child(rem_action_set);
diff --git a/modules/openxr/editor/openxr_select_interaction_profile_dialog.cpp b/modules/openxr/editor/openxr_select_interaction_profile_dialog.cpp
index 51642d8503..8fd66fac04 100644
--- a/modules/openxr/editor/openxr_select_interaction_profile_dialog.cpp
+++ b/modules/openxr/editor/openxr_select_interaction_profile_dialog.cpp
@@ -111,7 +111,7 @@ void OpenXRSelectInteractionProfileDialog::ok_pressed() {
 }
 
 OpenXRSelectInteractionProfileDialog::OpenXRSelectInteractionProfileDialog() {
-	set_title("Select an interaction profile");
+	set_title(TTR("Select an interaction profile"));
 
 	scroll = memnew(ScrollContainer);
 	scroll->set_custom_minimum_size(Size2(600.0, 400.0));
diff --git a/modules/openxr/register_types.cpp b/modules/openxr/register_types.cpp
index 3d34b27407..ba468b6967 100644
--- a/modules/openxr/register_types.cpp
+++ b/modules/openxr/register_types.cpp
@@ -135,7 +135,9 @@ void initialize_openxr_module(ModuleInitializationLevel p_level) {
 				const char *init_error_message =
 						"OpenXR was requested but failed to start.\n"
 						"Please check if your HMD is connected.\n"
-						"When using Windows MR please note that WMR only has DirectX support, make sure SteamVR is your default OpenXR runtime.\n"
+#ifdef WINDOWS_ENABLED
+						"When using Windows Mixed Reality, note that WMR only has DirectX support. Make sure SteamVR is your default OpenXR runtime.\n"
+#endif
 						"Godot will start in normal mode.\n";
 
 				WARN_PRINT(init_error_message);
diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub
index 209ebab388..f3a8e30763 100644
--- a/modules/raycast/SCsub
+++ b/modules/raycast/SCsub
@@ -15,10 +15,10 @@ if env["builtin_embree"]:
     embree_src = [
         "common/sys/sysinfo.cpp",
         "common/sys/alloc.cpp",
+        "common/sys/estring.cpp",
         "common/sys/filename.cpp",
         "common/sys/library.cpp",
         "common/sys/thread.cpp",
-        "common/sys/string.cpp",
         "common/sys/regression.cpp",
         "common/sys/mutex.cpp",
         "common/sys/condition.cpp",
@@ -36,6 +36,7 @@ if env["builtin_embree"]:
         "kernels/common/rtcore.cpp",
         "kernels/common/rtcore_builder.cpp",
         "kernels/common/scene.cpp",
+        "kernels/common/scene_verify.cpp",
         "kernels/common/alloc.cpp",
         "kernels/common/geometry.cpp",
         "kernels/common/scene_triangle_mesh.cpp",
@@ -56,8 +57,6 @@ if env["builtin_embree"]:
         "kernels/bvh/bvh_builder_twolevel.cpp",
         "kernels/bvh/bvh_intersector1_bvh4.cpp",
         "kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp",
-        "kernels/bvh/bvh_intersector_stream_bvh4.cpp",
-        "kernels/bvh/bvh_intersector_stream_filters.cpp",
     ]
 
     thirdparty_sources = [thirdparty_dir + file for file in embree_src]
diff --git a/modules/raycast/godot_update_embree.py b/modules/raycast/godot_update_embree.py
index f7af937c8b..c179060365 100644
--- a/modules/raycast/godot_update_embree.py
+++ b/modules/raycast/godot_update_embree.py
@@ -1,6 +1,13 @@
-import glob, os, shutil, subprocess, re
+import glob
+import os
+import re
+import shutil
+import stat
+import subprocess
+from types import TracebackType
+from typing import Any, Callable, Tuple, Type
 
-git_tag = "v3.13.5"
+git_tag = "v4.3.1"
 
 include_dirs = [
     "common/tasking",
@@ -15,7 +22,7 @@ include_dirs = [
     "common/simd",
     "common/simd/arm",
     "common/simd/wasm",
-    "include/embree3",
+    "include/embree4",
     "kernels/subdiv",
     "kernels/geometry",
 ]
@@ -23,10 +30,10 @@ include_dirs = [
 cpp_files = [
     "common/sys/sysinfo.cpp",
     "common/sys/alloc.cpp",
+    "common/sys/estring.cpp",
     "common/sys/filename.cpp",
     "common/sys/library.cpp",
     "common/sys/thread.cpp",
-    "common/sys/string.cpp",
     "common/sys/regression.cpp",
     "common/sys/mutex.cpp",
     "common/sys/condition.cpp",
@@ -44,6 +51,7 @@ cpp_files = [
     "kernels/common/rtcore.cpp",
     "kernels/common/rtcore_builder.cpp",
     "kernels/common/scene.cpp",
+    "kernels/common/scene_verify.cpp",
     "kernels/common/alloc.cpp",
     "kernels/common/geometry.cpp",
     "kernels/common/scene_triangle_mesh.cpp",
@@ -65,26 +73,58 @@ cpp_files = [
     "kernels/bvh/bvh_intersector1.cpp",
     "kernels/bvh/bvh_intersector1_bvh4.cpp",
     "kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp",
-    "kernels/bvh/bvh_intersector_stream_bvh4.cpp",
-    "kernels/bvh/bvh_intersector_stream_filters.cpp",
     "kernels/bvh/bvh_intersector_hybrid.cpp",
-    "kernels/bvh/bvh_intersector_stream.cpp",
 ]
 
-os.chdir("../../thirdparty")
+config_files = [
+    "kernels/config.h.in",
+    "kernels/rtcore_config.h.in",
+]
+
+license_file = "LICENSE.txt"
+
+os.chdir(f"{os.path.dirname(__file__)}/../../thirdparty")
 
 dir_name = "embree"
 if os.path.exists(dir_name):
     shutil.rmtree(dir_name)
 
+# Remove a stale embree-tmp directory left behind if a previous run failed.
+if os.path.exists("embree-tmp"):
+    shutil.rmtree("embree-tmp")
+
 subprocess.run(["git", "clone", "https://github.com/embree/embree.git", "embree-tmp"])
 os.chdir("embree-tmp")
 subprocess.run(["git", "checkout", git_tag])
 
 commit_hash = str(subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True)).strip()
 
+
+def on_rm_error(
+    function: Callable[..., Any], path: str, excinfo: Tuple[Type[Exception], Exception, TracebackType]
+) -> None:
+    """
+    Error handler for `shutil.rmtree()`.
+
+    If the error is due to read-only files,
+    it will change the file permissions and retry.
+    """
+    os.chmod(path, stat.S_IWRITE)
+    os.unlink(path)
+
+
+# Python 3.12 and beyond should replace `onerror` with `onexc`.
+# We remove the .git directory because it contains
+# a lot of read-only files that are problematic on Windows.
+shutil.rmtree(".git", onerror=on_rm_error)
+
 all_files = set(cpp_files)
 
+for config_file in config_files:
+    all_files.add(config_file)
+
+all_files.add(license_file)
+
 dest_dir = os.path.join("..", dir_name)
 for include_dir in include_dirs:
     headers = glob.iglob(os.path.join(include_dir, "*.h"))
@@ -105,87 +145,8 @@ with open(os.path.join(dest_dir, "kernels/hash.h"), "w", encoding="utf-8", newli
 """
     )
 
-with open(os.path.join(dest_dir, "kernels/config.h"), "w", encoding="utf-8", newline="\n") as config_file:
-    config_file.write(
-        """// Copyright 2009-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-/* #undef EMBREE_RAY_MASK */
-/* #undef EMBREE_STAT_COUNTERS */
-/* #undef EMBREE_BACKFACE_CULLING */
-/* #undef EMBREE_BACKFACE_CULLING_CURVES */
-#define EMBREE_FILTER_FUNCTION
-/* #undef EMBREE_IGNORE_INVALID_RAYS */
-#define EMBREE_GEOMETRY_TRIANGLE
-/* #undef EMBREE_GEOMETRY_QUAD */
-/* #undef EMBREE_GEOMETRY_CURVE */
-/* #undef EMBREE_GEOMETRY_SUBDIVISION */
-/* #undef EMBREE_GEOMETRY_USER */
-/* #undef EMBREE_GEOMETRY_INSTANCE */
-/* #undef EMBREE_GEOMETRY_GRID */
-/* #undef EMBREE_GEOMETRY_POINT */
-#define EMBREE_RAY_PACKETS
-/* #undef EMBREE_COMPACT_POLYS */
-
-#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
-#define EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE
-
-#if defined(EMBREE_GEOMETRY_TRIANGLE)
-  #define IF_ENABLED_TRIS(x) x
-#else
-  #define IF_ENABLED_TRIS(x)
-#endif
-
-#if defined(EMBREE_GEOMETRY_QUAD)
-  #define IF_ENABLED_QUADS(x) x
-#else
-  #define IF_ENABLED_QUADS(x)
-#endif
-
-#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
-  #define IF_ENABLED_CURVES_OR_POINTS(x) x
-#else
-  #define IF_ENABLED_CURVES_OR_POINTS(x)
-#endif
-
-#if defined(EMBREE_GEOMETRY_CURVE)
-  #define IF_ENABLED_CURVES(x) x
-#else
-  #define IF_ENABLED_CURVES(x)
-#endif
-
-#if defined(EMBREE_GEOMETRY_POINT)
-  #define IF_ENABLED_POINTS(x) x
-#else
-  #define IF_ENABLED_POINTS(x)
-#endif
-
-#if defined(EMBREE_GEOMETRY_SUBDIVISION)
-  #define IF_ENABLED_SUBDIV(x) x
-#else
-  #define IF_ENABLED_SUBDIV(x)
-#endif
-
-#if defined(EMBREE_GEOMETRY_USER)
-  #define IF_ENABLED_USER(x) x
-#else
-  #define IF_ENABLED_USER(x)
-#endif
-
-#if defined(EMBREE_GEOMETRY_INSTANCE)
-  #define IF_ENABLED_INSTANCE(x) x
-#else
-  #define IF_ENABLED_INSTANCE(x)
-#endif
-
-#if defined(EMBREE_GEOMETRY_GRID)
-  #define IF_ENABLED_GRIDS(x) x
-#else
-  #define IF_ENABLED_GRIDS(x)
-#endif
-"""
-    )
-
+for config_file in config_files:
+    os.rename(os.path.join(dest_dir, config_file), os.path.join(dest_dir, config_file[:-3]))
 
 with open("CMakeLists.txt", "r", encoding="utf-8") as cmake_file:
     cmake_content = cmake_file.read()
@@ -193,70 +154,25 @@ with open("CMakeLists.txt", "r", encoding="utf-8") as cmake_file:
     minor_version = int(re.compile(r"EMBREE_VERSION_MINOR\s(\d+)").findall(cmake_content)[0])
     patch_version = int(re.compile(r"EMBREE_VERSION_PATCH\s(\d+)").findall(cmake_content)[0])
 
-with open(
-    os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w", encoding="utf-8", newline="\n"
-) as config_file:
-    config_file.write(
-        f"""// Copyright 2009-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
+shutil.move(os.path.join(dest_dir, "kernels/rtcore_config.h"), os.path.join(dest_dir, ("include/embree4/")))
 
-#define RTC_VERSION_MAJOR {major_version}
-#define RTC_VERSION_MINOR {minor_version}
-#define RTC_VERSION_PATCH {patch_version}
-#define RTC_VERSION {major_version}{minor_version:02d}{patch_version:02d}
-#define RTC_VERSION_STRING "{major_version}.{minor_version}.{patch_version}"
-
-#define RTC_MAX_INSTANCE_LEVEL_COUNT 1
-
-#define EMBREE_MIN_WIDTH 0
-#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
-
-#if !defined(EMBREE_STATIC_LIB)
-#   define EMBREE_STATIC_LIB
-#endif
-/* #undef EMBREE_API_NAMESPACE*/
-
-#if defined(EMBREE_API_NAMESPACE)
-#  define RTC_NAMESPACE
-#  define RTC_NAMESPACE_BEGIN namespace {{
-#  define RTC_NAMESPACE_END }}
-#  define RTC_NAMESPACE_USE using namespace;
-#  define RTC_API_EXTERN_C
-#  undef EMBREE_API_NAMESPACE
-#else
-#  define RTC_NAMESPACE_BEGIN
-#  define RTC_NAMESPACE_END
-#  define RTC_NAMESPACE_USE
-#  if defined(__cplusplus)
-#    define RTC_API_EXTERN_C extern "C"
-#  else
-#    define RTC_API_EXTERN_C
-#  endif
-#endif
-
-#if defined(ISPC)
-#  define RTC_API_IMPORT extern "C" unmasked
-#  define RTC_API_EXPORT extern "C" unmasked
-#elif defined(EMBREE_STATIC_LIB)
-#  define RTC_API_IMPORT RTC_API_EXTERN_C
-#  define RTC_API_EXPORT RTC_API_EXTERN_C
-#elif defined(_WIN32)
-#  define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport)
-#  define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport)
-#else
-#  define RTC_API_IMPORT RTC_API_EXTERN_C
-#  define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default")))
-#endif
-
-#if defined(RTC_EXPORT_API)
-#  define RTC_API RTC_API_EXPORT
-#else
-#  define RTC_API RTC_API_IMPORT
-#endif
-"""
-    )
+with open(
+    os.path.join(dest_dir, "include/embree4/rtcore_config.h"), "r+", encoding="utf-8", newline="\n"
+) as rtcore_config:
+    lines = rtcore_config.readlines()
+    rtcore_config.seek(0)
+    for i, line in enumerate(lines):
+        if line.startswith("#define RTC_VERSION_MAJOR"):
+            lines[i : i + 5] = [
+                f"#define RTC_VERSION_MAJOR {major_version}\n",
+                f"#define RTC_VERSION_MINOR {minor_version}\n",
+                f"#define RTC_VERSION_PATCH {patch_version}\n",
+                f"#define RTC_VERSION {major_version}{minor_version:02d}{patch_version:02d}\n",
+                f'#define RTC_VERSION_STRING "{major_version}.{minor_version}.{patch_version}"\n',
+            ]
+            break
+    rtcore_config.writelines(lines)
+    rtcore_config.truncate()
 
 os.chdir("..")
 shutil.rmtree("embree-tmp")
@@ -264,4 +180,4 @@ shutil.rmtree("embree-tmp")
 subprocess.run(["git", "restore", "embree/patches"])
 
 for patch in os.listdir("embree/patches"):
-    subprocess.run(["git", "apply", "embree/patches/" + patch])
+    subprocess.run(["git", "apply", f"embree/patches/{patch}"])
diff --git a/modules/raycast/lightmap_raycaster_embree.cpp b/modules/raycast/lightmap_raycaster_embree.cpp
index 2a66c36d53..84d9e19a3f 100644
--- a/modules/raycast/lightmap_raycaster_embree.cpp
+++ b/modules/raycast/lightmap_raycaster_embree.cpp
@@ -69,11 +69,12 @@ void LightmapRaycasterEmbree::filter_function(const struct RTCFilterFunctionNArg
 }
 
 bool LightmapRaycasterEmbree::intersect(Ray &r_ray) {
-	RTCIntersectContext context;
-
-	rtcInitIntersectContext(&context);
-
-	rtcIntersect1(embree_scene, &context, (RTCRayHit *)&r_ray);
+	RTCRayQueryContext context;
+	rtcInitRayQueryContext(&context);
+	RTCIntersectArguments args;
+	rtcInitIntersectArguments(&args);
+	args.context = &context;
+	rtcIntersect1(embree_scene, (RTCRayHit *)&r_ray, &args);
 	return r_ray.geomID != RTC_INVALID_GEOMETRY_ID;
 }
 
diff --git a/modules/raycast/lightmap_raycaster_embree.h b/modules/raycast/lightmap_raycaster_embree.h
index d1999e329e..2b4530a368 100644
--- a/modules/raycast/lightmap_raycaster_embree.h
+++ b/modules/raycast/lightmap_raycaster_embree.h
@@ -37,7 +37,7 @@
 #include "core/object/object.h"
 #include "scene/3d/lightmapper.h"
 
-#include <embree3/rtcore.h>
+#include <embree4/rtcore.h>
 
 class LightmapRaycasterEmbree : public LightmapRaycaster {
 	GDCLASS(LightmapRaycasterEmbree, LightmapRaycaster);
diff --git a/modules/raycast/raycast_occlusion_cull.cpp b/modules/raycast/raycast_occlusion_cull.cpp
index 5005000eae..94d8b267d1 100644
--- a/modules/raycast/raycast_occlusion_cull.cpp
+++ b/modules/raycast/raycast_occlusion_cull.cpp
@@ -488,11 +488,13 @@ void RaycastOcclusionCull::Scenario::update() {
 }
 
 void RaycastOcclusionCull::Scenario::_raycast(uint32_t p_idx, const RaycastThreadData *p_raycast_data) const {
-	RTCIntersectContext ctx;
-	rtcInitIntersectContext(&ctx);
-	ctx.flags = RTC_INTERSECT_CONTEXT_FLAG_COHERENT;
-
-	rtcIntersect16((const int *)&p_raycast_data->masks[p_idx * TILE_RAYS], ebr_scene[current_scene_idx], &ctx, &p_raycast_data->rays[p_idx]);
+	RTCRayQueryContext context;
+	rtcInitRayQueryContext(&context);
+	RTCIntersectArguments args;
+	rtcInitIntersectArguments(&args);
+	args.flags = RTC_RAY_QUERY_FLAG_COHERENT;
+	args.context = &context;
+	rtcIntersect16((const int *)&p_raycast_data->masks[p_idx * TILE_RAYS], ebr_scene[current_scene_idx], &p_raycast_data->rays[p_idx], &args);
 }
 
 void RaycastOcclusionCull::Scenario::raycast(CameraRayTile *r_rays, const uint32_t *p_valid_masks, uint32_t p_tile_count) const {
@@ -536,6 +538,64 @@ void RaycastOcclusionCull::buffer_set_size(RID p_buffer, const Vector2i &p_size)
 	buffers[p_buffer].resize(p_size);
 }
 
+Projection RaycastOcclusionCull::_jitter_projection(const Projection &p_cam_projection, const Size2i &p_viewport_size) {
+	if (!_jitter_enabled) {
+		return p_cam_projection;
+	}
+
+	// Prevent divide by zero when using NULL viewport.
+	if ((p_viewport_size.x <= 0) || (p_viewport_size.y <= 0)) {
+		return p_cam_projection;
+	}
+
+	Projection p = p_cam_projection;
+
+	int32_t frame = Engine::get_singleton()->get_frames_drawn();
+	frame %= 9;
+
+	Vector2 jitter;
+
+	switch (frame) {
+		default:
+			break;
+		case 1: {
+			jitter = Vector2(-1, -1);
+		} break;
+		case 2: {
+			jitter = Vector2(1, -1);
+		} break;
+		case 3: {
+			jitter = Vector2(-1, 1);
+		} break;
+		case 4: {
+			jitter = Vector2(1, 1);
+		} break;
+		case 5: {
+			jitter = Vector2(-0.5f, -0.5f);
+		} break;
+		case 6: {
+			jitter = Vector2(0.5f, -0.5f);
+		} break;
+		case 7: {
+			jitter = Vector2(-0.5f, 0.5f);
+		} break;
+		case 8: {
+			jitter = Vector2(0.5f, 0.5f);
+		} break;
+	}
+
+	// The multiplier here determines the divergence from center,
+	// and is to some extent a balancing act.
+	// Higher divergence gives fewer falsely hidden objects, but more falsely shown ones.
+	// A falsely hidden object is obvious to the viewer; a falsely shown one is not.
+	// Falsely shown objects lower the percentage that is occluded, and therefore performance.
+	jitter *= Vector2(1 / (float)p_viewport_size.x, 1 / (float)p_viewport_size.y) * 0.05f;
+
+	p.add_jitter_offset(jitter);
+
+	return p;
+}
+
 void RaycastOcclusionCull::buffer_update(RID p_buffer, const Transform3D &p_cam_transform, const Projection &p_cam_projection, bool p_cam_orthogonal) {
 	if (!buffers.has(p_buffer)) {
 		return;
@@ -550,7 +610,9 @@ void RaycastOcclusionCull::buffer_update(RID p_buffer, const Transform3D &p_cam_
 	Scenario &scenario = scenarios[buffer.scenario_rid];
 	scenario.update();
 
-	buffer.update_camera_rays(p_cam_transform, p_cam_projection, p_cam_orthogonal);
+	Projection jittered_proj = _jitter_projection(p_cam_projection, buffer.get_occlusion_buffer_size());
+
+	buffer.update_camera_rays(p_cam_transform, jittered_proj, p_cam_orthogonal);
 
 	scenario.raycast(buffer.camera_rays, buffer.camera_ray_masks.ptr(), buffer.camera_rays_tile_count);
 	buffer.sort_rays(-p_cam_transform.basis.get_column(2), p_cam_orthogonal);
@@ -596,6 +658,7 @@ void RaycastOcclusionCull::_init_embree() {
 RaycastOcclusionCull::RaycastOcclusionCull() {
 	raycast_singleton = this;
 	int default_quality = GLOBAL_GET("rendering/occlusion_culling/bvh_build_quality");
+	_jitter_enabled = GLOBAL_GET("rendering/occlusion_culling/jitter_projection");
 	build_quality = RS::ViewportOcclusionCullingBuildQuality(default_quality);
 }
 
diff --git a/modules/raycast/raycast_occlusion_cull.h b/modules/raycast/raycast_occlusion_cull.h
index ab5eb4eaf0..335a685672 100644
--- a/modules/raycast/raycast_occlusion_cull.h
+++ b/modules/raycast/raycast_occlusion_cull.h
@@ -40,7 +40,7 @@
 #include "scene/resources/mesh.h"
 #include "servers/rendering/renderer_scene_occlusion_cull.h"
 
-#include <embree3/rtcore.h>
+#include <embree4/rtcore.h>
 
 class RaycastOcclusionCull : public RendererSceneOcclusionCull {
 	typedef RTCRayHit16 CameraRayTile;
@@ -163,8 +163,10 @@ private:
 	HashMap<RID, Scenario> scenarios;
 	HashMap<RID, RaycastHZBuffer> buffers;
 	RS::ViewportOcclusionCullingBuildQuality build_quality;
+	bool _jitter_enabled = false;
 
 	void _init_embree();
+	Projection _jitter_projection(const Projection &p_cam_projection, const Size2i &p_viewport_size);
 
 public:
 	virtual bool is_occluder(RID p_rid) override;
diff --git a/modules/raycast/static_raycaster_embree.cpp b/modules/raycast/static_raycaster_embree.cpp
index f9076d30dd..a6ad340397 100644
--- a/modules/raycast/static_raycaster_embree.cpp
+++ b/modules/raycast/static_raycaster_embree.cpp
@@ -53,9 +53,12 @@ void StaticRaycasterEmbree::free() {
 }
 
 bool StaticRaycasterEmbree::intersect(Ray &r_ray) {
-	RTCIntersectContext context;
-	rtcInitIntersectContext(&context);
-	rtcIntersect1(embree_scene, &context, (RTCRayHit *)&r_ray);
+	RTCRayQueryContext context;
+	rtcInitRayQueryContext(&context);
+	RTCIntersectArguments args;
+	rtcInitIntersectArguments(&args);
+	args.context = &context;
+	rtcIntersect1(embree_scene, (RTCRayHit *)&r_ray, &args);
 	return r_ray.geomID != RTC_INVALID_GEOMETRY_ID;
 }
 
diff --git a/modules/raycast/static_raycaster_embree.h b/modules/raycast/static_raycaster_embree.h
index 24e1c7b92f..3ffab32bad 100644
--- a/modules/raycast/static_raycaster_embree.h
+++ b/modules/raycast/static_raycaster_embree.h
@@ -35,7 +35,7 @@
 
 #include "core/math/static_raycaster.h"
 
-#include <embree3/rtcore.h>
+#include <embree4/rtcore.h>
 
 class StaticRaycasterEmbree : public StaticRaycaster {
 	GDCLASS(StaticRaycasterEmbree, StaticRaycaster);
diff --git a/platform/SCsub b/platform/SCsub
index e432cebd48..ca282e3e68 100644
--- a/platform/SCsub
+++ b/platform/SCsub
@@ -1,5 +1,7 @@
 #!/usr/bin/env python
 
+import methods
+
 Import("env")
 
 env.platform_sources = []
@@ -18,12 +20,7 @@ reg_apis_inc += "\n"
 reg_apis += "}\n\n"
 unreg_apis += "}\n"
 
-# NOTE: It is safe to generate this file here, since this is still execute serially
-with open("register_platform_apis.gen.cpp", "w", encoding="utf-8", newline="\n") as f:
-    f.write(reg_apis_inc)
-    f.write(reg_apis)
-    f.write(unreg_apis)
-
+methods.write_file_if_needed("register_platform_apis.gen.cpp", reg_apis_inc + reg_apis + unreg_apis)
 env.add_source_files(env.platform_sources, "register_platform_apis.gen.cpp")
 
 lib = env.add_library("platform", env.platform_sources)
diff --git a/platform/linuxbsd/freedesktop_portal_desktop.cpp b/platform/linuxbsd/freedesktop_portal_desktop.cpp
index cdebed58b2..214725832f 100644
--- a/platform/linuxbsd/freedesktop_portal_desktop.cpp
+++ b/platform/linuxbsd/freedesktop_portal_desktop.cpp
@@ -367,6 +367,7 @@ Error FreeDesktopPortalDesktop::file_dialog_show(DisplayServer::WindowID p_windo
 	}
 
 	ERR_FAIL_INDEX_V(int(p_mode), DisplayServer::FILE_DIALOG_MODE_SAVE_MAX, FAILED);
+	ERR_FAIL_NULL_V(monitor_connection, FAILED);
 
 	Vector<String> filter_names;
 	Vector<String> filter_exts;
@@ -406,24 +407,16 @@ Error FreeDesktopPortalDesktop::file_dialog_show(DisplayServer::WindowID p_windo
 	Error rng_err = rng.get_random_bytes(uuid, 64);
 	ERR_FAIL_COND_V_MSG(rng_err, rng_err, "Failed to generate unique token.");
 
-	fd.connection = dbus_bus_get(DBUS_BUS_SESSION, &err);
-	if (dbus_error_is_set(&err)) {
-		ERR_PRINT(vformat("Failed to open DBus connection: %s", err.message));
-		dbus_error_free(&err);
-		unsupported = true;
-		return FAILED;
-	}
-
-	String dbus_unique_name = String::utf8(dbus_bus_get_unique_name(fd.connection));
+	String dbus_unique_name = String::utf8(dbus_bus_get_unique_name(monitor_connection));
 	String token = String::hex_encode_buffer(uuid, 64);
 	String path = vformat("/org/freedesktop/portal/desktop/request/%s/%s", dbus_unique_name.replace(".", "_").replace(":", ""), token);
 
-	fd.path = vformat("type='signal',sender='org.freedesktop.portal.Desktop',path='%s',interface='org.freedesktop.portal.Request',member='Response',destination='%s'", path, dbus_unique_name);
-	dbus_bus_add_match(fd.connection, fd.path.utf8().get_data(), &err);
+	fd.path = path;
+	fd.filter = vformat("type='signal',sender='org.freedesktop.portal.Desktop',path='%s',interface='org.freedesktop.portal.Request',member='Response',destination='%s'", path, dbus_unique_name);
+	dbus_bus_add_match(monitor_connection, fd.filter.utf8().get_data(), &err);
 	if (dbus_error_is_set(&err)) {
 		ERR_PRINT(vformat("Failed to add DBus match: %s", err.message));
 		dbus_error_free(&err);
-		dbus_connection_unref(fd.connection);
 		return FAILED;
 	}
 
@@ -460,14 +453,13 @@ Error FreeDesktopPortalDesktop::file_dialog_show(DisplayServer::WindowID p_windo
 		dbus_message_iter_close_container(&iter, &arr_iter);
 	}
 
-	DBusMessage *reply = dbus_connection_send_with_reply_and_block(fd.connection, message, DBUS_TIMEOUT_INFINITE, &err);
+	DBusMessage *reply = dbus_connection_send_with_reply_and_block(monitor_connection, message, DBUS_TIMEOUT_INFINITE, &err);
 	dbus_message_unref(message);
 
 	if (!reply || dbus_error_is_set(&err)) {
 		ERR_PRINT(vformat("Failed to send DBus message: %s", err.message));
 		dbus_error_free(&err);
-		dbus_bus_remove_match(fd.connection, fd.path.utf8().get_data(), &err);
-		dbus_connection_unref(fd.connection);
+		dbus_bus_remove_match(monitor_connection, fd.filter.utf8().get_data(), &err);
 		return FAILED;
 	}
 
@@ -479,19 +471,17 @@ Error FreeDesktopPortalDesktop::file_dialog_show(DisplayServer::WindowID p_windo
 				const char *new_path = nullptr;
 				dbus_message_iter_get_basic(&iter, &new_path);
 				if (String::utf8(new_path) != path) {
-					dbus_bus_remove_match(fd.connection, fd.path.utf8().get_data(), &err);
+					dbus_bus_remove_match(monitor_connection, fd.filter.utf8().get_data(), &err);
 					if (dbus_error_is_set(&err)) {
 						ERR_PRINT(vformat("Failed to remove DBus match: %s", err.message));
 						dbus_error_free(&err);
-						dbus_connection_unref(fd.connection);
 						return FAILED;
 					}
-					fd.path = String::utf8(new_path);
-					dbus_bus_add_match(fd.connection, fd.path.utf8().get_data(), &err);
+					fd.filter = String::utf8(new_path);
+					dbus_bus_add_match(monitor_connection, fd.filter.utf8().get_data(), &err);
 					if (dbus_error_is_set(&err)) {
 						ERR_PRINT(vformat("Failed to add DBus match: %s", err.message));
 						dbus_error_free(&err);
-						dbus_connection_unref(fd.connection);
 						return FAILED;
 					}
 				}
@@ -532,57 +522,9 @@ void FreeDesktopPortalDesktop::_thread_monitor(void *p_ud) {
 	FreeDesktopPortalDesktop *portal = (FreeDesktopPortalDesktop *)p_ud;
 
 	while (!portal->monitor_thread_abort.is_set()) {
-		{
-			MutexLock lock(portal->file_dialog_mutex);
-			for (int i = portal->file_dialogs.size() - 1; i >= 0; i--) {
-				bool remove = false;
-				{
-					FreeDesktopPortalDesktop::FileDialogData &fd = portal->file_dialogs.write[i];
-					if (fd.connection) {
-						while (true) {
-							DBusMessage *msg = dbus_connection_pop_message(fd.connection);
-							if (!msg) {
-								break;
-							} else if (dbus_message_is_signal(msg, "org.freedesktop.portal.Request", "Response")) {
-								DBusMessageIter iter;
-								if (dbus_message_iter_init(msg, &iter)) {
-									bool cancel = false;
-									Vector<String> uris;
-									Dictionary options;
-									int index = 0;
-									file_chooser_parse_response(&iter, fd.filter_names, cancel, uris, index, options);
-
-									if (fd.callback.is_valid()) {
-										callable_mp(portal, &FreeDesktopPortalDesktop::_file_dialog_callback).call_deferred(fd.callback, !cancel, uris, index, options, fd.opt_in_cb);
-									}
-									if (fd.prev_focus != DisplayServer::INVALID_WINDOW_ID) {
-										callable_mp(DisplayServer::get_singleton(), &DisplayServer::window_move_to_foreground).call_deferred(fd.prev_focus);
-									}
-								}
-								dbus_message_unref(msg);
-
-								DBusError err;
-								dbus_error_init(&err);
-								dbus_bus_remove_match(fd.connection, fd.path.utf8().get_data(), &err);
-								dbus_error_free(&err);
-								dbus_connection_unref(fd.connection);
-								remove = true;
-								break;
-							}
-							dbus_message_unref(msg);
-						}
-						dbus_connection_read_write(fd.connection, 0);
-					}
-				}
-				if (remove) {
-					portal->file_dialogs.remove_at(i);
-				}
-			}
-		}
-
-		if (portal->theme_connection) {
+		if (portal->monitor_connection) {
 			while (true) {
-				DBusMessage *msg = dbus_connection_pop_message(portal->theme_connection);
+				DBusMessage *msg = dbus_connection_pop_message(portal->monitor_connection);
 				if (!msg) {
 					break;
 				} else if (dbus_message_is_signal(msg, "org.freedesktop.portal.Settings", "SettingChanged")) {
@@ -599,12 +541,41 @@ void FreeDesktopPortalDesktop::_thread_monitor(void *p_ud) {
 							callable_mp(portal, &FreeDesktopPortalDesktop::_system_theme_changed_callback).call_deferred();
 						}
 					}
-					dbus_message_unref(msg);
-					break;
+				} else if (dbus_message_is_signal(msg, "org.freedesktop.portal.Request", "Response")) {
+					String path = String::utf8(dbus_message_get_path(msg));
+					MutexLock lock(portal->file_dialog_mutex);
+					for (int i = 0; i < portal->file_dialogs.size(); i++) {
+						FreeDesktopPortalDesktop::FileDialogData &fd = portal->file_dialogs.write[i];
+						if (fd.path == path) {
+							DBusMessageIter iter;
+							if (dbus_message_iter_init(msg, &iter)) {
+								bool cancel = false;
+								Vector<String> uris;
+								Dictionary options;
+								int index = 0;
+								file_chooser_parse_response(&iter, fd.filter_names, cancel, uris, index, options);
+
+								if (fd.callback.is_valid()) {
+									callable_mp(portal, &FreeDesktopPortalDesktop::_file_dialog_callback).call_deferred(fd.callback, !cancel, uris, index, options, fd.opt_in_cb);
+								}
+								if (fd.prev_focus != DisplayServer::INVALID_WINDOW_ID) {
+									callable_mp(DisplayServer::get_singleton(), &DisplayServer::window_move_to_foreground).call_deferred(fd.prev_focus);
+								}
+							}
+
+							DBusError err;
+							dbus_error_init(&err);
+							dbus_bus_remove_match(portal->monitor_connection, fd.filter.utf8().get_data(), &err);
+							dbus_error_free(&err);
+
+							portal->file_dialogs.remove_at(i);
+							break;
+						}
+					}
 				}
 				dbus_message_unref(msg);
 			}
-			dbus_connection_read_write(portal->theme_connection, 0);
+			dbus_connection_read_write(portal->monitor_connection, 0);
 		}
 
 		usleep(50000);
@@ -647,18 +618,18 @@ FreeDesktopPortalDesktop::FreeDesktopPortalDesktop() {
 
 	DBusError err;
 	dbus_error_init(&err);
-	theme_connection = dbus_bus_get(DBUS_BUS_SESSION, &err);
+	monitor_connection = dbus_bus_get(DBUS_BUS_SESSION, &err);
 	if (dbus_error_is_set(&err)) {
 		dbus_error_free(&err);
 	} else {
 		theme_path = "type='signal',sender='org.freedesktop.portal.Desktop',interface='org.freedesktop.portal.Settings',member='SettingChanged'";
-		dbus_bus_add_match(theme_connection, theme_path.utf8().get_data(), &err);
+		dbus_bus_add_match(monitor_connection, theme_path.utf8().get_data(), &err);
 		if (dbus_error_is_set(&err)) {
 			dbus_error_free(&err);
-			dbus_connection_unref(theme_connection);
-			theme_connection = nullptr;
+			dbus_connection_unref(monitor_connection);
+			monitor_connection = nullptr;
 		}
-		dbus_connection_read_write(theme_connection, 0);
+		dbus_connection_read_write(monitor_connection, 0);
 	}
 
 	if (!unsupported) {
@@ -673,21 +644,17 @@ FreeDesktopPortalDesktop::~FreeDesktopPortalDesktop() {
 		monitor_thread.wait_to_finish();
 	}
 
-	for (FreeDesktopPortalDesktop::FileDialogData &fd : file_dialogs) {
-		if (fd.connection) {
-			DBusError err;
+	if (monitor_connection) {
+		DBusError err;
+		for (FreeDesktopPortalDesktop::FileDialogData &fd : file_dialogs) {
 			dbus_error_init(&err);
-			dbus_bus_remove_match(fd.connection, fd.path.utf8().get_data(), &err);
+			dbus_bus_remove_match(monitor_connection, fd.filter.utf8().get_data(), &err);
 			dbus_error_free(&err);
-			dbus_connection_unref(fd.connection);
 		}
-	}
-	if (theme_connection) {
-		DBusError err;
 		dbus_error_init(&err);
-		dbus_bus_remove_match(theme_connection, theme_path.utf8().get_data(), &err);
+		dbus_bus_remove_match(monitor_connection, theme_path.utf8().get_data(), &err);
 		dbus_error_free(&err);
-		dbus_connection_unref(theme_connection);
+		dbus_connection_unref(monitor_connection);
 	}
 }
 
diff --git a/platform/linuxbsd/freedesktop_portal_desktop.h b/platform/linuxbsd/freedesktop_portal_desktop.h
index 75afe02a26..ba9c352e62 100644
--- a/platform/linuxbsd/freedesktop_portal_desktop.h
+++ b/platform/linuxbsd/freedesktop_portal_desktop.h
@@ -60,9 +60,9 @@ private:
 
 	struct FileDialogData {
 		Vector<String> filter_names;
-		DBusConnection *connection = nullptr;
 		DisplayServer::WindowID prev_focus = DisplayServer::INVALID_WINDOW_ID;
 		Callable callback;
+		String filter; // DBus match rule registered for this dialog's "Response" signal; kept so the match can be removed later.
 		String path;
 		bool opt_in_cb = false;
 	};
@@ -71,8 +71,8 @@ private:
 	Vector<FileDialogData> file_dialogs;
 	Thread monitor_thread;
 	SafeFlag monitor_thread_abort;
+	DBusConnection *monitor_connection = nullptr;
 
-	DBusConnection *theme_connection = nullptr;
 	String theme_path;
 	Callable system_theme_changed;
 	void _system_theme_changed_callback();
diff --git a/platform_methods.py b/platform_methods.py
index 37fc8a83ed..56115db4a4 100644
--- a/platform_methods.py
+++ b/platform_methods.py
@@ -5,6 +5,7 @@ import platform
 import uuid
 import functools
 import subprocess
+import methods
 
 # NOTE: The multiprocessing module is not compatible with SCons due to conflict on cPickle
 
@@ -65,10 +66,9 @@ def generate_export_icons(platform_path, platform_name):
 
             svg_str += '";\n'
 
-        # NOTE: It is safe to generate this file here, since this is still executed serially.
         wf = export_path + "/" + name + "_svg.gen.h"
-        with open(wf, "w", encoding="utf-8", newline="\n") as svgw:
-            svgw.write(svg_str)
+
+        methods.write_file_if_needed(wf, svg_str)
 
 
 def get_build_version(short):
diff --git a/scene/2d/tile_map.cpp b/scene/2d/tile_map.cpp
index f8737730ba..bbf1d09bbc 100644
--- a/scene/2d/tile_map.cpp
+++ b/scene/2d/tile_map.cpp
@@ -32,7 +32,7 @@
 #include "tile_map.compat.inc"
 
 #include "core/core_string_names.h"
-#include "scene/2d/tile_map_layer.h"
+#include "core/io/marshalls.h"
 #include "scene/gui/control.h"
 
 #define TILEMAP_CALL_FOR_LAYER(layer, function, ...) \
@@ -49,10 +49,118 @@
 	ERR_FAIL_INDEX_V(layer, (int)layers.size(), err_value);       \
 	return layers[layer]->function(__VA_ARGS__);
 
+void TileMap::_tile_set_changed() {
+	update_configuration_warnings(); // Connected to the assigned TileSet's "changed" notification by set_tileset().
+}
+
 void TileMap::_emit_changed() {
 	emit_signal(CoreStringNames::get_singleton()->changed);
 }
 
+void TileMap::_set_tile_map_data_using_compatibility_format(int p_layer, TileMapDataFormat p_format, const Vector<int> &p_data) {
+	ERR_FAIL_INDEX(p_layer, (int)layers.size());
+	ERR_FAIL_COND(p_format >= TileMapDataFormat::TILE_MAP_DATA_FORMAT_MAX);
+#ifdef DISABLE_DEPRECATED
+	ERR_FAIL_COND_MSG(p_format != (TileMapDataFormat)(TILE_MAP_DATA_FORMAT_MAX - 1), "Old TileMap data format detected despite DISABLE_DEPRECATED being set compilation time.");
+#endif // DISABLE_DEPRECATED
+
+	// Replaces the content of layer `p_layer` with the cells decoded from `p_data`, a flat int array holding one fixed-size record per cell.
+	int c = p_data.size();
+	const int *r = p_data.ptr();
+
+	int offset = (p_format >= TileMapDataFormat::TILE_MAP_DATA_FORMAT_2) ? 3 : 2; // Ints per cell record.
+	ERR_FAIL_COND_MSG(c % offset != 0, vformat("Corrupted tile data. Got size: %d. Expected modulo: %d", c, offset));
+
+	layers[p_layer]->clear(); // The previous content of the layer is discarded.
+
+	for (int i = 0; i < c; i += offset) {
+		const uint8_t *ptr = (const uint8_t *)&r[i];
+		uint8_t local[12]; // Byte copy of the current cell record (8 bytes for format 1, 12 otherwise).
+		for (int j = 0; j < ((p_format >= TileMapDataFormat::TILE_MAP_DATA_FORMAT_2) ? 12 : 8); j++) {
+			local[j] = ptr[j];
+		}
+
+#ifdef BIG_ENDIAN_ENABLED
+		SWAP(local[0], local[3]);
+		SWAP(local[1], local[2]);
+		SWAP(local[4], local[7]);
+		SWAP(local[5], local[6]);
+		// TODO: This byte swapping still needs to be checked on a big-endian target.
+		if (p_format >= TileMapDataFormat::TILE_MAP_DATA_FORMAT_2) {
+			SWAP(local[8], local[11]);
+			SWAP(local[9], local[10]);
+		}
+#endif
+		// Extracts position in TileMap.
+		int16_t x = decode_uint16(&local[0]);
+		int16_t y = decode_uint16(&local[2]);
+
+		if (p_format == TileMapDataFormat::TILE_MAP_DATA_FORMAT_3) {
+			uint16_t source_id = decode_uint16(&local[4]);
+			uint16_t atlas_coords_x = decode_uint16(&local[6]);
+			uint16_t atlas_coords_y = decode_uint16(&local[8]);
+			uint16_t alternative_tile = decode_uint16(&local[10]);
+			layers[p_layer]->set_cell(Vector2i(x, y), source_id, Vector2i(atlas_coords_x, atlas_coords_y), alternative_tile);
+		} else {
+#ifndef DISABLE_DEPRECATED
+			// Previous deprecated format.
+			uint32_t v = decode_uint32(&local[4]);
+			// Extract the transform flags that used to be in the tilemap.
+			bool flip_h = v & (1UL << 29);
+			bool flip_v = v & (1UL << 30);
+			bool transpose = v & (1UL << 31);
+			v &= (1UL << 29) - 1;
+
+			// Extract autotile/atlas coords.
+			int16_t coord_x = 0;
+			int16_t coord_y = 0;
+			if (p_format == TileMapDataFormat::TILE_MAP_DATA_FORMAT_2) {
+				coord_x = decode_uint16(&local[8]);
+				coord_y = decode_uint16(&local[10]);
+			}
+
+			if (tile_set.is_valid()) {
+				Array a = tile_set->compatibility_tilemap_map(v, Vector2i(coord_x, coord_y), flip_h, flip_v, transpose);
+				if (a.size() == 3) {
+					layers[p_layer]->set_cell(Vector2i(x, y), a[0], a[1], a[2]);
+				} else {
+					ERR_PRINT(vformat("No valid tile in Tileset for: tile:%s coords:%s flip_h:%s flip_v:%s transpose:%s", v, Vector2i(coord_x, coord_y), flip_h, flip_v, transpose));
+				}
+			} else {
+				int compatibility_alternative_tile = ((int)flip_h) + ((int)flip_v << 1) + ((int)transpose << 2);
+				layers[p_layer]->set_cell(Vector2i(x, y), v, Vector2i(coord_x, coord_y), compatibility_alternative_tile);
+			}
+#endif // DISABLE_DEPRECATED
+		}
+	}
+}
+
+Vector<int> TileMap::_get_tile_map_data_using_compatibility_format(int p_layer) const {
+	ERR_FAIL_INDEX_V(p_layer, (int)layers.size(), Vector<int>());
+
+	// Export every cell of layer `p_layer` to the raw format: 3 ints (12 bytes) per cell. Bound by const reference so the cell map is not copied if the getter returns a reference.
+	const HashMap<Vector2i, CellData> &tile_map_layer_data = layers[p_layer]->get_tile_map_layer_data();
+	Vector<int> tile_data;
+	tile_data.resize(tile_map_layer_data.size() * 3);
+	int *w = tile_data.ptrw();
+
+	// Save in highest format.
+
+	int idx = 0;
+	for (const KeyValue<Vector2i, CellData> &E : tile_map_layer_data) {
+		uint8_t *ptr = (uint8_t *)&w[idx]; // Byte view of this cell's 3-int record; six 16-bit fields follow.
+		encode_uint16((int16_t)(E.key.x), &ptr[0]);
+		encode_uint16((int16_t)(E.key.y), &ptr[2]);
+		encode_uint16(E.value.cell.source_id, &ptr[4]);
+		encode_uint16(E.value.cell.coord_x, &ptr[6]);
+		encode_uint16(E.value.cell.coord_y, &ptr[8]);
+		encode_uint16(E.value.cell.alternative_tile, &ptr[10]);
+		idx += 3;
+	}
+
+	return tile_data;
+}
+
 void TileMap::_notification(int p_what) {
 	switch (p_what) {
 		case TileMap::NOTIFICATION_INTERNAL_PHYSICS_PROCESS: {
@@ -199,6 +307,34 @@ void TileMap::draw_tile(RID p_canvas_item, const Vector2 &p_position, const Ref<
 	}
 }
 
+void TileMap::set_tileset(const Ref<TileSet> &p_tileset) {
+	if (tile_set == p_tileset) {
+		return;
+	}
+
+	// Swap the TileSet, moving the "changed" subscription from the old resource to the new one.
+	if (tile_set.is_valid()) {
+		tile_set->disconnect_changed(callable_mp(this, &TileMap::_tile_set_changed));
+	}
+
+	tile_set = p_tileset;
+
+	if (tile_set.is_valid()) {
+		tile_set->connect_changed(callable_mp(this, &TileMap::_tile_set_changed));
+	}
+
+	// Hand the new TileSet to every child that is a TileMapLayer.
+	for (int child_idx = 0; child_idx < get_child_count(); child_idx++) {
+		if (TileMapLayer *child_layer = Object::cast_to<TileMapLayer>(get_child(child_idx))) {
+			child_layer->set_tile_set(tile_set);
+		}
+	}
+}
+
+Ref<TileSet> TileMap::get_tileset() const {
+	return tile_set; // The TileSet last passed to set_tileset(); callers should check is_valid() as the code here does.
+}
+
 int TileMap::get_layers_count() const {
 	return layers.size();
 }
@@ -215,6 +351,7 @@ void TileMap::add_layer(int p_to_pos) {
 	layers.insert(p_to_pos, new_layer);
 	add_child(new_layer, false, INTERNAL_MODE_FRONT);
 	new_layer->set_name(vformat("Layer%d", p_to_pos));
+	new_layer->set_tile_set(tile_set);
 	move_child(new_layer, p_to_pos);
 	for (uint32_t i = 0; i < layers.size(); i++) {
 		layers[i]->set_as_tile_map_internal_node(i);
@@ -251,8 +388,11 @@ void TileMap::remove_layer(int p_layer) {
 	ERR_FAIL_INDEX(p_layer, (int)layers.size());
 
 	// Clear before removing the layer.
-	layers[p_layer]->queue_free();
+	TileMapLayer *removed = layers[p_layer];
 	layers.remove_at(p_layer);
+	remove_child(removed);
+	removed->queue_free();
+
 	for (uint32_t i = 0; i < layers.size(); i++) {
 		layers[i]->set_as_tile_map_internal_node(i);
 	}
@@ -349,7 +489,7 @@ void TileMap::set_collision_visibility_mode(TileMap::VisibilityMode p_show_colli
 	}
 	collision_visibility_mode = p_show_collision;
 	for (TileMapLayer *layer : layers) {
-		layer->set_collision_visibility_mode(TileMapLayer::VisibilityMode(p_show_collision));
+		layer->set_collision_visibility_mode(TileMapLayer::DebugVisibilityMode(p_show_collision));
 	}
 	_emit_changed();
 }
@@ -364,7 +504,7 @@ void TileMap::set_navigation_visibility_mode(TileMap::VisibilityMode p_show_navi
 	}
 	navigation_visibility_mode = p_show_navigation;
 	for (TileMapLayer *layer : layers) {
-		layer->set_navigation_visibility_mode(TileMapLayer::VisibilityMode(p_show_navigation));
+		layer->set_navigation_visibility_mode(TileMapLayer::DebugVisibilityMode(p_show_navigation));
 	}
 	_emit_changed();
 }
@@ -394,19 +534,85 @@ void TileMap::erase_cell(int p_layer, const Vector2i &p_coords) {
 }
 
 int TileMap::get_cell_source_id(int p_layer, const Vector2i &p_coords, bool p_use_proxies) const {
-	TILEMAP_CALL_FOR_LAYER_V(p_layer, TileSet::INVALID_SOURCE, get_cell_source_id, p_coords, p_use_proxies);
+	if (p_use_proxies && tile_set.is_valid()) { // Proxies are resolved here, against the TileMap's own TileSet.
+		if (p_layer < 0) {
+			p_layer = layers.size() + p_layer; // Negative indices count back from the last layer.
+		}
+		ERR_FAIL_INDEX_V(p_layer, (int)layers.size(), TileSet::INVALID_SOURCE);
+
+		int source_id = layers[p_layer]->get_cell_source_id(p_coords);
+		Vector2i atlas_coords = layers[p_layer]->get_cell_atlas_coords(p_coords);
+		int alternative_id = layers[p_layer]->get_cell_alternative_tile(p_coords);
+
+		Array arr = tile_set->map_tile_proxy(source_id, atlas_coords, alternative_id); // Expected to hold [source id, atlas coords, alternative tile].
+		ERR_FAIL_COND_V(arr.size() != 3, TileSet::INVALID_SOURCE);
+		return arr[0];
+	} else {
+		TILEMAP_CALL_FOR_LAYER_V(p_layer, TileSet::INVALID_SOURCE, get_cell_source_id, p_coords);
+	}
 }
 
 Vector2i TileMap::get_cell_atlas_coords(int p_layer, const Vector2i &p_coords, bool p_use_proxies) const {
-	TILEMAP_CALL_FOR_LAYER_V(p_layer, TileSetSource::INVALID_ATLAS_COORDS, get_cell_atlas_coords, p_coords, p_use_proxies);
+	if (p_use_proxies && tile_set.is_valid()) { // Proxies are resolved here, against the TileMap's own TileSet.
+		if (p_layer < 0) {
+			p_layer = layers.size() + p_layer; // Negative indices count back from the last layer.
+		}
+		ERR_FAIL_INDEX_V(p_layer, (int)layers.size(), TileSetSource::INVALID_ATLAS_COORDS);
+
+		int source_id = layers[p_layer]->get_cell_source_id(p_coords);
+		Vector2i atlas_coords = layers[p_layer]->get_cell_atlas_coords(p_coords);
+		int alternative_id = layers[p_layer]->get_cell_alternative_tile(p_coords);
+
+		Array arr = tile_set->map_tile_proxy(source_id, atlas_coords, alternative_id); // Expected to hold [source id, atlas coords, alternative tile].
+		ERR_FAIL_COND_V(arr.size() != 3, TileSetSource::INVALID_ATLAS_COORDS);
+		return arr[1];
+	} else {
+		TILEMAP_CALL_FOR_LAYER_V(p_layer, TileSetSource::INVALID_ATLAS_COORDS, get_cell_atlas_coords, p_coords);
+	}
 }
 
 int TileMap::get_cell_alternative_tile(int p_layer, const Vector2i &p_coords, bool p_use_proxies) const {
-	TILEMAP_CALL_FOR_LAYER_V(p_layer, TileSetSource::INVALID_TILE_ALTERNATIVE, get_cell_alternative_tile, p_coords, p_use_proxies);
+	if (p_use_proxies && tile_set.is_valid()) { // Proxies are resolved here, against the TileMap's own TileSet.
+		if (p_layer < 0) {
+			p_layer = layers.size() + p_layer; // Negative indices count back from the last layer.
+		}
+		ERR_FAIL_INDEX_V(p_layer, (int)layers.size(), TileSetSource::INVALID_TILE_ALTERNATIVE);
+
+		int source_id = layers[p_layer]->get_cell_source_id(p_coords);
+		Vector2i atlas_coords = layers[p_layer]->get_cell_atlas_coords(p_coords);
+		int alternative_id = layers[p_layer]->get_cell_alternative_tile(p_coords);
+
+		Array arr = tile_set->map_tile_proxy(source_id, atlas_coords, alternative_id); // Expected to hold [source id, atlas coords, alternative tile].
+		ERR_FAIL_COND_V(arr.size() != 3, TileSetSource::INVALID_TILE_ALTERNATIVE);
+		return arr[2];
+	} else {
+		TILEMAP_CALL_FOR_LAYER_V(p_layer, TileSetSource::INVALID_TILE_ALTERNATIVE, get_cell_alternative_tile, p_coords);
+	}
 }
 
 TileData *TileMap::get_cell_tile_data(int p_layer, const Vector2i &p_coords, bool p_use_proxies) const {
-	TILEMAP_CALL_FOR_LAYER_V(p_layer, nullptr, get_cell_tile_data, p_coords, p_use_proxies);
+	if (p_use_proxies && tile_set.is_valid()) { // Proxies are resolved here, against the TileMap's own TileSet.
+		if (p_layer < 0) {
+			p_layer = layers.size() + p_layer; // Negative indices count back from the last layer.
+		}
+		ERR_FAIL_INDEX_V(p_layer, (int)layers.size(), nullptr);
+
+		int source_id = layers[p_layer]->get_cell_source_id(p_coords);
+		Vector2i atlas_coords = layers[p_layer]->get_cell_atlas_coords(p_coords);
+		int alternative_id = layers[p_layer]->get_cell_alternative_tile(p_coords);
+
+		Array arr = tile_set->map_tile_proxy(source_id, atlas_coords, alternative_id); // Expected to hold [source id, atlas coords, alternative tile].
+		ERR_FAIL_COND_V(arr.size() != 3, nullptr);
+
+		Ref<TileSetAtlasSource> atlas_source = tile_set->get_source(arr[0]); // Stays null when the mapped source is not an atlas source.
+		if (atlas_source.is_valid()) {
+			return atlas_source->get_tile_data(arr[1], arr[2]);
+		} else {
+			return nullptr;
+		}
+	} else {
+		TILEMAP_CALL_FOR_LAYER_V(p_layer, nullptr, get_cell_tile_data, p_coords);
+	}
 }
 
 Ref<TileMapPattern> TileMap::get_pattern(int p_layer, TypedArray<Vector2i> p_coords_array) {
@@ -451,7 +657,10 @@ void TileMap::set_cells_terrain_path(int p_layer, TypedArray<Vector2i> p_path, i
 }
 
 TileMapCell TileMap::get_cell(int p_layer, const Vector2i &p_coords, bool p_use_proxies) const {
-	TILEMAP_CALL_FOR_LAYER_V(p_layer, TileMapCell(), get_cell, p_coords, p_use_proxies);
+	if (p_use_proxies) {
+		WARN_DEPRECATED_MSG("use_proxies is deprecated."); // The flag is only reported; it is not forwarded to the layer below.
+	}
+	TILEMAP_CALL_FOR_LAYER_V(p_layer, TileMapCell(), get_cell, p_coords);
 }
 
 Vector2i TileMap::get_coords_for_body_rid(RID p_physics_body) {
@@ -478,6 +687,13 @@ void TileMap::fix_invalid_tiles() {
 	}
 }
 
+#ifdef TOOLS_ENABLED
+TileMapLayer *TileMap::duplicate_layer_from_internal(int p_layer) {
+	ERR_FAIL_INDEX_V(p_layer, (int)layers.size(), nullptr); // Out-of-range index: report the error and return nullptr.
+	return Object::cast_to<TileMapLayer>(layers[p_layer]->duplicate(DUPLICATE_USE_INSTANTIATION | DUPLICATE_FROM_EDITOR)); // Returns a duplicate of the internal layer, not the layer itself.
+}
+#endif // TOOLS_ENABLED
+
 void TileMap::clear_layer(int p_layer) {
 	TILEMAP_CALL_FOR_LAYER(p_layer, clear)
 }
@@ -540,10 +756,11 @@ bool TileMap::_set(const StringName &p_name, const Variant &p_value) {
 				add_child(new_layer, false, INTERNAL_MODE_FRONT);
 				new_layer->set_as_tile_map_internal_node(0);
 				new_layer->set_name("Layer0");
+				new_layer->set_tile_set(tile_set);
 				new_layer->connect(CoreStringNames::get_singleton()->changed, callable_mp(this, &TileMap::_emit_changed));
 				layers.push_back(new_layer);
 			}
-			layers[0]->set_tile_data(format, p_value);
+			_set_tile_map_data_using_compatibility_format(0, format, p_value);
 			_emit_changed();
 			return true;
 		}
@@ -565,6 +782,7 @@ bool TileMap::_set(const StringName &p_name, const Variant &p_value) {
 				add_child(new_layer, false, INTERNAL_MODE_FRONT);
 				new_layer->set_as_tile_map_internal_node(index);
 				new_layer->set_name(vformat("Layer%d", index));
+				new_layer->set_tile_set(tile_set);
 				new_layer->connect(CoreStringNames::get_singleton()->changed, callable_mp(this, &TileMap::_emit_changed));
 				layers.push_back(new_layer);
 			}
@@ -596,7 +814,7 @@ bool TileMap::_set(const StringName &p_name, const Variant &p_value) {
 			set_layer_navigation_enabled(index, p_value);
 			return true;
 		} else if (components[1] == "tile_data") {
-			layers[index]->set_tile_data(format, p_value);
+			_set_tile_map_data_using_compatibility_format(index, format, p_value);
 			_emit_changed();
 			return true;
 		} else {
@@ -609,7 +827,7 @@ bool TileMap::_set(const StringName &p_name, const Variant &p_value) {
 bool TileMap::_get(const StringName &p_name, Variant &r_ret) const {
 	Vector<String> components = String(p_name).split("/", true, 2);
 	if (p_name == "format") {
-		r_ret = TileMapDataFormat::FORMAT_MAX - 1; // When saving, always save highest format.
+		r_ret = TileMapDataFormat::TILE_MAP_DATA_FORMAT_MAX - 1; // When saving, always save highest format.
 		return true;
 	}
 #ifndef DISABLE_DEPRECATED
@@ -646,7 +864,7 @@ bool TileMap::_get(const StringName &p_name, Variant &r_ret) const {
 			r_ret = is_layer_navigation_enabled(index);
 			return true;
 		} else if (components[1] == "tile_data") {
-			r_ret = layers[index]->get_tile_data();
+			r_ret = _get_tile_map_data_using_compatibility_format(index);
 			return true;
 		} else {
 			return false;
@@ -899,6 +1117,9 @@ void TileMap::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("force_update", "layer"), &TileMap::force_update, DEFVAL(-1));
 #endif // DISABLE_DEPRECATED
 
+	ClassDB::bind_method(D_METHOD("set_tileset", "tileset"), &TileMap::set_tileset);
+	ClassDB::bind_method(D_METHOD("get_tileset"), &TileMap::get_tileset);
+
 	ClassDB::bind_method(D_METHOD("set_rendering_quadrant_size", "size"), &TileMap::set_rendering_quadrant_size);
 	ClassDB::bind_method(D_METHOD("get_rendering_quadrant_size"), &TileMap::get_rendering_quadrant_size);
 
@@ -969,6 +1190,7 @@ void TileMap::_bind_methods() {
 	GDVIRTUAL_BIND(_use_tile_data_runtime_update, "layer", "coords");
 	GDVIRTUAL_BIND(_tile_data_runtime_update, "layer", "coords", "tile_data");
 
+	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "tile_set", PROPERTY_HINT_RESOURCE_TYPE, "TileSet"), "set_tileset", "get_tileset");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "rendering_quadrant_size", PROPERTY_HINT_RANGE, "1,128,1"), "set_rendering_quadrant_size", "get_rendering_quadrant_size");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "collision_animatable"), "set_collision_animatable", "is_collision_animatable");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_visibility_mode", PROPERTY_HINT_ENUM, "Default,Force Show,Force Hide"), "set_collision_visibility_mode", "get_collision_visibility_mode");
@@ -976,7 +1198,7 @@ void TileMap::_bind_methods() {
 
 	ADD_ARRAY("layers", "layer_");
 
-	ADD_PROPERTY_DEFAULT("format", TileMapDataFormat::FORMAT_1);
+	ADD_PROPERTY_DEFAULT("format", TileMapDataFormat::TILE_MAP_DATA_FORMAT_1);
 
 	ADD_SIGNAL(MethodInfo(CoreStringNames::get_singleton()->changed));
 
@@ -990,6 +1212,7 @@ TileMap::TileMap() {
 	add_child(new_layer, false, INTERNAL_MODE_FRONT);
 	new_layer->set_as_tile_map_internal_node(0);
 	new_layer->set_name("Layer0");
+	new_layer->set_tile_set(tile_set);
 	new_layer->connect(CoreStringNames::get_singleton()->changed, callable_mp(this, &TileMap::_emit_changed));
 	layers.push_back(new_layer);
 	default_layer = memnew(TileMapLayer);
diff --git a/scene/2d/tile_map.h b/scene/2d/tile_map.h
index edea90fa95..41068ea978 100644
--- a/scene/2d/tile_map.h
+++ b/scene/2d/tile_map.h
@@ -31,7 +31,7 @@
 #ifndef TILE_MAP_H
 #define TILE_MAP_H
 
-#include "scene/2d/tile_map_layer_group.h"
+#include "scene/2d/tile_map_layer.h"
 #include "scene/resources/2d/tile_set.h"
 
 class Control;
@@ -39,14 +39,14 @@ class TileMapLayer;
 class TerrainConstraint;
 
 enum TileMapDataFormat {
-	FORMAT_1 = 0,
-	FORMAT_2,
-	FORMAT_3,
-	FORMAT_MAX,
+	TILE_MAP_DATA_FORMAT_1 = 0, // Legacy layout: 2 ints per cell.
+	TILE_MAP_DATA_FORMAT_2, // Legacy layout: 3 ints per cell, adding autotile/atlas coordinates.
+	TILE_MAP_DATA_FORMAT_3, // Current layout: 3 ints per cell holding position, source id, atlas coordinates and alternative tile.
+	TILE_MAP_DATA_FORMAT_MAX, // Number of formats; MAX - 1 is the format written when saving.
 };
 
-class TileMap : public TileMapLayerGroup {
-	GDCLASS(TileMap, TileMapLayerGroup)
+class TileMap : public Node2D {
+	GDCLASS(TileMap, Node2D)
 
 public:
 	// Kept for compatibility, but should use TileMapLayer::VisibilityMode instead.
@@ -60,11 +60,12 @@ private:
 	friend class TileSetPlugin;
 
 	// A compatibility enum to specify how is the data if formatted.
-	mutable TileMapDataFormat format = TileMapDataFormat::FORMAT_3;
+	mutable TileMapDataFormat format = TileMapDataFormat::TILE_MAP_DATA_FORMAT_3;
 
 	static constexpr float FP_ADJUST = 0.00001;
 
 	// Properties.
+	Ref<TileSet> tile_set;
 	int rendering_quadrant_size = 16;
 	bool collision_animatable = false;
 	VisibilityMode collision_visibility_mode = VISIBILITY_MODE_DEFAULT;
@@ -78,8 +79,14 @@ private:
 	Transform2D last_valid_transform;
 	Transform2D new_transform;
 
+	void _tile_set_changed();
+
 	void _emit_changed();
 
+	// Kept for compatibility with TileMap. With TileMapLayers as individual nodes, the format is stored directly in the array.
+	void _set_tile_map_data_using_compatibility_format(int p_layer, TileMapDataFormat p_format, const Vector<int> &p_data);
+	Vector<int> _get_tile_map_data_using_compatibility_format(int p_layer) const;
+
 protected:
 	bool _set(const StringName &p_name, const Variant &p_value);
 	bool _get(const StringName &p_name, Variant &r_ret) const;
@@ -114,6 +121,10 @@ public:
 
 	static void draw_tile(RID p_canvas_item, const Vector2 &p_position, const Ref<TileSet> p_tile_set, int p_atlas_source_id, const Vector2i &p_atlas_coords, int p_alternative_tile, int p_frame = -1, Color p_modulation = Color(1.0, 1.0, 1.0, 1.0), const TileData *p_tile_data_override = nullptr, real_t p_normalized_animation_offset = 0.0);
 
+	// Accessors.
+	void set_tileset(const Ref<TileSet> &p_tileset);
+	Ref<TileSet> get_tileset() const;
+
 	// Layers management.
 	int get_layers_count() const;
 	void add_layer(int p_to_pos);
@@ -200,6 +211,11 @@ public:
 	// Fixing and clearing methods.
 	void fix_invalid_tiles();
 
+#ifdef TOOLS_ENABLED
+	// Moving layers outside of TileMap.
+	TileMapLayer *duplicate_layer_from_internal(int p_layer);
+#endif // TOOLS_ENABLED
+
 	// Clears tiles from a given layer.
 	void clear_layer(int p_layer);
 	void clear();
diff --git a/scene/2d/tile_map_layer.cpp b/scene/2d/tile_map_layer.cpp
index 84c4c43a5a..3f10eb7661 100644
--- a/scene/2d/tile_map_layer.cpp
+++ b/scene/2d/tile_map_layer.cpp
@@ -32,6 +32,7 @@
 
 #include "core/core_string_names.h"
 #include "core/io/marshalls.h"
+#include "scene/2d/tile_map.h"
 #include "scene/gui/control.h"
 #include "scene/resources/world_2d.h"
 #include "servers/navigation_server_2d.h"
@@ -51,11 +52,10 @@ Vector2i TileMapLayer::_coords_to_debug_quadrant_coords(const Vector2i &p_coords
 }
 
 void TileMapLayer::_debug_update() {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	RenderingServer *rs = RenderingServer::get_singleton();
 
 	// Check if we should cleanup everything.
-	bool forced_cleanup = in_destructor || !enabled || !tile_set.is_valid() || !is_visible_in_tree();
+	bool forced_cleanup = in_destructor || !enabled || tile_set.is_null() || !is_visible_in_tree();
 
 	if (forced_cleanup) {
 		for (KeyValue<Vector2i, Ref<DebugQuadrant>> &kv : debug_quadrant_map) {
@@ -84,7 +84,7 @@ void TileMapLayer::_debug_update() {
 
 	if (_debug_was_cleaned_up || anything_changed) {
 		// Update all cells.
-		for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+		for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 			CellData &cell_data = kv.value;
 			_debug_quadrants_update_cell(cell_data, dirty_debug_quadrant_list);
 		}
@@ -179,33 +179,21 @@ void TileMapLayer::_debug_quadrants_update_cell(CellData &r_cell_data, SelfList<
 
 /////////////////////////////// Rendering //////////////////////////////////////
 void TileMapLayer::_rendering_update() {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	RenderingServer *rs = RenderingServer::get_singleton();
 
 	// Check if we should cleanup everything.
-	bool forced_cleanup = in_destructor || !enabled || !tile_set.is_valid() || !is_visible_in_tree();
+	bool forced_cleanup = in_destructor || !enabled || tile_set.is_null() || !is_visible_in_tree();
 
 	// ----------- Layer level processing -----------
 	if (!forced_cleanup) {
 		// Modulate the layer.
 		Color layer_modulate = get_modulate();
 #ifdef TOOLS_ENABLED
-		const TileMapLayerGroup *tile_map_layer_group = Object::cast_to<TileMapLayerGroup>(get_parent());
-		if (tile_map_layer_group) {
-			const Vector<StringName> selected_layers = tile_map_layer_group->get_selected_layers();
-			if (tile_map_layer_group->is_highlighting_selected_layer() && selected_layers.size() == 1 && get_name() != selected_layers[0]) {
-				TileMapLayer *selected_layer = Object::cast_to<TileMapLayer>(tile_map_layer_group->get_node_or_null(String(selected_layers[0])));
-				if (selected_layer) {
-					int z_selected = selected_layer->get_z_index();
-					int layer_z_index = get_z_index();
-					if (layer_z_index < z_selected || (layer_z_index == z_selected && get_index() < selected_layer->get_index())) {
-						layer_modulate = layer_modulate.darkened(0.5);
-					} else if (layer_z_index > z_selected || (layer_z_index == z_selected && get_index() > selected_layer->get_index())) {
-						layer_modulate = layer_modulate.darkened(0.5);
-						layer_modulate.a *= 0.3;
-					}
-				}
-			}
+		if (highlight_mode == HIGHLIGHT_MODE_BELOW) {
+			layer_modulate = layer_modulate.darkened(0.5);
+		} else if (highlight_mode == HIGHLIGHT_MODE_ABOVE) {
+			layer_modulate = layer_modulate.darkened(0.5);
+			layer_modulate.a *= 0.3;
 		}
 #endif // TOOLS_ENABLED
 		rs->canvas_item_set_modulate(get_canvas_item(), layer_modulate);
@@ -219,7 +207,7 @@ void TileMapLayer::_rendering_update() {
 	// Check if anything changed that might change the quadrant shape.
 	// If so, recreate everything.
 	bool quandrant_shape_changed = dirty.flags[DIRTY_FLAGS_LAYER_RENDERING_QUADRANT_SIZE] ||
-			(is_y_sort_enabled() && (dirty.flags[DIRTY_FLAGS_LAYER_Y_SORT_ENABLED] || dirty.flags[DIRTY_FLAGS_LAYER_Y_SORT_ORIGIN] || dirty.flags[DIRTY_FLAGS_LAYER_LOCAL_TRANSFORM] || dirty.flags[DIRTY_FLAGS_LAYER_GROUP_TILE_SET]));
+			(is_y_sort_enabled() && (dirty.flags[DIRTY_FLAGS_LAYER_Y_SORT_ENABLED] || dirty.flags[DIRTY_FLAGS_LAYER_Y_SORT_ORIGIN] || dirty.flags[DIRTY_FLAGS_LAYER_LOCAL_TRANSFORM] || dirty.flags[DIRTY_FLAGS_TILE_SET]));
 
 	// Free all quadrants.
 	if (forced_cleanup || quandrant_shape_changed) {
@@ -238,9 +226,9 @@ void TileMapLayer::_rendering_update() {
 
 	if (!forced_cleanup) {
 		// List all quadrants to update, recreating them if needed.
-		if (dirty.flags[DIRTY_FLAGS_LAYER_GROUP_TILE_SET] || dirty.flags[DIRTY_FLAGS_LAYER_IN_TREE] || _rendering_was_cleaned_up) {
+		if (dirty.flags[DIRTY_FLAGS_TILE_SET] || dirty.flags[DIRTY_FLAGS_LAYER_IN_TREE] || _rendering_was_cleaned_up) {
 			// Update all cells.
-			for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+			for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 				CellData &cell_data = kv.value;
 				_rendering_quadrants_update_cell(cell_data, dirty_rendering_quadrant_list);
 			}
@@ -420,13 +408,13 @@ void TileMapLayer::_rendering_update() {
 	// ----------- Occluders processing -----------
 	if (forced_cleanup) {
 		// Clean everything.
-		for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+		for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 			_rendering_occluders_clear_cell(kv.value);
 		}
 	} else {
-		if (_rendering_was_cleaned_up || dirty.flags[DIRTY_FLAGS_LAYER_GROUP_TILE_SET]) {
+		if (_rendering_was_cleaned_up || dirty.flags[DIRTY_FLAGS_TILE_SET]) {
 			// Update all cells.
-			for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+			for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 				_rendering_occluders_update_cell(kv.value);
 			}
 		} else {
@@ -445,11 +433,10 @@ void TileMapLayer::_rendering_update() {
 
 void TileMapLayer::_rendering_notification(int p_what) {
 	RenderingServer *rs = RenderingServer::get_singleton();
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	if (p_what == NOTIFICATION_TRANSFORM_CHANGED || p_what == NOTIFICATION_ENTER_CANVAS || p_what == NOTIFICATION_VISIBILITY_CHANGED) {
 		if (tile_set.is_valid()) {
 			Transform2D tilemap_xform = get_global_transform();
-			for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+			for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 				const CellData &cell_data = kv.value;
 				for (const RID &occluder : cell_data.occluders) {
 					if (occluder.is_null()) {
@@ -474,8 +461,6 @@ void TileMapLayer::_rendering_notification(int p_what) {
 }
 
 void TileMapLayer::_rendering_quadrants_update_cell(CellData &r_cell_data, SelfList<RenderingQuadrant>::List &r_dirty_rendering_quadrant_list) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-
 	// Check if the cell is valid and retrieve its y_sort_origin.
 	bool is_valid = false;
 	int tile_y_sort_origin = 0;
@@ -573,7 +558,6 @@ void TileMapLayer::_rendering_occluders_clear_cell(CellData &r_cell_data) {
 }
 
 void TileMapLayer::_rendering_occluders_update_cell(CellData &r_cell_data) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	RenderingServer *rs = RenderingServer::get_singleton();
 
 	// Free unused occluders then resize the occluders array.
@@ -642,8 +626,7 @@ void TileMapLayer::_rendering_occluders_update_cell(CellData &r_cell_data) {
 
 #ifdef DEBUG_ENABLED
 void TileMapLayer::_rendering_draw_cell_debug(const RID &p_canvas_item, const Vector2 &p_quadrant_pos, const CellData &r_cell_data) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	ERR_FAIL_COND(!tile_set.is_valid());
+	ERR_FAIL_COND(tile_set.is_null());
 
 	if (!Engine::get_singleton()->is_editor_hint()) {
 		return;
@@ -691,19 +674,17 @@ void TileMapLayer::_rendering_draw_cell_debug(const RID &p_canvas_item, const Ve
 /////////////////////////////// Physics //////////////////////////////////////
 
 void TileMapLayer::_physics_update() {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-
 	// Check if we should cleanup everything.
-	bool forced_cleanup = in_destructor || !enabled || !is_inside_tree() || !tile_set.is_valid();
+	bool forced_cleanup = in_destructor || !enabled || !collision_enabled || !is_inside_tree() || tile_set.is_null();
 	if (forced_cleanup) {
 		// Clean everything.
-		for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+		for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 			_physics_clear_cell(kv.value);
 		}
 	} else {
-		if (_physics_was_cleaned_up || dirty.flags[DIRTY_FLAGS_LAYER_GROUP_TILE_SET] || dirty.flags[DIRTY_FLAGS_LAYER_USE_KINEMATIC_BODIES] || dirty.flags[DIRTY_FLAGS_LAYER_IN_TREE]) {
+		if (_physics_was_cleaned_up || dirty.flags[DIRTY_FLAGS_TILE_SET] || dirty.flags[DIRTY_FLAGS_LAYER_USE_KINEMATIC_BODIES] || dirty.flags[DIRTY_FLAGS_LAYER_IN_TREE]) {
 			// Update all cells.
-			for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+			for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 				_physics_update_cell(kv.value);
 			}
 		} else {
@@ -721,7 +702,6 @@ void TileMapLayer::_physics_update() {
 }
 
 void TileMapLayer::_physics_notification(int p_what) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	Transform2D gl_transform = get_global_transform();
 	PhysicsServer2D *ps = PhysicsServer2D::get_singleton();
 
@@ -729,7 +709,7 @@ void TileMapLayer::_physics_notification(int p_what) {
 		case NOTIFICATION_TRANSFORM_CHANGED:
 			// Move the collisison shapes along with the TileMap.
 			if (is_inside_tree() && tile_set.is_valid()) {
-				for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+				for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 					const CellData &cell_data = kv.value;
 
 					for (RID body : cell_data.bodies) {
@@ -747,7 +727,7 @@ void TileMapLayer::_physics_notification(int p_what) {
 			if (is_inside_tree()) {
 				RID space = get_world_2d()->get_space();
 
-				for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+				for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 					const CellData &cell_data = kv.value;
 
 					for (RID body : cell_data.bodies) {
@@ -774,7 +754,6 @@ void TileMapLayer::_physics_clear_cell(CellData &r_cell_data) {
 }
 
 void TileMapLayer::_physics_update_cell(CellData &r_cell_data) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	Transform2D gl_transform = get_global_transform();
 	RID space = get_world_2d()->get_space();
 	PhysicsServer2D *ps = PhysicsServer2D::get_singleton();
@@ -891,8 +870,7 @@ void TileMapLayer::_physics_update_cell(CellData &r_cell_data) {
 #ifdef DEBUG_ENABLED
 void TileMapLayer::_physics_draw_cell_debug(const RID &p_canvas_item, const Vector2 &p_quadrant_pos, const CellData &r_cell_data) {
 	// Draw the debug collision shapes.
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	ERR_FAIL_COND(!tile_set.is_valid());
+	ERR_FAIL_COND(tile_set.is_null());
 
 	if (!get_tree()) {
 		return;
@@ -900,13 +878,13 @@ void TileMapLayer::_physics_draw_cell_debug(const RID &p_canvas_item, const Vect
 
 	bool show_collision = false;
 	switch (collision_visibility_mode) {
-		case TileMapLayer::VISIBILITY_MODE_DEFAULT:
+		case TileMapLayer::DEBUG_VISIBILITY_MODE_DEFAULT:
 			show_collision = !Engine::get_singleton()->is_editor_hint() && get_tree()->is_debugging_collisions_hint();
 			break;
-		case TileMapLayer::VISIBILITY_MODE_FORCE_HIDE:
+		case TileMapLayer::DEBUG_VISIBILITY_MODE_FORCE_HIDE:
 			show_collision = false;
 			break;
-		case TileMapLayer::VISIBILITY_MODE_FORCE_SHOW:
+		case TileMapLayer::DEBUG_VISIBILITY_MODE_FORCE_SHOW:
 			show_collision = true;
 			break;
 	}
@@ -948,10 +926,9 @@ void TileMapLayer::_physics_draw_cell_debug(const RID &p_canvas_item, const Vect
 void TileMapLayer::_navigation_update() {
 	ERR_FAIL_NULL(NavigationServer2D::get_singleton());
 	NavigationServer2D *ns = NavigationServer2D::get_singleton();
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 
 	// Check if we should cleanup everything.
-	bool forced_cleanup = in_destructor || !enabled || !navigation_enabled || !is_inside_tree() || !tile_set.is_valid();
+	bool forced_cleanup = in_destructor || !enabled || !navigation_enabled || !is_inside_tree() || tile_set.is_null();
 
 	// ----------- Layer level processing -----------
 	// All this processing is kept for compatibility with the TileMap node.
@@ -980,13 +957,13 @@ void TileMapLayer::_navigation_update() {
 	// ----------- Navigation regions processing -----------
 	if (forced_cleanup) {
 		// Clean everything.
-		for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+		for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 			_navigation_clear_cell(kv.value);
 		}
 	} else {
-		if (_navigation_was_cleaned_up || dirty.flags[DIRTY_FLAGS_LAYER_GROUP_TILE_SET] || dirty.flags[DIRTY_FLAGS_LAYER_IN_TREE] || dirty.flags[DIRTY_FLAGS_LAYER_NAVIGATION_MAP]) {
+		if (_navigation_was_cleaned_up || dirty.flags[DIRTY_FLAGS_TILE_SET] || dirty.flags[DIRTY_FLAGS_LAYER_IN_TREE] || dirty.flags[DIRTY_FLAGS_LAYER_NAVIGATION_MAP]) {
 			// Update all cells.
-			for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+			for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 				_navigation_update_cell(kv.value);
 			}
 		} else {
@@ -1004,11 +981,10 @@ void TileMapLayer::_navigation_update() {
 }
 
 void TileMapLayer::_navigation_notification(int p_what) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	if (p_what == NOTIFICATION_TRANSFORM_CHANGED) {
 		if (tile_set.is_valid()) {
 			Transform2D tilemap_xform = get_global_transform();
-			for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+			for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 				const CellData &cell_data = kv.value;
 				// Update navigation regions transform.
 				for (const RID &region : cell_data.navigation_regions) {
@@ -1038,7 +1014,6 @@ void TileMapLayer::_navigation_clear_cell(CellData &r_cell_data) {
 }
 
 void TileMapLayer::_navigation_update_cell(CellData &r_cell_data) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	NavigationServer2D *ns = NavigationServer2D::get_singleton();
 	Transform2D gl_xform = get_global_transform();
 	RID navigation_map = navigation_map_override.is_valid() ? navigation_map_override : get_world_2d()->get_navigation_map();
@@ -1119,13 +1094,13 @@ void TileMapLayer::_navigation_draw_cell_debug(const RID &p_canvas_item, const V
 	// Draw the debug collision shapes.
 	bool show_navigation = false;
 	switch (navigation_visibility_mode) {
-		case TileMapLayer::VISIBILITY_MODE_DEFAULT:
+		case TileMapLayer::DEBUG_VISIBILITY_MODE_DEFAULT:
 			show_navigation = !Engine::get_singleton()->is_editor_hint() && get_tree()->is_debugging_navigation_hint();
 			break;
-		case TileMapLayer::VISIBILITY_MODE_FORCE_HIDE:
+		case TileMapLayer::DEBUG_VISIBILITY_MODE_FORCE_HIDE:
 			show_navigation = false;
 			break;
-		case TileMapLayer::VISIBILITY_MODE_FORCE_SHOW:
+		case TileMapLayer::DEBUG_VISIBILITY_MODE_FORCE_SHOW:
 			show_navigation = true;
 			break;
 	}
@@ -1138,8 +1113,6 @@ void TileMapLayer::_navigation_draw_cell_debug(const RID &p_canvas_item, const V
 		return;
 	}
 
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-
 	RenderingServer *rs = RenderingServer::get_singleton();
 	const NavigationServer2D *ns2d = NavigationServer2D::get_singleton();
 
@@ -1224,20 +1197,18 @@ void TileMapLayer::_navigation_draw_cell_debug(const RID &p_canvas_item, const V
 /////////////////////////////// Scenes //////////////////////////////////////
 
 void TileMapLayer::_scenes_update() {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-
 	// Check if we should cleanup everything.
-	bool forced_cleanup = in_destructor || !enabled || !is_inside_tree() || !tile_set.is_valid();
+	bool forced_cleanup = in_destructor || !enabled || !is_inside_tree() || tile_set.is_null();
 
 	if (forced_cleanup) {
 		// Clean everything.
-		for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+		for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 			_scenes_clear_cell(kv.value);
 		}
 	} else {
-		if (_scenes_was_cleaned_up || dirty.flags[DIRTY_FLAGS_LAYER_GROUP_TILE_SET] || dirty.flags[DIRTY_FLAGS_LAYER_IN_TREE]) {
+		if (_scenes_was_cleaned_up || dirty.flags[DIRTY_FLAGS_TILE_SET] || dirty.flags[DIRTY_FLAGS_LAYER_IN_TREE]) {
 			// Update all cells.
-			for (KeyValue<Vector2i, CellData> &kv : tile_map) {
+			for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
 				_scenes_update_cell(kv.value);
 			}
 		} else {
@@ -1270,8 +1241,6 @@ void TileMapLayer::_scenes_clear_cell(CellData &r_cell_data) {
 }
 
 void TileMapLayer::_scenes_update_cell(CellData &r_cell_data) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-
 	// Clear the scene in any case.
 	_scenes_clear_cell(r_cell_data);
 
@@ -1312,8 +1281,7 @@ void TileMapLayer::_scenes_update_cell(CellData &r_cell_data) {
 
 #ifdef DEBUG_ENABLED
 void TileMapLayer::_scenes_draw_cell_debug(const RID &p_canvas_item, const Vector2 &p_quadrant_pos, const CellData &r_cell_data) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	ERR_FAIL_COND(!tile_set.is_valid());
+	ERR_FAIL_COND(tile_set.is_null());
 
 	if (!Engine::get_singleton()->is_editor_hint()) {
 		return;
@@ -1362,22 +1330,20 @@ void TileMapLayer::_scenes_draw_cell_debug(const RID &p_canvas_item, const Vecto
 /////////////////////////////////////////////////////////////////////
 
 void TileMapLayer::_build_runtime_update_tile_data() {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-
 	// Check if we should cleanup everything.
-	bool forced_cleanup = in_destructor || !enabled || !tile_set.is_valid() || !is_visible_in_tree();
+	bool forced_cleanup = in_destructor || !enabled || tile_set.is_null() || !is_visible_in_tree();
 	if (!forced_cleanup) {
 		bool valid_runtime_update = GDVIRTUAL_IS_OVERRIDDEN(_use_tile_data_runtime_update) && GDVIRTUAL_IS_OVERRIDDEN(_tile_data_runtime_update);
 		bool valid_runtime_update_for_tilemap = tile_map_node && tile_map_node->GDVIRTUAL_IS_OVERRIDDEN(_use_tile_data_runtime_update) && tile_map_node->GDVIRTUAL_IS_OVERRIDDEN(_tile_data_runtime_update); // For keeping compatibility.
 		if (valid_runtime_update || valid_runtime_update_for_tilemap) {
 			bool use_tilemap_for_runtime = valid_runtime_update_for_tilemap && !valid_runtime_update;
-			if (_runtime_update_tile_data_was_cleaned_up || dirty.flags[DIRTY_FLAGS_LAYER_GROUP_TILE_SET]) {
+			if (_runtime_update_tile_data_was_cleaned_up || dirty.flags[DIRTY_FLAGS_TILE_SET]) {
 				_runtime_update_needs_all_cells_cleaned_up = true;
-				for (KeyValue<Vector2i, CellData> &E : tile_map) {
+				for (KeyValue<Vector2i, CellData> &E : tile_map_layer_data) {
 					_build_runtime_update_tile_data_for_cell(E.value, use_tilemap_for_runtime);
 				}
 			} else if (dirty.flags[DIRTY_FLAGS_LAYER_RUNTIME_UPDATE]) {
-				for (KeyValue<Vector2i, CellData> &E : tile_map) {
+				for (KeyValue<Vector2i, CellData> &E : tile_map_layer_data) {
 					_build_runtime_update_tile_data_for_cell(E.value, use_tilemap_for_runtime, true);
 				}
 			} else {
@@ -1395,8 +1361,6 @@ void TileMapLayer::_build_runtime_update_tile_data() {
 }
 
 void TileMapLayer::_build_runtime_update_tile_data_for_cell(CellData &r_cell_data, bool p_use_tilemap_for_runtime, bool p_auto_add_to_dirty_list) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-
 	TileMapCell &c = r_cell_data.cell;
 	TileSetSource *source;
 	if (tile_set->has_source(c.source_id)) {
@@ -1446,7 +1410,7 @@ void TileMapLayer::_build_runtime_update_tile_data_for_cell(CellData &r_cell_dat
 
 void TileMapLayer::_clear_runtime_update_tile_data() {
 	if (_runtime_update_needs_all_cells_cleaned_up) {
-		for (KeyValue<Vector2i, CellData> &E : tile_map) {
+		for (KeyValue<Vector2i, CellData> &E : tile_map_layer_data) {
 			_clear_runtime_update_tile_data_for_cell(E.value);
 		}
 		_runtime_update_needs_all_cells_cleaned_up = false;
@@ -1467,8 +1431,7 @@ void TileMapLayer::_clear_runtime_update_tile_data_for_cell(CellData &r_cell_dat
 }
 
 TileSet::TerrainsPattern TileMapLayer::_get_best_terrain_pattern_for_constraints(int p_terrain_set, const Vector2i &p_position, const RBSet<TerrainConstraint> &p_constraints, TileSet::TerrainsPattern p_current_pattern) const {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	if (!tile_set.is_valid()) {
+	if (tile_set.is_null()) {
 		return TileSet::TerrainsPattern();
 	}
 	// Returns all tiles compatible with the given constraints.
@@ -1528,8 +1491,7 @@ TileSet::TerrainsPattern TileMapLayer::_get_best_terrain_pattern_for_constraints
 }
 
 RBSet<TerrainConstraint> TileMapLayer::_get_terrain_constraints_from_added_pattern(const Vector2i &p_position, int p_terrain_set, TileSet::TerrainsPattern p_terrains_pattern) const {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	if (!tile_set.is_valid()) {
+	if (tile_set.is_null()) {
 		return RBSet<TerrainConstraint>();
 	}
 
@@ -1549,8 +1511,7 @@ RBSet<TerrainConstraint> TileMapLayer::_get_terrain_constraints_from_added_patte
 }
 
 RBSet<TerrainConstraint> TileMapLayer::_get_terrain_constraints_from_painted_cells_list(const RBSet<Vector2i> &p_painted, int p_terrain_set, bool p_ignore_empty_terrains) const {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	if (!tile_set.is_valid()) {
+	if (tile_set.is_null()) {
 		return RBSet<TerrainConstraint>();
 	}
 
@@ -1636,6 +1597,12 @@ RBSet<TerrainConstraint> TileMapLayer::_get_terrain_constraints_from_painted_cel
 	return constraints;
 }
 
+void TileMapLayer::_tile_set_changed() {
+	dirty.flags[DIRTY_FLAGS_TILE_SET] = true;
+	_queue_internal_update();
+	emit_signal(CoreStringNames::get_singleton()->changed);
+}
+
 void TileMapLayer::_renamed() {
 	emit_signal(CoreStringNames::get_singleton()->changed);
 }
@@ -1706,7 +1673,7 @@ void TileMapLayer::_internal_update() {
 
 	// Remove cells that are empty after the cleanup.
 	for (const Vector2i &coords : to_delete) {
-		tile_map.erase(coords);
+		tile_map_layer_data.erase(coords);
 	}
 
 	// Clear the dirty cells list.
@@ -1754,12 +1721,115 @@ void TileMapLayer::_notification(int p_what) {
 }
 
 void TileMapLayer::_bind_methods() {
+	// --- Cells manipulation ---
+	// Generic cells manipulations and access.
 	ClassDB::bind_method(D_METHOD("set_cell", "coords", "source_id", "atlas_coords", "alternative_tile"), &TileMapLayer::set_cell, DEFVAL(TileSet::INVALID_SOURCE), DEFVAL(TileSetSource::INVALID_ATLAS_COORDS), DEFVAL(0));
+	ClassDB::bind_method(D_METHOD("erase_cell", "coords"), &TileMapLayer::erase_cell);
+	ClassDB::bind_method(D_METHOD("fix_invalid_tiles"), &TileMapLayer::fix_invalid_tiles);
+	ClassDB::bind_method(D_METHOD("clear"), &TileMapLayer::clear);
+
+	ClassDB::bind_method(D_METHOD("get_cell_source_id", "coords"), &TileMapLayer::get_cell_source_id);
+	ClassDB::bind_method(D_METHOD("get_cell_atlas_coords", "coords"), &TileMapLayer::get_cell_atlas_coords);
+	ClassDB::bind_method(D_METHOD("get_cell_alternative_tile", "coords"), &TileMapLayer::get_cell_alternative_tile);
+	ClassDB::bind_method(D_METHOD("get_cell_tile_data", "coords"), &TileMapLayer::get_cell_tile_data);
+
+	ClassDB::bind_method(D_METHOD("get_used_cells"), &TileMapLayer::get_used_cells);
+	ClassDB::bind_method(D_METHOD("get_used_cells_by_id", "source_id", "atlas_coords", "alternative_tile"), &TileMapLayer::get_used_cells_by_id, DEFVAL(TileSet::INVALID_SOURCE), DEFVAL(TileSetSource::INVALID_ATLAS_COORDS), DEFVAL(TileSetSource::INVALID_TILE_ALTERNATIVE));
+	ClassDB::bind_method(D_METHOD("get_used_rect"), &TileMapLayer::get_used_rect);
+
+	// Patterns.
+	ClassDB::bind_method(D_METHOD("get_pattern", "coords_array"), &TileMapLayer::get_pattern);
+	ClassDB::bind_method(D_METHOD("set_pattern", "position", "pattern"), &TileMapLayer::set_pattern);
+
+	// Terrains.
+	ClassDB::bind_method(D_METHOD("set_cells_terrain_connect", "cells", "terrain_set", "terrain", "ignore_empty_terrains"), &TileMapLayer::set_cells_terrain_connect, DEFVAL(true));
+	ClassDB::bind_method(D_METHOD("set_cells_terrain_path", "path", "terrain_set", "terrain", "ignore_empty_terrains"), &TileMapLayer::set_cells_terrain_path, DEFVAL(true));
+
+	// --- Physics helpers ---
+	ClassDB::bind_method(D_METHOD("has_body_rid", "body"), &TileMapLayer::has_body_rid);
+	ClassDB::bind_method(D_METHOD("get_coords_for_body_rid", "body"), &TileMapLayer::get_coords_for_body_rid);
+
+	// --- Runtime ---
+	ClassDB::bind_method(D_METHOD("update_internals"), &TileMapLayer::update_internals);
+	ClassDB::bind_method(D_METHOD("notify_runtime_tile_data_update"), &TileMapLayer::notify_runtime_tile_data_update, DEFVAL(-1));
+
+	// --- Shortcuts to methods defined in TileSet ---
+	ClassDB::bind_method(D_METHOD("map_pattern", "position_in_tilemap", "coords_in_pattern", "pattern"), &TileMapLayer::map_pattern);
+	ClassDB::bind_method(D_METHOD("get_surrounding_cells", "coords"), &TileMapLayer::get_surrounding_cells);
+	ClassDB::bind_method(D_METHOD("get_neighbor_cell", "coords", "neighbor"), &TileMapLayer::get_neighbor_cell);
+	ClassDB::bind_method(D_METHOD("map_to_local", "map_position"), &TileMapLayer::map_to_local);
+	ClassDB::bind_method(D_METHOD("local_to_map", "local_position"), &TileMapLayer::local_to_map);
+
+	// --- Accessors ---
+	ClassDB::bind_method(D_METHOD("set_tile_map_data_from_array", "tile_map_layer_data"), &TileMapLayer::set_tile_map_data_from_array);
+	ClassDB::bind_method(D_METHOD("get_tile_map_data_as_array"), &TileMapLayer::get_tile_map_data_as_array);
+
+	ClassDB::bind_method(D_METHOD("set_enabled", "enabled"), &TileMapLayer::set_enabled);
+	ClassDB::bind_method(D_METHOD("is_enabled"), &TileMapLayer::is_enabled);
+
+	ClassDB::bind_method(D_METHOD("set_tile_set", "tile_set"), &TileMapLayer::set_tile_set);
+	ClassDB::bind_method(D_METHOD("get_tile_set"), &TileMapLayer::get_tile_set);
+
+	ClassDB::bind_method(D_METHOD("set_y_sort_origin", "y_sort_origin"), &TileMapLayer::set_y_sort_origin);
+	ClassDB::bind_method(D_METHOD("get_y_sort_origin"), &TileMapLayer::get_y_sort_origin);
+	ClassDB::bind_method(D_METHOD("set_rendering_quadrant_size", "size"), &TileMapLayer::set_rendering_quadrant_size);
+	ClassDB::bind_method(D_METHOD("get_rendering_quadrant_size"), &TileMapLayer::get_rendering_quadrant_size);
+
+	ClassDB::bind_method(D_METHOD("set_collision_enabled", "enabled"), &TileMapLayer::set_collision_enabled);
+	ClassDB::bind_method(D_METHOD("is_collision_enabled"), &TileMapLayer::is_collision_enabled);
+	ClassDB::bind_method(D_METHOD("set_use_kinematic_bodies", "use_kinematic_bodies"), &TileMapLayer::set_use_kinematic_bodies);
+	ClassDB::bind_method(D_METHOD("is_using_kinematic_bodies"), &TileMapLayer::is_using_kinematic_bodies);
+	ClassDB::bind_method(D_METHOD("set_collision_visibility_mode", "visibility_mode"), &TileMapLayer::set_collision_visibility_mode);
+	ClassDB::bind_method(D_METHOD("get_collision_visibility_mode"), &TileMapLayer::get_collision_visibility_mode);
+
+	ClassDB::bind_method(D_METHOD("set_navigation_enabled", "enabled"), &TileMapLayer::set_navigation_enabled);
+	ClassDB::bind_method(D_METHOD("is_navigation_enabled"), &TileMapLayer::is_navigation_enabled);
+	ClassDB::bind_method(D_METHOD("set_navigation_map", "map"), &TileMapLayer::set_navigation_map);
+	ClassDB::bind_method(D_METHOD("get_navigation_map"), &TileMapLayer::get_navigation_map);
+	ClassDB::bind_method(D_METHOD("set_navigation_visibility_mode", "show_navigation"), &TileMapLayer::set_navigation_visibility_mode);
+	ClassDB::bind_method(D_METHOD("get_navigation_visibility_mode"), &TileMapLayer::get_navigation_visibility_mode);
 
 	GDVIRTUAL_BIND(_use_tile_data_runtime_update, "coords");
 	GDVIRTUAL_BIND(_tile_data_runtime_update, "coords", "tile_data");
 
+	ADD_PROPERTY(PropertyInfo(Variant::PACKED_BYTE_ARRAY, "tile_map_data", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NO_EDITOR), "set_tile_map_data_from_array", "get_tile_map_data_as_array");
+
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "enabled"), "set_enabled", "is_enabled");
+	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "tile_set", PROPERTY_HINT_RESOURCE_TYPE, "TileSet"), "set_tile_set", "get_tile_set");
+	ADD_GROUP("Rendering", "");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "y_sort_origin"), "set_y_sort_origin", "get_y_sort_origin");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "rendering_quadrant_size"), "set_rendering_quadrant_size", "get_rendering_quadrant_size");
+	ADD_GROUP("Physics", "");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "collision_enabled"), "set_collision_enabled", "is_collision_enabled");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_kinematic_bodies"), "set_use_kinematic_bodies", "is_using_kinematic_bodies");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_visibility_mode", PROPERTY_HINT_ENUM, "Default,Force Show,Force Hide"), "set_collision_visibility_mode", "get_collision_visibility_mode");
+	ADD_GROUP("Navigation", "");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "navigation_enabled"), "set_navigation_enabled", "is_navigation_enabled");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "navigation_visibility_mode", PROPERTY_HINT_ENUM, "Default,Force Show,Force Hide"), "set_navigation_visibility_mode", "get_navigation_visibility_mode");
+
 	ADD_SIGNAL(MethodInfo(CoreStringNames::get_singleton()->changed));
+
+	ADD_PROPERTY_DEFAULT("tile_map_data_format", TileMapDataFormat::TILE_MAP_DATA_FORMAT_1);
+
+	BIND_ENUM_CONSTANT(DEBUG_VISIBILITY_MODE_DEFAULT);
+	BIND_ENUM_CONSTANT(DEBUG_VISIBILITY_MODE_FORCE_HIDE);
+	BIND_ENUM_CONSTANT(DEBUG_VISIBILITY_MODE_FORCE_SHOW);
+}
+
+void TileMapLayer::_update_self_texture_filter(RS::CanvasItemTextureFilter p_texture_filter) {
+	// Set a default texture filter for the whole tilemap.
+	CanvasItem::_update_self_texture_filter(p_texture_filter);
+	dirty.flags[DIRTY_FLAGS_LAYER_TEXTURE_FILTER] = true;
+	_queue_internal_update();
+	emit_signal(CoreStringNames::get_singleton()->changed);
+}
+
+void TileMapLayer::_update_self_texture_repeat(RS::CanvasItemTextureRepeat p_texture_repeat) {
+	// Set a default texture repeat for the whole tilemap.
+	CanvasItem::_update_self_texture_repeat(p_texture_repeat);
+	dirty.flags[DIRTY_FLAGS_LAYER_TEXTURE_REPEAT] = true;
+	_queue_internal_update();
+	emit_signal(CoreStringNames::get_singleton()->changed);
 }
 
 void TileMapLayer::set_as_tile_map_internal_node(int p_index) {
@@ -1776,7 +1846,6 @@ void TileMapLayer::set_as_tile_map_internal_node(int p_index) {
 }
 
 Rect2 TileMapLayer::get_rect(bool &r_changed) const {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	if (tile_set.is_null()) {
 		r_changed = rect_cache != Rect2();
 		return Rect2();
@@ -1789,7 +1858,7 @@ Rect2 TileMapLayer::get_rect(bool &r_changed) const {
 	if (rect_cache_dirty) {
 		Rect2 r_total;
 		bool first = true;
-		for (const KeyValue<Vector2i, CellData> &E : tile_map) {
+		for (const KeyValue<Vector2i, CellData> &E : tile_map_layer_data) {
 			Rect2 r;
 			r.position = tile_set->map_to_local(E.key);
 			r.size = Size2();
@@ -1811,8 +1880,7 @@ Rect2 TileMapLayer::get_rect(bool &r_changed) const {
 }
 
 HashMap<Vector2i, TileSet::TerrainsPattern> TileMapLayer::terrain_fill_constraints(const Vector<Vector2i> &p_to_replace, int p_terrain_set, const RBSet<TerrainConstraint> &p_constraints) const {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	if (!tile_set.is_valid()) {
+	if (tile_set.is_null()) {
 		return HashMap<Vector2i, TileSet::TerrainsPattern>();
 	}
 
@@ -1860,8 +1928,7 @@ HashMap<Vector2i, TileSet::TerrainsPattern> TileMapLayer::terrain_fill_constrain
 
 HashMap<Vector2i, TileSet::TerrainsPattern> TileMapLayer::terrain_fill_connect(const Vector<Vector2i> &p_coords_array, int p_terrain_set, int p_terrain, bool p_ignore_empty_terrains) const {
 	HashMap<Vector2i, TileSet::TerrainsPattern> output;
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	ERR_FAIL_COND_V(!tile_set.is_valid(), output);
+	ERR_FAIL_COND_V(tile_set.is_null(), output);
 	ERR_FAIL_INDEX_V(p_terrain_set, tile_set->get_terrain_sets_count(), output);
 
 	// Build list and set of tiles that can be modified (painted and their surroundings).
@@ -1966,8 +2033,7 @@ HashMap<Vector2i, TileSet::TerrainsPattern> TileMapLayer::terrain_fill_connect(c
 
 HashMap<Vector2i, TileSet::TerrainsPattern> TileMapLayer::terrain_fill_path(const Vector<Vector2i> &p_coords_array, int p_terrain_set, int p_terrain, bool p_ignore_empty_terrains) const {
 	HashMap<Vector2i, TileSet::TerrainsPattern> output;
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	ERR_FAIL_COND_V(!tile_set.is_valid(), output);
+	ERR_FAIL_COND_V(tile_set.is_null(), output);
 	ERR_FAIL_INDEX_V(p_terrain_set, tile_set->get_terrain_sets_count(), output);
 
 	// Make sure the path is correct and build the peering bit list while doing it.
@@ -2040,8 +2106,7 @@ HashMap<Vector2i, TileSet::TerrainsPattern> TileMapLayer::terrain_fill_path(cons
 
 HashMap<Vector2i, TileSet::TerrainsPattern> TileMapLayer::terrain_fill_pattern(const Vector<Vector2i> &p_coords_array, int p_terrain_set, TileSet::TerrainsPattern p_terrains_pattern, bool p_ignore_empty_terrains) const {
 	HashMap<Vector2i, TileSet::TerrainsPattern> output;
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	ERR_FAIL_COND_V(!tile_set.is_valid(), output);
+	ERR_FAIL_COND_V(tile_set.is_null(), output);
 	ERR_FAIL_INDEX_V(p_terrain_set, tile_set->get_terrain_sets_count(), output);
 
 	// Build list and set of tiles that can be modified (painted and their surroundings).
@@ -2091,153 +2156,18 @@ HashMap<Vector2i, TileSet::TerrainsPattern> TileMapLayer::terrain_fill_pattern(c
 	return output;
 }
 
-TileMapCell TileMapLayer::get_cell(const Vector2i &p_coords, bool p_use_proxies) const {
-	if (!tile_map.has(p_coords)) {
+TileMapCell TileMapLayer::get_cell(const Vector2i &p_coords) const {
+	if (!tile_map_layer_data.has(p_coords)) {
 		return TileMapCell();
 	} else {
-		TileMapCell c = tile_map.find(p_coords)->value.cell;
-		const Ref<TileSet> &tile_set = get_effective_tile_set();
-		if (p_use_proxies && tile_set.is_valid()) {
-			Array proxyed = tile_set->map_tile_proxy(c.source_id, c.get_atlas_coords(), c.alternative_tile);
-			c.source_id = proxyed[0];
-			c.set_atlas_coords(proxyed[1]);
-			c.alternative_tile = proxyed[2];
-		}
-		return c;
-	}
-}
-
-void TileMapLayer::set_tile_data(TileMapDataFormat p_format, const Vector<int> &p_data) {
-	ERR_FAIL_COND(p_format > TileMapDataFormat::FORMAT_3);
-
-	// Set data for a given tile from raw data.
-
-	int c = p_data.size();
-	const int *r = p_data.ptr();
-
-	int offset = (p_format >= TileMapDataFormat::FORMAT_2) ? 3 : 2;
-	ERR_FAIL_COND_MSG(c % offset != 0, vformat("Corrupted tile data. Got size: %s. Expected modulo: %s", offset));
-
-	clear();
-
-#ifdef DISABLE_DEPRECATED
-	ERR_FAIL_COND_MSG(p_format != TileMapDataFormat::FORMAT_3, vformat("Cannot handle deprecated TileMapLayer data format version %d. This Godot version was compiled with no support for deprecated data.", p_format));
-#endif
-
-	for (int i = 0; i < c; i += offset) {
-		const uint8_t *ptr = (const uint8_t *)&r[i];
-		uint8_t local[12];
-		for (int j = 0; j < ((p_format >= TileMapDataFormat::FORMAT_2) ? 12 : 8); j++) {
-			local[j] = ptr[j];
-		}
-
-#ifdef BIG_ENDIAN_ENABLED
-
-		SWAP(local[0], local[3]);
-		SWAP(local[1], local[2]);
-		SWAP(local[4], local[7]);
-		SWAP(local[5], local[6]);
-		//TODO: ask someone to check this...
-		if (FORMAT >= FORMAT_2) {
-			SWAP(local[8], local[11]);
-			SWAP(local[9], local[10]);
-		}
-#endif
-		// Extracts position in TileMap.
-		int16_t x = decode_uint16(&local[0]);
-		int16_t y = decode_uint16(&local[2]);
-
-		if (p_format == TileMapDataFormat::FORMAT_3) {
-			uint16_t source_id = decode_uint16(&local[4]);
-			uint16_t atlas_coords_x = decode_uint16(&local[6]);
-			uint16_t atlas_coords_y = decode_uint16(&local[8]);
-			uint16_t alternative_tile = decode_uint16(&local[10]);
-			set_cell(Vector2i(x, y), source_id, Vector2i(atlas_coords_x, atlas_coords_y), alternative_tile);
-		} else {
-#ifndef DISABLE_DEPRECATED
-			// Previous decated format.
-
-			uint32_t v = decode_uint32(&local[4]);
-			// Extract the transform flags that used to be in the tilemap.
-			bool flip_h = v & (1UL << 29);
-			bool flip_v = v & (1UL << 30);
-			bool transpose = v & (1UL << 31);
-			v &= (1UL << 29) - 1;
-
-			// Extract autotile/atlas coords.
-			int16_t coord_x = 0;
-			int16_t coord_y = 0;
-			if (p_format == TileMapDataFormat::FORMAT_2) {
-				coord_x = decode_uint16(&local[8]);
-				coord_y = decode_uint16(&local[10]);
-			}
-
-			const Ref<TileSet> &tile_set = get_effective_tile_set();
-			if (tile_set.is_valid()) {
-				Array a = tile_set->compatibility_tilemap_map(v, Vector2i(coord_x, coord_y), flip_h, flip_v, transpose);
-				if (a.size() == 3) {
-					set_cell(Vector2i(x, y), a[0], a[1], a[2]);
-				} else {
-					ERR_PRINT(vformat("No valid tile in Tileset for: tile:%s coords:%s flip_h:%s flip_v:%s transpose:%s", v, Vector2i(coord_x, coord_y), flip_h, flip_v, transpose));
-				}
-			} else {
-				int compatibility_alternative_tile = ((int)flip_h) + ((int)flip_v << 1) + ((int)transpose << 2);
-				set_cell(Vector2i(x, y), v, Vector2i(coord_x, coord_y), compatibility_alternative_tile);
-			}
-#endif
-		}
-	}
-}
-
-Vector<int> TileMapLayer::get_tile_data() const {
-	// Export tile data to raw format.
-	Vector<int> tile_data;
-	tile_data.resize(tile_map.size() * 3);
-	int *w = tile_data.ptrw();
-
-	// Save in highest format.
-
-	int idx = 0;
-	for (const KeyValue<Vector2i, CellData> &E : tile_map) {
-		uint8_t *ptr = (uint8_t *)&w[idx];
-		encode_uint16((int16_t)(E.key.x), &ptr[0]);
-		encode_uint16((int16_t)(E.key.y), &ptr[2]);
-		encode_uint16(E.value.cell.source_id, &ptr[4]);
-		encode_uint16(E.value.cell.coord_x, &ptr[6]);
-		encode_uint16(E.value.cell.coord_y, &ptr[8]);
-		encode_uint16(E.value.cell.alternative_tile, &ptr[10]);
-		idx += 3;
-	}
-
-	return tile_data;
-}
-
-void TileMapLayer::notify_tile_map_layer_group_change(DirtyFlags p_what) {
-	if (p_what == DIRTY_FLAGS_LAYER_GROUP_SELECTED_LAYERS ||
-			p_what == DIRTY_FLAGS_LAYER_GROUP_HIGHLIGHT_SELECTED ||
-			p_what == DIRTY_FLAGS_LAYER_GROUP_TILE_SET) {
-		emit_signal(CoreStringNames::get_singleton()->changed);
+		return tile_map_layer_data.find(p_coords)->value.cell;
 	}
-
-	dirty.flags[p_what] = true;
-	_queue_internal_update();
-}
-
-void TileMapLayer::update_internals() {
-	pending_update = true;
-	_deferred_internal_update();
-}
-
-void TileMapLayer::notify_runtime_tile_data_update() {
-	dirty.flags[TileMapLayer::DIRTY_FLAGS_LAYER_RUNTIME_UPDATE] = true;
-	_queue_internal_update();
-	emit_signal(CoreStringNames::get_singleton()->changed);
 }
 
-void TileMapLayer::set_cell(const Vector2i &p_coords, int p_source_id, const Vector2i p_atlas_coords, int p_alternative_tile) {
+void TileMapLayer::set_cell(const Vector2i &p_coords, int p_source_id, const Vector2i &p_atlas_coords, int p_alternative_tile) {
 	// Set the current cell tile (using integer position).
 	Vector2i pk(p_coords);
-	HashMap<Vector2i, CellData>::Iterator E = tile_map.find(pk);
+	HashMap<Vector2i, CellData>::Iterator E = tile_map_layer_data.find(pk);
 
 	int source_id = p_source_id;
 	Vector2i atlas_coords = p_atlas_coords;
@@ -2258,7 +2188,7 @@ void TileMapLayer::set_cell(const Vector2i &p_coords, int p_source_id, const Vec
 		// Insert a new cell in the tile map.
 		CellData new_cell_data;
 		new_cell_data.coords = pk;
-		E = tile_map.insert(pk, new_cell_data);
+		E = tile_map_layer_data.insert(pk, new_cell_data);
 	} else {
 		if (E->value.cell.source_id == source_id && E->value.cell.get_atlas_coords() == atlas_coords && E->value.cell.alternative_tile == alternative_tile) {
 			return; // Nothing changed.
@@ -2283,83 +2213,139 @@ void TileMapLayer::erase_cell(const Vector2i &p_coords) {
 	set_cell(p_coords, TileSet::INVALID_SOURCE, TileSetSource::INVALID_ATLAS_COORDS, TileSetSource::INVALID_TILE_ALTERNATIVE);
 }
 
-int TileMapLayer::get_cell_source_id(const Vector2i &p_coords, bool p_use_proxies) const {
+void TileMapLayer::fix_invalid_tiles() { // Erases every stored cell whose source, atlas coords or alternative tile the TileSet does not have.
+	ERR_FAIL_COND_MSG(tile_set.is_null(), "Cannot call fix_invalid_tiles() on a TileMap without a valid TileSet.");
+
+	RBSet<Vector2i> coords; // Coordinates to erase: gathered in a first pass, erased in the second loop below.
+	for (const KeyValue<Vector2i, CellData> &E : tile_map_layer_data) {
+		TileSetSource *source = *tile_set->get_source(E.value.cell.source_id); // Raw pointer taken from the returned Ref; null-checked on the next line.
+		if (!source || !source->has_tile(E.value.cell.get_atlas_coords()) || !source->has_alternative_tile(E.value.cell.get_atlas_coords(), E.value.cell.alternative_tile)) {
+			coords.insert(E.key);
+		}
+	}
+	for (const Vector2i &E : coords) {
+		set_cell(E, TileSet::INVALID_SOURCE, TileSetSource::INVALID_ATLAS_COORDS, TileSetSource::INVALID_TILE_ALTERNATIVE);
+	}
+}
+
+void TileMapLayer::clear() {
+	// Remove all tiles by calling erase_cell() on every stored coordinate.
+	for (KeyValue<Vector2i, CellData> &kv : tile_map_layer_data) {
+		erase_cell(kv.key);
+	}
+	used_rect_cache_dirty = true; // Makes the next get_used_rect() call recompute its cached rect.
+}
+
+int TileMapLayer::get_cell_source_id(const Vector2i &p_coords) const {
 	// Get a cell source id from position.
-	HashMap<Vector2i, CellData>::ConstIterator E = tile_map.find(p_coords);
+	HashMap<Vector2i, CellData>::ConstIterator E = tile_map_layer_data.find(p_coords);
 
 	if (!E) {
 		return TileSet::INVALID_SOURCE;
 	}
 
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	if (p_use_proxies && tile_set.is_valid()) {
-		Array proxyed = tile_set->map_tile_proxy(E->value.cell.source_id, E->value.cell.get_atlas_coords(), E->value.cell.alternative_tile);
-		return proxyed[0];
-	}
-
 	return E->value.cell.source_id;
 }
 
-Vector2i TileMapLayer::get_cell_atlas_coords(const Vector2i &p_coords, bool p_use_proxies) const {
+Vector2i TileMapLayer::get_cell_atlas_coords(const Vector2i &p_coords) const {
 	// Get a cell source id from position.
-	HashMap<Vector2i, CellData>::ConstIterator E = tile_map.find(p_coords);
+	HashMap<Vector2i, CellData>::ConstIterator E = tile_map_layer_data.find(p_coords);
 
 	if (!E) {
 		return TileSetSource::INVALID_ATLAS_COORDS;
 	}
 
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	if (p_use_proxies && tile_set.is_valid()) {
-		Array proxyed = tile_set->map_tile_proxy(E->value.cell.source_id, E->value.cell.get_atlas_coords(), E->value.cell.alternative_tile);
-		return proxyed[1];
-	}
-
 	return E->value.cell.get_atlas_coords();
 }
 
-int TileMapLayer::get_cell_alternative_tile(const Vector2i &p_coords, bool p_use_proxies) const {
+int TileMapLayer::get_cell_alternative_tile(const Vector2i &p_coords) const {
 	// Get a cell source id from position.
-	HashMap<Vector2i, CellData>::ConstIterator E = tile_map.find(p_coords);
+	HashMap<Vector2i, CellData>::ConstIterator E = tile_map_layer_data.find(p_coords);
 
 	if (!E) {
 		return TileSetSource::INVALID_TILE_ALTERNATIVE;
 	}
 
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	if (p_use_proxies && tile_set.is_valid()) {
-		Array proxyed = tile_set->map_tile_proxy(E->value.cell.source_id, E->value.cell.get_atlas_coords(), E->value.cell.alternative_tile);
-		return proxyed[2];
-	}
-
 	return E->value.cell.alternative_tile;
 }
 
-TileData *TileMapLayer::get_cell_tile_data(const Vector2i &p_coords, bool p_use_proxies) const {
-	int source_id = get_cell_source_id(p_coords, p_use_proxies);
+TileData *TileMapLayer::get_cell_tile_data(const Vector2i &p_coords) const {
+	int source_id = get_cell_source_id(p_coords);
 	if (source_id == TileSet::INVALID_SOURCE) {
 		return nullptr;
 	}
 
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	Ref<TileSetAtlasSource> source = tile_set->get_source(source_id);
 	if (source.is_valid()) {
-		return source->get_tile_data(get_cell_atlas_coords(p_coords, p_use_proxies), get_cell_alternative_tile(p_coords, p_use_proxies));
+		return source->get_tile_data(get_cell_atlas_coords(p_coords), get_cell_alternative_tile(p_coords));
 	}
 
 	return nullptr;
 }
 
-void TileMapLayer::clear() {
-	// Remove all tiles.
-	for (KeyValue<Vector2i, CellData> &kv : tile_map) {
-		erase_cell(kv.key);
+TypedArray<Vector2i> TileMapLayer::get_used_cells() const {
+	// Returns the coordinates of every stored cell that has a valid source.
+	TypedArray<Vector2i> a;
+	for (const KeyValue<Vector2i, CellData> &E : tile_map_layer_data) {
+		const TileMapCell &c = E.value.cell;
+		if (c.source_id == TileSet::INVALID_SOURCE) {
+			continue;
+		}
+		a.push_back(E.key);
 	}
-	used_rect_cache_dirty = true;
+
+	return a;
+}
+
+TypedArray<Vector2i> TileMapLayer::get_used_cells_by_id(int p_source_id, const Vector2i &p_atlas_coords, int p_alternative_tile) const {
+	// Returns the used cells matching the given identifiers; an INVALID_* argument acts as a wildcard for its field.
+	TypedArray<Vector2i> a;
+	for (const KeyValue<Vector2i, CellData> &E : tile_map_layer_data) {
+		const TileMapCell &c = E.value.cell;
+		if (c.source_id == TileSet::INVALID_SOURCE) {
+			continue;
+		}
+		if ((p_source_id == TileSet::INVALID_SOURCE || p_source_id == c.source_id) &&
+				(p_atlas_coords == TileSetSource::INVALID_ATLAS_COORDS || p_atlas_coords == c.get_atlas_coords()) &&
+				(p_alternative_tile == TileSetSource::INVALID_TILE_ALTERNATIVE || p_alternative_tile == c.alternative_tile)) {
+			a.push_back(E.key);
+		}
+	}
+
+	return a;
+}
+
+Rect2i TileMapLayer::get_used_rect() const {
+	// Return the rect enclosing every cell with a valid source; it is rebuilt only when the cache is flagged dirty.
+	if (used_rect_cache_dirty) {
+		used_rect_cache = Rect2i();
+
+		bool first = true;
+		for (const KeyValue<Vector2i, CellData> &E : tile_map_layer_data) {
+			const TileMapCell &c = E.value.cell;
+			if (c.source_id == TileSet::INVALID_SOURCE) {
+				continue;
+			}
+			if (first) {
+				used_rect_cache = Rect2i(E.key, Size2i()); // Start from a zero-sized rect placed on the first used cell.
+				first = false;
+			} else {
+				used_rect_cache.expand_to(E.key);
+			}
+		}
+		if (!first) {
+			// Only if we have at least one cell.
+			// The rect was grown from cell coordinates (points), so add one full cell to its size.
+			used_rect_cache.size += Vector2i(1, 1);
+		}
+		used_rect_cache_dirty = false;
+	}
+
+	return used_rect_cache;
 }
 
 Ref<TileMapPattern> TileMapLayer::get_pattern(TypedArray<Vector2i> p_coords_array) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	ERR_FAIL_COND_V(!tile_set.is_valid(), nullptr);
+	ERR_FAIL_COND_V(tile_set.is_null(), nullptr);
 
 	Ref<TileMapPattern> output;
 	output.instantiate();
@@ -2412,7 +2398,6 @@ Ref<TileMapPattern> TileMapLayer::get_pattern(TypedArray<Vector2i> p_coords_arra
 }
 
 void TileMapLayer::set_pattern(const Vector2i &p_position, const Ref<TileMapPattern> p_pattern) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
 	ERR_FAIL_COND(tile_set.is_null());
 	ERR_FAIL_COND(p_pattern.is_null());
 
@@ -2424,8 +2409,7 @@ void TileMapLayer::set_pattern(const Vector2i &p_position, const Ref<TileMapPatt
 }
 
 void TileMapLayer::set_cells_terrain_connect(TypedArray<Vector2i> p_cells, int p_terrain_set, int p_terrain, bool p_ignore_empty_terrains) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	ERR_FAIL_COND(!tile_set.is_valid());
+	ERR_FAIL_COND(tile_set.is_null());
 	ERR_FAIL_INDEX(p_terrain_set, tile_set->get_terrain_sets_count());
 
 	Vector<Vector2i> cells_vector;
@@ -2464,8 +2448,7 @@ void TileMapLayer::set_cells_terrain_connect(TypedArray<Vector2i> p_cells, int p
 }
 
 void TileMapLayer::set_cells_terrain_path(TypedArray<Vector2i> p_path, int p_terrain_set, int p_terrain, bool p_ignore_empty_terrains) {
-	const Ref<TileSet> &tile_set = get_effective_tile_set();
-	ERR_FAIL_COND(!tile_set.is_valid());
+	ERR_FAIL_COND(tile_set.is_null());
 	ERR_FAIL_INDEX(p_terrain_set, tile_set->get_terrain_sets_count());
 
 	Vector<Vector2i> vector_path;
@@ -2504,65 +2487,50 @@ void TileMapLayer::set_cells_terrain_path(TypedArray<Vector2i> p_path, int p_ter
 	}
 }
 
-TypedArray<Vector2i> TileMapLayer::get_used_cells() const {
-	// Returns the cells used in the tilemap.
-	TypedArray<Vector2i> a;
-	for (const KeyValue<Vector2i, CellData> &E : tile_map) {
-		const TileMapCell &c = E.value.cell;
-		if (c.source_id == TileSet::INVALID_SOURCE) {
-			continue;
-		}
-		a.push_back(E.key);
-	}
+bool TileMapLayer::has_body_rid(RID p_physics_body) const {
+	return bodies_coords.has(p_physics_body); // True when the RID is a key of bodies_coords.
+}
 
-	return a;
+Vector2i TileMapLayer::get_coords_for_body_rid(RID p_physics_body) const {
+	const Vector2i *found = bodies_coords.getptr(p_physics_body); // Pointer lookup, so a missing RID can be detected.
+	ERR_FAIL_NULL_V(found, Vector2i()); // Unknown RID: report an error and return a default-constructed Vector2i.
+	return *found;
 }
 
-TypedArray<Vector2i> TileMapLayer::get_used_cells_by_id(int p_source_id, const Vector2i p_atlas_coords, int p_alternative_tile) const {
-	// Returns the cells used in the tilemap.
-	TypedArray<Vector2i> a;
-	for (const KeyValue<Vector2i, CellData> &E : tile_map) {
-		const TileMapCell &c = E.value.cell;
-		if (c.source_id == TileSet::INVALID_SOURCE) {
-			continue;
-		}
-		if ((p_source_id == TileSet::INVALID_SOURCE || p_source_id == c.source_id) &&
-				(p_atlas_coords == TileSetSource::INVALID_ATLAS_COORDS || p_atlas_coords == c.get_atlas_coords()) &&
-				(p_alternative_tile == TileSetSource::INVALID_TILE_ALTERNATIVE || p_alternative_tile == c.alternative_tile)) {
-			a.push_back(E.key);
-		}
-	}
+void TileMapLayer::update_internals() {
+	pending_update = true; // Flag an update as pending, then call the deferred handler directly on the next line.
+	_deferred_internal_update();
+}
 
-	return a;
+void TileMapLayer::notify_runtime_tile_data_update() {
+	dirty.flags[TileMapLayer::DIRTY_FLAGS_LAYER_RUNTIME_UPDATE] = true; // Raise the runtime-update dirty flag before queuing the update.
+	_queue_internal_update();
+	emit_signal(CoreStringNames::get_singleton()->changed); // Also emits the "changed" signal.
 }
 
-Rect2i TileMapLayer::get_used_rect() const {
-	// Return the rect of the currently used area.
-	if (used_rect_cache_dirty) {
-		used_rect_cache = Rect2i();
+Vector2i TileMapLayer::map_pattern(const Vector2i &p_position_in_tilemap, const Vector2i &p_coords_in_pattern, Ref<TileMapPattern> p_pattern) {
+	ERR_FAIL_COND_V(tile_set.is_null(), Vector2i()); // No TileSet assigned: report an error and return a default Vector2i.
+	return tile_set->map_pattern(p_position_in_tilemap, p_coords_in_pattern, p_pattern); // Forwarded unchanged to the TileSet.
+}
 
-		bool first = true;
-		for (const KeyValue<Vector2i, CellData> &E : tile_map) {
-			const TileMapCell &c = E.value.cell;
-			if (c.source_id == TileSet::INVALID_SOURCE) {
-				continue;
-			}
-			if (first) {
-				used_rect_cache = Rect2i(E.key.x, E.key.y, 0, 0);
-				first = false;
-			} else {
-				used_rect_cache.expand_to(E.key);
-			}
-		}
-		if (!first) {
-			// Only if we have at least one cell.
-			// The cache expands to top-left coordinate, so we add one full tile.
-			used_rect_cache.size += Vector2i(1, 1);
-		}
-		used_rect_cache_dirty = false;
-	}
+TypedArray<Vector2i> TileMapLayer::get_surrounding_cells(const Vector2i &p_coords) {
+	ERR_FAIL_COND_V(tile_set.is_null(), TypedArray<Vector2i>()); // No TileSet assigned: report an error and return an empty array.
+	return tile_set->get_surrounding_cells(p_coords); // Forwarded unchanged to the TileSet.
+}
 
-	return used_rect_cache;
+Vector2i TileMapLayer::get_neighbor_cell(const Vector2i &p_coords, TileSet::CellNeighbor p_cell_neighbor) const {
+	ERR_FAIL_COND_V(tile_set.is_null(), Vector2i()); // No TileSet assigned: report an error and return a default Vector2i.
+	return tile_set->get_neighbor_cell(p_coords, p_cell_neighbor); // Forwarded unchanged to the TileSet.
+}
+
+Vector2 TileMapLayer::map_to_local(const Vector2i &p_pos) const {
+	ERR_FAIL_COND_V(tile_set.is_null(), Vector2()); // No TileSet assigned: report an error and return a default Vector2.
+	return tile_set->map_to_local(p_pos); // Forwarded unchanged to the TileSet.
+}
+
+Vector2i TileMapLayer::local_to_map(const Vector2 &p_pos) const {
+	ERR_FAIL_COND_V(tile_set.is_null(), Vector2i()); // No TileSet assigned: report an error and return a default Vector2i.
+	return tile_set->local_to_map(p_pos); // Forwarded unchanged to the TileSet.
 }
 
 void TileMapLayer::set_enabled(bool p_enabled) {
@@ -2573,16 +2541,127 @@ void TileMapLayer::set_enabled(bool p_enabled) {
 	dirty.flags[DIRTY_FLAGS_LAYER_ENABLED] = true;
 	_queue_internal_update();
 	emit_signal(CoreStringNames::get_singleton()->changed);
-
-	if (tile_map_node) {
-		tile_map_node->update_configuration_warnings();
-	}
 }
 
 bool TileMapLayer::is_enabled() const {
 	return enabled;
 }
 
+void TileMapLayer::set_tile_set(const Ref<TileSet> &p_tile_set) {
+	if (p_tile_set == tile_set) {
+		return; // Same TileSet: nothing to flag, reconnect or signal.
+	}
+
+	dirty.flags[DIRTY_FLAGS_TILE_SET] = true;
+	_queue_internal_update();
+
+	// Swap the TileSet, moving the _tile_set_changed subscription from the old one to the new one.
+	if (tile_set.is_valid()) {
+		tile_set->disconnect_changed(callable_mp(this, &TileMapLayer::_tile_set_changed));
+	}
+
+	tile_set = p_tile_set;
+
+	if (tile_set.is_valid()) {
+		tile_set->connect_changed(callable_mp(this, &TileMapLayer::_tile_set_changed));
+	}
+
+	emit_signal(CoreStringNames::get_singleton()->changed);
+
+	// Trigger updates for TileSet's read-only status.
+	notify_property_list_changed();
+}
+
+Ref<TileSet> TileMapLayer::get_tile_set() const {
+	return tile_set; // May be a null Ref; other methods in this file guard with tile_set.is_null().
+}
+
+void TileMapLayer::set_highlight_mode(HighlightMode p_highlight_mode) {
+	if (p_highlight_mode == highlight_mode) {
+		return; // Unchanged: skip queuing an update.
+	}
+	highlight_mode = p_highlight_mode;
+	_queue_internal_update(); // NOTE(review): no dirty.flags entry is raised here, unlike set_tile_set() or set_collision_enabled(); confirm the update picks up the new mode.
+}
+
+TileMapLayer::HighlightMode TileMapLayer::get_highlight_mode() const {
+	return highlight_mode; // Value last stored by set_highlight_mode(); HIGHLIGHT_MODE_DEFAULT initially.
+}
+
+void TileMapLayer::set_tile_map_data_from_array(const Vector<uint8_t> &p_data) { // Replaces the layer's cells with the ones decoded from p_data.
+	const int cell_data_struct_size = 12; // Six 16-bit fields per cell: x, y, source id, atlas x, atlas y, alternative tile.
+
+	int size = p_data.size();
+	const uint8_t *ptr = p_data.ptr();
+
+	// Index in the array.
+	int index = 0;
+
+	// First extract the data version, stored in the first two bytes.
+	ERR_FAIL_COND_MSG(size < 2, "Corrupted tile map data: not enough bytes.");
+	uint16_t format = decode_uint16(&ptr[index]);
+	index += 2;
+	ERR_FAIL_COND_MSG(format >= TileMapLayerDataFormat::TILE_MAP_LAYER_DATA_FORMAT_MAX, vformat("Unsupported tile map data format: %s. Expected format ID lower or equal to: %s", format, TileMapLayerDataFormat::TILE_MAP_LAYER_DATA_FORMAT_MAX - 1));
+
+	// Clear the TileMap. The header is valid here, but a truncated payload below still leaves the layer partially filled.
+	clear();
+
+	while (index < size) {
+		ERR_FAIL_COND_MSG(index + cell_data_struct_size > size, "Corrupted tile map data: tiles might be missing.");
+
+		// Get a pointer at the start of the cell data.
+		const uint8_t *cell_data_ptr = &ptr[index];
+
+		// Extracts position in TileMap (stored unsigned, read back as signed so negative coordinates survive).
+		int16_t x = decode_uint16(&cell_data_ptr[0]);
+		int16_t y = decode_uint16(&cell_data_ptr[2]);
+
+		// Extracts the tile identifiers.
+		uint16_t source_id = decode_uint16(&cell_data_ptr[4]);
+		uint16_t atlas_coords_x = decode_uint16(&cell_data_ptr[6]);
+		uint16_t atlas_coords_y = decode_uint16(&cell_data_ptr[8]);
+		uint16_t alternative_tile = decode_uint16(&cell_data_ptr[10]);
+
+		set_cell(Vector2i(x, y), source_id, Vector2i(atlas_coords_x, atlas_coords_y), alternative_tile);
+		index += cell_data_struct_size;
+	}
+}
+
+Vector<uint8_t> TileMapLayer::get_tile_map_data_as_array() const { // Serializes every stored cell into a byte array: 2-byte version, then 12 bytes per cell.
+	const int cell_data_struct_size = 12; // Six 16-bit fields per cell: x, y, source id, atlas x, atlas y, alternative tile.
+
+	Vector<uint8_t> tile_map_data_array;
+	tile_map_data_array.resize(2 + tile_map_layer_data.size() * cell_data_struct_size);
+	uint8_t *ptr = tile_map_data_array.ptrw();
+
+	// Index in the array.
+	int index = 0;
+
+	// Save the version (the highest defined format, i.e. one below the MAX sentinel).
+	encode_uint16(TileMapLayerDataFormat::TILE_MAP_LAYER_DATA_FORMAT_MAX - 1, &ptr[index]);
+	index += 2;
+
+	// Save in highest format.
+	for (const KeyValue<Vector2i, CellData> &E : tile_map_layer_data) {
+		// Get a pointer at the start of the cell data.
+		uint8_t *cell_data_ptr = &ptr[index];
+
+		// Store position in TileMap (narrowed to 16 bits: coordinates outside the int16_t range do not round-trip).
+		encode_uint16((int16_t)(E.key.x), &cell_data_ptr[0]);
+		encode_uint16((int16_t)(E.key.y), &cell_data_ptr[2]);
+
+		// Store the tile identifiers.
+		encode_uint16(E.value.cell.source_id, &cell_data_ptr[4]);
+		encode_uint16(E.value.cell.coord_x, &cell_data_ptr[6]);
+		encode_uint16(E.value.cell.coord_y, &cell_data_ptr[8]);
+		encode_uint16(E.value.cell.alternative_tile, &cell_data_ptr[10]);
+
+		index += cell_data_struct_size;
+	}
+
+	return tile_map_data_array;
+}
+
 void TileMapLayer::set_self_modulate(const Color &p_self_modulate) {
 	if (get_self_modulate() == p_self_modulate) {
 		return;
@@ -2602,9 +2681,6 @@ void TileMapLayer::set_y_sort_enabled(bool p_y_sort_enabled) {
 	_queue_internal_update();
 	emit_signal(CoreStringNames::get_singleton()->changed);
 
-	if (tile_map_node) {
-		tile_map_node->update_configuration_warnings();
-	}
 	_update_notify_local_transform();
 }
 
@@ -2630,10 +2706,6 @@ void TileMapLayer::set_z_index(int p_z_index) {
 	dirty.flags[DIRTY_FLAGS_LAYER_Z_INDEX] = true;
 	_queue_internal_update();
 	emit_signal(CoreStringNames::get_singleton()->changed);
-
-	if (tile_map_node) {
-		tile_map_node->update_configuration_warnings();
-	}
 }
 
 void TileMapLayer::set_light_mask(int p_light_mask) {
@@ -2646,22 +2718,6 @@ void TileMapLayer::set_light_mask(int p_light_mask) {
 	emit_signal(CoreStringNames::get_singleton()->changed);
 }
 
-void TileMapLayer::set_texture_filter(TextureFilter p_texture_filter) {
-	// Set a default texture filter for the whole tilemap.
-	CanvasItem::set_texture_filter(p_texture_filter);
-	dirty.flags[DIRTY_FLAGS_LAYER_TEXTURE_FILTER] = true;
-	_queue_internal_update();
-	emit_signal(CoreStringNames::get_singleton()->changed);
-}
-
-void TileMapLayer::set_texture_repeat(CanvasItem::TextureRepeat p_texture_repeat) {
-	// Set a default texture repeat for the whole tilemap.
-	CanvasItem::set_texture_repeat(p_texture_repeat);
-	dirty.flags[DIRTY_FLAGS_LAYER_TEXTURE_REPEAT] = true;
-	_queue_internal_update();
-	emit_signal(CoreStringNames::get_singleton()->changed);
-}
-
 void TileMapLayer::set_rendering_quadrant_size(int p_size) {
 	if (rendering_quadrant_size == p_size) {
 		return;
@@ -2678,6 +2734,20 @@ int TileMapLayer::get_rendering_quadrant_size() const {
 	return rendering_quadrant_size;
 }
 
+void TileMapLayer::set_collision_enabled(bool p_enabled) {
+	if (collision_enabled == p_enabled) {
+		return; // Unchanged: no dirty flag, no update, no signal.
+	}
+	collision_enabled = p_enabled;
+	dirty.flags[DIRTY_FLAGS_LAYER_COLLISION_ENABLED] = true; // Raise the collision-enabled dirty flag before queuing the update.
+	_queue_internal_update();
+	emit_signal(CoreStringNames::get_singleton()->changed);
+}
+
+bool TileMapLayer::is_collision_enabled() const {
+	return collision_enabled; // Value last stored by set_collision_enabled(); true initially.
+}
+
 void TileMapLayer::set_use_kinematic_bodies(bool p_use_kinematic_bodies) {
 	use_kinematic_bodies = p_use_kinematic_bodies;
 	dirty.flags[DIRTY_FLAGS_LAYER_USE_KINEMATIC_BODIES] = p_use_kinematic_bodies;
@@ -2689,7 +2759,7 @@ bool TileMapLayer::is_using_kinematic_bodies() const {
 	return use_kinematic_bodies;
 }
 
-void TileMapLayer::set_collision_visibility_mode(TileMapLayer::VisibilityMode p_show_collision) {
+void TileMapLayer::set_collision_visibility_mode(TileMapLayer::DebugVisibilityMode p_show_collision) {
 	if (collision_visibility_mode == p_show_collision) {
 		return;
 	}
@@ -2699,7 +2769,7 @@ void TileMapLayer::set_collision_visibility_mode(TileMapLayer::VisibilityMode p_
 	emit_signal(CoreStringNames::get_singleton()->changed);
 }
 
-TileMapLayer::VisibilityMode TileMapLayer::get_collision_visibility_mode() const {
+TileMapLayer::DebugVisibilityMode TileMapLayer::get_collision_visibility_mode() const {
 	return collision_visibility_mode;
 }
 
@@ -2736,7 +2806,7 @@ RID TileMapLayer::get_navigation_map() const {
 	return RID();
 }
 
-void TileMapLayer::set_navigation_visibility_mode(TileMapLayer::VisibilityMode p_show_navigation) {
+void TileMapLayer::set_navigation_visibility_mode(TileMapLayer::DebugVisibilityMode p_show_navigation) {
 	if (navigation_visibility_mode == p_show_navigation) {
 		return;
 	}
@@ -2746,43 +2816,10 @@ void TileMapLayer::set_navigation_visibility_mode(TileMapLayer::VisibilityMode p
 	emit_signal(CoreStringNames::get_singleton()->changed);
 }
 
-TileMapLayer::VisibilityMode TileMapLayer::get_navigation_visibility_mode() const {
+TileMapLayer::DebugVisibilityMode TileMapLayer::get_navigation_visibility_mode() const {
 	return navigation_visibility_mode;
 }
 
-void TileMapLayer::fix_invalid_tiles() {
-	Ref<TileSet> tileset = get_effective_tile_set();
-	ERR_FAIL_COND_MSG(tileset.is_null(), "Cannot call fix_invalid_tiles() on a TileMap without a valid TileSet.");
-
-	RBSet<Vector2i> coords;
-	for (const KeyValue<Vector2i, CellData> &E : tile_map) {
-		TileSetSource *source = *tileset->get_source(E.value.cell.source_id);
-		if (!source || !source->has_tile(E.value.cell.get_atlas_coords()) || !source->has_alternative_tile(E.value.cell.get_atlas_coords(), E.value.cell.alternative_tile)) {
-			coords.insert(E.key);
-		}
-	}
-	for (const Vector2i &E : coords) {
-		set_cell(E, TileSet::INVALID_SOURCE, TileSetSource::INVALID_ATLAS_COORDS, TileSetSource::INVALID_TILE_ALTERNATIVE);
-	}
-}
-
-bool TileMapLayer::has_body_rid(RID p_physics_body) const {
-	return bodies_coords.has(p_physics_body);
-}
-
-Vector2i TileMapLayer::get_coords_for_body_rid(RID p_physics_body) const {
-	return bodies_coords[p_physics_body];
-}
-
-Ref<TileSet> TileMapLayer::get_effective_tile_set() const {
-	TileMapLayerGroup *tile_map_layer_group = Object::cast_to<TileMapLayerGroup>(get_parent());
-	if (tile_map_layer_group) {
-		return tile_map_layer_group->get_tileset();
-	} else {
-		return Ref<TileSet>();
-	}
-}
-
 TileMapLayer::TileMapLayer() {
 	set_notify_transform(true);
 }
diff --git a/scene/2d/tile_map_layer.h b/scene/2d/tile_map_layer.h
index c58c72949c..da48eb38d9 100644
--- a/scene/2d/tile_map_layer.h
+++ b/scene/2d/tile_map_layer.h
@@ -31,10 +31,15 @@
 #ifndef TILE_MAP_LAYER_H
 #define TILE_MAP_LAYER_H
 
-#include "scene/2d/tile_map.h"
 #include "scene/resources/2d/tile_set.h"
 
 class TileSetAtlasSource;
+class TileMap;
+
+enum TileMapLayerDataFormat { // Version tag stored in the first two bytes of the serialized layer data.
+	TILE_MAP_LAYER_DATA_FORMAT_0 = 0, // The only format defined so far.
+	TILE_MAP_LAYER_DATA_FORMAT_MAX, // Sentinel, one past the newest format: loading rejects values >= MAX, saving writes MAX - 1.
+};
 
 class TerrainConstraint {
 private:
@@ -218,14 +223,21 @@ class TileMapLayer : public Node2D {
 	GDCLASS(TileMapLayer, Node2D);
 
 public:
-	enum VisibilityMode {
-		VISIBILITY_MODE_DEFAULT,
-		VISIBILITY_MODE_FORCE_SHOW,
-		VISIBILITY_MODE_FORCE_HIDE,
+	enum HighlightMode {
+		HIGHLIGHT_MODE_DEFAULT,
+		HIGHLIGHT_MODE_ABOVE,
+		HIGHLIGHT_MODE_BELOW,
+	};
+
+	enum DebugVisibilityMode {
+		DEBUG_VISIBILITY_MODE_DEFAULT,
+		DEBUG_VISIBILITY_MODE_FORCE_SHOW,
+		DEBUG_VISIBILITY_MODE_FORCE_HIDE,
 	};
 
 	enum DirtyFlags {
 		DIRTY_FLAGS_LAYER_ENABLED = 0,
+
 		DIRTY_FLAGS_LAYER_IN_TREE,
 		DIRTY_FLAGS_LAYER_IN_CANVAS,
 		DIRTY_FLAGS_LAYER_LOCAL_TRANSFORM,
@@ -238,6 +250,7 @@ public:
 		DIRTY_FLAGS_LAYER_TEXTURE_FILTER,
 		DIRTY_FLAGS_LAYER_TEXTURE_REPEAT,
 		DIRTY_FLAGS_LAYER_RENDERING_QUADRANT_SIZE,
+		DIRTY_FLAGS_LAYER_COLLISION_ENABLED,
 		DIRTY_FLAGS_LAYER_USE_KINEMATIC_BODIES,
 		DIRTY_FLAGS_LAYER_COLLISION_VISIBILITY_MODE,
 		DIRTY_FLAGS_LAYER_NAVIGATION_ENABLED,
@@ -249,26 +262,33 @@ public:
 
 		DIRTY_FLAGS_LAYER_GROUP_SELECTED_LAYERS,
 		DIRTY_FLAGS_LAYER_GROUP_HIGHLIGHT_SELECTED,
-		DIRTY_FLAGS_LAYER_GROUP_TILE_SET,
+
+		DIRTY_FLAGS_TILE_SET,
 
 		DIRTY_FLAGS_MAX,
 	};
 
 private:
-	// Exposed properties.
+	// Properties.
+	HashMap<Vector2i, CellData> tile_map_layer_data;
+
 	bool enabled = true;
+	Ref<TileSet> tile_set;
+
+	HighlightMode highlight_mode = HIGHLIGHT_MODE_DEFAULT;
+
 	int y_sort_origin = 0;
 	int rendering_quadrant_size = 16;
 
+	bool collision_enabled = true;
 	bool use_kinematic_bodies = false;
-	VisibilityMode collision_visibility_mode = VISIBILITY_MODE_DEFAULT;
+	DebugVisibilityMode collision_visibility_mode = DEBUG_VISIBILITY_MODE_DEFAULT;
 
 	bool navigation_enabled = true;
 	RID navigation_map_override;
-	VisibilityMode navigation_visibility_mode = VISIBILITY_MODE_DEFAULT;
+	DebugVisibilityMode navigation_visibility_mode = DEBUG_VISIBILITY_MODE_DEFAULT;
 
 	// Internal.
-	HashMap<Vector2i, CellData> tile_map;
 	bool pending_update = false;
 
 	// For keeping compatibility with TileMap.
@@ -348,6 +368,8 @@ private:
 	RBSet<TerrainConstraint> _get_terrain_constraints_from_added_pattern(const Vector2i &p_position, int p_terrain_set, TileSet::TerrainsPattern p_terrains_pattern) const;
 	RBSet<TerrainConstraint> _get_terrain_constraints_from_painted_cells_list(const RBSet<Vector2i> &p_painted, int p_terrain_set, bool p_ignore_empty_terrains) const;
 
+	void _tile_set_changed();
+
 	void _renamed();
 	void _update_notify_local_transform();
 
@@ -358,11 +380,21 @@ private:
 
 protected:
 	void _notification(int p_what);
+
 	static void _bind_methods();
 
+	virtual void _update_self_texture_filter(RS::CanvasItemTextureFilter p_texture_filter) override;
+	virtual void _update_self_texture_repeat(RS::CanvasItemTextureRepeat p_texture_repeat) override;
+
 public:
 	// TileMap node.
 	void set_as_tile_map_internal_node(int p_index);
+	int get_index_in_tile_map() const {
+		return layer_index_in_tile_map_node;
+	}
+	const HashMap<Vector2i, CellData> &get_tile_map_layer_data() const {
+		return tile_map_layer_data;
+	}
 
 	// Rect caching.
 	Rect2 get_rect(bool &r_changed) const;
@@ -374,27 +406,26 @@ public:
 	HashMap<Vector2i, TileSet::TerrainsPattern> terrain_fill_pattern(const Vector<Vector2i> &p_coords_array, int p_terrain_set, TileSet::TerrainsPattern p_terrains_pattern, bool p_ignore_empty_terrains = true) const; // Not exposed.
 
 	// Not exposed to users.
-	TileMapCell get_cell(const Vector2i &p_coords, bool p_use_proxies = false) const;
+	TileMapCell get_cell(const Vector2i &p_coords) const;
 
-	// For TileMap node's use.
-	void set_tile_data(TileMapDataFormat p_format, const Vector<int> &p_data);
-	Vector<int> get_tile_data() const;
-	void notify_tile_map_layer_group_change(DirtyFlags p_what);
-
-	void update_internals();
-	void notify_runtime_tile_data_update();
+	////////////// Exposed functions //////////////
 
-	// --- Exposed in TileMap ---
-	// Cells manipulation.
-	void set_cell(const Vector2i &p_coords, int p_source_id = TileSet::INVALID_SOURCE, const Vector2i p_atlas_coords = TileSetSource::INVALID_ATLAS_COORDS, int p_alternative_tile = 0);
+	// --- Cells manipulation ---
+	// Generic cells manipulations and data access.
+	void set_cell(const Vector2i &p_coords, int p_source_id = TileSet::INVALID_SOURCE, const Vector2i &p_atlas_coords = TileSetSource::INVALID_ATLAS_COORDS, int p_alternative_tile = 0);
 	void erase_cell(const Vector2i &p_coords);
-
-	int get_cell_source_id(const Vector2i &p_coords, bool p_use_proxies = false) const;
-	Vector2i get_cell_atlas_coords(const Vector2i &p_coords, bool p_use_proxies = false) const;
-	int get_cell_alternative_tile(const Vector2i &p_coords, bool p_use_proxies = false) const;
-	TileData *get_cell_tile_data(const Vector2i &p_coords, bool p_use_proxies = false) const; // Helper method to make accessing the data easier.
+	void fix_invalid_tiles();
 	void clear();
 
+	int get_cell_source_id(const Vector2i &p_coords) const;
+	Vector2i get_cell_atlas_coords(const Vector2i &p_coords) const;
+	int get_cell_alternative_tile(const Vector2i &p_coords) const;
+	TileData *get_cell_tile_data(const Vector2i &p_coords) const; // Helper method to make accessing the data easier.
+
+	TypedArray<Vector2i> get_used_cells() const;
+	TypedArray<Vector2i> get_used_cells_by_id(int p_source_id = TileSet::INVALID_SOURCE, const Vector2i &p_atlas_coords = TileSetSource::INVALID_ATLAS_COORDS, int p_alternative_tile = TileSetSource::INVALID_TILE_ALTERNATIVE) const;
+	Rect2i get_used_rect() const;
+
 	// Patterns.
 	Ref<TileMapPattern> get_pattern(TypedArray<Vector2i> p_coords_array);
 	void set_pattern(const Vector2i &p_position, const Ref<TileMapPattern> p_pattern);
@@ -403,54 +434,62 @@ public:
 	void set_cells_terrain_connect(TypedArray<Vector2i> p_cells, int p_terrain_set, int p_terrain, bool p_ignore_empty_terrains = true);
 	void set_cells_terrain_path(TypedArray<Vector2i> p_path, int p_terrain_set, int p_terrain, bool p_ignore_empty_terrains = true);
 
-	// Cells usage.
-	TypedArray<Vector2i> get_used_cells() const;
-	TypedArray<Vector2i> get_used_cells_by_id(int p_source_id = TileSet::INVALID_SOURCE, const Vector2i p_atlas_coords = TileSetSource::INVALID_ATLAS_COORDS, int p_alternative_tile = TileSetSource::INVALID_TILE_ALTERNATIVE) const;
-	Rect2i get_used_rect() const;
+	// --- Physics helpers ---
+	bool has_body_rid(RID p_physics_body) const;
+	Vector2i get_coords_for_body_rid(RID p_physics_body) const; // For finding tiles from collision.
+
+	// --- Runtime ---
+	void update_internals();
+	void notify_runtime_tile_data_update();
+	GDVIRTUAL1R(bool, _use_tile_data_runtime_update, Vector2i);
+	GDVIRTUAL2(_tile_data_runtime_update, Vector2i, TileData *);
+
+	// --- Shortcuts to methods defined in TileSet ---
+	Vector2i map_pattern(const Vector2i &p_position_in_tilemap, const Vector2i &p_coords_in_pattern, Ref<TileMapPattern> p_pattern);
+	TypedArray<Vector2i> get_surrounding_cells(const Vector2i &p_coords);
+	Vector2i get_neighbor_cell(const Vector2i &p_coords, TileSet::CellNeighbor p_cell_neighbor) const;
+	Vector2 map_to_local(const Vector2i &p_pos) const;
+	Vector2i local_to_map(const Vector2 &p_pos) const;
+
+	// --- Accessors ---
+	void set_tile_map_data_from_array(const Vector<uint8_t> &p_data);
+	Vector<uint8_t> get_tile_map_data_as_array() const;
 
-	// Layer properties.
 	void set_enabled(bool p_enabled);
 	bool is_enabled() const;
+	void set_tile_set(const Ref<TileSet> &p_tile_set);
+	Ref<TileSet> get_tile_set() const;
+
+	void set_highlight_mode(HighlightMode p_highlight_mode);
+	HighlightMode get_highlight_mode() const;
+
 	virtual void set_self_modulate(const Color &p_self_modulate) override;
 	virtual void set_y_sort_enabled(bool p_y_sort_enabled) override;
 	void set_y_sort_origin(int p_y_sort_origin);
 	int get_y_sort_origin() const;
 	virtual void set_z_index(int p_z_index) override;
 	virtual void set_light_mask(int p_light_mask) override;
-	virtual void set_texture_filter(CanvasItem::TextureFilter p_texture_filter) override;
-	virtual void set_texture_repeat(CanvasItem::TextureRepeat p_texture_repeat) override;
 	void set_rendering_quadrant_size(int p_size);
 	int get_rendering_quadrant_size() const;
 
+	void set_collision_enabled(bool p_enabled);
+	bool is_collision_enabled() const;
 	void set_use_kinematic_bodies(bool p_use_kinematic_bodies);
 	bool is_using_kinematic_bodies() const;
-	void set_collision_visibility_mode(VisibilityMode p_show_collision);
-	VisibilityMode get_collision_visibility_mode() const;
+	void set_collision_visibility_mode(DebugVisibilityMode p_show_collision);
+	DebugVisibilityMode get_collision_visibility_mode() const;
 
 	void set_navigation_enabled(bool p_enabled);
 	bool is_navigation_enabled() const;
 	void set_navigation_map(RID p_map);
 	RID get_navigation_map() const;
-	void set_navigation_visibility_mode(VisibilityMode p_show_navigation);
-	VisibilityMode get_navigation_visibility_mode() const;
-
-	// Fixing and clearing methods.
-	void fix_invalid_tiles();
-
-	// Find coords for body.
-	bool has_body_rid(RID p_physics_body) const;
-	Vector2i get_coords_for_body_rid(RID p_physics_body) const; // For finding tiles from collision.
-
-	// Helper.
-	Ref<TileSet> get_effective_tile_set() const;
-
-	// Virtual function to modify the TileData at runtime.
-	GDVIRTUAL1R(bool, _use_tile_data_runtime_update, Vector2i);
-	GDVIRTUAL2(_tile_data_runtime_update, Vector2i, TileData *);
-	// ---
+	void set_navigation_visibility_mode(DebugVisibilityMode p_show_navigation);
+	DebugVisibilityMode get_navigation_visibility_mode() const;
 
 	TileMapLayer();
 	~TileMapLayer();
 };
 
+VARIANT_ENUM_CAST(TileMapLayer::DebugVisibilityMode);
+
 #endif // TILE_MAP_LAYER_H
diff --git a/scene/2d/tile_map_layer_group.cpp b/scene/2d/tile_map_layer_group.cpp
deleted file mode 100644
index 132b4bbba5..0000000000
--- a/scene/2d/tile_map_layer_group.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-/**************************************************************************/
-/*  tile_map_layer_group.cpp                                              */
-/**************************************************************************/
-/*                         This file is part of:                          */
-/*                             GODOT ENGINE                               */
-/*                        https://godotengine.org                         */
-/**************************************************************************/
-/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
-/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
-/*                                                                        */
-/* Permission is hereby granted, free of charge, to any person obtaining  */
-/* a copy of this software and associated documentation files (the        */
-/* "Software"), to deal in the Software without restriction, including    */
-/* without limitation the rights to use, copy, modify, merge, publish,    */
-/* distribute, sublicense, and/or sell copies of the Software, and to     */
-/* permit persons to whom the Software is furnished to do so, subject to  */
-/* the following conditions:                                              */
-/*                                                                        */
-/* The above copyright notice and this permission notice shall be         */
-/* included in all copies or substantial portions of the Software.        */
-/*                                                                        */
-/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
-/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
-/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
-/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
-/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
-/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
-/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
-/**************************************************************************/
-
-#include "tile_map_layer_group.h"
-
-#include "core/core_string_names.h"
-#include "scene/2d/tile_map_layer.h"
-#include "scene/resources/2d/tile_set.h"
-
-#ifdef TOOLS_ENABLED
-
-void TileMapLayerGroup::_cleanup_selected_layers() {
-	for (int i = 0; i < selected_layers.size(); i++) {
-		const String name = selected_layers[i];
-		TileMapLayer *layer = Object::cast_to<TileMapLayer>(get_node_or_null(name));
-		if (!layer) {
-			selected_layers.remove_at(i);
-			i--;
-		}
-	}
-}
-
-#endif // TOOLS_ENABLED
-
-void TileMapLayerGroup::_tile_set_changed() {
-	for (int i = 0; i < get_child_count(); i++) {
-		TileMapLayer *layer = Object::cast_to<TileMapLayer>(get_child(i));
-		if (layer) {
-			layer->notify_tile_map_layer_group_change(TileMapLayer::DIRTY_FLAGS_LAYER_GROUP_TILE_SET);
-		}
-	}
-
-	update_configuration_warnings();
-}
-
-#ifdef TOOLS_ENABLED
-
-void TileMapLayerGroup::set_selected_layers(Vector<StringName> p_layer_names) {
-	selected_layers = p_layer_names;
-	_cleanup_selected_layers();
-
-	// Update the layers modulation.
-	for (int i = 0; i < get_child_count(); i++) {
-		TileMapLayer *layer = Object::cast_to<TileMapLayer>(get_child(i));
-		if (layer) {
-			layer->notify_tile_map_layer_group_change(TileMapLayer::DIRTY_FLAGS_LAYER_GROUP_SELECTED_LAYERS);
-		}
-	}
-}
-
-Vector<StringName> TileMapLayerGroup::get_selected_layers() const {
-	return selected_layers;
-}
-
-void TileMapLayerGroup::set_highlight_selected_layer(bool p_highlight_selected_layer) {
-	if (highlight_selected_layer == p_highlight_selected_layer) {
-		return;
-	}
-
-	highlight_selected_layer = p_highlight_selected_layer;
-
-	for (int i = 0; i < get_child_count(); i++) {
-		TileMapLayer *layer = Object::cast_to<TileMapLayer>(get_child(i));
-		if (layer) {
-			layer->notify_tile_map_layer_group_change(TileMapLayer::DIRTY_FLAGS_LAYER_GROUP_HIGHLIGHT_SELECTED);
-		}
-	}
-}
-
-bool TileMapLayerGroup::is_highlighting_selected_layer() const {
-	return highlight_selected_layer;
-}
-
-#endif // TOOLS_ENABLED
-
-void TileMapLayerGroup::remove_child_notify(Node *p_child) {
-#ifdef TOOLS_ENABLED
-	_cleanup_selected_layers();
-#endif // TOOLS_ENABLED
-}
-
-void TileMapLayerGroup::_bind_methods() {
-	ClassDB::bind_method(D_METHOD("set_tileset", "tileset"), &TileMapLayerGroup::set_tileset);
-	ClassDB::bind_method(D_METHOD("get_tileset"), &TileMapLayerGroup::get_tileset);
-
-	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "tile_set", PROPERTY_HINT_RESOURCE_TYPE, "TileSet"), "set_tileset", "get_tileset");
-}
-
-void TileMapLayerGroup::set_tileset(const Ref<TileSet> &p_tileset) {
-	if (p_tileset == tile_set) {
-		return;
-	}
-
-	// Set the tileset, registering to its changes.
-	if (tile_set.is_valid()) {
-		tile_set->disconnect_changed(callable_mp(this, &TileMapLayerGroup::_tile_set_changed));
-	}
-
-	tile_set = p_tileset;
-
-	if (tile_set.is_valid()) {
-		tile_set->connect_changed(callable_mp(this, &TileMapLayerGroup::_tile_set_changed));
-	}
-
-	for (int i = 0; i < get_child_count(); i++) {
-		TileMapLayer *layer = Object::cast_to<TileMapLayer>(get_child(i));
-		if (layer) {
-			layer->notify_tile_map_layer_group_change(TileMapLayer::DIRTY_FLAGS_LAYER_GROUP_TILE_SET);
-		}
-	}
-}
-
-Ref<TileSet> TileMapLayerGroup::get_tileset() const {
-	return tile_set;
-}
-
-TileMapLayerGroup::~TileMapLayerGroup() {
-	if (tile_set.is_valid()) {
-		tile_set->disconnect_changed(callable_mp(this, &TileMapLayerGroup::_tile_set_changed));
-	}
-}
diff --git a/scene/3d/physics/collision_object_3d.cpp b/scene/3d/physics/collision_object_3d.cpp
index bbd2ef2fb8..54752b1281 100644
--- a/scene/3d/physics/collision_object_3d.cpp
+++ b/scene/3d/physics/collision_object_3d.cpp
@@ -489,11 +489,11 @@ void CollisionObject3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("shape_owner_clear_shapes", "owner_id"), &CollisionObject3D::shape_owner_clear_shapes);
 	ClassDB::bind_method(D_METHOD("shape_find_owner", "shape_index"), &CollisionObject3D::shape_find_owner);
 
-	GDVIRTUAL_BIND(_input_event, "camera", "event", "position", "normal", "shape_idx");
+	GDVIRTUAL_BIND(_input_event, "camera", "event", "event_position", "normal", "shape_idx");
 	GDVIRTUAL_BIND(_mouse_enter);
 	GDVIRTUAL_BIND(_mouse_exit);
 
-	ADD_SIGNAL(MethodInfo("input_event", PropertyInfo(Variant::OBJECT, "camera", PROPERTY_HINT_RESOURCE_TYPE, "Node"), PropertyInfo(Variant::OBJECT, "event", PROPERTY_HINT_RESOURCE_TYPE, "InputEvent"), PropertyInfo(Variant::VECTOR3, "position"), PropertyInfo(Variant::VECTOR3, "normal"), PropertyInfo(Variant::INT, "shape_idx")));
+	ADD_SIGNAL(MethodInfo("input_event", PropertyInfo(Variant::OBJECT, "camera", PROPERTY_HINT_RESOURCE_TYPE, "Node"), PropertyInfo(Variant::OBJECT, "event", PROPERTY_HINT_RESOURCE_TYPE, "InputEvent"), PropertyInfo(Variant::VECTOR3, "event_position"), PropertyInfo(Variant::VECTOR3, "normal"), PropertyInfo(Variant::INT, "shape_idx")));
 	ADD_SIGNAL(MethodInfo("mouse_entered"));
 	ADD_SIGNAL(MethodInfo("mouse_exited"));
 
diff --git a/scene/animation/animation_node_state_machine.cpp b/scene/animation/animation_node_state_machine.cpp
index 0484694555..f5bef899da 100644
--- a/scene/animation/animation_node_state_machine.cpp
+++ b/scene/animation/animation_node_state_machine.cpp
@@ -916,15 +916,16 @@ bool AnimationNodeStateMachinePlayback::_transition_to_next_recursive(AnimationT
 	transition_path.push_back(current);
 	while (true) {
 		next = _find_next(p_tree, p_state_machine);
-		if (transition_path.has(next.node)) {
-			WARN_PRINT_ONCE_ED("AnimationNodeStateMachinePlayback: " + base_path + "playback aborts the transition by detecting one or more looped transitions in the same frame to prevent to infinity loop. You may need to check the transition settings.");
-			break; // Maybe infinity loop, do nothing more.
-		}
 
 		if (!_can_transition_to_next(p_tree, p_state_machine, next, p_test_only)) {
 			break; // Finish transition.
 		}
 
+		if (transition_path.has(next.node)) {
+			WARN_PRINT_ONCE_ED("AnimationNodeStateMachinePlayback: " + base_path + "playback aborts the transition by detecting one or more looped transitions in the same frame to prevent to infinity loop. You may need to check the transition settings.");
+			break; // Maybe infinity loop, do nothing more.
+		}
+
 		transition_path.push_back(next.node);
 		is_state_changed = true;
 
diff --git a/scene/gui/file_dialog.cpp b/scene/gui/file_dialog.cpp
index 1163c0e390..3816b337b8 100644
--- a/scene/gui/file_dialog.cpp
+++ b/scene/gui/file_dialog.cpp
@@ -59,6 +59,17 @@ void FileDialog::_focus_file_text() {
 	}
 }
 
+void FileDialog::_native_popup() {
+	// Show the native dialog directly. `root` is only set for the resources and user-data access modes (empty otherwise); the result is delivered to _native_dialog_cb.
+	String root;
+	if (access == ACCESS_RESOURCES) {
+		root = ProjectSettings::get_singleton()->get_resource_path();
+	} else if (access == ACCESS_USERDATA) {
+		root = OS::get_singleton()->get_user_data_dir();
+	}
+	DisplayServer::get_singleton()->file_dialog_with_options_show(get_title(), ProjectSettings::get_singleton()->globalize_path(dir->get_text()), root, file->get_text().get_file(), show_hidden_files, DisplayServer::FileDialogMode(mode), filters, _get_options(), callable_mp(this, &FileDialog::_native_dialog_cb));
+}
+
 void FileDialog::popup(const Rect2i &p_rect) {
 	_update_option_controls();
 
@@ -69,20 +80,16 @@ void FileDialog::popup(const Rect2i &p_rect) {
 #endif
 
 	if (DisplayServer::get_singleton()->has_feature(DisplayServer::FEATURE_NATIVE_DIALOG_FILE) && (use_native_dialog || OS::get_singleton()->is_sandboxed())) {
-		String root;
-		if (access == ACCESS_RESOURCES) {
-			root = ProjectSettings::get_singleton()->get_resource_path();
-		} else if (access == ACCESS_USERDATA) {
-			root = OS::get_singleton()->get_user_data_dir();
-		}
-		DisplayServer::get_singleton()->file_dialog_with_options_show(get_title(), ProjectSettings::get_singleton()->globalize_path(dir->get_text()), root, file->get_text().get_file(), show_hidden_files, DisplayServer::FileDialogMode(mode), filters, _get_options(), callable_mp(this, &FileDialog::_native_dialog_cb));
+		_native_popup();
 	} else {
 		ConfirmationDialog::popup(p_rect);
 	}
 }
 
 void FileDialog::set_visible(bool p_visible) {
-	_update_option_controls();
+	if (p_visible) {
+		_update_option_controls(); // Refresh the option controls only when the dialog is about to be shown.
+	}
 
 #ifdef TOOLS_ENABLED
 	if (is_part_of_edited_scene()) {
@@ -92,67 +99,65 @@ void FileDialog::set_visible(bool p_visible) {
 #endif
 
 	if (DisplayServer::get_singleton()->has_feature(DisplayServer::FEATURE_NATIVE_DIALOG_FILE) && (use_native_dialog || OS::get_singleton()->is_sandboxed())) {
 		if (p_visible) {
-			String root;
-			if (access == ACCESS_RESOURCES) {
-				root = ProjectSettings::get_singleton()->get_resource_path();
-			} else if (access == ACCESS_USERDATA) {
-				root = OS::get_singleton()->get_user_data_dir();
-			}
-			DisplayServer::get_singleton()->file_dialog_with_options_show(get_title(), ProjectSettings::get_singleton()->globalize_path(dir->get_text()), root, file->get_text().get_file(), show_hidden_files, DisplayServer::FileDialogMode(mode), filters, _get_options(), callable_mp(this, &FileDialog::_native_dialog_cb));
+			// Only open the native dialog when showing; a hide request must not pop it up.
+			_native_popup();
 		}
 	} else {
 		ConfirmationDialog::set_visible(p_visible);
 	}
 }
 
 void FileDialog::_native_dialog_cb(bool p_ok, const Vector<String> &p_files, int p_filter, const Dictionary &p_selected_options) {
-	if (p_ok) {
-		if (p_files.size() > 0) {
-			Vector<String> files = p_files;
-			if (access != ACCESS_FILESYSTEM) {
-				for (String &file_name : files) {
-					file_name = ProjectSettings::get_singleton()->localize_path(file_name);
-				}
-			}
-			String f = files[0];
-			if (mode == FILE_MODE_OPEN_FILES) {
-				emit_signal(SNAME("files_selected"), files);
-			} else {
-				if (mode == FILE_MODE_SAVE_FILE) {
-					if (p_filter >= 0 && p_filter < filters.size()) {
-						bool valid = false;
-						String flt = filters[p_filter].get_slice(";", 0);
-						int filter_slice_count = flt.get_slice_count(",");
-						for (int j = 0; j < filter_slice_count; j++) {
-							String str = (flt.get_slice(",", j).strip_edges());
-							if (f.match(str)) {
-								valid = true;
-								break;
-							}
-						}
-
-						if (!valid && filter_slice_count > 0) {
-							String str = (flt.get_slice(",", 0).strip_edges());
-							f += str.substr(1, str.length() - 1);
-						}
+	if (!p_ok) { // Not confirmed: clear the file field and emit `canceled`.
+		file->set_text("");
+		emit_signal(SNAME("canceled"));
+		return;
+	}
+
+	if (p_files.is_empty()) { // Confirmed with no files: return without emitting any signal.
+		return;
+	}
+
+	Vector<String> files = p_files;
+	if (access != ACCESS_FILESYSTEM) { // Localize the returned paths for every access mode other than the plain filesystem.
+		for (String &file_name : files) {
+			file_name = ProjectSettings::get_singleton()->localize_path(file_name);
+		}
+	}
+	String f = files[0];
+	if (mode == FILE_MODE_OPEN_FILES) { // Multi-file mode reports the whole list; every other mode reports only the first entry.
+		emit_signal(SNAME("files_selected"), files);
+	} else {
+		if (mode == FILE_MODE_SAVE_FILE) {
+			if (p_filter >= 0 && p_filter < filters.size()) { // If the name matches no pattern of the selected filter, extend it using the filter's first pattern.
+				bool valid = false;
+				String flt = filters[p_filter].get_slice(";", 0);
+				int filter_slice_count = flt.get_slice_count(",");
+				for (int j = 0; j < filter_slice_count; j++) {
+					String str = (flt.get_slice(",", j).strip_edges());
+					if (f.match(str)) {
+						valid = true;
+						break;
 					}
-					emit_signal(SNAME("file_selected"), f);
-				} else if ((mode == FILE_MODE_OPEN_ANY || mode == FILE_MODE_OPEN_FILE) && dir_access->file_exists(f)) {
-					emit_signal(SNAME("file_selected"), f);
-				} else if (mode == FILE_MODE_OPEN_ANY || mode == FILE_MODE_OPEN_DIR) {
-					emit_signal(SNAME("dir_selected"), f);
+				}
+
+				if (!valid && filter_slice_count > 0) {
+					String str = (flt.get_slice(",", 0).strip_edges());
+					f += str.substr(1, str.length() - 1); // Appends the pattern minus its first character (presumably the `*` of `*.ext`).
 				}
 			}
-			file->set_text(f);
-			dir->set_text(f.get_base_dir());
-			selected_options = p_selected_options;
-			filter->select(p_filter);
+			emit_signal(SNAME("file_selected"), f);
+		} else if ((mode == FILE_MODE_OPEN_ANY || mode == FILE_MODE_OPEN_FILE) && dir_access->file_exists(f)) {
+			emit_signal(SNAME("file_selected"), f);
+		} else if (mode == FILE_MODE_OPEN_ANY || mode == FILE_MODE_OPEN_DIR) {
+			emit_signal(SNAME("dir_selected"), f);
 		}
-	} else {
-		file->set_text("");
-		emit_signal(SNAME("canceled"));
 	}
+	file->set_text(f); // From here on, copy the native result (file, dir, options, filter) into this dialog's own state.
+	dir->set_text(f.get_base_dir());
+	selected_options = p_selected_options;
+	filter->select(p_filter);
 }
 
 VBoxContainer *FileDialog::get_vbox() {
@@ -1110,7 +1112,7 @@ void FileDialog::_update_option_controls() {
 	}
 	options_dirty = false;
 
-	while (grid_options->get_child_count(false) > 0) {
+	while (grid_options->get_child_count() > 0) {
 		Node *child = grid_options->get_child(0);
 		grid_options->remove_child(child);
 		child->queue_free();
@@ -1222,9 +1224,8 @@ void FileDialog::add_option(const String &p_name, const Vector<String> &p_values
 
 void FileDialog::set_option_count(int p_count) {
 	ERR_FAIL_COND(p_count < 0);
-	int prev_size = options.size();
 
-	if (prev_size == p_count) {
+	if (options.size() == p_count) {
 		return;
 	}
 	options.resize(p_count);
@@ -1298,10 +1299,10 @@ void FileDialog::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_option_default", "option"), &FileDialog::get_option_default);
 	ClassDB::bind_method(D_METHOD("set_option_name", "option", "name"), &FileDialog::set_option_name);
 	ClassDB::bind_method(D_METHOD("set_option_values", "option", "values"), &FileDialog::set_option_values);
-	ClassDB::bind_method(D_METHOD("set_option_default", "option", "index"), &FileDialog::set_option_default);
+	ClassDB::bind_method(D_METHOD("set_option_default", "option", "default_value_index"), &FileDialog::set_option_default);
 	ClassDB::bind_method(D_METHOD("set_option_count", "count"), &FileDialog::set_option_count);
 	ClassDB::bind_method(D_METHOD("get_option_count"), &FileDialog::get_option_count);
-	ClassDB::bind_method(D_METHOD("add_option", "name", "values", "index"), &FileDialog::add_option);
+	ClassDB::bind_method(D_METHOD("add_option", "name", "values", "default_value_index"), &FileDialog::add_option);
 	ClassDB::bind_method(D_METHOD("get_selected_options"), &FileDialog::get_selected_options);
 	ClassDB::bind_method(D_METHOD("get_current_dir"), &FileDialog::get_current_dir);
 	ClassDB::bind_method(D_METHOD("get_current_file"), &FileDialog::get_current_file);
diff --git a/scene/gui/file_dialog.h b/scene/gui/file_dialog.h
index 1b53c7e05e..7caae7e216 100644
--- a/scene/gui/file_dialog.h
+++ b/scene/gui/file_dialog.h
@@ -172,6 +172,7 @@ private:
 
 	virtual void shortcut_input(const Ref<InputEvent> &p_event) override;
 
+	void _native_popup();
 	void _native_dialog_cb(bool p_ok, const Vector<String> &p_files, int p_filter, const Dictionary &p_selected_options);
 
 	bool _is_open_should_be_disabled();
diff --git a/scene/gui/graph_edit.cpp b/scene/gui/graph_edit.cpp
index 8bce5c0caa..c8909840c8 100644
--- a/scene/gui/graph_edit.cpp
+++ b/scene/gui/graph_edit.cpp
@@ -2315,7 +2315,7 @@ void GraphEdit::_bind_methods() {
 	ADD_SIGNAL(MethodInfo("node_selected", PropertyInfo(Variant::OBJECT, "node", PROPERTY_HINT_RESOURCE_TYPE, "Node")));
 	ADD_SIGNAL(MethodInfo("node_deselected", PropertyInfo(Variant::OBJECT, "node", PROPERTY_HINT_RESOURCE_TYPE, "Node")));
 
-	ADD_SIGNAL(MethodInfo("popup_request", PropertyInfo(Variant::VECTOR2, "position")));
+	ADD_SIGNAL(MethodInfo("popup_request", PropertyInfo(Variant::VECTOR2, "at_position")));
 
 	ADD_SIGNAL(MethodInfo("begin_node_move"));
 	ADD_SIGNAL(MethodInfo("end_node_move"));
diff --git a/scene/gui/tree.cpp b/scene/gui/tree.cpp
index 8c544dcacf..1daf86fe0f 100644
--- a/scene/gui/tree.cpp
+++ b/scene/gui/tree.cpp
@@ -2195,9 +2195,8 @@ int Tree::draw_item(const Point2i &p_pos, const Point2 &p_draw_ofs, const Size2
 			if ((select_mode == SELECT_ROW && selected_item == p_item) || p_item->cells[i].selected || !p_item->has_meta("__focus_rect")) {
 				Rect2i r = cell_rect;
 
-				p_item->set_meta("__focus_rect", Rect2(r.position, r.size));
-
 				if (select_mode != SELECT_ROW) {
+					p_item->set_meta("__focus_rect", Rect2(r.position, r.size));
 					if (rtl) {
 						r.position.x = get_size().width - r.position.x - r.size.x;
 					}
@@ -2208,6 +2207,8 @@ int Tree::draw_item(const Point2i &p_pos, const Point2 &p_draw_ofs, const Size2
 							theme_cache.selected->draw(ci, r);
 						}
 					}
+				} else {
+					p_item->set_meta("__focus_col_" + itos(i), Rect2(r.position, r.size));
 				}
 			}
 
@@ -2693,7 +2694,6 @@ void Tree::select_single_item(TreeItem *p_selected, TreeItem *p_current, int p_c
 			if (p_selected == p_current && (!c.selected || allow_reselect)) {
 				c.selected = true;
 				selected_item = p_selected;
-				selected_col = 0;
 				if (!emitted_row) {
 					emit_signal(SNAME("item_selected"));
 					emitted_row = true;
@@ -2704,6 +2704,9 @@ void Tree::select_single_item(TreeItem *p_selected, TreeItem *p_current, int p_c
 					c.selected = false;
 				}
 			}
+			if (&selected_cell == &c) {
+				selected_col = i;
+			}
 		} else if (select_mode == SELECT_SINGLE || select_mode == SELECT_MULTI) {
 			if (!r_in_range && &selected_cell == &c) {
 				if (!selected_cell.selected || allow_reselect) {
@@ -3778,7 +3781,12 @@ void Tree::gui_input(const Ref<InputEvent> &p_event) {
 						Input::get_singleton()->set_mouse_mode(Input::MOUSE_MODE_VISIBLE);
 						warp_mouse(range_drag_capture_pos);
 					} else {
-						Rect2 rect = get_selected()->get_meta("__focus_rect");
+						Rect2 rect;
+						if (select_mode == SELECT_ROW) {
+							rect = get_selected()->get_meta("__focus_col_" + itos(selected_col));
+						} else {
+							rect = get_selected()->get_meta("__focus_rect");
+						}
 						Point2 mpos = mb->get_position();
 						int icon_size_x = 0;
 						Ref<Texture2D> icon = get_selected()->get_icon(selected_col);
@@ -3987,7 +3995,12 @@ bool Tree::edit_selected(bool p_force_edit) {
 		return false;
 	}
 
-	Rect2 rect = s->get_meta("__focus_rect");
+	Rect2 rect;
+	if (select_mode == SELECT_ROW) {
+		rect = s->get_meta("__focus_col_" + itos(selected_col));
+	} else {
+		rect = s->get_meta("__focus_rect");
+	}
 	popup_edited_item = s;
 	popup_edited_item_col = col;
 
@@ -5593,8 +5606,8 @@ void Tree::_bind_methods() {
 	ADD_SIGNAL(MethodInfo("item_selected"));
 	ADD_SIGNAL(MethodInfo("cell_selected"));
 	ADD_SIGNAL(MethodInfo("multi_selected", PropertyInfo(Variant::OBJECT, "item", PROPERTY_HINT_RESOURCE_TYPE, "TreeItem"), PropertyInfo(Variant::INT, "column"), PropertyInfo(Variant::BOOL, "selected")));
-	ADD_SIGNAL(MethodInfo("item_mouse_selected", PropertyInfo(Variant::VECTOR2, "position"), PropertyInfo(Variant::INT, "mouse_button_index")));
-	ADD_SIGNAL(MethodInfo("empty_clicked", PropertyInfo(Variant::VECTOR2, "position"), PropertyInfo(Variant::INT, "mouse_button_index")));
+	ADD_SIGNAL(MethodInfo("item_mouse_selected", PropertyInfo(Variant::VECTOR2, "mouse_position"), PropertyInfo(Variant::INT, "mouse_button_index")));
+	ADD_SIGNAL(MethodInfo("empty_clicked", PropertyInfo(Variant::VECTOR2, "click_position"), PropertyInfo(Variant::INT, "mouse_button_index")));
 	ADD_SIGNAL(MethodInfo("item_edited"));
 	ADD_SIGNAL(MethodInfo("custom_item_clicked", PropertyInfo(Variant::INT, "mouse_button_index")));
 	ADD_SIGNAL(MethodInfo("item_icon_double_clicked"));
diff --git a/scene/main/canvas_item.cpp b/scene/main/canvas_item.cpp
index 768c83954b..56aa453407 100644
--- a/scene/main/canvas_item.cpp
+++ b/scene/main/canvas_item.cpp
@@ -1401,14 +1401,17 @@ void CanvasItem::_refresh_texture_filter_cache() const {
 	}
 }
 
+void CanvasItem::_update_self_texture_filter(RS::CanvasItemTextureFilter p_texture_filter) { // Declared virtual in canvas_item.h.
+	RS::get_singleton()->canvas_item_set_default_texture_filter(get_canvas_item(), p_texture_filter); // Applies the filter to this item's own canvas item; children are not touched here.
+	queue_redraw();
+}
+
 void CanvasItem::_update_texture_filter_changed(bool p_propagate) {
 	if (!is_inside_tree()) {
 		return;
 	}
 	_refresh_texture_filter_cache();
-
-	RS::get_singleton()->canvas_item_set_default_texture_filter(get_canvas_item(), texture_filter_cache);
-	queue_redraw();
+	_update_self_texture_filter(texture_filter_cache);
 
 	if (p_propagate) {
 		for (CanvasItem *E : children_items) {
@@ -1452,14 +1455,18 @@ void CanvasItem::_refresh_texture_repeat_cache() const {
 	}
 }
 
+void CanvasItem::_update_self_texture_repeat(RS::CanvasItemTextureRepeat p_texture_repeat) { // Declared virtual in canvas_item.h.
+	RS::get_singleton()->canvas_item_set_default_texture_repeat(get_canvas_item(), p_texture_repeat); // Applies the repeat mode to this item's own canvas item; children are not touched here.
+	queue_redraw();
+}
+
 void CanvasItem::_update_texture_repeat_changed(bool p_propagate) {
 	if (!is_inside_tree()) {
 		return;
 	}
 	_refresh_texture_repeat_cache();
+	_update_self_texture_repeat(texture_repeat_cache);
 
-	RS::get_singleton()->canvas_item_set_default_texture_repeat(get_canvas_item(), texture_repeat_cache);
-	queue_redraw();
 	if (p_propagate) {
 		for (CanvasItem *E : children_items) {
 			if (!E->top_level && E->texture_repeat == TEXTURE_REPEAT_PARENT_NODE) {
diff --git a/scene/main/canvas_item.h b/scene/main/canvas_item.h
index 383edeec93..8cec086ca6 100644
--- a/scene/main/canvas_item.h
+++ b/scene/main/canvas_item.h
@@ -152,6 +152,9 @@ private:
 	void _notify_transform_deferred();
 
 protected:
+	virtual void _update_self_texture_repeat(RS::CanvasItemTextureRepeat p_texture_repeat);
+	virtual void _update_self_texture_filter(RS::CanvasItemTextureFilter p_texture_filter);
+
 	_FORCE_INLINE_ void _notify_transform() {
 		_notify_transform(this);
 		if (is_inside_tree() && !block_transform_notify && notify_local_transform) {
diff --git a/scene/2d/tile_map_layer_group.h b/scene/main/node.compat.inc
index d80c244f80..69ece1a40d 100644
--- a/scene/2d/tile_map_layer_group.h
+++ b/scene/main/node.compat.inc
@@ -1,5 +1,5 @@
 /**************************************************************************/
-/*  tile_map_layer_group.h                                                */
+/*  node.compat.inc                                                       */
 /**************************************************************************/
 /*                         This file is part of:                          */
 /*                             GODOT ENGINE                               */
@@ -28,46 +28,14 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
 /**************************************************************************/
 
-#ifndef TILE_MAP_LAYER_GROUP_H
-#define TILE_MAP_LAYER_GROUP_H
+#ifndef DISABLE_DEPRECATED
 
-#include "scene/2d/node_2d.h"
+void Node::_replace_by_bind_compat_89992(Node *p_node, bool p_keep_groups) { // Wrapper for the previous two-argument `replace_by(node, keep_groups)` signature.
+	replace_by(p_node, p_keep_groups, true); // The old version always moved the children, so keep_children is forced to true.
+}
 
-class TileSet;
+void Node::_bind_compatibility_methods() {
+	ClassDB::bind_compatibility_method(D_METHOD("replace_by", "node", "keep_groups"), &Node::_replace_by_bind_compat_89992, DEFVAL(false)); // Registers the old two-argument `replace_by` form (keep_groups defaults to false).
+}
 
-class TileMapLayerGroup : public Node2D {
-	GDCLASS(TileMapLayerGroup, Node2D);
-
-private:
-	mutable Vector<StringName> selected_layers;
-	bool highlight_selected_layer = true;
-
-#ifdef TOOLS_ENABLED
-	void _cleanup_selected_layers();
-#endif
-	void _tile_set_changed();
-
-protected:
-	Ref<TileSet> tile_set;
-
-	virtual void remove_child_notify(Node *p_child) override;
-
-	static void _bind_methods();
-
-public:
-#ifdef TOOLS_ENABLED
-	// For editor use.
-	void set_selected_layers(Vector<StringName> p_layer_names);
-	Vector<StringName> get_selected_layers() const;
-	void set_highlight_selected_layer(bool p_highlight_selected_layer);
-	bool is_highlighting_selected_layer() const;
 #endif
-
-	// Accessors.
-	void set_tileset(const Ref<TileSet> &p_tileset);
-	Ref<TileSet> get_tileset() const;
-
-	~TileMapLayerGroup();
-};
-
-#endif // TILE_MAP_LAYER_GROUP_H
diff --git a/scene/main/node.cpp b/scene/main/node.cpp
index 4f97bcf348..11c200064e 100644
--- a/scene/main/node.cpp
+++ b/scene/main/node.cpp
@@ -29,6 +29,7 @@
 /**************************************************************************/
 
 #include "node.h"
+#include "node.compat.inc"
 
 #include "core/config/project_settings.h"
 #include "core/core_string_names.h"
@@ -3005,7 +3006,7 @@ static void find_owned_by(Node *p_by, Node *p_node, List<Node *> *p_owned) {
 	}
 }
 
-void Node::replace_by(Node *p_node, bool p_keep_groups) {
+void Node::replace_by(Node *p_node, bool p_keep_groups, bool p_keep_children) {
 	ERR_THREAD_GUARD
 	ERR_FAIL_NULL(p_node);
 	ERR_FAIL_COND(p_node->data.parent);
@@ -3026,13 +3027,13 @@ void Node::replace_by(Node *p_node, bool p_keep_groups) {
 	_replace_connections_target(p_node);
 
 	if (data.owner) {
-		for (int i = 0; i < get_child_count(); i++) {
-			find_owned_by(data.owner, get_child(i), &owned_by_owner);
+		if (p_keep_children) {
+			for (int i = 0; i < get_child_count(); i++) {
+				find_owned_by(data.owner, get_child(i), &owned_by_owner);
+			}
 		}
-
 		_clean_up_owner();
 	}
-
 	Node *parent = data.parent;
 	int index_in_parent = get_index(false);
 
@@ -3044,31 +3045,33 @@ void Node::replace_by(Node *p_node, bool p_keep_groups) {
 
 	emit_signal(SNAME("replacing_by"), p_node);
 
-	while (get_child_count()) {
-		Node *child = get_child(0);
-		remove_child(child);
-		if (!child->is_owned_by_parent()) {
-			// add the custom children to the p_node
-			Node *child_owner = child->get_owner() == this ? p_node : child->get_owner();
-			child->set_owner(nullptr);
-			p_node->add_child(child);
-			child->set_owner(child_owner);
+	if (p_keep_children) {
+		while (get_child_count()) {
+			Node *child = get_child(0);
+			remove_child(child);
+			if (!child->is_owned_by_parent()) {
+				// add the custom children to the p_node
+				Node *child_owner = child->get_owner() == this ? p_node : child->get_owner();
+				child->set_owner(nullptr);
+				p_node->add_child(child);
+				child->set_owner(child_owner);
+			}
 		}
-	}
 
-	p_node->set_owner(owner);
-	for (Node *E : owned) {
-		if (E->data.owner != p_node) {
-			E->set_owner(p_node);
+		for (Node *E : owned) {
+			if (E->data.owner != p_node) {
+				E->set_owner(p_node);
+			}
 		}
-	}
 
-	for (Node *E : owned_by_owner) {
-		if (E->data.owner != owner) {
-			E->set_owner(owner);
+		for (Node *E : owned_by_owner) {
+			if (E->data.owner != owner) {
+				E->set_owner(owner);
+			}
 		}
 	}
 
+	p_node->set_owner(owner);
 	p_node->set_scene_file_path(get_scene_file_path());
 }
 
@@ -3595,7 +3598,7 @@ void Node::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("create_tween"), &Node::create_tween);
 
 	ClassDB::bind_method(D_METHOD("duplicate", "flags"), &Node::duplicate, DEFVAL(DUPLICATE_USE_INSTANTIATION | DUPLICATE_SIGNALS | DUPLICATE_GROUPS | DUPLICATE_SCRIPTS));
-	ClassDB::bind_method(D_METHOD("replace_by", "node", "keep_groups"), &Node::replace_by, DEFVAL(false));
+	ClassDB::bind_method(D_METHOD("replace_by", "node", "keep_groups", "keep_children"), &Node::replace_by, DEFVAL(false), DEFVAL(true));
 
 	ClassDB::bind_method(D_METHOD("set_scene_instance_load_placeholder", "load_placeholder"), &Node::set_scene_instance_load_placeholder);
 	ClassDB::bind_method(D_METHOD("get_scene_instance_load_placeholder"), &Node::get_scene_instance_load_placeholder);
diff --git a/scene/main/node.h b/scene/main/node.h
index f49eeec9cd..99def10338 100644
--- a/scene/main/node.h
+++ b/scene/main/node.h
@@ -310,6 +310,11 @@ private:
 	Variant _call_thread_safe_bind(const Variant **p_args, int p_argcount, Callable::CallError &r_error);
 
 protected:
+#ifndef DISABLE_DEPRECATED
+	void _replace_by_bind_compat_89992(Node *p_node, bool p_keep_data = false);
+	static void _bind_compatibility_methods();
+#endif // DISABLE_DEPRECATED
+
 	void _block() { data.blocked++; }
 	void _unblock() { data.blocked--; }
 
@@ -629,7 +634,7 @@ public:
 		return binds;
 	}
 
-	void replace_by(Node *p_node, bool p_keep_data = false);
+	void replace_by(Node *p_node, bool p_keep_groups = false, bool p_keep_children = true);
 
 	void set_process_mode(ProcessMode p_mode);
 	ProcessMode get_process_mode() const;
diff --git a/scene/main/status_indicator.cpp b/scene/main/status_indicator.cpp
index ae58bc0b18..54b2ff75ca 100644
--- a/scene/main/status_indicator.cpp
+++ b/scene/main/status_indicator.cpp
@@ -67,7 +67,7 @@ void StatusIndicator::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_visible", "visible"), &StatusIndicator::set_visible);
 	ClassDB::bind_method(D_METHOD("is_visible"), &StatusIndicator::is_visible);
 
-	ADD_SIGNAL(MethodInfo("pressed", PropertyInfo(Variant::INT, "mouse_button"), PropertyInfo(Variant::VECTOR2I, "position")));
+	ADD_SIGNAL(MethodInfo("pressed", PropertyInfo(Variant::INT, "mouse_button"), PropertyInfo(Variant::VECTOR2I, "mouse_position")));
 
 	ADD_PROPERTY(PropertyInfo(Variant::STRING, "tooltip", PROPERTY_HINT_MULTILINE_TEXT), "set_tooltip", "get_tooltip");
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "icon", PROPERTY_HINT_RESOURCE_TYPE, "Image"), "set_icon", "get_icon");
diff --git a/scene/main/viewport.cpp b/scene/main/viewport.cpp
index 4e1d2b3983..5313493dc6 100644
--- a/scene/main/viewport.cpp
+++ b/scene/main/viewport.cpp
@@ -1705,9 +1705,10 @@ void Viewport::_gui_input_event(Ref<InputEvent> p_event) {
 
 		Point2 mpos = mb->get_position();
 		if (mb->is_pressed()) {
-			if (!gui.mouse_focus_mask.is_empty()) {
-				// Do not steal mouse focus and stuff while a focus mask exists.
-				gui.mouse_focus_mask.set_flag(mouse_button_to_mask(mb->get_button_index()));
+			MouseButtonMask button_mask = mouse_button_to_mask(mb->get_button_index());
+			if (!gui.mouse_focus_mask.is_empty() && !gui.mouse_focus_mask.has_flag(button_mask)) {
+				// Do not steal mouse focus and stuff while a focus mask without the current mouse button exists.
+				gui.mouse_focus_mask.set_flag(button_mask);
 			} else {
 				gui.mouse_focus = gui_find_control(mpos);
 				gui.last_mouse_focus = gui.mouse_focus;
diff --git a/scene/main/window.cpp b/scene/main/window.cpp
index 586a88ea85..65f1365e67 100644
--- a/scene/main/window.cpp
+++ b/scene/main/window.cpp
@@ -724,6 +724,9 @@ void Window::_event_callback(DisplayServer::WindowEvent p_event) {
 			if (!is_inside_tree()) {
 				return;
 			}
+			// Flush buffered input events first so that input events and window events keep their order when input is buffered or accumulated.
+			Input::get_singleton()->flush_buffered_events();
+
 			Window *root = get_tree()->get_root();
 			if (!root->gui.windowmanager_window_over) {
 #ifdef DEV_ENABLED
@@ -2717,9 +2720,6 @@ void Window::_update_mouse_over(Vector2 p_pos) {
 		if (is_embedded()) {
 			mouse_in_window = true;
 			_propagate_window_notification(this, NOTIFICATION_WM_MOUSE_ENTER);
-		} else {
-			// Prevent update based on delayed InputEvents from DisplayServer.
-			return;
 		}
 	}
 
diff --git a/scene/register_scene_types.cpp b/scene/register_scene_types.cpp
index 3583dace2a..b6efd30f63 100644
--- a/scene/register_scene_types.cpp
+++ b/scene/register_scene_types.cpp
@@ -77,6 +77,7 @@
 #include "scene/2d/skeleton_2d.h"
 #include "scene/2d/sprite_2d.h"
 #include "scene/2d/tile_map.h"
+#include "scene/2d/tile_map_layer.h"
 #include "scene/2d/touch_screen_button.h"
 #include "scene/2d/visible_on_screen_notifier_2d.h"
 #include "scene/animation/animation_blend_space_1d.h"
@@ -813,7 +814,7 @@ void register_scene_types() {
 	GDREGISTER_CLASS(TileMapPattern);
 	GDREGISTER_CLASS(TileData);
 	GDREGISTER_CLASS(TileMap);
-	GDREGISTER_ABSTRACT_CLASS(TileMapLayerGroup);
+	GDREGISTER_CLASS(TileMapLayer);
 	GDREGISTER_CLASS(Parallax2D);
 	GDREGISTER_CLASS(ParallaxBackground);
 	GDREGISTER_CLASS(ParallaxLayer);
diff --git a/scene/resources/3d/height_map_shape_3d.cpp b/scene/resources/3d/height_map_shape_3d.cpp
index 35c905bd43..5b55b66152 100644
--- a/scene/resources/3d/height_map_shape_3d.cpp
+++ b/scene/resources/3d/height_map_shape_3d.cpp
@@ -30,6 +30,7 @@
 
 #include "height_map_shape_3d.h"
 
+#include "core/io/image.h"
 #include "servers/physics_server_3d.h"
 
 Vector<Vector3> HeightMapShape3D::get_debug_mesh_lines() const {
@@ -187,6 +188,104 @@ real_t HeightMapShape3D::get_max_height() const {
 	return max_height;
 }
 
+void HeightMapShape3D::update_map_data_from_image(const Ref<Image> &p_image, real_t p_height_min, real_t p_height_max) {
+	// Replaces the whole height grid with heights decoded from a single-channel image.
+	//
+	// - `p_image` must be FORMAT_RF, FORMAT_RH or FORMAT_R8 and at least 2x2 pixels;
+	//   its width becomes `map_width` and its height becomes `map_depth`.
+	// - Pixel values are treated as normalized [0, 1] and are linearly remapped to
+	//   [`p_height_min`, `p_height_max`].
+	// - `min_height`/`max_height` are recomputed from the remapped heights before
+	//   `_update_shape()` and `emit_changed()` are called.
+	// - All argument checks run before any member is written.
+	ERR_FAIL_COND_MSG(p_image.is_null(), "Heightmap update image requires a valid Image reference.");
+	ERR_FAIL_COND_MSG(p_image->get_format() != Image::FORMAT_RF && p_image->get_format() != Image::FORMAT_RH && p_image->get_format() != Image::FORMAT_R8, "Heightmap update image requires Image in format FORMAT_RF (32 bit), FORMAT_RH (16 bit), or FORMAT_R8 (8 bit).");
+	ERR_FAIL_COND_MSG(p_image->get_width() < 2, "Heightmap update image requires a minimum Image width of 2.");
+	ERR_FAIL_COND_MSG(p_image->get_height() < 2, "Heightmap update image requires a minimum Image height of 2.");
+	ERR_FAIL_COND_MSG(p_height_min > p_height_max, "Heightmap update image requires height_max to be greater than height_min.");
+
+	// Hold our own reference to the pixel buffer for the whole decode instead of
+	// reading through `get_data().ptr()`, which points into a temporary Vector and
+	// stays valid only while the Image keeps sharing that buffer.
+	const Vector<uint8_t> image_data = p_image->get_data();
+	const uint8_t *image_bytes = image_data.ptr();
+
+	// Validation passed: from here on the stored map is overwritten.
+	map_width = p_image->get_width();
+	map_depth = p_image->get_height();
+	map_data.resize(map_width * map_depth);
+
+	real_t *map_data_ptrw = map_data.ptrw();
+
+	// Math::remap() is called with float arguments below, so convert the bounds once.
+	const float remap_height_min = float(p_height_min);
+	const float remap_height_max = float(p_height_max);
+
+	// Seed the running extremes so the first decoded height replaces both.
+	real_t new_min_height = FLT_MAX;
+	real_t new_max_height = -FLT_MAX;
+
+	// Shared per-cell work for every supported format: decode one pixel through
+	// `p_read_pixel`, remap it to a height, track the extremes and store it.
+	// A generic lambda gives each pixel format its own loop without duplicating it.
+	auto fill_heights = [&](auto p_read_pixel) {
+		for (int i = 0; i < map_data.size(); i++) {
+			const float pixel_value = p_read_pixel(i);
+
+			// Pixels are expected to be normalized; this is only checked in dev builds.
+			DEV_ASSERT(pixel_value >= 0.0 && pixel_value <= 1.0);
+
+			const real_t height_value = Math::remap(pixel_value, 0.0f, 1.0f, remap_height_min, remap_height_max);
+
+			if (height_value < new_min_height) {
+				new_min_height = height_value;
+			}
+			if (height_value > new_max_height) {
+				new_max_height = height_value;
+			}
+
+			map_data_ptrw[i] = height_value;
+		}
+	};
+
+	switch (p_image->get_format()) {
+		case Image::FORMAT_RF: {
+			// One 32-bit float per pixel, used as-is.
+			const float *pixels = reinterpret_cast<const float *>(image_bytes);
+			fill_heights([pixels](int p_index) -> float {
+				return pixels[p_index];
+			});
+		} break;
+
+		case Image::FORMAT_RH: {
+			// One 16-bit half float per pixel, widened to float.
+			const uint16_t *pixels = reinterpret_cast<const uint16_t *>(image_bytes);
+			fill_heights([pixels](int p_index) -> float {
+				return Math::half_to_float(pixels[p_index]);
+			});
+		} break;
+
+		case Image::FORMAT_R8: {
+			// One unsigned byte per pixel, normalized by dividing by 255.
+			fill_heights([image_bytes](int p_index) -> float {
+				return float(image_bytes[p_index] / 255.0);
+			});
+		} break;
+
+		default: {
+			// Not expected to be reached: other formats are rejected by the check above.
+			return;
+		}
+	}
+
+	// Publish the extremes gathered while decoding.
+	min_height = new_min_height;
+	max_height = new_max_height;
+
+	_update_shape();
+	emit_changed();
+}
+
 void HeightMapShape3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_map_width", "width"), &HeightMapShape3D::set_map_width);
 	ClassDB::bind_method(D_METHOD("get_map_width"), &HeightMapShape3D::get_map_width);
@@ -197,6 +296,8 @@ void HeightMapShape3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_min_height"), &HeightMapShape3D::get_min_height);
 	ClassDB::bind_method(D_METHOD("get_max_height"), &HeightMapShape3D::get_max_height);
 
+	ClassDB::bind_method(D_METHOD("update_map_data_from_image", "image", "height_min", "height_max"), &HeightMapShape3D::update_map_data_from_image);
+
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "map_width", PROPERTY_HINT_RANGE, "0.001,100,0.001,or_greater"), "set_map_width", "get_map_width");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "map_depth", PROPERTY_HINT_RANGE, "0.001,100,0.001,or_greater"), "set_map_depth", "get_map_depth");
 	ADD_PROPERTY(PropertyInfo(Variant::PACKED_FLOAT32_ARRAY, "map_data"), "set_map_data", "get_map_data");
diff --git a/scene/resources/3d/height_map_shape_3d.h b/scene/resources/3d/height_map_shape_3d.h
index 363d9ec0e9..33ba9c4472 100644
--- a/scene/resources/3d/height_map_shape_3d.h
+++ b/scene/resources/3d/height_map_shape_3d.h
@@ -33,6 +33,8 @@
 
 #include "scene/resources/3d/shape_3d.h"
 
+class Image;
+
 class HeightMapShape3D : public Shape3D {
 	GDCLASS(HeightMapShape3D, Shape3D);
 
@@ -57,6 +59,8 @@ public:
 	real_t get_min_height() const;
 	real_t get_max_height() const;
 
+	void update_map_data_from_image(const Ref<Image> &p_image, real_t p_height_min, real_t p_height_max);
+
 	virtual Vector<Vector3> get_debug_mesh_lines() const override;
 	virtual real_t get_enclosing_radius() const override;
 
diff --git a/scene/resources/curve.cpp b/scene/resources/curve.cpp
index 2b54acef75..8926eb1d51 100644
--- a/scene/resources/curve.cpp
+++ b/scene/resources/curve.cpp
@@ -2029,6 +2029,10 @@ Vector3 Curve3D::get_closest_point(const Vector3 &p_to_point) const {
 	return nearest;
 }
 
+PackedVector3Array Curve3D::get_points() const {
+	return _get_data()["points"]; // Returns only the "points" entry of the dictionary produced by _get_data().
+}
+
 real_t Curve3D::get_closest_offset(const Vector3 &p_to_point) const {
 	// Brute force method.
 
diff --git a/scene/resources/curve.h b/scene/resources/curve.h
index e085dfedbd..6da337a93f 100644
--- a/scene/resources/curve.h
+++ b/scene/resources/curve.h
@@ -345,6 +345,7 @@ public:
 	PackedVector3Array get_baked_up_vectors() const;
 	Vector3 get_closest_point(const Vector3 &p_to_point) const;
 	real_t get_closest_offset(const Vector3 &p_to_point) const;
+	PackedVector3Array get_points() const;
 
 	PackedVector3Array tessellate(int p_max_stages = 5, real_t p_tolerance = 4) const; // Useful for display.
 	PackedVector3Array tessellate_even_length(int p_max_stages = 5, real_t p_length = 0.2) const; // Useful for baking.
diff --git a/scene/theme/default_theme.cpp b/scene/theme/default_theme.cpp
index e4c469b752..c45f52ec9e 100644
--- a/scene/theme/default_theme.cpp
+++ b/scene/theme/default_theme.cpp
@@ -936,8 +936,6 @@ void fill_default_theme(Ref<Theme> &theme, const Ref<Font> &default_font, const
 	theme->set_stylebox("separator", "VSeparator", separator_vertical);
 
 	theme->set_icon("close", "Icons", icons["close"]);
-	theme->set_font("normal", "Fonts", Ref<Font>());
-	theme->set_font("large", "Fonts", Ref<Font>());
 
 	theme->set_constant("separation", "HSeparator", Math::round(4 * scale));
 	theme->set_constant("separation", "VSeparator", Math::round(4 * scale));
diff --git a/scu_builders.py b/scu_builders.py
index 0435c0a4f5..b180cbc864 100644
--- a/scu_builders.py
+++ b/scu_builders.py
@@ -13,9 +13,9 @@ _scu_folders = set()
 _max_includes_per_scu = 1024
 
 
-def clear_out_existing_files(output_folder, extension):
+def clear_out_stale_files(output_folder, extension, fresh_files):
     output_folder = os.path.abspath(output_folder)
-    # print("clear_out_existing_files from folder: " + output_folder)
+    # print("clear_out_stale_files from folder: " + output_folder)
 
     if not os.path.isdir(output_folder):
         # folder does not exist or has not been created yet,
@@ -23,8 +23,9 @@ def clear_out_existing_files(output_folder, extension):
         return
 
     for file in glob.glob(output_folder + "/*." + extension):
-        # print("removed pre-existing file: " + file)
-        os.remove(file)
+        if not file in fresh_files:
+            # print("removed stale file: " + file)
+            os.remove(file)
 
 
 def folder_not_found(folder):
@@ -87,11 +88,16 @@ def write_output_file(file_count, include_list, start_line, end_line, output_fol
 
     short_filename = output_filename_prefix + num_string + ".gen." + extension
     output_filename = output_folder + "/" + short_filename
-    if _verbose:
-        print("SCU: Generating: %s" % short_filename)
-
     output_path = Path(output_filename)
-    output_path.write_text(file_text, encoding="utf8")
+
+    if not output_path.exists() or output_path.read_text() != file_text:
+        if _verbose:
+            print("SCU: Generating: %s" % short_filename)
+        output_path.write_text(file_text, encoding="utf8")
+    elif _verbose:
+        print("SCU: Generation not needed for: " + short_filename)
+
+    return output_filename
 
 
 def write_exception_output_file(file_count, exception_string, output_folder, output_filename_prefix, extension):
@@ -109,11 +115,16 @@ def write_exception_output_file(file_count, exception_string, output_folder, out
     short_filename = output_filename_prefix + "_exception" + num_string + ".gen." + extension
     output_filename = output_folder + "/" + short_filename
 
-    if _verbose:
-        print("SCU: Generating: " + short_filename)
-
     output_path = Path(output_filename)
-    output_path.write_text(file_text, encoding="utf8")
+
+    if not output_path.exists() or output_path.read_text() != file_text:
+        if _verbose:
+            print("SCU: Generating: " + short_filename)
+        output_path.write_text(file_text, encoding="utf8")
+    elif _verbose:
+        print("SCU: Generation not needed for: " + short_filename)
+
+    return output_filename
 
 
 def find_section_name(sub_folder):
@@ -214,10 +225,7 @@ def process_folder(folders, sought_exceptions=[], includes_per_scu=0, extension=
     output_folder = abs_main_folder + "/scu/"
     output_filename_prefix = "scu_" + out_filename
 
-    # Clear out any existing files (usually we will be overwriting,
-    # but we want to remove any that are pre-existing that will not be
-    # overwritten, so as to not compile anything stale)
-    clear_out_existing_files(output_folder, extension)
+    fresh_files = set()
 
     for file_count in range(0, num_output_files):
         end_line = start_line + lines_per_file
@@ -226,19 +234,28 @@ def process_folder(folders, sought_exceptions=[], includes_per_scu=0, extension=
         if file_count == (num_output_files - 1):
             end_line = len(found_includes)
 
-        write_output_file(
+        fresh_file = write_output_file(
             file_count, found_includes, start_line, end_line, output_folder, output_filename_prefix, extension
         )
 
+        fresh_files.add(fresh_file)
+
         start_line = end_line
 
     # Write the exceptions each in their own scu gen file,
     # so they can effectively compile in "old style / normal build".
     for exception_count in range(len(found_exceptions)):
-        write_exception_output_file(
+        fresh_file = write_exception_output_file(
             exception_count, found_exceptions[exception_count], output_folder, output_filename_prefix, extension
         )
 
+        fresh_files.add(fresh_file)
+
+    # Remove stale files: the fresh files above were only (re)written when their
+    # content changed, so any other generated file still in the output folder is
+    # a leftover from a previous run and must be deleted so it is not compiled.
+    clear_out_stale_files(output_folder, extension, fresh_files)
+
 
 def generate_scu_files(max_includes_per_scu):
     print("=============================")
diff --git a/servers/physics_3d/godot_collision_solver_3d_sat.cpp b/servers/physics_3d/godot_collision_solver_3d_sat.cpp
index beaa30eb84..c53c8481f4 100644
--- a/servers/physics_3d/godot_collision_solver_3d_sat.cpp
+++ b/servers/physics_3d/godot_collision_solver_3d_sat.cpp
@@ -1962,7 +1962,7 @@ static void _collision_cylinder_face(const GodotShape3D *p_a, const Transform3D
 
 	// Points of B, cylinder lateral surface.
 	for (int i = 0; i < 3; i++) {
-		const Vector3 &point = vertex[i];
+		const Vector3 point = vertex[i] - p_transform_a.origin;
 		Vector3 axis = Plane(cyl_axis).project(point).normalized();
 		if (axis.dot(normal) < 0.0) {
 			axis *= -1.0;
diff --git a/servers/rendering/renderer_rd/effects/copy_effects.cpp b/servers/rendering/renderer_rd/effects/copy_effects.cpp
index abbe04b5b2..1568867663 100644
--- a/servers/rendering/renderer_rd/effects/copy_effects.cpp
+++ b/servers/rendering/renderer_rd/effects/copy_effects.cpp
@@ -583,7 +583,7 @@ void CopyEffects::copy_to_fb_rect(RID p_source_rd_texture, RID p_dest_framebuffe
 	RID shader = copy_to_fb.shader.version_get_shader(copy_to_fb.shader_version, mode);
 	ERR_FAIL_COND(shader.is_null());
 
-	RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_dest_framebuffer, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_DISCARD, Vector<Color>(), 1.0, 0, p_rect);
+	RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_dest_framebuffer, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_DISCARD, Vector<Color>(), 0.0, 0, p_rect);
 	RD::get_singleton()->draw_list_bind_render_pipeline(draw_list, copy_to_fb.pipelines[mode].get_render_pipeline(RD::INVALID_ID, RD::get_singleton()->framebuffer_get_format(p_dest_framebuffer)));
 	RD::get_singleton()->draw_list_bind_uniform_set(draw_list, uniform_set_cache->get_cache(shader, 0, u_source_rd_texture), 0);
 	if (p_secondary.is_valid()) {
@@ -982,7 +982,7 @@ void CopyEffects::set_color_raster(RID p_dest_texture, const Color &p_color, con
 	RID shader = copy_to_fb.shader.version_get_shader(copy_to_fb.shader_version, mode);
 	ERR_FAIL_COND(shader.is_null());
 
-	RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(dest_framebuffer, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_DISCARD, Vector<Color>(), 1.0, 0, p_region);
+	RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(dest_framebuffer, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_DISCARD, Vector<Color>(), 0.0, 0, p_region);
 	RD::get_singleton()->draw_list_bind_render_pipeline(draw_list, copy_to_fb.pipelines[mode].get_render_pipeline(RD::INVALID_ID, RD::get_singleton()->framebuffer_get_format(dest_framebuffer)));
 	RD::get_singleton()->draw_list_bind_index_array(draw_list, material_storage->get_quad_index_array());
 	RD::get_singleton()->draw_list_set_push_constant(draw_list, &copy_to_fb.push_constant, sizeof(CopyToFbPushConstant));
diff --git a/servers/rendering/renderer_rd/effects/debug_effects.cpp b/servers/rendering/renderer_rd/effects/debug_effects.cpp
index a57a65fd5a..017ad41fdc 100644
--- a/servers/rendering/renderer_rd/effects/debug_effects.cpp
+++ b/servers/rendering/renderer_rd/effects/debug_effects.cpp
@@ -282,7 +282,7 @@ void DebugEffects::draw_shadow_frustum(RID p_light, const Projection &p_cam_proj
 		// And draw our frustum.
 		RD::FramebufferFormatID fb_format_id = RD::get_singleton()->framebuffer_get_format(p_dest_fb);
 
-		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_dest_fb, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_DISCARD, Vector<Color>(), 1.0, 0, rect);
+		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_dest_fb, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_DISCARD, Vector<Color>(), 0.0, 0, rect);
 
 		RID pipeline = shadow_frustum.pipelines[SFP_TRANSPARENT].get_render_pipeline(frustum.vertex_format, fb_format_id);
 		RD::get_singleton()->draw_list_bind_render_pipeline(draw_list, pipeline);
@@ -326,7 +326,7 @@ void DebugEffects::draw_shadow_frustum(RID p_light, const Projection &p_cam_proj
 			rect.size.x *= atlas_rect_norm.size.x;
 			rect.size.y *= atlas_rect_norm.size.y;
 
-			draw_list = RD::get_singleton()->draw_list_begin(p_dest_fb, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_DISCARD, Vector<Color>(), 1.0, 0, rect);
+			draw_list = RD::get_singleton()->draw_list_begin(p_dest_fb, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_LOAD, RD::FINAL_ACTION_DISCARD, Vector<Color>(), 0.0, 0, rect);
 
 			pipeline = shadow_frustum.pipelines[SFP_TRANSPARENT].get_render_pipeline(frustum.vertex_format, fb_format_id);
 			RD::get_singleton()->draw_list_bind_render_pipeline(draw_list, pipeline);
diff --git a/servers/rendering/renderer_rd/effects/fsr2.cpp b/servers/rendering/renderer_rd/effects/fsr2.cpp
index bebbf51d51..925352a7d1 100644
--- a/servers/rendering/renderer_rd/effects/fsr2.cpp
+++ b/servers/rendering/renderer_rd/effects/fsr2.cpp
@@ -527,6 +527,7 @@ FSR2Effect::FSR2Effect() {
 			"\n#define FFX_GLSL 1\n"
 			"\n#define FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS 1\n"
 			"\n#define FFX_FSR2_OPTION_HDR_COLOR_INPUT 1\n"
+			"\n#define FFX_FSR2_OPTION_INVERTED_DEPTH 1\n"
 			"\n#define FFX_FSR2_OPTION_GODOT_REACTIVE_MASK_CLAMP 1\n"
 			"\n#define FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS 1\n";
 
@@ -808,7 +809,7 @@ FSR2Effect::~FSR2Effect() {
 
 FSR2Context *FSR2Effect::create_context(Size2i p_internal_size, Size2i p_target_size) {
 	FSR2Context *context = memnew(RendererRD::FSR2Context);
-	context->fsr_desc.flags = FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE;
+	context->fsr_desc.flags = FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE | FFX_FSR2_ENABLE_DEPTH_INVERTED;
 	context->fsr_desc.maxRenderSize.width = p_internal_size.x;
 	context->fsr_desc.maxRenderSize.height = p_internal_size.y;
 	context->fsr_desc.displaySize.width = p_target_size.x;
diff --git a/servers/rendering/renderer_rd/effects/ss_effects.cpp b/servers/rendering/renderer_rd/effects/ss_effects.cpp
index bdd687d9f4..3db82c8fbd 100644
--- a/servers/rendering/renderer_rd/effects/ss_effects.cpp
+++ b/servers/rendering/renderer_rd/effects/ss_effects.cpp
@@ -483,8 +483,12 @@ void SSEffects::downsample_depth(Ref<RenderSceneBuffersRD> p_render_buffers, uin
 		downsample_uniform_set = uniform_set_cache->get_cache_vec(shader, 2, u_depths);
 	}
 
-	float depth_linearize_mul = -p_projection.columns[3][2] * 0.5;
-	float depth_linearize_add = p_projection.columns[2][2];
+	Projection correction;
+	correction.set_depth_correction(false);
+	Projection temp = correction * p_projection;
+
+	float depth_linearize_mul = -temp.columns[3][2];
+	float depth_linearize_add = temp.columns[2][2];
 	if (depth_linearize_mul * depth_linearize_add < 0) {
 		depth_linearize_add = -depth_linearize_add;
 	}
diff --git a/servers/rendering/renderer_rd/environment/gi.cpp b/servers/rendering/renderer_rd/environment/gi.cpp
index 78214ede0b..c7752f8a86 100644
--- a/servers/rendering/renderer_rd/environment/gi.cpp
+++ b/servers/rendering/renderer_rd/environment/gi.cpp
@@ -3407,7 +3407,7 @@ void GI::init(SkyRD *p_sky) {
 			RD::PipelineDepthStencilState ds;
 			ds.enable_depth_test = true;
 			ds.enable_depth_write = true;
-			ds.depth_compare_operator = RD::COMPARE_OP_LESS_OR_EQUAL;
+			ds.depth_compare_operator = RD::COMPARE_OP_GREATER_OR_EQUAL;
 
 			voxel_gi_debug_shader_version_pipelines[i].setup(voxel_gi_debug_shader_version_shaders[i], RD::RENDER_PRIMITIVE_TRIANGLES, rs, RD::PipelineMultisampleState(), ds, RD::PipelineColorBlendState::create_disabled(), 0);
 		}
@@ -3575,7 +3575,7 @@ void GI::init(SkyRD *p_sky) {
 			RD::PipelineDepthStencilState ds;
 			ds.enable_depth_test = true;
 			ds.enable_depth_write = true;
-			ds.depth_compare_operator = RD::COMPARE_OP_LESS_OR_EQUAL;
+			ds.depth_compare_operator = RD::COMPARE_OP_GREATER_OR_EQUAL;
 			for (int i = 0; i < SDFGIShader::PROBE_DEBUG_MAX; i++) {
 				// TODO check if version is enabled
 
@@ -3810,8 +3810,13 @@ void GI::process_gi(Ref<RenderSceneBuffersRD> p_render_buffers, const RID *p_nor
 			rbgi->scene_data_ubo = RD::get_singleton()->uniform_buffer_create(sizeof(SceneData));
 		}
 
+		Projection correction;
+		correction.set_depth_correction(false);
+
 		for (uint32_t v = 0; v < p_view_count; v++) {
-			RendererRD::MaterialStorage::store_camera(p_projections[v].inverse(), scene_data.inv_projection[v]);
+			Projection temp = correction * p_projections[v];
+
+			RendererRD::MaterialStorage::store_camera(temp.inverse(), scene_data.inv_projection[v]);
 			scene_data.eye_offset[v][0] = p_eye_offsets[v].x;
 			scene_data.eye_offset[v][1] = p_eye_offsets[v].y;
 			scene_data.eye_offset[v][2] = p_eye_offsets[v].z;
diff --git a/servers/rendering/renderer_rd/environment/sky.cpp b/servers/rendering/renderer_rd/environment/sky.cpp
index 41609dc74d..27c07f23fa 100644
--- a/servers/rendering/renderer_rd/environment/sky.cpp
+++ b/servers/rendering/renderer_rd/environment/sky.cpp
@@ -141,7 +141,7 @@ void SkyRD::SkyShaderData::set_code(const String &p_code) {
 	for (int i = 0; i < SKY_VERSION_MAX; i++) {
 		RD::PipelineDepthStencilState depth_stencil_state;
 		depth_stencil_state.enable_depth_test = true;
-		depth_stencil_state.depth_compare_operator = RD::COMPARE_OP_LESS_OR_EQUAL;
+		depth_stencil_state.depth_compare_operator = RD::COMPARE_OP_GREATER_OR_EQUAL;
 
 		if (scene_singleton->sky.sky_shader.shader.is_variant_enabled(i)) {
 			RID shader_variant = scene_singleton->sky.sky_shader.shader.version_get_shader(version, i);
@@ -1174,6 +1174,7 @@ void SkyRD::setup_sky(RID p_env, Ref<RenderSceneBuffersRD> p_render_buffers, con
 	}
 
 	Projection correction;
+	correction.set_depth_correction(false, true);
 	correction.add_jitter_offset(p_jitter);
 
 	sky_scene_state.view_count = p_view_count;
@@ -1184,10 +1185,12 @@ void SkyRD::setup_sky(RID p_env, Ref<RenderSceneBuffersRD> p_render_buffers, con
 	for (uint32_t i = 0; i < p_view_count; i++) {
 		Projection view_inv_projection = (correction * p_view_projections[i]).inverse();
 		if (p_view_count > 1) {
+			// Reprojection is used when we need to have things in combined space.
 			RendererRD::MaterialStorage::store_camera(p_cam_projection * view_inv_projection, sky_scene_state.ubo.combined_reprojection[i]);
 		} else {
+			// This is unused so just reset to identity.
 			Projection ident;
-			RendererRD::MaterialStorage::store_camera(correction, sky_scene_state.ubo.combined_reprojection[i]);
+			RendererRD::MaterialStorage::store_camera(ident, sky_scene_state.ubo.combined_reprojection[i]);
 		}
 
 		RendererRD::MaterialStorage::store_camera(view_inv_projection, sky_scene_state.ubo.view_inv_projections[i]);
diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
index d78f3ba05b..0e69ad99b8 100644
--- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
+++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
@@ -1132,6 +1132,7 @@ void RenderForwardClustered::_update_sdfgi(RenderDataRD *p_render_data) {
 	}
 
 	if (rb.is_valid() && rb->has_custom_data(RB_SCOPE_SDFGI)) {
+		RENDER_TIMESTAMP("Render SDFGI");
 		Ref<RendererRD::GI::SDFGI> sdfgi = rb->get_custom_data(RB_SCOPE_SDFGI);
 		float exposure_normalization = 1.0;
 
@@ -1403,7 +1404,8 @@ void RenderForwardClustered::_pre_opaque_render(RenderDataRD *p_render_data, boo
 			}
 		}
 
-		//cube shadows are rendered in their own way
+		RENDER_TIMESTAMP("Render OmniLight Shadows");
+		// Cube shadows are rendered in their own way.
 		for (const int &index : p_render_data->cube_shadows) {
 			_render_shadow_pass(p_render_data->render_shadows[index].light, p_render_data->shadow_atlas, p_render_data->render_shadows[index].pass, p_render_data->render_shadows[index].instances, camera_plane, lod_distance_multiplier, p_render_data->scene_data->screen_mesh_lod_threshold, true, true, true, p_render_data->render_info, viewport_size, p_render_data->scene_data->cam_transform);
 		}
@@ -1459,6 +1461,7 @@ void RenderForwardClustered::_pre_opaque_render(RenderDataRD *p_render_data, boo
 		// drawcalls per eye/view. It will all sync up at the barrier.
 
 		if (p_use_ssao || p_use_ssil) {
+			RENDER_TIMESTAMP("Prepare Depth for SSAO/SSIL");
 			// Convert our depth buffer data to linear data in
 			for (uint32_t v = 0; v < rb->get_view_count(); v++) {
 				ss_effects->downsample_depth(rb, v, p_render_data->scene_data->view_projection[v]);
@@ -1474,6 +1477,8 @@ void RenderForwardClustered::_pre_opaque_render(RenderDataRD *p_render_data, boo
 		}
 	}
 
+	RENDER_TIMESTAMP("Pre Opaque Render");
+
 	if (current_cluster_builder) {
 		// Note: when rendering stereoscopic (multiview) we are using our combined frustum projection to create
 		// our cluster data. We use reprojection in the shader to adjust for our left/right eye.
@@ -1506,6 +1511,7 @@ void RenderForwardClustered::_pre_opaque_render(RenderDataRD *p_render_data, boo
 	}
 
 	if (rb_data.is_valid()) {
+		RENDER_TIMESTAMP("Update Volumetric Fog");
 		bool directional_shadows = RendererRD::LightStorage::get_singleton()->has_directional_shadows(directional_light_count);
 		_update_volumetric_fog(rb, p_render_data->environment, p_render_data->scene_data->cam_projection, p_render_data->scene_data->cam_transform, p_render_data->scene_data->prev_cam_transform.affine_inverse(), p_render_data->shadow_atlas, directional_light_count, directional_shadows, positional_light_count, p_render_data->voxel_gi_count, *p_render_data->fog_volumes);
 	}
@@ -1967,6 +1973,7 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co
 			WARN_PRINT_ONCE("Pre opaque rendering effects can't access resolved depth buffers.");
 		}
 
+		RENDER_TIMESTAMP("Process Pre Opaque Compositor Effects");
 		_process_compositor_effects(RS::COMPOSITOR_EFFECT_CALLBACK_TYPE_PRE_OPAQUE, p_render_data);
 	}
 
@@ -1978,6 +1985,8 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co
 	}
 	_pre_opaque_render(p_render_data, using_ssao, using_ssil, using_sdfgi || using_voxelgi, normal_roughness_views, rb_data.is_valid() && rb_data->has_voxelgi() ? rb_data->get_voxelgi() : RID());
 
+	RENDER_TIMESTAMP("Render Opaque Pass");
+
 	RD::get_singleton()->draw_command_begin_label("Render Opaque Pass");
 
 	p_render_data->scene_data->directional_light_count = p_render_data->directional_light_count;
@@ -1988,8 +1997,6 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co
 
 	_setup_environment(p_render_data, is_reflection_probe, screen_size, !is_reflection_probe, p_default_bg_color, true, using_motion_pass);
 
-	RENDER_TIMESTAMP("Render Opaque Pass");
-
 	RID rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_OPAQUE, p_render_data, radiance_texture, samplers, true);
 
 	{
@@ -2014,7 +2021,7 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co
 			uint32_t opaque_color_pass_flags = using_motion_pass ? (color_pass_flags & ~COLOR_PASS_FLAG_MOTION_VECTORS) : color_pass_flags;
 			RID opaque_framebuffer = using_motion_pass ? rb_data->get_color_pass_fb(opaque_color_pass_flags) : color_framebuffer;
 			RenderListParameters render_list_params(render_list[RENDER_LIST_OPAQUE].elements.ptr(), render_list[RENDER_LIST_OPAQUE].element_info.ptr(), render_list[RENDER_LIST_OPAQUE].elements.size(), reverse_cull, PASS_MODE_COLOR, opaque_color_pass_flags, rb_data.is_null(), p_render_data->directional_light_soft_shadows, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), p_render_data->scene_data->lod_distance_multiplier, p_render_data->scene_data->screen_mesh_lod_threshold, p_render_data->scene_data->view_count, 0, spec_constant_base_flags);
-			_render_list_with_draw_list(&render_list_params, opaque_framebuffer, load_color ? RD::INITIAL_ACTION_LOAD : RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, depth_pre_pass ? RD::INITIAL_ACTION_LOAD : RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, c, 1.0, 0);
+			_render_list_with_draw_list(&render_list_params, opaque_framebuffer, load_color ? RD::INITIAL_ACTION_LOAD : RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, depth_pre_pass ? RD::INITIAL_ACTION_LOAD : RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, c, 0.0, 0);
 		}
 
 		RD::get_singleton()->draw_command_end_label();
@@ -2053,6 +2060,7 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co
 			}
 		}
 
+		RENDER_TIMESTAMP("Process Post Opaque Compositor Effects");
 		_process_compositor_effects(RS::COMPOSITOR_EFFECT_CALLBACK_TYPE_POST_OPAQUE, p_render_data);
 	}
 
@@ -2113,6 +2121,7 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co
 	}
 
 	{
+		RENDER_TIMESTAMP("Process Post Sky Compositor Effects");
 		// Don't need to check for depth or color resolve here, we've already triggered it.
 		_process_compositor_effects(RS::COMPOSITOR_EFFECT_CALLBACK_TYPE_POST_SKY, p_render_data);
 	}
@@ -2190,6 +2199,7 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co
 			}
 		}
 
+		RENDER_TIMESTAMP("Process Pre Transparent Compositor Effects");
 		_process_compositor_effects(RS::COMPOSITOR_EFFECT_CALLBACK_TYPE_PRE_TRANSPARENT, p_render_data);
 	}
 
@@ -2234,6 +2244,7 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co
 	RD::get_singleton()->draw_command_end_label();
 
 	{
+		RENDER_TIMESTAMP("Process Post Transparent Compositor Effects");
 		_process_compositor_effects(RS::COMPOSITOR_EFFECT_CALLBACK_TYPE_POST_TRANSPARENT, p_render_data);
 	}
 
@@ -2629,7 +2640,7 @@ void RenderForwardClustered::_render_shadow_end() {
 
 	for (SceneState::ShadowPass &shadow_pass : scene_state.shadow_passes) {
 		RenderListParameters render_list_parameters(render_list[RENDER_LIST_SECONDARY].elements.ptr() + shadow_pass.element_from, render_list[RENDER_LIST_SECONDARY].element_info.ptr() + shadow_pass.element_from, shadow_pass.element_count, shadow_pass.flip_cull, shadow_pass.pass_mode, 0, true, false, shadow_pass.rp_uniform_set, false, Vector2(), shadow_pass.lod_distance_multiplier, shadow_pass.screen_mesh_lod_threshold, 1, shadow_pass.element_from);
-		_render_list_with_draw_list(&render_list_parameters, shadow_pass.framebuffer, RD::INITIAL_ACTION_DISCARD, RD::FINAL_ACTION_DISCARD, shadow_pass.initial_depth_action, RD::FINAL_ACTION_STORE, Vector<Color>(), 1.0, 0, shadow_pass.rect);
+		_render_list_with_draw_list(&render_list_parameters, shadow_pass.framebuffer, RD::INITIAL_ACTION_DISCARD, RD::FINAL_ACTION_DISCARD, shadow_pass.initial_depth_action, RD::FINAL_ACTION_STORE, Vector<Color>(), 0.0, 0, shadow_pass.rect);
 	}
 
 	RD::get_singleton()->draw_command_end_label();
@@ -2729,7 +2740,7 @@ void RenderForwardClustered::_render_material(const Transform3D &p_cam_transform
 			Color(0, 0, 0, 0)
 		};
 
-		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_framebuffer, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, clear, 1.0, 0, p_region);
+		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_framebuffer, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, clear, 0.0, 0, p_region);
 		_render_list(draw_list, RD::get_singleton()->framebuffer_get_format(p_framebuffer), &render_list_params, 0, render_list_params.element_count);
 		RD::get_singleton()->draw_list_end();
 	}
@@ -2779,7 +2790,7 @@ void RenderForwardClustered::_render_uv2(const PagedArray<RenderGeometryInstance
 			Color(0, 0, 0, 0),
 			Color(0, 0, 0, 0)
 		};
-		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_framebuffer, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, clear, 1.0, 0, p_region);
+		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_framebuffer, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, clear, 0.0, 0, p_region);
 
 		const int uv_offset_count = 9;
 		static const Vector2 uv_offsets[uv_offset_count] = {
@@ -2885,7 +2896,7 @@ void RenderForwardClustered::_render_sdfgi(Ref<RenderSceneBuffersRD> p_render_bu
 		}
 
 		RenderListParameters render_list_params(render_list[RENDER_LIST_SECONDARY].elements.ptr(), render_list[RENDER_LIST_SECONDARY].element_info.ptr(), render_list[RENDER_LIST_SECONDARY].elements.size(), true, pass_mode, 0, true, false, rp_uniform_set, false);
-		_render_list_with_draw_list(&render_list_params, E->value, RD::INITIAL_ACTION_DISCARD, RD::FINAL_ACTION_DISCARD, RD::INITIAL_ACTION_DISCARD, RD::FINAL_ACTION_DISCARD, Vector<Color>(), 1.0, 0, Rect2());
+		_render_list_with_draw_list(&render_list_params, E->value, RD::INITIAL_ACTION_DISCARD, RD::FINAL_ACTION_DISCARD, RD::INITIAL_ACTION_DISCARD, RD::FINAL_ACTION_DISCARD, Vector<Color>(), 0.0, 0, Rect2());
 	}
 
 	RD::get_singleton()->draw_command_end_label();
@@ -4265,7 +4276,7 @@ RenderForwardClustered::RenderForwardClustered() {
 		sampler.mag_filter = RD::SAMPLER_FILTER_NEAREST;
 		sampler.min_filter = RD::SAMPLER_FILTER_NEAREST;
 		sampler.enable_compare = true;
-		sampler.compare_op = RD::COMPARE_OP_LESS;
+		sampler.compare_op = RD::COMPARE_OP_GREATER;
 		shadow_sampler = RD::get_singleton()->sampler_create(sampler);
 	}
 
diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h
index 12af8822b4..1f12d92754 100644
--- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h
+++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h
@@ -379,7 +379,7 @@ class RenderForwardClustered : public RendererSceneRenderRD {
 	template <PassMode p_pass_mode, uint32_t p_color_pass_flags = 0>
 	_FORCE_INLINE_ void _render_list_template(RenderingDevice::DrawListID p_draw_list, RenderingDevice::FramebufferFormatID p_framebuffer_Format, RenderListParameters *p_params, uint32_t p_from_element, uint32_t p_to_element);
 	void _render_list(RenderingDevice::DrawListID p_draw_list, RenderingDevice::FramebufferFormatID p_framebuffer_Format, RenderListParameters *p_params, uint32_t p_from_element, uint32_t p_to_element);
-	void _render_list_with_draw_list(RenderListParameters *p_params, RID p_framebuffer, RD::InitialAction p_initial_color_action, RD::FinalAction p_final_color_action, RD::InitialAction p_initial_depth_action, RD::FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth = 1.0, uint32_t p_clear_stencil = 0, const Rect2 &p_region = Rect2());
+	void _render_list_with_draw_list(RenderListParameters *p_params, RID p_framebuffer, RD::InitialAction p_initial_color_action, RD::FinalAction p_final_color_action, RD::InitialAction p_initial_depth_action, RD::FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth = 0.0, uint32_t p_clear_stencil = 0, const Rect2 &p_region = Rect2());
 
 	void _update_instance_data_buffer(RenderListType p_render_list);
 	void _fill_instance_data(RenderListType p_render_list, int *p_render_info = nullptr, uint32_t p_offset = 0, int32_t p_max_elements = -1, bool p_update_buffer = true);
diff --git a/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp b/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp
index 0b504eca0a..209fabeddf 100644
--- a/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp
+++ b/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp
@@ -260,7 +260,7 @@ void SceneShaderForwardClustered::ShaderData::set_code(const String &p_code) {
 
 	if (depth_test != DEPTH_TEST_DISABLED) {
 		depth_stencil_state.enable_depth_test = true;
-		depth_stencil_state.depth_compare_operator = RD::COMPARE_OP_LESS_OR_EQUAL;
+		depth_stencil_state.depth_compare_operator = RD::COMPARE_OP_GREATER_OR_EQUAL;
 		depth_stencil_state.enable_depth_write = depth_draw != DEPTH_DRAW_DISABLED ? true : false;
 	}
 	bool depth_pre_pass_enabled = bool(GLOBAL_GET("rendering/driver/depth_prepass/enable"));
@@ -827,7 +827,7 @@ void fragment() {
 		sampler.mag_filter = RD::SAMPLER_FILTER_LINEAR;
 		sampler.min_filter = RD::SAMPLER_FILTER_LINEAR;
 		sampler.enable_compare = true;
-		sampler.compare_op = RD::COMPARE_OP_LESS;
+		sampler.compare_op = RD::COMPARE_OP_GREATER;
 		shadow_sampler = RD::get_singleton()->sampler_create(sampler);
 	}
 }
diff --git a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp
index 2f307c62f3..b54f511d05 100644
--- a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp
+++ b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp
@@ -984,7 +984,7 @@ void RenderForwardMobile::_render_scene(RenderDataRD *p_render_data, const Color
 			}
 		}
 
-		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(framebuffer, load_color ? RD::INITIAL_ACTION_LOAD : RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, c, 1.0, 0);
+		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(framebuffer, load_color ? RD::INITIAL_ACTION_LOAD : RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, c, 0.0, 0);
 		RD::FramebufferFormatID fb_format = RD::get_singleton()->framebuffer_get_format(framebuffer);
 
 		if (copy_canvas) {
@@ -1385,7 +1385,7 @@ void RenderForwardMobile::_render_shadow_end() {
 
 	for (SceneState::ShadowPass &shadow_pass : scene_state.shadow_passes) {
 		RenderListParameters render_list_parameters(render_list[RENDER_LIST_SECONDARY].elements.ptr() + shadow_pass.element_from, render_list[RENDER_LIST_SECONDARY].element_info.ptr() + shadow_pass.element_from, shadow_pass.element_count, shadow_pass.flip_cull, shadow_pass.pass_mode, shadow_pass.rp_uniform_set, 0, false, Vector2(), shadow_pass.lod_distance_multiplier, shadow_pass.screen_mesh_lod_threshold, 1, shadow_pass.element_from);
-		_render_list_with_draw_list(&render_list_parameters, shadow_pass.framebuffer, RD::INITIAL_ACTION_DISCARD, RD::FINAL_ACTION_DISCARD, shadow_pass.initial_depth_action, RD::FINAL_ACTION_STORE, Vector<Color>(), 1.0, 0, shadow_pass.rect);
+		_render_list_with_draw_list(&render_list_parameters, shadow_pass.framebuffer, RD::INITIAL_ACTION_DISCARD, RD::FINAL_ACTION_DISCARD, shadow_pass.initial_depth_action, RD::FINAL_ACTION_STORE, Vector<Color>(), 0.0, 0, shadow_pass.rect);
 	}
 
 	RD::get_singleton()->draw_command_end_label();
@@ -1437,7 +1437,7 @@ void RenderForwardMobile::_render_material(const Transform3D &p_cam_transform, c
 			Color(0, 0, 0, 0),
 			Color(0, 0, 0, 0)
 		};
-		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_framebuffer, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, clear, 1.0, 0, p_region);
+		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_framebuffer, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, clear, 0.0, 0, p_region);
 		_render_list(draw_list, RD::get_singleton()->framebuffer_get_format(p_framebuffer), &render_list_params, 0, render_list_params.element_count);
 		RD::get_singleton()->draw_list_end();
 	}
@@ -1483,7 +1483,7 @@ void RenderForwardMobile::_render_uv2(const PagedArray<RenderGeometryInstance *>
 			Color(0, 0, 0, 0)
 		};
 
-		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_framebuffer, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, clear, 1.0, 0, p_region);
+		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_framebuffer, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_STORE, clear, 0.0, 0, p_region);
 
 		const int uv_offset_count = 9;
 		static const Vector2 uv_offsets[uv_offset_count] = {
diff --git a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h
index 5c02204627..f29503e5ec 100644
--- a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h
+++ b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h
@@ -350,7 +350,7 @@ private:
 	template <PassMode p_pass_mode>
 	_FORCE_INLINE_ void _render_list_template(RenderingDevice::DrawListID p_draw_list, RenderingDevice::FramebufferFormatID p_framebuffer_Format, RenderListParameters *p_params, uint32_t p_from_element, uint32_t p_to_element);
 	void _render_list(RenderingDevice::DrawListID p_draw_list, RenderingDevice::FramebufferFormatID p_framebuffer_Format, RenderListParameters *p_params, uint32_t p_from_element, uint32_t p_to_element);
-	void _render_list_with_draw_list(RenderListParameters *p_params, RID p_framebuffer, RD::InitialAction p_initial_color_action, RD::FinalAction p_final_color_action, RD::InitialAction p_initial_depth_action, RD::FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth = 1.0, uint32_t p_clear_stencil = 0, const Rect2 &p_region = Rect2());
+	void _render_list_with_draw_list(RenderListParameters *p_params, RID p_framebuffer, RD::InitialAction p_initial_color_action, RD::FinalAction p_final_color_action, RD::InitialAction p_initial_depth_action, RD::FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth = 0.0, uint32_t p_clear_stencil = 0, const Rect2 &p_region = Rect2());
 
 	RenderList render_list[RENDER_LIST_MAX];
 
diff --git a/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp b/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp
index 95ba76a707..a2f112669c 100644
--- a/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp
+++ b/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp
@@ -271,7 +271,7 @@ void SceneShaderForwardMobile::ShaderData::set_code(const String &p_code) {
 
 	if (depth_test != DEPTH_TEST_DISABLED) {
 		depth_stencil_state.enable_depth_test = true;
-		depth_stencil_state.depth_compare_operator = RD::COMPARE_OP_LESS_OR_EQUAL;
+		depth_stencil_state.depth_compare_operator = RD::COMPARE_OP_GREATER_OR_EQUAL;
 		depth_stencil_state.enable_depth_write = depth_draw != DEPTH_DRAW_DISABLED ? true : false;
 	}
 
@@ -731,7 +731,7 @@ void fragment() {
 		sampler.mag_filter = RD::SAMPLER_FILTER_LINEAR;
 		sampler.min_filter = RD::SAMPLER_FILTER_LINEAR;
 		sampler.enable_compare = true;
-		sampler.compare_op = RD::COMPARE_OP_LESS;
+		sampler.compare_op = RD::COMPARE_OP_GREATER;
 		shadow_sampler = RD::get_singleton()->sampler_create(sampler);
 	}
 }
diff --git a/servers/rendering/renderer_rd/shaders/effects/bokeh_dof.glsl b/servers/rendering/renderer_rd/shaders/effects/bokeh_dof.glsl
index fe770ac065..48c1b0a3f6 100644
--- a/servers/rendering/renderer_rd/shaders/effects/bokeh_dof.glsl
+++ b/servers/rendering/renderer_rd/shaders/effects/bokeh_dof.glsl
@@ -32,9 +32,9 @@ layout(set = 1, binding = 0) uniform sampler2D source_bokeh;
 float get_depth_at_pos(vec2 uv) {
 	float depth = textureLod(source_depth, uv, 0.0).x * 2.0 - 1.0;
 	if (params.orthogonal) {
-		depth = ((depth + (params.z_far + params.z_near) / (params.z_far - params.z_near)) * (params.z_far - params.z_near)) / 2.0;
+		depth = -(depth * (params.z_far - params.z_near) - (params.z_far + params.z_near)) / 2.0;
 	} else {
-		depth = 2.0 * params.z_near * params.z_far / (params.z_far + params.z_near - depth * (params.z_far - params.z_near));
+		depth = 2.0 * params.z_near * params.z_far / (params.z_far + params.z_near + depth * (params.z_far - params.z_near));
 	}
 	return depth;
 }
diff --git a/servers/rendering/renderer_rd/shaders/effects/bokeh_dof_raster.glsl b/servers/rendering/renderer_rd/shaders/effects/bokeh_dof_raster.glsl
index 947aa793d9..2010b58474 100644
--- a/servers/rendering/renderer_rd/shaders/effects/bokeh_dof_raster.glsl
+++ b/servers/rendering/renderer_rd/shaders/effects/bokeh_dof_raster.glsl
@@ -53,9 +53,9 @@ layout(set = 2, binding = 0) uniform sampler2D original_weight;
 float get_depth_at_pos(vec2 uv) {
 	float depth = textureLod(source_depth, uv, 0.0).x * 2.0 - 1.0;
 	if (params.orthogonal) {
-		depth = ((depth + (params.z_far + params.z_near) / (params.z_far - params.z_near)) * (params.z_far - params.z_near)) / 2.0;
+		depth = -(depth * (params.z_far - params.z_near) - (params.z_far + params.z_near)) / 2.0;
 	} else {
-		depth = 2.0 * params.z_near * params.z_far / (params.z_far + params.z_near - depth * (params.z_far - params.z_near));
+		depth = 2.0 * params.z_near * params.z_far / (params.z_far + params.z_near + depth * (params.z_far - params.z_near));
 	}
 	return depth;
 }
diff --git a/servers/rendering/renderer_rd/shaders/effects/cube_to_dp.glsl b/servers/rendering/renderer_rd/shaders/effects/cube_to_dp.glsl
index e77d0de719..3fb93dda35 100644
--- a/servers/rendering/renderer_rd/shaders/effects/cube_to_dp.glsl
+++ b/servers/rendering/renderer_rd/shaders/effects/cube_to_dp.glsl
@@ -77,8 +77,8 @@ void main() {
 	float depth_fix = 1.0 / dot(normal, unorm);
 
 	depth = 2.0 * depth - 1.0;
-	float linear_depth = 2.0 * params.z_near * params.z_far / (params.z_far + params.z_near - depth * (params.z_far - params.z_near));
-	depth = (linear_depth * depth_fix) / params.z_far;
-
+	float linear_depth = 2.0 * params.z_near * params.z_far / (params.z_far + params.z_near + depth * (params.z_far - params.z_near));
+	// linear_depth is equal to the view-space depth.
+	depth = (params.z_far - linear_depth * depth_fix) / params.z_far;
 	gl_FragDepth = depth;
 }
diff --git a/servers/rendering/renderer_rd/shaders/effects/screen_space_reflection_scale.glsl b/servers/rendering/renderer_rd/shaders/effects/screen_space_reflection_scale.glsl
index 51caa67d3c..d9e21b8cd1 100644
--- a/servers/rendering/renderer_rd/shaders/effects/screen_space_reflection_scale.glsl
+++ b/servers/rendering/renderer_rd/shaders/effects/screen_space_reflection_scale.glsl
@@ -98,9 +98,9 @@ void main() {
 			// unproject our Z value so we can use it directly.
 			depth = depth * 2.0 - 1.0;
 			if (params.orthogonal) {
-				depth = ((depth + (params.camera_z_far + params.camera_z_near) / (params.camera_z_far - params.camera_z_near)) * (params.camera_z_far - params.camera_z_near)) / 2.0;
+				depth = -(depth * (params.camera_z_far - params.camera_z_near) - (params.camera_z_far + params.camera_z_near)) / 2.0;
 			} else {
-				depth = 2.0 * params.camera_z_near * params.camera_z_far / (params.camera_z_far + params.camera_z_near - depth * (params.camera_z_far - params.camera_z_near));
+				depth = 2.0 * params.camera_z_near * params.camera_z_far / (params.camera_z_far + params.camera_z_near + depth * (params.camera_z_far - params.camera_z_near));
 			}
 			depth = -depth;
 		}
diff --git a/servers/rendering/renderer_rd/shaders/environment/gi.glsl b/servers/rendering/renderer_rd/shaders/environment/gi.glsl
index 80ed34cda1..480172f9dc 100644
--- a/servers/rendering/renderer_rd/shaders/environment/gi.glsl
+++ b/servers/rendering/renderer_rd/shaders/environment/gi.glsl
@@ -174,9 +174,9 @@ vec3 reconstruct_position(ivec2 screen_pos) {
 
 		pos.z = pos.z * 2.0 - 1.0;
 		if (params.orthogonal) {
-			pos.z = ((pos.z + (params.z_far + params.z_near) / (params.z_far - params.z_near)) * (params.z_far - params.z_near)) / 2.0;
+			pos.z = -(pos.z * (params.z_far - params.z_near) - (params.z_far + params.z_near)) / 2.0;
 		} else {
-			pos.z = 2.0 * params.z_near * params.z_far / (params.z_far + params.z_near - pos.z * (params.z_far - params.z_near));
+			pos.z = 2.0 * params.z_near * params.z_far / (params.z_far + params.z_near + pos.z * (params.z_far - params.z_near));
 		}
 		pos.z = -pos.z;
 
diff --git a/servers/rendering/renderer_rd/shaders/environment/sky.glsl b/servers/rendering/renderer_rd/shaders/environment/sky.glsl
index 4e5b11aed8..35457a2482 100644
--- a/servers/rendering/renderer_rd/shaders/environment/sky.glsl
+++ b/servers/rendering/renderer_rd/shaders/environment/sky.glsl
@@ -25,7 +25,7 @@ params;
 void main() {
 	vec2 base_arr[3] = vec2[](vec2(-1.0, -3.0), vec2(-1.0, 1.0), vec2(3.0, 1.0));
 	uv_interp = base_arr[gl_VertexIndex];
-	gl_Position = vec4(uv_interp, 1.0, 1.0);
+	gl_Position = vec4(uv_interp, 0.0, 1.0);
 }
 
 #[fragment]
@@ -158,7 +158,7 @@ vec3 interleaved_gradient_noise(vec2 pos) {
 
 vec4 volumetric_fog_process(vec2 screen_uv) {
 #ifdef USE_MULTIVIEW
-	vec4 reprojected = sky_scene_data.combined_reprojection[ViewIndex] * (vec4(screen_uv * 2.0 - 1.0, 1.0, 1.0) * sky_scene_data.z_far);
+	vec4 reprojected = sky_scene_data.combined_reprojection[ViewIndex] * vec4(screen_uv * 2.0 - 1.0, 0.0, 1.0); // Unproject at the far plane
 	vec3 fog_pos = vec3(reprojected.xy / reprojected.w, 1.0) * 0.5 + 0.5;
 #else
 	vec3 fog_pos = vec3(screen_uv, 1.0);
@@ -187,9 +187,11 @@ void main() {
 	vec3 cube_normal;
 #ifdef USE_MULTIVIEW
 	// In multiview our projection matrices will contain positional and rotational offsets that we need to properly unproject.
-	vec4 unproject = vec4(uv_interp.x, -uv_interp.y, 1.0, 1.0);
+	vec4 unproject = vec4(uv_interp.x, -uv_interp.y, 0.0, 1.0); // unproject at the far plane
 	vec4 unprojected = sky_scene_data.view_inv_projections[ViewIndex] * unproject;
 	cube_normal = unprojected.xyz / unprojected.w;
+
+	// Unprojecting gives us the position between the eyes, so we need to re-apply the per-eye offset.
 	cube_normal += sky_scene_data.view_eye_offsets[ViewIndex].xyz;
 #else
 	cube_normal.z = -1.0;
diff --git a/servers/rendering/renderer_rd/shaders/environment/volumetric_fog_process.glsl b/servers/rendering/renderer_rd/shaders/environment/volumetric_fog_process.glsl
index 57b9a4c320..d0cfe6a3b8 100644
--- a/servers/rendering/renderer_rd/shaders/environment/volumetric_fog_process.glsl
+++ b/servers/rendering/renderer_rd/shaders/environment/volumetric_fog_process.glsl
@@ -416,7 +416,7 @@ void main() {
 					}
 
 					float depth = texture(sampler2D(directional_shadow_atlas, linear_sampler), pssm_coord.xy).r;
-					float shadow = exp(min(0.0, (depth - pssm_coord.z)) * z_range * INV_FOG_FADE);
+					float shadow = exp(min(0.0, (pssm_coord.z - depth)) * z_range * INV_FOG_FADE);
 
 					shadow = mix(shadow, 1.0, smoothstep(directional_lights.data[i].fade_from, directional_lights.data[i].fade_to, view_pos.z)); //done with negative values for performance
 
@@ -519,7 +519,7 @@ void main() {
 
 							float depth = texture(sampler2D(shadow_atlas, linear_sampler), pos.xy).r;
 
-							shadow_attenuation = mix(1.0 - omni_lights.data[light_index].shadow_opacity, 1.0, exp(min(0.0, (depth - pos.z)) / omni_lights.data[light_index].inv_radius * INV_FOG_FADE));
+							shadow_attenuation = mix(1.0 - omni_lights.data[light_index].shadow_opacity, 1.0, exp(min(0.0, (pos.z - depth)) / omni_lights.data[light_index].inv_radius * INV_FOG_FADE));
 						}
 						total_light += light * attenuation * shadow_attenuation * henyey_greenstein(dot(normalize(light_pos - view_pos), normalize(view_pos)), params.phase_g) * omni_lights.data[light_index].volumetric_fog_energy;
 					}
@@ -597,7 +597,7 @@ void main() {
 
 							float depth = texture(sampler2D(shadow_atlas, linear_sampler), pos.xy).r;
 
-							shadow_attenuation = mix(1.0 - spot_lights.data[light_index].shadow_opacity, 1.0, exp(min(0.0, (depth - pos.z)) / spot_lights.data[light_index].inv_radius * INV_FOG_FADE));
+							shadow_attenuation = mix(1.0 - spot_lights.data[light_index].shadow_opacity, 1.0, exp(min(0.0, (pos.z - depth)) / spot_lights.data[light_index].inv_radius * INV_FOG_FADE));
 						}
 						total_light += light * attenuation * shadow_attenuation * henyey_greenstein(dot(normalize(light_rel_vec), normalize(view_pos)), params.phase_g) * spot_lights.data[light_index].volumetric_fog_energy;
 					}
diff --git a/servers/rendering/renderer_rd/shaders/forward_clustered/scene_forward_clustered.glsl b/servers/rendering/renderer_rd/shaders/forward_clustered/scene_forward_clustered.glsl
index 6eae64c04e..359d7799e5 100644
--- a/servers/rendering/renderer_rd/shaders/forward_clustered/scene_forward_clustered.glsl
+++ b/servers/rendering/renderer_rd/shaders/forward_clustered/scene_forward_clustered.glsl
@@ -487,8 +487,8 @@ void vertex_shader(vec3 vertex_input,
 
 #ifdef MODE_RENDER_DEPTH
 	if (scene_data.pancake_shadows) {
-		if (gl_Position.z <= 0.00001) {
-			gl_Position.z = 0.00001;
+		if (gl_Position.z >= 0.9999) {
+			gl_Position.z = 0.9999;
 		}
 	}
 #endif
diff --git a/servers/rendering/renderer_rd/shaders/forward_mobile/scene_forward_mobile.glsl b/servers/rendering/renderer_rd/shaders/forward_mobile/scene_forward_mobile.glsl
index 259edc63a0..c26313092b 100644
--- a/servers/rendering/renderer_rd/shaders/forward_mobile/scene_forward_mobile.glsl
+++ b/servers/rendering/renderer_rd/shaders/forward_mobile/scene_forward_mobile.glsl
@@ -475,8 +475,8 @@ void main() {
 
 #ifdef MODE_RENDER_DEPTH
 	if (scene_data.pancake_shadows) {
-		if (gl_Position.z <= 0.00001) {
-			gl_Position.z = 0.00001;
+		if (gl_Position.z >= 0.9999) {
+			gl_Position.z = 0.9999;
 		}
 	}
 #endif // MODE_RENDER_DEPTH
diff --git a/servers/rendering/renderer_rd/shaders/scene_forward_lights_inc.glsl b/servers/rendering/renderer_rd/shaders/scene_forward_lights_inc.glsl
index e9722bad1f..47e6fe5873 100644
--- a/servers/rendering/renderer_rd/shaders/scene_forward_lights_inc.glsl
+++ b/servers/rendering/renderer_rd/shaders/scene_forward_lights_inc.glsl
@@ -454,7 +454,7 @@ float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 			vec3 v0 = abs(basis_normal.z) < 0.999 ? vec3(0.0, 0.0, 1.0) : vec3(0.0, 1.0, 0.0);
 			vec3 tangent = normalize(cross(v0, basis_normal));
 			vec3 bitangent = normalize(cross(tangent, basis_normal));
-			float z_norm = shadow_len * omni_lights.data[idx].inv_radius;
+			float z_norm = 1.0 - shadow_len * omni_lights.data[idx].inv_radius;
 
 			tangent *= omni_lights.data[idx].soft_shadow_size * omni_lights.data[idx].soft_shadow_scale;
 			bitangent *= omni_lights.data[idx].soft_shadow_size * omni_lights.data[idx].soft_shadow_scale;
@@ -479,7 +479,7 @@ float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 				pos.xy = uv_rect.xy + pos.xy * uv_rect.zw;
 
 				float d = textureLod(sampler2D(shadow_atlas, SAMPLER_LINEAR_CLAMP), pos.xy, 0.0).r;
-				if (d < z_norm) {
+				if (d > z_norm) {
 					blocker_average += d;
 					blocker_count += 1.0;
 				}
@@ -488,11 +488,11 @@ float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 			if (blocker_count > 0.0) {
 				//blockers found, do soft shadow
 				blocker_average /= blocker_count;
-				float penumbra = (z_norm - blocker_average) / blocker_average;
+				float penumbra = (blocker_average - z_norm) / (1.0 - blocker_average); // Reverse-Z form of (receiver - blocker) / blocker.
 				tangent *= penumbra;
 				bitangent *= penumbra;
 
-				z_norm -= omni_lights.data[idx].inv_radius * omni_lights.data[idx].shadow_bias;
+				z_norm += omni_lights.data[idx].inv_radius * omni_lights.data[idx].shadow_bias;
 
 				shadow = 0.0;
 				for (uint i = 0; i < sc_penumbra_shadow_samples; i++) {
@@ -536,6 +536,7 @@ float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 			vec2 pos = shadow_sample.xy / shadow_sample.z;
 			float depth = shadow_len - omni_lights.data[idx].shadow_bias;
 			depth *= omni_lights.data[idx].inv_radius;
+			depth = 1.0 - depth;
 			shadow = mix(1.0, sample_omni_pcf_shadow(shadow_atlas, omni_lights.data[idx].soft_shadow_scale / shadow_sample.z, pos, uv_rect, flip_offset, depth), omni_lights.data[idx].shadow_opacity);
 		}
 
@@ -706,7 +707,7 @@ float light_process_spot_shadow(uint idx, vec3 vertex, vec3 normal) {
 		vec4 v = vec4(vertex + normal_bias, 1.0);
 
 		vec4 splane = (spot_lights.data[idx].shadow_matrix * v);
-		splane.z -= spot_lights.data[idx].shadow_bias / (light_length * spot_lights.data[idx].inv_radius);
+		splane.z += spot_lights.data[idx].shadow_bias / (light_length * spot_lights.data[idx].inv_radius);
 		splane /= splane.w;
 
 		float shadow;
diff --git a/servers/rendering/renderer_rd/storage_rd/light_storage.cpp b/servers/rendering/renderer_rd/storage_rd/light_storage.cpp
index cf8c29e624..d1ff9fc362 100644
--- a/servers/rendering/renderer_rd/storage_rd/light_storage.cpp
+++ b/servers/rendering/renderer_rd/storage_rd/light_storage.cpp
@@ -668,7 +668,9 @@ void LightStorage::update_light_buffers(RenderDataRD *p_render_data, const Paged
 					light_data.blend_splits = (smode != RS::LIGHT_DIRECTIONAL_SHADOW_ORTHOGONAL) && light->directional_blend_splits;
 					for (int j = 0; j < 4; j++) {
 						Rect2 atlas_rect = light_instance->shadow_transform[j].atlas_rect;
-						Projection matrix = light_instance->shadow_transform[j].camera;
+						Projection correction;
+						correction.set_depth_correction(false, true, false);
+						Projection matrix = correction * light_instance->shadow_transform[j].camera;
 						float split = light_instance->shadow_transform[MIN(limit, j)].split;
 
 						Projection bias;
@@ -967,7 +969,9 @@ void LightStorage::update_light_buffers(RenderDataRD *p_render_data, const Paged
 				Projection bias;
 				bias.set_light_bias();
 
-				Projection cm = light_instance->shadow_transform[0].camera;
+				Projection correction;
+				correction.set_depth_correction(false, true, false);
+				Projection cm = correction * light_instance->shadow_transform[0].camera;
 				Projection shadow_mtx = bias * cm * modelview;
 				RendererRD::MaterialStorage::store_camera(shadow_mtx, light_data.shadow_matrix);
 
diff --git a/servers/rendering/renderer_rd/storage_rd/particles_storage.cpp b/servers/rendering/renderer_rd/storage_rd/particles_storage.cpp
index e78b8de4db..c9c7c53d04 100644
--- a/servers/rendering/renderer_rd/storage_rd/particles_storage.cpp
+++ b/servers/rendering/renderer_rd/storage_rd/particles_storage.cpp
@@ -1388,6 +1388,11 @@ void ParticlesStorage::_particles_update_buffers(Particles *particles) {
 	}
 }
 void ParticlesStorage::update_particles() {
+	if (!particle_update_list.first()) {
+		return;
+	}
+
+	RENDER_TIMESTAMP("Update GPUParticles");
 	uint32_t frame = RSG::rasterizer->get_frame_number();
 	bool uses_motion_vectors = RSG::viewport->get_num_viewports_with_motion_vectors() > 0;
 	while (particle_update_list.first()) {
diff --git a/servers/rendering/renderer_rd/storage_rd/texture_storage.cpp b/servers/rendering/renderer_rd/storage_rd/texture_storage.cpp
index f3ce432495..da046bf6b1 100644
--- a/servers/rendering/renderer_rd/storage_rd/texture_storage.cpp
+++ b/servers/rendering/renderer_rd/storage_rd/texture_storage.cpp
@@ -2965,7 +2965,7 @@ void TextureStorage::update_decal_buffer(const PagedArray<RID> &p_decals, const
 			dd.emission_rect[3] = 0;
 		}
 
-		Color modulate = decal->modulate;
+		Color modulate = decal->modulate.srgb_to_linear();
 		dd.modulate[0] = modulate.r;
 		dd.modulate[1] = modulate.g;
 		dd.modulate[2] = modulate.b;
diff --git a/servers/rendering/renderer_scene_cull.cpp b/servers/rendering/renderer_scene_cull.cpp
index aa69cd8539..b33de9d6f4 100644
--- a/servers/rendering/renderer_scene_cull.cpp
+++ b/servers/rendering/renderer_scene_cull.cpp
@@ -1720,6 +1720,7 @@ void RendererSceneCull::_update_instance(Instance *p_instance) {
 		idata.base_rid = p_instance->base;
 		idata.parent_array_index = p_instance->visibility_parent ? p_instance->visibility_parent->array_index : -1;
 		idata.visibility_index = p_instance->visibility_index;
+		idata.occlusion_timeout = 0;
 
 		for (Instance *E : p_instance->visibility_dependencies) {
 			Instance *dep_instance = E;
@@ -2775,7 +2776,7 @@ void RendererSceneCull::_scene_cull(CullData &cull_data, InstanceCullResult &cul
 #define VIS_RANGE_CHECK ((idata.visibility_index == -1) || _visibility_range_check<false>(cull_data.scenario->instance_visibility[idata.visibility_index], cull_data.cam_transform.origin, cull_data.visibility_viewport_mask) == 0)
 #define VIS_PARENT_CHECK (_visibility_parent_check(cull_data, idata))
 #define VIS_CHECK (visibility_check < 0 ? (visibility_check = (visibility_flags != InstanceData::FLAG_VISIBILITY_DEPENDENCY_NEEDS_CHECK || (VIS_RANGE_CHECK && VIS_PARENT_CHECK))) : visibility_check)
-#define OCCLUSION_CULLED (cull_data.occlusion_buffer != nullptr && (cull_data.scenario->instance_data[i].flags & InstanceData::FLAG_IGNORE_OCCLUSION_CULLING) == 0 && cull_data.occlusion_buffer->is_occluded(cull_data.scenario->instance_aabbs[i].bounds, cull_data.cam_transform.origin, inv_cam_transform, *cull_data.camera_matrix, z_near))
+#define OCCLUSION_CULLED (cull_data.occlusion_buffer != nullptr && (cull_data.scenario->instance_data[i].flags & InstanceData::FLAG_IGNORE_OCCLUSION_CULLING) == 0 && cull_data.occlusion_buffer->is_occluded(cull_data.scenario->instance_aabbs[i].bounds, cull_data.cam_transform.origin, inv_cam_transform, *cull_data.camera_matrix, z_near, cull_data.scenario->instance_data[i].occlusion_timeout))
 
 		if (!HIDDEN_BY_VISIBILITY_CHECKS) {
 			if ((LAYER_CHECK && IN_FRUSTUM(cull_data.cull->frustum) && VIS_CHECK && !OCCLUSION_CULLED) || (cull_data.scenario->instance_data[i].flags & InstanceData::FLAG_IGNORE_ALL_CULLING)) {
@@ -3567,43 +3568,47 @@ void RendererSceneCull::render_probes() {
 
 	bool busy = false;
 
-	while (ref_probe) {
-		SelfList<InstanceReflectionProbeData> *next = ref_probe->next();
-		RID base = ref_probe->self()->owner->base;
+	if (ref_probe) {
+		RENDER_TIMESTAMP("Render ReflectionProbes");
 
-		switch (RSG::light_storage->reflection_probe_get_update_mode(base)) {
-			case RS::REFLECTION_PROBE_UPDATE_ONCE: {
-				if (busy) { //already rendering something
-					break;
-				}
+		while (ref_probe) {
+			SelfList<InstanceReflectionProbeData> *next = ref_probe->next();
+			RID base = ref_probe->self()->owner->base;
 
-				bool done = _render_reflection_probe_step(ref_probe->self()->owner, ref_probe->self()->render_step);
-				if (done) {
-					done_list.push_back(ref_probe);
-				} else {
-					ref_probe->self()->render_step++;
-				}
+			switch (RSG::light_storage->reflection_probe_get_update_mode(base)) {
+				case RS::REFLECTION_PROBE_UPDATE_ONCE: {
+					if (busy) { // Already rendering something.
+						break;
+					}
 
-				busy = true; //do not render another one of this kind
-			} break;
-			case RS::REFLECTION_PROBE_UPDATE_ALWAYS: {
-				int step = 0;
-				bool done = false;
-				while (!done) {
-					done = _render_reflection_probe_step(ref_probe->self()->owner, step);
-					step++;
-				}
+					bool done = _render_reflection_probe_step(ref_probe->self()->owner, ref_probe->self()->render_step);
+					if (done) {
+						done_list.push_back(ref_probe);
+					} else {
+						ref_probe->self()->render_step++;
+					}
 
-				done_list.push_back(ref_probe);
-			} break;
-		}
+					busy = true; // Do not render another one of this kind.
+				} break;
+				case RS::REFLECTION_PROBE_UPDATE_ALWAYS: {
+					int step = 0;
+					bool done = false;
+					while (!done) {
+						done = _render_reflection_probe_step(ref_probe->self()->owner, step);
+						step++;
+					}
 
-		ref_probe = next;
-	}
+					done_list.push_back(ref_probe);
+				} break;
+			}
+
+			ref_probe = next;
+		}
 
-	// Now remove from our list
-	for (SelfList<InstanceReflectionProbeData> *rp : done_list) {
-		reflection_probe_render_list.remove(rp);
+		// Now remove the finished probes from the render list.
+		for (SelfList<InstanceReflectionProbeData> *rp : done_list) {
+			reflection_probe_render_list.remove(rp);
+		}
 	}
 
 	/* VOXEL GIS */
@@ -4252,6 +4257,7 @@ RendererSceneCull::RendererSceneCull() {
 	indexer_update_iterations = GLOBAL_GET("rendering/limits/spatial_indexer/update_iterations_per_frame");
 	thread_cull_threshold = GLOBAL_GET("rendering/limits/spatial_indexer/threaded_cull_minimum_instances");
 	thread_cull_threshold = MAX(thread_cull_threshold, (uint32_t)WorkerThreadPool::get_singleton()->get_thread_count()); //make sure there is at least one thread per CPU
+	RendererSceneOcclusionCull::HZBuffer::occlusion_jitter_enabled = GLOBAL_GET("rendering/occlusion_culling/jitter_projection");
 
 	dummy_occlusion_culling = memnew(RendererSceneOcclusionCull);
 
diff --git a/servers/rendering/renderer_scene_cull.h b/servers/rendering/renderer_scene_cull.h
index 341ba0e3b0..0039d14475 100644
--- a/servers/rendering/renderer_scene_cull.h
+++ b/servers/rendering/renderer_scene_cull.h
@@ -286,6 +286,13 @@ public:
 		Instance *instance = nullptr;
 		int32_t parent_array_index = -1;
 		int32_t visibility_index = -1;
+
+		// Each time occlusion culling determines an instance is visible,
+		// set this to occlusion_frame plus some delay.
+		// Once the timeout is reached, allow the instance to be occlusion culled.
+		// This creates a delay for occlusion culling, which prevents flickering
+		// when jittering the raster occlusion projection.
+		uint64_t occlusion_timeout = 0;
 	};
 
 	struct InstanceVisibilityData {
diff --git a/servers/rendering/renderer_scene_occlusion_cull.cpp b/servers/rendering/renderer_scene_occlusion_cull.cpp
index c4f0177c73..1f0239411a 100644
--- a/servers/rendering/renderer_scene_occlusion_cull.cpp
+++ b/servers/rendering/renderer_scene_occlusion_cull.cpp
@@ -43,6 +43,8 @@ const Vector3 RendererSceneOcclusionCull::HZBuffer::corners[8] = {
 	Vector3(1, 1, 1)
 };
 
+bool RendererSceneOcclusionCull::HZBuffer::occlusion_jitter_enabled = false;
+
 bool RendererSceneOcclusionCull::HZBuffer::is_empty() const {
 	return sizes.is_empty();
 }
@@ -66,6 +68,8 @@ void RendererSceneOcclusionCull::HZBuffer::clear() {
 }
 
 void RendererSceneOcclusionCull::HZBuffer::resize(const Size2i &p_size) {
+	occlusion_buffer_size = p_size;
+
 	if (p_size == Size2i()) {
 		clear();
 		return;
@@ -124,6 +128,9 @@ void RendererSceneOcclusionCull::HZBuffer::resize(const Size2i &p_size) {
 }
 
 void RendererSceneOcclusionCull::HZBuffer::update_mips() {
+	// Cache the drawn-frame count in a member so the occlusion timers can use it.
+	occlusion_frame = Engine::get_singleton()->get_frames_drawn();
+
 	if (sizes.is_empty()) {
 		return;
 	}
diff --git a/servers/rendering/renderer_scene_occlusion_cull.h b/servers/rendering/renderer_scene_occlusion_cull.h
index 149d7b1cdb..5adba5dc6a 100644
--- a/servers/rendering/renderer_scene_occlusion_cull.h
+++ b/servers/rendering/renderer_scene_occlusion_cull.h
@@ -53,14 +53,10 @@ public:
 		PackedByteArray debug_data;
 		float debug_tex_range = 0.0f;
 
-	public:
-		bool is_empty() const;
-		virtual void clear();
-		virtual void resize(const Size2i &p_size);
-
-		void update_mips();
+		uint64_t occlusion_frame = 0;
+		Size2i occlusion_buffer_size;
 
-		_FORCE_INLINE_ bool is_occluded(const real_t p_bounds[6], const Vector3 &p_cam_position, const Transform3D &p_cam_inv_transform, const Projection &p_cam_projection, real_t p_near) const {
+		_FORCE_INLINE_ bool _is_occluded(const real_t p_bounds[6], const Vector3 &p_cam_position, const Transform3D &p_cam_inv_transform, const Projection &p_cam_projection, real_t p_near) const {
 			if (is_empty()) {
 				return false;
 			}
@@ -154,7 +150,47 @@ public:
 			return !visible;
 		}
 
+	public:
+		static bool occlusion_jitter_enabled;
+
+		bool is_empty() const;
+		virtual void clear();
+		virtual void resize(const Size2i &p_size);
+
+		void update_mips();
+
+		// Wraps _is_occluded() with a per-instance timer (r_occlusion_timeout) that
+		// keeps an instance visible for a number of frames after it was last found
+		// visible, which prevents flickering when the occlusion projection is jittered.
+		_FORCE_INLINE_ bool is_occluded(const real_t p_bounds[6], const Vector3 &p_cam_position, const Transform3D &p_cam_inv_transform, const Projection &p_cam_projection, real_t p_near, uint64_t &r_occlusion_timeout) const {
+			bool occluded = _is_occluded(p_bounds, p_cam_position, p_cam_inv_transform, p_cam_projection, p_near);
+
+			// Temporal jitter is disabled: return the raw result
+			// and leave the occlusion timer untouched.
+			if (!occlusion_jitter_enabled) {
+				return occluded;
+			}
+
+			if (!occluded) {
+//#define DEBUG_RASTER_OCCLUSION_JITTER
+#ifdef DEBUG_RASTER_OCCLUSION_JITTER
+				r_occlusion_timeout = occlusion_frame + 1;
+#else
+				r_occlusion_timeout = occlusion_frame + 9;
+#endif
+			} else if (r_occlusion_timeout) {
+				// Occluded while a timer is pending: clear the timer once the
+				// delay has elapsed so occlusion culling proceeds as normal.
+				if (occlusion_frame >= r_occlusion_timeout) {
+					r_occlusion_timeout = 0;
+				}
+			}
+
+			return occluded && !r_occlusion_timeout;
+		}
+
 		RID get_debug_texture();
+		const Size2i &get_occlusion_buffer_size() const { return occlusion_buffer_size; }
 
 		virtual ~HZBuffer(){};
 	};
diff --git a/servers/rendering/rendering_device.h b/servers/rendering/rendering_device.h
index 9a898a2fca..c8277024cf 100644
--- a/servers/rendering/rendering_device.h
+++ b/servers/rendering/rendering_device.h
@@ -1103,7 +1103,7 @@ private:
 
 public:
 	DrawListID draw_list_begin_for_screen(DisplayServer::WindowID p_screen = 0, const Color &p_clear_color = Color());
-	DrawListID draw_list_begin(RID p_framebuffer, InitialAction p_initial_color_action, FinalAction p_final_color_action, InitialAction p_initial_depth_action, FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth = 1.0, uint32_t p_clear_stencil = 0, const Rect2 &p_region = Rect2());
+	DrawListID draw_list_begin(RID p_framebuffer, InitialAction p_initial_color_action, FinalAction p_final_color_action, InitialAction p_initial_depth_action, FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth = 0.0, uint32_t p_clear_stencil = 0, const Rect2 &p_region = Rect2());
 
 	void draw_list_set_blend_constants(DrawListID p_list, const Color &p_color);
 	void draw_list_bind_render_pipeline(DrawListID p_list, RID p_render_pipeline);
diff --git a/servers/rendering/rendering_server_default.cpp b/servers/rendering/rendering_server_default.cpp
index f614070f68..5bf0ab0ba6 100644
--- a/servers/rendering/rendering_server_default.cpp
+++ b/servers/rendering/rendering_server_default.cpp
@@ -80,6 +80,7 @@ void RenderingServerDefault::_draw(bool p_swap_buffers, double frame_step) {
 
 	uint64_t time_usec = OS::get_singleton()->get_ticks_usec();
 
+	RENDER_TIMESTAMP("Prepare Render Frame");
 	RSG::scene->update(); //update scenes stuff before updating instances
 
 	frame_setup_time = double(OS::get_singleton()->get_ticks_usec() - time_usec) / 1000.0;
diff --git a/tests/core/io/test_marshalls.h b/tests/core/io/test_marshalls.h
index 3c0ba611c6..de8d6e1406 100644
--- a/tests/core/io/test_marshalls.h
+++ b/tests/core/io/test_marshalls.h
@@ -160,7 +160,7 @@ TEST_CASE("[Marshalls] NIL Variant encoding") {
 	uint8_t buffer[4];
 
 	CHECK(encode_variant(variant, buffer, r_len) == OK);
-	CHECK_MESSAGE(r_len == 4, "Length == 4 bytes for Variant::Type");
+	CHECK_MESSAGE(r_len == 4, "Length == 4 bytes for header");
 	CHECK_MESSAGE(buffer[0] == 0x00, "Variant::NIL");
 	CHECK(buffer[1] == 0x00);
 	CHECK(buffer[2] == 0x00);
@@ -174,7 +174,7 @@ TEST_CASE("[Marshalls] INT 32 bit Variant encoding") {
 	uint8_t buffer[8];
 
 	CHECK(encode_variant(variant, buffer, r_len) == OK);
-	CHECK_MESSAGE(r_len == 8, "Length == 4 bytes for Variant::Type + 4 bytes for int32_t");
+	CHECK_MESSAGE(r_len == 8, "Length == 4 bytes for header + 4 bytes for int32_t");
 	CHECK_MESSAGE(buffer[0] == 0x02, "Variant::INT");
 	CHECK(buffer[1] == 0x00);
 	CHECK(buffer[2] == 0x00);
@@ -192,10 +192,10 @@ TEST_CASE("[Marshalls] INT 64 bit Variant encoding") {
 	uint8_t buffer[12];
 
 	CHECK(encode_variant(variant, buffer, r_len) == OK);
-	CHECK_MESSAGE(r_len == 12, "Length == 4 bytes for Variant::Type + 8 bytes for int64_t");
+	CHECK_MESSAGE(r_len == 12, "Length == 4 bytes for header + 8 bytes for int64_t");
 	CHECK_MESSAGE(buffer[0] == 0x02, "Variant::INT");
 	CHECK(buffer[1] == 0x00);
-	CHECK_MESSAGE(buffer[2] == 0x01, "ENCODE_FLAG_64");
+	CHECK_MESSAGE(buffer[2] == 0x01, "HEADER_DATA_FLAG_64");
 	CHECK(buffer[3] == 0x00);
 	// Check value
 	CHECK(buffer[4] == 0xef);
@@ -214,7 +214,7 @@ TEST_CASE("[Marshalls] FLOAT single precision Variant encoding") {
 	uint8_t buffer[8];
 
 	CHECK(encode_variant(variant, buffer, r_len) == OK);
-	CHECK_MESSAGE(r_len == 8, "Length == 4 bytes for Variant::Type + 4 bytes for float");
+	CHECK_MESSAGE(r_len == 8, "Length == 4 bytes for header + 4 bytes for float");
 	CHECK_MESSAGE(buffer[0] == 0x03, "Variant::FLOAT");
 	CHECK(buffer[1] == 0x00);
 	CHECK(buffer[2] == 0x00);
@@ -232,10 +232,10 @@ TEST_CASE("[Marshalls] FLOAT double precision Variant encoding") {
 	uint8_t buffer[12];
 
 	CHECK(encode_variant(variant, buffer, r_len) == OK);
-	CHECK_MESSAGE(r_len == 12, "Length == 4 bytes for Variant::Type + 8 bytes for double");
+	CHECK_MESSAGE(r_len == 12, "Length == 4 bytes for header + 8 bytes for double");
 	CHECK_MESSAGE(buffer[0] == 0x03, "Variant::FLOAT");
 	CHECK(buffer[1] == 0x00);
-	CHECK_MESSAGE(buffer[2] == 0x01, "ENCODE_FLAG_64");
+	CHECK_MESSAGE(buffer[2] == 0x01, "HEADER_DATA_FLAG_64");
 	CHECK(buffer[3] == 0x00);
 	// Check value
 	CHECK(buffer[4] == 0x55);
@@ -292,7 +292,7 @@ TEST_CASE("[Marshalls] INT 64 bit Variant decoding") {
 	Variant variant;
 	int r_len;
 	uint8_t buffer[] = {
-		0x02, 0x00, 0x01, 0x00, // Variant::INT & ENCODE_FLAG_64
+		0x02, 0x00, 0x01, 0x00, // Variant::INT, HEADER_DATA_FLAG_64
 		0xef, 0xcd, 0xab, 0x89, 0x67, 0x45, 0x23, 0xf1 // value
 	};
 
@@ -318,7 +318,7 @@ TEST_CASE("[Marshalls] FLOAT double precision Variant decoding") {
 	Variant variant;
 	int r_len;
 	uint8_t buffer[] = {
-		0x03, 0x00, 0x01, 0x00, // Variant::FLOAT & ENCODE_FLAG_64
+		0x03, 0x00, 0x01, 0x00, // Variant::FLOAT, HEADER_DATA_FLAG_64
 		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xd5, 0x3f // value
 	};
 
@@ -326,6 +326,66 @@ TEST_CASE("[Marshalls] FLOAT double precision Variant decoding") {
 	CHECK(r_len == 12);
 	CHECK(variant == Variant(0.33333333333333333));
 }
+
+TEST_CASE("[Marshalls] Typed array encoding") {
+	int r_len;
+	Array array;
+	array.set_typed(Variant::INT, StringName(), Ref<Script>());
+	array.push_back(Variant(uint64_t(0x0f123456789abcdef)));
+	uint8_t buffer[24];
+
+	CHECK(encode_variant(array, buffer, r_len) == OK);
+	CHECK_MESSAGE(r_len == 24, "Length == 4 bytes for header + 4 bytes for array type + 4 bytes for array size + 12 bytes for element");
+	CHECK_MESSAGE(buffer[0] == 0x1c, "Variant::ARRAY");
+	CHECK(buffer[1] == 0x00);
+	CHECK_MESSAGE(buffer[2] == 0x01, "HEADER_DATA_FIELD_TYPED_ARRAY_BUILTIN");
+	CHECK(buffer[3] == 0x00);
+	// Check array type.
+	CHECK_MESSAGE(buffer[4] == 0x02, "Variant::INT");
+	CHECK(buffer[5] == 0x00);
+	CHECK(buffer[6] == 0x00);
+	CHECK(buffer[7] == 0x00);
+	// Check array size (one element, little-endian).
+	CHECK(buffer[8] == 0x01);
+	CHECK(buffer[9] == 0x00);
+	CHECK(buffer[10] == 0x00);
+	CHECK(buffer[11] == 0x00);
+	// Check element header (type and 64-bit flag).
+	CHECK_MESSAGE(buffer[12] == 0x02, "Variant::INT");
+	CHECK(buffer[13] == 0x00);
+	CHECK_MESSAGE(buffer[14] == 0x01, "HEADER_DATA_FLAG_64");
+	CHECK(buffer[15] == 0x00);
+	// Check element value (little-endian).
+	CHECK(buffer[16] == 0xef);
+	CHECK(buffer[17] == 0xcd);
+	CHECK(buffer[18] == 0xab);
+	CHECK(buffer[19] == 0x89);
+	CHECK(buffer[20] == 0x67);
+	CHECK(buffer[21] == 0x45);
+	CHECK(buffer[22] == 0x23);
+	CHECK(buffer[23] == 0xf1);
+}
+
+TEST_CASE("[Marshalls] Typed array decoding") {
+	Variant variant;
+	int r_len;
+	uint8_t buffer[] = {
+		0x1c, 0x00, 0x01, 0x00, // Header (Variant::ARRAY, HEADER_DATA_FIELD_TYPED_ARRAY_BUILTIN).
+		0x02, 0x00, 0x00, 0x00, // Array type (Variant::INT).
+		0x01, 0x00, 0x00, 0x00, // Array size.
+		0x02, 0x00, 0x01, 0x00, // Element header (Variant::INT, HEADER_DATA_FLAG_64).
+		0xef, 0xcd, 0xab, 0x89, 0x67, 0x45, 0x23, 0xf1, // Element value.
+	};
+
+	CHECK(decode_variant(variant, buffer, 24, &r_len) == OK);
+	CHECK(r_len == 24);
+	CHECK(variant.get_type() == Variant::ARRAY);
+	Array array = variant;
+	CHECK(array.get_typed_builtin() == Variant::INT);
+	CHECK(array.size() == 1);
+	CHECK(array[0] == Variant(uint64_t(0x0f123456789abcdef)));
+}
+
 } // namespace TestMarshalls
 
 #endif // TEST_MARSHALLS_H
diff --git a/tests/scene/test_code_edit.h b/tests/scene/test_code_edit.h
index bc2f7f51b1..b0a46b8107 100644
--- a/tests/scene/test_code_edit.h
+++ b/tests/scene/test_code_edit.h
@@ -3520,6 +3520,7 @@ TEST_CASE("[SceneTree][CodeEdit] completion") {
 			/* Single click selects. */
 			caret_pos.y += code_edit->get_line_height() * 2;
 			SEND_GUI_MOUSE_BUTTON_EVENT(caret_pos, MouseButton::LEFT, MouseButtonMask::LEFT, Key::NONE);
+			SEND_GUI_MOUSE_BUTTON_RELEASED_EVENT(caret_pos, MouseButton::LEFT, MouseButtonMask::NONE, Key::NONE);
 			CHECK(code_edit->get_code_completion_selected_index() == 2);
 
 			/* Double click inserts. */
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 0520440e52..be386a3920 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -172,13 +172,15 @@ Files extracted from upstream source:
 ## embree
 
 - Upstream: https://github.com/embree/embree
-- Version: 3.13.5 (698442324ccddd11725fb8875275dc1384f7fb40, 2022)
+- Version: 4.3.1 (daa8de0e714e18ad5e5c9841b67c1950d9c91c51, 2024)
 - License: Apache 2.0
 
 Files extracted from upstream:
 
 - All `.cpp` files listed in `modules/raycast/godot_update_embree.py`
 - All header files in the directories listed in `modules/raycast/godot_update_embree.py`
+- All config files listed in `modules/raycast/godot_update_embree.py`
+- `LICENSE.txt`
 
 The `modules/raycast/godot_update_embree.py` script can be used to pull the
 relevant files from the latest Embree release and apply some automatic changes.
diff --git a/thirdparty/embree/LICENSE.txt b/thirdparty/embree/LICENSE.txt
new file mode 100644
index 0000000000..d645695673
--- /dev/null
+++ b/thirdparty/embree/LICENSE.txt
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/thirdparty/embree/common/algorithms/parallel_any_of.h b/thirdparty/embree/common/algorithms/parallel_any_of.h
index a64e4a1889..a95c1f6490 100644
--- a/thirdparty/embree/common/algorithms/parallel_any_of.h
+++ b/thirdparty/embree/common/algorithms/parallel_any_of.h
@@ -12,7 +12,8 @@ namespace embree
   template<typename Index, class UnaryPredicate>
     __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
   {
-    bool ret = false;
+    std::atomic_bool ret;
+    ret = false;
     
 #if defined(TASKING_TBB)
 #if TBB_INTERFACE_VERSION >= 12002
diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
index 6d411e4852..fd5213e70a 100644
--- a/thirdparty/embree/common/algorithms/parallel_for.h
+++ b/thirdparty/embree/common/algorithms/parallel_for.h
@@ -5,7 +5,7 @@
 
 #include "../tasking/taskscheduler.h"
 #include "../sys/array.h"
-#include "../math/math.h"
+#include "../math/emath.h"
 #include "../math/range.h"
 
 namespace embree
@@ -14,17 +14,17 @@ namespace embree
   template<typename Index, typename Func>
     __forceinline void parallel_for( const Index N, const Func& func)
   {
-#if defined(TASKING_INTERNAL)
+#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
     if (N) {
+      TaskScheduler::TaskGroupContext context;
       TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
           assert(r.size() == 1);
           func(r.begin());
-        });
-      if (!TaskScheduler::wait())
-        // -- GODOT start --
-        // throw std::runtime_error("task cancelled");
-        abort();
-        // -- GODOT end --
+        },&context);
+      TaskScheduler::wait();
+      if (context.cancellingException != nullptr) {
+        std::rethrow_exception(context.cancellingException);
+      }
     }
 #elif defined(TASKING_TBB)
   #if TBB_INTERFACE_VERSION >= 12002
@@ -33,19 +33,13 @@ namespace embree
         func(i);
       },context);
     if (context.is_group_execution_cancelled())
-      // -- GODOT start --
-      // throw std::runtime_error("task cancelled");
-      abort();
-      // -- GODOT end --
+      throw std::runtime_error("task cancelled");
   #else
     tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
         func(i);
       });
     if (tbb::task::self().is_cancelled())
-      // -- GODOT start --
-      // throw std::runtime_error("task cancelled");
-      abort();
-      // -- GODOT end --
+      throw std::runtime_error("task cancelled");
   #endif
 
 #elif defined(TASKING_PPL)
@@ -62,13 +56,13 @@ namespace embree
     __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
   {
     assert(first <= last);
-#if defined(TASKING_INTERNAL)
-    TaskScheduler::spawn(first,last,minStepSize,func);
-    if (!TaskScheduler::wait())
-      // -- GODOT start --
-      // throw std::runtime_error("task cancelled");
-      abort();
-      // -- GODOT end --
+#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
+    TaskScheduler::TaskGroupContext context;
+    TaskScheduler::spawn(first,last,minStepSize,func,&context);
+    TaskScheduler::wait();
+    if (context.cancellingException != nullptr) {
+      std::rethrow_exception(context.cancellingException);
+    }
 
 #elif defined(TASKING_TBB)
   #if TBB_INTERFACE_VERSION >= 12002
@@ -77,19 +71,13 @@ namespace embree
         func(range<Index>(r.begin(),r.end()));
       },context);
     if (context.is_group_execution_cancelled())
-      // -- GODOT start --
-      // throw std::runtime_error("task cancelled");
-      abort();
-      // -- GODOT end --
+      throw std::runtime_error("task cancelled");
   #else
     tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
         func(range<Index>(r.begin(),r.end()));
       });
     if (tbb::task::self().is_cancelled())
-      // -- GODOT start --
-      // throw std::runtime_error("task cancelled");
-      abort();
-      // -- GODOT end --
+      throw std::runtime_error("task cancelled");
   #endif
 
 #elif defined(TASKING_PPL)
@@ -121,19 +109,13 @@ namespace embree
           func(i);
         },tbb::simple_partitioner(),context);
       if (context.is_group_execution_cancelled())
-        // -- GODOT start --
-        // throw std::runtime_error("task cancelled");
-        abort();
-        // -- GODOT end --
+        throw std::runtime_error("task cancelled");
     #else
       tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
           func(i);
         },tbb::simple_partitioner());
       if (tbb::task::self().is_cancelled())
-        // -- GODOT start --
-        // throw std::runtime_error("task cancelled");
-        abort();
-        // -- GODOT end --
+        throw std::runtime_error("task cancelled");
     #endif
   }
 
@@ -148,19 +130,13 @@ namespace embree
           func(i);
         },ap,context);
       if (context.is_group_execution_cancelled())
-        // -- GODOT start --
-        // throw std::runtime_error("task cancelled");
-        abort();
-        // -- GODOT end --
+        throw std::runtime_error("task cancelled");
     #else
       tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
           func(i);
         },ap);
       if (tbb::task::self().is_cancelled())
-        // -- GODOT start --
-        // throw std::runtime_error("task cancelled");
-        abort();
-        // -- GODOT end --
+        throw std::runtime_error("task cancelled");
     #endif
   }
 
diff --git a/thirdparty/embree/common/algorithms/parallel_partition.h b/thirdparty/embree/common/algorithms/parallel_partition.h
index a1cbdc8e04..53d4d6f0db 100644
--- a/thirdparty/embree/common/algorithms/parallel_partition.h
+++ b/thirdparty/embree/common/algorithms/parallel_partition.h
@@ -175,8 +175,8 @@ namespace embree
       /* calculate all left and right ranges that are on the wrong global side */
       size_t numMisplacedRangesLeft  = 0;
       size_t numMisplacedRangesRight = 0;
-      size_t numMisplacedItemsLeft   = 0;
-      size_t numMisplacedItemsRight  = 0;
+      size_t numMisplacedItemsLeft  MAYBE_UNUSED = 0;
+      size_t numMisplacedItemsRight MAYBE_UNUSED = 0;
 
       for (size_t i=0; i<numTasks; i++)
       {	    
diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
index cd0078f2e6..51ec0a6405 100644
--- a/thirdparty/embree/common/algorithms/parallel_reduce.h
+++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
@@ -43,7 +43,7 @@ namespace embree
   template<typename Index, typename Value, typename Func, typename Reduction>
     __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
   {
-#if defined(TASKING_INTERNAL)
+#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
 
     /* fast path for small number of iterations */
     Index taskCount = (last-first+minStepSize-1)/minStepSize;
diff --git a/thirdparty/embree/common/lexers/stream.h b/thirdparty/embree/common/lexers/stream.h
index a40c15f8eb..9ad72af4e6 100644
--- a/thirdparty/embree/common/lexers/stream.h
+++ b/thirdparty/embree/common/lexers/stream.h
@@ -6,7 +6,7 @@
 #include "../sys/platform.h"
 #include "../sys/ref.h"
 #include "../sys/filename.h"
-#include "../sys/string.h"
+#include "../sys/estring.h"
 
 #include <vector>
 #include <iostream>
@@ -122,17 +122,16 @@ namespace embree
   class FileStream : public Stream<int>
   {
   public:
-
-    FileStream (FILE* file, const std::string& name = "file")
-      : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
-
     FileStream (const FileName& fileName)
       : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str())))
     {
-      file = fopen(fileName.c_str(),"r");
-      if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str());
+      if (ifs) ifs.close();
+      ifs.open(fileName.str());
+      if (!ifs.is_open()) THROW_RUNTIME_ERROR("cannot open file " + fileName.str());
+    }
+    ~FileStream() { 
+      if (ifs) ifs.close();
     }
-    ~FileStream() { if (file) fclose(file); }
 
   public:
     ParseLocation location() {
@@ -140,14 +139,15 @@ namespace embree
     }
 
     int next() {
-      int c = fgetc(file);
+      int c = ifs.get();
       if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
       charNumber++;
       return c;
     }
 
+
   private:
-    FILE* file;
+    std::ifstream ifs;
     ssize_t lineNumber;           /// the line number the token is from
     ssize_t colNumber;            /// the character number in the current line
     ssize_t charNumber;           /// the character in the file
diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp
index a037869506..c93da0b420 100644
--- a/thirdparty/embree/common/lexers/stringstream.cpp
+++ b/thirdparty/embree/common/lexers/stringstream.cpp
@@ -41,7 +41,9 @@ namespace embree
       int c = cin->get();
       // -- GODOT start --
       // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
-      if (!isValidChar(c)) abort();
+      if (!isValidChar(c)) {
+        abort();
+      }
       // -- GODOT end --
       str.push_back((char)c);
     }
diff --git a/thirdparty/embree/common/lexers/tokenstream.cpp b/thirdparty/embree/common/lexers/tokenstream.cpp
index 6ed6f2045a..fe9de641db 100644
--- a/thirdparty/embree/common/lexers/tokenstream.cpp
+++ b/thirdparty/embree/common/lexers/tokenstream.cpp
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "tokenstream.h"
-#include "../math/math.h"
+#include "../math/emath.h"
 
 namespace embree
 {
diff --git a/thirdparty/embree/common/math/affinespace.h b/thirdparty/embree/common/math/affinespace.h
index 9d4a0f0846..f3e5404639 100644
--- a/thirdparty/embree/common/math/affinespace.h
+++ b/thirdparty/embree/common/math/affinespace.h
@@ -337,7 +337,7 @@ namespace embree
     if (D) *D = sqrtf(D_x);
     return true;
   }
-
+  
   __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr)
   {
     Vec3fa::storeu(&ptr->l.vx, source.l.vx);
diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h
index e4eb3df9a4..651b29a8fe 100644
--- a/thirdparty/embree/common/math/bbox.h
+++ b/thirdparty/embree/common/math/bbox.h
@@ -56,6 +56,11 @@ namespace embree
       return BBox(min(a.lower, b.lower), max(a.upper, b.upper));
     }
 
+    /*! intersects two boxes */
+    __forceinline static const BBox intersect (const BBox& a, const BBox& b) {
+      return BBox(max(a.lower, b.lower), min(a.upper, b.upper));
+    }
+
      /*! enlarge box by some scaling factor */
     __forceinline BBox enlarge_by(const float a) const {
       return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper));
diff --git a/thirdparty/embree/common/math/col3.h b/thirdparty/embree/common/math/col3.h
index 3f50c04393..4576bc517d 100644
--- a/thirdparty/embree/common/math/col3.h
+++ b/thirdparty/embree/common/math/col3.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "math.h"
+#include "emath.h"
 
 namespace embree
 {
diff --git a/thirdparty/embree/common/math/col4.h b/thirdparty/embree/common/math/col4.h
index 788508516b..4ef916cc3a 100644
--- a/thirdparty/embree/common/math/col4.h
+++ b/thirdparty/embree/common/math/col4.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "math.h"
+#include "emath.h"
 
 namespace embree
 {
diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h
index e62e4ad2a4..8b28ff9447 100644
--- a/thirdparty/embree/common/math/color.h
+++ b/thirdparty/embree/common/math/color.h
@@ -3,6 +3,10 @@
 
 #pragma once
 
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "color_sycl.h"
+#else
+
 #include "constants.h"
 #include "col3.h"
 #include "col4.h"
@@ -64,6 +68,10 @@ namespace embree
       d.b = (unsigned char)(s[2]); 
       d.a = (unsigned char)(s[3]); 
     }
+    __forceinline void set(float &f) const
+    {
+      f = 0.2126f*r+0.7152f*g+0.0722f*b; // sRGB (Rec. 709) luminance; weights sum to 1.
+    }
 
     ////////////////////////////////////////////////////////////////////////////////
     /// Constants
@@ -256,3 +264,5 @@ namespace embree
     return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
   }
 }
+
+#endif
diff --git a/thirdparty/embree/common/math/color_sycl.h b/thirdparty/embree/common/math/color_sycl.h
new file mode 100644
index 0000000000..41b89ddecc
--- /dev/null
+++ b/thirdparty/embree/common/math/color_sycl.h
@@ -0,0 +1,219 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "constants.h"
+#include "col3.h"
+#include "col4.h"
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE RGBA Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color4
+  {
+    struct { float r,g,b,a; };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4 () {}
+    //__forceinline Color4 ( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Color4 (const float v) : r(v), g(v), b(v), a(v) {}
+    __forceinline          Color4 (const float r, const float g, const float b, const float a) : r(r), g(g), b(b), a(a) {}
+
+    __forceinline explicit Color4 ( const Col3uc& other ) : r(other.r/255.0f), g(other.g/255.0f), b(other.b/255.0f), a(1.0f) {}
+    __forceinline explicit Color4 ( const Col3f&  other ) : r(other.r), g(other.g), b(other.b), a(1.0f) {}
+    __forceinline explicit Color4 ( const Col4uc& other ) : r(other.r/255.0f), g(other.g/255.0f), b(other.b/255.0f), a(other.a/255.0f) {}
+    __forceinline explicit Color4 ( const Col4f&  other ) : r(other.r), g(other.g), b(other.b), a(other.a) {}
+
+    //__forceinline Color4           ( const Color4& other ) : m128(other.m128) {}
+    //__forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; }
+
+    __forceinline void set(Col3uc& d) const 
+    {
+      d.r = (unsigned char)(clamp(r)*255.0f); 
+      d.g = (unsigned char)(clamp(g)*255.0f); 
+      d.b = (unsigned char)(clamp(b)*255.0f);
+    }
+    
+    __forceinline void set(Col4uc& d) const 
+    {
+      d.r = (unsigned char)(clamp(r)*255.0f); 
+      d.g = (unsigned char)(clamp(g)*255.0f); 
+      d.b = (unsigned char)(clamp(b)*255.0f); 
+      d.a = (unsigned char)(clamp(a)*255.0f);
+    }
+    __forceinline void set(float &f) const
+    {
+      f = 0.2126f*r+0.7152f*g+0.0722f*b; // sRGB (Rec. 709) luminance; weights sum to 1.
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4( ZeroTy   ) : r(0.0f), g(0.0f), b(0.0f), a(0.0f) {}
+    __forceinline Color4( OneTy    ) : r(1.0f), g(1.0f), b(1.0f), a(1.0f) {}
+    //__forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    //__forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE RGB Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color
+  {
+    struct { float r,g,b; };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color () {}
+    //__forceinline Color ( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Color  (const float v)  : r(v), g(v), b(v) {}
+    __forceinline          Color  (const float r, const float g, const float b) : r(r), g(g), b(b) {}
+
+    //__forceinline Color           ( const Color& other ) : m128(other.m128) {}
+    //__forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; }
+
+    //__forceinline Color           ( const Color4& other ) : m128(other.m128) {}
+    //__forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; }
+
+#if 0
+    __forceinline void set(Col3uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;
+      d.r = (unsigned char)(s[0]); 
+      d.g = (unsigned char)(s[1]); 
+      d.b = (unsigned char)(s[2]); 
+    }
+    __forceinline void set(Col4uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f;
+      d.r = (unsigned char)(s[0]); 
+      d.g = (unsigned char)(s[1]); 
+      d.b = (unsigned char)(s[2]); 
+      d.a = 255;  
+    }
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color( ZeroTy   ) : r(0.0f), g(0.0f), b(0.0f) {}
+    __forceinline Color( OneTy    ) : r(1.0f), g(1.0f), b(1.0f) {}
+    //__forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    //__forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a ) { return a; }
+  __forceinline const Color operator -( const Color& a ) { return Color(-a.r, -a.g, -a.b); }
+  __forceinline const Color abs  ( const Color& a ) { return Color(abs(a.r), abs(a.g), abs(a.b)); }
+  __forceinline const Color rcp  ( const Color& a ) { return Color(1.0f/a.r, 1.0f/a.g, 1.0f/a.b); }
+  __forceinline const Color rsqrt( const Color& a ) { return Color(1.0f/sqrt(a.r), 1.0f/sqrt(a.g), 1.0f/sqrt(a.b)); }
+  __forceinline const Color sqrt ( const Color& a ) { return Color(sqrt(a.r), sqrt(a.g), sqrt(a.b)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a, const Color& b ) { return Color(a.r+b.r, a.g+b.g, a.b+b.b); }
+  __forceinline const Color operator -( const Color& a, const Color& b ) { return Color(a.r-b.r, a.g-b.g, a.b-b.b); }
+  __forceinline const Color operator *( const Color& a, const Color& b ) { return Color(a.r*b.r, a.g*b.g, a.b*b.b); }
+  __forceinline const Color operator *( const Color& a, const float  b ) { return a * Color(b); }
+  __forceinline const Color operator *( const float  a, const Color& b ) { return Color(a) * b; }
+  __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); }
+  __forceinline const Color operator /( const Color& a, const float  b ) { return a * rcp(b); }
+
+  __forceinline const Color min( const Color& a, const Color& b ) { return Color(min(a.r,b.r), min(a.g,b.g), min(a.b,b.b)); }
+  __forceinline const Color max( const Color& a, const Color& b ) { return Color(max(a.r,b.r), max(a.g,b.g), max(a.b,b.b)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; }
+  __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; }
+  __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; }
+  __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; }
+  __forceinline const Color operator*=(Color& a, const float b      ) { return a = a * b; }
+  __forceinline const Color operator/=(Color& a, const float b      ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; }
+  __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; }
+  __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); }
+  __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Color& a, const Color& b ) { return a.r == b.r && a.g == b.g && a.b == b.b; }
+  __forceinline bool operator !=( const Color& a, const Color& b ) { return a.r != b.r || a.g != b.g || a.b != b.b; }
+  __forceinline bool operator < ( const Color& a, const Color& b ) {
+    if (a.r != b.r) return a.r < b.r;
+    if (a.g != b.g) return a.g < b.g;
+    if (a.b != b.b) return a.b < b.b;
+    return false;
+  }
+
+   ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color select( bool s, const Color& t, const Color& f ) {
+    return s ? t : f;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Special Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! computes luminance of a color */
+  __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); }
+
+  /*! output operator */
+  inline std::ostream& operator<<(std::ostream& cout, const Color& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+  }
+}
diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/emath.h
index 7930c17727..22a89a7669 100644
--- a/thirdparty/embree/common/math/math.h
+++ b/thirdparty/embree/common/math/emath.h
@@ -8,6 +8,10 @@
 #include "constants.h"
 #include <cmath>
 
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "math_sycl.h"
+#else
+
 #if defined(__ARM_NEON)
 #include "../simd/arm/emulation.h"
 #else
@@ -44,6 +48,9 @@ namespace embree
   __forceinline int   toInt  (const float& a) { return int(a); }
   __forceinline float toFloat(const int&   a) { return float(a); }
 
+  __forceinline int   asInt  (const float& a) { union { float f; int i; } v; v.f = a; return v.i; } // reinterpret bits; union avoids the strict-aliasing violation of *((int*)&a)
+  __forceinline float asFloat(const int&   a) { union { int i; float f; } v; v.i = a; return v.f; } // reinterpret bits; union avoids the strict-aliasing violation of *((float*)&a)
+
 #if defined(__WIN32__)
   __forceinline bool finite ( const float x ) { return _finite(x) != 0; }
 #endif
@@ -351,7 +358,11 @@ __forceinline float nmsub ( const float a, const float b, const float c) { retur
   __forceinline int   select(bool s, int   t,   int f) { return s ? t : f; }
   __forceinline float select(bool s, float t, float f) { return s ? t : f; }
 
-  __forceinline bool all(bool s) { return s; }
+  __forceinline bool none(bool s) { return !s; }
+  __forceinline bool all (bool s) { return s; }
+  __forceinline bool any (bool s) { return s; }
+
+  __forceinline unsigned movemask (bool s) { return (unsigned)s; }
 
   __forceinline float lerp(const float v0, const float v1, const float t) {
     return madd(1.0f-t,v0,t*v1);
@@ -453,3 +464,5 @@ __forceinline float nmsub ( const float a, const float b, const float c) { retur
     return x | (y << 1) | (z << 2);
   }
 }
+
+#endif
diff --git a/thirdparty/embree/common/math/lbbox.h b/thirdparty/embree/common/math/lbbox.h
index 2b397a05c8..7619199780 100644
--- a/thirdparty/embree/common/math/lbbox.h
+++ b/thirdparty/embree/common/math/lbbox.h
@@ -179,6 +179,48 @@ namespace embree
       bounds1 = b1;
     }
 
+     /*! calculates the linear bounds for target_time_range of primitive with its time_range_in and bounds */
+    __forceinline LBBox(const BBox1f& time_range_in, const LBBox<T> lbounds, const BBox1f& target_time_range)
+    {
+      const BBox3f bounds0 = lbounds.bounds0;
+      const BBox3f bounds1 = lbounds.bounds1;
+      
+      /* normalize global target_time_range to local time_range_in */
+      const BBox1f time_range((target_time_range.lower-time_range_in.lower)/time_range_in.size(),
+                              (target_time_range.upper-time_range_in.lower)/time_range_in.size());
+
+      const BBox1f clipped_time_range(max(0.0f,time_range.lower), min(1.0f,time_range.upper));
+
+      /* compute bounds at begin and end of clipped time range */
+      BBox<T> b0 = lerp(bounds0,bounds1,clipped_time_range.lower);
+      BBox<T> b1 = lerp(bounds0,bounds1,clipped_time_range.upper);
+
+      /* make sure that b0 is properly bounded at time_range_in.lower */
+      {
+        const BBox<T> bt = lerp(b0, b1, (0.0f - time_range.lower) / time_range.size());
+        const T dlower = min(bounds0.lower-bt.lower, T(zero));
+        const T dupper = max(bounds0.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      /* make sure that b1 is properly bounded at time_range_in.upper */
+      {
+        const BBox<T> bt = lerp(b0, b1, (1.0f - time_range.lower) / time_range.size());
+        const T dlower = min(bounds1.lower-bt.lower, T(zero));
+        const T dupper = max(bounds1.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+      
+      this->bounds0 = b0;
+      this->bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds for target_time_range of primitive with its time_range_in and bounds */
+    __forceinline LBBox(const BBox1f& time_range_in, const BBox<T>& bounds0, const BBox<T>& bounds1, const BBox1f& target_time_range)
+      : LBBox(time_range_in,LBBox(bounds0,bounds1),target_time_range) {}
+
   public:
 
     __forceinline bool empty() const {
diff --git a/thirdparty/embree/common/math/linearspace2.h b/thirdparty/embree/common/math/linearspace2.h
index 184ee695fb..e58f61ea6b 100644
--- a/thirdparty/embree/common/math/linearspace2.h
+++ b/thirdparty/embree/common/math/linearspace2.h
@@ -18,6 +18,7 @@ namespace embree
 
     /*! default matrix constructor */
     __forceinline LinearSpace2           ( ) {}
+
     __forceinline LinearSpace2           ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; }
     __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; }
 
diff --git a/thirdparty/embree/common/math/linearspace3.h b/thirdparty/embree/common/math/linearspace3.h
index 9eaa2cc2bb..f6d2318fa0 100644
--- a/thirdparty/embree/common/math/linearspace3.h
+++ b/thirdparty/embree/common/math/linearspace3.h
@@ -19,6 +19,7 @@ namespace embree
 
     /*! default matrix constructor */
     __forceinline LinearSpace3           ( ) {}
+
     __forceinline LinearSpace3           ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; }
     __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; }
 
@@ -90,17 +91,20 @@ namespace embree
     Vector vx,vy,vz;
   };
 
+#if !defined(__SYCL_DEVICE_ONLY__)
+  
   /*! compute transposed matrix */
   template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { 
     vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
     return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); 
   }
-
+#endif
+  
   template<typename T>
     __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) { 
     return xfm.transposed();
   }
-
+  
   ////////////////////////////////////////////////////////////////////////////////
   // Unary Operators
   ////////////////////////////////////////////////////////////////////////////////
diff --git a/thirdparty/embree/common/math/math_sycl.h b/thirdparty/embree/common/math/math_sycl.h
new file mode 100644
index 0000000000..ffb047569c
--- /dev/null
+++ b/thirdparty/embree/common/math/math_sycl.h
@@ -0,0 +1,279 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "constants.h"
+#include <cmath>
+
+namespace embree
+{
+  __forceinline bool isvalid ( const float& v ) {
+    return (v > -FLT_LARGE) & (v < +FLT_LARGE);
+  }
+
+  __forceinline int cast_f2i(float f) {
+    return __builtin_bit_cast(int,f);
+  }
+
+  __forceinline float cast_i2f(int i) {
+    return __builtin_bit_cast(float,i);
+  }
+
+  __forceinline int   toInt  (const float& a) { return int(a); }
+  __forceinline float toFloat(const int&   a) { return float(a); }
+
+  __forceinline float asFloat(const int   a) { return __builtin_bit_cast(float,a); }
+  __forceinline int   asInt  (const float a) { return __builtin_bit_cast(int,a); }
+  
+  //__forceinline bool finite ( const float x ) { return _finite(x) != 0; }
+  __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; }
+  __forceinline float sqr  ( const float x ) { return x*x; }
+
+  __forceinline float rcp  ( const float x ) {
+    return sycl::native::recip(x);
+  }
+
+  __forceinline float signmsk(const float a) { return asFloat(asInt(a) & 0x80000000); }
+  //__forceinline float signmsk ( const float x ) {
+  //  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+  //}
+  //__forceinline float xorf( const float x, const float y ) {
+  //  return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+  //}
+  //__forceinline float andf( const float x, const unsigned y ) {
+  //  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+  //}
+  
+  __forceinline float rsqrt( const float x ) {
+    return sycl::rsqrt(x);
+  }
+
+  //__forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }
+  //__forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
+  //__forceinline int roundf(float f) { return (int)(f + 0.5f); }
+
+  __forceinline float abs  ( const float x ) { return sycl::fabs(x); }
+  __forceinline float acos ( const float x ) { return sycl::acos(x); }
+  __forceinline float asin ( const float x ) { return sycl::asin(x); }
+  __forceinline float atan ( const float x ) { return sycl::atan(x); }
+  __forceinline float atan2( const float y, const float x ) { return sycl::atan2(y, x); }
+  __forceinline float cos  ( const float x ) { return sycl::cos(x); }
+  __forceinline float cosh ( const float x ) { return sycl::cosh(x); }
+  __forceinline float exp  ( const float x ) { return sycl::exp(x); }
+  __forceinline float fmod ( const float x, const float y ) { return sycl::fmod(x, y); }
+  __forceinline float log  ( const float x ) { return sycl::log(x); }
+  __forceinline float log10( const float x ) { return sycl::log10(x); }
+  __forceinline float pow  ( const float x, const float y ) { return sycl::pow(x, y); }
+  __forceinline float sin  ( const float x ) { return sycl::sin(x); }
+  __forceinline float sinh ( const float x ) { return sycl::sinh(x); }
+  __forceinline float sqrt ( const float x ) { return sycl::sqrt(x); }
+  __forceinline float tan  ( const float x ) { return sycl::tan(x); }
+  __forceinline float tanh ( const float x ) { return sycl::tanh(x); }
+  __forceinline float floor( const float x ) { return sycl::floor(x); }
+  __forceinline float ceil ( const float x ) { return sycl::ceil(x); }
+  __forceinline float frac ( const float x ) { return x-floor(x); }
+
+  //__forceinline double abs  ( const double x ) { return ::fabs(x); }
+  //__forceinline double sign ( const double x ) { return x<0?-1.0:1.0; }
+  //__forceinline double acos ( const double x ) { return ::acos (x); }
+  //__forceinline double asin ( const double x ) { return ::asin (x); }
+  //__forceinline double atan ( const double x ) { return ::atan (x); }
+  //__forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); }
+  //__forceinline double cos  ( const double x ) { return ::cos  (x); }
+  //__forceinline double cosh ( const double x ) { return ::cosh (x); }
+  //__forceinline double exp  ( const double x ) { return ::exp  (x); }
+  //__forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); }
+  //__forceinline double log  ( const double x ) { return ::log  (x); }
+  //__forceinline double log10( const double x ) { return ::log10(x); }
+  //__forceinline double pow  ( const double x, const double y ) { return ::pow  (x, y); }
+  //__forceinline double rcp  ( const double x ) { return 1.0/x; }
+  //__forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); }
+  //__forceinline double sin  ( const double x ) { return ::sin  (x); }
+  //__forceinline double sinh ( const double x ) { return ::sinh (x); }
+  //__forceinline double sqr  ( const double x ) { return x*x; }
+  //__forceinline double sqrt ( const double x ) { return ::sqrt (x); }
+  //__forceinline double tan  ( const double x ) { return ::tan  (x); }
+  //__forceinline double tanh ( const double x ) { return ::tanh (x); }
+  //__forceinline double floor( const double x ) { return ::floor (x); }
+  //__forceinline double ceil ( const double x ) { return ::ceil (x); }
+
+/*
+#if defined(__SSE4_1__)
+  __forceinline float mini(float a, float b) {
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_min_epi32(ai,bi);
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+
+#if defined(__SSE4_1__)
+  __forceinline float maxi(float a, float b) {
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_max_epi32(ai,bi);
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+*/
+  
+  template<typename T>
+    __forceinline T twice(const T& a) { return a+a; }
+
+  __forceinline      int min(int      a, int      b) { return sycl::min(a,b); }
+  __forceinline unsigned min(unsigned a, unsigned b) { return sycl::min(a,b); }
+  __forceinline  int64_t min(int64_t  a, int64_t  b) { return sycl::min(a,b); }
+  __forceinline    float min(float    a, float    b) { return sycl::fmin(a,b); }
+  __forceinline   double min(double   a, double   b) { return sycl::fmin(a,b); }
+#if defined(__X86_64__)
+  __forceinline   size_t min(size_t   a, size_t   b) { return sycl::min(a,b); }
+#endif
+
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); }
+
+//  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); }
+//  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); }
+//  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); }
+
+  __forceinline      int max(int      a, int      b) { return sycl::max(a,b); }
+  __forceinline unsigned max(unsigned a, unsigned b) { return sycl::max(a,b); }
+  __forceinline  int64_t max(int64_t  a, int64_t  b) { return sycl::max(a,b); }
+  __forceinline    float max(float    a, float    b) { return sycl::fmax(a,b); }
+  __forceinline   double max(double   a, double   b) { return sycl::fmax(a,b); }
+#if defined(__X86_64__)
+  __forceinline   size_t max(size_t   a, size_t   b) { return sycl::max(a,b); }
+#endif
+
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); }
+
+//  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); }
+//  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); }
+//  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); }
+
+  template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); }
+  template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); }
+
+  template<typename T> __forceinline T  deg2rad ( const T& x )  { return x * T(1.74532925199432957692e-2f); }
+  template<typename T> __forceinline T  rad2deg ( const T& x )  { return x * T(5.72957795130823208768e1f); }
+  template<typename T> __forceinline T  sin2cos ( const T& x )  { return sqrt(max(T(zero),T(one)-x*x)); }
+  template<typename T> __forceinline T  cos2sin ( const T& x )  { return sin2cos(x); }
+
+  __forceinline float madd  ( const float a, const float b, const float c) { return  sycl::fma( a,b, c); } /*  a*b+c  via sycl::fma */
+  __forceinline float msub  ( const float a, const float b, const float c) { return  sycl::fma( a,b,-c); } /*  a*b-c  via sycl::fma */
+  __forceinline float nmadd ( const float a, const float b, const float c) { return  sycl::fma(-a,b, c); } /* -a*b+c  via sycl::fma */
+  __forceinline float nmsub ( const float a, const float b, const float c) { return -sycl::fma( a,b, c); } /* -(a*b+c) via sycl::fma */
+
+  /*! random functions */
+/*
+  template<typename T> T random() { return T(0); }
+  template<> __forceinline int      random() { return int(rand()); }
+  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); }
+  template<> __forceinline float  random() { return rand()/float(RAND_MAX); }
+  template<> __forceinline double random() { return rand()/double(RAND_MAX); }
+*/
+  
+  /*! selects */
+  __forceinline bool  select(bool s, bool  t , bool f) { return s ? t : f; }
+  __forceinline int   select(bool s, int   t,   int f) { return s ? t : f; }
+  __forceinline float select(bool s, float t, float f) { return s ? t : f; }
+
+  __forceinline bool none(bool s) { return !s; }
+  __forceinline bool all (bool s) { return s; }
+  __forceinline bool any (bool s) { return s; }
+
+  __forceinline unsigned movemask (bool s) { return (unsigned)s; }
+
+  __forceinline float lerp(const float v0, const float v1, const float t) {
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  template<typename T>
+    __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) {
+    return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3)));
+  }
+
+  /*! exchange */
+  template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; }
+
+   /*  load/store */
+  template<typename Ty> struct mem;
+ 
+  template<> struct mem<float> { /* masked scalar load/store: an inactive mask loads 0.0f and skips the store */
+    static __forceinline float load (bool mask, const void* ptr) { return mask ? *(const float*)ptr : 0.0f; }
+    static __forceinline float loadu(bool mask, const void* ptr) { return mask ? *(const float*)ptr : 0.0f; }
+    /* the loads only read, so they keep the const qualifier; store and storeu have identical bodies here */
+    static __forceinline void store (bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; }
+    static __forceinline void storeu(bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; }
+  };
+  
+  /*! bit reverse operation */
+  template<class T>
+    __forceinline T bitReverse(const T& vin)
+  {
+    T v = vin; // work on a copy; the masks below are 32 bits wide (presumably T is a 32-bit unsigned integer - confirm against callers)
+    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap odd and even bits
+    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap neighbouring 2-bit pairs
+    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap the two nibbles of each byte
+    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap the two bytes of each 16-bit half
+    v = ( v >> 16             ) | ( v               << 16); // swap the two 16-bit halves
+    return v;
+  }
+
+  /*! bit interleave operation */
+  template<class T>
+    __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
+  {
+    /* Spreads the low 10 bits of a value so that every input bit is
+     * followed by two zero bits (bit i moves to bit 3*i). The same four
+     * shift-and-mask steps are applied to each of the three inputs. */
+    auto spread = [](const T& vin) {
+      T v = vin;
+      v = (v | (v << 16)) & 0x030000FF;
+      v = (v | (v <<  8)) & 0x0300F00F;
+      v = (v | (v <<  4)) & 0x030C30C3;
+      v = (v | (v <<  2)) & 0x09249249;
+      return v;
+    };
+    const T x = spread(xin);
+    const T y = spread(yin);
+    const T z = spread(zin);
+
+    /* x occupies bits 0,3,6,..., y bits 1,4,7,..., z bits 2,5,8,... */
+    return x | (y << 1) | (z << 2);
+  }
+
+  /*! bit interleave operation for 64bit data types*/
+  template<class T>
+    __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){
+    T x = xin & 0x1fffff;
+    T y = yin & 0x1fffff;
+    T z = zin & 0x1fffff;
+    /* only the low 21 bits of each input are kept; the steps below spread them so that input bit i lands on bit 3*i */
+    x = (x | x << 32) & 0x1f00000000ffff;
+    x = (x | x << 16) & 0x1f0000ff0000ff;
+    x = (x | x << 8) & 0x100f00f00f00f00f;
+    x = (x | x << 4) & 0x10c30c30c30c30c3;
+    x = (x | x << 2) & 0x1249249249249249;
+    /* same spreading sequence for y */
+    y = (y | y << 32) & 0x1f00000000ffff;
+    y = (y | y << 16) & 0x1f0000ff0000ff;
+    y = (y | y << 8) & 0x100f00f00f00f00f;
+    y = (y | y << 4) & 0x10c30c30c30c30c3;
+    y = (y | y << 2) & 0x1249249249249249;
+    /* same spreading sequence for z */
+    z = (z | z << 32) & 0x1f00000000ffff;
+    z = (z | z << 16) & 0x1f0000ff0000ff;
+    z = (z | z << 8) & 0x100f00f00f00f00f;
+    z = (z | z << 4) & 0x10c30c30c30c30c3;
+    z = (z | z << 2) & 0x1249249249249249;
+    /* x fills bits 0,3,6,..., y bits 1,4,7,..., z bits 2,5,8,... of the result */
+    return x | (y << 1) | (z << 2);
+  }
+}
diff --git a/thirdparty/embree/common/math/range.h b/thirdparty/embree/common/math/range.h
index 909fadb995..f397615ea2 100644
--- a/thirdparty/embree/common/math/range.h
+++ b/thirdparty/embree/common/math/range.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "../sys/platform.h"
-#include "../math/math.h"
+#include "../math/emath.h"
 
 namespace embree
 {
diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h
index f6d98ffa0d..4e641ec249 100644
--- a/thirdparty/embree/common/math/vec2.h
+++ b/thirdparty/embree/common/math/vec2.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "math.h"
+#include "emath.h"
 
 namespace embree
 {
@@ -34,7 +34,7 @@ namespace embree
     __forceinline          Vec2( const T& x, const T& y ) : x(x), y(y) {}
 
     __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; }
-    __forceinline Vec2( const Vec2fa& other );
+    Vec2( const Vec2fa& other );
 
     template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {}
     template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; }
@@ -232,4 +232,5 @@ namespace embree
 #if defined(__AVX512F__)
   template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
 #endif
+
 }
diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h
index 4f222894c2..d57e549e68 100644
--- a/thirdparty/embree/common/math/vec2fa.h
+++ b/thirdparty/embree/common/math/vec2fa.h
@@ -4,7 +4,12 @@
 #pragma once
 
 #include "../sys/alloc.h"
-#include "math.h"
+#include "emath.h"
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "vec2fa_sycl.h"
+#else
+
 #include "../simd/sse.h"
 
 namespace embree
@@ -316,3 +321,5 @@ namespace embree
 
   typedef Vec2fa Vec2fa_t;
 }
+
+#endif
diff --git a/thirdparty/embree/common/math/vec2fa_sycl.h b/thirdparty/embree/common/math/vec2fa_sycl.h
new file mode 100644
index 0000000000..62d62bdd01
--- /dev/null
+++ b/thirdparty/embree/common/math/vec2fa_sycl.h
@@ -0,0 +1,270 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  struct Vec3fa;
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SYCL Vec2fa Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec2fa
+  {
+    //ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 2 };
+    struct { float x,y; };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ) {}
+    //__forceinline Vec2fa( const __m128 a ) : m128(a) {}
+    explicit Vec2fa(const Vec3fa& a);
+    
+    __forceinline explicit Vec2fa( const vfloat<4>& a ) {
+      x = a[0];
+      y = a[1];
+    }
+
+    __forceinline Vec2fa            ( const Vec2<float>& other  ) { x = other.x; y = other.y; }
+    __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
+
+    __forceinline Vec2fa            ( const Vec2fa& other ) { x = other.x; y = other.y; }
+    __forceinline Vec2fa& operator =( const Vec2fa& other ) { x = other.x; y = other.y; return *this; }
+
+    __forceinline explicit Vec2fa( const float a ) : x(a), y(a) {}
+    __forceinline          Vec2fa( const float x, const float y) : x(x), y(y) {}
+
+    //__forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec2fa load( const void* const a ) {
+      const float* ptr = (const float*)a;
+      return Vec2fa(ptr[0],ptr[1]);
+    }
+
+    static __forceinline Vec2fa loadu( const void* const a ) {
+      const float* ptr = (const float*)a;
+      return Vec2fa(ptr[0],ptr[1]);
+    }
+
+    static __forceinline void storeu ( void* a, const Vec2fa& v ) {
+      float* ptr = (float*)a;
+      ptr[0] = v.x; ptr[1] = v.y;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ZeroTy   ) : x(0.0f), y(0.0f) {}
+    __forceinline Vec2fa( OneTy    ) : x(1.0f), y(1.0f) {}
+    __forceinline Vec2fa( PosInfTy ) : x(+INFINITY), y(+INFINITY) {}
+    __forceinline Vec2fa( NegInfTy ) : x(-INFINITY), y(-INFINITY) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    //__forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
+    //__forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
+  __forceinline Vec2fa operator -( const Vec2fa& a ) { return Vec2fa(-a.x,-a.y); }
+  __forceinline Vec2fa abs  ( const Vec2fa& a ) { return Vec2fa(sycl::fabs(a.x),sycl::fabs(a.y)); }
+  __forceinline Vec2fa sign ( const Vec2fa& a ) { return Vec2fa(sign(a.x),sign(a.y)); } /* per-component scalar sign(): -1 for negative input, +1 otherwise (sycl::sign would return 0 for zero) */
+
+  /*! component-wise reciprocal through the public sycl::native::recip, the same call the scalar rcp() in math_sycl.h uses */
+  __forceinline Vec2fa rcp  ( const Vec2fa& a ) { return Vec2fa(sycl::native::recip(a.x),sycl::native::recip(a.y)); }
+  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return Vec2fa(sycl::sqrt(a.x),sycl::sqrt(a.y)); }
+  __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return Vec2fa(a.x*a.x,a.y*a.y); }
+  
+  __forceinline Vec2fa rsqrt( const Vec2fa& a ) { return Vec2fa(sycl::rsqrt(a.x),sycl::rsqrt(a.y)); }
+
+  /*! replaces every component whose magnitude is below min_rcp_input by +min_rcp_input; other components pass through */
+  __forceinline Vec2fa zero_fix(const Vec2fa& a) {
+    return Vec2fa(sycl::fabs(a.x) < min_rcp_input ? min_rcp_input : a.x,
+                  sycl::fabs(a.y) < min_rcp_input ? min_rcp_input : a.y);
+  }
+  __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec2fa log ( const Vec2fa& a ) {
+    return Vec2fa(sycl::log(a.x),sycl::log(a.y));
+  }
+
+  __forceinline Vec2fa exp ( const Vec2fa& a ) {
+    return Vec2fa(sycl::exp(a.x),sycl::exp(a.y));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return Vec2fa(a.x+b.x, a.y+b.y); }
+  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return Vec2fa(a.x-b.x, a.y-b.y); }
+  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return Vec2fa(a.x*b.x, a.y*b.y); }
+  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
+  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
+  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return Vec2fa(a.x/b.x, a.y/b.y); }
+  __forceinline Vec2fa operator /( const Vec2fa& a, const float b        ) { return Vec2fa(a.x/b, a.y/b); }
+  __forceinline Vec2fa operator /( const        float a, const Vec2fa& b ) { return Vec2fa(a/b.x, a/b.y); }
+
+  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) {
+    return Vec2fa(sycl::fmin(a.x,b.x), sycl::fmin(a.y,b.y));
+  }
+  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) {
+    return Vec2fa(sycl::fmax(a.x,b.x), sycl::fmax(a.y,b.y));
+  }
+
+/*
+#if defined(__SSE4_1__)
+    __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+    __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
+      return Vec2fa(powf(a.x,b),powf(a.y,b));
+    }
+*/
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return Vec2fa(madd(a.x,b.x,c.x), madd(a.y,b.y,c.y)); }
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return Vec2fa(msub(a.x,b.x,c.x), msub(a.y,b.y,c.y)); }
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return Vec2fa(nmadd(a.x,b.x,c.x), nmadd(a.y,b.y,c.y)); }
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return Vec2fa(nmsub(a.x,b.x,c.x), nmsub(a.y,b.y,c.y)); }
+
+  __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }
+  __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
+  __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
+  __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
+  __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { return a = a * b; }
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
+  __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
+  __forceinline float reduce_min(const Vec2fa& v) { return sycl::fmin(v.x,v.y); }
+  __forceinline float reduce_max(const Vec2fa& v) { return sycl::fmax(v.x,v.y); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return a.x == b.x && a.y == b.y; }
+  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return a.x != b.x || a.y != b.y; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
+    return reduce_add(a*b);
+  }
+
+  __forceinline Vec2fa cross ( const Vec2fa& a ) {
+    return Vec2fa(-a.y,a.x);
+  }
+
+  __forceinline float  sqr_length ( const Vec2fa& a )                { return dot(a,a); }
+  __forceinline float  rcp_length ( const Vec2fa& a )                { return rsqrt(dot(a,a)); }
+  __forceinline float  rcp_length2( const Vec2fa& a )                { return rcp(dot(a,a)); }
+  __forceinline float  length   ( const Vec2fa& a )                  { return sqrt(dot(a,a)); }
+  __forceinline Vec2fa normalize( const Vec2fa& a )                  { return a*rsqrt(dot(a,a)); }
+  __forceinline float  distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
+    return Vec2fa(s ? t.x : f.x, s ? t.y : f.y);
+  }
+
+  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  /*! index of the component with the larger magnitude: 0 for x, 1 for y (also 1 when both magnitudes are equal) */
+  __forceinline int maxDim ( const Vec2fa& a )
+  {
+    const Vec2fa mag = abs(a);
+    return (mag.x > mag.y) ? 0 : 1;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(sycl::trunc(a.x),sycl::trunc(a.y)); }
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(sycl::floor(a.x),sycl::floor(a.y)); }
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(sycl::ceil (a.x),sycl::ceil (a.y)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  inline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
+    return cout << "(" << a.x << ", " << a.y << ")";
+  }
+
+  /*template<>
+  __forceinline vfloat_impl<4>::vfloat_impl(const Vec2fa& a)
+  {
+    v = 0;
+    const unsigned int lid = get_sub_group_local_id();
+    if (lid == 0) v = a.x;
+    if (lid == 1) v = a.y;
+  }*/
+
+  typedef Vec2fa Vec2fa_t;
+}
diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h
index 254f6c4011..d5e78befe8 100644
--- a/thirdparty/embree/common/math/vec3.h
+++ b/thirdparty/embree/common/math/vec3.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "math.h"
+#include "emath.h"
 
 namespace embree
 {
@@ -286,6 +286,8 @@ namespace embree
 
   template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; }
 
+#if !defined(__SYCL_DEVICE_ONLY__)
+
 #if defined(__AVX__)
   template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
     x = a.x; y = a.y; z = a.z;
@@ -333,4 +335,23 @@ namespace embree
 #if defined(__AVX512F__)
   template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {}
 #endif
+  
+#else
+
+#if defined(__SSE__)
+  template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+#endif
+#if defined(__AVX__)
+  template<> __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+#endif
+#if defined(__AVX512F__)
+  template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+#endif
+#endif
 }
diff --git a/thirdparty/embree/common/math/vec3ba.h b/thirdparty/embree/common/math/vec3ba.h
index a021b522dc..bf24a2a3b6 100644
--- a/thirdparty/embree/common/math/vec3ba.h
+++ b/thirdparty/embree/common/math/vec3ba.h
@@ -4,7 +4,12 @@
 #pragma once
 
 #include "../sys/alloc.h"
-#include "math.h"
+#include "emath.h"
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "vec3ba_sycl.h"
+#else
+
 #include "../simd/sse.h"
 
 namespace embree
@@ -118,3 +123,5 @@ namespace embree
     return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")";
   }
 }
+
+#endif
diff --git a/thirdparty/embree/common/math/vec3ba_sycl.h b/thirdparty/embree/common/math/vec3ba_sycl.h
new file mode 100644
index 0000000000..a2fa13de6c
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3ba_sycl.h
@@ -0,0 +1,115 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SYCL Vec3ba Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3ba
+  {
+    //ALIGNED_STRUCT_(16);
+    
+    struct { bool x,y,z; };
+
+    typedef bool Scalar;
+    enum { N = 3 };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ba( ) {}
+    //__forceinline Vec3ba( const __m128  input ) : m128(input) {}
+
+    __forceinline Vec3ba( const Vec3ba& other ) : x(other.x), y(other.y), z(other.z) {}
+    __forceinline Vec3ba& operator =(const Vec3ba& other) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    __forceinline explicit Vec3ba( bool a ) : x(a), y(a), z(a) {}
+    __forceinline Vec3ba( bool a, bool b, bool c) : x(a), y(b), z(c) {}
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ba( FalseTy ) : x(false), y(false), z(false) {}
+    __forceinline Vec3ba( TrueTy  ) : x(true),  y(true),  z(true)  {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    //__forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    //__forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ba operator !( const Vec3ba& a ) { return Vec3ba(!a.x,!a.y,!a.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return Vec3ba(a.x & b.x, a.y & b.y, a.z & b.z); }
+  __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return Vec3ba(a.x | b.x, a.y | b.y, a.z | b.z); }
+  __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return Vec3ba(a.x != b.x, a.y != b.y, a.z != b.z); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; }
+  __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; }
+  __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) {
+    return a.x == b.x && a.y == b.y && a.z == b.z;
+  }
+  __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) {
+    return a.x != b.x || a.y != b.y || a.z != b.z;
+  }
+/*
+  __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) {
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    if (a.z != b.z) return a.z < b.z;
+    return false;
+  }
+*/
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+    
+  __forceinline bool reduce_and( const Vec3ba& a ) { return a.x & a.y & a.z; }
+  __forceinline bool reduce_or ( const Vec3ba& a ) { return a.x | a.y | a.z; }
+
+  __forceinline bool all       ( const Vec3ba& b ) { return reduce_and(b); }
+  __forceinline bool any       ( const Vec3ba& b ) { return reduce_or(b); }
+  __forceinline bool none      ( const Vec3ba& b ) { return !reduce_or(b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  inline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) {
+    return cout; // returns the stream unchanged: `a` is not written. NOTE(review): the non-SYCL vec3ba.h printer emits "(x, y, z)" -- confirm the omission is deliberate
+  }
+}
diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h
index 8564cf6d10..967e75da74 100644
--- a/thirdparty/embree/common/math/vec3fa.h
+++ b/thirdparty/embree/common/math/vec3fa.h
@@ -4,7 +4,12 @@
 #pragma once
 
 #include "../sys/alloc.h"
-#include "math.h"
+#include "emath.h"
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "vec3fa_sycl.h"
+#else
+
 #include "../simd/sse.h"
 
 namespace embree
@@ -441,7 +446,6 @@ namespace embree
     //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
 
     __forceinline Vec3fx            ( const Vec3fx& other ) { m128 = other.m128; }
-
     __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }
 
     __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {}
@@ -783,3 +787,5 @@ namespace embree
   
   typedef Vec3fx Vec3ff;
 }
+
+#endif
diff --git a/thirdparty/embree/common/math/vec3fa_sycl.h b/thirdparty/embree/common/math/vec3fa_sycl.h
new file mode 100644
index 0000000000..5fdb00ab99
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3fa_sycl.h
@@ -0,0 +1,617 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SYCL Vec3fa Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3fa
+  {
+    //ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 3 };
+    struct { float x,y,z, do_not_use; };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fa( ) {}
+    //__forceinline Vec3fa( const __m128 a ) : m128(a) {}
+    //__forceinline explicit Vec3fa(const vfloat4& a) : x(a[0]), y(a[1]), z(a[2]) {}
+
+    __forceinline Vec3fa            ( const Vec3<float>& other  ) { x = other.x; y = other.y; z = other.z; }
+    //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    __forceinline Vec3fa            ( const Vec3fa& other ) { x = other.x; y = other.y; z = other.z; }
+    __forceinline Vec3fa& operator =( const Vec3fa& other ) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    __forceinline explicit Vec3fa( const float a ) : x(a), y(a), z(a) {}
+    __forceinline          Vec3fa( const float x, const float y, const float z) : x(x), y(y), z(z) {}
+
+    __forceinline explicit Vec3fa( const Vec3ia& a ) : x((float)a.x), y((float)a.y), z((float)a.z) {}
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+    __forceinline operator vfloat4() const { return vfloat4(x,y,z,0.0f); } // FIXME: we should not need this!!
+
+    //friend __forceinline Vec3fa copy_a( const Vec3fa& a, const Vec3fa& b ) { Vec3fa c = a; c.a = b.a; return c; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec3fa load( const void* const a ) {
+      const float* ptr = (const float*)a;
+      return Vec3fa(ptr[0],ptr[1],ptr[2]);
+    }
+
+    static __forceinline Vec3fa loadu( const void* const a ) {
+      const float* ptr = (const float*)a;
+      return Vec3fa(ptr[0],ptr[1],ptr[2]);
+    }
+
+    static __forceinline void storeu ( void* a, const Vec3fa& v ) {
+      float* ptr = (float*)a;
+      ptr[0] = v.x; ptr[1] = v.y; ptr[2] = v.z;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fa( ZeroTy   ) : x(0.0f), y(0.0f), z(0.0f) {}
+    __forceinline Vec3fa( OneTy    ) : x(1.0f), y(1.0f), z(1.0f) {}
+    __forceinline Vec3fa( PosInfTy ) : x(+INFINITY), y(+INFINITY), z(+INFINITY) {}
+    __forceinline Vec3fa( NegInfTy ) : x(-INFINITY), y(-INFINITY), z(-INFINITY) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
+  __forceinline Vec3fa operator -( const Vec3fa& a ) { return Vec3fa(-a.x,-a.y,-a.z); }
+  __forceinline Vec3fa abs  ( const Vec3fa& a ) { return Vec3fa(sycl::fabs(a.x),sycl::fabs(a.y),sycl::fabs(a.z)); }
+  __forceinline Vec3fa sign ( const Vec3fa& a ) { return Vec3fa(sycl::sign(a.x),sycl::sign(a.y),sycl::sign(a.z)); }
+
+  //__forceinline Vec3fa rcp  ( const Vec3fa& a ) { return Vec3fa(sycl::recip(a.x),sycl::recip(a.y),sycl::recip(a.z)); }
+  __forceinline Vec3fa rcp  ( const Vec3fa& a ) { return Vec3fa(__sycl_std::__invoke_native_recip<float>(a.x),__sycl_std::__invoke_native_recip<float>(a.y),__sycl_std::__invoke_native_recip<float>(a.z)); }
+  __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return Vec3fa(sycl::sqrt(a.x),sycl::sqrt(a.y),sycl::sqrt(a.z)); }
+  __forceinline Vec3fa sqr  ( const Vec3fa& a ) { return Vec3fa(a.x*a.x,a.y*a.y,a.z*a.z); }
+
+  __forceinline Vec3fa rsqrt( const Vec3fa& a ) { return Vec3fa(sycl::rsqrt(a.x),sycl::rsqrt(a.y),sycl::rsqrt(a.z)); }
+
+  __forceinline Vec3fa zero_fix(const Vec3fa& a) {
+    const float x = sycl::fabs(a.x) < min_rcp_input ? min_rcp_input : a.x;
+    const float y = sycl::fabs(a.y) < min_rcp_input ? min_rcp_input : a.y;
+    const float z = sycl::fabs(a.z) < min_rcp_input ? min_rcp_input : a.z;
+    return Vec3fa(x,y,z);
+  }
+  __forceinline Vec3fa rcp_safe(const Vec3fa& a) {
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec3fa log ( const Vec3fa& a ) {
+    return Vec3fa(sycl::log(a.x),sycl::log(a.y),sycl::log(a.z));
+  }
+
+  __forceinline Vec3fa exp ( const Vec3fa& a ) {
+    return Vec3fa(sycl::exp(a.x),sycl::exp(a.y),sycl::exp(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(a.x+b.x, a.y+b.y, a.z+b.z); }
+  __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(a.x-b.x, a.y-b.y, a.z-b.z); }
+  __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(a.x*b.x, a.y*b.y, a.z*b.z); }
+  __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
+  __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return Vec3fa(a.x/b.x, a.y/b.y, a.z/b.z); }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const float b        ) { return Vec3fa(a.x/b, a.y/b, a.z/b); }
+  __forceinline Vec3fa operator /( const        float a, const Vec3fa& b ) { return Vec3fa(a/b.x, a/b.y, a/b.z); }
+
+  __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) {
+    return Vec3fa(sycl::fmin(a.x,b.x), sycl::fmin(a.y,b.y), sycl::fmin(a.z,b.z));
+  }
+  __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) {
+    return Vec3fa(sycl::fmax(a.x,b.x), sycl::fmax(a.y,b.y), sycl::fmax(a.z,b.z));
+  }
+
+/*
+#if defined(__SSE4_1__)
+    __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+*/
+  __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
+    return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(nmadd(a.x,b.x,c.x), nmadd(a.y,b.y,c.y), nmadd(a.z,b.z,c.z)); }
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return Vec3fa(nmsub(a.x,b.x,c.x), nmsub(a.y,b.y,c.y), nmsub(a.z,b.z,c.z)); }
+
+  __forceinline Vec3fa madd  ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
+  __forceinline Vec3fa msub  ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }
+  __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }
+  __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; }
+  __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const float   b ) { return a = a * b; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const float   b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec3fa& v) { return v.x+v.y+v.z; }
+  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
+  __forceinline float reduce_min(const Vec3fa& v) { return sycl::fmin(sycl::fmin(v.x,v.y),v.z); }
+  __forceinline float reduce_max(const Vec3fa& v) { return sycl::fmax(sycl::fmax(v.x,v.y),v.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; }
+  __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; }
+
+  __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x == b.x, a.y == b.y, a.z == b.z); }
+  __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x != b.x, a.y != b.y, a.z != b.z); }
+  __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x <  b.x, a.y <  b.y, a.z <  b.z); }
+  __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x <= b.x, a.y <= b.y, a.z <= b.z); }
+  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x >  b.x, a.y >  b.y, a.z >  b.z); }
+  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return Vec3ba(a.x >= b.x, a.y >= b.y, a.z >= b.z); }
+
+  __forceinline bool isvalid ( const Vec3fa& v ) {
+    return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite ( const Vec3fa& a ) {
+    return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
+    return reduce_add(a*b);
+  }
+
+  __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b ) {
+    return Vec3fa(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x));
+  }
+  
+  __forceinline float  sqr_length ( const Vec3fa& a )                { return dot(a,a); }
+  __forceinline float  rcp_length ( const Vec3fa& a )                { return rsqrt(dot(a,a)); }
+  __forceinline float  rcp_length2( const Vec3fa& a )                { return rcp(dot(a,a)); }
+  __forceinline float  length   ( const Vec3fa& a )                  { return sqrt(dot(a,a)); }
+  __forceinline Vec3fa normalize( const Vec3fa& a )                  { return a*rsqrt(dot(a,a)); }
+  __forceinline float  distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }
+  __forceinline float  halfArea ( const Vec3fa& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  __forceinline float  area     ( const Vec3fa& d )                  { return 2.0f*halfArea(d); }
+
+  __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
+    const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+  }
+
+  /*! differentiated normalization */
+  __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
+  {
+    const float pp  = dot(p,p);
+    const float pdp = dot(p,dp);
+    return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
+    return Vec3fa(s ? t.x : f.x, s ? t.y : f.y, s ? t.z : f.z);
+  }
+
+  __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
+    return Vec3fa(s.x ? t.x : f.x, s.y ? t.y : f.y, s.z ? t.z : f.z);
+  }
+  
+  __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  __forceinline int maxDim ( const Vec3fa& a )
+  {
+    const Vec3fa b = abs(a);
+    if (b.x > b.y) {
+      if (b.x > b.z) return 0; else return 2;
+    } else {
+      if (b.y > b.z) return 1; else return 2;
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(sycl::trunc(a.x),sycl::trunc(a.y),sycl::trunc(a.z)); }
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(sycl::floor(a.x),sycl::floor(a.y),sycl::floor(a.z)); }
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(sycl::ceil (a.x),sycl::ceil (a.y),sycl::ceil (a.z)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  inline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+
+  __forceinline Vec2fa::Vec2fa(const Vec3fa& a)
+    : x(a.x), y(a.y) {}
+
+  __forceinline Vec3ia::Vec3ia( const Vec3fa& a )
+    : x((int)a.x), y((int)a.y), z((int)a.z) {}
+
+  typedef Vec3fa Vec3fa_t;
+
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SYCL Vec3fx Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3fx
+  {
+    //ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 3 };
+    struct { float x,y,z; union { int a; unsigned u; float w; }; };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fx( ) {}
+    //__forceinline Vec3fx( const __m128 a ) : m128(a) {}
+    __forceinline explicit Vec3fx(const vfloat4& a) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}
+
+    __forceinline explicit Vec3fx(const Vec3fa& v) : x(v.x), y(v.y), z(v.z), w(0.0f) {}
+    __forceinline operator Vec3fa() const { return Vec3fa(x,y,z); }
+    
+    __forceinline explicit Vec3fx ( const Vec3<float>& other  ) { x = other.x; y = other.y; z = other.z; }
+    //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    //__forceinline Vec3fx            ( const Vec3fx& other ) { *(sycl::float4*)this = *(const sycl::float4*)&other; }
+    //__forceinline Vec3fx& operator =( const Vec3fx& other ) { *(sycl::float4*)this = *(const sycl::float4*)&other; return *this; }
+
+    __forceinline explicit Vec3fx( const float a ) : x(a), y(a), z(a), w(a) {}
+    __forceinline          Vec3fx( const float x, const float y, const float z) : x(x), y(y), z(z), w(z) {} // NOTE(review): w is filled with a copy of z, while Vec3fx(const Vec3fa&) zeroes w -- confirm this is intended
+
+    __forceinline Vec3fx( const Vec3fa& other, const int      a1) : x(other.x), y(other.y), z(other.z), a(a1) {}
+    __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) : x(other.x), y(other.y), z(other.z), u(a1) {}
+    __forceinline Vec3fx( const Vec3fa& other, const float    w1) : x(other.x), y(other.y), z(other.z), w(w1) {}
+
+    //__forceinline Vec3fx( const float x, const float y, const float z, const int      a) : x(x), y(y), z(z), a(a) {} // not working properly!
+    //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
+    __forceinline Vec3fx( const float x, const float y, const float z, const float w) : x(x), y(y), z(z), w(w) {}
+
+    __forceinline explicit Vec3fx( const Vec3ia& a ) : x((float)a.x), y((float)a.y), z((float)a.z), w(0.0f) {}
+
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+    __forceinline operator vfloat4() const { return vfloat4(x,y,z,w); }
+
+    //friend __forceinline Vec3fx copy_a( const Vec3fx& a, const Vec3fx& b ) { Vec3fx c = a; c.a = b.a; return c; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec3fx load( const void* const a ) {
+      const float* ptr = (const float*)a;
+      return Vec3fx(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+
+    static __forceinline Vec3fx loadu( const void* const a ) {
+      const float* ptr = (const float*)a;
+      return Vec3fx(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+
+    static __forceinline void storeu ( void* a, const Vec3fx& v ) {
+      float* ptr = (float*)a;
+      ptr[0] = v.x; ptr[1] = v.y; ptr[2] = v.z; ptr[3] = v.w;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fx( ZeroTy   ) : x(0.0f), y(0.0f), z(0.0f), w(0.0f) {}
+    __forceinline Vec3fx( OneTy    ) : x(1.0f), y(1.0f), z(1.0f), w(1.0f) {}
+    __forceinline Vec3fx( PosInfTy ) : x(+INFINITY), y(+INFINITY), z(+INFINITY), w(+INFINITY) {}
+    __forceinline Vec3fx( NegInfTy ) : x(-INFINITY), y(-INFINITY), z(-INFINITY), w(-INFINITY) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }
+  __forceinline Vec3fx operator -( const Vec3fx& a ) { return Vec3fx(-a.x,-a.y,-a.z,-a.w); }
+  __forceinline Vec3fx abs  ( const Vec3fx& a ) { return Vec3fx(sycl::fabs(a.x),sycl::fabs(a.y),sycl::fabs(a.z),sycl::fabs(a.w)); }
+  __forceinline Vec3fx sign ( const Vec3fx& a ) { return Vec3fx(sycl::sign(a.x),sycl::sign(a.y),sycl::sign(a.z),sycl::sign(a.w)); } // per-lane sign; the w lane is derived from a.w like the sibling unary operators
+
+  //__forceinline Vec3fx rcp  ( const Vec3fx& a ) { return Vec3fx(sycl::recip(a.x),sycl::recip(a.y),sycl::recip(a.z)); }
+  __forceinline Vec3fx rcp  ( const Vec3fx& a ) { return Vec3fx(__sycl_std::__invoke_native_recip<float>(a.x),__sycl_std::__invoke_native_recip<float>(a.y),__sycl_std::__invoke_native_recip<float>(a.z),__sycl_std::__invoke_native_recip<float>(a.w)); }
+  __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return Vec3fx(sycl::sqrt(a.x),sycl::sqrt(a.y),sycl::sqrt(a.z),sycl::sqrt(a.w)); }
+  __forceinline Vec3fx sqr  ( const Vec3fx& a ) { return Vec3fx(a.x*a.x,a.y*a.y,a.z*a.z,a.w*a.w); }
+
+  __forceinline Vec3fx rsqrt( const Vec3fx& a ) { return Vec3fx(sycl::rsqrt(a.x),sycl::rsqrt(a.y),sycl::rsqrt(a.z),sycl::rsqrt(a.w)); }
+
+  __forceinline Vec3fx zero_fix(const Vec3fx& a) {
+    const float x = sycl::fabs(a.x) < min_rcp_input ? min_rcp_input : a.x;
+    const float y = sycl::fabs(a.y) < min_rcp_input ? min_rcp_input : a.y;
+    const float z = sycl::fabs(a.z) < min_rcp_input ? min_rcp_input : a.z;
+    return Vec3fx(x,y,z);
+  }
+  __forceinline Vec3fx rcp_safe(const Vec3fx& a) {
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec3fx log ( const Vec3fx& a ) {
+    return Vec3fx(sycl::log(a.x),sycl::log(a.y),sycl::log(a.z));
+  }
+
+  __forceinline Vec3fx exp ( const Vec3fx& a ) {
+    return Vec3fx(sycl::exp(a.x),sycl::exp(a.y),sycl::exp(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); }
+  __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); }
+  __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); }
+  __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
+  __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return Vec3fx(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const float b        ) { return Vec3fx(a.x/b, a.y/b, a.z/b, a.w/b); }
+  __forceinline Vec3fx operator /( const        float a, const Vec3fx& b ) { return Vec3fx(a/b.x, a/b.y, a/b.z, a/b.w); }
+
+  __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) {
+    return Vec3fx(sycl::fmin(a.x,b.x), sycl::fmin(a.y,b.y), sycl::fmin(a.z,b.z), sycl::fmin(a.w,b.w));
+  }
+  __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) {
+    return Vec3fx(sycl::fmax(a.x,b.x), sycl::fmax(a.y,b.y), sycl::fmax(a.z,b.z), sycl::fmax(a.w,b.w));
+  }
+
+/*
+#if defined(__SSE4_1__)
+    __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__SSE4_1__)
+    __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+    __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
+      return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+    }
+*/
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); }
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); }
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(nmadd(a.x,b.x,c.x), nmadd(a.y,b.y,c.y), nmadd(a.z,b.z,c.z), nmadd(a.w,b.w,c.w)); }
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return Vec3fx(nmsub(a.x,b.x,c.x), nmsub(a.y,b.y,c.y), nmsub(a.z,b.z,c.z), nmsub(a.w,b.w,c.w)); }
+
+  __forceinline Vec3fx madd  ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); }
+  __forceinline Vec3fx msub  ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); }
+  __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); }
+  __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; }
+  __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const float   b ) { return a = a * b; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const float   b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec3fx& v) { return v.x+v.y+v.z; } // sum of x, y, z; w is not included
+  __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } // product of x, y, z; w is not included
+  __forceinline float reduce_min(const Vec3fx& v) { return sycl::fmin(sycl::fmin(v.x,v.y),v.z); } // smallest of x, y, z; w is not included
+  __forceinline float reduce_max(const Vec3fx& v) { return sycl::fmax(sycl::fmax(v.x,v.y),v.z); } // largest of x, y, z; w is not included
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; }
+  __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; }
+
+  __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x == b.x, a.y == b.y, a.z == b.z); }
+  __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x != b.x, a.y != b.y, a.z != b.z); }
+  __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x <  b.x, a.y <  b.y, a.z <  b.z); }
+  __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x <= b.x, a.y <= b.y, a.z <= b.z); }
+  __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x >  b.x, a.y >  b.y, a.z >  b.z); }
+  __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return Vec3ba(a.x >= b.x, a.y >= b.y, a.z >= b.z); }
+
+  __forceinline bool isvalid ( const Vec3fx& v ) {
+    return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite ( const Vec3fx& a ) {
+    return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX)));
+  }
+
+  __forceinline bool isvalid4 ( const Vec3fx& v ) {
+    // All four lanes (x, y, z, w) must lie inside [-FLT_LARGE, +FLT_LARGE].
+    // Non-short-circuit '&' is kept so every comparison is always evaluated.
+    const bool above_min = (v.x >= -FLT_LARGE) & (v.y >= -FLT_LARGE) & (v.z >= -FLT_LARGE) & (v.w >= -FLT_LARGE);
+    const bool below_max = (v.x <= +FLT_LARGE) & (v.y <= +FLT_LARGE) & (v.z <= +FLT_LARGE) & (v.w <= +FLT_LARGE);
+    return above_min & below_max;
+  }
+
+  __forceinline bool is_finite4 ( const Vec3fx& v ) {
+    // All four lanes (x, y, z, w) must lie inside [-FLT_MAX, +FLT_MAX].
+    // Non-short-circuit '&' is kept so every comparison is always evaluated.
+    const bool above_min = (v.x >= -FLT_MAX) & (v.y >= -FLT_MAX) & (v.z >= -FLT_MAX) & (v.w >= -FLT_MAX);
+    const bool below_max = (v.x <= +FLT_MAX) & (v.y <= +FLT_MAX) & (v.z <= +FLT_MAX) & (v.w <= +FLT_MAX);
+    return above_min & below_max;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
+    return reduce_add(a*b);
+  }
+
+  __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) {
+    return Vec3fx(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x));
+  }
+  
+  __forceinline float  sqr_length ( const Vec3fx& a )                { return dot(a,a); }
+  __forceinline float  rcp_length ( const Vec3fx& a )                { return rsqrt(dot(a,a)); }
+  __forceinline float  rcp_length2( const Vec3fx& a )                { return rcp(dot(a,a)); }
+  __forceinline float  length   ( const Vec3fx& a )                  { return sqrt(dot(a,a)); }
+  __forceinline Vec3fx normalize( const Vec3fx& a )                  { return a*rsqrt(dot(a,a)); }
+  __forceinline float  distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); }
+  __forceinline float  halfArea ( const Vec3fx& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  __forceinline float  area     ( const Vec3fx& d )                  { return 2.0f*halfArea(d); }
+
+  __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
+    const float len2 = dot(a,a); if (unlikely(len2 == 0.0f)) return a; return a*rsqrt(len2); // a zero dot product returns a unchanged
+  }
+
+  /*! differentiated normalization: (dot(p,p)*dp - dot(p,dp)*p) * rcp(dot(p,p)) * rsqrt(dot(p,p)) */
+  __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
+  {
+    const float len2 = dot(p,p);   // dot product of p with itself
+    const float proj = dot(p,dp);  // dot product of p with its derivative dp
+    return (len2*dp-proj*p)*rcp(len2)*rsqrt(len2);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
+    return Vec3fx(s ? t.x : f.x, s ? t.y : f.y, s ? t.z : f.z, s ? t.w : f.w);
+  }
+
+  __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
+    return Vec3fx(s.x ? t.x : f.x, s.y ? t.y : f.y, s.z ? t.z : f.z);
+  }
+  
+  __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  __forceinline int maxDim ( const Vec3fx& a )
+  {
+    // Returns 0, 1 or 2 for the x, y or z component with the largest absolute
+    // value; the comparisons are strict, so ties fall to the later axis.
+    const Vec3fx mag = abs(a);
+    const bool x_over_y = mag.x > mag.y;
+    const bool lead_over_z = (x_over_y ? mag.x : mag.y) > mag.z;
+    return lead_over_z ? (x_over_y ? 0 : 1) : 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(sycl::trunc(a.x),sycl::trunc(a.y),sycl::trunc(a.z),sycl::trunc(a.w)); }
+  __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(sycl::floor(a.x),sycl::floor(a.y),sycl::floor(a.z),sycl::floor(a.w)); }
+  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(sycl::ceil (a.x),sycl::ceil (a.y),sycl::ceil (a.z),sycl::ceil (a.w)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  inline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << "," << a.w << ")";
+  }
+
+  typedef Vec3fx Vec3ff;
+
+  //__forceinline Vec2fa::Vec2fa(const Vec3fx& a)
+  //  : x(a.x), y(a.y) {}
+
+  //__forceinline Vec3ia::Vec3ia( const Vec3fx& a )
+  //  : x((int)a.x), y((int)a.y), z((int)a.z) {}
+}
diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h
index d4cc3125cd..1472fe9135 100644
--- a/thirdparty/embree/common/math/vec3ia.h
+++ b/thirdparty/embree/common/math/vec3ia.h
@@ -4,7 +4,12 @@
 #pragma once
 
 #include "../sys/alloc.h"
-#include "math.h"
+#include "emath.h"
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#  include "vec3ia_sycl.h"
+#else
+
 #include "../simd/sse.h"
 
 namespace embree
@@ -194,3 +199,5 @@ namespace embree
     return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
   }
 }
+
+#endif
diff --git a/thirdparty/embree/common/math/vec3ia_sycl.h b/thirdparty/embree/common/math/vec3ia_sycl.h
new file mode 100644
index 0000000000..5a3d396373
--- /dev/null
+++ b/thirdparty/embree/common/math/vec3ia_sycl.h
@@ -0,0 +1,178 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "emath.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SYCL Vec3ia Type (scalar int components)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3ia
+  {
+    ALIGNED_STRUCT_(16);
+
+    struct { int x,y,z; }; // three plain int components; this struct has no SIMD register member
+
+    typedef int Scalar;
+    enum { N = 3 };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ) {} // leaves x, y, z uninitialized
+    //__forceinline Vec3ia( const __m128i a ) : m128(a) {}  // disabled: this struct has no m128 member
+
+    __forceinline Vec3ia( const Vec3ia& other ) : x(other.x), y(other.y), z(other.z) {}
+    __forceinline Vec3ia& operator =(const Vec3ia& other) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    __forceinline explicit Vec3ia( const int a ) : x(a), y(a), z(a) {} // broadcasts a to all three components
+    __forceinline          Vec3ia( const int x, const int y, const int z) : x(x), y(y), z(z) {}
+    //__forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {}  // disabled: this struct has no m128 member
+    __forceinline explicit Vec3ia(const vint4& a) : x(a[0]), y(a[1]), z(a[2]) {} // copies elements 0..2 of a; element 3 is ignored
+
+    __forceinline explicit Vec3ia( const Vec3fa& a ); // declaration only; no definition inside this struct
+
+    //__forceinline operator const __m128i&() const { return m128; }  // disabled: this struct has no m128 member
+    //__forceinline operator       __m128i&()       { return m128; }  // disabled: this struct has no m128 member
+    __forceinline operator vint4() const { return vint4(x,y,z,z); } // z is passed again as the fourth argument
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ZeroTy   ) : x(0), y(0), z(0) {}
+    __forceinline Vec3ia( OneTy    ) : x(1), y(1), z(1) {}
+    __forceinline Vec3ia( PosInfTy ) : x(0x7FFFFFFF), y(0x7FFFFFFF), z(0x7FFFFFFF) {} // "+inf" is 0x7FFFFFFF in every component
+    __forceinline Vec3ia( NegInfTy ) : x(0x80000000), y(0x80000000), z(0x80000000) {} // "-inf" is the bit pattern 0x80000000 in every component
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } // indexes from the address of x
+    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; } // indexes from the address of x
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia operator +( const Vec3ia& a ) { return Vec3ia(+a.x,+a.y,+a.z); }
+  __forceinline Vec3ia operator -( const Vec3ia& a ) { return Vec3ia(-a.x,-a.y,-a.z); }
+  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return Vec3ia(sycl::abs(a.x),sycl::abs(a.y),sycl::abs(a.z)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return Vec3ia(a.x+b.x, a.y+b.y, a.z+b.z); }
+  __forceinline Vec3ia operator +( const Vec3ia& a, const int     b ) { return a+Vec3ia(b); }
+  __forceinline Vec3ia operator +( const int     a, const Vec3ia& b ) { return Vec3ia(a)+b; }
+
+  __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return Vec3ia(a.x-b.x, a.y-b.y, a.z-b.z); }
+  __forceinline Vec3ia operator -( const Vec3ia& a, const int     b ) { return a-Vec3ia(b); }
+  __forceinline Vec3ia operator -( const int     a, const Vec3ia& b ) { return Vec3ia(a)-b; }
+
+  __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return Vec3ia(a.x*b.x, a.y*b.y, a.z*b.z); }
+  __forceinline Vec3ia operator *( const Vec3ia& a, const int     b ) { return a * Vec3ia(b); }
+  __forceinline Vec3ia operator *( const int     a, const Vec3ia& b ) { return Vec3ia(a) * b; }
+
+  __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return Vec3ia(a.x&b.x, a.y&b.y, a.z&b.z); }
+  __forceinline Vec3ia operator &( const Vec3ia& a, const int     b ) { return a & Vec3ia(b); }
+  __forceinline Vec3ia operator &( const int     a, const Vec3ia& b ) { return Vec3ia(a) & b; }
+
+  __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return Vec3ia(a.x|b.x, a.y|b.y, a.z|b.z); }
+  __forceinline Vec3ia operator |( const Vec3ia& a, const int     b ) { return a | Vec3ia(b); }
+  __forceinline Vec3ia operator |( const int     a, const Vec3ia& b ) { return Vec3ia(a) | b; }
+
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return Vec3ia(a.x^b.x, a.y^b.y, a.z^b.z); }
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const int     b ) { return a ^ Vec3ia(b); }
+  __forceinline Vec3ia operator ^( const int     a, const Vec3ia& b ) { return Vec3ia(a) ^ b; }
+
+  __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return Vec3ia(a.x<<n, a.y<<n, a.z<<n); }
+  __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return Vec3ia(a.x>>n, a.y>>n, a.z>>n); }
+
+  __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return Vec3ia(a.x<<b, a.y<<b, a.z<<b); }
+  __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return Vec3ia(a.x>>b, a.y>>b, a.z>>b); }
+  __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return Vec3ia(unsigned(a.x)>>b, unsigned(a.y)>>b, unsigned(a.z)>>b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; }
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const int&   b ) { return a = a + b; }
+  
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; }
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const int&   b ) { return a = a - b; }
+  
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; }
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const int&    b ) { return a = a * b; }
+  
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; }
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const int&    b ) { return a = a & b; }
+  
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; }
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const int&    b ) { return a = a | b; }
+  
+  __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; }
+  __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
+  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
+  __forceinline int reduce_min(const Vec3ia& v) { return sycl::min(sycl::min(v.x,v.y),v.z); }
+  __forceinline int reduce_max(const Vec3ia& v) { return sycl::max(sycl::max(v.x,v.y),v.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return a.x == b.x & a.y == b.y & a.z == b.z; }
+  __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (a.x != b.x) | (a.y != b.y) | (a.z != b.z); } // unequal when ANY component differs (was AND, i.e. only when all differ)
+
+/*
+  __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) {
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    if (a.z != b.z) return a.z < b.z;
+    return false;
+  }
+*/
+  __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return Vec3ba(a.x == b.x, a.y == b.y, a.z == b.z); }
+  __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return Vec3ba(a.x <  b.x, a.y <  b.y, a.z <  b.z); }
+  __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return Vec3ba(a.x >  b.x, a.y >  b.y, a.z >  b.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
+    // Per-component blend: take t where the mask component is set, f otherwise.
+    return Vec3ia(m.x ? t.x : f.x,
+                  m.y ? t.y : f.y,
+                  m.z ? t.z : f.z);
+  }
+  
+  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return Vec3ia(sycl::min(a.x,b.x), sycl::min(a.y,b.y), sycl::min(a.z,b.z)); }
+  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return Vec3ia(sycl::max(a.x,b.x), sycl::max(a.y,b.y), sycl::max(a.z,b.z)); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  inline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) {
+    return cout; // NOTE(review): the components of a are never written to the stream - confirm this is intentional
+  }
+}
diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h
index 10c53f47b4..5647859257 100644
--- a/thirdparty/embree/common/math/vec4.h
+++ b/thirdparty/embree/common/math/vec4.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "math.h"
+#include "emath.h"
 #include "vec3.h"
 
 namespace embree
@@ -221,6 +221,8 @@ namespace embree
 {
   template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; }
 
+#if !defined(__SYCL_DEVICE_ONLY__)
+
 #if defined(__AVX__)
   template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
     x = a.x; y = a.y; z = a.z; w = a.w;
@@ -240,4 +242,25 @@ namespace embree
 #if defined(__AVX512F__)
   template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {}
 #endif
+
+#else
+
+#if defined(__SSE__)
+  template<> __forceinline Vec4<vfloat4>::Vec4(const Vec3fx& a) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#endif
+#if defined(__AVX__)
+  template<> __forceinline Vec4<vfloat8>::Vec4(const Vec3fx& a) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#endif
+#if defined(__AVX512F__)
+  template<> __forceinline Vec4<vfloat16>::Vec4(const Vec3fx& a) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#endif
+  
+#endif
 }
+
diff --git a/thirdparty/embree/common/simd/arm/emulation.h b/thirdparty/embree/common/simd/arm/emulation.h
index 4327298019..8eea1ffe71 100644
--- a/thirdparty/embree/common/simd/arm/emulation.h
+++ b/thirdparty/embree/common/simd/arm/emulation.h
@@ -34,6 +34,7 @@ __forceinline __m128 _mm_broadcast_ss (float const * mem_addr)
 #define _MM_SET_EXCEPTION_MASK(x)
 // #define _MM_SET_FLUSH_ZERO_MODE(x)
 
+/*
 __forceinline int _mm_getcsr()
 {
   return 0;
@@ -43,6 +44,7 @@ __forceinline void _mm_mfence()
 {
   __sync_synchronize();
 }
+*/
 
 __forceinline __m128i _mm_load4epu8_epi32(__m128i *ptr)
 {
diff --git a/thirdparty/embree/common/simd/arm/sse2neon.h b/thirdparty/embree/common/simd/arm/sse2neon.h
index 43416662d7..b18d41e783 100644
--- a/thirdparty/embree/common/simd/arm/sse2neon.h
+++ b/thirdparty/embree/common/simd/arm/sse2neon.h
@@ -4,8 +4,6 @@
 // This header file provides a simple API translation layer
 // between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
 //
-// This header file does not yet translate all of the SSE intrinsics.
-//
 // Contributors to this work are:
 //   John W. Ratcliff <jratcliffscarab@gmail.com>
 //   Brandon Rowlett <browlett@nvidia.com>
@@ -13,8 +11,8 @@
 //   Eric van Beurden <evanbeurden@nvidia.com>
 //   Alexander Potylitsin <apotylitsin@nvidia.com>
 //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
-//   Jim Huang <jserv@biilabs.io>
-//   Mark Cheng <marktwtn@biilabs.io>
+//   Jim Huang <jserv@ccns.ncku.edu.tw>
+//   Mark Cheng <marktwtn@gmail.com>
 //   Malcolm James MacLeod <malcolm@gulden.com>
 //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
 //   Sebastian Pop <spop@amazon.com>
@@ -22,9 +20,12 @@
 //   Danila Kutenin <danilak@google.com>
 //   François Turban (JishinMaster) <francois.turban@gmail.com>
 //   Pei-Hsuan Hung <afcidk@gmail.com>
-//   Yang-Hao Yuan <yanghau@biilabs.io>
+//   Yang-Hao Yuan <yuanyanghau@gmail.com>
 //   Syoyo Fujita <syoyo@lighttransport.com>
 //   Brecht Van Lommel <brecht@blender.org>
+//   Jonathan Hue <jhue@adobe.com>
+//   Cuda Chen <clh960524@gmail.com>
+//   Aymen Qader <aymen.qader@arm.com>
 
 /*
  * sse2neon is freely redistributable under the MIT License.
@@ -54,7 +55,7 @@
  * This would slow down the computation a bit, but gives consistent result with
  * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
  */
-/* _mm_min_ps and _mm_max_ps */
+/* _mm_min|max_ps|ss|pd|sd */
 #ifndef SSE2NEON_PRECISE_MINMAX
 #define SSE2NEON_PRECISE_MINMAX (0)
 #endif
@@ -91,9 +92,61 @@
 #define _sse2neon_unlikely(x) (x)
 #endif
 
+/* C language does not allow initializing a variable with a function call. */
+#ifdef __cplusplus
+#define _sse2neon_const static const
+#else
+#define _sse2neon_const const
+#endif
+
 #include <stdint.h>
 #include <stdlib.h>
 
+#if defined(_WIN32)
+/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
+ * from both MinGW-w64 and MSVC.
+ */
+#define SSE2NEON_ALLOC_DEFINED
+#endif
+
+/* If using MSVC */
+#ifdef _MSC_VER
+#include <intrin.h>
+#if (defined(_M_AMD64) || defined(__x86_64__)) || \
+    (defined(_M_ARM) || defined(__arm__))
+#define SSE2NEON_HAS_BITSCAN64
+#endif
+#endif
+
+/* Compiler barrier */
+#define SSE2NEON_BARRIER()                     \
+    do {                                       \
+        __asm__ __volatile__("" ::: "memory"); \
+        (void) 0;                              \
+    } while (0)
+
+/* Memory barriers
+ * __atomic_thread_fence does not include a compiler barrier; instead,
+ * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
+ * semantics.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#include <stdatomic.h>
+#endif
+
+FORCE_INLINE void _sse2neon_smp_mb(void)
+{
+    SSE2NEON_BARRIER();
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
+    !defined(__STDC_NO_ATOMICS__)
+    atomic_thread_fence(memory_order_seq_cst);
+#elif defined(__GNUC__) || defined(__clang__)
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#else
+    /* FIXME: MSVC support */
+#endif
+}
+
 /* Architecture-specific build options */
 /* FIXME: #pragma GCC push_options is only available on GCC */
 #if defined(__GNUC__)
@@ -114,27 +167,70 @@
 #pragma GCC push_options
 #pragma GCC target("+simd")
 #endif
+#elif __ARM_ARCH == 8
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error \
+    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#endif
 #else
 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
 #endif
 #endif
 
 #include <arm_neon.h>
+#if !defined(__aarch64__) && (__ARM_ARCH == 8)
+#if defined __has_include && __has_include(<arm_acle.h>)
+#include <arm_acle.h>
+#endif
+#endif
+
+/* Apple Silicon cache lines are double the size commonly used by Intel, AMD
+ * and other Arm microarchitectures.
+ * From sysctl -a on Apple M1:
+ * hw.cachelinesize: 128
+ */
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#define SSE2NEON_CACHELINE_SIZE 128
+#else
+#define SSE2NEON_CACHELINE_SIZE 64
+#endif
 
 /* Rounding functions require either Aarch64 instructions or libm failback */
 #if !defined(__aarch64__)
 #include <math.h>
 #endif
 
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or even not accessible in user mode.
+ * To write to or otherwise access these registers from user mode,
+ * we have to perform a syscall instead.
+ */
+#if !defined(__aarch64__)
+#include <sys/time.h>
+#endif
+
 /* "__has_builtin" can be used to query support for built-in functions
  * provided by gcc/clang and other compilers that support it.
  */
 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
 /* Compatibility with gcc <= 9 */
-#if __GNUC__ <= 9
+#if defined(__GNUC__) && (__GNUC__ <= 9)
 #define __has_builtin(x) HAS##x
 #define HAS__builtin_popcount 1
 #define HAS__builtin_popcountll 1
+
+// __builtin_shuffle introduced in GCC 4.7.0
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
+#define HAS__builtin_shuffle 1
+#else
+#define HAS__builtin_shuffle 0
+#endif
+
+#define HAS__builtin_shufflevector 0
+#define HAS__builtin_nontemporal_store 0
 #else
 #define __has_builtin(x) 0
 #endif
@@ -159,6 +255,26 @@
 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
 
+#if __has_builtin(__builtin_shufflevector)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __builtin_shufflevector(a, b, __VA_ARGS__)
+#elif __has_builtin(__builtin_shuffle)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __extension__({                        \
+        type tmp = {__VA_ARGS__};          \
+        __builtin_shuffle(a, b, tmp);      \
+    })
+#endif
+
+#ifdef _sse2neon_shuffle
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
+#endif
+
 /* Rounding mode macros. */
 #define _MM_FROUND_TO_NEAREST_INT 0x00
 #define _MM_FROUND_TO_NEG_INF 0x01
@@ -166,6 +282,13 @@
 #define _MM_FROUND_TO_ZERO 0x03
 #define _MM_FROUND_CUR_DIRECTION 0x04
 #define _MM_FROUND_NO_EXC 0x08
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
 #define _MM_ROUND_NEAREST 0x0000
 #define _MM_ROUND_DOWN 0x2000
 #define _MM_ROUND_UP 0x4000
@@ -198,10 +321,17 @@ typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
 #else
 typedef float32x4_t __m128d;
 #endif
-// Note: upstream sse2neon declares __m128i as int64x2_t.  However, there's
-// many places within embree that assume __m128i can be indexed as a
-// 4 element u32.
-typedef int32x4_t __m128i; /* 128-bit vector containing integers */
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+
+// __int64 is defined in the Intrinsics Guide which maps to different datatype
+// in different data model
+#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
+#if (defined(__x86_64__) || defined(__i386__))
+#define __int64 long long
+#else
+#define __int64 int64_t
+#endif
+#endif
 
 /* type-safe casting between types */
 
@@ -233,28 +363,28 @@ typedef int32x4_t __m128i; /* 128-bit vector containing integers */
 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
 
-#define vreinterpretq_m128i_s8(x) vreinterpretq_s32_s8(x)
-#define vreinterpretq_m128i_s16(x) vreinterpretq_s32_s16(x)
-#define vreinterpretq_m128i_s32(x) (x)
-#define vreinterpretq_m128i_s64(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
 
-#define vreinterpretq_m128i_u8(x) vreinterpretq_s32_u8(x)
-#define vreinterpretq_m128i_u16(x) vreinterpretq_s32_u16(x)
-#define vreinterpretq_m128i_u32(x) vreinterpretq_s32_u32(x)
-#define vreinterpretq_m128i_u64(x) vreinterpretq_s32_u64(x)
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
 
-#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
 
-#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s32(x)
-#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s32(x)
-#define vreinterpretq_s32_m128i(x) (x)
-#define vreinterpretq_s64_m128i(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
 
-#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s32(x)
-#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s32(x)
-#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s32(x)
-#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s32(x)
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
 
 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
@@ -394,7 +524,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
 
 // Older gcc does not define vld1q_u8_x4 type
 #if defined(__GNUC__) && !defined(__clang__) &&                        \
-    ((__GNUC__ <= 10 && defined(__arm__)) ||                           \
+    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
      (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
      (__GNUC__ <= 9 && defined(__aarch64__)))
 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
@@ -414,6 +544,57 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 }
 #endif
 
+#if !defined(__aarch64__)
+/* emulate vaddv u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
+    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
+}
+#else
+// Wraps vaddv_u8
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    return vaddv_u8(v8);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+    uint8_t res = 0;
+    for (int i = 0; i < 8; ++i)
+        res += tmp[i];
+    return res;
+}
+#else
+// Wraps vaddvq_u8
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    return vaddvq_u8(a);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u16 variant */
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    /* Widen pairwise (u16 -> u32 -> u64), then fold the two 64-bit halves. */
+    const uint64x2_t wide = vpaddlq_u32(vpaddlq_u16(a));
+    const uint64x1_t total = vadd_u64(vget_low_u64(wide), vget_high_u64(wide));
+    /* Lane 0 of the 32-bit view is returned; the return type narrows it. */
+    return vget_lane_u32(vreinterpret_u32_u64(total), 0);
+}
+#else
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+#endif
+
 /* Function Naming Conventions
  * The naming convention of SSE intrinsics is straightforward. A generic SSE
  * intrinsic function is given as follows:
@@ -491,16 +672,12 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
     +------+------+------+------+------+------+-------------+
  */
 
-/* Constants for use with _mm_prefetch.  */
+/* Constants for use with _mm_prefetch. */
 enum _mm_hint {
-    _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
-    _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
-    _MM_HINT_T1 = 2,   /* load data to L2 cache only */
-    _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
-    _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
-    _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
-    _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
-    _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
 };
 
 // The bit field mapping to the FPCR(floating-point control register)
@@ -661,7 +838,8 @@ FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
     *sum = t;
 }
 
-#if defined(__ARM_FEATURE_CRYPTO)
+#if defined(__ARM_FEATURE_CRYPTO) && \
+    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
 // Wraps vmull_p64
 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 {
@@ -970,6 +1148,11 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
         vreinterpretq_m128i_s16(ret);                                          \
     })
 
+/* MMX */
+
+// _mm_empty is a no-op on Arm.
+FORCE_INLINE void _mm_empty(void) {}
+
 /* SSE */
 
 // Adds the four single-precision, floating-point values of a and b.
@@ -1035,7 +1218,7 @@ FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
 //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
 {
     return vreinterpret_m64_u16(
@@ -1050,7 +1233,7 @@ FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
 //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
 {
     return vreinterpret_m64_u8(
@@ -1333,7 +1516,7 @@ FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
 //   dst[95:64] := a[95:64]
 //   dst[127:96] := a[127:96]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
 FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
 {
     return vreinterpretq_m128_f32(
@@ -1349,10 +1532,10 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
 //       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpret_m64_s32(
         vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
 #else
@@ -1368,7 +1551,7 @@ FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
 //   dst[127:32] := a[127:32]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
 FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
 {
     return vreinterpretq_m128_f32(
@@ -1377,10 +1560,10 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
 
 // Convert the lower single-precision (32-bit) floating-point element in a to a
 // 32-bit integer, and store the result in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                           0);
 #else
@@ -1399,7 +1582,7 @@ FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
 //      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
 FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
 {
     return vreinterpretq_m128_f32(
@@ -1415,7 +1598,7 @@ FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
 //   dst[95:64] := a[95:64]
 //   dst[127:96] := a[127:96]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
 FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
 {
     return vreinterpretq_m128_f32(
@@ -1425,7 +1608,7 @@ FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
 
 // Convert packed signed 32-bit integers in a to packed single-precision
 // (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, then covert the packed signed 32-bit integers in b to
+// of dst, then convert the packed signed 32-bit integers in b to
 // single-precision (32-bit) floating-point element, and store the results in
 // the upper 2 elements of dst.
 //
@@ -1434,7 +1617,7 @@ FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
 //   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
 //   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
 FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
 {
     return vreinterpretq_m128_f32(vcvtq_f32_s32(
@@ -1450,7 +1633,7 @@ FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
 //      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
 FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
 {
     return vreinterpretq_m128_f32(vcvtq_f32_s32(
@@ -1472,23 +1655,11 @@ FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
 {
-    const __m128 i16Min = _mm_set_ps1((float) INT16_MIN);
-    const __m128 i16Max = _mm_set_ps1((float) INT16_MAX);
-    const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
-    const __m128i maxMask = _mm_castps_si128(
-        _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max)));
-    const __m128i betweenMask = _mm_castps_si128(
-        _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max)));
-    const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
-                                            _mm_setzero_si128());
-    __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX));
-    __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN));
-    __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
-    __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
-    return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32)));
+    return vreinterpret_m64_s16(
+        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
 }
 
 // Convert packed single-precision (32-bit) floating-point elements in a to
@@ -1499,7 +1670,7 @@ FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
 //       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
 
 // Convert packed single-precision (32-bit) floating-point elements in a to
@@ -1517,28 +1688,11 @@ FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
 FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
 {
-    const __m128 i8Min = _mm_set_ps1((float) INT8_MIN);
-    const __m128 i8Max = _mm_set_ps1((float) INT8_MAX);
-    const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
-    const __m128i maxMask = _mm_castps_si128(
-        _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max)));
-    const __m128i betweenMask = _mm_castps_si128(
-        _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max)));
-    const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
-                                            _mm_setzero_si128());
-    __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX));
-    __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN));
-    __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
-    __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
-    int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32));
-    int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
-    uint32_t bitMask[2] = {0xFFFFFFFF, 0};
-    int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
-
-    return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0)));
+    return vreinterpret_m64_s8(vqmovn_s16(
+        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
 }
 
 // Convert packed unsigned 16-bit integers in a to packed single-precision
@@ -1550,7 +1704,7 @@ FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
 //      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
 FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
 {
     return vreinterpretq_m128_f32(
@@ -1567,7 +1721,7 @@ FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
 //      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
 FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
 {
     return vreinterpretq_m128_f32(vcvtq_f32_u32(
@@ -1581,7 +1735,7 @@ FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
 //   dst[127:32] := a[127:32]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
 
 // Convert the signed 64-bit integer b to a single-precision (32-bit)
@@ -1591,7 +1745,7 @@ FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
 //   dst[31:0] := Convert_Int64_To_FP32(b[63:0])
 //   dst[127:32] := a[127:32]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
 FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
 {
     return vreinterpretq_m128_f32(
@@ -1602,7 +1756,7 @@ FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
 //
 //   dst[31:0] := a[31:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
 FORCE_INLINE float _mm_cvtss_f32(__m128 a)
 {
     return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
@@ -1613,7 +1767,7 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
 //
 //   dst[31:0] := Convert_FP32_To_Int32(a[31:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
 
 // Convert the lower single-precision (32-bit) floating-point element in a to a
@@ -1621,10 +1775,10 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
 //
 //   dst[63:0] := Convert_FP32_To_Int64(a[31:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
 #else
     float32_t data = vgetq_lane_f32(
@@ -1641,7 +1795,7 @@ FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
 //      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
 FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
 {
     return vreinterpret_m64_s32(
@@ -1653,7 +1807,7 @@ FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
 //
 //   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
 FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
 {
     return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
@@ -1667,7 +1821,7 @@ FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
 //      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
 
 // Convert the lower single-precision (32-bit) floating-point element in a to a
@@ -1675,7 +1829,7 @@ FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
 //
 //   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
 
 // Convert the lower single-precision (32-bit) floating-point element in a to a
@@ -1683,7 +1837,7 @@ FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
 //
 //   dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
 FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
 {
     return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
@@ -1725,21 +1879,23 @@ FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
 
 // Extract a 16-bit integer from a, selected with imm8, and store the result in
 // the lower element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
 #define _mm_extract_pi16(a, imm) \
     (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
 
 // Free aligned memory that was allocated with _mm_malloc.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
+#if !defined(SSE2NEON_ALLOC_DEFINED)
 FORCE_INLINE void _mm_free(void *addr)
 {
     free(addr);
 }
+#endif
 
 // Macro: Get the flush zero bits from the MXCSR control and status register.
 // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
 // _MM_FLUSH_ZERO_OFF
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
 FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
 {
     union {
@@ -1752,9 +1908,9 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
     } r;
 
 #if defined(__aarch64__)
-    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
 #else
-    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
 
     return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
@@ -1763,7 +1919,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
 // Macro: Get the rounding mode bits from the MXCSR control and status register.
 // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
 // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
 {
     union {
@@ -1776,9 +1932,9 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
     } r;
 
 #if defined(__aarch64__)
-    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
 #else
-    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
 
     if (r.field.bit22) {
@@ -1790,7 +1946,7 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
 
 // Copy a to dst, and insert the 16-bit integer i into dst at the location
 // specified by imm8.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
 #define _mm_insert_pi16(a, b, imm)                               \
     __extension__({                                              \
         vreinterpret_m64_s16(                                    \
@@ -1812,7 +1968,7 @@ FORCE_INLINE __m128 _mm_load_ps(const float *p)
 //   dst[95:64] := MEM[mem_addr+31:mem_addr]
 //   dst[127:96] := MEM[mem_addr+31:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
 #define _mm_load_ps1 _mm_load1_ps
 
 // Loads an single - precision, floating - point value into the low word and
@@ -1873,7 +2029,7 @@ FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
 //   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
 //   dst[127:96] := MEM[mem_addr+31:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
 FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
 {
     float32x4_t v = vrev64q_f32(vld1q_f32(p));
@@ -1894,7 +2050,7 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
 //   dst[15:0] := MEM[mem_addr+15:mem_addr]
 //   dst[MAX:16] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
 FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
 {
     return vreinterpretq_m128i_s16(
@@ -1906,7 +2062,7 @@ FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
 //   dst[MAX:64] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 {
     return vreinterpretq_m128i_s64(
@@ -1916,6 +2072,7 @@ FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
 // Allocate aligned blocks of memory.
 // https://software.intel.com/en-us/
 //         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
+#if !defined(SSE2NEON_ALLOC_DEFINED)
 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
 {
     void *ptr;
@@ -1927,11 +2084,12 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
         return ptr;
     return NULL;
 }
+#endif
 
 // Conditionally store 8-bit integer elements from a into memory using mask
 // (elements are not stored when the highest bit is not set in the corresponding
 // element) and a non-temporal memory hint.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
 FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
 {
     int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
@@ -1945,7 +2103,7 @@ FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
 // Conditionally store 8-bit integer elements from a into memory using mask
 // (elements are not stored when the highest bit is not set in the corresponding
 // element) and a non-temporal memory hint.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
 #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
 
 // Compare packed signed 16-bit integers in a and b, and store packed maximum
@@ -1956,7 +2114,7 @@ FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
 //      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
 {
     return vreinterpret_m64_s16(
@@ -1971,7 +2129,7 @@ FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
 #if SSE2NEON_PRECISE_MINMAX
     float32x4_t _a = vreinterpretq_f32_m128(a);
     float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
+    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
 #else
     return vreinterpretq_m128_f32(
         vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
@@ -1986,7 +2144,7 @@ FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
 //      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
 {
     return vreinterpret_m64_u8(
@@ -2011,7 +2169,7 @@ FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
 //      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
 {
     return vreinterpret_m64_s16(
@@ -2026,7 +2184,7 @@ FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
 #if SSE2NEON_PRECISE_MINMAX
     float32x4_t _a = vreinterpretq_f32_m128(a);
     float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
+    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
 #else
     return vreinterpretq_m128_f32(
         vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
@@ -2041,7 +2199,7 @@ FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
 //      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
 {
     return vreinterpret_m64_u8(
@@ -2095,7 +2253,7 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
 
 // Create mask from the most significant bit of each 8-bit element in a, and
 // store the result in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
 FORCE_INLINE int _mm_movemask_pi8(__m64 a)
 {
     uint8x8_t input = vreinterpret_u8_m64(a);
@@ -2159,7 +2317,7 @@ FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
 //   dst[31:0] := a[31:0] * b[31:0]
 //   dst[127:32] := a[127:32]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
 FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
 {
     return _mm_move_ss(a, _mm_mul_ps(a, b));
@@ -2168,7 +2326,7 @@ FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
 // Multiply the packed unsigned 16-bit integers in a and b, producing
 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
 // integers in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
 {
     return vreinterpret_m64_u16(vshrn_n_u32(
@@ -2192,7 +2350,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
 //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
 
 // Average packed unsigned 16-bit integers in a and b, and store the results in
@@ -2203,74 +2361,87 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
 //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
 
 // Extract a 16-bit integer from a, selected with imm8, and store the result in
 // the lower element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
 #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
 
 // Copy a to dst, and insert the 16-bit integer i into dst at the location
 // specified by imm8.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pinsrw
 #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
 
 // Compare packed signed 16-bit integers in a and b, and store packed maximum
 // values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
 
 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
 // values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
 
 // Compare packed signed 16-bit integers in a and b, and store packed minimum
 // values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
 
 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
 // values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
 #define _m_pminub(a, b) _mm_min_pu8(a, b)
 
 // Create mask from the most significant bit of each 8-bit element in a, and
 // store the result in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
 #define _m_pmovmskb(a) _mm_movemask_pi8(a)
 
 // Multiply the packed unsigned 16-bit integers in a and b, producing
 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
 // integers in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
 
-// Loads one cache line of data from address p to a location closer to the
-// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
-FORCE_INLINE void _mm_prefetch(const void *p, int i)
+// Fetch the line of data from memory that contains address p to a location in
+// the cache hierarchy specified by the locality hint i.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
+FORCE_INLINE void _mm_prefetch(char const *p, int i)
 {
-    (void) i;
-    __builtin_prefetch(p);
+    switch (i) {
+    case _MM_HINT_NTA:
+        __builtin_prefetch(p, 0, 0);
+        break;
+    case _MM_HINT_T0:
+        __builtin_prefetch(p, 0, 3);
+        break;
+    case _MM_HINT_T1:
+        __builtin_prefetch(p, 0, 2);
+        break;
+    case _MM_HINT_T2:
+        __builtin_prefetch(p, 0, 1);
+        break;
+    }
 }
 
 // Compute the absolute differences of packed unsigned 8-bit integers in a and
 // b, then horizontally sum each consecutive 8 differences to produce four
 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
 // 16 bits of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_psadbw
 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
 
 // Shuffle 16-bit integers in a using the control in imm8, and store the results
 // in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
 #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
 
 // Compute the approximate reciprocal of packed single-precision (32-bit)
 // floating-point elements in a, and store the results in dst. The maximum
 // relative error for this approximation is less than 1.5*2^-12.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
 FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
 {
     float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
@@ -2290,7 +2461,7 @@ FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
 //   dst[31:0] := (1.0 / a[31:0])
 //   dst[127:32] := a[127:32]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
 FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
 {
     return _mm_move_ss(a, _mm_rcp_ps(a));
@@ -2317,7 +2488,7 @@ FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
 // (32-bit) floating-point element in a, store the result in the lower element
 // of dst, and copy the upper 3 packed elements from a to the upper elements of
 // dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
 FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
 {
     return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
@@ -2327,7 +2498,7 @@ FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
 // b, then horizontally sum each consecutive 8 differences to produce four
 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
 // 16 bits of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
 {
     uint64x1_t t = vpaddl_u32(vpaddl_u16(
@@ -2339,7 +2510,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
 // Macro: Set the flush zero bits of the MXCSR control and status register to
 // the value in unsigned 32-bit integer a. The flush zero may contain any of the
 // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
 FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
 {
     // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
@@ -2354,17 +2525,17 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
     } r;
 
 #if defined(__aarch64__)
-    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
 #else
-    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
 
     r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
 
 #if defined(__aarch64__)
-    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
 #else
-    asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
 #endif
 }
 
@@ -2387,7 +2558,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // the value in unsigned 32-bit integer a. The rounding mode may contain any of
 // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
 // _MM_ROUND_TOWARD_ZERO
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 {
     union {
@@ -2400,9 +2571,9 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
     } r;
 
 #if defined(__aarch64__)
-    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
 #else
-    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
 
     switch (rounding) {
@@ -2424,19 +2595,18 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
     }
 
 #if defined(__aarch64__)
-    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
 #else
-    asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
 #endif
 }
 
 // Copy single-precision (32-bit) floating-point element a to the lower element
 // of dst, and zero the upper 3 elements.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
 FORCE_INLINE __m128 _mm_set_ss(float a)
 {
-    float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
+    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
 }
 
 // Sets the four single-precision, floating-point values to w.
@@ -2449,11 +2619,18 @@ FORCE_INLINE __m128 _mm_set1_ps(float _w)
     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
 }
 
+// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
 FORCE_INLINE void _mm_setcsr(unsigned int a)
 {
     _MM_SET_ROUNDING_MODE(a);
 }
 
+// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
+FORCE_INLINE unsigned int _mm_getcsr(void)
+{
+    return _MM_GET_ROUNDING_MODE();
+}
+
 // Sets the four single-precision, floating-point values to the four inputs in
 // reverse order.
 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
@@ -2472,11 +2649,11 @@ FORCE_INLINE __m128 _mm_setzero_ps(void)
 
 // Shuffle 16-bit integers in a using the control in imm8, and store the results
 // in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
-#if __has_builtin(__builtin_shufflevector)
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
+#ifdef _sse2neon_shuffle
 #define _mm_shuffle_pi16(a, imm)                                           \
     __extension__({                                                        \
-        vreinterpret_m64_s16(__builtin_shufflevector(                      \
+        vreinterpret_m64_s16(vshuffle_s16(                                 \
             vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
             ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
     })
@@ -2499,25 +2676,48 @@ FORCE_INLINE __m128 _mm_setzero_ps(void)
     })
 #endif
 
-// Guarantees that every preceding store is globally visible before any
-// subsequent store.
-// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+// Perform a serializing operation on all store-to-memory instructions that were
+// issued prior to this instruction. Guarantees that every store instruction
+// that precedes, in program order, is globally visible before any store
+// instruction which follows the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
 FORCE_INLINE void _mm_sfence(void)
 {
-    __sync_synchronize();
+    _sse2neon_smp_mb();
+}
+
+// Perform a serializing operation on all load-from-memory and store-to-memory
+// instructions that were issued prior to this instruction. Guarantees that
+// every memory access that precedes, in program order, the memory fence
+// instruction is globally visible before any memory instruction which follows
+// the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
+FORCE_INLINE void _mm_mfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// Perform a serializing operation on all load-from-memory instructions that
+// were issued prior to this instruction. Guarantees that every load instruction
+// that precedes, in program order, is globally visible before any load
+// instruction which follows the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
+FORCE_INLINE void _mm_lfence(void)
+{
+    _sse2neon_smp_mb();
 }
 
 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
 // int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_ps(a, b, imm)                                \
-    __extension__({                                              \
-        float32x4_t _input1 = vreinterpretq_f32_m128(a);         \
-        float32x4_t _input2 = vreinterpretq_f32_m128(b);         \
-        float32x4_t _shuf = __builtin_shufflevector(             \
-            _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-            (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
-        vreinterpretq_m128_f32(_shuf);                           \
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_ps(a, b, imm)                                              \
+    __extension__({                                                            \
+        float32x4_t _input1 = vreinterpretq_f32_m128(a);                       \
+        float32x4_t _input2 = vreinterpretq_f32_m128(b);                       \
+        float32x4_t _shuf =                                                    \
+            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
+        vreinterpretq_m128_f32(_shuf);                                         \
     })
 #else  // generic
 #define _mm_shuffle_ps(a, b, imm)                          \
@@ -2652,7 +2852,7 @@ FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
 //   MEM[mem_addr+95:mem_addr+64] := a[31:0]
 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
 FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
 {
     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
@@ -2675,7 +2875,7 @@ FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
 //   MEM[mem_addr+95:mem_addr+64] := a[31:0]
 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
 #define _mm_store1_ps _mm_store_ps1
 
 // Stores the upper two single-precision, floating-point values of a to the
@@ -2711,7 +2911,7 @@ FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
 //   MEM[mem_addr+95:mem_addr+64] := a[63:32]
 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
 FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
 {
     float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
@@ -2727,14 +2927,14 @@ FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
 }
 
 // Stores 16-bits of integer data a at the address p.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
 {
     vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
 }
 
 // Stores 64-bits of integer data a at the address p.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
 {
     vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
@@ -2742,7 +2942,7 @@ FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
 
 // Store 64-bits of integer data from a into memory using a non-temporal memory
 // hint.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
 FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
 {
     vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
@@ -2750,11 +2950,11 @@ FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
 
 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
 // point elements) from a into memory using a non-temporal memory hint.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
 FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
 {
 #if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
+    __builtin_nontemporal_store(reinterpret_cast<float32x4_t>(a), (float32x4_t *) p);
 #else
     vst1q_f32(p, vreinterpretq_f32_m128(a));
 #endif
@@ -2782,7 +2982,7 @@ FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
 //   dst[31:0] := a[31:0] - b[31:0]
 //   dst[127:32] := a[127:32]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
 FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
 {
     return _mm_move_ss(a, _mm_sub_ps(a, b));
@@ -2791,7 +2991,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
 // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
 // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
 // transposed matrix in these vectors (row0 now contains column 0, etc.).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS
 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
     do {                                                  \
         float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
@@ -2816,7 +3016,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
 #define _mm_ucomineq_ss _mm_comineq_ss
 
 // Return vector of type __m128i with undefined elements.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128
 FORCE_INLINE __m128i _mm_undefined_si128(void)
 {
 #if defined(__GNUC__) || defined(__clang__)
@@ -2831,7 +3031,7 @@ FORCE_INLINE __m128i _mm_undefined_si128(void)
 }
 
 // Return vector of type __m128 with undefined elements.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
 FORCE_INLINE __m128 _mm_undefined_ps(void)
 {
 #if defined(__GNUC__) || defined(__clang__)
@@ -2944,7 +3144,7 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
 
 // Add packed double-precision (64-bit) floating-point elements in a and b, and
 // store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -2967,7 +3167,7 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
 //   dst[63:0] := a[63:0] + b[63:0]
 //   dst[127:64] := a[127:64]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -2986,7 +3186,7 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
 //
 //   dst[63:0] := a[63:0] + b[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
 {
     return vreinterpret_m64_s64(
@@ -3016,7 +3216,7 @@ FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
 //     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_s8(
@@ -3025,7 +3225,7 @@ FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
 
 // Add packed unsigned 16-bit integers in a and b using saturation, and store
 // the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_u16(
@@ -3049,7 +3249,7 @@ FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
 //     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
 FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
 {
     return vreinterpretq_m128d_s64(
@@ -3072,11 +3272,11 @@ FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
 // elements in a and then AND with b, and store the results in dst.
 //
 //   FOR j := 0 to 1
-//       i := j*64
-//       dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+//       i := j*64
+//       dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
 FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
 {
     // *NOTE* argument swap
@@ -3129,17 +3329,17 @@ FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
 
 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
 // dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
 
 // Shift a right by imm8 bytes while shifting in zeros, and store the results in
 // dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
 
 // Cast vector of type __m128d to type __m128. This intrinsic is only used for
 // compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
 FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
 {
     return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
@@ -3147,7 +3347,7 @@ FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
 
 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
 // compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
 {
     return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
@@ -3155,7 +3355,7 @@ FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
 
 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
 // compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
 FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
 {
     return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
@@ -3171,7 +3371,7 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
 
 // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
 // compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
 {
 #if defined(__aarch64__)
@@ -3189,13 +3389,29 @@ FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
     return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
 }
 
-// Cache line containing p is flushed and invalidated from all caches in the
-// coherency domain. :
-// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
+// Invalidate and flush the cache line that contains p from all levels of the
+// cache hierarchy.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
+#if defined(__APPLE__)
+#include <libkern/OSCacheControl.h>
+#endif
 FORCE_INLINE void _mm_clflush(void const *p)
 {
     (void) p;
-    // no corollary for Neon?
+
+    /* sys_icache_invalidate is supported since macOS 10.5.
+     * However, it does not work on non-jailbroken iOS devices, although the
+     * compilation is successful.
+     */
+#if defined(__APPLE__)
+    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
+#elif defined(__GNUC__) || defined(__clang__)
+    uintptr_t ptr = (uintptr_t) p;
+    __builtin___clear_cache((char *) ptr,
+                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
+#else
+    /* FIXME: MSVC support */
+#endif
 }
 
 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
@@ -3226,7 +3442,7 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for equality, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3244,7 +3460,7 @@ FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b for equality, store the result in the lower element of dst, and copy the
 // upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
 FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
 {
     return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
@@ -3252,7 +3468,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for greater-than-or-equal, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3274,7 +3490,7 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b for greater-than-or-equal, store the result in the lower element of dst,
 // and copy the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3333,7 +3549,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for greater-than, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3355,7 +3571,7 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b for greater-than, store the result in the lower element of dst, and copy
 // the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3375,7 +3591,7 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for less-than-or-equal, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3397,7 +3613,7 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b for less-than-or-equal, store the result in the lower element of dst, and
 // copy the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3451,7 +3667,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for less-than, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3473,7 +3689,7 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b for less-than, store the result in the lower element of dst, and copy the
 // upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3492,7 +3708,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for not-equal, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3510,7 +3726,7 @@ FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b for not-equal, store the result in the lower element of dst, and copy the
 // upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
 FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
 {
     return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
@@ -3518,7 +3734,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for not-greater-than-or-equal, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
 FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3543,7 +3759,7 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b for not-greater-than-or-equal, store the result in the lower element of
 // dst, and copy the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
 FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
 {
     return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
@@ -3551,7 +3767,7 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for not-greater-than, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
 FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3576,7 +3792,7 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b for not-greater-than, store the result in the lower element of dst, and
 // copy the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
 FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
 {
     return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
@@ -3584,7 +3800,7 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for not-less-than-or-equal, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
 FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3609,7 +3825,7 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b for not-less-than-or-equal, store the result in the lower element of dst,
 // and copy the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
 FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
 {
     return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
@@ -3617,7 +3833,7 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for not-less-than, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
 FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3642,7 +3858,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b for not-less-than, store the result in the lower element of dst, and copy
 // the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
 FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
 {
     return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
@@ -3650,7 +3866,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // to see if neither is NaN, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3682,7 +3898,7 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b to see if neither is NaN, store the result in the lower element of dst, and
 // copy the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3704,7 +3920,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // to see if either is NaN, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3737,7 +3953,7 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b to see if either is NaN, store the result in the lower element of dst, and
 // copy the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3759,7 +3975,7 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 
 // Compare the lower double-precision (64-bit) floating-point element in a and b
 // for greater-than-or-equal, and return the boolean result (0 or 1).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3774,7 +3990,7 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 
 // Compare the lower double-precision (64-bit) floating-point element in a and b
 // for greater-than, and return the boolean result (0 or 1).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3789,7 +4005,7 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 
 // Compare the lower double-precision (64-bit) floating-point element in a and b
 // for less-than-or-equal, and return the boolean result (0 or 1).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3804,7 +4020,7 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 
 // Compare the lower double-precision (64-bit) floating-point element in a and b
 // for less-than, and return the boolean result (0 or 1).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3819,7 +4035,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 
 // Compare the lower double-precision (64-bit) floating-point element in a and b
 // for equality, and return the boolean result (0 or 1).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -3840,7 +4056,7 @@ FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
 
 // Compare the lower double-precision (64-bit) floating-point element in a and b
 // for not-equal, and return the boolean result (0 or 1).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
 FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
 {
     return !_mm_comieq_sd(a, b);
@@ -3855,7 +4071,7 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
 //     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
 {
 #if defined(__aarch64__)
@@ -3885,13 +4101,21 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
 //      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
 {
+// vrnd32xq_f64 is not supported on clang; fall back to _mm_round_pd there.
+#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
+    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
+    int64x2_t integers = vcvtq_s64_f64(rounded);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
+#else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
     double d0 = ((double *) &rnd)[0];
     double d1 = ((double *) &rnd)[1];
     return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
+#endif
 }
 
 // Convert packed double-precision (64-bit) floating-point elements in a to
@@ -3903,7 +4127,7 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
 //      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
 {
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
@@ -3924,7 +4148,7 @@ FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
 //   ENDFOR
 //   dst[127:64] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 {
 #if defined(__aarch64__)
@@ -3946,7 +4170,7 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
 //     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
 {
 #if defined(__aarch64__)
@@ -3972,7 +4196,9 @@ FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
 // does not support! It is supported on ARMv8-A however.
 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 {
-#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_FRINT)
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
+#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (_MM_GET_ROUNDING_MODE()) {
     case _MM_ROUND_NEAREST:
         return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
@@ -4029,7 +4255,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 //     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 {
 #if defined(__aarch64__)
@@ -4046,7 +4272,7 @@ FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
 //
 //   dst[63:0] := a[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 {
 #if defined(__aarch64__)
@@ -4061,7 +4287,7 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 //
 //   dst[31:0] := Convert_FP64_To_Int32(a[63:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 {
 #if defined(__aarch64__)
@@ -4078,7 +4304,7 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
 //
 //   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 {
 #if defined(__aarch64__)
@@ -4095,14 +4321,14 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
 //
 //   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
 #define _mm_cvtsd_si64x _mm_cvtsd_si64
 
 // Convert the lower double-precision (64-bit) floating-point element in b to a
 // single-precision (32-bit) floating-point element, store the result in the
 // lower element of dst, and copy the upper 3 packed elements from a to the
 // upper elements of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
 FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -4119,7 +4345,7 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
 //
 //   dst[31:0] := a[31:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
 {
     return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
@@ -4129,20 +4355,20 @@ FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
 //
 //   dst[63:0] := a[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
 {
     return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
 }
 
 // Copy the lower 64-bit integer in a to dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
 
 // Convert the signed 32-bit integer b to a double-precision (64-bit)
 // floating-point element, store the result in the lower element of dst, and
 // copy the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
 FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
 {
 #if defined(__aarch64__)
@@ -4159,7 +4385,7 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
 //
 //   dst[63:0] := a[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
 
 // Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
@@ -4179,7 +4405,7 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
 // Convert the signed 64-bit integer b to a double-precision (64-bit)
 // floating-point element, store the result in the lower element of dst, and
 // copy the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
 FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
 {
 #if defined(__aarch64__)
@@ -4204,13 +4430,13 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 
 // Copy 64-bit integer a to the lower element of dst, and zero the upper
 // element.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
 
 // Convert the signed 64-bit integer b to a double-precision (64-bit)
 // floating-point element, store the result in the lower element of dst, and
 // copy the upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
 #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
 
 // Convert the lower single-precision (32-bit) floating-point element in b to a
@@ -4221,7 +4447,7 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 //   dst[63:0] := Convert_FP32_To_FP64(b[31:0])
 //   dst[127:64] := a[127:64]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 {
     double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
@@ -4236,7 +4462,7 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 
 // Convert packed double-precision (64-bit) floating-point elements in a to
 // packed 32-bit integers with truncation, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 {
     double a0 = ((double *) &a)[0];
@@ -4246,7 +4472,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 
 // Convert packed double-precision (64-bit) floating-point elements in a to
 // packed 32-bit integers with truncation, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
 {
     double a0 = ((double *) &a)[0];
@@ -4268,7 +4494,7 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
 //
 //   dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 {
     double ret = *((double *) &a);
@@ -4280,7 +4506,7 @@ FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 //
 //   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 {
 #if defined(__aarch64__)
@@ -4296,7 +4522,7 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 //
 //   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
 
 // Divide packed double-precision (64-bit) floating-point elements in a by
@@ -4307,7 +4533,7 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 //    dst[i+63:i] := a[i+63:i] / b[i+63:i]
 //  ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -4327,7 +4553,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
 // lower double-precision (64-bit) floating-point element in b, store the result
 // in the lower element of dst, and copy the upper element from a to the upper
 // element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -4363,7 +4589,7 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
 //
 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
 FORCE_INLINE __m128d _mm_load_pd(const double *p)
 {
 #if defined(__aarch64__)
@@ -4381,7 +4607,7 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p)
 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
 #define _mm_load_pd1 _mm_load1_pd
 
 // Load a double-precision (64-bit) floating-point element from memory into the
@@ -4391,7 +4617,7 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p)
 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
 //   dst[127:64] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
 FORCE_INLINE __m128d _mm_load_sd(const double *p)
 {
 #if defined(__aarch64__)
@@ -4416,7 +4642,7 @@ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 {
 #if defined(__aarch64__)
@@ -4433,7 +4659,7 @@ FORCE_INLINE __m128d _mm_load1_pd(const double *p)
 //   dst[63:0] := a[63:0]
 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
 {
 #if defined(__aarch64__)
@@ -4446,7 +4672,7 @@ FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
 }
 
 // Load 64-bit integer from memory into the first element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
 {
     /* Load the lower 64 bits of the value pointed to by p into the
@@ -4463,7 +4689,7 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
 //   dst[127:64] := a[127:64]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 {
 #if defined(__aarch64__)
@@ -4483,7 +4709,7 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
 //   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
 {
 #if defined(__aarch64__)
@@ -4496,7 +4722,7 @@ FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
 }
 
-// Loads two double-precision from unaligned memory, floating-point values.
+// Load two double-precision floating-point values from unaligned memory.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
 FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
 {
     return _mm_load_pd(p);
@@ -4514,7 +4740,7 @@ FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 //   dst[31:0] := MEM[mem_addr+31:mem_addr]
 //   dst[MAX:32] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
 {
     return vreinterpretq_m128i_s32(
@@ -4533,6 +4759,12 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
 {
     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                               vget_low_s16(vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__)
+    int32x4_t high =
+        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
+
+    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
+#else
     int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                                vget_high_s16(vreinterpretq_s16_m128i(b)));
 
@@ -4540,13 +4772,14 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
     int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
 
     return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+#endif
 }
 
 // Conditionally store 8-bit integer elements from a into memory using mask
 // (elements are not stored when the highest bit is not set in the corresponding
 // element) and a non-temporal memory hint. mem_addr does not need to be aligned
 // on any particular boundary.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
 FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
 {
     int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
@@ -4577,12 +4810,18 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b,
 // and store packed maximum values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
+    return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
+#else
     return vreinterpretq_m128d_f64(
         vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
 #else
     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
@@ -4599,7 +4838,7 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b, store the maximum value in the lower element of dst, and copy the upper
 // element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -4607,8 +4846,8 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
 #else
     double *da = (double *) &a;
     double *db = (double *) &b;
-    double c[2] = {fmax(da[0], db[0]), da[1]};
-    return vld1q_f32((float32_t *) c);
+    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
 }
 
@@ -4632,12 +4871,18 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b,
 // and store packed minimum values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
+    return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
+#else
     return vreinterpretq_m128d_f64(
         vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
 #else
     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
@@ -4653,7 +4898,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b, store the minimum value in the lower element of dst, and copy the upper
 // element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -4661,8 +4906,8 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 #else
     double *da = (double *) &a;
     double *db = (double *) &b;
-    double c[2] = {fmin(da[0], db[0]), da[1]};
-    return vld1q_f32((float32_t *) c);
+    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
 }
 
@@ -4672,7 +4917,7 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 //   dst[63:0] := a[63:0]
 //   dst[127:64] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
 FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
 {
     return vreinterpretq_m128i_s64(
@@ -4686,7 +4931,7 @@ FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
 //   dst[63:0] := b[63:0]
 //   dst[127:64] := a[127:64]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
 FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
 {
     return vreinterpretq_m128d_f32(
@@ -4780,7 +5025,7 @@ FORCE_INLINE int _mm_movemask_epi8(__m128i a)
 
 // Set each bit of mask dst based on the most significant bit of the
 // corresponding packed double-precision (64-bit) floating-point element in a.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
 FORCE_INLINE int _mm_movemask_pd(__m128d a)
 {
     uint64x2_t input = vreinterpretq_u64_m128d(a);
@@ -4792,7 +5037,7 @@ FORCE_INLINE int _mm_movemask_pd(__m128d a)
 //
 //   dst[63:0] := a[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
 {
     return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
@@ -4804,7 +5049,7 @@ FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
 //   dst[63:0] := a[63:0]
 //   dst[127:64] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
 {
     return vreinterpretq_m128i_s64(
@@ -4826,7 +5071,7 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
 
 // Multiply packed double-precision (64-bit) floating-point elements in a and b,
 // and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -4845,7 +5090,7 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
 // Multiply the lower double-precision (64-bit) floating-point element in a and
 // b, store the result in the lower element of dst, and copy the upper element
 // from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd
 FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
 {
     return _mm_move_sd(a, _mm_mul_pd(a, b));
@@ -4856,7 +5101,7 @@ FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
 //
 //   dst[63:0] := a[31:0] * b[31:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
 {
     return vreinterpret_m64_u64(vget_low_u64(
@@ -4892,7 +5137,7 @@ FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
 // Multiply the packed unsigned 16-bit integers in a and b, producing
 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
 // integers in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
 {
     uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
@@ -4931,7 +5176,7 @@ FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
 
 // Compute the bitwise OR of packed double-precision (64-bit) floating-point
 // elements in a and b, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
 FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
 {
     return vreinterpretq_m128d_s64(
@@ -5001,7 +5246,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
 
 // Pause the processor. This is typically used in spin-wait loops and depending
 // on the x86 processor typical values are in the 40-100 cycle range. The
-// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
 // Arm cores. Experience with several databases has shown has shown an 'isb' is
 // a reasonable approximation.
 FORCE_INLINE void _mm_pause()
@@ -5013,7 +5258,7 @@ FORCE_INLINE void _mm_pause()
 // b, then horizontally sum each consecutive 8 differences to produce two
 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
 // 16 bits of 64-bit elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
 {
     uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
@@ -5089,7 +5334,7 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
 
 // Set packed double-precision (64-bit) floating-point elements in dst with the
 // supplied values.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 {
     double ALIGN_STRUCT(16) data[2] = {e0, e1};
@@ -5102,15 +5347,19 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
 
 // Broadcast double-precision (64-bit) floating-point value a to all elements of
 // dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
 #define _mm_set_pd1 _mm_set1_pd
 
 // Copy double-precision (64-bit) floating-point element a to the lower element
 // of dst, and zero the upper element.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
 FORCE_INLINE __m128d _mm_set_sd(double a)
 {
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
+#else
     return _mm_set_pd(0, a);
+#endif
 }
 
 // Sets the 8 signed 16-bit integer values to w.
@@ -5147,7 +5396,7 @@ FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
 }
 
 // Sets the 2 signed 64-bit integer values to i.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
 FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
 {
     return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
@@ -5168,7 +5417,7 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
 
 // Broadcast double-precision (64-bit) floating-point value a to all elements of
 // dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
 FORCE_INLINE __m128d _mm_set1_pd(double d)
 {
 #if defined(__aarch64__)
@@ -5207,7 +5456,7 @@ FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
 }
 
 // Set packed 64-bit integers in dst with the supplied values in reverse order.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
 FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
 {
     return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
@@ -5242,14 +5491,14 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
 
 // Set packed double-precision (64-bit) floating-point elements in dst with the
 // supplied values in reverse order.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
 FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
 {
     return _mm_set_pd(e0, e1);
 }
 
 // Return vector of type __m128d with all elements set to zero.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
 FORCE_INLINE __m128d _mm_setzero_pd(void)
 {
 #if defined(__aarch64__)
@@ -5270,14 +5519,14 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
 // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
 //                                        __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_epi32(a, imm)                              \
-    __extension__({                                            \
-        int32x4_t _input = vreinterpretq_s32_m128i(a);         \
-        int32x4_t _shuf = __builtin_shufflevector(             \
-            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
-        vreinterpretq_m128i_s32(_shuf);                        \
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_epi32(a, imm)                                            \
+    __extension__({                                                          \
+        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
+        int32x4_t _shuf =                                                    \
+            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
+        vreinterpretq_m128i_s32(_shuf);                                      \
     })
 #else  // generic
 #define _mm_shuffle_epi32(a, imm)                        \
@@ -5340,12 +5589,12 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
 //   dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
 //   dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_pd(a, b, imm8)                                          \
-    vreinterpretq_m128d_s64(__builtin_shufflevector(                        \
-        vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
-        ((imm8 & 0x2) >> 1) + 2))
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_pd(a, b, imm8)                                            \
+    vreinterpretq_m128d_s64(                                                  \
+        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
+                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
 #else
 #define _mm_shuffle_pd(a, b, imm8)                                     \
     _mm_castsi128_pd(_mm_set_epi64x(                                   \
@@ -5355,15 +5604,15 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
 
 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
 //                                          __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shufflehi_epi16(a, imm)                             \
-    __extension__({                                             \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);          \
-        int16x8_t _shuf = __builtin_shufflevector(              \
-            _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
-            (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
-            (((imm) >> 6) & 0x3) + 4);                          \
-        vreinterpretq_m128i_s16(_shuf);                         \
+#ifdef _sse2neon_shuffle
+#define _mm_shufflehi_epi16(a, imm)                                           \
+    __extension__({                                                           \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);                        \
+        int16x8_t _shuf =                                                     \
+            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
+                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
+                          (((imm) >> 6) & 0x3) + 4);                          \
+        vreinterpretq_m128i_s16(_shuf);                                       \
     })
 #else  // generic
 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
@@ -5371,11 +5620,11 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
 
 // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
 //                                          __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
+#ifdef _sse2neon_shuffle
 #define _mm_shufflelo_epi16(a, imm)                                  \
     __extension__({                                                  \
         int16x8_t _input = vreinterpretq_s16_m128i(a);               \
-        int16x8_t _shuf = __builtin_shufflevector(                   \
+        int16x8_t _shuf = vshuffleq_s16(                             \
             _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
             (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
         vreinterpretq_m128i_s16(_shuf);                              \
@@ -5396,7 +5645,7 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
 {
     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
@@ -5419,7 +5668,7 @@ FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
 {
     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
@@ -5442,7 +5691,7 @@ FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
 {
     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
@@ -5465,7 +5714,7 @@ FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
 FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
 {
     if (_sse2neon_unlikely(imm & ~15))
@@ -5486,7 +5735,7 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
 {
     if (_sse2neon_unlikely(imm & ~31))
@@ -5507,7 +5756,7 @@ FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 {
     if (_sse2neon_unlikely(imm & ~63))
@@ -5525,19 +5774,23 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
 //   FI
 //   dst[127:0] := a[127:0] << (tmp*8)
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128
-FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~15))
-        return _mm_setzero_si128();
-    uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)};
-    return vreinterpretq_m128i_u8(
-        vld1q_u8(((uint8_t const *) tmp) + (16 - imm)));
-}
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
+#define _mm_slli_si128(a, imm)                                               \
+    __extension__({                                                          \
+        int8x16_t ret;                                                       \
+        if (_sse2neon_unlikely((imm) == 0))                                  \
+            ret = vreinterpretq_s8_m128i(a);                                 \
+        else if (_sse2neon_unlikely((imm) & ~15))                            \
+            ret = vdupq_n_s8(0);                                             \
+        else                                                                 \
+            ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a),         \
+                           (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \
+        vreinterpretq_m128i_s8(ret);                                         \
+    })
 
 // Compute the square root of packed double-precision (64-bit) floating-point
 // elements in a, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
 FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 {
 #if defined(__aarch64__)
@@ -5552,7 +5805,7 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 // Compute the square root of the lower double-precision (64-bit) floating-point
 // element in b, store the result in the lower element of dst, and copy the
 // upper element from a to the upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -5574,7 +5827,7 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 //     FI
 //  ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
 {
     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
@@ -5595,7 +5848,7 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
 //     FI
 //  ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
 {
     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
@@ -5616,7 +5869,7 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
 {
     const int count = (imm & ~15) ? 15 : imm;
@@ -5635,21 +5888,21 @@ FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm)                                             \
-    __extension__({                                                        \
-        __m128i ret;                                                       \
-        if (_sse2neon_unlikely((imm) == 0)) {                              \
-            ret = a;                                                       \
-        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {            \
-            ret = vreinterpretq_m128i_s32(                                 \
-                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
-        } else {                                                           \
-            ret = vreinterpretq_m128i_s32(                                 \
-                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));              \
-        }                                                                  \
-        ret;                                                               \
+#define _mm_srai_epi32(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) == 0)) {                                \
+            ret = a;                                                         \
+        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {              \
+            ret = vreinterpretq_m128i_s32(                                   \
+                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_s32(                                   \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
+        }                                                                    \
+        ret;                                                                 \
     })
 
 // Shift packed 16-bit integers in a right by count while shifting in zeros, and
@@ -5664,7 +5917,7 @@ FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
 {
     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
@@ -5687,7 +5940,7 @@ FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
 {
     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
@@ -5710,7 +5963,7 @@ FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 {
     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
@@ -5733,17 +5986,17 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                             \
-    __extension__({                                                        \
-        __m128i ret;                                                       \
-        if (_sse2neon_unlikely(imm & ~15)) {                               \
-            ret = _mm_setzero_si128();                                     \
-        } else {                                                           \
-            ret = vreinterpretq_m128i_u16(                                 \
-                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
-        }                                                                  \
-        ret;                                                               \
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~15)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u16(                                   \
+                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
     })
 
 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
@@ -5758,18 +6011,18 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm)                                             \
-    __extension__({                                                        \
-        __m128i ret;                                                       \
-        if (_sse2neon_unlikely(imm & ~31)) {                               \
-            ret = _mm_setzero_si128();                                     \
-        } else {                                                           \
-            ret = vreinterpretq_m128i_u32(                                 \
-                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
-        }                                                                  \
-        ret;                                                               \
+#define _mm_srli_epi32(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~31)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u32(                                   \
+                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
     })
 
 // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
@@ -5784,17 +6037,17 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
-#define _mm_srli_epi64(a, imm)                                             \
-    __extension__({                                                        \
-        __m128i ret;                                                       \
-        if (_sse2neon_unlikely(imm & ~63)) {                               \
-            ret = _mm_setzero_si128();                                     \
-        } else {                                                           \
-            ret = vreinterpretq_m128i_u64(                                 \
-                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
-        }                                                                  \
-        ret;                                                               \
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~63)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u64(                                   \
+                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
     })
 
 // Shift a right by imm8 bytes while shifting in zeros, and store the results in
@@ -5806,19 +6059,22 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
 //   FI
 //   dst[127:0] := a[127:0] >> (tmp*8)
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128
-FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~15))
-        return _mm_setzero_si128();
-    uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)};
-    return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm));
-}
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
+#define _mm_srli_si128(a, imm)                                       \
+    __extension__({                                                  \
+        int8x16_t ret;                                               \
+        if (_sse2neon_unlikely((imm) & ~15))                         \
+            ret = vdupq_n_s8(0);                                     \
+        else                                                         \
+            ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \
+                           ((imm) > 15 ? 0 : (imm)));                \
+        vreinterpretq_m128i_s8(ret);                                 \
+    })
 
 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
 // or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 {
 #if defined(__aarch64__)
@@ -5831,7 +6087,7 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
 // Store the lower double-precision (64-bit) floating-point element from a into
 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
 // boundary or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 {
 #if defined(__aarch64__)
@@ -5847,7 +6103,7 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
 
 // Store the lower double-precision (64-bit) floating-point element from a into
 // memory. mem_addr does not need to be aligned on any particular boundary.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
 FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
 {
 #if defined(__aarch64__)
@@ -5867,7 +6123,7 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
 // Store the lower double-precision (64-bit) floating-point element from a into
 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
 // boundary or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
 #define _mm_store1_pd _mm_store_pd1
 
 // Store the upper double-precision (64-bit) floating-point element from a into
@@ -5875,7 +6131,7 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
 //
 //   MEM[mem_addr+63:mem_addr] := a[127:64]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
 FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
 {
 #if defined(__aarch64__)
@@ -5889,9 +6145,7 @@ FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
 FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
 {
-    uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
-    uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
-    *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
+    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
 }
 
 // Store the lower double-precision (64-bit) floating-point element from a into
@@ -5899,7 +6153,7 @@ FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
 //
 //   MEM[mem_addr+63:mem_addr] := a[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
 FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
 {
 #if defined(__aarch64__)
@@ -5916,7 +6170,7 @@ FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
 //   MEM[mem_addr+63:mem_addr] := a[127:64]
 //   MEM[mem_addr+127:mem_addr+64] := a[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
 FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
 {
     float32x4_t f = vreinterpretq_f32_m128d(a);
@@ -5926,21 +6180,21 @@ FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
 // elements) from a into memory. mem_addr does not need to be aligned on any
 // particular boundary.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
 FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
 {
     _mm_store_pd(mem_addr, a);
 }
 
 // Stores 128-bits of integer data a at the address p.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
 {
     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
 }
 
 // Stores 32-bits of integer data a at the address p.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
 FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
 {
     vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
@@ -5950,11 +6204,11 @@ FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
 // elements) from a into memory using a non-temporal memory hint. mem_addr must
 // be aligned on a 16-byte boundary or a general-protection exception may be
 // generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
 FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
 {
 #if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
+    __builtin_nontemporal_store(vreinterpretq_f32_m128d(a), (float32x4_t *) p);
 #elif defined(__aarch64__)
     vst1q_f64(p, vreinterpretq_f64_m128d(a));
 #else
@@ -5978,15 +6232,24 @@ FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
 // Store 32-bit integer a into memory using a non-temporal hint to minimize
 // cache pollution. If the cache line containing address mem_addr is already in
 // the cache, the cache will be updated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
 FORCE_INLINE void _mm_stream_si32(int *p, int a)
 {
     vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
 }
 
+// Store 64-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
+FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
+{
+    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
+}
+
 // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
 // store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_s16(
@@ -6020,7 +6283,7 @@ FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
 
 // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
 // store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_s8(
@@ -6036,7 +6299,7 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
 //     dst[i+63:i] := a[i+63:i] - b[i+63:i]
 //   ENDFOR
 //
-//  https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd
 FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -6056,7 +6319,7 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
 // the lower double-precision (64-bit) floating-point element in a, store the
 // result in the lower element of dst, and copy the upper element from a to the
 // upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
 FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
 {
     return _mm_move_sd(a, _mm_sub_pd(a, b));
@@ -6066,7 +6329,7 @@ FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
 //
 //   dst[63:0] := a[63:0] - b[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
 {
     return vreinterpret_m64_s64(
@@ -6135,7 +6398,7 @@ FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
 #define _mm_ucomineq_sd _mm_comineq_sd
 
 // Return vector of type __m128d with undefined elements.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
 FORCE_INLINE __m128d _mm_undefined_pd(void)
 {
 #if defined(__GNUC__) || defined(__clang__)
@@ -6240,7 +6503,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 //   }
 //   dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -6343,7 +6606,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 //   }
 //   dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
 FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -6364,7 +6627,7 @@ FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
 //      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
 FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
 {
     return vreinterpretq_m128d_s64(
@@ -6394,10 +6657,10 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
 //   FI
 // ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
 FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 {
-    __m128d mask = _mm_set_pd(1.0f, -1.0f);
+    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
 #if defined(__aarch64__)
     return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
                                              vreinterpretq_f64_m128d(b),
@@ -6410,10 +6673,10 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 // Alternatively add and subtract packed single-precision (32-bit)
 // floating-point elements in a to/from packed elements in b, and store the
 // results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_ps
 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 {
-    __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
 #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                             vreinterpretq_f32_m128(mask),
@@ -6425,7 +6688,7 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 
 // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
 // elements in a and b, and pack the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
 FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
 {
 #if defined(__aarch64__)
@@ -6459,13 +6722,14 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 
 // Horizontally subtract adjacent pairs of double-precision (64-bit)
 // floating-point elements in a and b, and pack the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
 FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsubq_f64(
-        vuzp1q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)),
-        vuzp2q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b))));
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
 #else
     double *da = (double *) &_a;
     double *db = (double *) &_b;
@@ -6474,18 +6738,18 @@ FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
 #endif
 }
 
-// Horizontally substract adjacent pairs of single-precision (32-bit)
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
 // floating-point elements in a and b, and pack the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
 FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 {
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
 #if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsubq_f32(
-        vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
-        vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
 #else
-    float32x4x2_t c =
-        vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
+    float32x4x2_t c = vuzpq_f32(a, b);
     return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
 #endif
 }
@@ -6496,7 +6760,7 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 //
 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
 #define _mm_lddqu_si128 _mm_loadu_si128
 
 // Load a double-precision (64-bit) floating-point element from memory into both
@@ -6505,15 +6769,15 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
 #define _mm_loaddup_pd _mm_load1_pd
 
 // Duplicate the low double-precision (64-bit) floating-point element from a,
 // and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
 FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 {
-#if (__aarch64__)
+#if defined(__aarch64__)
     return vreinterpretq_m128d_f64(
         vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
 #else
@@ -6524,11 +6788,14 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
 
 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
 // from a, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 {
-#if __has_builtin(__builtin_shufflevector)
-    return vreinterpretq_m128_f32(__builtin_shufflevector(
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
+#elif defined(_sse2neon_shuffle)
+    return vreinterpretq_m128_f32(vshuffleq_s32(
         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
 #else
     float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
@@ -6540,11 +6807,14 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
 
 // Duplicate even-indexed single-precision (32-bit) floating-point elements
 // from a, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
 {
-#if __has_builtin(__builtin_shufflevector)
-    return vreinterpretq_m128_f32(__builtin_shufflevector(
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
+#elif defined(_sse2neon_shuffle)
+    return vreinterpretq_m128_f32(vshuffleq_s32(
         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
 #else
     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
@@ -6564,7 +6834,7 @@ FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
 //     dst[i+15:i] := ABS(a[i+15:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
 {
     return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
@@ -6578,7 +6848,7 @@ FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
 //     dst[i+31:i] := ABS(a[i+31:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
 {
     return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
@@ -6592,7 +6862,7 @@ FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
 //     dst[i+7:i] := ABS(a[i+7:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
 {
     return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
@@ -6606,7 +6876,7 @@ FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
 //     dst[i+15:i] := ABS(a[i+15:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
 {
     return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
@@ -6620,7 +6890,7 @@ FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
 //     dst[i+31:i] := ABS(a[i+31:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
 {
     return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
@@ -6634,7 +6904,7 @@ FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
 //     dst[i+7:i] := ABS(a[i+7:i])
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
 {
     return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
@@ -6646,24 +6916,21 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
 //   tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
 //   dst[127:0] := tmp[127:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
-FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~31))
-        return _mm_setzero_si128();
-    int idx;
-    uint8x16_t tmp[2];
-    if (imm >= 16) {
-        idx = imm - 16;
-        tmp[0] = vreinterpretq_u8_m128i(a);
-        tmp[1] = vdupq_n_u8(0);
-    } else {
-        idx = imm;
-        tmp[0] = vreinterpretq_u8_m128i(b);
-        tmp[1] = vreinterpretq_u8_m128i(a);
-    }
-    return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx));
-}
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
+#define _mm_alignr_epi8(a, b, imm)                                            \
+    __extension__({                                                           \
+        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
+        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
+        __m128i ret;                                                          \
+        if (_sse2neon_unlikely((imm) & ~31))                                  \
+            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
+        else if ((imm) >= 16)                                                 \
+            ret = _mm_srli_si128(a, ((imm) >= 16 ? (imm) - 16 : 0));          \
+        else                                                                  \
+            ret = vreinterpretq_m128i_u8(                                     \
+                vextq_u8(_b, _a, ((imm) < 16 ? (imm) : 0)));                  \
+        ret;                                                                  \
+    })
 
 // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
 // the result right by imm8 bytes, and store the low 8 bytes in dst.
@@ -6671,7 +6938,7 @@ FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm)
 //   tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
 //   dst[63:0] := tmp[63:0]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
 #define _mm_alignr_pi8(a, b, imm)                                           \
     __extension__({                                                         \
         __m64 ret;                                                          \
@@ -6679,13 +6946,13 @@ FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm)
             ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
         } else {                                                            \
             uint8x8_t tmp_low, tmp_high;                                    \
-            if (imm >= 8) {                                                 \
-                const int idx = imm - 8;                                    \
+            if ((imm) >= 8) {                                               \
+                const int idx = (imm) -8;                                   \
                 tmp_low = vreinterpret_u8_m64(a);                           \
                 tmp_high = vdup_n_u8(0);                                    \
                 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
             } else {                                                        \
-                const int idx = imm;                                        \
+                const int idx = (imm);                                      \
                 tmp_low = vreinterpret_u8_m64(b);                           \
                 tmp_high = vreinterpret_u8_m64(a);                          \
                 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
@@ -6715,14 +6982,18 @@ FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
 {
     int32x4_t a = vreinterpretq_s32_m128i(_a);
     int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
+#else
     return vreinterpretq_m128i_s32(
         vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
                      vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+#endif
 }
 
 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
 // signed 16-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
 {
     return vreinterpret_m64_s16(
@@ -6731,7 +7002,7 @@ FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
 
 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
 // signed 32-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
 {
     return vreinterpret_m64_s32(
@@ -6762,7 +7033,7 @@ FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
 
 // Horizontally add adjacent pairs of signed 16-bit integers in a and b using
 // saturation, and pack the signed 16-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
 FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
@@ -6775,101 +7046,96 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
 #endif
 }
 
-// Computes pairwise difference of each argument as a 16-bit signed or unsigned
-// integer values a and b.
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
 {
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|a4|a6|b0|b2|b4|b6]
-    // [a1|a3|a5|a7|b1|b3|b5|b7]
-    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-    // Subtract
-    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
+#endif
 }
 
-// Computes pairwise difference of each argument as a 32-bit signed or unsigned
-// integer values a and b.
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
 {
-    int64x2_t a = vreinterpretq_s64_m128i(_a);
-    int64x2_t b = vreinterpretq_s64_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|b0|b2]
-    // [a1|a2|b1|b3]
-    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
-    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
-    // Subtract
-    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
+#else
+    int32x4x2_t c = vuzpq_s32(a, b);
+    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
+#endif
 }
 
 // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
 // the signed 16-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
 FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
 {
-    int32x4_t ab =
-        vcombine_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
-
-    int16x4_t ab_low_bits = vmovn_s32(ab);
-    int16x4_t ab_high_bits = vshrn_n_s32(ab, 16);
-
-    return vreinterpret_m64_s16(vsub_s16(ab_low_bits, ab_high_bits));
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t c = vuzp_s16(a, b);
+    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
+#endif
 }
 
 // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
 // the signed 32-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi32
 FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
 {
-#if defined(__aarch64__)
     int32x2_t a = vreinterpret_s32_m64(_a);
     int32x2_t b = vreinterpret_s32_m64(_b);
-    return vreinterpret_m64_s32(vsub_s32(vtrn1_s32(a, b), vtrn2_s32(a, b)));
+#if defined(__aarch64__)
+    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
 #else
-    int32x2x2_t trn_ab =
-        vtrn_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
-    return vreinterpret_m64_s32(vsub_s32(trn_ab.val[0], trn_ab.val[1]));
+    int32x2x2_t c = vuzp_s32(a, b);
+    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
 #endif
 }
 
 // Computes saturated pairwise difference of each argument as a 16-bit signed
 // integer values a and b.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
 {
-#if defined(__aarch64__)
     int16x8_t a = vreinterpretq_s16_m128i(_a);
     int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_s64_s16(
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|a4|a6|b0|b2|b4|b6]
-    // [a1|a3|a5|a7|b1|b3|b5|b7]
-    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-    // Saturated subtract
-    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
 #endif
 }
 
 // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
 // using saturation, and pack the signed 16-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
 FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
     int16x4_t b = vreinterpret_s16_m64(_b);
 #if defined(__aarch64__)
-    return vreinterpret_s64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
 #else
-    int16x4x2_t res = vuzp_s16(a, b);
-    return vreinterpret_s64_s16(vqsub_s16(res.val[0], res.val[1]));
+    int16x4x2_t c = vuzp_s16(a, b);
+    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
 #endif
 }
 
@@ -6921,7 +7187,7 @@ FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
 // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
 // pack the saturated results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
 FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
 {
     uint16x4_t a = vreinterpret_u16_m64(_a);
@@ -6975,7 +7241,7 @@ FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
 // Multiply packed signed 16-bit integers in a and b, producing intermediate
 // signed 32-bit integers. Truncate each intermediate integer to the 18 most
 // significant bits, round by adding 1, and store bits [16:1] to dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
 FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
 {
     int32x4_t mul_extend =
@@ -6987,7 +7253,7 @@ FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
 
 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
 // corresponding 8-bit element of b, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
 {
     int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
@@ -7028,11 +7294,11 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
 //     FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
 FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
 {
     const int8x8_t controlMask =
-        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t)(0x1 << 7 | 0x07)));
+        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
     int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
     return vreinterpret_m64_s8(res);
 }
@@ -7142,7 +7408,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
 #endif
 
-    // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a')
+    // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
     // based on ltMask
     int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
     // res = masked & (~zeroMask)
@@ -7166,7 +7432,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
 //      FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
 {
     int16x4_t a = vreinterpret_s16_m64(_a);
@@ -7183,7 +7449,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
 #endif
 
-    // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a')
+    // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
     // based on ltMask
     int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
     // res = masked & (~zeroMask)
@@ -7207,7 +7473,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
 //      FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
 {
     int32x2_t a = vreinterpret_s32_m64(_a);
@@ -7224,7 +7490,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
 #endif
 
-    // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a')
+    // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
     // based on ltMask
     int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
     // res = masked & (~zeroMask)
@@ -7248,7 +7514,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
 //      FI
 //   ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 {
     int8x8_t a = vreinterpret_s8_m64(_a);
@@ -7265,7 +7531,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
 #endif
 
-    // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a')
+    // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
     // based on ltMask
     int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
     // res = masked & (~zeroMask)
@@ -7309,7 +7575,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 
 // Blend packed double-precision (64-bit) floating-point elements from a and b
 // using control mask imm8, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
 #define _mm_blend_pd(a, b, imm)                                \
     __extension__({                                            \
         const uint64_t _mask[2] = {                            \
@@ -7323,7 +7589,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 
 // Blend packed single-precision (32-bit) floating-point elements from a and b
 // using mask, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
 {
     const uint32_t ALIGN_STRUCT(16)
@@ -7360,7 +7626,7 @@ FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
 
 // Blend packed double-precision (64-bit) floating-point elements from a and b
 // using mask, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
 FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
 {
     uint64x2_t mask =
@@ -7378,7 +7644,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
 
 // Blend packed single-precision (32-bit) floating-point elements from a and b
 // using mask, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
 FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
 {
     // Use a signed shift right to create a mask with the sign bit
@@ -7392,7 +7658,7 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
 // Round the packed double-precision (64-bit) floating-point elements in a up
 // to an integer value, and store the results as packed double-precision
 // floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
 FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 {
 #if defined(__aarch64__)
@@ -7406,10 +7672,10 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 // Round the packed single-precision (32-bit) floating-point elements in a up to
 // an integer value, and store the results as packed single-precision
 // floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
 #else
     float *f = (float *) &a;
@@ -7421,7 +7687,7 @@ FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
 // an integer value, store the result as a double-precision floating-point
 // element in the lower element of dst, and copy the upper element from a to the
 // upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
 FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
 {
     return _mm_move_sd(a, _mm_ceil_pd(b));
@@ -7435,7 +7701,7 @@ FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
 //   dst[31:0] := CEIL(b[31:0])
 //   dst[127:32] := a[127:32]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
 FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
 {
     return _mm_move_ss(a, _mm_ceil_ps(b));
@@ -7542,7 +7808,7 @@ FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
 
 // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
 // and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
 {
     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
@@ -7575,7 +7841,7 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
 // Conditionally multiply the packed double-precision (64-bit) floating-point
 // elements in a and b using the high 4 bits in imm8, sum the four products, and
 // conditionally store the sum in dst using the low 4 bits of imm8.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
 FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
 {
     // Generate mask value from constant immediate bit value
@@ -7621,7 +7887,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
 // Conditionally multiply the packed single-precision (32-bit) floating-point
 // elements in a and b using the high 4 bits in imm8, sum the four products,
 // and conditionally store the sum in dst using the low 4 bits of imm.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
 FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 {
 #if defined(__aarch64__)
@@ -7677,7 +7943,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 // Extracts the selected signed or unsigned 8-bit integer from a and zero
 // extends.
 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
 
 // Extracts the selected single-precision (32-bit) floating-point from a.
@@ -7687,7 +7953,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
 // Round the packed double-precision (64-bit) floating-point elements in a down
 // to an integer value, and store the results as packed double-precision
 // floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
 FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 {
 #if defined(__aarch64__)
@@ -7701,10 +7967,10 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 // Round the packed single-precision (32-bit) floating-point elements in a down
 // to an integer value, and store the results as packed single-precision
 // floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
 #else
     float *f = (float *) &a;
@@ -7716,7 +7982,7 @@ FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
 // an integer value, store the result as a double-precision floating-point
 // element in the lower element of dst, and copy the upper element from a to the
 // upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
 FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
 {
     return _mm_move_sd(a, _mm_floor_pd(b));
@@ -7730,7 +7996,7 @@ FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
 //   dst[31:0] := FLOOR(b[31:0])
 //   dst[127:32] := a[127:32]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
 FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
 {
     return _mm_move_ss(a, _mm_floor_ps(b));
@@ -7769,7 +8035,7 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
 // Copy a to tmp, then insert a single-precision (32-bit) floating-point
 // element from b into tmp using the control in imm8. Store tmp to dst using
 // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps
 #define _mm_insert_ps(a, b, imm8)                                              \
     __extension__({                                                            \
         float32x4_t tmp1 =                                                     \
@@ -7808,7 +8074,7 @@ FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
 
 // Compare packed signed 8-bit integers in a and b, and store packed maximum
 // values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_s8(
@@ -7817,7 +8083,7 @@ FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
 
 // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
 // values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_u16(
@@ -7826,7 +8092,7 @@ FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
 
 // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
 // values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_u32(
@@ -7851,7 +8117,7 @@ FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
 
 // Compare packed signed 8-bit integers in a and b, and store packed minimum
 // values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_s8(
@@ -7860,7 +8126,7 @@ FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
 
 // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
 // values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_u16(
@@ -7869,7 +8135,7 @@ FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
 
 // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
 // values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_u32(
@@ -7892,15 +8158,22 @@ FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
 //   dst[18:16] := index[2:0]
 //   dst[127:19] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
 {
     __m128i dst;
     uint16_t min, idx = 0;
-    // Find the minimum value
 #if defined(__aarch64__)
+    // Find the minimum value
     min = vminvq_u16(vreinterpretq_u16_m128i(a));
+
+    // Get the index of the minimum value
+    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint16x8_t minv = vdupq_n_u16(min);
+    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
+    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
 #else
+    // Find the minimum value
     __m64 tmp;
     tmp = vreinterpret_m64_u16(
         vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
@@ -7910,7 +8183,6 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
     tmp = vreinterpret_m64_u16(
         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
     min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
-#endif
     // Get the index of the minimum value
     int i;
     for (i = 0; i < 8; i++) {
@@ -7920,6 +8192,7 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
         }
         a = _mm_srli_si128(a, 2);
     }
+#endif
     // Generate result
     dst = _mm_setzero_si128();
     dst = vreinterpretq_m128i_u16(
@@ -7935,7 +8208,7 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
 // quadruplets from a. One quadruplet is selected from b starting at on the
 // offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
 // integers selected from a starting at the offset specified in imm8.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
 FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
 {
     uint8x16_t _a, _b;
@@ -7982,13 +8255,13 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
 
     int16x8_t c04, c15, c26, c37;
     uint8x8_t low_b = vget_low_u8(_b);
-    c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
-    _a = vextq_u8(_a, _a, 1);
-    c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
-    _a = vextq_u8(_a, _a, 1);
-    c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
-    _a = vextq_u8(_a, _a, 1);
-    c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
+    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
+    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
+    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
+    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
+    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
+    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
+    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
 #if defined(__aarch64__)
     // |0|4|2|6|
     c04 = vpaddq_s16(c04, c26);
@@ -8056,7 +8329,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
 // Round the packed double-precision (64-bit) floating-point elements in a using
 // the rounding parameter, and store the results as packed double-precision
 // floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
 FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 {
 #if defined(__aarch64__)
@@ -8128,7 +8401,7 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
 {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
     switch (rounding) {
     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
         return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
@@ -8185,7 +8458,7 @@ FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
 // the rounding parameter, store the result as a double-precision floating-point
 // element in the lower element of dst, and copy the upper element from a to the
 // upper element of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
 FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
 {
     return _mm_move_sd(a, _mm_round_pd(b, rounding));
@@ -8205,7 +8478,7 @@ FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
 //     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
 //     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
 //     _MM_SET_ROUNDING_MODE
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
 FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
 {
     return _mm_move_ss(a, _mm_round_ps(b, rounding));
@@ -8217,7 +8490,7 @@ FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
 //
 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
 {
 #if __has_builtin(__builtin_nontemporal_store)
@@ -8229,16 +8502,16 @@ FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
 
 // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
 // all 1's, and return 1 if the result is zero, otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
 FORCE_INLINE int _mm_test_all_ones(__m128i a)
 {
-    return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
            ~(uint64_t) 0;
 }
 
 // Compute the bitwise AND of 128 bits (representing integer data) in a and
 // mask, and return 1 if the result is zero, otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
 FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
 {
     int64x2_t a_and_mask =
@@ -8251,7 +8524,7 @@ FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
 // the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
 // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
 // otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros
 FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
 {
     uint64x2_t zf =
@@ -8266,12 +8539,11 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
 // otherwise set CF to 0. Return the CF value.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 {
     int64x2_t s64 =
-        vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
-                  vreinterpretq_s64_m128i(b));
+        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
 }
 
@@ -8280,14 +8552,14 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
 // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
 // otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
 #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
 
 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
 // otherwise set CF to 0. Return the ZF value.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
 {
     int64x2_t s64 =
@@ -8297,6 +8569,756 @@ FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
 
 /* SSE4.2 */
 
+static const uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+}; /* entry i is 1 << i; storage class first avoids -Wold-style-declaration */
+static const uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+}; /* entry i is 1 << (i % 8); storage class first, as C recommends */
+
+/* specify the source data format (values occupy bits [1:0]) */
+#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
+#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
+#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
+#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
+
+/* specify the comparison operation (values occupy bits [3:2]) */
+#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
+#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
+#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
+#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
+
+/* specify the polarity (values occupy bits [5:4]) */
+#define _SIDD_POSITIVE_POLARITY 0x00
+#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
+#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
+#define _SIDD_MASKED_NEGATIVE_POLARITY \
+    0x30 /* negate results only before end of string */
+
+/* specify the output selection in _mm_cmpXstri (bit 6) */
+#define _SIDD_LEAST_SIGNIFICANT 0x00
+#define _SIDD_MOST_SIGNIFICANT 0x40
+
+/* specify the output selection in _mm_cmpXstrm (bit 6, same bit as above) */
+#define _SIDD_BIT_MASK 0x00
+#define _SIDD_UNIT_MASK 0x40
+
+/* Pattern Matching for C macros.
+ * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
+ */
+
+/* catenate: SSE2NEON_CAT macro-expands its arguments before pasting them */
+#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
+#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
+
+#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
+/* c is 0: expand to everything after the 1st argument */
+#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
+/* c is 1: expand to the 1st argument */
+#define SSE2NEON_IIF_1(t, ...) t
+
+#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b) /* 0 <-> 1 */
+#define SSE2NEON_COMPL_0 1
+#define SSE2NEON_COMPL_1 0
+
+#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x) /* x - 1, x in 1..16 */
+#define SSE2NEON_DEC_1 0
+#define SSE2NEON_DEC_2 1
+#define SSE2NEON_DEC_3 2
+#define SSE2NEON_DEC_4 3
+#define SSE2NEON_DEC_5 4
+#define SSE2NEON_DEC_6 5
+#define SSE2NEON_DEC_7 6
+#define SSE2NEON_DEC_8 7
+#define SSE2NEON_DEC_9 8
+#define SSE2NEON_DEC_10 9
+#define SSE2NEON_DEC_11 10
+#define SSE2NEON_DEC_12 11
+#define SSE2NEON_DEC_13 12
+#define SSE2NEON_DEC_14 13
+#define SSE2NEON_DEC_15 14
+#define SSE2NEON_DEC_16 15
+
+/* detection: SSE2NEON_CHECK(x) is 0; SSE2NEON_CHECK(SSE2NEON_PROBE(x)) is 1 */
+#define SSE2NEON_CHECK_N(x, n, ...) n
+#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
+#define SSE2NEON_PROBE(x) x, 1,
+
+#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
+#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~) /* makes SSE2NEON_NOT(0) yield 1 */
+
+#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x)) /* 0 stays 0, other tokens give 1 */
+#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
+
+#define SSE2NEON_EAT(...) /* discards its arguments */
+#define SSE2NEON_EXPAND(...) __VA_ARGS__ /* passes its arguments through */
+#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
+
+/* recursion */
+/* deferred expression: postpone a macro call to a later preprocessor scan */
+#define SSE2NEON_EMPTY()
+#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
+#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
+/* SSE2NEON_EXPAND is defined once, next to SSE2NEON_EAT; not repeated here */
+
+#define SSE2NEON_EVAL(...) \
+    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
+#define SSE2NEON_EVAL1(...) \
+    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
+#define SSE2NEON_EVAL2(...) \
+    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
+#define SSE2NEON_EVAL3(...) __VA_ARGS__ /* identity; EVAL nests it 27 times */
+
+#define SSE2NEON_REPEAT(count, macro, ...)                         \
+    SSE2NEON_WHEN(count) /* expands to nothing when count is 0 */  \
+    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
+        SSE2NEON_DEC(count), macro, /* recurse with count - 1 */   \
+        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
+                                              __VA_ARGS__)) /* then macro(count - 1, ...) */
+#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT /* lets REPEAT name itself */
+
+#define SSE2NEON_SIZE_OF_byte 8 /* lane width in bits */
+#define SSE2NEON_NUMBER_OF_LANES_byte 16 /* lanes in a 128-bit vector */
+#define SSE2NEON_SIZE_OF_word 16 /* lane width in bits */
+#define SSE2NEON_NUMBER_OF_LANES_word 8 /* lanes in a 128-bit vector */
+
+#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) /* mtx[i]: a == b[i] */ \
+    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                          \
+        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
+        vreinterpretq_##type##_m128i(a)));
+
+#define SSE2NEON_FILL_LANE(i, type) \
+    vec_b[i] = /* b[i] broadcast */ \
+        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
+
+#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
+                       number_of_lanes, byte_or_word)                         \
+    do { /* mtx[i]: per lane of a, b[i] >= a (even) or b[i] <= a (odd) */     \
+        SSE2NEON_CAT(                                                         \
+            data_type_prefix,                                                 \
+            SSE2NEON_CAT(size,                                                \
+                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
+        vec_b[number_of_lanes]; /* vec_b[i]: lane i of b, broadcast */        \
+        __m128i mask = SSE2NEON_IIF(byte_or_word)( /* ones in even lanes */   \
+            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
+            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
+        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
+                                      SSE2NEON_CAT(type_prefix, size)))       \
+        for (int i = 0; i < number_of_lanes; i++) {                           \
+            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
+                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
+                SSE2NEON_CAT(vreinterpretq_u,                                 \
+                             SSE2NEON_CAT(size, _m128i))(mask),               \
+                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
+                    vec_b[i],                                                 \
+                    SSE2NEON_CAT(                                             \
+                        vreinterpretq_,                                       \
+                        SSE2NEON_CAT(type_prefix,                             \
+                                     SSE2NEON_CAT(size, _m128i(a))))),        \
+                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
+                    vec_b[i],                                                 \
+                    SSE2NEON_CAT(                                             \
+                        vreinterpretq_,                                       \
+                        SSE2NEON_CAT(type_prefix,                             \
+                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
+        }                                                                     \
+    } while (0)
+
+#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) /* mtx[i]: a == b[i] */ \
+    do { /* expands SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE once per lane */   \
+        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                       \
+                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
+                                      SSE2NEON_CAT(u, size)))                \
+    } while (0)
+
+#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) /* type: byte or word */            \
+    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
+                                                int lb)                       \
+    { /* fill mtx via PCMPSTR_EQ, then aggregate it */                        \
+        __m128i mtx[16]; /* word variants use only mtx[0..7] */               \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
+        return SSE2NEON_CAT(                                                  \
+            _sse2neon_aggregate_equal_any_,                                   \
+            SSE2NEON_CAT(                                                     \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
+                                             type))))(la, lb, mtx);           \
+    }
+
+#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
+    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
+                                                 int lb)                       \
+    { /* e.g. us = u, data_type = uint; byte_or_word: 1 = byte, 0 = word */    \
+        __m128i mtx[16]; /* word variants use only mtx[0..7] */                \
+        PCMPSTR_RANGES(                                                        \
+            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_ranges_,                                       \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
+                                             type))))(la, lb, mtx);            \
+    }
+
+// Generates _sse2neon_cmp_<type>_equal_ordered(a, la, b, lb) for one element
+// type token. The generated function passes a, b and a local 16-entry matrix
+// to PCMPSTR_EQ, then calls _sse2neon_aggregate_equal_ordered_<size>x<lanes>
+// with the lane count of the element type as its first argument, followed by
+// la, lb and the matrix.
+// NOTE(review): mtx is not initialised here, so PCMPSTR_EQ (defined earlier
+// in the file) is presumably what fills it -- confirm against that macro.
+#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
+    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
+                                                    __m128i b, int lb)         \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));             \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_equal_ordered_,                                \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x,                                                \
+                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
+    }
+
+// Reduces the byte-wise EQUAL_ANY comparison rows to one result bit per row.
+//
+// la  - builds the lane mask (1 << la) - 1 applied to every row.
+// lb  - number of rows (0..lb-1) that are examined and overwritten.
+// mtx - comparison rows; each examined row is replaced by its masked lanes
+//       shifted down to 0/1.
+//
+// Returns an int whose bit j is set when row j still has a non-zero lane.
+static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
+{
+    const int valid_bits = (1 << la) - 1;
+    const uint8x8_t bit_sel = vld1_u8(_sse2neon_cmpestr_mask8b);
+    // Turn the low and high byte of valid_bits into per-lane select masks.
+    const uint8x16_t valid_lanes =
+        vcombine_u8(vtst_u8(vdup_n_u8(valid_bits & 0xff), bit_sel),
+                    vtst_u8(vdup_n_u8(valid_bits >> 8), bit_sel));
+    int result = 0;
+    for (int row = 0; row < lb; ++row) {
+        uint8x16_t hits =
+            vandq_u8(valid_lanes, vreinterpretq_u8_m128i(mtx[row]));
+        hits = vshrq_n_u8(hits, 7);
+        mtx[row] = vreinterpretq_m128i_u8(hits);
+        if (_sse2neon_vaddvq_u8(hits))
+            result |= 1 << row;
+    }
+    return result;
+}
+
+// Reduces the word-wise EQUAL_ANY comparison rows to one result bit per row.
+//
+// la  - builds the lane mask (1 << la) - 1 applied to every row.
+// lb  - number of rows (0..lb-1) that are examined and overwritten.
+// mtx - comparison rows; each examined row is replaced by its masked lanes
+//       shifted down to 0/1.
+//
+// Returns an int whose bit j is set when row j still has a non-zero lane.
+static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
+{
+    const int valid_bits = (1 << la) - 1;
+    const uint16x8_t valid_lanes = vtstq_u16(
+        vdupq_n_u16(valid_bits), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    int result = 0;
+    for (int row = 0; row < lb; ++row) {
+        uint16x8_t hits =
+            vandq_u16(valid_lanes, vreinterpretq_u16_m128i(mtx[row]));
+        hits = vshrq_n_u16(hits, 15);
+        mtx[row] = vreinterpretq_m128i_u16(hits);
+        if (_sse2neon_vaddvq_u16(hits))
+            result |= 1 << row;
+    }
+    return result;
+}
+
+/* clang-format off */
+// Expands prefix##IMPL once for each element type token (byte, word).
+#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
+    prefix##IMPL(byte) \
+    prefix##IMPL(word)
+/* clang-format on */
+
+// Instantiates SSE2NEON_CMP_EQUAL_ANY_IMPL for byte and word, i.e. defines
+// _sse2neon_cmp_byte_equal_any and _sse2neon_cmp_word_equal_any.
+SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
+
+// Reduces the word-wise RANGES comparison rows to one result bit per row.
+//
+// la  - builds the lane mask (1 << la) - 1 applied to every row.
+// lb  - number of rows (0..lb-1) that are examined and overwritten.
+// mtx - comparison rows; each examined row is replaced by its masked lanes
+//       shifted down to 0/1.
+//
+// Returns an int whose bit j is set when, in row j, at least one adjacent
+// pair of 16-bit lanes (2k, 2k+1) is set in both halves after masking.
+static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        // Viewed as 32-bit lanes, the high half of each lane is the odd
+        // 16-bit lane; AND-ing it onto the low half leaves 1 only when both
+        // lanes of the pair are set.
+        __m128i tmp = vreinterpretq_m128i_u32(
+            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
+        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
+                                       vreinterpretq_u32_m128i(tmp));
+#if defined(__aarch64__)
+        int t = vaddvq_u32(vec_res) ? 1 : 0;
+#else
+        // Each 32-bit lane of vec_res is 0 or 1, so the horizontal sum can be
+        // as large as 4 when several pairs match. Collapse it to a single bit
+        // (as the AArch64 path does) so the shift below cannot spill into
+        // neighbouring result bits.
+        uint64x2_t sumh = vpaddlq_u32(vec_res);
+        int t = (vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1)) ? 1 : 0;
+#endif
+        res |= (t << j);
+    }
+    return res;
+}
+
+// Reduces the byte-wise RANGES comparison rows to one result bit per row.
+//
+// la  - builds the lane mask (1 << la) - 1 applied to every row.
+// lb  - number of rows (0..lb-1) that are examined and overwritten.
+// mtx - comparison rows; each examined row is replaced by its masked lanes
+//       shifted down to 0/1.
+//
+// Returns an int whose bit j is set when, in row j, at least one adjacent
+// pair of byte lanes (2k, 2k+1) is set in both halves after masking.
+static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+{
+    const int valid_bits = (1 << la) - 1;
+    const uint8x8_t bit_sel = vld1_u8(_sse2neon_cmpestr_mask8b);
+    // Turn the low and high byte of valid_bits into per-lane select masks.
+    const uint8x16_t valid_lanes =
+        vcombine_u8(vtst_u8(vdup_n_u8(valid_bits & 0xff), bit_sel),
+                    vtst_u8(vdup_n_u8(valid_bits >> 8), bit_sel));
+    int result = 0;
+    for (int row = 0; row < lb; ++row) {
+        uint8x16_t flags =
+            vandq_u8(valid_lanes, vreinterpretq_u8_m128i(mtx[row]));
+        flags = vshrq_n_u8(flags, 7);
+        mtx[row] = vreinterpretq_m128i_u8(flags);
+        // As 16-bit lanes: the high byte is the odd lane of each pair.
+        uint16x8_t pairs = vreinterpretq_u16_m128i(mtx[row]);
+        uint16x8_t both = vandq_u16(pairs, vshrq_n_u16(pairs, 8));
+        if (_sse2neon_vaddvq_u16(both))
+            result |= 1 << row;
+    }
+    return result;
+}
+
+// Temporary flags handed to SSE2NEON_CMP_RANGES_IMPL as its byte_or_word
+// argument; they are undefined again right after the expansion below.
+#define SSE2NEON_CMP_RANGES_IS_BYTE 1
+#define SSE2NEON_CMP_RANGES_IS_WORD 0
+
+/* clang-format off */
+// Expands prefix##IMPL for the four combinations of element type (byte,
+// word) and signedness (u with uint, s with int).
+#define SSE2NEON_GENERATE_CMP_RANGES(prefix)             \
+    prefix##IMPL(byte, uint, u, prefix##IS_BYTE)         \
+    prefix##IMPL(byte, int, s, prefix##IS_BYTE)          \
+    prefix##IMPL(word, uint, u, prefix##IS_WORD)         \
+    prefix##IMPL(word, int, s, prefix##IS_WORD)
+/* clang-format on */
+
+// Defines _sse2neon_cmp_{ubyte,sbyte,uword,sword}_ranges.
+SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
+
+#undef SSE2NEON_CMP_RANGES_IS_BYTE
+#undef SSE2NEON_CMP_RANGES_IS_WORD
+
+// EQUAL_EACH comparison for 8-bit characters: compares a and b lane by lane
+// and returns a 16-bit mask with one bit per lane.
+//
+// a, b   - the two 16-byte strings.
+// la, lb - lengths used to build the override masks below.
+//
+// Lane i of the result is:
+//   - 0 when bit i is in m0 (la >= lb and lb <= i < la),
+//   - bit i of tb when bit i is in m1 (i >= la),
+//   - the a[i] == b[i] comparison otherwise.
+// NOTE(review): this reading assumes _sse2neon_cmpestr_mask8b holds one
+// distinct bit per lane (1, 2, 4, ...) -- confirm against its definition.
+static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint8x16_t mtx =
+        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
+    // m0: bits lb..la-1 (empty when la < lb).
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    // m1: bits la..15; tb: bits lb..15.
+    int m1 = 0x10000 - (1 << la);
+    int tb = 0x10000 - (1 << lb);
+    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
+    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
+    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    // Expand each 16-bit mask into two 8-lane select vectors (low byte for
+    // lanes 0-7, high byte for lanes 8-15).
+    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
+    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
+    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
+    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
+    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
+    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
+
+    // First override: force 0 where m0 is set, keep the comparison elsewhere.
+    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
+    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
+    // Second override: where m1 is set, take the tb-derived value instead.
+    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
+    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
+    // Keep only each lane's own bit so the horizontal add yields a bit mask.
+    res_lo = vand_u8(res_lo, vec_mask);
+    res_hi = vand_u8(res_hi, vec_mask);
+
+    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
+    return res;
+}
+
+// EQUAL_EACH comparison for 16-bit characters: compares a and b lane by lane
+// and returns an 8-bit mask with one bit per lane.
+//
+// Lane i of the result is 0 when la >= lb and lb <= i < la, is bit i of
+// (0x100 - (1 << lb)) when i >= la, and is the a[i] == b[i] comparison
+// otherwise.
+// NOTE(review): this reading assumes _sse2neon_cmpestr_mask16b holds one
+// distinct bit per lane -- confirm against its definition.
+static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    const uint16x8_t bit_sel = vld1q_u16(_sse2neon_cmpestr_mask16b);
+    const int a_only_bits = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    const int past_a_bits = 0x100 - (1 << la);
+    const int past_b_bits = 0x100 - (1 << lb);
+
+    uint16x8_t eq =
+        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    // Force 0 where only a is still within its length.
+    eq = vbslq_u16(vtstq_u16(vdupq_n_u16(a_only_bits), bit_sel),
+                   vdupq_n_u16(0), eq);
+    // Past the end of a, take the value derived from lb instead.
+    eq = vbslq_u16(vtstq_u16(vdupq_n_u16(past_a_bits), bit_sel),
+                   vtstq_u16(vdupq_n_u16(past_b_bits), bit_sel), eq);
+    // Keep each lane's own bit and add them up into the result mask.
+    return _sse2neon_vaddvq_u16(vandq_u16(eq, bit_sel));
+}
+
+// Temporary flags passed as data_type to the generator below: 1 selects the
+// byte flavour of each SSE2NEON_IIF(...) choice, 0 the word flavour.
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
+
+// Generates _sse2neon_aggregate_equal_ordered_<size>x<number_of_lanes>(
+//     bound, la, lb, mtx).
+//
+// The generated function:
+//   1. builds vec1, a lane mask derived from m1 (the bits from la upwards);
+//   2. for rows 0..lb-1, forces the vec1 lanes of the row to all-ones;
+//   3. for rows lb..bound-1, replaces the row by all-ones in the vec1 lanes
+//      and zero elsewhere;
+//   4. views mtx as a bound x bound array of elements and, for every start
+//      position i, ANDs bit 0 of the elements ptr[(i + j) * bound + j] for
+//      j = 0..bound-i-1, adding the outcome to the result as bit i.
+// NOTE(review): steps 2-3 presumably make positions beyond the end of either
+// string count as a match -- confirm against the PCMPxSTRx specification.
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
+    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
+        int bound, int la, int lb, __m128i mtx[16])                            \
+    {                                                                          \
+        int res = 0;                                                           \
+        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
+        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
+            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
+            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
+        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
+            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
+                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
+            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
+        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
+        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
+        for (int j = 0; j < lb; j++) {                                         \
+            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size(                \
+                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j])));   \
+        }                                                                      \
+        for (int j = lb; j < bound; j++) {                                     \
+            mtx[j] = vreinterpretq_m128i_u##size(                              \
+                vbslq_u##size(vec1, vec_minusone, vec_zero));                  \
+        }                                                                      \
+        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr =                   \
+            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx;             \
+        for (int i = 0; i < bound; i++) {                                      \
+            int val = 1;                                                       \
+            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
+                val &= ptr[k * bound + j];                                     \
+            res += val << i;                                                   \
+        }                                                                      \
+        return res;                                                            \
+    }
+
+/* clang-format off */
+// Expands prefix##IMPL for the 8-bit x 16-lane and 16-bit x 8-lane layouts,
+// passing the matching prefix##IS_UBYTE / prefix##IS_UWORD flag.
+#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
+    prefix##IMPL(8, 16, prefix##IS_UBYTE)               \
+    prefix##IMPL(16, 8, prefix##IS_UWORD)
+/* clang-format on */
+
+// Defines _sse2neon_aggregate_equal_ordered_8x16 and
+// _sse2neon_aggregate_equal_ordered_16x8.
+SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
+
+#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
+#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
+
+/* clang-format off */
+// Expands prefix##IMPL once for each element type token (byte, word).
+#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
+    prefix##IMPL(byte)                              \
+    prefix##IMPL(word)
+/* clang-format on */
+
+// Defines _sse2neon_cmp_byte_equal_ordered and
+// _sse2neon_cmp_word_equal_ordered.
+SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
+
+// X-macro list pairing each comparison-mode enumerator with the suffix of the
+// function that implements it. The list is expanded twice below with
+// different definitions of _ so that the enum values and the function table
+// stay in the same order. The signed and unsigned EQUAL_ANY, EQUAL_EACH and
+// EQUAL_ORDERED modes share one implementation; only RANGES has separate
+// signed and unsigned functions.
+// NOTE(review): the table is indexed with (imm8 & 0x0f), so the entry order
+// presumably mirrors the encoding of the low four imm8 bits -- confirm
+// against the _SIDD_* constants before reordering.
+#define SSE2NEON_CMPESTR_LIST                          \
+    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
+    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any)         \
+    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
+    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any)         \
+    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges)              \
+    _(CMP_UWORD_RANGES, cmp_uword_ranges)              \
+    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges)              \
+    _(CMP_SWORD_RANGES, cmp_sword_ranges)              \
+    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
+    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each)       \
+    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
+    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each)       \
+    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
+    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
+
+// Enumerators CMP_UBYTE_EQUAL_ANY (0) .. CMP_SWORD_EQUAL_ORDERED (15).
+enum {
+#define _(name, func_suffix) name,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+// Common signature of all comparison implementations listed above.
+typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+// Dispatch table in the same order as the enum.
+static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
+#define _(name, func_suffix) _sse2neon_##func_suffix,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+
+// Applies the polarity selected by imm8 bits 4-5 to an aggregated result.
+//
+// res   - aggregated comparison mask.
+// lb    - length used to build the (1 << lb) - 1 mask for masked negation.
+// imm8  - control byte; only (imm8 & 0x30) is inspected.
+// bound - 8 keeps the low 8 result bits, any other value the low 16.
+//
+// Returns res with every bit inverted (_SIDD_NEGATIVE_POLARITY), with only
+// the low lb bits inverted (_SIDD_MASKED_NEGATIVE_POLARITY), or unchanged,
+// truncated to the width chosen by bound.
+FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
+{
+    const int polarity = imm8 & 0x30;
+
+    if (polarity == _SIDD_NEGATIVE_POLARITY)
+        res ^= 0xffffffff;
+    else if (polarity == _SIDD_MASKED_NEGATIVE_POLARITY)
+        res ^= (1 << lb) - 1;
+
+    const int keep = (bound == 8) ? 0xFF : 0xFFFF;
+    return res & keep;
+}
+
+// Counts the leading (most-significant) zero bits of x.
+//
+// Returns 0..31 for a non-zero x and 32 when x is 0.
+FORCE_INLINE int _sse2neon_clz(unsigned int x)
+{
+#if defined(_MSC_VER)
+    // _BitScanReverse yields the index of the highest set bit, so the number
+    // of leading zeros is that index's distance from bit 31. (The forward
+    // scan used here previously returns the trailing-zero count instead.)
+    unsigned long cnt = 0;
+    if (_BitScanReverse(&cnt, x))
+        return 31 - (int) cnt;
+    return 32;
+#else
+    // __builtin_clz is undefined for 0, hence the explicit guard.
+    return x != 0 ? __builtin_clz(x) : 32;
+#endif
+}
+
+// Counts the trailing (least-significant) zero bits of x.
+//
+// Returns 0..31 for a non-zero x and 32 when x is 0.
+FORCE_INLINE int _sse2neon_ctz(unsigned int x)
+{
+#if defined(_MSC_VER)
+    // _BitScanForward yields the index of the lowest set bit, which is
+    // exactly the trailing-zero count. (The reverse scan with "31 - cnt" used
+    // here previously computes the leading-zero count instead.)
+    unsigned long cnt = 0;
+    if (_BitScanForward(&cnt, x))
+        return (int) cnt;
+    return 32;
+#else
+    // __builtin_ctz is undefined for 0, hence the explicit guard.
+    return x != 0 ? __builtin_ctz(x) : 32;
+#endif
+}
+
+// Counts the trailing (least-significant) zero bits of a 64-bit value.
+//
+// Returns 0..63 for a non-zero x and 64 when x is 0.
+FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
+{
+#if defined(_MSC_VER)
+    unsigned long cnt;
+    // NOTE(review): SSE2NEON_HAS_BITSCAN64 is expected to be defined elsewhere
+    // in this header for targets that provide _BitScanForward64; when it is
+    // not defined the two 32-bit scans below are used instead.
+#if defined(SSE2NEON_HAS_BITSCAN64)
+    if (_BitScanForward64(&cnt, x))
+        return (int) (cnt);
+#else
+    // Scan the low half first; fall back to the high half with a +32 offset.
+    if (_BitScanForward(&cnt, (unsigned long) (x)))
+        return (int) cnt;
+    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
+        return (int) (cnt + 32);
+#endif
+    return 64;
+#else
+    // __builtin_ctzll is undefined for 0, hence the explicit guard.
+    return x != 0 ? __builtin_ctzll(x) : 64;
+#endif
+}
+
+// Smaller of x and y. The whole expansion is parenthesised so it can be used
+// inside a larger expression; note that each argument may be evaluated twice,
+// so do not pass expressions with side effects.
+#define SSE2NEON_MIN(x, y) ((x) < (y) ? (x) : (y))
+
+// Declares `const int var` in the enclosing scope: 8 when bit 0 of imm is set
+// (the 16-bit character form), 16 otherwise.
+#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
+    const int var = (imm & 0x01) ? 8 : 16
+
+// Normalises the explicit lengths la and lb in place: each is replaced by its
+// absolute value (computed with the sign-mask trick x ^ s, minus s, where
+// s = x >> 31) and then clamped to `bound`.
+// Requires a variable named `bound` in the enclosing scope and declares the
+// temporaries tmp1 and tmp2 there; the a and b arguments are not used.
+// NOTE(review): relies on int being 32 bits wide and on >> of a negative int
+// being an arithmetic shift -- confirm for every supported compiler.
+#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
+    int tmp1 = la ^ (la >> 31);                  \
+    la = tmp1 - (la >> 31);                      \
+    int tmp2 = lb ^ (lb >> 31);                  \
+    lb = tmp2 - (lb >> 31);                      \
+    la = SSE2NEON_MIN(la, bound);                \
+    lb = SSE2NEON_MIN(lb, bound)
+
+// Compare all pairs of characters in strings a and b, then aggregate the
+// result.
+// The only difference between PCMPESTR* and PCMPISTR* is how the string
+// lengths are obtained, so the IE argument (CMPESTRX or CMPISTRX) selects
+// SSE2NEON_CMPESTRX_LEN_PAIR or SSE2NEON_CMPISTRX_LEN_PAIR to produce la and
+// lb.
+// After expansion the enclosing scope contains `bound` (8 or 16), the
+// normalised lengths la and lb, and `r2`: the value returned by the
+// comparison function selected with (imm8 & 0x0f), post-processed by
+// _sse2neon_sido_negative.
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
+    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
+    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
+    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
+
+// Expands to a `return` statement, so it must be the last statement of the
+// calling function. Returns bound when r2 is 0; otherwise the index of the
+// most significant set bit of r2 when bit 6 of imm8 (0x40) is set, or of the
+// least significant set bit when it is clear.
+#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
+    return (r2 == 0) ? bound                                     \
+                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
+                                      : _sse2neon_ctz(r2))
+
+// Declares a zeroed __m128i named dst, fills it from the variables imm8,
+// bound and r2 of the enclosing scope, and returns it -- so it must be the
+// last statement of the calling function.
+//   - imm8 bit 6 (0x40) set: every element whose bit is set in r2 becomes
+//     all-ones (eight 16-bit elements when bound == 8, sixteen 8-bit elements
+//     otherwise).
+//   - imm8 bit 6 clear: r2 is stored in lane 0 only (its low 16 bits as a
+//     16-bit lane when bound == 16, its low 8 bits as an 8-bit lane
+//     otherwise).
+#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
+    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
+    if (imm8 & 0x40) {                                                         \
+        if (bound == 8) {                                                      \
+            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
+                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
+            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
+                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
+        } else {                                                               \
+            uint8x16_t vec_r2 =                                                \
+                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
+            uint8x16_t tmp =                                                   \
+                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
+        }                                                                      \
+    } else {                                                                   \
+        if (bound == 16) {                                                     \
+            dst = vreinterpretq_m128i_u16(                                     \
+                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
+        } else {                                                               \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
+        }                                                                      \
+    }                                                                          \
+    return dst
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and returns 1 if b did not contain a null character and the
+// resulting mask was zero, and 0 otherwise.
+//
+// "b did not contain a null character" means the length of b reaches the
+// element count (Intel: lb > UpperBound, where UpperBound == bound - 1). The
+// test uses lb after SSE2NEON_COMP_AGG has replaced it by its absolute value
+// clamped to bound, so lb == bound and negative lengths are both handled,
+// matching the check in _mm_cmpistra.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
+FORCE_INLINE int _mm_cmpestra(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return (r2 == 0) && (lb >= bound);
+}
+
+// Explicit-length string compare of a (length la) and b (length lb) under
+// the control byte imm8. Returns 1 when the resulting mask has any bit set
+// and 0 when it is empty.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
+FORCE_INLINE int
+_mm_cmpestrc(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return r2 ? 1 : 0;
+}
+
+// Explicit-length string compare of a (length la) and b (length lb) under
+// the control byte imm8, returning an index derived from the resulting mask:
+// the element count when the mask is empty, otherwise the position of its
+// most significant set bit (imm8 bit 6 set) or least significant set bit.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
+FORCE_INLINE int
+_mm_cmpestri(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    if (r2 == 0)
+        return bound;
+    if (imm8 & 0x40)
+        return 31 - _sse2neon_clz(r2);
+    return _sse2neon_ctz(r2);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and store the generated mask in dst.
+// SSE2NEON_COMP_AGG leaves the aggregated result in r2 (and the element count
+// in bound); SSE2NEON_CMPSTR_GENERATE_MASK turns r2 into the returned vector
+// and contains the function's return statement.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
+FORCE_INLINE __m128i
+_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
+}
+
+// Explicit-length string compare of a (length la) and b (length lb) under
+// the control byte imm8. Returns bit 0 of the resulting mask.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
+FORCE_INLINE int
+_mm_cmpestro(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    const int lowest_bit = r2 & 1;
+    return lowest_bit;
+}
+
+// Reports whether string a ends inside the vector: returns 1 when la is
+// smaller than the element count selected by imm8 (8 or 16), 0 otherwise.
+// Only la and imm8 are inspected.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
+FORCE_INLINE int
+_mm_cmpestrs(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    return la < bound;
+}
+
+// Reports whether string b ends inside the vector: returns 1 when lb is
+// smaller than the element count selected by imm8 (8 or 16), 0 otherwise.
+// Only lb and imm8 are inspected.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
+FORCE_INLINE int
+_mm_cmpestrz(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    return lb < bound;
+}
+
+// Stores in `len` the index of the first zero element of `str`, i.e. the
+// implicit (null-terminated) string length.
+//   - imm8 bit 0 set (16-bit elements): elements equal to 0 become all-ones,
+//     vshrn_n_u16(.., 4) narrows each to one byte, and the trailing-zero
+//     count of the resulting 64-bit value divided by 8 is the element index.
+//   - imm8 bit 0 clear (8-bit elements): the same narrowing leaves four bits
+//     per element, so the trailing-zero count is divided by 4.
+// When no element is zero _sse2neon_ctzll returns 64, giving 8 or 16
+// respectively.
+#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
+    do {                                                                 \
+        if (imm8 & 0x01) {                                               \
+            uint16x8_t equal_mask_##str =                                \
+                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
+            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
+            uint64_t matches_##str =                                     \
+                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
+            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
+        } else {                                                         \
+            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
+                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
+            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
+            uint64_t matches_##str =                                     \
+                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
+            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
+        }                                                                \
+    } while (0)
+
+// Implicit-length counterpart of SSE2NEON_CMPESTRX_LEN_PAIR: declares the
+// ints la and lb in the enclosing scope and sets them to the implicit lengths
+// of a and b via SSE2NEON_CMPISTRX_LENGTH. Uses the variable `imm8` of the
+// enclosing scope rather than a macro parameter.
+#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
+    int la, lb;                                  \
+    do {                                         \
+        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
+        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
+    } while (0)
+
+// Implicit-length string compare of a and b under the control byte imm8.
+// Returns 1 only when the resulting mask is empty and b has no null element
+// (its implicit length reaches the element count); otherwise 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
+FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return (r2 == 0) && (lb >= bound);
+}
+
+// Implicit-length string compare of a and b under the control byte imm8.
+// Returns 1 when the resulting mask has any bit set and 0 when it is empty.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
+FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return r2 ? 1 : 0;
+}
+
+// Implicit-length string compare of a and b under the control byte imm8,
+// returning an index derived from the resulting mask: the element count when
+// the mask is empty, otherwise the position of its most significant set bit
+// (imm8 bit 6 set) or least significant set bit.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
+FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    if (r2 == 0)
+        return bound;
+    if (imm8 & 0x40)
+        return 31 - _sse2neon_clz(r2);
+    return _sse2neon_ctz(r2);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and store the generated mask in dst.
+// SSE2NEON_COMP_AGG leaves the aggregated result in r2 (and the element count
+// in bound); SSE2NEON_CMPSTR_GENERATE_MASK turns r2 into the returned vector
+// and contains the function's return statement.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
+FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
+}
+
+// Implicit-length string compare of a and b under the control byte imm8.
+// Returns bit 0 of the resulting mask.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
+FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    const int lowest_bit = r2 & 1;
+    return lowest_bit;
+}
+
+// Reports whether a contains a null element: returns 1 when the implicit
+// length of a is smaller than the element count selected by imm8 (8 or 16),
+// 0 otherwise. b is not inspected.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
+FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
+{
+    int len_a;
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    SSE2NEON_CMPISTRX_LENGTH(a, len_a, imm8);
+    return len_a < bound;
+}
+
+// Reports whether b contains a null element: returns 1 when the implicit
+// length of b is smaller than the element count selected by imm8 (8 or 16),
+// 0 otherwise. a is not inspected.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
+FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
+{
+    int len_b;
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    SSE2NEON_CMPISTRX_LENGTH(b, len_b, imm8);
+    return len_b < bound;
+}
+
 // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
 // in b for greater than.
 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
@@ -8320,6 +9342,8 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
     __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32ch(crc, v);
 #else
     crc = _mm_crc32_u8(crc, v & 0xff);
     crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
@@ -8336,6 +9360,8 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
     __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32cw(crc, v);
 #else
     crc = _mm_crc32_u16(crc, v & 0xffff);
     crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
@@ -8353,8 +9379,8 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
                          : [c] "+r"(crc)
                          : [v] "r"(v));
 #else
-    crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
-    crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
 #endif
     return crc;
 }
@@ -8368,6 +9394,8 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
     __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32cb(crc, v);
 #else
     crc ^= v;
     for (int bit = 0; bit < 8; bit++) {
@@ -8384,7 +9412,7 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
 
 #if !defined(__ARM_FEATURE_CRYPTO)
 /* clang-format off */
-#define SSE2NEON_AES_DATA(w)                                           \
+#define SSE2NEON_AES_SBOX(w)                                           \
     {                                                                  \
         w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
         w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
@@ -8424,53 +9452,115 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
         w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
         w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
     }
+#define SSE2NEON_AES_RSBOX(w)                                          \
+    {                                                                  \
+        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
+        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
+        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
+        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
+        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
+        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
+        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
+        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
+        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
+        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
+        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
+        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
+        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
+        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
+        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
+        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
+        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
+        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
+        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
+        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
+        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
+        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
+        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
+        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
+        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
+        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
+        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
+        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
+        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
+        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
+        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
+        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
+        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
+        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
+        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
+        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
+        w(0x55), w(0x21), w(0x0c), w(0x7d)                             \
+    }
 /* clang-format on */
 
 /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
 #define SSE2NEON_AES_H0(x) (x)
-static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
+static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
+static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 #undef SSE2NEON_AES_H0
 
+/* xtime: multiply by 2 in GF(2^8) (unmasked); MULTIPLY: x times a 5-bit y */
+#if !defined(__aarch64__)
+#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
+#define SSE2NEON_MULTIPLY(x, y)                                  \
+    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
+     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
+     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
+     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
+#endif
+
 // In the absence of crypto extensions, implement aesenc using regular neon
 // intrinsics instead. See:
 // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
 // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
 // for more information Reproduced with permission of the author.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
 {
 #if defined(__aarch64__)
-    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
-                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
-                                         0xc, 0x1, 0x6, 0xb};
-    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
 
     uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
 
-    // shift rows
+    /* shift rows */
     w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
 
-    // sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
-
-    // mix columns
-    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
+    /* sub bytes */
+    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
+    // look up each of the table. After each lookup, we load the next table
+    // which locates at the next 64-bytes. In the meantime, the index in the
+    // table would be smaller than it was, so the index parameters of
+    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    /* mix columns */
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
     w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
     w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
 
-    //  add round key
+    /* add round key */
     return vreinterpretq_m128i_u8(w) ^ RoundKey;
 
-#else /* ARMv7-A NEON implementation */
-#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
-    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
-     (b0))
+#else /* ARMv7-A implementation for a table-based AES */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
+    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
+     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
+// multiplying 'x' by 2 in GF(2^8)
 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+// multiplying 'x' by 3 in GF(2^8)
 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
 #define SSE2NEON_AES_U0(p) \
     SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
@@ -8480,11 +9570,14 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
     SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
 #define SSE2NEON_AES_U3(p) \
     SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+
+    // this generates a table containing every possible permutation of
+    // shift_rows() and sub_bytes() with mix_columns().
     static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
-        SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
-        SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
-        SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
-        SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
     };
 #undef SSE2NEON_AES_B2W
 #undef SSE2NEON_AES_F2
@@ -8494,11 +9587,15 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
 #undef SSE2NEON_AES_U2
 #undef SSE2NEON_AES_U3
 
-    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
-    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
-    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
-    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
+    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
+    uint32_t x1 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
+    uint32_t x2 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
+    uint32_t x3 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]
 
+    // finish the modulo addition step in mix_columns()
     __m128i out = _mm_set_epi32(
         (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
          aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
@@ -8513,34 +9610,210 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
 #endif
 }
 
+// Perform one round of an AES decryption flow on the data (state) in 'a'
+// using the round key in 'RoundKey', and return the result.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t inv_shift_rows[] = {
+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // inverse shift rows (byte permutation through the table above)
+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+
+    // inverse sub bytes: the 256-byte table is looked up as four 64-byte parts
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
+
+    // inverse mix columns
+    // multiplying 'v' by 4 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
+    v ^= w;
+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
+
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
+                                 0x1b);  // multiplying 'v' by 2 in GF(2^8)
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    // add round key
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+    /* FIXME: scalar fallback, not yet optimized for NEON */
+    uint8_t i, e, f, g, h, v[4][4];
+    uint8_t *_a = (uint8_t *) &a;
+    for (i = 0; i < 16; ++i) {  // inverse shift rows + inverse sub bytes
+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
+    }
+
+    // inverse mix columns, one 4-byte row of v[][] per iteration
+    for (i = 0; i < 4; ++i) {
+        e = v[i][0];
+        f = v[i][1];
+        g = v[i][2];
+        h = v[i][3];
+
+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+#endif
+}
+
 // Perform the last round of an AES encryption flow on data (state) in a using
 // the round key in RoundKey, and store the result in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
 {
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // shift rows (byte permutation through the table above)
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    // sub bytes: the 256-byte S-box is looked up as four 64-byte parts
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    // add round key (this last round has no mix-columns step)
+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
+
+#else /* ARMv7-A implementation */
+    uint8_t v[16] = {  // S-box of a's bytes, read in the shift-rows lane order
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
+    };
+
+    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;  // add round key
+#endif
+}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t inv_shift_rows[] = {
+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // inverse shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+
+    // inverse sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
+
+    // add round key
+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
     /* FIXME: optimized for NEON */
-    uint8_t v[4][4] = {
-        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
-        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
-        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
-        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
-         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
+    uint8_t v[4][4];
+    uint8_t *_a = (uint8_t *) &a;
+    for (int i = 0; i < 16; ++i) {
+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+#endif
+}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+#if defined(__aarch64__)
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
     };
-    for (int i = 0; i < 16; i++)
-        vreinterpretq_nth_u8_m128i(a, i) =
-            v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
-    return a;
+    uint8x16_t v = vreinterpretq_u8_m128i(a);
+    uint8x16_t w;
+
+    // multiplying 'v' by 4 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
+    v ^= w;
+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
+
+    // multiplying 'v' by 2 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+    return vreinterpretq_m128i_u8(w);
+
+#else /* ARMv7-A NEON implementation */
+    uint8_t i, e, f, g, h, v[4][4];
+    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
+    for (i = 0; i < 4; ++i) {
+        e = v[i][0];
+        f = v[i][1];
+        g = v[i][2];
+        h = v[i][3];
+
+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
+#endif
 }
 
 // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
@@ -8548,19 +9821,43 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
 // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
 // for details.
 //
-// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
 {
-    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
-    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
+#if defined(__aarch64__)
+    uint8x16_t _a = vreinterpretq_u8_m128i(a);
+    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
+
+    uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0};
+    uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32);
+    uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask),
+                             vreinterpretq_u32_u8(v));
+    uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24));
+    uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon));
+
+    return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x));
+
+#else /* ARMv7-A NEON implementation */
+    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
+    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
     for (int i = 0; i < 4; ++i) {
-        ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
-        ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
+        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
+        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
     }
     return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                          ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+#endif
 }
-#undef SSE2NEON_AES_DATA
+#undef SSE2NEON_AES_SBOX
+#undef SSE2NEON_AES_RSBOX
+
+#if !defined(__aarch64__) /* same guard the two helpers are defined under */
+#undef SSE2NEON_XT
+#undef SSE2NEON_MULTIPLY
+#endif
 
 #else /* __ARM_FEATURE_CRYPTO */
 // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
@@ -8576,7 +9873,19 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
         vreinterpretq_u8_m128i(b));
 }
 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+// Perform one AES decryption round on the state in 'a': AESD with an all-zero
+// key, then AESIMC (InvMixColumns), then XOR with RoundKey; returns the result.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
+{
+    return vreinterpretq_m128i_u8(veorq_u8(
+        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+        vreinterpretq_u8_m128i(RoundKey)));
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
 {
     return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
@@ -8584,6 +9893,27 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
                          RoundKey);
 }
 
+// Perform the last AES decryption round on the state in 'a': AESD with an
+// all-zero key, then XOR with RoundKey (both as uint8x16_t); returns the result.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
+{
+    return vreinterpretq_m128i_u8(
+        veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)),
+                 vreinterpretq_u8_m128i(RoundKey)));
+}
+
+// Perform the InvMixColumns transformation on 'a' (via AESIMC) and return it.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
 {
     // AESE does ShiftRows and SubBytes on A
@@ -8605,7 +9935,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
 
 // Perform a carry-less multiplication of two 64-bit integers, selected from a
 // and b according to imm8, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
 {
     uint64x2_t a = vreinterpretq_u64_m128i(_a);
@@ -8640,9 +9970,9 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
     } r;
 
 #if defined(__aarch64__)
-    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
 #else
-    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
 
     return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
@@ -8650,7 +9980,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
 
 // Count the number of bits set to 1 in unsigned 32-bit integer a, and
 // return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
 FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 {
 #if defined(__aarch64__)
@@ -8677,7 +10007,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
 
 // Count the number of bits set to 1 in unsigned 64-bit integer a, and
 // return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 {
 #if defined(__aarch64__)
@@ -8717,17 +10047,55 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
     } r;
 
 #if defined(__aarch64__)
-    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
 #else
-    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
 
     r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
 
 #if defined(__aarch64__)
-    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
 #else
-    asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Return a 64-bit time stamp: CNTVCT_EL0 on AArch64; on ARMv7, PMCCNTR << 6
+// when user-mode PMU access is on, otherwise gettimeofday() in microseconds.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
+FORCE_INLINE uint64_t _rdtsc(void)
+{
+#if defined(__aarch64__)
+    uint64_t val;
+
+    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
+     * system counter is at least 56 bits wide; from Armv8.6, the counter
+     * must be 64 bits wide.  So the system counter may be narrower than 64
+     * bits; that case is indicated by the flag 'cap_user_time_short'
+     * being true.
+     */
+    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
+
+    return val;
+#else
+    uint32_t pmccntr, pmuseren, pmcntenset;
+    // Read the user mode Performance Monitoring Unit (PMU)
+    // User Enable Register (PMUSERENR) access permissions.
+    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+    if (pmuseren & 1) {  // bit 0 (EN): user mode may access the PMU registers
+        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+        if (pmcntenset & 0x80000000UL) {  // bit 31: is the cycle counter on?
+            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+            // Assumes the counter ticks once every 64 cycles, hence the << 6
+            return (uint64_t) (pmccntr) << 6;
+        }
+    }
+
+    // Fall back to a syscall as we can't enable PMUSERENR in user mode.
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;  // microseconds
+#endif
+}
 
@@ -8740,4 +10108,4 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 #pragma GCC pop_options
 #endif
 
-#endif
-\ No newline at end of file
+#endif
diff --git a/thirdparty/embree/common/simd/avx.h b/thirdparty/embree/common/simd/avx.h
index d3100306ee..7c63749f56 100644
--- a/thirdparty/embree/common/simd/avx.h
+++ b/thirdparty/embree/common/simd/avx.h
@@ -31,4 +31,3 @@
 #if defined(__AVX512F__)
 #include "avx512.h"
 #endif
-
diff --git a/thirdparty/embree/common/simd/simd.h b/thirdparty/embree/common/simd/simd.h
index 34e37b08b1..e777d2df01 100644
--- a/thirdparty/embree/common/simd/simd.h
+++ b/thirdparty/embree/common/simd/simd.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "../math/math.h"
+#include "../math/emath.h"
 
 /* include SSE wrapper classes */
 #if defined(__SSE__) || defined(__ARM_NEON)
diff --git a/thirdparty/embree/common/simd/varying.h b/thirdparty/embree/common/simd/varying.h
index 9b98d326be..20fd3cac48 100644
--- a/thirdparty/embree/common/simd/varying.h
+++ b/thirdparty/embree/common/simd/varying.h
@@ -15,7 +15,7 @@ namespace embree
     __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; }
     __forceinline       float& operator [](size_t index)       { assert(index < N); return f[index]; }
   };
-
+  
   template<int N>
   struct vdouble_impl
   {
@@ -31,7 +31,7 @@ namespace embree
     __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; }
     __forceinline       int& operator [](size_t index)       { assert(index < N); return i[index]; }
   };
-
+  
   template<int N>
   struct vuint_impl
   {
diff --git a/thirdparty/embree/common/simd/vboolf4_sse2.h b/thirdparty/embree/common/simd/vboolf4_sse2.h
index 9e0fdf5c6f..e96525c9a7 100644
--- a/thirdparty/embree/common/simd/vboolf4_sse2.h
+++ b/thirdparty/embree/common/simd/vboolf4_sse2.h
@@ -119,7 +119,7 @@ namespace embree
 #if defined(__aarch64__)
   template<int i0, int i1, int i2, int i3>
   __forceinline vboolf4 shuffle(const vboolf4& v) {
-    return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3)));
+    return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32((int32x4_t)v.v), _MN_SHUFFLE(i0, i1, i2, i3)));
   }
 
   template<int i0, int i1, int i2, int i3>
diff --git a/thirdparty/embree/common/simd/vfloat16_avx512.h b/thirdparty/embree/common/simd/vfloat16_avx512.h
index 75c471cc0c..b6160a438c 100644
--- a/thirdparty/embree/common/simd/vfloat16_avx512.h
+++ b/thirdparty/embree/common/simd/vfloat16_avx512.h
@@ -316,6 +316,17 @@ namespace embree
     return madd(t,b-a,a);
   }
 
+  __forceinline bool isvalid (const vfloat16& v) { // range check against +/-FLT_LARGE, bounds exclusive
+    return all((v > vfloat16(-FLT_LARGE)) & (v < vfloat16(+FLT_LARGE))); // presumably all() needs every lane to pass - confirm
+  }
+
+  __forceinline void xchg(vboolf16 m, vfloat16& a, vfloat16& b) // masked exchange of a and b
+  {
+    vfloat16 c = a; // keep the old 'a'; the next line overwrites it
+    a = select(m,b,a); // presumably takes b's lanes where m is set - confirm select() argument order
+    b = select(m,c,b); // same mask, using the saved copy of the old 'a'
+  }
+
   ////////////////////////////////////////////////////////////////////////////////
   /// Rounding Functions
   ////////////////////////////////////////////////////////////////////////////////
diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h
index 6d7e11fe72..fccf11fe0c 100644
--- a/thirdparty/embree/common/simd/vfloat4_sse2.h
+++ b/thirdparty/embree/common/simd/vfloat4_sse2.h
@@ -32,6 +32,8 @@ namespace embree
     
     __forceinline vfloat() {}
     __forceinline vfloat(const vfloat4& other) { v = other.v; }
+    //__forceinline vfloat(const vfloat4& other) = default;
+    
     __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; }
 
     __forceinline vfloat(__m128 a) : v(a) {}
diff --git a/thirdparty/embree/common/simd/vint4_sse2.h b/thirdparty/embree/common/simd/vint4_sse2.h
index eea03a771e..e9e4a5a2c2 100644
--- a/thirdparty/embree/common/simd/vint4_sse2.h
+++ b/thirdparty/embree/common/simd/vint4_sse2.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "../math/math.h"
+#include "../math/emath.h"
 
 #define vboolf vboolf_impl
 #define vboold vboold_impl
diff --git a/thirdparty/embree/common/simd/vuint4_sse2.h b/thirdparty/embree/common/simd/vuint4_sse2.h
index f7817da6be..c2e86c6633 100644
--- a/thirdparty/embree/common/simd/vuint4_sse2.h
+++ b/thirdparty/embree/common/simd/vuint4_sse2.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "../math/math.h"
+#include "../math/emath.h"
 
 #define vboolf vboolf_impl
 #define vboold vboold_impl
diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp
index abdd269069..71616a3982 100644
--- a/thirdparty/embree/common/sys/alloc.cpp
+++ b/thirdparty/embree/common/sys/alloc.cpp
@@ -12,33 +12,177 @@
   
 namespace embree
 {
-  void* alignedMalloc(size_t size, size_t align) 
+  size_t total_allocations = 0;
+
+#if defined(EMBREE_SYCL_SUPPORT)
+  
+  __thread sycl::context* tls_context_tutorial = nullptr;
+  __thread sycl::device* tls_device_tutorial = nullptr;
+  
+  __thread sycl::context* tls_context_embree = nullptr;
+  __thread sycl::device* tls_device_embree = nullptr;
+  
+  void enableUSMAllocEmbree(sycl::context* context, sycl::device* device)
+  {
+    // -- GODOT start --
+    // if (tls_context_embree != nullptr) throw std::runtime_error("USM allocation already enabled");
+    // if (tls_device_embree != nullptr) throw std::runtime_error("USM allocation already enabled");
+    if (tls_context_embree != nullptr) {
+      abort();
+    }
+    if (tls_device_embree != nullptr) {
+      abort();
+    }
+    // -- GODOT end --
+    tls_context_embree = context;
+    tls_device_embree = device;
+  }
+
+  void disableUSMAllocEmbree()
+  {
+    // -- GODOT start --
+    // if (tls_context_embree  == nullptr) throw std::runtime_error("USM allocation not enabled");
+    // if (tls_device_embree  == nullptr) throw std::runtime_error("USM allocation not enabled");
+    if (tls_context_embree  == nullptr) {
+      abort();
+    }
+    if (tls_device_embree  == nullptr) {
+      abort();
+    }
+    // -- GODOT end --
+    tls_context_embree = nullptr;
+    tls_device_embree = nullptr;
+  }
+
+  void enableUSMAllocTutorial(sycl::context* context, sycl::device* device)
+  {
+    //if (tls_context_tutorial != nullptr) throw std::runtime_error("USM allocation already enabled");
+    //if (tls_device_tutorial != nullptr) throw std::runtime_error("USM allocation already enabled");
+    tls_context_tutorial = context;
+    tls_device_tutorial = device;
+  }
+
+  void disableUSMAllocTutorial()
+  {
+    // -- GODOT start --
+    // if (tls_context_tutorial  == nullptr) throw std::runtime_error("USM allocation not enabled");
+    // if (tls_device_tutorial  == nullptr) throw std::runtime_error("USM allocation not enabled");
+    if (tls_context_tutorial  == nullptr) {
+      abort();
+    }
+    if (tls_device_tutorial  == nullptr) {
+      abort();
+    }
+    // -- GODOT end --
+    
+    tls_context_tutorial = nullptr;
+    tls_device_tutorial = nullptr;
+  }
+
+#endif
+  
+  void* alignedMalloc(size_t size, size_t align)
   {
     if (size == 0)
       return nullptr;
-    
+
     assert((align & (align-1)) == 0);
     void* ptr = _mm_malloc(size,align);
-
-    if (size != 0 && ptr == nullptr)
-      // -- GODOT start --
-      // throw std::bad_alloc();
+    // -- GODOT start --
+    // if (size != 0 && ptr == nullptr)
+    //   throw std::bad_alloc();
+    if (size != 0 && ptr == nullptr) {
       abort();
-      // -- GODOT end --
-    
+    }
+    // -- GODOT end --
     return ptr;
   }
-  
+
   void alignedFree(void* ptr)
   {
     if (ptr)
       _mm_free(ptr);
   }
 
+#if defined(EMBREE_SYCL_SUPPORT)
+  
+  void* alignedSYCLMalloc(sycl::context* context, sycl::device* device, size_t size, size_t align, EmbreeUSMMode mode)
+  {
+    assert(context);
+    assert(device);
+    
+    if (size == 0)
+      return nullptr;
+
+    assert((align & (align-1)) == 0);
+    total_allocations++;    
+
+    void* ptr = nullptr;
+    if (mode == EMBREE_USM_SHARED_DEVICE_READ_ONLY)
+      ptr = sycl::aligned_alloc_shared(align,size,*device,*context,sycl::ext::oneapi::property::usm::device_read_only());
+    else
+      ptr = sycl::aligned_alloc_shared(align,size,*device,*context);
+      
+    // -- GODOT start --
+    // if (size != 0 && ptr == nullptr)
+    //   throw std::bad_alloc();
+    if (size != 0 && ptr == nullptr) {
+      abort();
+    }
+    // -- GODOT end --
+
+    return ptr;
+  }
+  
+  static MutexSys g_alloc_mutex;
+  
+  void* alignedSYCLMalloc(size_t size, size_t align, EmbreeUSMMode mode) // allocates through whichever thread-local context/device pair is currently set
+  {
+    if (tls_context_tutorial) return alignedSYCLMalloc(tls_context_tutorial, tls_device_tutorial, size, align, mode); // the tutorial context is checked first
+    if (tls_context_embree  ) return alignedSYCLMalloc(tls_context_embree,   tls_device_embree,   size, align, mode);
+    return nullptr; // neither thread-local context is set: nothing is allocated
+  }
+
+  void alignedSYCLFree(sycl::context* context, void* ptr)
+  {
+    assert(context);
+    if (ptr) {
+      sycl::free(ptr,*context);
+    }
+  }
+
+  void alignedSYCLFree(void* ptr) // frees ptr using the thread-local context, checking the tutorial context first
+  {
+    if (tls_context_tutorial) return alignedSYCLFree(tls_context_tutorial, ptr);
+    if (tls_context_embree  ) return alignedSYCLFree(tls_context_embree, ptr);
+  } // NOTE: when neither thread-local context is set, ptr is not freed
+
+#endif
+
+  void* alignedUSMMalloc(size_t size, size_t align, EmbreeUSMMode mode)
+  {
+#if defined(EMBREE_SYCL_SUPPORT)
+    if (tls_context_embree || tls_context_tutorial)
+      return alignedSYCLMalloc(size,align,mode);
+    else
+#endif
+      return alignedMalloc(size,align);
+  }
+
+  void alignedUSMFree(void* ptr)
+  {
+#if defined(EMBREE_SYCL_SUPPORT)
+    if (tls_context_embree || tls_context_tutorial)
+      return alignedSYCLFree(ptr);
+    else
+#endif
+      return alignedFree(ptr);
+  }
+
   static bool huge_pages_enabled = false;
   static MutexSys os_init_mutex;
 
-  __forceinline bool isHugePageCandidate(const size_t bytes) 
+  __forceinline bool isHugePageCandidate(const size_t bytes)
   {
     if (!huge_pages_enabled)
       return false;
@@ -133,7 +277,9 @@ namespace embree
     char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
     // -- GODOT start --
     // if (ptr == nullptr) throw std::bad_alloc();
-    if (ptr == nullptr) abort();
+    if (ptr == nullptr) {
+      abort();
+    }
     // -- GODOT end --
     hugepages = false;
     return ptr;
@@ -150,11 +296,13 @@ namespace embree
     if (bytesNew >= bytesOld)
       return bytesOld;
 
-    if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
-      // -- GODOT start --
-      // throw std::bad_alloc();
+    // -- GODOT start --
+    // if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
+    //   throw std::bad_alloc();
+    if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) {
       abort();
-      // -- GODOT end --
+    }
+    // -- GODOT end --
 
     return bytesNew;
   }
@@ -164,11 +312,13 @@ namespace embree
     if (bytes == 0) 
       return;
 
-    if (!VirtualFree(ptr,0,MEM_RELEASE))
-      // -- GODOT start --
-      // throw std::bad_alloc();
+    // -- GODOT start --
+    // if (!VirtualFree(ptr,0,MEM_RELEASE))
+    //   throw std::bad_alloc();
+    if (!VirtualFree(ptr,0,MEM_RELEASE)) {
       abort();
-      // -- GODOT end --
+    }
+    // -- GODOT end --
   }
 
   void os_advise(void *ptr, size_t bytes)
@@ -274,7 +424,9 @@ namespace embree
     void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
     // -- GODOT start --
     // if (ptr == MAP_FAILED) throw std::bad_alloc();
-    if (ptr == MAP_FAILED) abort();
+    if (ptr == MAP_FAILED) {
+      abort();
+    }
     // -- GODOT end --
     hugepages = false;
 
@@ -291,11 +443,13 @@ namespace embree
     if (bytesNew >= bytesOld)
       return bytesOld;
 
-    if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
-      // -- GODOT start --
-      // throw std::bad_alloc();
+    // -- GODOT start --
+    // if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
+    //   throw std::bad_alloc();
+    if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) {
       abort();
-      // -- GODOT end --
+    }
+    // -- GODOT end --
 
     return bytesNew;
   }
@@ -308,11 +462,13 @@ namespace embree
     /* for hugepages we need to also align the size */
     const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
     bytes = (bytes+pageSize-1) & ~(pageSize-1);
-    if (munmap(ptr,bytes) == -1)
-      // -- GODOT start --
-      // throw std::bad_alloc();
+    // -- GODOT start --
+    // if (munmap(ptr,bytes) == -1)
+    //   throw std::bad_alloc();
+    if (munmap(ptr,bytes) == -1) {
       abort();
-      // -- GODOT end --
+    }
+    // -- GODOT end --
   }
 
   /* hint for transparent huge pages (THP) */
diff --git a/thirdparty/embree/common/sys/alloc.h b/thirdparty/embree/common/sys/alloc.h
index 4fa474ec1d..28b17f988d 100644
--- a/thirdparty/embree/common/sys/alloc.h
+++ b/thirdparty/embree/common/sys/alloc.h
@@ -9,20 +9,72 @@
 
 namespace embree
 {
-#define ALIGNED_STRUCT_(align)                                           \
-  void* operator new(size_t size) { return alignedMalloc(size,align); } \
-  void operator delete(void* ptr) { alignedFree(ptr); }                 \
+#if defined(EMBREE_SYCL_SUPPORT)
+
+  /* enables SYCL USM allocation */
+  void enableUSMAllocEmbree(sycl::context* context, sycl::device* device);
+  void enableUSMAllocTutorial(sycl::context* context, sycl::device* device);
+
+  /* disables SYCL USM allocation */
+  void disableUSMAllocEmbree();
+  void disableUSMAllocTutorial();
+
+#endif
+  
+#define ALIGNED_STRUCT_(align)                                            \
+  void* operator new(size_t size) { return alignedMalloc(size,align); }   \
+  void operator delete(void* ptr) { alignedFree(ptr); }                   \
   void* operator new[](size_t size) { return alignedMalloc(size,align); } \
   void operator delete[](void* ptr) { alignedFree(ptr); }
+  
+#define ALIGNED_STRUCT_USM_(align)                                          \
+  void* operator new(size_t size) { return alignedUSMMalloc(size,align); }   \
+  void operator delete(void* ptr) { alignedUSMFree(ptr); }                   \
+  void* operator new[](size_t size) { return alignedUSMMalloc(size,align); } \
+  void operator delete[](void* ptr) { alignedUSMFree(ptr); }
+  
+#define ALIGNED_CLASS_(align)                                          \
+ public:                                                               \
+    ALIGNED_STRUCT_(align)                                             \
+ private:
 
-#define ALIGNED_CLASS_(align)                                           \
+#define ALIGNED_CLASS_USM_(align)                                          \
  public:                                                               \
-    ALIGNED_STRUCT_(align)                                              \
+    ALIGNED_STRUCT_USM_(align)                                             \
  private:
+
+  enum EmbreeUSMMode {
+    EMBREE_USM_SHARED = 0,
+    EMBREE_USM_SHARED_DEVICE_READ_WRITE = 0,
+    EMBREE_USM_SHARED_DEVICE_READ_ONLY = 1
+  };
   
   /*! aligned allocation */
   void* alignedMalloc(size_t size, size_t align);
   void alignedFree(void* ptr);
+
+  /*! aligned allocation using SYCL USM */
+  void* alignedUSMMalloc(size_t size, size_t align = 16, EmbreeUSMMode mode = EMBREE_USM_SHARED_DEVICE_READ_ONLY);
+  void alignedUSMFree(void* ptr);
+
+#if defined(EMBREE_SYCL_SUPPORT)
+  
+  /*! aligned allocation using SYCL USM */
+  void* alignedSYCLMalloc(sycl::context* context, sycl::device* device, size_t size, size_t align, EmbreeUSMMode mode);
+  void alignedSYCLFree(sycl::context* context, void* ptr);
+
+  // deleter functor to use as deleter in std unique or shared pointers that
+  // capture raw pointers created by sycl::malloc and its variants
+  template<typename T>
+  struct sycl_deleter
+  {
+    void operator()(T const* ptr)
+    {
+      alignedUSMFree((void*)ptr);
+    }
+  };
+
+#endif
   
   /*! allocator that performs aligned allocations */
   template<typename T, size_t alignment>
@@ -95,6 +147,37 @@ namespace embree
       bool hugepages;
     };
 
+  /*! allocator that never performs allocations */
+  template<typename T>
+    struct no_allocator
+    {
+      typedef T value_type;
+      typedef T* pointer;
+      typedef const T* const_pointer;
+      typedef T& reference;
+      typedef const T& const_reference;
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+
+      __forceinline pointer allocate( size_type n ) {
+        // -- GODOT start --
+        // throw std::runtime_error("no allocation supported");
+        abort();
+        // -- GODOT end --
+      }
+
+      __forceinline void deallocate( pointer p, size_type n ) {
+      }
+
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+    };
+
   /*! allocator for IDs */
   template<typename T, size_t max_id>
     struct IDPool
diff --git a/thirdparty/embree/common/sys/atomic.h b/thirdparty/embree/common/sys/atomic.h
index 67af254f36..cf9909aad9 100644
--- a/thirdparty/embree/common/sys/atomic.h
+++ b/thirdparty/embree/common/sys/atomic.h
@@ -36,7 +36,7 @@ namespace embree
   };
 
   template<typename T>
-    __forceinline void atomic_min(std::atomic<T>& aref, const T& bref)
+    __forceinline void _atomic_min(std::atomic<T>& aref, const T& bref)
   {
     const T b = bref.load();
     while (true) {
@@ -47,7 +47,7 @@ namespace embree
   }
 
   template<typename T>
-    __forceinline void atomic_max(std::atomic<T>& aref, const T& bref)
+    __forceinline void _atomic_max(std::atomic<T>& aref, const T& bref)
   {
     const T b = bref.load();
     while (true) {
diff --git a/thirdparty/embree/common/sys/barrier.h b/thirdparty/embree/common/sys/barrier.h
index c56513a2ed..e1580f41a9 100644
--- a/thirdparty/embree/common/sys/barrier.h
+++ b/thirdparty/embree/common/sys/barrier.h
@@ -34,7 +34,7 @@ namespace embree
     void* opaque;
   };
 
-  /*! fast active barrier using atomitc counter */
+  /*! fast active barrier using atomic counter */
   struct BarrierActive 
   {
   public:
diff --git a/thirdparty/embree/common/sys/string.cpp b/thirdparty/embree/common/sys/estring.cpp
index f42fdc8536..c66c5c5b84 100644
--- a/thirdparty/embree/common/sys/string.cpp
+++ b/thirdparty/embree/common/sys/estring.cpp
@@ -1,7 +1,7 @@
 // Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#include "string.h"
+#include "estring.h"
 
 #include <algorithm>
 #include <ctype.h>
diff --git a/thirdparty/embree/common/sys/string.h b/thirdparty/embree/common/sys/estring.h
index 820076b21c..63051ad3c3 100644
--- a/thirdparty/embree/common/sys/string.h
+++ b/thirdparty/embree/common/sys/estring.h
@@ -28,6 +28,42 @@ namespace embree
     std::streamsize precision;
   };
 
+  struct IndentOStream : public std::streambuf
+  {
+    explicit IndentOStream(std::ostream &ostream, int indent = 2)
+      : streambuf(ostream.rdbuf())
+      , start_of_line(true)
+      , ident_str(indent, ' ')
+      , stream(&ostream)
+    {
+      // set streambuf of ostream to this and save original streambuf
+      stream->rdbuf(this);
+    }
+
+    virtual ~IndentOStream()
+    {
+      if (stream != NULL) {
+        // restore old streambuf
+        stream->rdbuf(streambuf);
+      }
+    }
+
+  protected:
+    virtual int overflow(int ch) {
+      if (start_of_line && ch != '\n') {
+        streambuf->sputn(ident_str.data(), ident_str.size());
+      }
+      start_of_line = ch == '\n';
+      return streambuf->sputc(ch);
+    }
+
+  private:
+    std::streambuf *streambuf;
+    bool start_of_line;
+    std::string ident_str;
+    std::ostream *stream;
+  };
+
   std::string toLowerCase(const std::string& s);
   std::string toUpperCase(const std::string& s);
 
diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h
index 2c2f6eccda..f5074bb29d 100644
--- a/thirdparty/embree/common/sys/intrinsics.h
+++ b/thirdparty/embree/common/sys/intrinsics.h
@@ -64,7 +64,7 @@ namespace embree
 /// Windows Platform
 ////////////////////////////////////////////////////////////////////////////////
   
-#if defined(__WIN32__)
+#if defined(__WIN32__) && !defined(__INTEL_LLVM_COMPILER)
   
   __forceinline size_t read_tsc()  
   {
@@ -89,7 +89,7 @@ namespace embree
 #endif
   }
   
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined (__aarch64__)
   __forceinline size_t bsf(size_t v) {
 #if defined(__AVX2__) 
     return _tzcnt_u64(v);
@@ -113,7 +113,7 @@ namespace embree
     return i;
   }
   
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined (__aarch64__)
   __forceinline size_t bscf(size_t& v) 
   {
     size_t i = bsf(v);
@@ -138,7 +138,7 @@ namespace embree
 #endif
   }
   
-#if defined(__X86_64__)
+#if defined(__X86_64__) || defined (__aarch64__)
   __forceinline size_t bsr(size_t v) {
 #if defined(__AVX2__) 
     return 63 -_lzcnt_u64(v);
@@ -196,49 +196,6 @@ namespace embree
   
 #else
   
-#if defined(__i386__) && defined(__PIC__)
-  
-  __forceinline void __cpuid(int out[4], int op) 
-  {
-    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
-                  "cpuid\n\t"
-                  "xchg{l}\t{%%}ebx, %1\n\t"
-                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) 
-                  : "0"(op)); 
-  }
-  
-  __forceinline void __cpuid_count(int out[4], int op1, int op2) 
-  {
-    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
-                  "cpuid\n\t"
-                  "xchg{l}\t{%%}ebx, %1\n\t"
-                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
-                  : "0" (op1), "2" (op2)); 
-  }
-  
-#elif defined(__X86_ASM__)
-
-  __forceinline void __cpuid(int out[4], int op) {
-#if defined(__ARM_NEON)
-    if (op == 0) { // Get CPU name
-      out[0] = 0x41524d20;
-      out[1] = 0x41524d20;
-      out[2] = 0x41524d20;
-      out[3] = 0x41524d20;
-    }
-#else
-    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
-#endif
-  }
-
-#if !defined(__ARM_NEON)
-  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
-    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); 
-  }
-#endif
-
-#endif
-
   __forceinline uint64_t read_tsc()  {
 #if defined(__X86_ASM__)
     uint32_t high,low;
@@ -263,6 +220,13 @@ namespace embree
 #endif
 #endif
   }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+  __forceinline unsigned int   bsf(unsigned v) {
+    return sycl::ctz(v);
+  }
+
+#else
   
 #if defined(__64BIT__)
   __forceinline unsigned bsf(unsigned v) 
@@ -280,6 +244,13 @@ namespace embree
 #endif
   }
 #endif
+#endif
+  
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+  __forceinline size_t bsf(size_t v) {
+    return sycl::ctz(v);
+  }
+#else
   
   __forceinline size_t bsf(size_t v) {
 #if defined(__AVX2__) && !defined(__aarch64__)
@@ -294,6 +265,7 @@ namespace embree
     return __builtin_ctzl(v);
 #endif
   }
+#endif
 
   __forceinline int bscf(int& v) 
   {
@@ -434,6 +406,41 @@ namespace embree
   
 #endif
   
+#if !defined(__WIN32__)
+
+#if defined(__i386__) && defined(__PIC__)
+
+  __forceinline void __cpuid(int out[4], int op)
+  {
+    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
+                  "cpuid\n\t"
+                  "xchg{l}\t{%%}ebx, %1\n\t"
+                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+                  : "0"(op));
+  }
+
+  __forceinline void __cpuid_count(int out[4], int op1, int op2)
+  {
+    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
+                  "cpuid\n\t"
+                  "xchg{l}\t{%%}ebx, %1\n\t"
+                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
+                  : "0" (op1), "2" (op2));
+  }
+
+#elif defined(__X86_ASM__)
+
+  __forceinline void __cpuid(int out[4], int op) {
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
+  }
+
+  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
+  }
+
+#endif
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////
 /// All Platforms
 ////////////////////////////////////////////////////////////////////////////////
@@ -459,8 +466,16 @@ namespace embree
 #endif
 #endif
 
-#if defined(__SSE4_2__) || defined(__ARM_NEON)
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
 
+  __forceinline unsigned int popcnt(unsigned int in) {
+    return sycl::popcount(in);
+  }
+  
+#else
+  
+#if defined(__SSE4_2__) || defined(__ARM_NEON)
+  
   __forceinline int popcnt(int in) {
     return _mm_popcnt_u32(in);
   }
@@ -476,6 +491,8 @@ namespace embree
 #endif
   
 #endif
+  
+#endif
 
 #if defined(__X86_ASM__)
   __forceinline uint64_t rdtsc()
diff --git a/thirdparty/embree/common/sys/mutex.h b/thirdparty/embree/common/sys/mutex.h
index 26af6c582c..0f7345cf45 100644
--- a/thirdparty/embree/common/sys/mutex.h
+++ b/thirdparty/embree/common/sys/mutex.h
@@ -86,8 +86,8 @@ namespace embree
 
   class PaddedSpinLock : public SpinLock
   {
-    private:
-      char padding[CPU_CACHELINE_SIZE - sizeof(SpinLock)];
+  private:
+    MAYBE_UNUSED char padding[CPU_CACHELINE_SIZE - sizeof(SpinLock)];
   };
   /*! safe mutex lock and unlock helper */
   template<typename Mutex> class Lock {
diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
index 728bf6ed7d..d4a9b9e119 100644
--- a/thirdparty/embree/common/sys/platform.h
+++ b/thirdparty/embree/common/sys/platform.h
@@ -3,7 +3,9 @@
 
 #pragma once
 
+#if !defined(_CRT_SECURE_NO_WARNINGS)
 #define _CRT_SECURE_NO_WARNINGS
+#endif
 
 #include <cstddef>
 #include <cassert>
@@ -18,6 +20,30 @@
 #include <cstring>
 #include <stdint.h>
 #include <functional>
+#include <mutex>
+
+#if defined(EMBREE_SYCL_SUPPORT)
+
+#define __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#pragma clang diagnostic ignored "-W#pragma-messages"
+
+#include <sycl/sycl.hpp>
+
+#pragma clang diagnostic pop
+
+#include "sycl.h"
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+#define CONSTANT __attribute__((opencl_constant))
+#else
+#define CONSTANT
+#endif
+
+#endif
+
 
 ////////////////////////////////////////////////////////////////////////////////
 /// detect platform
@@ -115,7 +141,7 @@
 #else
 #define __restrict__           //__restrict // causes issues with MSVC
 #endif
-#if !defined(__thread)
+#if !defined(__thread) && !defined(__INTEL_LLVM_COMPILER)
 #define __thread               __declspec(thread)
 #endif
 #if !defined(__aligned)
@@ -148,6 +174,10 @@
   #define MAYBE_UNUSED
 #endif
 
+#if !defined(_unused)
+#define _unused(x) ((void)(x))
+#endif
+
 #if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly
   #define DELETED
 #else
@@ -155,7 +185,7 @@
 #endif
 
 #if !defined(likely)
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) || defined(__SYCL_DEVICE_ONLY__)
 #define   likely(expr) (expr)
 #define unlikely(expr) (expr)
 #else
@@ -171,22 +201,27 @@
 /* debug printing macros */
 #define STRING(x) #x
 #define TOSTRING(x) STRING(x)
-#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl
+#define PING embree_cout_uniform << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl
 #define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl
 #define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl
 #define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl
 #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
 
+#define UPRINT(x) embree_cout_uniform << STRING(x) << " = " << (x) << embree_endl
+#define UPRINT2(x,y) embree_cout_uniform << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl
+#define UPRINT3(x,y,z) embree_cout_uniform << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl
+#define UPRINT4(x,y,z,w) embree_cout_uniform << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+
 #if defined(DEBUG) // only report file and line in debug mode
   // -- GODOT start --
-  // #define THROW_RUNTIME_ERROR(str)
+  // #define THROW_RUNTIME_ERROR(str) \
   //   throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
   #define THROW_RUNTIME_ERROR(str) \
     printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort();
   // -- GODOT end --
 #else
   // -- GODOT start --
-  // #define THROW_RUNTIME_ERROR(str)
+  // #define THROW_RUNTIME_ERROR(str) \
   //   throw std::runtime_error(str);
   #define THROW_RUNTIME_ERROR(str) \
     abort();
@@ -323,13 +358,209 @@ __forceinline std::string toString(long long value) {
 #define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated
 #define ENABLE_DEPRECATED_WARNING  __pragma(warning (enable : 4996)) // warning: function was declared deprecated
 #endif
+ 
+////////////////////////////////////////////////////////////////////////////////
+/// SYCL specific
+////////////////////////////////////////////////////////////////////////////////
+
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+
+#define sycl_printf0(format, ...) {               \
+    static const CONSTANT char fmt[] = format;               \
+    if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))       \
+      sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ );  \
+  }
+
+#define sycl_printf0_(format) {               \
+    static const CONSTANT char fmt[] = format;               \
+    if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))       \
+      sycl::ext::oneapi::experimental::printf(fmt);                \
+  }
+
+#else
+
+#define sycl_printf0(format, ...) {                          \
+    static const CONSTANT char fmt[] = format;               \
+    sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ );    \
+  }
+
+#define sycl_printf0_(format) {                              \
+    static const CONSTANT char fmt[] = format;               \
+    sycl::ext::oneapi::experimental::printf(fmt);                  \
+  }
+
+#endif
+
+#define sycl_printf(format, ...) {               \
+    static const CONSTANT char fmt[] = format;               \
+    sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ );    \
+  }
+
+#define sycl_printf_(format) {               \
+    static const CONSTANT char fmt[] = format;               \
+    sycl::ext::oneapi::experimental::printf(fmt);                  \
+  }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+
+namespace embree
+{
+  struct sycl_ostream_ {
+    sycl_ostream_ (bool uniform) : uniform(uniform) {}
+    bool uniform = false;
+  };
+  struct sycl_endl_ {};
+
+#define embree_ostream embree::sycl_ostream_
+#define embree_cout embree::sycl_ostream_(false)
+#define embree_cout_uniform embree::sycl_ostream_(true)
+#define embree_endl embree::sycl_endl_()
+  
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, int   i)
+  {
+    if (cout.uniform) {
+      if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))
+        sycl_printf("%i",i);
+    }
+    else
+      sycl_printf("%i ",i);
+    
+    return cout;
+  }
+  
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, unsigned int i)
+  {
+    if (cout.uniform) {
+      if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))
+        sycl_printf("%u",i);
+    } else
+      sycl_printf("%u ",i);
+
+    return cout;
+  }
+
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, float f)
+  {
+    if (cout.uniform) {
+      if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))
+        sycl_printf("%f",f);
+    } else
+      sycl_printf("%f ",f);
+
+    return cout;
+  }
+
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, double d)
+  {
+    if (cout.uniform) {
+      if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))
+        sycl_printf("%f",d);
+    } else
+      sycl_printf("%f ",d);
+
+    return cout;
+  }
+
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, uint64_t l)
+  {
+    if (cout.uniform) {
+      if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))
+        sycl_printf("%lu",l);
+    } else
+      sycl_printf("%lu ",l);
+    
+    return cout;
+  }
+
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, long l) // prints a long through sycl_printf and returns the stream for chaining
+  { // uniform streams print from a single sub-group lane (the one matching ctz of the ballot); other streams print from every lane with a trailing space
+    if (cout.uniform) {
+      if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))
+        sycl_printf("%ld",l); // "%ld": the previous "%l" was a length modifier without a conversion specifier
+    } else
+      sycl_printf("%ld ",l);
+    
+    return cout;
+  }
+  
+    
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, void* p)
+  {
+    if (cout.uniform) {
+      if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))
+        sycl_printf("%p",p);
+    } else
+      sycl_printf("%p ",p);
+    
+    return cout;
+  }
+
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, const char* c)
+  {
+     if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))
+       sycl_printf("%s",c);
+     return cout;
+  }
+
+  inline sycl_ostream_ operator <<(sycl_ostream_ cout, sycl_endl_)
+  {
+    if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true)))
+      sycl_printf_("\n");
+    return cout;
+  }
+}
+
+#else
 
-/* embree output stream */
 #define embree_ostream std::ostream&
 #define embree_cout std::cout
 #define embree_cout_uniform std::cout
 #define embree_endl std::endl
-  
+
+#endif
+
+#if defined(EMBREE_SYCL_SUPPORT)
+
+  /* printing out SYCL vector types */
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::float4& v) {
+    return out << "(" << v.x() << "," << v.y() << "," << v.z() << "," << v.w() << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::float3& v) {
+    return out << "(" << v.x() << "," << v.y() << "," << v.z()  << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::float2& v) {
+    return out << "(" << v.x() << "," << v.y()  << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::int4& v) {
+    return out << "(" << v.x() << "," << v.y() << "," << v.z() << "," << v.w() << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::int3& v) {
+    return out << "(" << v.x() << "," << v.y() << "," << v.z()  << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::int2& v) {
+    return out << "(" << v.x() << "," << v.y()  << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::uint4& v) {
+    return out << "(" << v.x() << "," << v.y() << "," << v.z() << "," << v.w() << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::uint3& v) {
+    return out << "(" << v.x() << "," << v.y() << "," << v.z()  << ")";
+  }
+  __forceinline embree_ostream operator<<(embree_ostream out, const sycl::uint2& v) {
+    return out << "(" << v.x() << "," << v.y()  << ")";
+  }
+
+#endif
+
+inline void tab(std::ostream& cout, int n) {
+  for (int i=0; i<n; i++) cout << "  ";
+}
+
+inline std::string tab(int depth) {
+  return std::string(2*depth,' ');
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Some macros for static profiling
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/thirdparty/embree/common/sys/sycl.h b/thirdparty/embree/common/sys/sycl.h
new file mode 100644
index 0000000000..2558eb052f
--- /dev/null
+++ b/thirdparty/embree/common/sys/sycl.h
@@ -0,0 +1,307 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+using sycl::float16;
+using sycl::float8;
+using sycl::float4;
+using sycl::float3;
+using sycl::float2;
+using sycl::int16;
+using sycl::int8;
+using sycl::int4;
+using sycl::int3;
+using sycl::int2;
+using sycl::uint16;
+using sycl::uint8;
+using sycl::uint4;
+using sycl::uint3;
+using sycl::uint2;
+using sycl::uchar16;
+using sycl::uchar8;
+using sycl::uchar4;
+using sycl::uchar3;
+using sycl::uchar2;
+using sycl::ushort16;
+using sycl::ushort8;
+using sycl::ushort4;
+using sycl::ushort3;
+using sycl::ushort2;
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define GLOBAL __attribute__((opencl_global))
+#define LOCAL  __attribute__((opencl_local))
+
+SYCL_EXTERNAL extern int   work_group_reduce_add(int x);
+SYCL_EXTERNAL extern float work_group_reduce_min(float x);
+SYCL_EXTERNAL extern float work_group_reduce_max(float x);
+
+SYCL_EXTERNAL extern float atomic_min(volatile GLOBAL float *p, float val);
+SYCL_EXTERNAL extern float atomic_min(volatile LOCAL  float *p, float val);
+SYCL_EXTERNAL extern float atomic_max(volatile GLOBAL float *p, float val);
+SYCL_EXTERNAL extern float atomic_max(volatile LOCAL  float *p, float val);
+
+SYCL_EXTERNAL extern "C" unsigned int intel_sub_group_ballot(bool valid);
+
+SYCL_EXTERNAL extern "C" void __builtin_IB_assume_uniform(void *p);
+
+// Load message caching control
+
+  enum LSC_LDCC {
+    LSC_LDCC_DEFAULT,
+    LSC_LDCC_L1UC_L3UC,     // Override to L1 uncached and L3 uncached
+    LSC_LDCC_L1UC_L3C,      // Override to L1 uncached and L3 cached
+    LSC_LDCC_L1C_L3UC,      // Override to L1 cached and L3 uncached
+    LSC_LDCC_L1C_L3C,       // Override to L1 cached and L3 cached
+    LSC_LDCC_L1S_L3UC,      // Override to L1 streaming load and L3 uncached
+    LSC_LDCC_L1S_L3C,       // Override to L1 streaming load and L3 cached
+    LSC_LDCC_L1IAR_L3C,     // Override to L1 invalidate-after-read, and L3 cached
+  };
+
+ 
+
+// Store message caching control (also used for atomics)
+
+  enum LSC_STCC {
+    LSC_STCC_DEFAULT,
+    LSC_STCC_L1UC_L3UC,     // Override to L1 uncached and L3 uncached
+    LSC_STCC_L1UC_L3WB,     // Override to L1 uncached and L3 written back
+    LSC_STCC_L1WT_L3UC,     // Override to L1 written through and L3 uncached
+    LSC_STCC_L1WT_L3WB,     // Override to L1 written through and L3 written back
+    LSC_STCC_L1S_L3UC,      // Override to L1 streaming and L3 uncached
+    LSC_STCC_L1S_L3WB,      // Override to L1 streaming and L3 written back
+    LSC_STCC_L1WB_L3WB,     // Override to L1 written back and L3 written back
+  };
+
+ 
+
+///////////////////////////////////////////////////////////////////////
+
+// LSC Loads
+
+///////////////////////////////////////////////////////////////////////
+
+SYCL_EXTERNAL /* extern "C" */ uint32_t     __builtin_IB_lsc_load_global_uchar_to_uint (const GLOBAL uint8_t *base,      int elemOff, enum LSC_LDCC cacheOpt);  //D8U32
+SYCL_EXTERNAL /* extern "C" */ uint32_t     __builtin_IB_lsc_load_global_ushort_to_uint(const GLOBAL uint16_t *base,     int elemOff, enum LSC_LDCC cacheOpt);  //D16U32
+SYCL_EXTERNAL /* extern "C" */ uint32_t     __builtin_IB_lsc_load_global_uint          (const GLOBAL uint32_t *base,     int elemOff, enum LSC_LDCC cacheOpt);  //D32V1
+SYCL_EXTERNAL /* extern "C" */ sycl::uint2  __builtin_IB_lsc_load_global_uint2         (const GLOBAL sycl::uint2  *base, int elemOff, enum LSC_LDCC cacheOpt);  //D32V2
+SYCL_EXTERNAL /* extern "C" */ sycl::uint3  __builtin_IB_lsc_load_global_uint3         (const GLOBAL sycl::uint3  *base, int elemOff, enum LSC_LDCC cacheOpt);  //D32V3
+SYCL_EXTERNAL /* extern "C" */ sycl::uint4  __builtin_IB_lsc_load_global_uint4         (const GLOBAL sycl::uint4  *base, int elemOff, enum LSC_LDCC cacheOpt);  //D32V4
+SYCL_EXTERNAL /* extern "C" */ sycl::uint8  __builtin_IB_lsc_load_global_uint8         (const GLOBAL sycl::uint8  *base, int elemOff, enum LSC_LDCC cacheOpt);  //D32V8
+SYCL_EXTERNAL /* extern "C" */ uint64_t     __builtin_IB_lsc_load_global_ulong         (const GLOBAL uint64_t     *base, int elemOff, enum LSC_LDCC cacheOpt);  //D64V1
+SYCL_EXTERNAL /* extern "C" */ sycl::ulong2 __builtin_IB_lsc_load_global_ulong2        (const GLOBAL sycl::ulong2 *base, int elemOff, enum LSC_LDCC cacheOpt);  //D64V2
+SYCL_EXTERNAL /* extern "C" */ sycl::ulong3 __builtin_IB_lsc_load_global_ulong3        (const GLOBAL sycl::ulong3 *base, int elemOff, enum LSC_LDCC cacheOpt);  //D64V3
+SYCL_EXTERNAL /* extern "C" */ sycl::ulong4 __builtin_IB_lsc_load_global_ulong4        (const GLOBAL sycl::ulong4 *base, int elemOff, enum LSC_LDCC cacheOpt);  //D64V4
+SYCL_EXTERNAL /* extern "C" */ sycl::ulong8 __builtin_IB_lsc_load_global_ulong8        (const GLOBAL sycl::ulong8 *base, int elemOff, enum LSC_LDCC cacheOpt);  //D64V8
+  
+//     global address space
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_uchar_from_uint (GLOBAL uint8_t  *base,     int immElemOff, uint32_t val,     enum LSC_STCC cacheOpt);  //D8U32
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_ushort_from_uint(GLOBAL uint16_t *base,     int immElemOff, uint32_t val,     enum LSC_STCC cacheOpt);  //D16U32
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_uint            (GLOBAL uint32_t *base,     int immElemOff, uint32_t val,     enum LSC_STCC cacheOpt);  //D32V1
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_uint2           (GLOBAL sycl::uint2  *base, int immElemOff, sycl::uint2 val,  enum LSC_STCC cacheOpt);  //D32V2
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_uint3           (GLOBAL sycl::uint3  *base, int immElemOff, sycl::uint3 val,  enum LSC_STCC cacheOpt);  //D32V3
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_uint4           (GLOBAL sycl::uint4  *base, int immElemOff, sycl::uint4 val,  enum LSC_STCC cacheOpt);  //D32V4
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_uint8           (GLOBAL sycl::uint8  *base, int immElemOff, sycl::uint8 val,  enum LSC_STCC cacheOpt);  //D32V8
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_ulong           (GLOBAL uint64_t *base,     int immElemOff, uint64_t val,     enum LSC_STCC cacheOpt);  //D64V1
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_ulong2          (GLOBAL sycl::ulong2 *base, int immElemOff, sycl::ulong2 val, enum LSC_STCC cacheOpt);  //D64V2
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_ulong3          (GLOBAL sycl::ulong3 *base, int immElemOff, sycl::ulong3 val, enum LSC_STCC cacheOpt);  //D64V3
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_ulong4          (GLOBAL sycl::ulong4 *base, int immElemOff, sycl::ulong4 val, enum LSC_STCC cacheOpt);  //D64V4
+SYCL_EXTERNAL extern "C"  void  __builtin_IB_lsc_store_global_ulong8          (GLOBAL sycl::ulong8 *base, int immElemOff, sycl::ulong8 val, enum LSC_STCC cacheOpt);  //D64V8
+
+///////////////////////////////////////////////////////////////////////
+// prefetching
+///////////////////////////////////////////////////////////////////////
+//
+// LSC Pre-Fetch Load functions with CacheControls
+//     global address space
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_uchar (const GLOBAL uint8_t *base,      int immElemOff, enum LSC_LDCC cacheOpt); //D8U32
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_ushort(const GLOBAL uint16_t *base,     int immElemOff, enum LSC_LDCC cacheOpt); //D16U32
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_uint  (const GLOBAL uint32_t *base,     int immElemOff, enum LSC_LDCC cacheOpt); //D32V1
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_uint2 (const GLOBAL sycl::uint2 *base,  int immElemOff, enum LSC_LDCC cacheOpt); //D32V2
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_uint3 (const GLOBAL sycl::uint3 *base,  int immElemOff, enum LSC_LDCC cacheOpt); //D32V3
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_uint4 (const GLOBAL sycl::uint4 *base,  int immElemOff, enum LSC_LDCC cacheOpt); //D32V4
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_uint8 (const GLOBAL sycl::uint8 *base,  int immElemOff, enum LSC_LDCC cacheOpt); //D32V8
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_ulong (const GLOBAL uint64_t *base,     int immElemOff, enum LSC_LDCC cacheOpt); //D64V1
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_ulong2(const GLOBAL sycl::ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_ulong3(const GLOBAL sycl::ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_ulong4(const GLOBAL sycl::ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4
+SYCL_EXTERNAL extern "C"  void __builtin_IB_lsc_prefetch_global_ulong8(const GLOBAL sycl::ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8
+
+#else
+
+#define GLOBAL 
+#define LOCAL 
+
+/* dummy functions for host */
+inline int   work_group_reduce_add(int x) { return x; }
+inline float work_group_reduce_min(float x) { return x; }
+inline float work_group_reduce_max(float x) { return x; }
+
+inline float atomic_min(volatile float *p, float val) { return val; } // host stub: returns val and never reads or writes *p
+inline float atomic_max(volatile float *p, float val) { return val; } // host stub: returns val and never reads or writes *p
+
+inline uint32_t intel_sub_group_ballot(bool valid) { return 0; }
+
+#endif
+
+/* creates a temporary that is enforced to be uniform */
+#define SYCL_UNIFORM_VAR(Ty,tmp,k)					\
+  Ty tmp##_data;							\
+  Ty* p##tmp##_data = (Ty*) sub_group_broadcast((uint64_t)&tmp##_data,k);	\
+  Ty& tmp = *p##tmp##_data;
+
+#if !defined(__forceinline)
+#define __forceinline          inline __attribute__((always_inline))
+#endif
+
+#if __SYCL_COMPILER_VERSION < 20210801
+#define all_of_group all_of
+#define any_of_group any_of
+#define none_of_group none_of
+#define group_broadcast broadcast
+#define reduce_over_group reduce
+#define exclusive_scan_over_group exclusive_scan
+#define inclusive_scan_over_group inclusive_scan
+#endif
+
+namespace embree
+{
+  template<typename T>
+  __forceinline T cselect(const bool mask, const T &a, const T &b)
+  {
+    return sycl::select(b,a,(int)mask);
+  }
+  
+  template<typename T, typename M>
+  __forceinline T cselect(const M &mask, const T &a, const T &b)
+  {
+    return sycl::select(b,a,mask);
+  }
+  
+  __forceinline const sycl::sub_group this_sub_group() {
+    return sycl::ext::oneapi::experimental::this_sub_group(); 
+  }
+  
+  __forceinline uint32_t get_sub_group_local_id() { // first component of this sub-group's local id; top-level const on a scalar return is ignored, so it is dropped
+    return this_sub_group().get_local_id()[0];
+  }
+
+  __forceinline uint32_t get_sub_group_size() { // size of the sub-group's maximum local range; top-level const on a scalar return is ignored, so it is dropped
+    return this_sub_group().get_max_local_range().size();
+  }
+
+  __forceinline uint32_t get_sub_group_id() { // first component of this sub-group's group id; top-level const on a scalar return is ignored, so it is dropped
+    return this_sub_group().get_group_id()[0];
+  }
+  
+  __forceinline uint32_t get_num_sub_groups() { // size of the sub-group's group range; top-level const on a scalar return is ignored, so it is dropped
+    return this_sub_group().get_group_range().size();
+  }
+  
+  __forceinline uint32_t sub_group_ballot(bool pred) {
+    return intel_sub_group_ballot(pred);
+  }
+
+  __forceinline bool sub_group_all_of(bool pred) {
+    return sycl::all_of_group(this_sub_group(),pred);
+  }
+
+  __forceinline bool sub_group_any_of(bool pred) {
+    return sycl::any_of_group(this_sub_group(),pred);
+  }
+  
+  __forceinline bool sub_group_none_of(bool pred) {
+    return sycl::none_of_group(this_sub_group(),pred);
+  }
+
+  template <typename T> __forceinline T sub_group_broadcast(T x, sycl::id<1> local_id) {
+    return sycl::group_broadcast<sycl::sub_group>(this_sub_group(),x,local_id);
+  }
+  
+  template <typename T> __forceinline T sub_group_make_uniform(T x) {
+    return sub_group_broadcast(x,sycl::ctz(intel_sub_group_ballot(true)));
+  }
+
+  __forceinline void assume_uniform_array(void* ptr) {
+#ifdef __SYCL_DEVICE_ONLY__
+    __builtin_IB_assume_uniform(ptr);
+#endif
+  }
+
+  template <typename T, class BinaryOperation> __forceinline T sub_group_reduce(T x, BinaryOperation binary_op) {
+    return sycl::reduce_over_group<sycl::sub_group>(this_sub_group(),x,binary_op);
+  }
+
+  template <typename T, class BinaryOperation> __forceinline T sub_group_reduce(T x, T init, BinaryOperation binary_op) {
+    return sycl::reduce_over_group<sycl::sub_group>(this_sub_group(),x,init,binary_op);
+  }
+  
+  template <typename T> __forceinline T sub_group_reduce_min(T x, T init) {
+    return sub_group_reduce(x, init, sycl::ext::oneapi::minimum<T>());
+  }
+
+  template <typename T> __forceinline T sub_group_reduce_min(T x) {
+    return sub_group_reduce(x, sycl::ext::oneapi::minimum<T>());
+  }
+
+  template <typename T> __forceinline T sub_group_reduce_max(T x) {
+    return sub_group_reduce(x, sycl::ext::oneapi::maximum<T>());
+  }
+  
+  template <typename T> __forceinline T sub_group_reduce_add(T x) {
+    return sub_group_reduce(x, sycl::ext::oneapi::plus<T>());
+  }
+
+  template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan(T x, BinaryOperation binary_op) {
+    return sycl::exclusive_scan_over_group(this_sub_group(),x,binary_op);
+  }
+
+  template <typename T, class BinaryOperation = sycl::ext::oneapi::minimum<T>> __forceinline T sub_group_exclusive_scan_min(T x) { // BinaryOperation is defaulted because it cannot be deduced from (T x); the scan always uses minimum<T>
+    return sub_group_exclusive_scan(x,sycl::ext::oneapi::minimum<T>());
+  }
+
+  template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan(T x, T init, BinaryOperation binary_op) {
+    return sycl::exclusive_scan_over_group(this_sub_group(),x,init,binary_op);
+  }
+
+  template <typename T, class BinaryOperation> __forceinline T sub_group_inclusive_scan(T x, BinaryOperation binary_op) {
+    return sycl::inclusive_scan_over_group(this_sub_group(),x,binary_op);
+  }
+
+  template <typename T, class BinaryOperation> __forceinline T sub_group_inclusive_scan(T x, BinaryOperation binary_op, T init) {
+    return sycl::inclusive_scan_over_group(this_sub_group(),x,binary_op,init);
+  }
+
+  template <typename T> __forceinline T sub_group_shuffle(T x, sycl::id<1> local_id) {
+    return this_sub_group().shuffle(x, local_id);
+  }
+
+  template <typename T> __forceinline T sub_group_shuffle_down(T x, uint32_t delta) {
+    return this_sub_group().shuffle_down(x, delta);
+  }
+  
+  template <typename T> __forceinline T sub_group_shuffle_up(T x, uint32_t delta) {
+    return this_sub_group().shuffle_up(x, delta);
+  }
+
+  template <typename T> __forceinline T sub_group_load(const void* src) {
+    return this_sub_group().load(sycl::multi_ptr<T,sycl::access::address_space::global_space>((T*)src));
+  }
+
+  template <typename T> __forceinline void sub_group_store(void* dst, const T& x) {
+    this_sub_group().store(sycl::multi_ptr<T,sycl::access::address_space::global_space>((T*)dst),x);
+  }
+}
+
+#if __SYCL_COMPILER_VERSION < 20210801
+#undef all_of_group
+#undef any_of_group
+#undef none_of_group
+#undef group_broadcast
+#undef reduce_over_group
+#undef exclusive_scan_over_group
+#undef inclusive_scan_over_group
+#endif
diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp
index 7f7a009a1e..d01eab3c9d 100644
--- a/thirdparty/embree/common/sys/sysinfo.cpp
+++ b/thirdparty/embree/common/sys/sysinfo.cpp
@@ -1,9 +1,15 @@
 // Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
+#if defined(__INTEL_LLVM_COMPILER)
+// prevents "'__thiscall' calling convention is not supported for this target" warning from TBB
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wignored-attributes"
+#endif
+
 #include "sysinfo.h"
 #include "intrinsics.h"
-#include "string.h"
+#include "estring.h"
 #include "ref.h"
 #if defined(__FREEBSD__)
 #include <sys/cpuset.h>
@@ -690,3 +696,6 @@ namespace embree
 }
 #endif
 
+#if defined(__INTEL_LLVM_COMPILER)
+#pragma clang diagnostic pop
+#endif
diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp
index 530c3c7810..8b072067e6 100644
--- a/thirdparty/embree/common/sys/thread.cpp
+++ b/thirdparty/embree/common/sys/thread.cpp
@@ -3,7 +3,7 @@
 
 #include "thread.h"
 #include "sysinfo.h"
-#include "string.h"
+#include "estring.h"
 
 #include <iostream>
 #if defined(__ARM_NEON)
diff --git a/thirdparty/embree/common/sys/vector.h b/thirdparty/embree/common/sys/vector.h
index d05e1deb18..226cd34c21 100644
--- a/thirdparty/embree/common/sys/vector.h
+++ b/thirdparty/embree/common/sys/vector.h
@@ -8,6 +8,8 @@
 
 namespace embree
 {
+  class Device;
+  
    template<typename T, typename allocator>
     class vector_t
     {
@@ -25,6 +27,12 @@ namespace embree
       template<typename M>
       __forceinline explicit vector_t (M alloc, size_t sz) 
       : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); }
+
+      __forceinline vector_t (Device* alloc)
+        : vector_t(alloc,0) {}
+
+      __forceinline vector_t(void* data, size_t bytes)
+        : size_active(0), size_alloced(bytes/sizeof(T)), items((T*)data) {}
     
       __forceinline ~vector_t() {
         clear();
@@ -65,6 +73,10 @@ namespace embree
         return *this;
       }
 
+      __forceinline allocator& getAlloc() {
+	return alloc;
+      }
+
       /********************** Iterators  ****************************/
     
       __forceinline       iterator begin()       { return items; };
@@ -215,6 +227,10 @@ namespace embree
         if (new_alloced <= size_alloced) 
           return size_alloced;
 
+        /* if current size is 0 allocate exact requested size */
+        if (size_alloced == 0)
+          return new_alloced;
+
         /* resize to next power of 2 otherwise */
         size_t new_size_alloced = size_alloced;
         while (new_size_alloced < new_alloced) {
@@ -237,8 +253,12 @@ namespace embree
   /*! vector class that performs aligned allocations */
   template<typename T>
     using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >;
-  
+
   /*! vector class that performs OS allocations */
   template<typename T>
     using ovector = vector_t<T,os_allocator<T> >;
+
+  /*! vector class with externally managed data buffer */
+  template<typename T>
+    using evector = vector_t<T,no_allocator<T>>;
 }
diff --git a/thirdparty/embree/common/tasking/taskscheduler.h b/thirdparty/embree/common/tasking/taskscheduler.h
index 8f3dd87689..edfffe0e57 100644
--- a/thirdparty/embree/common/tasking/taskscheduler.h
+++ b/thirdparty/embree/common/tasking/taskscheduler.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#if defined(TASKING_INTERNAL)
+#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
 #  include "taskschedulerinternal.h"
 #elif defined(TASKING_TBB)
 #  include "taskschedulertbb.h"
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
index ad438588a3..88b88a30ec 100644
--- a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "taskschedulerinternal.h"
-#include "../math/math.h"
+#include "../math/emath.h"
 #include "../sys/sysinfo.h"
 #include <algorithm>
 
@@ -50,11 +50,11 @@ namespace embree
       thread.task = this;
       // -- GODOT start --
       // try {
-      // if (thread.scheduler->cancellingException == nullptr)
+      //   if (context->cancellingException == nullptr)
           closure->execute();
       // } catch (...) {
-      //   if (thread.scheduler->cancellingException == nullptr)
-      //     thread.scheduler->cancellingException = std::current_exception();
+      //   if (context->cancellingException == nullptr)
+      //     context->cancellingException = std::current_exception();
       // }
       // -- GODOT end --
       thread.task = prevTask;
@@ -152,7 +152,8 @@ namespace embree
   {
     Lock<MutexSys> lock(g_mutex);
     assert(newNumThreads);
-    newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads());
+    if (newNumThreads == std::numeric_limits<size_t>::max())
+      newNumThreads = (size_t) getNumberOfLogicalThreads();
 
     numThreads = newNumThreads;
     if (!startThreads && !running) return;
@@ -232,7 +233,8 @@ namespace embree
   TaskScheduler::TaskScheduler()
     : threadCounter(0), anyTasksRunning(0), hasRootTask(false)
   {
-    threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x.
+    assert(threadPool);
+    threadLocal.resize(2 * TaskScheduler::threadCount()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x.
     for (size_t i=0; i<threadLocal.size(); i++)
       threadLocal[i].store(nullptr);
   }
@@ -293,11 +295,7 @@ namespace embree
     size_t threadIndex = allocThreadIndex();
     condition.wait(mutex, [&] () { return hasRootTask.load(); });
     mutex.unlock();
-    // -- GODOT start --
-    // std::exception_ptr except = thread_loop(threadIndex);
-    // if (except != nullptr) std::rethrow_exception(except);
     thread_loop(threadIndex);
-    // -- GODOT end --
   }
 
   void TaskScheduler::reset() {
@@ -321,18 +319,15 @@ namespace embree
     return old;
   }
 
-  dll_export bool TaskScheduler::wait()
+  dll_export void TaskScheduler::wait()
   {
     Thread* thread = TaskScheduler::thread();
-    if (thread == nullptr) return true;
+    if (thread == nullptr)
+      return;
     while (thread->tasks.execute_local_internal(*thread,thread->task)) {};
-    return thread->scheduler->cancellingException == nullptr;
   }
 
-// -- GODOT start --
-//   std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
   void TaskScheduler::thread_loop(size_t threadIndex)
-// -- GODOT end --
   {
     /* allocate thread structure */
     std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
@@ -354,11 +349,6 @@ namespace embree
     threadLocal[threadIndex].store(nullptr);
     swapThread(oldThread);
 
-    /* remember exception to throw */
-    // -- GODOT start --
-    // std::exception_ptr except = nullptr;
-    // if (cancellingException != nullptr) except = cancellingException;
-    // -- GODOT end --
     /* wait for all threads to terminate */
     threadCounter--;
 #if defined(__WIN32__)
@@ -376,10 +366,6 @@ namespace embree
           yield();
 #endif
 	}
-     // -- GODOT start --
-     // return except;
-     return;
-     // -- GODOT end --
   }
 
   bool TaskScheduler::steal_from_other_threads(Thread& thread)
diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h
index 6cc2495195..e72d3b72ba 100644
--- a/thirdparty/embree/common/tasking/taskschedulerinternal.h
+++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h
@@ -12,7 +12,7 @@
 #include "../sys/ref.h"
 #include "../sys/atomic.h"
 #include "../math/range.h"
-#include "../../include/embree3/rtcore.h"
+#include "../../include/embree4/rtcore.h"
 
 #include <list>
 
@@ -38,6 +38,13 @@ namespace embree
       virtual void execute() = 0;
     };
 
+
+    struct TaskGroupContext {
+      TaskGroupContext() : cancellingException(nullptr) {}
+
+      std::exception_ptr cancellingException;
+    };
+
     /*! builds a task interface from a closure */
     template<typename Closure>
     struct ClosureTaskFunction : public TaskFunction
@@ -76,16 +83,16 @@ namespace embree
 	: state(DONE) {}
 
       /*! construction of new task */
-      __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N)
-        : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N)
+      __forceinline Task (TaskFunction* closure, Task* parent, TaskGroupContext* context, size_t stackPtr, size_t N)
+        : dependencies(1), stealable(true), closure(closure), parent(parent), context(context), stackPtr(stackPtr), N(N)
       {
         if (parent) parent->add_dependencies(+1);
 	switch_state(DONE,INITIALIZED);
       }
 
       /*! construction of stolen task, stealing thread will decrement initial dependency */
-      __forceinline Task (TaskFunction* closure, Task* parent)
-        : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1)
+      __forceinline Task (TaskFunction* closure, Task* parent, TaskGroupContext* context)
+        : dependencies(1), stealable(false), closure(closure), parent(parent), context(context), stackPtr(-1), N(1)
       {
 	switch_state(DONE,INITIALIZED);
       }
@@ -95,7 +102,7 @@ namespace embree
       {
         if (!stealable) return false;
 	if (!try_switch_state(INITIALIZED,DONE)) return false;
-	new (&child) Task(closure, this);
+	new (&child) Task(closure, this, context);
         return true;
       }
 
@@ -110,6 +117,7 @@ namespace embree
       std::atomic<bool> stealable;       //!< true if task can be stolen
       TaskFunction* closure;             //!< the closure to execute
       Task* parent;                      //!< parent task to signal when we are finished
+      TaskGroupContext* context;
       size_t stackPtr;                   //!< stack location where closure is stored
       size_t N;                          //!< approximative size of task
     };
@@ -122,28 +130,32 @@ namespace embree
       __forceinline void* alloc(size_t bytes, size_t align = 64)
       {
         size_t ofs = bytes + ((align - stackPtr) & (align-1));
-        if (stackPtr + ofs > CLOSURE_STACK_SIZE)
-          // -- GODOT start --
-          // throw std::runtime_error("closure stack overflow");
+        // -- GODOT start --
+        // if (stackPtr + ofs > CLOSURE_STACK_SIZE)
+        //   throw std::runtime_error("closure stack overflow");
+        if (stackPtr + ofs > CLOSURE_STACK_SIZE) {
           abort();
-          // -- GODOT end --
+        }
+        // -- GODOT end --
         stackPtr += ofs;
         return &stack[stackPtr-bytes];
       }
 
       template<typename Closure>
-      __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
+      __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure, TaskGroupContext* context)
       {
-        if (right >= TASK_STACK_SIZE)
-           // -- GODOT start --
-           // throw std::runtime_error("task stack overflow");
-           abort();
-           // -- GODOT end --
+        // -- GODOT start --
+        // if (right >= TASK_STACK_SIZE)
+        //   throw std::runtime_error("task stack overflow");
+        if (right >= TASK_STACK_SIZE) {
+          abort();
+        }
+        // -- GODOT end --
 
 	/* allocate new task on right side of stack */
         size_t oldStackPtr = stackPtr;
         TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure);
-        new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size);
+        new (&tasks[right.load()]) Task(func,thread.task,context,oldStackPtr,size);
         right++;
 
 	/* also move left pointer */
@@ -178,7 +190,7 @@ namespace embree
       : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {}
 
       __forceinline size_t threadCount() {
-        return scheduler->threadCounter;
+          return scheduler->threadCounter;
       }
 
       size_t threadIndex;              //!< ID of this thread
@@ -244,10 +256,7 @@ namespace embree
     void wait_for_threads(size_t threadCount);
 
     /*! thread loop for all worker threads */
-    // -- GODOT start --
-    // std::exception_ptr thread_loop(size_t threadIndex);
     void thread_loop(size_t threadIndex);
-    // -- GODOT end --
 
     /*! steals a task from a different thread */
     bool steal_from_other_threads(Thread& thread);
@@ -257,7 +266,7 @@ namespace embree
 
     /* spawn a new task at the top of the threads task stack */
     template<typename Closure>
-      void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true)
+      void spawn_root(const Closure& closure, TaskGroupContext* context, size_t size = 1, bool useThreadPool = true)
     {
       if (useThreadPool) startThreads();
 
@@ -267,7 +276,7 @@ namespace embree
       assert(threadLocal[threadIndex].load() == nullptr);
       threadLocal[threadIndex] = &thread;
       Thread* oldThread = swapThread(&thread);
-      thread.tasks.push_right(thread,size,closure);
+      thread.tasks.push_right(thread,size,closure,context);
       {
         Lock<MutexSys> lock(mutex);
 	anyTasksRunning++;
@@ -286,51 +295,52 @@ namespace embree
 
       /* remember exception to throw */
       std::exception_ptr except = nullptr;
-      if (cancellingException != nullptr) except = cancellingException;
+      if (context->cancellingException != nullptr) except = context->cancellingException;
 
       /* wait for all threads to terminate */
       threadCounter--;
       while (threadCounter > 0) yield();
-      cancellingException = nullptr;
+      context->cancellingException = nullptr;
 
       /* re-throw proper exception */
-      if (except != nullptr)
+      if (except != nullptr) {
         std::rethrow_exception(except);
+      }
     }
 
     /* spawn a new task at the top of the threads task stack */
     template<typename Closure>
-    static __forceinline void spawn(size_t size, const Closure& closure)
+    static __forceinline void spawn(size_t size, const Closure& closure, TaskGroupContext* context)
     {
       Thread* thread = TaskScheduler::thread();
-      if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure);
-      else                           instance()->spawn_root(closure,size);
+      if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure,context);
+      else                           instance()->spawn_root(closure,context,size);
     }
 
     /* spawn a new task at the top of the threads task stack */
     template<typename Closure>
-    static __forceinline void spawn(const Closure& closure) {
-      spawn(1,closure);
+    static __forceinline void spawn(const Closure& closure, TaskGroupContext* taskGroupContext) {
+      spawn(1,closure,taskGroupContext);
     }
 
     /* spawn a new task set  */
     template<typename Index, typename Closure>
-    static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure)
+    static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure, TaskGroupContext* context)
     {
       spawn(end-begin, [=]()
-        {
-	  if (end-begin <= blockSize) {
-	    return closure(range<Index>(begin,end));
-	  }
-	  const Index center = (begin+end)/2;
-	  spawn(begin,center,blockSize,closure);
-	  spawn(center,end  ,blockSize,closure);
-	  wait();
-	});
+      {
+        if (end-begin <= blockSize) {
+          return closure(range<Index>(begin,end));
+        }
+        const Index center = (begin+end)/2;
+        spawn(begin,center,blockSize,closure,context);
+        spawn(center,end  ,blockSize,closure,context);
+        wait();
+      },context);
     }
 
     /* work on spawned subtasks and wait until all have finished */
-    dll_export static bool wait();
+    dll_export static void wait();
 
     /* returns the ID of the current thread */
     dll_export static size_t threadID();
@@ -366,7 +376,6 @@ namespace embree
     std::atomic<size_t> threadCounter;
     std::atomic<size_t> anyTasksRunning;
     std::atomic<bool> hasRootTask;
-    std::exception_ptr cancellingException;
     MutexSys mutex;
     ConditionSys condition;
 
diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h
index 042ba7bc4c..e1f647eb06 100644
--- a/thirdparty/embree/common/tasking/taskschedulertbb.h
+++ b/thirdparty/embree/common/tasking/taskschedulertbb.h
@@ -15,6 +15,12 @@
 #  define NOMINMAX
 #endif
 
+#if defined(__INTEL_LLVM_COMPILER)
+// prevents "'__thiscall' calling convention is not supported for this target" warning from TBB
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wignored-attributes"
+#endif
+
 // We need to define these to avoid implicit linkage against
 // tbb_debug.lib under Windows. When removing these lines debug build
 // under Windows fails.
@@ -25,6 +31,18 @@
 #include "tbb/tbb.h"
 #include "tbb/parallel_sort.h"
 
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8)
+#  define USE_TASK_ARENA 1
+#else
+#  define USE_TASK_ARENA 0
+#endif
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9
+#  define TASKING_TBB_USE_TASK_ISOLATION 1
+#else
+#  define TASKING_TBB_USE_TASK_ISOLATION 0
+#endif
+
 namespace embree
 {
   struct TaskScheduler
@@ -65,3 +83,7 @@ namespace embree
   };
 
 };
+
+#if defined(__INTEL_LLVM_COMPILER)
+#pragma clang diagnostic pop
+#endif
+\ No newline at end of file
diff --git a/thirdparty/embree/include/embree3/rtcore_scene.h b/thirdparty/embree/include/embree3/rtcore_scene.h
deleted file mode 100644
index 34d87a2ce4..0000000000
--- a/thirdparty/embree/include/embree3/rtcore_scene.h
+++ /dev/null
@@ -1,163 +0,0 @@
-// Copyright 2009-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "rtcore_device.h"
-
-RTC_NAMESPACE_BEGIN
-  
-/* Forward declarations for ray structures */
-struct RTCRayHit;
-struct RTCRayHit4;
-struct RTCRayHit8;
-struct RTCRayHit16;
-struct RTCRayHitNp;
-
-/* Scene flags */
-enum RTCSceneFlags
-{
-  RTC_SCENE_FLAG_NONE                    = 0,
-  RTC_SCENE_FLAG_DYNAMIC                 = (1 << 0),
-  RTC_SCENE_FLAG_COMPACT                 = (1 << 1),
-  RTC_SCENE_FLAG_ROBUST                  = (1 << 2),
-  RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION = (1 << 3)
-};
-
-/* Creates a new scene. */
-RTC_API RTCScene rtcNewScene(RTCDevice device);
-
-/* Returns the device the scene got created in. The reference count of
- * the device is incremented by this function. */
-RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene);
-   
-/* Retains the scene (increments the reference count). */
-RTC_API void rtcRetainScene(RTCScene scene);
-
-/* Releases the scene (decrements the reference count). */
-RTC_API void rtcReleaseScene(RTCScene scene);
-
-
-/* Attaches the geometry to a scene. */
-RTC_API unsigned int rtcAttachGeometry(RTCScene scene, RTCGeometry geometry);
-
-/* Attaches the geometry to a scene using the specified geometry ID. */
-RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigned int geomID);
-
-/* Detaches the geometry from the scene. */
-RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID);
-
-/* Gets a geometry handle from the scene. This function is not thread safe and should get used during rendering. */
-RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID);
-
-/* Gets a geometry handle from the scene. This function is thread safe and should NOT get used during rendering. */
-RTC_API RTCGeometry rtcGetGeometryThreadSafe(RTCScene scene, unsigned int geomID);
-
-
-/* Commits the scene. */
-RTC_API void rtcCommitScene(RTCScene scene);
-
-/* Commits the scene from multiple threads. */
-RTC_API void rtcJoinCommitScene(RTCScene scene);
-
-
-/* Progress monitor callback function */
-typedef bool (*RTCProgressMonitorFunction)(void* ptr, double n);
-
-/* Sets the progress monitor callback function of the scene. */
-RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene scene, RTCProgressMonitorFunction progress, void* ptr);
-
-/* Sets the build quality of the scene. */
-RTC_API void rtcSetSceneBuildQuality(RTCScene scene, enum RTCBuildQuality quality);
-
-/* Sets the scene flags. */
-RTC_API void rtcSetSceneFlags(RTCScene scene, enum RTCSceneFlags flags);
-
-/* Returns the scene flags. */
-RTC_API enum RTCSceneFlags rtcGetSceneFlags(RTCScene scene);
-
-/* Returns the axis-aligned bounds of the scene. */
-RTC_API void rtcGetSceneBounds(RTCScene scene, struct RTCBounds* bounds_o);
-
-/* Returns the linear axis-aligned bounds of the scene. */
-RTC_API void rtcGetSceneLinearBounds(RTCScene scene, struct RTCLinearBounds* bounds_o);
-
-
-/* Perform a closest point query of the scene. */
-RTC_API bool rtcPointQuery(RTCScene scene, struct RTCPointQuery* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void* userPtr);
-
-/* Perform a closest point query with a packet of 4 points with the scene. */
-RTC_API bool rtcPointQuery4(const int* valid, RTCScene scene, struct RTCPointQuery4* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
-
-/* Perform a closest point query with a packet of 4 points with the scene. */
-RTC_API bool rtcPointQuery8(const int* valid, RTCScene scene, struct RTCPointQuery8* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
-
-/* Perform a closest point query with a packet of 4 points with the scene. */
-RTC_API bool rtcPointQuery16(const int* valid, RTCScene scene, struct RTCPointQuery16* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
-
-/* Intersects a single ray with the scene. */
-RTC_API void rtcIntersect1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit);
-
-/* Intersects a packet of 4 rays with the scene. */
-RTC_API void rtcIntersect4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit4* rayhit);
-
-/* Intersects a packet of 8 rays with the scene. */
-RTC_API void rtcIntersect8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit8* rayhit);
-
-/* Intersects a packet of 16 rays with the scene. */
-RTC_API void rtcIntersect16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit16* rayhit);
-
-/* Intersects a stream of M rays with the scene. */
-RTC_API void rtcIntersect1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit, unsigned int M, size_t byteStride);
-
-/* Intersects a stream of pointers to M rays with the scene. */
-RTC_API void rtcIntersect1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit** rayhit, unsigned int M);
-
-/* Intersects a stream of M ray packets of size N in SOA format with the scene. */
-RTC_API void rtcIntersectNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride);
-
-/* Intersects a stream of M ray packets of size N in SOA format with the scene. */
-RTC_API void rtcIntersectNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayHitNp* rayhit, unsigned int N);
-
-/* Tests a single ray for occlusion with the scene. */
-RTC_API void rtcOccluded1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray);
-
-/* Tests a packet of 4 rays for occlusion occluded with the scene. */
-RTC_API void rtcOccluded4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay4* ray);
-
-/* Tests a packet of 8 rays for occlusion with the scene. */
-RTC_API void rtcOccluded8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay8* ray);
-
-/* Tests a packet of 16 rays for occlusion with the scene. */
-RTC_API void rtcOccluded16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay16* ray);
-
-/* Tests a stream of M rays for occlusion with the scene. */
-RTC_API void rtcOccluded1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray, unsigned int M, size_t byteStride);
-
-/* Tests a stream of pointers to M rays for occlusion with the scene. */
-RTC_API void rtcOccluded1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay** ray, unsigned int M);
-
-/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */
-RTC_API void rtcOccludedNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride);
-
-/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */
-RTC_API void rtcOccludedNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayNp* ray, unsigned int N);
-
-/*! collision callback */
-struct RTCCollision { unsigned int geomID0; unsigned int primID0; unsigned int geomID1; unsigned int primID1; };
-typedef void (*RTCCollideFunc) (void* userPtr, struct RTCCollision* collisions, unsigned int num_collisions);
-
-/*! Performs collision detection of two scenes */
-RTC_API void rtcCollide (RTCScene scene0, RTCScene scene1, RTCCollideFunc callback, void* userPtr);
- 
-#if defined(__cplusplus)
-
-/* Helper for easily combining scene flags */
-inline RTCSceneFlags operator|(RTCSceneFlags a, RTCSceneFlags b) {
-  return (RTCSceneFlags)((size_t)a | (size_t)b);
-}
-
-#endif
-
-RTC_NAMESPACE_END
-
diff --git a/thirdparty/embree/include/embree3/rtcore.h b/thirdparty/embree/include/embree4/rtcore.h
index 450ab4c535..450ab4c535 100644
--- a/thirdparty/embree/include/embree3/rtcore.h
+++ b/thirdparty/embree/include/embree4/rtcore.h
diff --git a/thirdparty/embree/include/embree3/rtcore_buffer.h b/thirdparty/embree/include/embree4/rtcore_buffer.h
index 6b8eba9769..8721ce4a44 100644
--- a/thirdparty/embree/include/embree3/rtcore_buffer.h
+++ b/thirdparty/embree/include/embree4/rtcore_buffer.h
@@ -27,6 +27,8 @@ enum RTCBufferType
   RTC_BUFFER_TYPE_VERTEX_CREASE_WEIGHT = 21,
   RTC_BUFFER_TYPE_HOLE                 = 22,
 
+  RTC_BUFFER_TYPE_TRANSFORM            = 23,
+
   RTC_BUFFER_TYPE_FLAGS = 32
 };
 
diff --git a/thirdparty/embree/include/embree3/rtcore_builder.h b/thirdparty/embree/include/embree4/rtcore_builder.h
index 4bff999fed..4bff999fed 100644
--- a/thirdparty/embree/include/embree3/rtcore_builder.h
+++ b/thirdparty/embree/include/embree4/rtcore_builder.h
diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree4/rtcore_common.h
index 894628e47c..57448ddaea 100644
--- a/thirdparty/embree/include/embree3/rtcore_common.h
+++ b/thirdparty/embree/include/embree4/rtcore_common.h
@@ -12,7 +12,7 @@
 RTC_NAMESPACE_BEGIN
 
 #if defined(_WIN32)
-#if defined(_M_X64)
+#if defined(_M_X64) || defined(_M_ARM64)
 typedef long long ssize_t;
 #else
 typedef int ssize_t;
@@ -41,6 +41,12 @@ typedef int ssize_t;
 #  define RTC_FORCEINLINE inline __attribute__((always_inline))
 #endif
 
+#if defined(__cplusplus)
+#  define RTC_OPTIONAL_ARGUMENT = nullptr
+#else
+#  define RTC_OPTIONAL_ARGUMENT
+#endif
+
 /* Invalid geometry ID */
 #define RTC_INVALID_GEOMETRY_ID ((unsigned int)-1)
 
@@ -141,7 +147,9 @@ enum RTCFormat
   RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR = 0x9244,
 
   /* special 12-byte format for grids */
-  RTC_FORMAT_GRID = 0xA001
+  RTC_FORMAT_GRID = 0xA001,
+
+  RTC_FORMAT_QUATERNION_DECOMPOSITION = 0xB001,
 };
 
 /* Build quality levels */
@@ -167,12 +175,138 @@ struct RTC_ALIGN(16) RTCLinearBounds
   struct RTCBounds bounds1;
 };
 
-/* Intersection context flags */
-enum RTCIntersectContextFlags
+/* Feature flags for SYCL specialization constants */
+enum RTCFeatureFlags
 {
-  RTC_INTERSECT_CONTEXT_FLAG_NONE       = 0,
-  RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT = (0 << 0), // optimize for incoherent rays
-  RTC_INTERSECT_CONTEXT_FLAG_COHERENT   = (1 << 0)  // optimize for coherent rays
+  RTC_FEATURE_FLAG_NONE = 0,
+
+  RTC_FEATURE_FLAG_MOTION_BLUR = 1 << 0,
+
+  RTC_FEATURE_FLAG_TRIANGLE = 1 << 1,
+  RTC_FEATURE_FLAG_QUAD = 1 << 2,
+  RTC_FEATURE_FLAG_GRID = 1 << 3,
+
+  RTC_FEATURE_FLAG_SUBDIVISION = 1 << 4,
+
+  RTC_FEATURE_FLAG_CONE_LINEAR_CURVE = 1 << 5,
+  RTC_FEATURE_FLAG_ROUND_LINEAR_CURVE  = 1 << 6,
+  RTC_FEATURE_FLAG_FLAT_LINEAR_CURVE = 1 << 7,
+
+  RTC_FEATURE_FLAG_ROUND_BEZIER_CURVE = 1 << 8,
+  RTC_FEATURE_FLAG_FLAT_BEZIER_CURVE = 1 << 9,
+  RTC_FEATURE_FLAG_NORMAL_ORIENTED_BEZIER_CURVE = 1 << 10,
+
+  RTC_FEATURE_FLAG_ROUND_BSPLINE_CURVE = 1 << 11,
+  RTC_FEATURE_FLAG_FLAT_BSPLINE_CURVE = 1 << 12,
+  RTC_FEATURE_FLAG_NORMAL_ORIENTED_BSPLINE_CURVE = 1 << 13,
+
+  RTC_FEATURE_FLAG_ROUND_HERMITE_CURVE = 1 << 14,
+  RTC_FEATURE_FLAG_FLAT_HERMITE_CURVE = 1 << 15,
+  RTC_FEATURE_FLAG_NORMAL_ORIENTED_HERMITE_CURVE = 1 << 16,
+
+  RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE = 1 << 17,
+  RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE = 1 << 18,
+  RTC_FEATURE_FLAG_NORMAL_ORIENTED_CATMULL_ROM_CURVE = 1 << 19,
+
+  RTC_FEATURE_FLAG_SPHERE_POINT = 1 << 20,
+  RTC_FEATURE_FLAG_DISC_POINT = 1 << 21,
+  RTC_FEATURE_FLAG_ORIENTED_DISC_POINT = 1 << 22,
+
+  RTC_FEATURE_FLAG_POINT =
+    RTC_FEATURE_FLAG_SPHERE_POINT |
+    RTC_FEATURE_FLAG_DISC_POINT |
+    RTC_FEATURE_FLAG_ORIENTED_DISC_POINT,
+
+  RTC_FEATURE_FLAG_ROUND_CURVES =
+    RTC_FEATURE_FLAG_ROUND_LINEAR_CURVE |
+    RTC_FEATURE_FLAG_ROUND_BEZIER_CURVE |
+    RTC_FEATURE_FLAG_ROUND_BSPLINE_CURVE |
+    RTC_FEATURE_FLAG_ROUND_HERMITE_CURVE |
+    RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE,
+  
+  RTC_FEATURE_FLAG_FLAT_CURVES =
+    RTC_FEATURE_FLAG_FLAT_LINEAR_CURVE |
+    RTC_FEATURE_FLAG_FLAT_BEZIER_CURVE |
+    RTC_FEATURE_FLAG_FLAT_BSPLINE_CURVE |
+    RTC_FEATURE_FLAG_FLAT_HERMITE_CURVE |
+    RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE,
+  
+  RTC_FEATURE_FLAG_NORMAL_ORIENTED_CURVES =
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_BEZIER_CURVE |
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_BSPLINE_CURVE |
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_HERMITE_CURVE |
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_CATMULL_ROM_CURVE,
+  
+  RTC_FEATURE_FLAG_LINEAR_CURVES =
+    RTC_FEATURE_FLAG_CONE_LINEAR_CURVE |
+    RTC_FEATURE_FLAG_ROUND_LINEAR_CURVE |
+    RTC_FEATURE_FLAG_FLAT_LINEAR_CURVE,
+  
+  RTC_FEATURE_FLAG_BEZIER_CURVES =
+    RTC_FEATURE_FLAG_ROUND_BEZIER_CURVE |
+    RTC_FEATURE_FLAG_FLAT_BEZIER_CURVE |
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_BEZIER_CURVE,
+
+  RTC_FEATURE_FLAG_BSPLINE_CURVES =
+    RTC_FEATURE_FLAG_ROUND_BSPLINE_CURVE |
+    RTC_FEATURE_FLAG_FLAT_BSPLINE_CURVE |
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_BSPLINE_CURVE,
+
+  RTC_FEATURE_FLAG_HERMITE_CURVES =
+    RTC_FEATURE_FLAG_ROUND_HERMITE_CURVE |
+    RTC_FEATURE_FLAG_FLAT_HERMITE_CURVE |
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_HERMITE_CURVE,
+  
+  RTC_FEATURE_FLAG_CURVES =
+    RTC_FEATURE_FLAG_CONE_LINEAR_CURVE |
+    RTC_FEATURE_FLAG_ROUND_LINEAR_CURVE |
+    RTC_FEATURE_FLAG_FLAT_LINEAR_CURVE |
+    RTC_FEATURE_FLAG_ROUND_BEZIER_CURVE |
+    RTC_FEATURE_FLAG_FLAT_BEZIER_CURVE |
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_BEZIER_CURVE |
+    RTC_FEATURE_FLAG_ROUND_BSPLINE_CURVE |
+    RTC_FEATURE_FLAG_FLAT_BSPLINE_CURVE |
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_BSPLINE_CURVE |
+    RTC_FEATURE_FLAG_ROUND_HERMITE_CURVE |
+    RTC_FEATURE_FLAG_FLAT_HERMITE_CURVE |
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_HERMITE_CURVE |
+    RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE |
+    RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE |
+    RTC_FEATURE_FLAG_NORMAL_ORIENTED_CATMULL_ROM_CURVE,
+  
+  RTC_FEATURE_FLAG_INSTANCE = 1 << 23,
+
+  RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS = 1 << 24,
+  RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_GEOMETRY = 1 << 25,
+
+  RTC_FEATURE_FLAG_FILTER_FUNCTION =
+    RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS |
+    RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_GEOMETRY,
+
+  RTC_FEATURE_FLAG_USER_GEOMETRY_CALLBACK_IN_ARGUMENTS = 1 << 26,
+  RTC_FEATURE_FLAG_USER_GEOMETRY_CALLBACK_IN_GEOMETRY = 1 << 27,
+
+  RTC_FEATURE_FLAG_USER_GEOMETRY =
+    RTC_FEATURE_FLAG_USER_GEOMETRY_CALLBACK_IN_ARGUMENTS |
+    RTC_FEATURE_FLAG_USER_GEOMETRY_CALLBACK_IN_GEOMETRY,
+
+  RTC_FEATURE_FLAG_32_BIT_RAY_MASK = 1 << 28,
+
+  RTC_FEATURE_FLAG_INSTANCE_ARRAY = 1 << 29,
+
+  RTC_FEATURE_FLAG_ALL = 0xffffffff,
+};
+
+/* Ray query flags */
+enum RTCRayQueryFlags
+{
+  /* matching intel_ray_flags_t layout */
+  RTC_RAY_QUERY_FLAG_NONE       = 0,
+  RTC_RAY_QUERY_FLAG_INVOKE_ARGUMENT_FILTER = (1 << 1), // enable argument filter for each geometry
+
+  /* embree specific flags */
+  RTC_RAY_QUERY_FLAG_INCOHERENT = (0 << 16), // optimize for incoherent rays
+  RTC_RAY_QUERY_FLAG_COHERENT   = (1 << 16), // optimize for coherent rays
 };
 
 /* Arguments for RTCFilterFunctionN */
@@ -180,7 +314,7 @@ struct RTCFilterFunctionNArguments
 {
   int* valid;
   void* geometryUserPtr;
-  struct RTCIntersectContext* context;
+  struct RTCRayQueryContext* context;
   struct RTCRayN* ray;
   struct RTCHitN* hit;
   unsigned int N;
@@ -189,38 +323,41 @@ struct RTCFilterFunctionNArguments
 /* Filter callback function */
 typedef void (*RTCFilterFunctionN)(const struct RTCFilterFunctionNArguments* args);
 
-/* Intersection context passed to intersect/occluded calls */
-struct RTCIntersectContext
+/* Intersection callback function */
+struct RTCIntersectFunctionNArguments;
+typedef void (*RTCIntersectFunctionN)(const struct RTCIntersectFunctionNArguments* args);
+
+/* Occlusion callback function */
+struct RTCOccludedFunctionNArguments;
+typedef void (*RTCOccludedFunctionN)(const struct RTCOccludedFunctionNArguments* args);
+
+/* Ray query context passed to intersect/occluded calls */
+struct RTCRayQueryContext
 {
-  enum RTCIntersectContextFlags flags;               // intersection flags
-  RTCFilterFunctionN filter;                         // filter function to execute
-  
 #if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
   unsigned int instStackSize;                        // Number of instances currently on the stack.
 #endif
   unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // The current stack of instance ids.
-  
-#if RTC_MIN_WIDTH
-  float minWidthDistanceFactor;                      // curve radius is set to this factor times distance to ray origin
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  unsigned int instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // The current stack of instance primitive ids.
 #endif
 };
 
-/* Initializes an intersection context. */
-RTC_FORCEINLINE void rtcInitIntersectContext(struct RTCIntersectContext* context)
+/* Initializes a ray query context. */
+RTC_FORCEINLINE void rtcInitRayQueryContext(struct RTCRayQueryContext* context)
 {
   unsigned l = 0;
-  context->flags = RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT;
-  context->filter = NULL;
-  
+
 #if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
   context->instStackSize = 0;
 #endif
-  for (; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+
+  for (; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
     context->instID[l] = RTC_INVALID_GEOMETRY_ID;
-  
-#if RTC_MIN_WIDTH
-  context->minWidthDistanceFactor = 0.0f;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    context->instPrimID[l] = RTC_INVALID_GEOMETRY_ID;
 #endif
+  }
 }
 
 /* Point query structure for closest point query */
@@ -278,15 +415,28 @@ struct RTC_ALIGN(16) RTCPointQueryContext
   // instance ids.
   unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT];
 
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  // instance prim ids.
+  unsigned int instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT];
+#endif
+
   // number of instances currently on the stack.
   unsigned int instStackSize;
 };
 
-/* Initializes an intersection context. */
+/* Initializes a point query context. */
 RTC_FORCEINLINE void rtcInitPointQueryContext(struct RTCPointQueryContext* context)
 {
+  unsigned l = 0;
+
   context->instStackSize = 0;
-  context->instID[0] = RTC_INVALID_GEOMETRY_ID;
+
+  for (; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
+    context->instID[l] = RTC_INVALID_GEOMETRY_ID;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    context->instPrimID[l] = RTC_INVALID_GEOMETRY_ID;
+#endif
+  }
 }
 
 struct RTC_ALIGN(16) RTCPointQueryFunctionArguments
@@ -308,7 +458,7 @@ struct RTC_ALIGN(16) RTCPointQueryFunctionArguments
   struct RTCPointQueryContext* context;
 
   // If the current instance transform M (= context->world2inst[context->instStackSize]) 
-  // is a similarity matrix, i.e there is a constant factor similarityScale such that,
+  // is a similarity matrix, i.e there is a constant factor similarityScale such that
   //    for all x,y: dist(Mx, My) = similarityScale * dist(x, y),
   // The similarity scale is 0, if the current instance transform is not a
   // similarity transform and vice versa. The similarity scale allows to compute
@@ -322,5 +472,31 @@ struct RTC_ALIGN(16) RTCPointQueryFunctionArguments
 };
 
 typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args);
-  
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(SYCL_LANGUAGE_VERSION)
+
+/* returns function pointer to be usable in SYCL kernel */
+template<auto F>
+inline decltype(F) rtcGetSYCLDeviceFunctionPointer(sycl::queue& queue)
+{
+  sycl::buffer<cl_ulong> fptr_buf(1);
+  {
+    auto fptr_acc = fptr_buf.get_host_access();
+    fptr_acc[0] = 0;
+  }
+
+  queue.submit([&](sycl::handler& cgh) {
+      auto fptr_acc = fptr_buf.get_access<sycl::access::mode::discard_write>(cgh);
+      cgh.single_task([=]() {
+	  fptr_acc[0] = reinterpret_cast<cl_ulong>(F);
+	});
+    });
+  queue.wait_and_throw();
+
+  auto fptr_acc = fptr_buf.get_host_access();
+  return (decltype(F)) fptr_acc[0];
+}
+
+#endif
+
 RTC_NAMESPACE_END
diff --git a/thirdparty/embree/include/embree3/rtcore_config.h b/thirdparty/embree/include/embree4/rtcore_config.h
index 0b399ef040..cb3a8678a7 100644
--- a/thirdparty/embree/include/embree3/rtcore_config.h
+++ b/thirdparty/embree/include/embree4/rtcore_config.h
@@ -3,21 +3,32 @@
 
 #pragma once
 
-#define RTC_VERSION_MAJOR 3
-#define RTC_VERSION_MINOR 13
-#define RTC_VERSION_PATCH 5
-#define RTC_VERSION 31305
-#define RTC_VERSION_STRING "3.13.5"
+#if !defined(EMBREE_SYCL_SUPPORT)
+// #cmakedefine EMBREE_SYCL_SUPPORT
+#endif
+
+#define RTC_VERSION_MAJOR 4
+#define RTC_VERSION_MINOR 3
+#define RTC_VERSION_PATCH 1
+#define RTC_VERSION 40301
+#define RTC_VERSION_STRING "4.3.1"
 
 #define RTC_MAX_INSTANCE_LEVEL_COUNT 1
 
+// #cmakedefine EMBREE_GEOMETRY_INSTANCE_ARRAY
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+  #define RTC_GEOMETRY_INSTANCE_ARRAY
+#endif
+
+// #cmakedefine01 EMBREE_SYCL_GEOMETRY_CALLBACK
+
 #define EMBREE_MIN_WIDTH 0
 #define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
 
 #if !defined(EMBREE_STATIC_LIB)
-#   define EMBREE_STATIC_LIB
+#define EMBREE_STATIC_LIB
 #endif
-/* #undef EMBREE_API_NAMESPACE*/
+// #cmakedefine EMBREE_API_NAMESPACE
 
 #if defined(EMBREE_API_NAMESPACE)
 #  define RTC_NAMESPACE
@@ -56,3 +67,14 @@
 #else
 #  define RTC_API RTC_API_IMPORT
 #endif
+
+#if defined(ISPC)
+#  define RTC_SYCL_INDIRECTLY_CALLABLE
+#elif defined(__SYCL_DEVICE_ONLY__)
+#  define RTC_SYCL_INDIRECTLY_CALLABLE [[intel::device_indirectly_callable]] SYCL_EXTERNAL
+#  define RTC_SYCL_API                 SYCL_EXTERNAL
+#else
+#  define RTC_SYCL_INDIRECTLY_CALLABLE
+#  define RTC_SYCL_API RTC_API
+#endif
+
diff --git a/thirdparty/embree/include/embree3/rtcore_device.h b/thirdparty/embree/include/embree4/rtcore_device.h
index 2dd3047603..5ca99bca0a 100644
--- a/thirdparty/embree/include/embree3/rtcore_device.h
+++ b/thirdparty/embree/include/embree4/rtcore_device.h
@@ -13,6 +13,24 @@ typedef struct RTCDeviceTy* RTCDevice;
 /* Creates a new Embree device. */
 RTC_API RTCDevice rtcNewDevice(const char* config);
 
+#if defined(EMBREE_SYCL_SUPPORT) && defined(SYCL_LANGUAGE_VERSION)
+
+
+/* Creates a new Embree SYCL device. */
+RTC_API_EXTERN_C RTCDevice rtcNewSYCLDevice(sycl::context context, const char* config);
+
+/* Checks if SYCL device is supported by Embree. */
+RTC_API bool rtcIsSYCLDeviceSupported(const sycl::device sycl_device);
+
+/* SYCL selector for Embree supported devices */
+RTC_API int rtcSYCLDeviceSelector(const sycl::device sycl_device);
+
+/* Set the SYCL device to be used to allocate data */
+RTC_API void rtcSetDeviceSYCLDevice(RTCDevice device, const sycl::device sycl_device);
+
+#endif
+
+
 /* Retains the Embree device (increments the reference count). */
 RTC_API void rtcRetainDevice(RTCDevice device);
   
@@ -30,8 +48,8 @@ enum RTCDeviceProperty
   RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED  = 32,
   RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED  = 33,
   RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED = 34,
-  RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED   = 35,
 
+  RTC_DEVICE_PROPERTY_BACKFACE_CULLING_SPHERES_ENABLED = 62,
   RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED = 63,
   RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED          = 64,
   RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED    = 65,
@@ -66,7 +84,7 @@ enum RTCError
   RTC_ERROR_INVALID_OPERATION = 3,
   RTC_ERROR_OUT_OF_MEMORY     = 4,
   RTC_ERROR_UNSUPPORTED_CPU   = 5,
-  RTC_ERROR_CANCELLED         = 6
+  RTC_ERROR_CANCELLED         = 6,
 };
 
 /* Returns the error code. */
diff --git a/thirdparty/embree/include/embree3/rtcore_geometry.h b/thirdparty/embree/include/embree4/rtcore_geometry.h
index d1de17491c..140d6721d5 100644
--- a/thirdparty/embree/include/embree3/rtcore_geometry.h
+++ b/thirdparty/embree/include/embree4/rtcore_geometry.h
@@ -48,7 +48,8 @@ enum RTCGeometryType
   RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE  = 60, // flat normal-oriented Catmull-Rom curves
 
   RTC_GEOMETRY_TYPE_USER     = 120, // user-defined geometry
-  RTC_GEOMETRY_TYPE_INSTANCE = 121  // scene instance
+  RTC_GEOMETRY_TYPE_INSTANCE = 121,  // scene instance
+  RTC_GEOMETRY_TYPE_INSTANCE_ARRAY = 122,  // scene instance array
 };
 
 /* Interpolation modes for subdivision surfaces */
@@ -86,30 +87,24 @@ struct RTCIntersectFunctionNArguments
   int* valid;
   void* geometryUserPtr;
   unsigned int primID;
-  struct RTCIntersectContext* context;
+  struct RTCRayQueryContext* context;
   struct RTCRayHitN* rayhit;
   unsigned int N;
   unsigned int geomID;
 };
 
-/* Intersection callback function */
-typedef void (*RTCIntersectFunctionN)(const struct RTCIntersectFunctionNArguments* args);
-
 /* Arguments for RTCOccludedFunctionN */
 struct RTCOccludedFunctionNArguments
 {
   int* valid;
   void* geometryUserPtr;
   unsigned int primID;
-  struct RTCIntersectContext* context;
+  struct RTCRayQueryContext* context;
   struct RTCRayN* ray;
   unsigned int N;
   unsigned int geomID;
 };
 
-/* Occlusion callback function */
-typedef void (*RTCOccludedFunctionN)(const struct RTCOccludedFunctionNArguments* args);
-
 /* Arguments for RTCDisplacementFunctionN */
 struct RTCDisplacementFunctionNArguments
 {
@@ -192,6 +187,9 @@ RTC_API void rtcSetGeometryIntersectFilterFunction(RTCGeometry geometry, RTCFilt
 /* Sets the occlusion filter callback function of the geometry. */
 RTC_API void rtcSetGeometryOccludedFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter);
 
+/* Enables argument version of intersection or occlusion filter function. */
+RTC_API void rtcSetGeometryEnableFilterFunctionFromArguments(RTCGeometry geometry, bool enable);
+
 /* Sets the user-defined data pointer of the geometry. */
 RTC_API void rtcSetGeometryUserData(RTCGeometry geometry, void* ptr);
 
@@ -214,15 +212,17 @@ RTC_API void rtcSetGeometryIntersectFunction(RTCGeometry geometry, RTCIntersectF
 RTC_API void rtcSetGeometryOccludedFunction(RTCGeometry geometry, RTCOccludedFunctionN occluded);
 
 /* Invokes the intersection filter from the intersection callback function. */
-RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs);
+RTC_SYCL_API void rtcInvokeIntersectFilterFromGeometry(const struct RTCIntersectFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs);
 
 /* Invokes the occlusion filter from the occlusion callback function. */
-RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs);
-
+RTC_SYCL_API void rtcInvokeOccludedFilterFromGeometry(const struct RTCOccludedFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs);
 
 /* Sets the instanced scene of an instance geometry. */
 RTC_API void rtcSetGeometryInstancedScene(RTCGeometry geometry, RTCScene scene);
 
+/* Sets the instanced scenes of an instance array geometry. */
+RTC_API void rtcSetGeometryInstancedScenes(RTCGeometry geometry, RTCScene* scenes, size_t numScenes);
+
 /* Sets the transformation of an instance for the specified time step. */
 RTC_API void rtcSetGeometryTransform(RTCGeometry geometry, unsigned int timeStep, enum RTCFormat format, const void* xfm);
 
@@ -232,6 +232,12 @@ RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry geometry, unsigned in
 /* Returns the interpolated transformation of an instance for the specified time. */
 RTC_API void rtcGetGeometryTransform(RTCGeometry geometry, float time, enum RTCFormat format, void* xfm);
 
+/*
+ * Returns the interpolated transformation of the instPrimID'th instance of an
+ * instance array for the specified time. If geometry is a regular instance,
+ * instPrimID must be 0.
+ */
+RTC_API void rtcGetGeometryTransformEx(RTCGeometry geometry, unsigned int instPrimID, float time, enum RTCFormat format, void* xfm);
 
 /* Sets the uniform tessellation rate of the geometry. */
 RTC_API void rtcSetGeometryTessellationRate(RTCGeometry geometry, float tessellationRate);
diff --git a/thirdparty/embree/include/embree3/rtcore_quaternion.h b/thirdparty/embree/include/embree4/rtcore_quaternion.h
index bd5fe1d89a..bd5fe1d89a 100644
--- a/thirdparty/embree/include/embree3/rtcore_quaternion.h
+++ b/thirdparty/embree/include/embree4/rtcore_quaternion.h
diff --git a/thirdparty/embree/include/embree3/rtcore_ray.h b/thirdparty/embree/include/embree4/rtcore_ray.h
index a2ee6dabbb..7fc3f00cf9 100644
--- a/thirdparty/embree/include/embree3/rtcore_ray.h
+++ b/thirdparty/embree/include/embree4/rtcore_ray.h
@@ -39,6 +39,9 @@ struct RTC_ALIGN(16) RTCHit
   unsigned int primID; // primitive ID
   unsigned int geomID; // geometry ID
   unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  unsigned int instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance primitive ID
+#endif
 };
 
 /* Combined ray/hit structure for a single ray */
@@ -80,6 +83,9 @@ struct RTC_ALIGN(16) RTCHit4
   unsigned int primID[4];
   unsigned int geomID[4];
   unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][4];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  unsigned int instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT][4];
+#endif
 };
 
 /* Combined ray/hit structure for a packet of 4 rays */
@@ -121,6 +127,9 @@ struct RTC_ALIGN(32) RTCHit8
   unsigned int primID[8];
   unsigned int geomID[8];
   unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][8];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  unsigned int instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT][8];
+#endif
 };
 
 /* Combined ray/hit structure for a packet of 8 rays */
@@ -162,6 +171,9 @@ struct RTC_ALIGN(64) RTCHit16
   unsigned int primID[16];
   unsigned int geomID[16];
   unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][16];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  unsigned int instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT][16];
+#endif
 };
 
 /* Combined ray/hit structure for a packet of 16 rays */
@@ -171,47 +183,6 @@ struct RTCRayHit16
   struct RTCHit16 hit;
 };
 
-/* Ray structure for a packet/stream of N rays in pointer SOA layout */
-struct RTCRayNp
-{
-  float* org_x;
-  float* org_y;
-  float* org_z;
-  float* tnear;
-
-  float* dir_x;
-  float* dir_y;
-  float* dir_z;
-  float* time;
-
-  float* tfar;
-  unsigned int* mask;
-  unsigned int* id;
-  unsigned int* flags;
-};
-
-/* Hit structure for a packet/stream of N rays in pointer SOA layout */
-struct RTCHitNp
-{
-  float* Ng_x;
-  float* Ng_y;
-  float* Ng_z;
-
-  float* u;
-  float* v;
-
-  unsigned int* primID;
-  unsigned int* geomID;
-  unsigned int* instID[RTC_MAX_INSTANCE_LEVEL_COUNT];
-};
-
-/* Combined ray/hit structure for a packet/stream of N rays in pointer SOA layout */
-struct RTCRayHitNp
-{
-  struct RTCRayNp ray;
-  struct RTCHitNp hit;
-};
-
 struct RTCRayN;
 struct RTCHitN;
 struct RTCRayHitN;
@@ -242,9 +213,12 @@ RTC_FORCEINLINE float& RTCHitN_Ng_z(RTCHitN* hit, unsigned int N, unsigned int i
 RTC_FORCEINLINE float& RTCHitN_u(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[3*N+i]; }
 RTC_FORCEINLINE float& RTCHitN_v(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[4*N+i]; }
 
-RTC_FORCEINLINE unsigned int& RTCHitN_primID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[5*N+i]; }
-RTC_FORCEINLINE unsigned int& RTCHitN_geomID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[6*N+i]; }
-RTC_FORCEINLINE unsigned int& RTCHitN_instID(RTCHitN* hit, unsigned int N, unsigned int i, unsigned int l) { return ((unsigned*)hit)[7*N+i+N*l]; }
+RTC_FORCEINLINE unsigned int& RTCHitN_primID    (RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[5*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCHitN_geomID    (RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[6*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCHitN_instID    (RTCHitN* hit, unsigned int N, unsigned int i, unsigned int l) { return ((unsigned*)hit)[7*N +                                  N*l + i]; }
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+RTC_FORCEINLINE unsigned int& RTCHitN_instPrimID(RTCHitN* hit, unsigned int N, unsigned int i, unsigned int l) { return ((unsigned*)hit)[7*N + N*RTC_MAX_INSTANCE_LEVEL_COUNT + N*l + i]; }
+#endif
 
 /* Helper functions to extract RTCRayN and RTCHitN from RTCRayHitN */
 RTC_FORCEINLINE RTCRayN* RTCRayHitN_RayN(RTCRayHitN* rayhit, unsigned int N) { return (RTCRayN*)&((float*)rayhit)[0*N]; }
@@ -284,6 +258,9 @@ struct RTCHitNt
   unsigned int primID[N];
   unsigned int geomID[N];
   unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][N];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  unsigned int instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT][N];
+#endif
 };
 
 /* Helper structure for a combined ray/hit packet of compile-time size N */
@@ -322,8 +299,12 @@ RTC_FORCEINLINE RTCHit rtcGetHitFromHitN(RTCHitN* hitN, unsigned int N, unsigned
   hit.v      = RTCHitN_v(hitN,N,i);
   hit.primID = RTCHitN_primID(hitN,N,i);
   hit.geomID = RTCHitN_geomID(hitN,N,i);
-  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++)
+  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) {
     hit.instID[l] = RTCHitN_instID(hitN,N,i,l);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    hit.instPrimID[l] = RTCHitN_instPrimID(hitN,N,i,l);
+#endif
+  }
   return hit;
 }
 
@@ -336,8 +317,12 @@ RTC_FORCEINLINE void rtcCopyHitToHitN(RTCHitN* hitN, const RTCHit* hit, unsigned
   RTCHitN_v(hitN,N,i)      = hit->v;
   RTCHitN_primID(hitN,N,i) = hit->primID;
   RTCHitN_geomID(hitN,N,i) = hit->geomID;
-  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++)
+  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) {
     RTCHitN_instID(hitN,N,i,l) = hit->instID[l];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    RTCHitN_instPrimID(hitN,N,i,l) = hit->instPrimID[l];
+#endif
+  }
 }
 
 RTC_FORCEINLINE RTCRayHit rtcGetRayHitFromRayHitN(RTCRayHitN* rayhitN, unsigned int N, unsigned int i)
@@ -366,8 +351,12 @@ RTC_FORCEINLINE RTCRayHit rtcGetRayHitFromRayHitN(RTCRayHitN* rayhitN, unsigned
   rh.hit.v      = RTCHitN_v(hit,N,i);
   rh.hit.primID = RTCHitN_primID(hit,N,i);
   rh.hit.geomID = RTCHitN_geomID(hit,N,i);
-  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++)
+  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) {
     rh.hit.instID[l] = RTCHitN_instID(hit,N,i,l);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    rh.hit.instPrimID[l] = RTCHitN_instPrimID(hit,N,i,l);
+#endif
+  }
 
   return rh;
 }
diff --git a/thirdparty/embree/include/embree4/rtcore_scene.h b/thirdparty/embree/include/embree4/rtcore_scene.h
new file mode 100644
index 0000000000..e37af4f962
--- /dev/null
+++ b/thirdparty/embree/include/embree4/rtcore_scene.h
@@ -0,0 +1,252 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_device.h"
+
+RTC_NAMESPACE_BEGIN
+  
+/* Forward declarations for ray structures */
+struct RTCRayHit;
+struct RTCRayHit4;
+struct RTCRayHit8;
+struct RTCRayHit16;
+
+/* Scene flags */
+enum RTCSceneFlags
+{
+  RTC_SCENE_FLAG_NONE                    = 0,
+  RTC_SCENE_FLAG_DYNAMIC                 = (1 << 0),
+  RTC_SCENE_FLAG_COMPACT                 = (1 << 1),
+  RTC_SCENE_FLAG_ROBUST                  = (1 << 2),
+  RTC_SCENE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS = (1 << 3)
+};
+
+/* Additional arguments for rtcIntersect1/4/8/16 calls */
+struct RTCIntersectArguments
+{
+  enum RTCRayQueryFlags flags;     // intersection flags
+  enum RTCFeatureFlags feature_mask;       // selectively enable features for traversal
+  struct RTCRayQueryContext* context;     // optional pointer to ray query context
+  RTCFilterFunctionN filter;               // filter function to execute
+  RTCIntersectFunctionN intersect;         // user geometry intersection callback to execute
+#if RTC_MIN_WIDTH
+  float minWidthDistanceFactor;            // curve radius is set to this factor times distance to ray origin
+#endif
+};
+
+/* Initializes intersection arguments. */
+RTC_FORCEINLINE void rtcInitIntersectArguments(struct RTCIntersectArguments* args)
+{
+  args->flags = RTC_RAY_QUERY_FLAG_INCOHERENT;
+  args->feature_mask = RTC_FEATURE_FLAG_ALL;
+  args->context = NULL;
+  args->filter = NULL;
+  args->intersect = NULL;
+
+#if RTC_MIN_WIDTH
+  args->minWidthDistanceFactor = 0.0f;
+#endif
+}
+
+/* Additional arguments for rtcOccluded1/4/8/16 calls */
+struct RTCOccludedArguments
+{
+  enum RTCRayQueryFlags flags;     // intersection flags
+  enum RTCFeatureFlags feature_mask;       // selectively enable features for traversal
+  struct RTCRayQueryContext* context;     // optional pointer to ray query context
+  RTCFilterFunctionN filter;               // filter function to execute
+  RTCOccludedFunctionN occluded;           // user geometry occlusion callback to execute
+
+#if RTC_MIN_WIDTH
+  float minWidthDistanceFactor;            // curve radius is set to this factor times distance to ray origin
+#endif
+};
+
+/* Initializes occlusion arguments. */
+RTC_FORCEINLINE void rtcInitOccludedArguments(struct RTCOccludedArguments* args)
+{
+  args->flags = RTC_RAY_QUERY_FLAG_INCOHERENT;
+  args->feature_mask = RTC_FEATURE_FLAG_ALL;
+  args->context = NULL;
+  args->filter = NULL;
+  args->occluded = NULL;
+
+#if RTC_MIN_WIDTH
+  args->minWidthDistanceFactor = 0.0f;
+#endif
+}
+
+/* Creates a new scene. */
+RTC_API RTCScene rtcNewScene(RTCDevice device);
+
+/* Returns the device the scene got created in. The reference count of
+ * the device is incremented by this function. */
+RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene);
+   
+/* Retains the scene (increments the reference count). */
+RTC_API void rtcRetainScene(RTCScene scene);
+
+/* Releases the scene (decrements the reference count). */
+RTC_API void rtcReleaseScene(RTCScene scene);
+
+
+/* Attaches the geometry to a scene. */
+RTC_API unsigned int rtcAttachGeometry(RTCScene scene, RTCGeometry geometry);
+
+/* Attaches the geometry to a scene using the specified geometry ID. */
+RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigned int geomID);
+
+/* Detaches the geometry from the scene. */
+RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID);
+
+/* Gets a geometry handle from the scene. This function is not thread safe and should get used during rendering. */
+RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID);
+
+/* Gets a geometry handle from the scene. This function is thread safe and should NOT get used during rendering. */
+RTC_API RTCGeometry rtcGetGeometryThreadSafe(RTCScene scene, unsigned int geomID);
+
+/* Gets the user-defined data pointer of the geometry. This function is not thread safe and should get used during rendering. */
+RTC_SYCL_API void* rtcGetGeometryUserDataFromScene(RTCScene scene, unsigned int geomID);
+
+/* Returns the interpolated transformation of an instance for the specified time. */
+RTC_SYCL_API void rtcGetGeometryTransformFromScene(RTCScene scene, unsigned int geomID, float time, enum RTCFormat format, void* xfm);
+
+
+/* Commits the scene. */
+RTC_API void rtcCommitScene(RTCScene scene);
+
+/* Commits the scene from multiple threads. */
+RTC_API void rtcJoinCommitScene(RTCScene scene);
+
+
+/* Progress monitor callback function */
+typedef bool (*RTCProgressMonitorFunction)(void* ptr, double n);
+
+/* Sets the progress monitor callback function of the scene. */
+RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene scene, RTCProgressMonitorFunction progress, void* ptr);
+
+/* Sets the build quality of the scene. */
+RTC_API void rtcSetSceneBuildQuality(RTCScene scene, enum RTCBuildQuality quality);
+
+/* Sets the scene flags. */
+RTC_API void rtcSetSceneFlags(RTCScene scene, enum RTCSceneFlags flags);
+
+/* Returns the scene flags. */
+RTC_API enum RTCSceneFlags rtcGetSceneFlags(RTCScene scene);
+
+/* Returns the axis-aligned bounds of the scene. */
+RTC_API void rtcGetSceneBounds(RTCScene scene, struct RTCBounds* bounds_o);
+
+/* Returns the linear axis-aligned bounds of the scene. */
+RTC_API void rtcGetSceneLinearBounds(RTCScene scene, struct RTCLinearBounds* bounds_o);
+
+
+/* Perform a closest point query of the scene. */
+RTC_API bool rtcPointQuery(RTCScene scene, struct RTCPointQuery* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void* userPtr);
+
+/* Perform a closest point query with a packet of 4 points with the scene. */
+RTC_API bool rtcPointQuery4(const int* valid, RTCScene scene, struct RTCPointQuery4* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Perform a closest point query with a packet of 8 points with the scene. */
+RTC_API bool rtcPointQuery8(const int* valid, RTCScene scene, struct RTCPointQuery8* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Perform a closest point query with a packet of 16 points with the scene. */
+RTC_API bool rtcPointQuery16(const int* valid, RTCScene scene, struct RTCPointQuery16* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+
+/* Intersects a single ray with the scene. */
+RTC_SYCL_API void rtcIntersect1(RTCScene scene, struct RTCRayHit* rayhit, struct RTCIntersectArguments* args RTC_OPTIONAL_ARGUMENT);
+
+/* Intersects a packet of 4 rays with the scene. */
+RTC_API void rtcIntersect4(const int* valid, RTCScene scene, struct RTCRayHit4* rayhit, struct RTCIntersectArguments* args RTC_OPTIONAL_ARGUMENT);
+
+/* Intersects a packet of 8 rays with the scene. */
+RTC_API void rtcIntersect8(const int* valid, RTCScene scene, struct RTCRayHit8* rayhit, struct RTCIntersectArguments* args RTC_OPTIONAL_ARGUMENT);
+
+/* Intersects a packet of 16 rays with the scene. */
+RTC_API void rtcIntersect16(const int* valid, RTCScene scene, struct RTCRayHit16* rayhit, struct RTCIntersectArguments* args RTC_OPTIONAL_ARGUMENT);
+
+
+/* Forwards ray inside user geometry callback. */
+RTC_SYCL_API void rtcForwardIntersect1(const struct RTCIntersectFunctionNArguments* args, RTCScene scene, struct RTCRay* ray, unsigned int instID);
+
+/* Forwards ray inside user geometry callback. Extended to handle instance arrays using instPrimID parameter. */
+RTC_SYCL_API void rtcForwardIntersect1Ex(const struct RTCIntersectFunctionNArguments* args, RTCScene scene, struct RTCRay* ray, unsigned int instID, unsigned int instPrimID);
+
+/* Forwards ray packet of size 4 inside user geometry callback. */
+RTC_API void rtcForwardIntersect4(const int* valid, const struct RTCIntersectFunctionNArguments* args, RTCScene scene, struct RTCRay4* ray, unsigned int instID);
+
+/* Forwards ray packet of size 4 inside user geometry callback. Extended to handle instance arrays using instPrimID parameter. */
+RTC_API void rtcForwardIntersect4Ex(const int* valid, const struct RTCIntersectFunctionNArguments* args, RTCScene scene, struct RTCRay4* ray, unsigned int instID, unsigned int instPrimID);
+
+/* Forwards ray packet of size 8 inside user geometry callback. */
+RTC_API void rtcForwardIntersect8(const int* valid, const struct RTCIntersectFunctionNArguments* args, RTCScene scene, struct RTCRay8* ray, unsigned int instID);
+
+/* Forwards ray packet of size 8 inside user geometry callback. Extended to handle instance arrays using instPrimID parameter. */
+RTC_API void rtcForwardIntersect8Ex(const int* valid, const struct RTCIntersectFunctionNArguments* args, RTCScene scene, struct RTCRay8* ray, unsigned int instID, unsigned int instPrimID);
+
+/* Forwards ray packet of size 16 inside user geometry callback. */
+RTC_API void rtcForwardIntersect16(const int* valid, const struct RTCIntersectFunctionNArguments* args, RTCScene scene, struct RTCRay16* ray, unsigned int instID);
+
+/* Forwards ray packet of size 16 inside user geometry callback. Extended to handle instance arrays using instPrimID parameter. */
+RTC_API void rtcForwardIntersect16Ex(const int* valid, const struct RTCIntersectFunctionNArguments* args, RTCScene scene, struct RTCRay16* ray, unsigned int instID, unsigned int instPrimID);
+
+
+/* Tests a single ray for occlusion with the scene. */
+RTC_SYCL_API void rtcOccluded1(RTCScene scene, struct RTCRay* ray, struct RTCOccludedArguments* args RTC_OPTIONAL_ARGUMENT);
+
+/* Tests a packet of 4 rays for occlusion with the scene. */
+RTC_API void rtcOccluded4(const int* valid, RTCScene scene, struct RTCRay4* ray, struct RTCOccludedArguments* args RTC_OPTIONAL_ARGUMENT);
+
+/* Tests a packet of 8 rays for occlusion with the scene. */
+RTC_API void rtcOccluded8(const int* valid, RTCScene scene, struct RTCRay8* ray, struct RTCOccludedArguments* args RTC_OPTIONAL_ARGUMENT);
+
+/* Tests a packet of 16 rays for occlusion with the scene. */
+RTC_API void rtcOccluded16(const int* valid, RTCScene scene, struct RTCRay16* ray, struct RTCOccludedArguments* args RTC_OPTIONAL_ARGUMENT);
+
+
+/* Forwards single occlusion ray inside user geometry callback. */
+RTC_SYCL_API void rtcForwardOccluded1(const struct RTCOccludedFunctionNArguments* args, RTCScene scene, struct RTCRay* ray, unsigned int instID);
+
+/* Forwards single occlusion ray inside user geometry callback. Extended to handle instance arrays using instPrimID parameter. */
+RTC_SYCL_API void rtcForwardOccluded1Ex(const struct RTCOccludedFunctionNArguments* args, RTCScene scene, struct RTCRay* ray, unsigned int instID, unsigned int instPrimID);
+
+/* Forwards occlusion ray packet of size 4 inside user geometry callback. */
+RTC_API void rtcForwardOccluded4(const int* valid, const struct RTCOccludedFunctionNArguments* args, RTCScene scene, struct RTCRay4* ray, unsigned int instID);
+
+/* Forwards occlusion ray packet of size 4 inside user geometry callback. Extended to handle instance arrays using instPrimID parameter. */
+RTC_API void rtcForwardOccluded4Ex(const int* valid, const struct RTCOccludedFunctionNArguments* args, RTCScene scene, struct RTCRay4* ray, unsigned int instID, unsigned int instPrimID);
+
+/* Forwards occlusion ray packet of size 8 inside user geometry callback. */
+RTC_API void rtcForwardOccluded8(const int* valid, const struct RTCOccludedFunctionNArguments* args, RTCScene scene, struct RTCRay8* ray, unsigned int instID);
+
+/* Forwards occlusion ray packet of size 8 inside user geometry callback. Extended to handle instance arrays using instPrimID parameter. */
+RTC_API void rtcForwardOccluded8Ex(const int* valid, const struct RTCOccludedFunctionNArguments* args, RTCScene scene, struct RTCRay8* ray, unsigned int instID, unsigned int instPrimID);
+
+/* Forwards occlusion ray packet of size 16 inside user geometry callback. */
+RTC_API void rtcForwardOccluded16(const int* valid, const struct RTCOccludedFunctionNArguments* args, RTCScene scene, struct RTCRay16* ray, unsigned int instID);
+
+/* Forwards occlusion ray packet of size 16 inside user geometry callback. Extended to handle instance arrays using instPrimID parameter. */
+RTC_API void rtcForwardOccluded16Ex(const int* valid, const struct RTCOccludedFunctionNArguments* args, RTCScene scene, struct RTCRay16* ray, unsigned int instID, unsigned int instPrimID);
+
+
+/*! collision callback */
+struct RTCCollision { unsigned int geomID0; unsigned int primID0; unsigned int geomID1; unsigned int primID1; };
+typedef void (*RTCCollideFunc) (void* userPtr, struct RTCCollision* collisions, unsigned int num_collisions);
+
+/*! Performs collision detection of two scenes */
+RTC_API void rtcCollide (RTCScene scene0, RTCScene scene1, RTCCollideFunc callback, void* userPtr);
+ 
+#if defined(__cplusplus)
+
+/* Helper for easily combining scene flags */
+inline RTCSceneFlags operator|(RTCSceneFlags a, RTCSceneFlags b) {
+  return (RTCSceneFlags)((size_t)a | (size_t)b);
+}
+
+#endif
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_morton.h b/thirdparty/embree/kernels/builders/bvh_builder_morton.h
index cba32ca73c..87d4786810 100644
--- a/thirdparty/embree/kernels/builders/bvh_builder_morton.h
+++ b/thirdparty/embree/kernels/builders/bvh_builder_morton.h
@@ -5,6 +5,7 @@
 
 #include "../common/builder.h"
 #include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_sort.h"
 
 namespace embree
 {
@@ -101,7 +102,7 @@ namespace embree
         }
       };
 
-#if defined (__AVX2__)
+#if defined (__AVX2__) || defined(__SYCL_DEVICE_ONLY__)
 
       /*! for AVX2 there is a fast scalar bitInterleave */
       struct MortonCodeGenerator
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
index 6e73c0d250..d4e3388db5 100644
--- a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
+++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
@@ -7,7 +7,7 @@
 #define MBLUR_NUM_OBJECT_BINS   32
 
 #include "../bvh/bvh.h"
-#include "../common/primref_mb.h"
+#include "../builders/primref_mb.h"
 #include "heuristic_binning_array_aligned.h"
 #include "heuristic_timesplit_array.h"
 
@@ -141,16 +141,17 @@ namespace embree
     struct VirtualRecalculatePrimRef
     {
       Scene* scene;
+      const SubGridBuildData * const sgrids;
       
-      __forceinline VirtualRecalculatePrimRef (Scene* scene)
-        : scene(scene) {}
+      __forceinline VirtualRecalculatePrimRef (Scene* scene, const SubGridBuildData * const sgrids = nullptr)
+        : scene(scene), sgrids(sgrids) {}
       
       __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
       {
         const unsigned geomID = prim.geomID();
         const unsigned primID = prim.primID();
         const Geometry* mesh = scene->get(geomID);
-        const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range);
+        const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range, sgrids);
         const range<int> tbounds = mesh->timeSegmentRange(time_range);
         return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
       }
@@ -166,7 +167,7 @@ namespace embree
       }
       
       __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
-        return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range);
+        return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range, sgrids);
       }
       
       __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const {
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_sah.h b/thirdparty/embree/kernels/builders/bvh_builder_sah.h
index 24c5faf8be..7a0c0bdb8e 100644
--- a/thirdparty/embree/kernels/builders/bvh_builder_sah.h
+++ b/thirdparty/embree/kernels/builders/bvh_builder_sah.h
@@ -7,13 +7,8 @@
 #include "heuristic_spatial_array.h"
 #include "heuristic_openmerge_array.h"
 
-#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
-#  define NUM_OBJECT_BINS 16
-#  define NUM_SPATIAL_BINS 16
-#else
-#  define NUM_OBJECT_BINS 32
-#  define NUM_SPATIAL_BINS 16
-#endif
+#define NUM_OBJECT_BINS 32
+#define NUM_SPATIAL_BINS 16
 
 namespace embree
 {
diff --git a/thirdparty/embree/kernels/builders/heuristic_binning.h b/thirdparty/embree/kernels/builders/heuristic_binning.h
index 41be6183b8..d66726d09b 100644
--- a/thirdparty/embree/kernels/builders/heuristic_binning.h
+++ b/thirdparty/embree/kernels/builders/heuristic_binning.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "priminfo.h"
+#include "priminfo_mb.h"
 #include "../../common/algorithms/parallel_reduce.h"
 #include "../../common/algorithms/parallel_partition.h"
 
@@ -390,6 +391,63 @@ namespace embree
         }
 	return Split(bestSAH,bestDim,bestPos,mapping);
       }
+
+      /*! finds the best split by scanning binning information, with primitive counts rounded up to whole blocks of blockSize primitives */
+      __forceinline Split best_block_size(const BinMapping<BINS>& mapping, const size_t blockSize) const
+      {
+	/* sweep from right to left and compute parallel prefix of merged bounds */
+	vfloat4 rAreas[BINS];
+	vuint4 rCounts[BINS];
+	vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty;
+	for (size_t i=mapping.size()-1; i>0; i--)
+        {
+          count += counts(i);
+          rCounts[i] = count;
+          bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx);
+          by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by);
+          bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz);
+          rAreas[i][3] = 0.0f;
+        }
+	/* sweep from left to right and compute SAH */
+	vuint4 blocks_add = blockSize-1;
+        vfloat4 blocks_factor = 1.0f/float(blockSize);
+	vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; 
+	count = 0; bx = empty; by = empty; bz = empty;
+	for (size_t i=1; i<mapping.size(); i++, ii+=1)
+        {
+          count += counts(i-1);
+          bx.extend(bounds(i-1,0)); float Ax = expectedApproxHalfArea(bx);
+          by.extend(bounds(i-1,1)); float Ay = expectedApproxHalfArea(by);
+          bz.extend(bounds(i-1,2)); float Az = expectedApproxHalfArea(bz);
+          const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
+          const vfloat4 rArea = rAreas[i];
+          const vfloat4 lCount = floor(vfloat4(count     +blocks_add)*blocks_factor);
+          const vfloat4 rCount = floor(vfloat4(rCounts[i]+blocks_add)*blocks_factor);
+          const vfloat4 sah = madd(lArea,lCount,rArea*rCount);
+
+          vbestPos = select(sah < vbestSAH,ii ,vbestPos);
+          vbestSAH = select(sah < vbestSAH,sah,vbestSAH);
+        }
+	
+	/* find best dimension */
+	float bestSAH = inf;
+	int   bestDim = -1;
+	int   bestPos = 0;
+	for (int dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+          
+          /* test if this is a better dimension */
+          if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+            bestDim = dim;
+            bestPos = vbestPos[dim];
+            bestSAH = vbestSAH[dim];
+          }
+        }
+	return Split(bestSAH,bestDim,bestPos,mapping);
+      }
       
       /*! calculates extended split information */
       __forceinline void getSplitInfo(const BinMapping<BINS>& mapping, const Split& split, SplitInfoT<BBox>& info) const 
diff --git a/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h
index ab3b97efb9..51dda9b49d 100644
--- a/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h
+++ b/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h
@@ -22,6 +22,9 @@ namespace embree
 
       __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds)
         : CentGeomBBox3fa(centGeomBounds), range<size_t>(begin,end) {}
+
+      __forceinline PrimInfoRange (range<size_t> r, const CentGeomBBox3fa& centGeomBounds)
+        : CentGeomBBox3fa(centGeomBounds), range<size_t>(r) {}
       
       __forceinline float leafSAH() const { 
 	return expectedApproxHalfArea(geomBounds)*float(size()); 
@@ -30,7 +33,45 @@ namespace embree
       __forceinline float leafSAH(size_t block_shift) const { 
 	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
       }
+
+      __forceinline range<size_t> get_range() const {
+        return range<size_t>(begin(),end());
+      }
+
+      template<typename PrimRef> 
+        __forceinline void add_primref(const PrimRef& prim) 
+      {
+        CentGeomBBox3fa::extend_primref(prim);
+        _end++;
+      }
     };
+
+    inline void performFallbackSplit(PrimRef* const prims, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+    {
+      const size_t begin = pinfo.begin();
+      const size_t end   = pinfo.end();
+      const size_t center = (begin + end)/2;
+      
+      CentGeomBBox3fa left(empty);
+      for (size_t i=begin; i<center; i++)
+        left.extend_center2(prims[i]);
+      new (&linfo) PrimInfoRange(begin,center,left);
+      
+      CentGeomBBox3fa right(empty);
+      for (size_t i=center; i<end; i++)
+        right.extend_center2(prims[i]);
+      new (&rinfo) PrimInfoRange(center,end,right);
+    }
+
+    template<typename Type, typename getTypeFunc>
+    inline void performTypeSplit(const getTypeFunc& getType, Type type, PrimRef* const prims, range<size_t> range, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+    {
+      CentGeomBBox3fa local_left(empty), local_right(empty);
+      auto isLeft = [&] (const PrimRef& ref) { return type == getType(ref.geomID()); };
+      const size_t center = serial_partitioning(prims,range.begin(),range.end(),local_left,local_right,isLeft,CentGeomBBox3fa::extend_ref);
+      linfo = PrimInfoRange(make_range(range.begin(),center     ),local_left);
+      rinfo = PrimInfoRange(make_range(center       ,range.end()),local_right);
+    }
     
     /*! Performs standard object binning */
     template<typename PrimRef, size_t BINS>
@@ -69,6 +110,24 @@ namespace embree
           return binner.best(mapping,logBlockSize);
         }
 
+        /*! finds the best split */
+        __noinline const Split find_block_size(const PrimInfoRange& pinfo, const size_t blockSize)
+        {
+          if (likely(pinfo.size() < PARALLEL_THRESHOLD))
+            return find_block_size_template<false>(pinfo,blockSize);
+          else
+            return find_block_size_template<true>(pinfo,blockSize);
+        }
+
+        template<bool parallel>
+        __forceinline const Split find_block_size_template(const PrimInfoRange& pinfo, const size_t blockSize)
+        {
+          Binner binner(empty);
+          const BinMapping<BINS> mapping(pinfo);
+          bin_serial_or_parallel<parallel>(binner,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,mapping);
+          return binner.best_block_size(mapping,blockSize);
+        }
+
         /*! array partitioning */
         __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
         {
@@ -121,21 +180,8 @@ namespace embree
           std::sort(&prims[pinfo.begin()],&prims[pinfo.end()]);
         }
 
-        void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
-        {
-          const size_t begin = pinfo.begin();
-          const size_t end   = pinfo.end();
-          const size_t center = (begin + end)/2;
-
-          CentGeomBBox3fa left(empty);
-          for (size_t i=begin; i<center; i++)
-            left.extend_center2(prims[i]);
-          new (&linfo) PrimInfoRange(begin,center,left);
-
-          CentGeomBBox3fa right(empty);
-          for (size_t i=center; i<end; i++)
-            right.extend_center2(prims[i]);
-          new (&rinfo) PrimInfoRange(center,end,right);
+        void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) {
+          performFallbackSplit(prims,pinfo,linfo,rinfo);
         }
 
         void splitByGeometry(const range<size_t>& range, PrimInfoRange& linfo, PrimInfoRange& rinfo)
@@ -156,6 +202,8 @@ namespace embree
         PrimRef* const prims;
       };
 
+#if !defined(RTHWIF_STANDALONE)
+    
     /*! Performs standard object binning */
     template<typename PrimRefMB, size_t BINS>
       struct HeuristicArrayBinningMB
@@ -196,5 +244,6 @@ namespace embree
           new (&rset) SetMB(right,set.prims,range<size_t>(center,end  ),set.time_range);
         }
       };
+#endif
   }
 }
diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial.h b/thirdparty/embree/kernels/builders/heuristic_spatial.h
index 8b3499ac8d..25abf025ae 100644
--- a/thirdparty/embree/kernels/builders/heuristic_spatial.h
+++ b/thirdparty/embree/kernels/builders/heuristic_spatial.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include "../common/scene.h"
 #include "priminfo.h"
 
 namespace embree
diff --git a/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h b/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h
index b968e01c90..fe97862eb6 100644
--- a/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h
+++ b/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "../common/primref_mb.h"
+#include "../builders/primref_mb.h"
 #include "../../common/algorithms/parallel_filter.h"
 
 #define MBLUR_TIME_SPLIT_THRESHOLD 1.25f
diff --git a/thirdparty/embree/kernels/builders/priminfo.h b/thirdparty/embree/kernels/builders/priminfo.h
index fee515247a..52f035e869 100644
--- a/thirdparty/embree/kernels/builders/priminfo.h
+++ b/thirdparty/embree/kernels/builders/priminfo.h
@@ -3,9 +3,7 @@
 
 #pragma once
 
-#include "../common/default.h"
-#include "../common/primref.h"
-#include "../common/primref_mb.h"
+#include "primref.h"
 
 namespace embree
 {
@@ -41,6 +39,10 @@ namespace embree
         centBounds.extend(center);
       }
 
+      static void extend_ref (CentGeom& pinfo, const PrimRef& ref) {
+        pinfo.extend_primref(ref);
+      };
+      
        template<typename PrimRef> 
          __forceinline void extend_center2(const PrimRef& prim) 
        {
@@ -84,6 +86,9 @@ namespace embree
       __forceinline PrimInfoT (EmptyTy) 
 	: CentGeom<BBox>(empty), begin(0), end(0) {}
 
+      __forceinline PrimInfoT (size_t N) 
+	: CentGeom<BBox>(empty), begin(0), end(N) {}
+
       __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) 
         : CentGeom<BBox>(centGeomBounds), begin(begin), end(end) {}
 
@@ -158,205 +163,5 @@ namespace embree
 
     typedef PrimInfoT<BBox3fa> PrimInfo;
     //typedef PrimInfoT<LBBox3fa> PrimInfoMB;
-
-    /*! stores bounding information for a set of primitives */
-    template<typename BBox>
-      class PrimInfoMBT : public CentGeom<BBox>
-    {
-    public:
-      using CentGeom<BBox>::geomBounds;
-      using CentGeom<BBox>::centBounds;
-
-      __forceinline PrimInfoMBT () {
-      } 
-
-      __forceinline PrimInfoMBT (EmptyTy)
-        : CentGeom<BBox>(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
-
-      __forceinline PrimInfoMBT (size_t begin, size_t end)
-        : CentGeom<BBox>(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
-
-      template<typename PrimRef> 
-        __forceinline void add_primref(const PrimRef& prim) 
-      {
-        CentGeom<BBox>::extend_primref(prim);
-        time_range.extend(prim.time_range);
-        object_range._end++;
-        num_time_segments += prim.size();
-        if (max_num_time_segments < prim.totalTimeSegments()) {
-          max_num_time_segments = prim.totalTimeSegments();
-          max_time_range = prim.time_range;
-        }
-      }
-
-      __forceinline void merge(const PrimInfoMBT& other)
-      {
-        CentGeom<BBox>::merge(other);
-        time_range.extend(other.time_range);
-        object_range._begin += other.object_range.begin();
-        object_range._end += other.object_range.end();
-        num_time_segments += other.num_time_segments;
-        if (max_num_time_segments < other.max_num_time_segments) {
-          max_num_time_segments = other.max_num_time_segments;
-          max_time_range = other.max_time_range;
-        }
-      }
-
-      static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) {
-        PrimInfoMBT r = a; r.merge(b); return r;
-      }
-
-      __forceinline size_t begin() const {
-        return object_range.begin();
-      }
-
-      __forceinline size_t end() const {
-        return object_range.end();
-      }
-      
-      /*! returns the number of primitives */
-      __forceinline size_t size() const { 
-	return object_range.size(); 
-      }
-
-      __forceinline float halfArea() const {
-        return time_range.size()*expectedApproxHalfArea(geomBounds);
-      }
-
-      __forceinline float leafSAH() const { 
-	return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); 
-      }
-      
-      __forceinline float leafSAH(size_t block_shift) const { 
-	return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<<block_shift)-1) >> block_shift);
-      }
-
-      __forceinline float align_time(float ct) const
-      {
-        //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments);
-        float t0 = (ct-max_time_range.lower)/max_time_range.size();
-        float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments);
-        return t1*max_time_range.size()+max_time_range.lower;
-      }
-      
-      /*! stream output */
-      friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) 
-      {
-	return cout << "PrimInfo { " << 
-          "object_range = " << pinfo.object_range << 
-          ", time_range = " << pinfo.time_range << 
-          ", time_segments = " << pinfo.num_time_segments << 
-          ", geomBounds = " << pinfo.geomBounds << 
-          ", centBounds = " << pinfo.centBounds << 
-          "}";
-      }
-      
-    public:
-      range<size_t> object_range; //!< primitive range
-      size_t num_time_segments;  //!< total number of time segments of all added primrefs
-      size_t max_num_time_segments; //!< maximum number of time segments of a primitive
-      BBox1f max_time_range; //!< time range of primitive with max_num_time_segments
-      BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB
-    };
-
-    typedef PrimInfoMBT<typename PrimRefMB::BBox> PrimInfoMB;
-
-    struct SetMB : public PrimInfoMB
-    {
-      static const size_t PARALLEL_THRESHOLD = 3 * 1024;
-      static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
-      static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
-
-      typedef mvector<PrimRefMB>* PrimRefVector;
-
-      __forceinline SetMB() {}
-
-       __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims)
-         : PrimInfoMB(pinfo_i), prims(prims) {}
-
-      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range<size_t> object_range_in, BBox1f time_range_in)
-        : PrimInfoMB(pinfo_i), prims(prims)
-      {
-        object_range = object_range_in;
-        time_range = intersect(time_range,time_range_in);
-      }
-      
-      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in)
-        : PrimInfoMB(pinfo_i), prims(prims)
-      {
-        time_range = intersect(time_range,time_range_in);
-      }
-
-      void deterministic_order() const 
-      {
-        /* required as parallel partition destroys original primitive order */
-        PrimRefMB* prim = prims->data();
-        std::sort(&prim[object_range.begin()],&prim[object_range.end()]);
-      }
-
-      template<typename RecalculatePrimRef>
-      __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const
-      {
-        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
-        {
-          LBBox3fa cbounds(empty);
-          for (size_t j = r.begin(); j < r.end(); j++)
-          {
-            PrimRefMB& ref = (*prims)[j];
-            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range);
-            cbounds.extend(bn);
-          };
-          return cbounds;
-        };
-        
-        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
-                               reduce,
-                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
-      }
-
-      template<typename RecalculatePrimRef>
-        __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
-      {
-        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
-        {
-          LBBox3fa cbounds(empty);
-          for (size_t j = r.begin(); j < r.end(); j++)
-          {
-            PrimRefMB& ref = (*prims)[j];
-            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space);
-            cbounds.extend(bn);
-          };
-          return cbounds;
-        };
-        
-        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
-                               reduce,
-                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
-      }
-
-      template<typename RecalculatePrimRef>
-        const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
-      {
-        auto computePrimInfo = [&](const range<size_t>& r) -> PrimInfoMB
-        {
-          PrimInfoMB pinfo(empty);
-          for (size_t j=r.begin(); j<r.end(); j++)
-          {
-            PrimRefMB& ref = (*prims)[j];
-            PrimRefMB ref1 = recalculatePrimRef(ref,time_range,space);
-            pinfo.add_primref(ref1);
-          };
-          return pinfo;
-        };
-        
-        const PrimInfoMB pinfo = parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, 
-                                                 PrimInfoMB(empty), computePrimInfo, PrimInfoMB::merge2);
-
-        return SetMB(pinfo,prims,object_range,time_range);
-      }
-      
-    public:
-      PrimRefVector prims;
-    };
 //}
 }
diff --git a/thirdparty/embree/kernels/builders/priminfo_mb.h b/thirdparty/embree/kernels/builders/priminfo_mb.h
new file mode 100644
index 0000000000..4005f04da9
--- /dev/null
+++ b/thirdparty/embree/kernels/builders/priminfo_mb.h
@@ -0,0 +1,210 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primref_mb.h"
+
+namespace embree
+{
+    /*! stores bounding information for a set of primitives: geometry and centroid bounds (via CentGeom) plus object range, time-segment counters and time ranges */
+    template<typename BBox>
+      class PrimInfoMBT : public CentGeom<BBox>
+    {
+    public:
+      using CentGeom<BBox>::geomBounds;
+      using CentGeom<BBox>::centBounds;
+
+      __forceinline PrimInfoMBT () { // no member initializers: nothing is set explicitly here
+      } 
+
+      __forceinline PrimInfoMBT (EmptyTy) // empty set: object range (0,0) and zeroed time-segment counters
+        : CentGeom<BBox>(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} // time_range starts as (1,0), presumably an inverted interval for extend() to grow - TODO confirm
+
+      __forceinline PrimInfoMBT (size_t begin, size_t end) // same as the EmptyTy constructor except that object_range is (begin,end)
+        : CentGeom<BBox>(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
+
+      template<typename PrimRef> 
+        __forceinline void add_primref(const PrimRef& prim) 
+      { // accumulates one primitive reference into the bounds, ranges and counters
+        CentGeom<BBox>::extend_primref(prim);
+        time_range.extend(prim.time_range);
+        object_range._end++; // the object range grows by one primitive
+        num_time_segments += prim.size();
+        if (max_num_time_segments < prim.totalTimeSegments()) { // keep the segment count and time range of the primitive with the most time segments
+          max_num_time_segments = prim.totalTimeSegments();
+          max_time_range = prim.time_range;
+        }
+      }
+
+      __forceinline void merge(const PrimInfoMBT& other)
+      { // folds the statistics of another PrimInfoMBT into this one
+        CentGeom<BBox>::merge(other);
+        time_range.extend(other.time_range);
+        object_range._begin += other.object_range.begin(); // begin and end are summed, not min/max-combined
+        object_range._end += other.object_range.end();
+        num_time_segments += other.num_time_segments;
+        if (max_num_time_segments < other.max_num_time_segments) { // adopt the other side's maximum (and its time range) when it is larger
+          max_num_time_segments = other.max_num_time_segments;
+          max_time_range = other.max_time_range;
+        }
+      }
+
+      static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) { // returns a merged copy; a and b themselves are not modified
+        PrimInfoMBT r = a; r.merge(b); return r;
+      }
+
+      __forceinline size_t begin() const {
+        return object_range.begin();
+      }
+
+      __forceinline size_t end() const {
+        return object_range.end();
+      }
+      
+      /*! returns the number of primitives */
+      __forceinline size_t size() const { 
+	return object_range.size(); 
+      }
+
+      __forceinline float halfArea() const { // expectedApproxHalfArea of geomBounds weighted by time_range.size()
+        return time_range.size()*expectedApproxHalfArea(geomBounds);
+      }
+
+      __forceinline float leafSAH() const { // the halfArea() product multiplied by the total number of time segments
+	return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { // as above, with the count replaced by ceil(num_time_segments / 2^block_shift)
+	return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+
+      __forceinline float align_time(float ct) const
+      { // maps ct into max_time_range, rounds to the nearest multiple of 1/max_num_time_segments, then maps back
+        //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments);
+        float t0 = (ct-max_time_range.lower)/max_time_range.size(); // NOTE(review): divides by max_time_range.size(); presumably non-zero for callers - confirm
+        float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments); // NOTE(review): divides by max_num_time_segments, which the constructors above initialize to 0
+        return t1*max_time_range.size()+max_time_range.lower;
+      }
+      
+      /*! stream output */
+      friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) 
+      { // NOTE(review): the printed label is "PrimInfo" although the type is PrimInfoMBT; the max_* members are not printed
+	return cout << "PrimInfo { " << 
+          "object_range = " << pinfo.object_range << 
+          ", time_range = " << pinfo.time_range << 
+          ", time_segments = " << pinfo.num_time_segments << 
+          ", geomBounds = " << pinfo.geomBounds << 
+          ", centBounds = " << pinfo.centBounds << 
+          "}";
+      }
+      
+    public:
+      range<size_t> object_range; //!< primitive range
+      size_t num_time_segments;  //!< total number of time segments of all added primrefs
+      size_t max_num_time_segments; //!< maximum number of time segments of a primitive
+      BBox1f max_time_range; //!< time range of primitive with max_num_time_segments
+      BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB
+    };
+
+    typedef PrimInfoMBT<typename PrimRefMB::BBox> PrimInfoMB; // PrimInfoMBT instantiated with the bounding box type exported by PrimRefMB
+
+    struct SetMB : public PrimInfoMB // a PrimInfoMB together with a pointer to the primitive vector that its object_range indexes
+    {
+      static const size_t PARALLEL_THRESHOLD = 3 * 1024; // forwarded to the parallel_reduce calls below
+      static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; // forwarded to the parallel_reduce calls below
+      static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; // not referenced inside this struct
+
+      typedef mvector<PrimRefMB>* PrimRefVector;
+
+      __forceinline SetMB() {} // no explicit initialization of prims or of the base class
+
+       __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims) // copies pinfo_i unchanged and stores the vector pointer
+         : PrimInfoMB(pinfo_i), prims(prims) {}
+
+      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range<size_t> object_range_in, BBox1f time_range_in) // copies pinfo_i, then overrides the object range and clips the time range
+        : PrimInfoMB(pinfo_i), prims(prims)
+      {
+        object_range = object_range_in;
+        time_range = intersect(time_range,time_range_in);
+      }
+      
+      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in) // copies pinfo_i (keeping its object range) and clips the time range
+        : PrimInfoMB(pinfo_i), prims(prims)
+      {
+        time_range = intersect(time_range,time_range_in);
+      }
+
+      void deterministic_order() const 
+      { // const member, but it reorders the elements of *prims that lie inside object_range
+        /* required as parallel partition destroys original primitive order */
+        PrimRefMB* prim = prims->data();
+        std::sort(&prim[object_range.begin()],&prim[object_range.end()]); // two-argument std::sort: ordering comes from PrimRefMB's operator<
+      }
+
+      template<typename RecalculatePrimRef>
+      __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const
+      { // merged result of recalculatePrimRef.linearBounds(ref, time_range) over all primitives in object_range
+        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+        {
+          LBBox3fa cbounds(empty);
+          for (size_t j = r.begin(); j < r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range);
+            cbounds.extend(bn);
+          };
+          return cbounds;
+        };
+        
+        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+                               reduce,
+                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+      }
+
+      template<typename RecalculatePrimRef>
+        __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
+      { // same reduction as the overload above, with space passed on to recalculatePrimRef.linearBounds
+        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+        {
+          LBBox3fa cbounds(empty);
+          for (size_t j = r.begin(); j < r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space);
+            cbounds.extend(bn);
+          };
+          return cbounds;
+        };
+        
+        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+                               reduce,
+                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+      }
+
+      template<typename RecalculatePrimRef>
+        const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
+      { // rebuilds the PrimInfoMB from primrefs re-evaluated via recalculatePrimRef(ref,time_range,space); the stored primrefs are not overwritten here
+        auto computePrimInfo = [&](const range<size_t>& r) -> PrimInfoMB
+        {
+          PrimInfoMB pinfo(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            PrimRefMB ref1 = recalculatePrimRef(ref,time_range,space);
+            pinfo.add_primref(ref1);
+          };
+          return pinfo;
+        };
+        
+        const PrimInfoMB pinfo = parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, 
+                                                 PrimInfoMB(empty), computePrimInfo, PrimInfoMB::merge2);
+
+        return SetMB(pinfo,prims,object_range,time_range); // this constructor replaces pinfo's object range and intersects its time range with time_range
+      }
+      
+    public:
+      PrimRefVector prims; // pointer to the primitive vector; indexed with positions taken from object_range
+    };
+//}
+}
diff --git a/thirdparty/embree/kernels/common/primref.h b/thirdparty/embree/kernels/builders/primref.h
index d61763487b..cf349c4a0c 100644
--- a/thirdparty/embree/kernels/common/primref.h
+++ b/thirdparty/embree/kernels/builders/primref.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "default.h"
+#include "../common/default.h"
 
 namespace embree
 {
@@ -118,7 +118,8 @@ namespace embree
     std::swap(a,b);
 #endif
   }
-
+  
+  
   /************************************************************************************/
   /************************************************************************************/
   /************************************************************************************/
diff --git a/thirdparty/embree/kernels/common/primref_mb.h b/thirdparty/embree/kernels/builders/primref_mb.h
index fb08a05003..2c4bef8b8b 100644
--- a/thirdparty/embree/kernels/common/primref_mb.h
+++ b/thirdparty/embree/kernels/builders/primref_mb.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "default.h"
+#include "../common/default.h"
 
 #define MBLUR_BIN_LBBOX 1
 
diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp
index e2d7c27bd8..33f67697cb 100644
--- a/thirdparty/embree/kernels/builders/primrefgen.cpp
+++ b/thirdparty/embree/kernels/builders/primrefgen.cpp
@@ -55,6 +55,29 @@ namespace embree
       return pinfo;
     }
 
+    PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, const size_t numPrimRefs, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids, BuildProgressMonitor& progressMonitor)
+    { // overload that additionally hands the sgrids array to every geometry's createPrimRefArray
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,mblur);
+      
+      /* first try: each geometry is called with the callback's k (presumably the output start index - TODO confirm) */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+         return mesh->createPrimRefArray(prims,sgrids,r,k,(unsigned)geomID);
+       }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again: the count differs from numPrimRefs, so redo the pass with base.size() in place of k */
+      if (pinfo.size() != numPrimRefs)
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+          return mesh->createPrimRefArray(prims,sgrids,r,base.size(),(unsigned)geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo; // result of the second pass when it ran, otherwise of the first
+    }
+
     PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, const size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime)
     {
       ParallelForForPrefixSumState<PrimInfo> pstate;
@@ -104,6 +127,32 @@ namespace embree
       return pinfo;
     }
 
+    PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, const size_t numPrimRefs, mvector<PrimRefMB>& prims, mvector<SubGridBuildData>& sgrids, BuildProgressMonitor& progressMonitor, BBox1f t0t1)
+    { // overload that additionally hands the sgrids array to every geometry's createPrimRefMBArray
+      ParallelForForPrefixSumState<PrimInfoMB> pstate;
+      Scene::Iterator2 iter(scene,types,true);
+      
+      /* first try: each geometry is called with the callback's k (presumably the output start index - TODO confirm) */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfoMB {
+         return mesh->createPrimRefMBArray(prims,sgrids,t0t1,r,k,(unsigned)geomID);
+      }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+      
+      /* if we need to filter out geometry, run again: the count differs from numPrimRefs, so redo the pass with base.size() in place of k */
+      if (pinfo.size() != numPrimRefs)
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+          return mesh->createPrimRefMBArray(prims,sgrids,t0t1,r,base.size(),(unsigned)geomID);
+        }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+      }
+
+      /* the BVH starts with that time range, even though primitives might have smaller/larger time range */
+      pinfo.time_range = t0t1; // overwrites the time range accumulated by the merges above
+      return pinfo;
+    }
+
     template<typename Mesh>
     size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor)
     {
@@ -218,26 +267,8 @@ namespace embree
 
       /* second run to fill primrefs and SubGridBuildData arrays */
       pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
-          k = base.size();
-          size_t p_index = k;
-          PrimInfo pinfo(empty);
-          for (size_t j=r.begin(); j<r.end(); j++)
-          {
-            if (!mesh->valid(j)) continue;
-            const GridMesh::Grid &g = mesh->grid(j);
-            for (unsigned int y=0; y<g.resY-1u; y+=2)
-              for (unsigned int x=0; x<g.resX-1u; x+=2)
-              {
-                BBox3fa bounds = empty;
-                if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
-                const PrimRef prim(bounds,(unsigned)geomID,(unsigned)p_index);
-                pinfo.add_center2(prim);
-                sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
-                prims[p_index++] = prim;                
-              }
-          }
-          return pinfo;
-        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+        return mesh->createPrimRefArray(prims,sgrids,r,base.size(),geomID);
+      }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
       assert(pinfo.size() == numPrimitives);
       return pinfo;
     }
@@ -269,40 +300,60 @@ namespace embree
       prims.resize(numPrimitives); 
 
       /* second run to fill primrefs and SubGridBuildData arrays */
-      pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo
-                                   {
-
-                                     size_t p_index = base.size();
-                                     PrimInfo pinfo(empty);
-                                     for (size_t j=r.begin(); j<r.end(); j++)
-                                     {
-                                       if (!mesh->valid(j)) continue;
-                                       const GridMesh::Grid &g = mesh->grid(j);
-                                       for (unsigned int y=0; y<g.resY-1u; y+=2)
-                                         for (unsigned int x=0; x<g.resX-1u; x+=2)
-                                         {
-                                           BBox3fa bounds = empty;
-                                           if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
-                                           const PrimRef prim(bounds,geomID_,unsigned(p_index));
-                                           pinfo.add_center2(prim);
-                                           sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
-                                           prims[p_index++] = prim;                
-                                         }
-                                     }
-                                     return pinfo;
-                                   }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+        return mesh->createPrimRefArray(prims,sgrids,r,base.size(),geomID_);
+      }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
 
       return pinfo;
     }
+
+    PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector<PrimRefMB>& prims, mvector<SubGridBuildData>& sgrids, BuildProgressMonitor& progressMonitor, BBox1f t0t1)
+    { // NOTE(review): progressMonitor is not invoked anywhere in this function
+      /* first run to get #primitives */
+      ParallelForForPrefixSumState<PrimInfoMB> pstate;
+      Scene::Iterator<GridMesh,true> iter(scene);
+      
+      pstate.init(iter,size_t(1024));
+      /* iterate over all meshes in the scene */
+      PrimInfoMB pinfoMB = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t /*geomID*/) -> PrimInfoMB {
+         /* counting pass: only the number of subgrids of each valid grid is accumulated, no bounds are computed */
+         PrimInfoMB pinfoMB(empty); // lambda-local accumulator; shadows the outer pinfoMB being initialized
+         for (size_t j=r.begin(); j<r.end(); j++)
+         {
+           if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue; // skip grids the mesh reports as invalid for the time segments of t0t1
+           LBBox3fa bounds(empty); // NOTE(review): bounds is never read in this lambda
+           PrimInfoMB gridMB(0,mesh->getNumSubGrids(j)); // object range (0, number of subgrids of grid j)
+           pinfoMB.merge(gridMB); // merge() sums range ends, so size() grows by that subgrid count
+         }
+         return pinfoMB;
+      }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+      
+      size_t numPrimitives = pinfoMB.size();
+      if (numPrimitives == 0) return pinfoMB; // nothing counted: prims and sgrids are left untouched
+      
+      /* resize arrays */
+      sgrids.resize(numPrimitives); 
+      prims.resize(numPrimitives); 
+      /* second run to fill primrefs and SubGridBuildData arrays */
+      pinfoMB = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+        return mesh->createPrimRefMBArray(prims,sgrids,t0t1,r,base.size(),(unsigned)geomID);                                                                                 
+      }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+      
+      assert(pinfoMB.size() == numPrimitives); // the fill pass is expected to produce exactly the counted number of primitives
+      pinfoMB.time_range = t0t1; // overwrites the time range accumulated by the merges above
+      return pinfoMB;
+    }
+    
 #endif
     
     // ====================================================================================================
     // ====================================================================================================
     // ====================================================================================================
-
+    
     IF_ENABLED_TRIS (template size_t createMortonCodeArray<TriangleMesh>(TriangleMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
     IF_ENABLED_QUADS(template size_t createMortonCodeArray<QuadMesh>(QuadMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
     IF_ENABLED_USER (template size_t createMortonCodeArray<UserGeometry>(UserGeometry* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
     IF_ENABLED_INSTANCE (template size_t createMortonCodeArray<Instance>(Instance* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_INSTANCE_ARRAY (template size_t createMortonCodeArray<InstanceArray>(InstanceArray* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
   }
 }
diff --git a/thirdparty/embree/kernels/builders/primrefgen.h b/thirdparty/embree/kernels/builders/primrefgen.h
index c09a848ba3..0e2ab094fb 100644
--- a/thirdparty/embree/kernels/builders/primrefgen.h
+++ b/thirdparty/embree/kernels/builders/primrefgen.h
@@ -4,9 +4,8 @@
 #pragma once
 
 #include "../common/scene.h"
-#include "../common/primref.h"
-#include "../common/primref_mb.h"
 #include "priminfo.h"
+#include "priminfo_mb.h"
 #include "bvh_builder_morton.h"
 
 namespace embree
@@ -16,19 +15,23 @@ namespace embree
     PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor);
    
     PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor);
+
+    PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimitives, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids, BuildProgressMonitor& progressMonitor);
    
     PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0);
 
     PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, size_t numPrimitives, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f));
 
+    PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, size_t numPrimitives, mvector<PrimRefMB>& prims, mvector<SubGridBuildData>& sgrids, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f));
+
     template<typename Mesh>
       size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor);
 
     /* special variants for grids */
-    PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids);
+    PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids); // FIXME: remove
 
     PrimInfo createPrimRefArrayGrids(GridMesh* mesh, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids);
-    
+
+    PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector<PrimRefMB>& prims, mvector<SubGridBuildData>& sgrids, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f));
   }
 }
-
diff --git a/thirdparty/embree/kernels/builders/primrefgen_presplit.h b/thirdparty/embree/kernels/builders/primrefgen_presplit.h
index aa2026a85e..db9010995d 100644
--- a/thirdparty/embree/kernels/builders/primrefgen_presplit.h
+++ b/thirdparty/embree/kernels/builders/primrefgen_presplit.h
@@ -3,10 +3,12 @@
 
 #pragma once
 
-#include "../builders/primrefgen.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_sort.h"
 #include "../builders/heuristic_spatial.h"
 #include "../builders/splitter.h"
 
+#include "../../common/algorithms/parallel_partition.h"
 #include "../../common/algorithms/parallel_for_for.h"
 #include "../../common/algorithms/parallel_for_for_prefix_sum.h"
 
@@ -14,15 +16,87 @@
 #define CHECK_PRESPLIT(x) 
 
 #define GRID_SIZE 1024
+//#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 6
 #define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5
 #define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG)
-#define PRIORITY_CUTOFF_THRESHOLD 1.0f
+//#define PRIORITY_CUTOFF_THRESHOLD 2.0f
 #define PRIORITY_SPLIT_POS_WEIGHT 1.5f
 
 namespace embree
 {  
   namespace isa
   {
+    struct SplittingGrid
+    {
+      __forceinline SplittingGrid(const BBox3fa& bounds)
+      {
+        base = bounds.lower;
+        const Vec3fa diag = bounds.size();
+        extend = max(diag.x,max(diag.y,diag.z));		
+        scale = extend == 0.0f ? 0.0f : GRID_SIZE / extend;
+      }
+
+      __forceinline bool split_pos(const PrimRef& prim, unsigned int& dim_o, float& fsplit_o) const
+      {
+        /* compute morton code */
+        const Vec3fa lower = prim.lower;
+        const Vec3fa upper = prim.upper;
+        const Vec3fa glower = (lower-base)*Vec3fa(scale)+Vec3fa(0.2f);
+        const Vec3fa gupper = (upper-base)*Vec3fa(scale)-Vec3fa(0.2f);
+        Vec3ia ilower(floor(glower));
+        Vec3ia iupper(floor(gupper));
+        
+        /* this ignores dimensions that are empty */
+        iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper));
+        
+        /* compute a morton code for the lower and upper grid coordinates. */
+        const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+        const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+
+        /* if all bits are equal then we cannot split */
+        if (unlikely(lower_code == upper_code))
+          return false;
+		    
+        /* compute octree level and dimension to perform the split in */
+        const unsigned int diff = 31 - lzcnt(lower_code^upper_code);
+        const unsigned int level = diff / 3;
+        const unsigned int dim   = diff % 3;
+      
+        /* now we compute the grid position of the split */
+        const unsigned int isplit = iupper[dim] & ~((1<<level)-1);
+			    
+        /* compute world space position of split */
+        const float inv_grid_size = 1.0f / GRID_SIZE;
+        const float fsplit = base[dim] + isplit * inv_grid_size * extend;
+        assert(prim.lower[dim] <= fsplit && prim.upper[dim] >= fsplit);
+
+        dim_o = dim;
+        fsplit_o = fsplit;
+        return true;
+      }
+
+      __forceinline Vec2i computeMC(const PrimRef& ref) const
+      {
+        const Vec3fa lower = ref.lower;
+        const Vec3fa upper = ref.upper;
+        const Vec3fa glower = (lower-base)*Vec3fa(scale)+Vec3fa(0.2f);
+        const Vec3fa gupper = (upper-base)*Vec3fa(scale)-Vec3fa(0.2f);
+        Vec3ia ilower(floor(glower));
+        Vec3ia iupper(floor(gupper));
+        
+        /* this ignores dimensions that are empty */
+        iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper));
+        
+        /* compute a morton code for the lower and upper grid coordinates. */
+        const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+        const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+        return Vec2i(lower_code,upper_code);
+      }
+      
+      Vec3fa base;
+      float scale;
+      float extend;
+    };
 
     struct PresplitItem
     {
@@ -32,30 +106,30 @@ namespace embree
       };
       unsigned int index;
       
-      __forceinline operator unsigned() const
-      {
-	return reinterpret_cast<const unsigned&>(priority);
-      }
-      __forceinline bool operator < (const PresplitItem& item) const
-      {
-	return (priority < item.priority);
+      __forceinline operator unsigned() const {
+	return data;
       }
 
-      template<typename Mesh>
-      __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc)
+      template<typename ProjectedPrimitiveAreaFunc>
+      __forceinline static float compute_priority(const ProjectedPrimitiveAreaFunc& primitiveArea, const PrimRef &ref, const Vec2i &mc)
       {
-	const unsigned int geomID = ref.geomID();
-	const unsigned int primID = ref.primID();
 	const float area_aabb  = area(ref.bounds());
-	const float area_prim  = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID);
+	const float area_prim  = primitiveArea(ref);
+        if (area_prim == 0.0f) return 0.0f;
         const unsigned int diff = 31 - lzcnt(mc.x^mc.y);
-        assert(area_prim <= area_aabb);
-        //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f);   
-        const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) ));
+        //assert(area_prim <= area_aabb); // may trigger due to numerical issues 
+        const float area_diff = max(0.0f, area_aabb - area_prim);
+        //const float priority = powf(area_diff * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f);   
+        const float priority = sqrtf(sqrtf( area_diff * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) ));
+        //const float priority = sqrtf(sqrtf( area_diff ) );
+        //const float priority = sqrtfarea_diff;
+        //const float priority = area_diff; // 104 fps !!!!!!!!!!
+        //const float priority = 0.2f*area_aabb + 0.8f*area_diff; // 104 fps
+        //const float priority = area_aabb * max(area_aabb/area_prim,32.0f); 
+        //const float priority = area_prim;
         assert(priority >= 0.0f && priority < FLT_LARGE);
 	return priority;      
       }
-
     
     };
 
@@ -63,77 +137,96 @@ namespace embree
       return cout << "index " << item.index << " priority " << item.priority;    
     };
 
-    template<typename SplitterFactory>    
-      void splitPrimitive(SplitterFactory &Splitter,
-                          const PrimRef &prim,
-                          const unsigned int geomID,
-                          const unsigned int primID,
-                          const unsigned int split_level,
-                          const Vec3fa &grid_base, 
-                          const float grid_scale,
-                          const float grid_extend,
+#if 1
+    
+    template<typename Splitter>    
+      void splitPrimitive(const Splitter& splitter,
+                          const PrimRef& prim,
+                          const unsigned int splitprims,
+                          const SplittingGrid& grid,
                           PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE],
                           unsigned int& numSubPrims)
     {
-      assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
-      if (split_level == 0)
+      assert(splitprims > 0 && splitprims <= MAX_PRESPLITS_PER_PRIMITIVE);
+      
+      if (splitprims == 1)
       {
         assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
         subPrims[numSubPrims++] = prim;
       }
       else
       {
-        const Vec3fa lower = prim.lower;
-        const Vec3fa upper = prim.upper;
-        const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f);
-        const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
-        Vec3ia ilower(floor(glower));
-        Vec3ia iupper(floor(gupper));
-
-        /* this ignores dimensions that are empty */
-        iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)));
-
-        /* compute a morton code for the lower and upper grid coordinates. */
-        const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
-        const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
-			
-        /* if all bits are equal then we cannot split */
-        if(unlikely(lower_code == upper_code))
+        unsigned int dim; float fsplit;
+        if (!grid.split_pos(prim, dim, fsplit))
         {
           assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
           subPrims[numSubPrims++] = prim;
           return;
         }
-		    
-        /* compute octree level and dimension to perform the split in */
-        const unsigned int diff = 31 - lzcnt(lower_code^upper_code);
-        const unsigned int level = diff / 3;
-        const unsigned int dim   = diff % 3;
+          
+        /* split primitive */
+        PrimRef left,right;
+        splitter(prim,dim,fsplit,left,right);
+        assert(!left.bounds().empty());
+        assert(!right.bounds().empty());
+
+        const unsigned int splitprims_left = splitprims/2;
+        const unsigned int splitprims_right = splitprims - splitprims_left;
+        splitPrimitive(splitter,left,splitprims_left,grid,subPrims,numSubPrims);
+        splitPrimitive(splitter,right,splitprims_right,grid,subPrims,numSubPrims);
+      }
+    }
+
+#else
+    
+    template<typename Splitter>    
+      void splitPrimitive(const Splitter& splitter,
+                          const PrimRef& prim,
+                          const unsigned int targetSubPrims,
+                          const SplittingGrid& grid,
+                          PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE],
+                          unsigned int& numSubPrims)
+    {
+      assert(targetSubPrims > 0 && targetSubPrims <= MAX_PRESPLITS_PER_PRIMITIVE);
       
-        /* now we compute the grid position of the split */
-        const unsigned int isplit = iupper[dim] & ~((1<<level)-1);
-			    
-        /* compute world space position of split */
-        const float inv_grid_size = 1.0f / GRID_SIZE;
-        const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend;
+      auto compare = [] ( const PrimRef& a, const PrimRef& b ) {
+        return area(a.bounds()) < area(b.bounds());
+      };
+      
+      subPrims[numSubPrims++] = prim;
+
+      while (numSubPrims < targetSubPrims)
+      {
+        /* get top heap element */
+        std::pop_heap(subPrims+0,subPrims+numSubPrims, compare);
+        PrimRef top = subPrims[--numSubPrims];
 
-        assert(prim.lower[dim] <= fsplit &&
-               prim.upper[dim] >= fsplit);
-		
+        unsigned int dim; float fsplit;
+        if (!grid.split_pos(top, dim, fsplit))
+        {
+          assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+          subPrims[numSubPrims++] = top;
+          return;
+        }
+          
         /* split primitive */
-        const auto splitter = Splitter(prim);
-        BBox3fa left,right;
-        splitter(prim.bounds(),dim,fsplit,left,right);
-        assert(!left.empty());
-        assert(!right.empty());
+        PrimRef left,right;
+        splitter(top,dim,fsplit,left,right);
+        assert(!left.bounds().empty());
+        assert(!right.bounds().empty());
 
-			    
-        splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
-        splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+        subPrims[numSubPrims++] = left;
+        std::push_heap(subPrims+0, subPrims+numSubPrims, compare);
+
+        subPrims[numSubPrims++] = right;
+        std::push_heap(subPrims+0, subPrims+numSubPrims, compare);
       }
     }
     
-    
+#endif
+
+#if !defined(RTHWIF_STANDALONE)
+
     template<typename Mesh, typename SplitterFactory>    
       PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
     {
@@ -155,87 +248,40 @@ namespace embree
 	}
       return pinfo;	
     }
+#endif
     
-    __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref)
+    template<typename SplitPrimitiveFunc, typename ProjectedPrimitiveAreaFunc, typename PrimVector>
+    PrimInfo createPrimRefArray_presplit(size_t numPrimRefs,
+                                         PrimVector& prims,
+                                         const PrimInfo& pinfo,
+                                         const SplitPrimitiveFunc& splitPrimitive,
+                                         const ProjectedPrimitiveAreaFunc& primitiveArea)
     {
-      const Vec3fa lower = ref.lower;
-      const Vec3fa upper = ref.upper;
-      const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f);
-      const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
-      Vec3ia ilower(floor(glower));
-      Vec3ia iupper(floor(gupper));
-      
-      /* this ignores dimensions that are empty */
-      iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper));
-
-      /* compute a morton code for the lower and upper grid coordinates. */
-      const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
-      const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
-      return Vec2i(lower_code,upper_code);
-    }
-
-    template<typename Mesh, typename SplitterFactory>    
-      PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
-    {	
       static const size_t MIN_STEP_SIZE = 128;
 
-      ParallelForForPrefixSumState<PrimInfo> pstate;
-      Scene::Iterator2 iter(scene,types,mblur);
-
-      /* first try */
-      progressMonitor(0);
-      pstate.init(iter,size_t(1024));
-      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
-	  return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
-	}, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
-      
-      /* if we need to filter out geometry, run again */
-      if (pinfo.size() != numPrimRefs)
-	{
-	  progressMonitor(0);
-	  pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
-	      return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
-	    }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
-	}
-
       /* use correct number of primitives */
       size_t numPrimitives = pinfo.size();
-      const size_t alloc_numPrimitives = prims.size(); 
-      const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives;
-
-      /* set up primitive splitter */
-      SplitterFactory Splitter(scene);
-
-
-      DBG_PRESPLIT(
-        const size_t org_numPrimitives = pinfo.size();
-        PRINT(numPrimitives);		
-        PRINT(alloc_numPrimitives);		
-        PRINT(numSplitPrimitivesBudget);
-        );
+      const size_t numPrimitivesExt = prims.size(); 
+      const size_t numSplitPrimitivesBudget = numPrimitivesExt - numPrimitives;
 
       /* allocate double buffer presplit items */
-      const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives;
-      PresplitItem *presplitItem     = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
-      PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+      avector<PresplitItem> preSplitItem0(numPrimitivesExt);
+      avector<PresplitItem> preSplitItem1(numPrimitivesExt);
 
       /* compute grid */
-      const Vec3fa grid_base    = pinfo.geomBounds.lower;
-      const Vec3fa grid_diag    = pinfo.geomBounds.size();
-      const float grid_extend   = max(grid_diag.x,max(grid_diag.y,grid_diag.z));		
-      const float grid_scale    = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend;
-
+      SplittingGrid grid(pinfo.geomBounds);
+      
       /* init presplit items and get total sum */
       const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float {
           float sum = 0.0f;
           for (size_t i=r.begin(); i<r.end(); i++)
           {		
-            presplitItem[i].index = (unsigned int)i;
-            const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]);
+            preSplitItem0[i].index = (unsigned int)i;
+            const Vec2i mc = grid.computeMC(prims[i]);
             /* if all bits are equal then we cannot split */
-            presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f;    
+            preSplitItem0[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority(primitiveArea,prims[i],mc) : 0.0f;    
             /* FIXME: sum undeterministic */
-            sum += presplitItem[i].priority;
+            sum += preSplitItem0[i].priority;
           }
           return sum;
         },[](const float& a, const float& b) -> float { return a+b; });
@@ -245,132 +291,178 @@ namespace embree
       parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
           for (size_t i=r.begin(); i<r.end(); i++)
           {
-            if (presplitItem[i].priority > 0.0f)
-            {
-              const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum;
-              if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims
-              {
-                presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f);
-                //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG);
-                assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG);
-              }
-              else
-                presplitItem[i].priority = 0.0f;
+            if (preSplitItem0[i].priority <= 0.0f) {
+              preSplitItem0[i].data = 1;
+              continue;
             }
+              
+            const float rel_p = (float)numSplitPrimitivesBudget * preSplitItem0[i].priority * inv_psum;
+            if (rel_p < 1) {
+              preSplitItem0[i].data = 1;
+              continue;
+            }
+            
+            //preSplitItem0[i].data = max(min(ceilf(rel_p),(float)MAX_PRESPLITS_PER_PRIMITIVE),1.0f);
+            preSplitItem0[i].data = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f);
+            preSplitItem0[i].data = 1 << preSplitItem0[i].data;
+            assert(preSplitItem0[i].data <= MAX_PRESPLITS_PER_PRIMITIVE);
           }
         });
 
-      auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; };        
-      size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024);
+      auto isLeft = [&] (const PresplitItem &ref) { return ref.data <= 1; };        
+      size_t center = parallel_partitioning(preSplitItem0.data(),0,numPrimitives,isLeft,1024);
+      assert(center <= numPrimitives);
 
       /* anything to split ? */
-      if (center < numPrimitives)
+      if (center >= numPrimitives)
+        return pinfo;
+            
+      size_t numPrimitivesToSplit = numPrimitives - center;
+      assert(preSplitItem0[center].data >= 1.0f);
+      
+      /* sort presplit items in ascending order */
+      radix_sort_u32(preSplitItem0.data() + center,preSplitItem1.data() + center,numPrimitivesToSplit,1024);
+      
+      CHECK_PRESPLIT(
+        parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+          for (size_t i=r.begin(); i<r.end(); i++)
+            assert(preSplitItem0[i-1].data <= preSplitItem0[i].data);
+          });
+      );
+      
+      unsigned int* primOffset0 = (unsigned int*)preSplitItem1.data();
+      unsigned int* primOffset1 = (unsigned int*)preSplitItem1.data() + numPrimitivesToSplit;
+      
+      /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */
+      const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t {
+        size_t sum = 0;
+        for (size_t i=t.begin(); i<t.end(); i++)
+        {	
+          const unsigned int primrefID  = preSplitItem0[i].index;	
+          const unsigned int splitprims = preSplitItem0[i].data;
+          assert(splitprims >= 1 && splitprims <= MAX_PRESPLITS_PER_PRIMITIVE);
+          
+          unsigned int numSubPrims = 0;
+          PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];	
+          splitPrimitive(prims[primrefID],splitprims,grid,subPrims,numSubPrims);
+          assert(numSubPrims);
+          
+          numSubPrims--; // can reuse slot 
+          sum+=numSubPrims;
+          preSplitItem0[i].data = (numSubPrims << 16) | splitprims;
+          
+          primOffset0[i-center] = numSubPrims;
+        }
+        return sum;
+      },[](const size_t& a, const size_t& b) -> size_t { return a+b; });
+
+      /* if we are over budget, need to shrink the range */
+      if (totalNumSubPrims > numSplitPrimitivesBudget) 
       {
-        size_t numPrimitivesToSplit = numPrimitives - center;
-        assert(presplitItem[center].priority >= 1.0f);
-
-        /* sort presplit items in ascending order */
-        radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024);
-
-        CHECK_PRESPLIT(
-          parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
-              for (size_t i=r.begin(); i<r.end(); i++)
-                assert(presplitItem[i-1].priority <= presplitItem[i].priority);
-            });
-          );
-
-        unsigned int* primOffset0 = (unsigned int*)tmp_presplitItem;
-        unsigned int* primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
-
-        /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */
-        const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t {
-            size_t sum = 0;
-            for (size_t i=t.begin(); i<t.end(); i++)
-            {	
-              PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];	
-              assert(presplitItem[i].priority >= 1.0f);
-              const unsigned int  primrefID = presplitItem[i].index;	
-              const float prio              = presplitItem[i].priority;
-              const unsigned int   geomID   = prims[primrefID].geomID();
-              const unsigned int   primID   = prims[primrefID].primID();
-              const unsigned int split_levels = (unsigned int)prio;
-              unsigned int numSubPrims = 0;
-              splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
-              assert(numSubPrims);
-              numSubPrims--; // can reuse slot 
-              sum+=numSubPrims;
-              presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels;
-              primOffset0[i-center] = numSubPrims;
-            }
-            return sum;
-          },[](const size_t& a, const size_t& b) -> size_t { return a+b; });
+        size_t new_center = numPrimitives-1;
+        size_t sum = 0;
+        for (;new_center>=center;new_center--)
+        {
+          const unsigned int numSubPrims = preSplitItem0[new_center].data >> 16;
+          if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break;
+          sum += numSubPrims;
+        }
+        new_center++;
         
-        /* if we are over budget, need to shrink the range */
-        if (totalNumSubPrims > numSplitPrimitivesBudget) 
+        primOffset0 += new_center - center;
+        numPrimitivesToSplit -= new_center - center;
+        center = new_center;
+        assert(numPrimitivesToSplit == (numPrimitives - center));
+      }
+      
+      /* parallel prefix sum to compute offsets for storing sub-primitives */
+      const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>());
+      assert(numPrimitives+offset <= numPrimitivesExt);
+      
+      /* iterate over range, and split primitives into sub primitives and append them to prims array */		    
+      parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void {
+        for (size_t j=rn.begin(); j<rn.end(); j++)		    
         {
-          size_t new_center = numPrimitives-1;
-          size_t sum = 0;
-          for (;new_center>=center;new_center--)
-          {
-            const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG;
-            if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break;
-            sum += numSubPrims;
-          }
-          new_center++;
-
-          primOffset0 += new_center - center;
-          numPrimitivesToSplit -= new_center - center;
-          center = new_center;
-          assert(numPrimitivesToSplit == (numPrimitives - center));
+          const unsigned int primrefID = preSplitItem0[j].index;	
+          const unsigned int splitprims = preSplitItem0[j].data & 0xFFFF;
+          assert(splitprims >= 1 && splitprims <= MAX_PRESPLITS_PER_PRIMITIVE);
+          
+          unsigned int numSubPrims = 0;
+          PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];
+          splitPrimitive(prims[primrefID],splitprims,grid,subPrims,numSubPrims);
+
+          const unsigned int numSubPrimsExpected MAYBE_UNUSED = preSplitItem0[j].data >> 16;
+          assert(numSubPrims-1 == numSubPrimsExpected);
+          
+          const size_t newID = numPrimitives + primOffset1[j-center];
+          assert(newID+numSubPrims-1 <= numPrimitivesExt);
+          
+          prims[primrefID] = subPrims[0];
+          for (size_t i=1;i<numSubPrims;i++)
+            prims[newID+i-1] = subPrims[i];
         }
+      });
 
-        /* parallel prefix sum to compute offsets for storing sub-primitives */
-        const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>());
-        assert(numPrimitives+offset <= alloc_numPrimitives);
-
-        /* iterate over range, and split primitives into sub primitives and append them to prims array */		    
-        parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void {
-            for (size_t j=rn.begin(); j<rn.end(); j++)		    
-            {
-              PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];
-              const unsigned int  primrefID = presplitItem[j].index;	
-              const unsigned int   geomID   = prims[primrefID].geomID();
-              const unsigned int   primID   = prims[primrefID].primID();
-              const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1);
-
-              assert(split_levels);
-              assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
-              unsigned int numSubPrims = 0;
-              splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
-              const size_t newID = numPrimitives + primOffset1[j-center];              
-              assert(newID+numSubPrims-1 <= alloc_numPrimitives);
-              prims[primrefID] = subPrims[0];
-              for (size_t i=1;i<numSubPrims;i++)
-                prims[newID+i-1] = subPrims[i];
-            }
-          });
-
-        numPrimitives += offset;
-        DBG_PRESPLIT(
-          PRINT(pinfo.size());
-          PRINT(numPrimitives);
-          PRINT((float)numPrimitives/org_numPrimitives));                
-      }
+      numPrimitives += offset;
                 
       /* recompute centroid bounding boxes */
-      pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo {
+      const PrimInfo pinfo1 = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo {
           PrimInfo p(empty);
           for (size_t j=r.begin(); j<r.end(); j++)
             p.add_center2(prims[j]);
           return p;
         }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
   
-      assert(pinfo.size() == numPrimitives);
+      assert(pinfo1.size() == numPrimitives);
       
-      /* free double buffer presplit items */
-      alignedFree(tmp_presplitItem);		
-      alignedFree(presplitItem);
-      return pinfo;	
+      return pinfo1;	
+    }
+
+#if !defined(RTHWIF_STANDALONE)
+    
+     template<typename Mesh, typename SplitterFactory>    
+      PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,mblur);
+
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+	  return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+	}, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+	{
+	  progressMonitor(0);
+	  pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+	      return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+	    }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+	}
+
+
+      SplitterFactory Splitter(scene);
+        
+      auto split_primitive = [&] (const PrimRef &prim,
+                                  const unsigned int splitprims,
+                                  const SplittingGrid& grid,
+                                  PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE],
+                                  unsigned int& numSubPrims)
+      {
+         const auto splitter = Splitter(prim);
+         splitPrimitive(splitter,prim,splitprims,grid,subPrims,numSubPrims);
+      };
+      
+      auto primitiveArea = [&] (const PrimRef &ref) {
+        const unsigned int geomID = ref.geomID();
+        const unsigned int primID = ref.primID();
+        return ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID);
+      };
+      
+      return createPrimRefArray_presplit(numPrimRefs,prims,pinfo,split_primitive,primitiveArea);
     }
+#endif 
   }
 }
diff --git a/thirdparty/embree/kernels/builders/splitter.h b/thirdparty/embree/kernels/builders/splitter.h
index da89d0b178..3daf55e801 100644
--- a/thirdparty/embree/kernels/builders/splitter.h
+++ b/thirdparty/embree/kernels/builders/splitter.h
@@ -3,8 +3,11 @@
 
 #pragma once
 
+#if !defined(RTHWIF_STANDALONE)
 #include "../common/scene.h"
-#include "../common/primref.h"
+#endif
+
+#include "../builders/primref.h"
 
 namespace embree
 {
@@ -15,6 +18,41 @@ namespace embree
                                     const size_t dim, 
                                     const float pos, 
                                     const Vec3fa (&v)[N+1],
+                                    BBox3fa& left_o, 
+                                    BBox3fa& right_o)
+    {
+      BBox3fa left = empty, right = empty;
+      /* clip triangle to left and right box by processing all edges */
+      for (size_t i=0; i<N; i++)
+      {
+        const Vec3fa &v0 = v[i]; 
+        const Vec3fa &v1 = v[i+1]; 
+        const float v0d = v0[dim];
+        const float v1d = v1[dim];
+        
+        if (v0d <= pos) left. extend(v0); // this point is on left side
+        if (v0d >= pos) right.extend(v0); // this point is on right side
+        
+        if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location
+        {
+          assert((v1d-v0d) != 0.0f);
+          const float inv_length = 1.0f/(v1d-v0d);
+          const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length),v1-v0,v0);
+          left.extend(c);
+          right.extend(c);
+        }
+      }
+      
+      /* clip against current bounds */
+      left_o  = intersect(left,bounds);
+      right_o = intersect(right,bounds);
+    }
+    
+    template<size_t N>
+    __forceinline void splitPolygon(const BBox3fa& bounds, 
+                                    const size_t dim, 
+                                    const float pos, 
+                                    const Vec3fa (&v)[N+1],
                                     const Vec3fa (&inv_length)[N],
                                     BBox3fa& left_o, 
                                     BBox3fa& right_o)
@@ -78,7 +116,9 @@ namespace embree
       new (&left_o ) PrimRef(intersect(left ,prim.bounds()),prim.geomID(), prim.primID());
       new (&right_o) PrimRef(intersect(right,prim.bounds()),prim.geomID(), prim.primID());
     }
-    
+
+#if !defined(RTHWIF_STANDALONE)
+
     struct TriangleSplitter
     {
       __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim)
@@ -173,6 +213,13 @@ namespace embree
       __forceinline DummySplitter(const Scene* scene, const PrimRef& prim)
       {
       }
+
+      __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { // PrimRef overload: empty body, so left_o/right_o are never written
+      }
+      
+      __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { // BBox3fa overload: empty body, so left_o/right_o are never written
+      }
+      
     };
     
     struct DummySplitterFactory
@@ -187,7 +234,7 @@ namespace embree
     private:
       const Scene* scene;
     };
-    
+#endif 
   }
 }
 
diff --git a/thirdparty/embree/kernels/bvh/bvh.h b/thirdparty/embree/kernels/bvh/bvh.h
index 565eec5a58..e0ffb86af9 100644
--- a/thirdparty/embree/kernels/bvh/bvh.h
+++ b/thirdparty/embree/kernels/bvh/bvh.h
@@ -85,7 +85,7 @@ namespace embree
     typedef BVHNodeRecord<NodeRef>     NodeRecord;
     typedef BVHNodeRecordMB<NodeRef>   NodeRecordMB;
     typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
-    
+
   public:
     
     /*! BVHN default constructor. */
diff --git a/thirdparty/embree/kernels/bvh/bvh4_factory.cpp b/thirdparty/embree/kernels/bvh/bvh4_factory.cpp
index 890d5e7b7c..d03940deea 100644
--- a/thirdparty/embree/kernels/bvh/bvh4_factory.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh4_factory.cpp
@@ -17,6 +17,7 @@
 #include "../geometry/subdivpatch1.h"
 #include "../geometry/object.h"
 #include "../geometry/instance.h"
+#include "../geometry/instance_array.h"
 #include "../geometry/subgrid.h"
 #include "../common/accelinstance.h"
 
@@ -66,6 +67,9 @@ namespace embree
   DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1);
   DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1);
 
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceArrayIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceArrayMBIntersector1);
+
   DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller);
   DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller);
   DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker);
@@ -104,6 +108,9 @@ namespace embree
   DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk);
   DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk);
 
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceArrayIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceArrayMBIntersector4Chunk);
+
   DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller);
   DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller);
   DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker);
@@ -142,6 +149,9 @@ namespace embree
   DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk);
   DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk);
 
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceArrayIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceArrayMBIntersector8Chunk);
+
   DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller);
   DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller);
   DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker);
@@ -180,33 +190,20 @@ namespace embree
   DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk);
   DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk);
 
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceArrayIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceArrayMBIntersector16Chunk);
+
   DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller);
   DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller);
   DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker);
 
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4IntersectorStreamPacketFallback);
-
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoeller);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoellerNoFilter);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamMoeller);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4vIntersectorStreamPluecker);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamPluecker);
-
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoeller);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoellerNoFilter);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamMoeller);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamPluecker);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamPluecker);
-
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream);
-
   DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
   DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
   DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
   DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
   DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
   DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceArraySAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
 
   DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
@@ -237,7 +234,10 @@ namespace embree
 
   DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
   DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
-  
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceArraySceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceArrayMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
   DECLARE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
 
@@ -266,6 +266,7 @@ namespace embree
     IF_ENABLED_QUADS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelQuadMeshSAH));
     IF_ENABLED_USER (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelVirtualSAH));
     IF_ENABLED_INSTANCE (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelInstanceSAH));
+    IF_ENABLED_INSTANCE_ARRAY (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelInstanceArraySAH));
 
     IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4vBuilder_OBB_New));
     IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4iBuilder_OBB_New));
@@ -296,7 +297,10 @@ namespace embree
 
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceSceneBuilderSAH));
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceMBSceneBuilderSAH));
-    
+
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceArraySceneBuilderSAH));
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceArrayMBSceneBuilderSAH));
+
     IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridSceneBuilderSAH));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridMBSceneBuilderSAH));
 
@@ -349,6 +353,9 @@ namespace embree
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersector1));
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector1));
 
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceArrayIntersector1));
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceArrayMBIntersector1));
+
     IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector1Moeller));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridMBIntersector1Moeller))
     IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector1Pluecker));
@@ -389,7 +396,10 @@ namespace embree
 
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersector4Chunk));
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector4Chunk));
-    
+
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceArrayIntersector4Chunk));
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceArrayMBIntersector4Chunk));
+
     IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridMoeller));
 
     IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector4HybridMoeller));
@@ -424,13 +434,16 @@ namespace embree
 
     IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4SubdivPatch1Intersector8));
     IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4SubdivPatch1MBIntersector8));
-    
+
     IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4VirtualIntersector8Chunk));
     IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4VirtualMBIntersector8Chunk));
 
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4InstanceIntersector8Chunk));
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector8Chunk));
 
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4InstanceArrayIntersector8Chunk));
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4InstanceArrayMBIntersector8Chunk));
+
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridIntersector8HybridMoeller));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridMBIntersector8HybridMoeller));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridIntersector8HybridPluecker));
@@ -470,29 +483,13 @@ namespace embree
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH4InstanceIntersector16Chunk));
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH4InstanceMBIntersector16Chunk));
 
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX512(features,BVH4InstanceArrayIntersector16Chunk));
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX512(features,BVH4InstanceArrayMBIntersector16Chunk));
+
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridIntersector16HybridMoeller));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridMBIntersector16HybridMoeller));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridIntersector16HybridPluecker));
 
-    /* select stream intersectors */
-    SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4IntersectorStreamPacketFallback);
-
-    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4IntersectorStreamMoeller));
-    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4IntersectorStreamMoellerNoFilter));
-    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersectorStreamMoeller));
-    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vIntersectorStreamPluecker));
-    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersectorStreamPluecker));
-
-    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamMoeller));
-    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamMoellerNoFilter));
-    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersectorStreamMoeller));
-    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamPluecker));
-    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersectorStreamPluecker));
-
-    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualIntersectorStream));
-    
-    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersectorStream));
-
 #endif
   }
 
@@ -509,7 +506,6 @@ namespace embree
       intersectors.intersector4  = BVH4OBBVirtualCurveIntersector4Hybrid();
       intersectors.intersector8  = BVH4OBBVirtualCurveIntersector8Hybrid();
       intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16Hybrid();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -523,7 +519,6 @@ namespace embree
       intersectors.intersector4  = BVH4OBBVirtualCurveIntersectorRobust4Hybrid();
       intersectors.intersector8  = BVH4OBBVirtualCurveIntersectorRobust8Hybrid();
       intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16Hybrid();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -545,7 +540,6 @@ namespace embree
       intersectors.intersector4  = BVH4OBBVirtualCurveIntersector4HybridMB();
       intersectors.intersector8  = BVH4OBBVirtualCurveIntersector8HybridMB();
       intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16HybridMB();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -559,7 +553,6 @@ namespace embree
       intersectors.intersector4  = BVH4OBBVirtualCurveIntersectorRobust4HybridMB();
       intersectors.intersector8  = BVH4OBBVirtualCurveIntersectorRobust8HybridMB();
       intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16HybridMB();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -581,8 +574,6 @@ namespace embree
     intersectors.intersector8_nofilter  = BVH4Triangle4Intersector8HybridMoellerNoFilter();
     intersectors.intersector16_filter   = BVH4Triangle4Intersector16HybridMoeller();
     intersectors.intersector16_nofilter = BVH4Triangle4Intersector16HybridMoellerNoFilter();
-    intersectors.intersectorN_filter    = BVH4Triangle4IntersectorStreamMoeller();
-    intersectors.intersectorN_nofilter  = BVH4Triangle4IntersectorStreamMoellerNoFilter();
 #endif
     return intersectors;
   }
@@ -597,7 +588,6 @@ namespace embree
     intersectors.intersector4  = BVH4Triangle4vIntersector4HybridPluecker();
     intersectors.intersector8  = BVH4Triangle4vIntersector8HybridPluecker();
     intersectors.intersector16 = BVH4Triangle4vIntersector16HybridPluecker();
-    intersectors.intersectorN  = BVH4Triangle4vIntersectorStreamPluecker();
 #endif
     return intersectors;
   }
@@ -614,7 +604,6 @@ namespace embree
       intersectors.intersector4  = BVH4Triangle4iIntersector4HybridMoeller();
       intersectors.intersector8  = BVH4Triangle4iIntersector8HybridMoeller();
       intersectors.intersector16 = BVH4Triangle4iIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH4Triangle4iIntersectorStreamMoeller();
 #endif
       return intersectors;
     }
@@ -627,7 +616,6 @@ namespace embree
       intersectors.intersector4  = BVH4Triangle4iIntersector4HybridPluecker();
       intersectors.intersector8  = BVH4Triangle4iIntersector8HybridPluecker();
       intersectors.intersector16 = BVH4Triangle4iIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH4Triangle4iIntersectorStreamPluecker();
 #endif
       return intersectors;
     }
@@ -647,7 +635,6 @@ namespace embree
       intersectors.intersector4  = BVH4Triangle4vMBIntersector4HybridMoeller();
       intersectors.intersector8  = BVH4Triangle4vMBIntersector8HybridMoeller();
       intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -660,7 +647,6 @@ namespace embree
       intersectors.intersector4  = BVH4Triangle4vMBIntersector4HybridPluecker();
       intersectors.intersector8  = BVH4Triangle4vMBIntersector8HybridPluecker();
       intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -680,7 +666,6 @@ namespace embree
       intersectors.intersector4  = BVH4Triangle4iMBIntersector4HybridMoeller();
       intersectors.intersector8  = BVH4Triangle4iMBIntersector8HybridMoeller();
       intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -693,7 +678,6 @@ namespace embree
       intersectors.intersector4  = BVH4Triangle4iMBIntersector4HybridPluecker();
       intersectors.intersector8  = BVH4Triangle4iMBIntersector8HybridPluecker();
       intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -716,8 +700,6 @@ namespace embree
       intersectors.intersector8_nofilter  = BVH4Quad4vIntersector8HybridMoellerNoFilter();
       intersectors.intersector16_filter   = BVH4Quad4vIntersector16HybridMoeller();
       intersectors.intersector16_nofilter = BVH4Quad4vIntersector16HybridMoellerNoFilter();
-      intersectors.intersectorN_filter    = BVH4Quad4vIntersectorStreamMoeller();
-      intersectors.intersectorN_nofilter  = BVH4Quad4vIntersectorStreamMoellerNoFilter();
 #endif
       return intersectors;
     }
@@ -730,7 +712,6 @@ namespace embree
       intersectors.intersector4  = BVH4Quad4vIntersector4HybridPluecker();
       intersectors.intersector8  = BVH4Quad4vIntersector8HybridPluecker();
       intersectors.intersector16 = BVH4Quad4vIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH4Quad4vIntersectorStreamPluecker();
 #endif
       return intersectors;
     }
@@ -750,7 +731,6 @@ namespace embree
       intersectors.intersector4 = BVH4Quad4iIntersector4HybridMoeller();
       intersectors.intersector8 = BVH4Quad4iIntersector8HybridMoeller();
       intersectors.intersector16= BVH4Quad4iIntersector16HybridMoeller();
-      intersectors.intersectorN = BVH4Quad4iIntersectorStreamMoeller();
 #endif
       return intersectors;
     }
@@ -763,7 +743,6 @@ namespace embree
       intersectors.intersector4 = BVH4Quad4iIntersector4HybridPluecker();
       intersectors.intersector8 = BVH4Quad4iIntersector8HybridPluecker();
       intersectors.intersector16= BVH4Quad4iIntersector16HybridPluecker();
-      intersectors.intersectorN = BVH4Quad4iIntersectorStreamPluecker();
 #endif
       return intersectors;
     }
@@ -783,7 +762,6 @@ namespace embree
       intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridMoeller();
       intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridMoeller();
       intersectors.intersector16= BVH4Quad4iMBIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -796,7 +774,6 @@ namespace embree
       intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridPluecker();
       intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridPluecker();
       intersectors.intersector16= BVH4Quad4iMBIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -829,7 +806,6 @@ namespace embree
     intersectors.intersector4  = BVH4VirtualIntersector4Chunk();
     intersectors.intersector8  = BVH4VirtualIntersector8Chunk();
     intersectors.intersector16 = BVH4VirtualIntersector16Chunk();
-    intersectors.intersectorN  = BVH4VirtualIntersectorStream();
 #endif
     intersectors.collider      = BVH4ColliderUserGeom();
     return intersectors;
@@ -844,7 +820,6 @@ namespace embree
     intersectors.intersector4  = BVH4VirtualMBIntersector4Chunk();
     intersectors.intersector8  = BVH4VirtualMBIntersector8Chunk();
     intersectors.intersector16 = BVH4VirtualMBIntersector16Chunk();
-    intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
     return intersectors;
   }
@@ -858,7 +833,6 @@ namespace embree
     intersectors.intersector4  = BVH4InstanceIntersector4Chunk();
     intersectors.intersector8  = BVH4InstanceIntersector8Chunk();
     intersectors.intersector16 = BVH4InstanceIntersector16Chunk();
-    intersectors.intersectorN  = BVH4InstanceIntersectorStream();
 #endif
     return intersectors;
   }
@@ -872,7 +846,32 @@ namespace embree
     intersectors.intersector4  = BVH4InstanceMBIntersector4Chunk();
     intersectors.intersector8  = BVH4InstanceMBIntersector8Chunk();
     intersectors.intersector16 = BVH4InstanceMBIntersector16Chunk();
-    intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4InstanceArrayIntersectors(BVH4* bvh) // returns the intersector table for an instance-array BVH4 (non-MB symbols)
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh; // the table refers back to the BVH passed in
+    intersectors.intersector1  = BVH4InstanceArrayIntersector1(); // single-ray slot is assigned unconditionally
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH4InstanceArrayIntersector4Chunk(); // 4/8/16-wide slots are assigned only when EMBREE_RAY_PACKETS is defined
+    intersectors.intersector8  = BVH4InstanceArrayIntersector8Chunk();
+    intersectors.intersector16 = BVH4InstanceArrayIntersector16Chunk();
+#endif // EMBREE_RAY_PACKETS
+    return intersectors; // no intersectorN slot is assigned by this function
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4InstanceArrayMBIntersectors(BVH4* bvh) // returns the intersector table for an instance-array BVH4 using the MB symbols
+  {
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh; // the table refers back to the BVH passed in
+    intersectors.intersector1  = BVH4InstanceArrayMBIntersector1(); // single-ray slot is assigned unconditionally
+#if defined (EMBREE_RAY_PACKETS)
+    intersectors.intersector4  = BVH4InstanceArrayMBIntersector4Chunk(); // 4/8/16-wide slots are assigned only when EMBREE_RAY_PACKETS is defined
+    intersectors.intersector8  = BVH4InstanceArrayMBIntersector8Chunk();
+    intersectors.intersector16 = BVH4InstanceArrayMBIntersector16Chunk();
 #endif
     return intersectors;
   }
@@ -886,7 +885,6 @@ namespace embree
     intersectors.intersector4  = BVH4SubdivPatch1Intersector4();
     intersectors.intersector8  = BVH4SubdivPatch1Intersector8();
     intersectors.intersector16 = BVH4SubdivPatch1Intersector16();
-    intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
     return intersectors;
   }
@@ -900,7 +898,6 @@ namespace embree
     intersectors.intersector4  = BVH4SubdivPatch1MBIntersector4();
     intersectors.intersector8  = BVH4SubdivPatch1MBIntersector8();
     intersectors.intersector16 = BVH4SubdivPatch1MBIntersector16();
-    intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
     return intersectors;
   }
@@ -1255,6 +1252,35 @@ namespace embree
     return new AccelInstance(accel,builder,intersectors);
   }
 
+  Accel* BVH4Factory::BVH4InstanceArray(Scene* scene, BuildVariant bvariant) // creates an AccelInstance bundling a new BVH4, a builder and the instance-array intersectors
+  {
+    BVH4* accel = new BVH4(InstanceArrayPrimitive::type,scene);
+    Accel::Intersectors intersectors = BVH4InstanceArrayIntersectors(accel);
+    auto gtype = Geometry::MTY_INSTANCE_ARRAY; // geometry type mask forwarded to every builder below
+
+    Builder* builder = nullptr;
+    if (scene->device->object_builder == "default") { // "default": the builder is picked from bvariant
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH4InstanceArraySceneBuilderSAH(accel,scene,gtype); break;
+      case BuildVariant::DYNAMIC     : builder = BVH4BuilderTwoLevelInstanceArraySAH(accel,scene,gtype,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break; // NOTE(review): if assert is compiled out, builder stays nullptr on this path - confirm callers never pass HIGH_QUALITY
+      }
+    }
+    else if (scene->device->object_builder == "sah") { builder = BVH4InstanceArraySceneBuilderSAH(accel,scene,gtype); } // explicit "sah": same builder as the STATIC case
+    else if (scene->device->object_builder == "dynamic") { builder = BVH4BuilderTwoLevelInstanceArraySAH(accel,scene,gtype,false); } // explicit "dynamic": same builder as the DYNAMIC case
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Object>"); // NOTE(review): text says BVH4<Object> in the instance-array factory - presumably refers to the object_builder setting; confirm
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4InstanceArrayMB(Scene* scene) // MB counterpart of BVH4InstanceArray: takes no build variant and no object_builder lookup
+  {
+    BVH4* accel = new BVH4(InstanceArrayPrimitive::type,scene);
+    Accel::Intersectors intersectors = BVH4InstanceArrayMBIntersectors(accel); // MB intersector table for the BVH created above
+    Builder* builder = BVH4InstanceArrayMBSceneBuilderSAH(accel,scene,Geometry::MTY_INSTANCE_ARRAY); // always the MB SAH scene builder
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
   Accel::Intersectors BVH4Factory::BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant)
   {
     Accel::Intersectors intersectors;
@@ -1266,7 +1292,6 @@ namespace embree
       intersectors.intersector4  = BVH4GridIntersector4HybridMoeller();
       intersectors.intersector8  = BVH4GridIntersector8HybridMoeller();
       intersectors.intersector16 = BVH4GridIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
     }
     else /* if (ivariant == IntersectVariant::ROBUST) */
@@ -1276,7 +1301,6 @@ namespace embree
       intersectors.intersector4  = BVH4GridIntersector4HybridPluecker();
       intersectors.intersector8  = BVH4GridIntersector8HybridPluecker();
       intersectors.intersector16 = BVH4GridIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif      
     }
     return intersectors;
@@ -1291,7 +1315,6 @@ namespace embree
     intersectors.intersector4  = BVH4GridMBIntersector4HybridMoeller();
     intersectors.intersector8  = BVH4GridMBIntersector8HybridMoeller();
     intersectors.intersector16 = BVH4GridMBIntersector16HybridMoeller();
-    intersectors.intersectorN  = BVH4IntersectorStreamPacketFallback();
 #endif
     return intersectors;
   }
diff --git a/thirdparty/embree/kernels/bvh/bvh4_factory.h b/thirdparty/embree/kernels/bvh/bvh4_factory.h
index 30973971a4..abf51dd108 100644
--- a/thirdparty/embree/kernels/bvh/bvh4_factory.h
+++ b/thirdparty/embree/kernels/bvh/bvh4_factory.h
@@ -48,6 +48,9 @@ namespace embree
     Accel* BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC);
     Accel* BVH4InstanceMB(Scene* scene, bool isExpensive);
 
+    Accel* BVH4InstanceArray(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH4InstanceArrayMB(Scene* scene);
+
     Accel* BVH4Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
     Accel* BVH4GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
 
@@ -77,7 +80,10 @@ namespace embree
 
     Accel::Intersectors BVH4InstanceIntersectors(BVH4* bvh);
     Accel::Intersectors BVH4InstanceMBIntersectors(BVH4* bvh);
-    
+
+    Accel::Intersectors BVH4InstanceArrayIntersectors(BVH4* bvh);
+    Accel::Intersectors BVH4InstanceArrayMBIntersectors(BVH4* bvh);
+
     Accel::Intersectors BVH4SubdivPatch1Intersectors(BVH4* bvh);
     Accel::Intersectors BVH4SubdivPatch1MBIntersectors(BVH4* bvh);
 
@@ -122,7 +128,10 @@ namespace embree
 
     DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1);
     DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1);
-        
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceArrayIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceArrayMBIntersector1);
+
     DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller);
     DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller);
     DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker);
@@ -161,6 +170,9 @@ namespace embree
     DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk);
     DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk);
 
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceArrayIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceArrayMBIntersector4Chunk);
+
     DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller);
     DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller);
     DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker);
@@ -201,6 +213,9 @@ namespace embree
     DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk);
     DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk);
 
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceArrayIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceArrayMBIntersector8Chunk);
+
     DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller);
     DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller);
     DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker);
@@ -241,30 +256,13 @@ namespace embree
     DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk);
     DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk);
 
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceArrayIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceArrayMBIntersector16Chunk);
+
     DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller);
     DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller);
     DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker);
 
-    // ==============
-
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4IntersectorStreamPacketFallback);
-
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoeller);
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoellerNoFilter);
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamMoeller);
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4vIntersectorStreamPluecker);
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamPluecker);
-
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoeller);
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoellerNoFilter);
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamMoeller);
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamPluecker);
-    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamPluecker);
-
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream);
-    
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream);
-       
     // SAH scene builders
   private:
     DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
@@ -294,6 +292,9 @@ namespace embree
     DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
     DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
 
+    DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceArraySceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceArrayMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
     DEFINE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
     DEFINE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
 
@@ -312,5 +313,6 @@ namespace embree
     DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
     DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
     DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceArraySAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
   };
 }
diff --git a/thirdparty/embree/kernels/bvh/bvh8_factory.cpp b/thirdparty/embree/kernels/bvh/bvh8_factory.cpp
index d4521af241..7c0f7565fa 100644
--- a/thirdparty/embree/kernels/bvh/bvh8_factory.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh8_factory.cpp
@@ -21,6 +21,7 @@
 #include "../geometry/subdivpatch1.h"
 #include "../geometry/object.h"
 #include "../geometry/instance.h"
+#include "../geometry/instance_array.h"
 #include "../geometry/subgrid.h"
 #include "../common/accelinstance.h"
 
@@ -66,6 +67,9 @@ namespace embree
   DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1);
   DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1);
 
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceArrayIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceArrayMBIntersector1);
+
   DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller);
   DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller);
   DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker);
@@ -101,6 +105,9 @@ namespace embree
   DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
   DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);
 
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceArrayIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceArrayMBIntersector4Chunk);
+
   DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
   DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);
 
@@ -135,6 +142,9 @@ namespace embree
   DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
   DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);
 
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceArrayIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceArrayMBIntersector8Chunk);
+
   DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
   DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);
 
@@ -169,27 +179,12 @@ namespace embree
   DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
   DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);
 
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceArrayIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceArrayMBIntersector16Chunk);
+
   DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
   DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
 
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);
-
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);
-
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);
-
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);
-
-  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);
-
   DECLARE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
 
@@ -212,6 +207,9 @@ namespace embree
   DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
   DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
 
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceArraySceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceArrayMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
   DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
@@ -224,6 +222,7 @@ namespace embree
   DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
   DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
   DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceArraySAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
 
   BVH8Factory::BVH8Factory(int bfeatures, int ifeatures)
   {
@@ -256,6 +255,9 @@ namespace embree
 
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceSceneBuilderSAH));
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceMBSceneBuilderSAH));
+
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceArraySceneBuilderSAH));
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceArrayMBSceneBuilderSAH));
     
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridSceneBuilderSAH));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridMBSceneBuilderSAH));
@@ -270,6 +272,7 @@ namespace embree
     IF_ENABLED_QUADS (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelQuadMeshSAH));
     IF_ENABLED_USER  (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelVirtualSAH));
     IF_ENABLED_INSTANCE (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelInstanceSAH));
+    IF_ENABLED_INSTANCE_ARRAY (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelInstanceArraySAH));
   }
 
   void BVH8Factory::selectIntersectors(int features)
@@ -313,6 +316,9 @@ namespace embree
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector1));
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector1));
 
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceArrayIntersector1));
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceArrayMBIntersector1));
+
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector1Moeller));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridMBIntersector1Moeller))
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector1Pluecker));
@@ -351,6 +357,9 @@ namespace embree
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector4Chunk));
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector4Chunk));
 
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceArrayIntersector4Chunk));
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceArrayMBIntersector4Chunk));
+
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector4HybridMoeller));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector4HybridPluecker));
 
@@ -386,6 +395,9 @@ namespace embree
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector8Chunk));
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector8Chunk));
 
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceArrayIntersector8Chunk));
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceArrayMBIntersector8Chunk));
+
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector8HybridMoeller));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector8HybridPluecker));
 
@@ -421,29 +433,12 @@ namespace embree
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH8InstanceIntersector16Chunk));
     IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH8InstanceMBIntersector16Chunk));
 
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX512(features,BVH8InstanceArrayIntersector16Chunk));
+    IF_ENABLED_INSTANCE_ARRAY(SELECT_SYMBOL_INIT_AVX512(features,BVH8InstanceArrayMBIntersector16Chunk));
+
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH8GridIntersector16HybridMoeller));
     IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH8GridIntersector16HybridPluecker));
 
-    /* select stream intersectors */
-
-    SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8IntersectorStreamPacketFallback);
-
-    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4IntersectorStreamMoeller));
-    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4IntersectorStreamMoellerNoFilter));
-    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersectorStreamMoeller));
-    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersectorStreamPluecker));
-    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersectorStreamPluecker));
-
-    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamMoeller));
-    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamMoellerNoFilter));
-    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersectorStreamMoeller));
-    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamPluecker));
-    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersectorStreamPluecker));
-
-    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersectorStream));
-
-    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersectorStream));
-
 #endif
   }
 
@@ -460,7 +455,6 @@ namespace embree
       intersectors.intersector4  = BVH8OBBVirtualCurveIntersector4Hybrid();
       intersectors.intersector8  = BVH8OBBVirtualCurveIntersector8Hybrid();
       intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16Hybrid();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -474,7 +468,6 @@ namespace embree
       intersectors.intersector4  = BVH8OBBVirtualCurveIntersectorRobust4Hybrid();
       intersectors.intersector8  = BVH8OBBVirtualCurveIntersectorRobust8Hybrid();
       intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16Hybrid();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -496,7 +489,6 @@ namespace embree
       intersectors.intersector4  = BVH8OBBVirtualCurveIntersector4HybridMB();
       intersectors.intersector8  = BVH8OBBVirtualCurveIntersector8HybridMB();
       intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16HybridMB();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -510,7 +502,6 @@ namespace embree
       intersectors.intersector4  = BVH8OBBVirtualCurveIntersectorRobust4HybridMB();
       intersectors.intersector8  = BVH8OBBVirtualCurveIntersectorRobust8HybridMB();
       intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16HybridMB();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -532,8 +523,6 @@ namespace embree
     intersectors.intersector8_nofilter  = BVH8Triangle4Intersector8HybridMoellerNoFilter();
     intersectors.intersector16_filter   = BVH8Triangle4Intersector16HybridMoeller();
     intersectors.intersector16_nofilter = BVH8Triangle4Intersector16HybridMoellerNoFilter();
-    intersectors.intersectorN_filter    = BVH8Triangle4IntersectorStreamMoeller();
-    intersectors.intersectorN_nofilter  = BVH8Triangle4IntersectorStreamMoellerNoFilter();
 #endif
     return intersectors;
   }
@@ -554,7 +543,6 @@ namespace embree
     intersectors.intersector4    = BVH8Triangle4vIntersector4HybridPluecker();
     intersectors.intersector8    = BVH8Triangle4vIntersector8HybridPluecker();
     intersectors.intersector16   = BVH8Triangle4vIntersector16HybridPluecker();
-    intersectors.intersectorN    = BVH8Triangle4vIntersectorStreamPluecker();
 #endif
     return intersectors;
   }
@@ -571,7 +559,6 @@ namespace embree
       intersectors.intersector4  = BVH8Triangle4iIntersector4HybridMoeller();
       intersectors.intersector8  = BVH8Triangle4iIntersector8HybridMoeller();
       intersectors.intersector16 = BVH8Triangle4iIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH8Triangle4iIntersectorStreamMoeller();
 #endif
       return intersectors;
     }
@@ -584,7 +571,6 @@ namespace embree
       intersectors.intersector4  = BVH8Triangle4iIntersector4HybridPluecker();
       intersectors.intersector8  = BVH8Triangle4iIntersector8HybridPluecker();
       intersectors.intersector16 = BVH8Triangle4iIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH8Triangle4iIntersectorStreamPluecker();
 #endif
       return intersectors;
     }
@@ -604,7 +590,6 @@ namespace embree
       intersectors.intersector4  = BVH8Triangle4vMBIntersector4HybridMoeller();
       intersectors.intersector8  = BVH8Triangle4vMBIntersector8HybridMoeller();
       intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -617,7 +602,6 @@ namespace embree
       intersectors.intersector4  = BVH8Triangle4vMBIntersector4HybridPluecker();
       intersectors.intersector8  = BVH8Triangle4vMBIntersector8HybridPluecker();
       intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -637,7 +621,6 @@ namespace embree
       intersectors.intersector4  = BVH8Triangle4iMBIntersector4HybridMoeller();
       intersectors.intersector8  = BVH8Triangle4iMBIntersector8HybridMoeller();
       intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -650,7 +633,6 @@ namespace embree
       intersectors.intersector4  = BVH8Triangle4iMBIntersector4HybridPluecker();
       intersectors.intersector8  = BVH8Triangle4iMBIntersector8HybridPluecker();
       intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -673,8 +655,6 @@ namespace embree
       intersectors.intersector8_nofilter  = BVH8Quad4vIntersector8HybridMoellerNoFilter();
       intersectors.intersector16_filter   = BVH8Quad4vIntersector16HybridMoeller();
       intersectors.intersector16_nofilter = BVH8Quad4vIntersector16HybridMoellerNoFilter();
-      intersectors.intersectorN_filter    = BVH8Quad4vIntersectorStreamMoeller();
-      intersectors.intersectorN_nofilter  = BVH8Quad4vIntersectorStreamMoellerNoFilter();
 #endif
       return intersectors;
     }
@@ -687,7 +667,6 @@ namespace embree
       intersectors.intersector4  = BVH8Quad4vIntersector4HybridPluecker();
       intersectors.intersector8  = BVH8Quad4vIntersector8HybridPluecker();
       intersectors.intersector16 = BVH8Quad4vIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH8Quad4vIntersectorStreamPluecker();
 #endif
       return intersectors;
     }
@@ -707,7 +686,6 @@ namespace embree
       intersectors.intersector4  = BVH8Quad4iIntersector4HybridMoeller();
       intersectors.intersector8  = BVH8Quad4iIntersector8HybridMoeller();
       intersectors.intersector16 = BVH8Quad4iIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH8Quad4iIntersectorStreamMoeller();
 #endif
       return intersectors;
     }
@@ -720,7 +698,6 @@ namespace embree
       intersectors.intersector4  = BVH8Quad4iIntersector4HybridPluecker();
       intersectors.intersector8  = BVH8Quad4iIntersector8HybridPluecker();
       intersectors.intersector16 = BVH8Quad4iIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH8Quad4iIntersectorStreamPluecker();
 #endif
       return intersectors;
     }
@@ -740,7 +717,6 @@ namespace embree
       intersectors.intersector4  = BVH8Quad4iMBIntersector4HybridMoeller();
       intersectors.intersector8  = BVH8Quad4iMBIntersector8HybridMoeller();
       intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -753,7 +729,6 @@ namespace embree
       intersectors.intersector4  = BVH8Quad4iMBIntersector4HybridPluecker();
       intersectors.intersector8  = BVH8Quad4iMBIntersector8HybridPluecker();
       intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
       return intersectors;
     }
@@ -794,7 +769,6 @@ namespace embree
     intersectors.intersector4  = BVH8VirtualIntersector4Chunk();
     intersectors.intersector8  = BVH8VirtualIntersector8Chunk();
     intersectors.intersector16 = BVH8VirtualIntersector16Chunk();
-    intersectors.intersectorN  = BVH8VirtualIntersectorStream();
 #endif
     intersectors.collider      = BVH8ColliderUserGeom();
     return intersectors;
@@ -809,7 +783,6 @@ namespace embree
     intersectors.intersector4  = BVH8VirtualMBIntersector4Chunk();
     intersectors.intersector8  = BVH8VirtualMBIntersector8Chunk();
     intersectors.intersector16 = BVH8VirtualMBIntersector16Chunk();
-    intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
     return intersectors;
   }
@@ -823,7 +796,19 @@ namespace embree
     intersectors.intersector4  = BVH8InstanceIntersector4Chunk();
     intersectors.intersector8  = BVH8InstanceIntersector8Chunk();
     intersectors.intersector16 = BVH8InstanceIntersector16Chunk();
-    intersectors.intersectorN  = BVH8InstanceIntersectorStream();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8InstanceArrayIntersectors(BVH8* bvh)
+  { // Collects the BVH8InstanceArray* intersector symbols into one Accel::Intersectors table for bvh.
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH8InstanceArrayIntersector1(); // assigned regardless of EMBREE_RAY_PACKETS
+#if defined (EMBREE_RAY_PACKETS) // the 4/8/16 entries below are assigned only when this macro is defined
+    intersectors.intersector4  = BVH8InstanceArrayIntersector4Chunk();
+    intersectors.intersector8  = BVH8InstanceArrayIntersector8Chunk();
+    intersectors.intersector16 = BVH8InstanceArrayIntersector16Chunk();
 #endif
     return intersectors;
   }
@@ -837,7 +822,19 @@ namespace embree
     intersectors.intersector4  = BVH8InstanceMBIntersector4Chunk();
     intersectors.intersector8  = BVH8InstanceMBIntersector8Chunk();
     intersectors.intersector16 = BVH8InstanceMBIntersector16Chunk();
-    intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+    return intersectors;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8InstanceArrayMBIntersectors(BVH8* bvh)
+  { // Collects the BVH8InstanceArrayMB* intersector symbols into one Accel::Intersectors table for bvh.
+    Accel::Intersectors intersectors;
+    intersectors.ptr = bvh;
+    intersectors.intersector1  = BVH8InstanceArrayMBIntersector1(); // assigned regardless of EMBREE_RAY_PACKETS
+#if defined (EMBREE_RAY_PACKETS) // the 4/8/16 entries below are assigned only when this macro is defined
+    intersectors.intersector4  = BVH8InstanceArrayMBIntersector4Chunk();
+    intersectors.intersector8  = BVH8InstanceArrayMBIntersector8Chunk();
+    intersectors.intersector16 = BVH8InstanceArrayMBIntersector16Chunk();
 #endif
     return intersectors;
   }
@@ -1086,6 +1083,28 @@ namespace embree
     return new AccelInstance(accel,builder,intersectors);
   }
 
+  Accel* BVH8Factory::BVH8InstanceArray(Scene* scene, BuildVariant bvariant)
+  { // Creates a BVH8 over InstanceArrayPrimitive leaves and pairs it with a builder and its intersector table.
+    BVH8* accel = new BVH8(InstanceArrayPrimitive::type,scene);
+    Accel::Intersectors intersectors = BVH8InstanceArrayIntersectors(accel);
+    auto gtype = Geometry::MTY_INSTANCE_ARRAY; // geometry type mask handed to every builder below
+    // The builder is chosen from scene->device->object_builder; "default" defers to bvariant.
+
+    Builder* builder = nullptr;
+    if (scene->device->object_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8InstanceArraySceneBuilderSAH(accel,scene,gtype); break;
+      case BuildVariant::DYNAMIC     : builder = BVH8BuilderTwoLevelInstanceArraySAH(accel,scene,gtype,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break; // NOTE(review): no builder is assigned here; builder stays nullptr if assert is compiled out
+      }
+    }
+    else if (scene->device->object_builder == "sah") builder = BVH8InstanceArraySceneBuilderSAH(accel,scene,gtype); // same builder as the STATIC case
+    else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelInstanceArraySAH(accel,scene,gtype,false); // same builder as the DYNAMIC case
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>"); // any other name is rejected
+
+    return new AccelInstance(accel,builder,intersectors); // bundle BVH, builder and intersectors into the returned Accel
+  }
+
   Accel* BVH8Factory::BVH8InstanceMB(Scene* scene, bool isExpensive)
   {
     BVH8* accel = new BVH8(InstancePrimitive::type,scene);
@@ -1095,6 +1114,15 @@ namespace embree
     return new AccelInstance(accel,builder,intersectors);
   }
 
+  Accel* BVH8Factory::BVH8InstanceArrayMB(Scene* scene)
+  { // Creates a BVH8 over InstanceArrayPrimitive leaves using the MB intersectors and MB SAH builder; object_builder is not consulted.
+    BVH8* accel = new BVH8(InstanceArrayPrimitive::type,scene);
+    Accel::Intersectors intersectors = BVH8InstanceArrayMBIntersectors(accel);
+    auto gtype = Geometry::MTY_INSTANCE_ARRAY; // geometry type mask handed to the builder
+    Builder* builder = BVH8InstanceArrayMBSceneBuilderSAH(accel,scene,gtype);
+    return new AccelInstance(accel,builder,intersectors); // bundle BVH, builder and intersectors into the returned Accel
+  }
+
   Accel::Intersectors BVH8Factory::BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant)
   {
     Accel::Intersectors intersectors;
@@ -1106,7 +1134,6 @@ namespace embree
       intersectors.intersector4  = BVH8GridIntersector4HybridMoeller();
       intersectors.intersector8  = BVH8GridIntersector8HybridMoeller();
       intersectors.intersector16 = BVH8GridIntersector16HybridMoeller();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif
     }
     else /* if (ivariant == IntersectVariant::ROBUST) */
@@ -1116,7 +1143,6 @@ namespace embree
       intersectors.intersector4  = BVH8GridIntersector4HybridPluecker();
       intersectors.intersector8  = BVH8GridIntersector8HybridPluecker();
       intersectors.intersector16 = BVH8GridIntersector16HybridPluecker();
-      intersectors.intersectorN  = BVH8IntersectorStreamPacketFallback();
 #endif            
     }
     return intersectors;
@@ -1131,7 +1157,6 @@ namespace embree
     intersectors.intersector4  = nullptr;
     intersectors.intersector8  = nullptr;
     intersectors.intersector16 = nullptr;
-    intersectors.intersectorN  = nullptr;
 #endif
     return intersectors;
   }
diff --git a/thirdparty/embree/kernels/bvh/bvh8_factory.h b/thirdparty/embree/kernels/bvh/bvh8_factory.h
index 198d6f1df0..e55310b703 100644
--- a/thirdparty/embree/kernels/bvh/bvh8_factory.h
+++ b/thirdparty/embree/kernels/bvh/bvh8_factory.h
@@ -39,6 +39,9 @@ namespace embree
     Accel* BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC);
     Accel* BVH8InstanceMB(Scene* scene, bool isExpensive);
 
+    Accel* BVH8InstanceArray(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH8InstanceArrayMB(Scene* scene);
+
     Accel* BVH8Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
     Accel* BVH8GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
 
@@ -70,6 +73,9 @@ namespace embree
     Accel::Intersectors BVH8InstanceIntersectors(BVH8* bvh);
     Accel::Intersectors BVH8InstanceMBIntersectors(BVH8* bvh);
 
+    Accel::Intersectors BVH8InstanceArrayIntersectors(BVH8* bvh);
+    Accel::Intersectors BVH8InstanceArrayMBIntersectors(BVH8* bvh);
+
     Accel::Intersectors BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant);
     Accel::Intersectors BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
 
@@ -111,6 +117,9 @@ namespace embree
     DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1);
     DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1);
 
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceArrayIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceArrayMBIntersector1);
+
     DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller);
     DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller);
     DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker);
@@ -145,7 +154,10 @@ namespace embree
 
     DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
     DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);
-    
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceArrayIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceArrayMBIntersector4Chunk);
+
     DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
     DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);
 
@@ -180,9 +192,12 @@ namespace embree
     DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
     DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);
 
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceArrayIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceArrayMBIntersector8Chunk);
+
     DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
     DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);
-   
+
     DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid);
     DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB);
     DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid);
@@ -213,27 +228,12 @@ namespace embree
 
     DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
     DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);
-   
-    DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
-    DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
-
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);
-
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);
 
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceArrayIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceArrayMBIntersector16Chunk);
 
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);
-    
-    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
 
     // SAH scene builders
   private:
@@ -258,6 +258,9 @@ namespace embree
 
     DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
     DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceArraySceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceArrayMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
 
     DEFINE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
     DEFINE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
@@ -276,5 +279,6 @@ namespace embree
     DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
     DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
     DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceArraySAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
   };
 }
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp
index 4a4d8d71df..7d6548cb2a 100644
--- a/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp
@@ -17,6 +17,7 @@
 #include "../geometry/quadi.h"
 #include "../geometry/object.h"
 #include "../geometry/instance.h"
+#include "../geometry/instance_array.h"
 
 #if defined(__64BIT__)
 #  define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform
@@ -399,6 +400,50 @@ namespace embree
       unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
     };
 
+    template<int N>
+    struct CreateMortonLeaf<N,InstanceArrayPrimitive>
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (InstanceArray* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+      {
+        vfloat4 lower(pos_inf);
+        vfloat4 upper(neg_inf);
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items <= 1);
+
+        /* allocate leaf node */
+        InstanceArrayPrimitive* accel = (InstanceArrayPrimitive*) alloc.malloc1(items*sizeof(InstanceArrayPrimitive),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,items);
+        const InstanceArray* instance = this->mesh;
+
+        BBox3fa bounds = empty;
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int primID = morton[start+i].index;
+          bounds.extend(instance->bounds(primID));
+          new (&accel[i]) InstanceArrayPrimitive(geomID_, primID);
+        }
+
+        BBox3fx box_o = (BBox3fx&)bounds;
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size();
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      InstanceArray* mesh;
+      BVHBuilderMorton::BuildPrim* morton;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+    };
+
     template<typename Mesh>
     struct CalculateMeshBounds
     {
@@ -523,7 +568,14 @@ namespace embree
 #if defined(EMBREE_GEOMETRY_INSTANCE)
     Builder* BVH4InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,Instance,InstancePrimitive>((BVH4*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); }
 #if defined(__AVX__)
-    Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); }    
+    Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+    Builder* BVH4InstanceArrayMeshBuilderMortonGeneral (void* bvh, InstanceArray* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,InstanceArray,InstanceArrayPrimitive>((BVH4*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); }
+#if defined(__AVX__)
+    Builder* BVH8InstanceArrayMeshBuilderMortonGeneral (void* bvh, InstanceArray* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,InstanceArray,InstanceArrayPrimitive>((BVH8*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); }
 #endif
 #endif
 
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp
index fad02fcc04..e20c088bba 100644
--- a/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp
@@ -15,6 +15,7 @@
 #include "../geometry/quadi.h"
 #include "../geometry/object.h"
 #include "../geometry/instance.h"
+#include "../geometry/instance_array.h"
 #include "../geometry/subgrid.h"
 
 #include "../common/state.h"
@@ -150,7 +151,7 @@ namespace embree
             const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive));
             bvh->alloc.init_estimate(node_bytes+leaf_bytes);
             settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
-            prims.resize(numPrimitives); 
+            prims.resize(numPrimitives);
 
             PrimInfo pinfo = mesh ?
               createPrimRefArray(mesh,geomID_,numPrimitives,prims,bvh->scene->progressInterface) :
@@ -518,14 +519,35 @@ namespace embree
 #endif
 
 #if defined(EMBREE_GEOMETRY_INSTANCE)
-    Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); }
+    Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) {
+      return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype);
+    }
     Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
       return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,gtype);
     }
 #if defined(__AVX__)
-    Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); }
+    Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) {
+      return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype);
+    }
     Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
-      return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,gtype);
+      return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,1,gtype);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+    Builder* BVH4InstanceArraySceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) {
+      return new BVHNBuilderSAH<4,InstanceArrayPrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype);
+    }
+    Builder* BVH4InstanceArrayMeshBuilderSAH (void* bvh, InstanceArray* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<4,InstanceArrayPrimitive>((BVH4*)bvh,mesh,geomID,4,1.0f,1,1,gtype);
+    }
+#if defined(__AVX__)
+    Builder* BVH8InstanceArraySceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) {
+      return new BVHNBuilderSAH<8,InstanceArrayPrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype);
+    }
+    Builder* BVH8InstanceArrayMeshBuilderSAH (void* bvh, InstanceArray* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<8,InstanceArrayPrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,1,gtype);
     }
 #endif
 #endif
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp
index d163a80ab1..0dcf98a5be 100644
--- a/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp
@@ -17,6 +17,7 @@
 #include "../geometry/quadi.h"
 #include "../geometry/object.h"
 #include "../geometry/instance.h"
+#include "../geometry/instance_array.h"
 #include "../geometry/subgrid.h"
 
 #include "../common/state.h"
@@ -695,6 +696,13 @@ namespace embree
 #endif
 #endif
 
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+    Builder* BVH4InstanceArrayMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<4,InstanceArray,InstanceArrayPrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); }
+#if defined(__AVX__)
+    Builder* BVH8InstanceArrayMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<8,InstanceArray,InstanceArrayPrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); }
+#endif
+#endif
+
 #if defined(EMBREE_GEOMETRY_GRID)
     Builder* BVH4GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4); }
 #if defined(__AVX__)
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp
index 5d45ed3748..990b1d59ad 100644
--- a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp
@@ -1,6 +1,10 @@
 // Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
+#if !defined(_CRT_SECURE_NO_WARNINGS)
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
 #include "bvh_builder_twolevel.h"
 #include "bvh_statistics.h"
 #include "../builders/bvh_builder_sah.h"
@@ -333,6 +337,12 @@ namespace embree
     }
 #endif
 
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+    Builder* BVH4BuilderTwoLevelInstanceArraySAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,InstanceArray,InstanceArrayPrimitive>((BVH4*)bvh,scene,gtype,useMortonBuilder);
+    }
+#endif
+
 #if defined(__AVX__)
 #if defined(EMBREE_GEOMETRY_TRIANGLE)
     Builder* BVH8BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
@@ -364,6 +374,12 @@ namespace embree
     }
 #endif
 
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+    Builder* BVH8BuilderTwoLevelInstanceArraySAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,InstanceArray,InstanceArrayPrimitive>((BVH8*)bvh,scene,gtype,useMortonBuilder);
+    }
+#endif
+
 #endif
   }
 }
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h
index dc7ec7d278..97ae41a87d 100644
--- a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h
@@ -7,7 +7,6 @@
 
 #include "bvh_builder_twolevel_internal.h"
 #include "bvh.h"
-#include "../common/primref.h"
 #include "../builders/priminfo.h"
 #include "../builders/primrefgen.h"
 
diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h
index 023b52b780..b28afffae9 100644
--- a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h
+++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h
@@ -11,6 +11,7 @@
 #include "../geometry/quadi.h"
 #include "../geometry/object.h"
 #include "../geometry/instance.h"
+#include "../geometry/instance_array.h"
 
 namespace embree
 {
@@ -32,6 +33,9 @@ namespace embree
   DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t)
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceArrayMeshBuilderMortonGeneral,void* COMMA InstanceArray* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceArrayMeshBuilderSAH,void* COMMA InstanceArray* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceArrayMeshRefitSAH,void* COMMA InstanceArray* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t)
   DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
@@ -50,6 +54,9 @@ namespace embree
   DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
   DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) 
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceArrayMeshBuilderMortonGeneral,void* COMMA InstanceArray* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceArrayMeshBuilderSAH,void* COMMA InstanceArray* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceArrayMeshRefitSAH,void* COMMA InstanceArray* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) 
   
   namespace isa
   {
@@ -89,6 +96,11 @@ namespace embree
         Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
       };
       template<>
+      struct MortonBuilder<4,InstanceArray,InstanceArrayPrimitive> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, InstanceArray* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceArrayMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
+      };
+      template<>
       struct MortonBuilder<8,TriangleMesh,Triangle4> {
         MortonBuilder () {}
         Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
@@ -118,6 +130,11 @@ namespace embree
         MortonBuilder () {}
         Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
       };
+      template<>
+      struct MortonBuilder<8,InstanceArray,InstanceArrayPrimitive> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, InstanceArray* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceArrayMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
+      };
 
       template<int N, typename Mesh, typename Primitive>
       struct SAHBuilder {};
@@ -152,6 +169,11 @@ namespace embree
         Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
       };
       template<>
+      struct SAHBuilder<4,InstanceArray,InstanceArrayPrimitive> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, InstanceArray* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceArrayMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
+      };
+      template<>
       struct SAHBuilder<8,TriangleMesh,Triangle4> {
         SAHBuilder () {}
         Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);}
@@ -181,6 +203,11 @@ namespace embree
         SAHBuilder () {}
         Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
       };
+      template<>
+      struct SAHBuilder<8,InstanceArray,InstanceArrayPrimitive> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, InstanceArray* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceArrayMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
+      };
 
       template<int N, typename Mesh, typename Primitive>
       struct RefitBuilder {};
@@ -215,6 +242,11 @@ namespace embree
         Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
       };
       template<>
+      struct RefitBuilder<4,InstanceArray,InstanceArrayPrimitive> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, InstanceArray* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceArrayMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
+      };
+      template<>
       struct RefitBuilder<8,TriangleMesh,Triangle4> {
         RefitBuilder () {}
         Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshRefitSAH(bvh,mesh,geomID,0);}
@@ -244,7 +276,12 @@ namespace embree
         RefitBuilder () {}
         Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
       };
-      
+      template<>
+      struct RefitBuilder<8,InstanceArray,InstanceArrayPrimitive> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, InstanceArray* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceArrayMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
+      };
+
       template<int N, typename Mesh, typename Primitive>
       struct MeshBuilder {
         MeshBuilder () {}
diff --git a/thirdparty/embree/kernels/bvh/bvh_collider.cpp b/thirdparty/embree/kernels/bvh/bvh_collider.cpp
index 9428c0b88e..a22d701827 100644
--- a/thirdparty/embree/kernels/bvh/bvh_collider.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_collider.cpp
@@ -2,7 +2,9 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "bvh_collider.h"
+
 #include "../geometry/triangle_triangle_intersector.h"
+#include "../../common/algorithms/parallel_for.h"
 
 namespace embree
 { 
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp
index 9594f402c3..1d797df88f 100644
--- a/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp
@@ -19,6 +19,7 @@
 #include "../geometry/subdivpatch1_intersector.h"
 #include "../geometry/object_intersector.h"
 #include "../geometry/instance_intersector.h"
+#include "../geometry/instance_array_intersector.h"
 #include "../geometry/subgrid_intersector.h"
 #include "../geometry/subgrid_mb_intersector.h"
 #include "../geometry/curve_intersector_virtual.h"
@@ -30,7 +31,7 @@ namespace embree
     template<int N, int types, bool robust, typename PrimitiveIntersector1>
     void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::intersect(const Accel::Intersectors* __restrict__ This,
                                                                               RayHit& __restrict__ ray,
-                                                                              IntersectContext* __restrict__ context)
+                                                                              RayQueryContext* __restrict__ context)
     {
       const BVH* __restrict__ bvh = (const BVH*)This->ptr;
       
@@ -115,7 +116,7 @@ namespace embree
     template<int N, int types, bool robust, typename PrimitiveIntersector1>
     void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::occluded(const Accel::Intersectors* __restrict__ This,
                                                                              Ray& __restrict__ ray,
-                                                                             IntersectContext* __restrict__ context)
+                                                                             RayQueryContext* __restrict__ context)
     {
       const BVH* __restrict__ bvh = (const BVH*)This->ptr;
       
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1.h b/thirdparty/embree/kernels/bvh/bvh_intersector1.h
index 2df3d6eddb..3cec4e113b 100644
--- a/thirdparty/embree/kernels/bvh/bvh_intersector1.h
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector1.h
@@ -26,8 +26,8 @@ namespace embree
       static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
 
     public:
-      static void intersect (const Accel::Intersectors* This, RayHit& ray, IntersectContext* context);
-      static void occluded  (const Accel::Intersectors* This, Ray& ray, IntersectContext* context);
+      static void intersect (const Accel::Intersectors* This, RayHit& ray, RayQueryContext* context);
+      static void occluded  (const Accel::Intersectors* This, Ray& ray, RayQueryContext* context);
       static bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context);
     };
   }
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp
index 831d613367..1baecc7bbc 100644
--- a/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp
@@ -48,6 +48,9 @@ namespace embree
     IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<InstanceIntersector1> >));
     IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<InstanceIntersector1MB> >));
 
+    IF_ENABLED_INSTANCE_ARRAY(DEFINE_INTERSECTOR1(BVH4InstanceArrayIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<InstanceArrayIntersector1> >));
+    IF_ENABLED_INSTANCE_ARRAY(DEFINE_INTERSECTOR1(BVH4InstanceArrayMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<InstanceArrayIntersector1MB> >));
+
     IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(QBVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<4 COMMA true> > >));
     IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(QBVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >));
 
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
index 1d393fd06b..e9e2262984 100644
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
@@ -20,6 +20,7 @@
 #include "../geometry/subdivpatch1_intersector.h"
 #include "../geometry/object_intersector.h"
 #include "../geometry/instance_intersector.h"
+#include "../geometry/instance_array_intersector.h"
 #include "../geometry/subgrid_intersector.h"
 #include "../geometry/subgrid_mb_intersector.h"
 #include "../geometry/curve_intersector_virtual.h"
@@ -41,7 +42,7 @@ namespace embree
                                                                                                 Precalculations& pre,
                                                                                                 RayHitK<K>& ray,
                                                                                                 const TravRayK<K, robust>& tray,
-                                                                                                IntersectContext* context)
+                                                                                                RayQueryContext* context)
     {
       /* stack state */
       StackItemT<NodeRef> stack[stackSizeSingle];  // stack of nodes
@@ -105,7 +106,7 @@ namespace embree
     void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersect(vint<K>* __restrict__ valid_i,
                                                                                                Accel::Intersectors* __restrict__ This,
                                                                                                RayHitK<K>& __restrict__ ray,
-                                                                                               IntersectContext* __restrict__ context)
+                                                                                               RayQueryContext* __restrict__ context)
     {
       BVH* __restrict__ bvh = (BVH*)This->ptr;
       
@@ -373,7 +374,7 @@ namespace embree
     void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersectCoherent(vint<K>* __restrict__ valid_i,
                                                                                                        Accel::Intersectors* __restrict__ This,
                                                                                                        RayHitK<K>& __restrict__ ray,
-                                                                                                       IntersectContext* context)
+                                                                                                       RayQueryContext* context)
     {
       BVH* __restrict__ bvh = (BVH*)This->ptr;
       
@@ -539,7 +540,7 @@ namespace embree
                                                                                                Precalculations& pre,
                                                                                                RayK<K>& ray,
                                                                                                const TravRayK<K, robust>& tray,
-                                                                                               IntersectContext* context)
+                                                                                               RayQueryContext* context)
       {
         /* stack state */
         NodeRef stack[stackSizeSingle];  // stack of nodes that still need to get traversed
@@ -599,7 +600,7 @@ namespace embree
     void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occluded(vint<K>* __restrict__ valid_i,
                                                                                               Accel::Intersectors* __restrict__ This,
                                                                                               RayK<K>& __restrict__ ray,
-                                                                                              IntersectContext* context)
+                                                                                              RayQueryContext* context)
     {
       BVH* __restrict__ bvh = (BVH*)This->ptr;
       
@@ -784,7 +785,7 @@ namespace embree
     void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occludedCoherent(vint<K>* __restrict__ valid_i,
                                                                                                       Accel::Intersectors* __restrict__ This,
                                                                                                       RayK<K>& __restrict__ ray,
-                                                                                                      IntersectContext* context)
+                                                                                                      RayQueryContext* context)
     {
       BVH* __restrict__ bvh = (BVH*)This->ptr;
       
@@ -861,7 +862,7 @@ namespace embree
 #if defined(__AVX__)
             //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1);
 #endif
-            size_t num_child_hits = 0;
+            //size_t num_child_hits = 0;
             do {
               const size_t i = bscf(m_frustum_node);
               vfloat<K> lnearP;
@@ -875,7 +876,7 @@ namespace embree
                 assert(child != BVH::emptyNode);
                 BVHN<N>::prefetch(child);
                 if (likely(cur != BVH::emptyNode)) {
-                  num_child_hits++;
+                  //num_child_hits++;
                   stackPtr->ptr  = cur;
                   stackPtr->mask = m_active;
                   stackPtr++;
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h
index 50ebf375c4..1240bd5ab5 100644
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h
@@ -38,16 +38,16 @@ namespace embree
 
     private:
       static void intersect1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre,
-                             RayHitK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context);
+                             RayHitK<K>& ray, const TravRayK<K, robust>& tray, RayQueryContext* context);
       static bool occluded1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre,
-                            RayK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context);
+                            RayK<K>& ray, const TravRayK<K, robust>& tray, RayQueryContext* context);
 
     public:
-      static void intersect(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context);
-      static void occluded (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context);
+      static void intersect(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, RayQueryContext* context);
+      static void occluded (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, RayQueryContext* context);
 
-      static void intersectCoherent(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context);
-      static void occludedCoherent (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context);
+      static void intersectCoherent(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, RayQueryContext* context);
+      static void occludedCoherent (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, RayQueryContext* context);
 
     };
 
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp
index 2137da6a25..72cefa9e8e 100644
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp
@@ -48,6 +48,9 @@ namespace embree
     IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorK<4>> >));
     IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorKMB<4>> >));
 
+    IF_ENABLED_INSTANCE_ARRAY(DEFINE_INTERSECTOR4(BVH4InstanceArrayIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceArrayIntersectorK<4>> >));
+    IF_ENABLED_INSTANCE_ARRAY(DEFINE_INTERSECTOR4(BVH4InstanceArrayMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceArrayIntersectorKMB<4>> >));
+
     IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >));
     //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >));
     
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp
deleted file mode 100644
index 4a74d8468d..0000000000
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp
+++ /dev/null
@@ -1,528 +0,0 @@
-// Copyright 2009-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "bvh_intersector_stream.h"
-
-#include "../geometry/intersector_iterators.h"
-#include "../geometry/triangle_intersector.h"
-#include "../geometry/trianglev_intersector.h"
-#include "../geometry/trianglev_mb_intersector.h"
-#include "../geometry/trianglei_intersector.h"
-#include "../geometry/quadv_intersector.h"
-#include "../geometry/quadi_intersector.h"
-#include "../geometry/linei_intersector.h"
-#include "../geometry/subdivpatch1_intersector.h"
-#include "../geometry/object_intersector.h"
-#include "../geometry/instance_intersector.h"
-
-#include "../common/scene.h"
-#include <bitset>
-
-namespace embree
-{
-  namespace isa
-  {
-    __aligned(64) static const int shiftTable[32] = { 
-      (int)1 << 0, (int)1 << 1, (int)1 << 2, (int)1 << 3, (int)1 << 4, (int)1 << 5, (int)1 << 6, (int)1 << 7,  
-      (int)1 << 8, (int)1 << 9, (int)1 << 10, (int)1 << 11, (int)1 << 12, (int)1 << 13, (int)1 << 14, (int)1 << 15,  
-      (int)1 << 16, (int)1 << 17, (int)1 << 18, (int)1 << 19, (int)1 << 20, (int)1 << 21, (int)1 << 22, (int)1 << 23,  
-      (int)1 << 24, (int)1 << 25, (int)1 << 26, (int)1 << 27, (int)1 << 28, (int)1 << 29, (int)1 << 30, (int)1 << 31
-    };
-
-    template<int N, int types, bool robust, typename PrimitiveIntersector>
-    __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::intersect(Accel::Intersectors* __restrict__ This,
-                                                                                                    RayHitN** inputPackets,
-                                                                                                    size_t numOctantRays,
-                                                                                                    IntersectContext* context)
-    {
-      /* we may traverse an empty BVH in case all geometry was invalid */
-      BVH* __restrict__ bvh = (BVH*) This->ptr;
-      if (bvh->root == BVH::emptyNode)
-        return;
-      
-      // Only the coherent code path is implemented
-      assert(context->isCoherent());
-      intersectCoherent(This, (RayHitK<VSIZEL>**)inputPackets, numOctantRays, context);
-    }
-
-    template<int N, int types, bool robust, typename PrimitiveIntersector>
-    template<int K>
-    __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::intersectCoherent(Accel::Intersectors* __restrict__ This,
-                                                                                                            RayHitK<K>** inputPackets,
-                                                                                                            size_t numOctantRays,
-                                                                                                            IntersectContext* context)
-    {
-      assert(context->isCoherent());
-
-      BVH* __restrict__ bvh = (BVH*) This->ptr;
-      __aligned(64) StackItemMaskCoherent stack[stackSizeSingle];  // stack of nodes
-      assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE);
-
-      __aligned(64) TravRayKStream<K, robust> packets[MAX_INTERNAL_STREAM_SIZE/K];
-      __aligned(64) Frustum<robust> frustum;
-
-      bool commonOctant = true;
-      const size_t m_active = initPacketsAndFrustum((RayK<K>**)inputPackets, numOctantRays, packets, frustum, commonOctant);
-      if (unlikely(m_active == 0)) return;
-
-      /* case of non-common origin */
-      if (unlikely(!commonOctant))
-      {
-        const size_t numPackets = (numOctantRays+K-1)/K; 
-        for (size_t i = 0; i < numPackets; i++)
-          This->intersect(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context);
-        return;
-      }
-
-      stack[0].mask   = m_active;
-      stack[0].parent = 0;
-      stack[0].child  = bvh->root;
-
-      ///////////////////////////////////////////////////////////////////////////////////
-      ///////////////////////////////////////////////////////////////////////////////////
-      ///////////////////////////////////////////////////////////////////////////////////
-
-      StackItemMaskCoherent* stackPtr = stack + 1;
-
-      while (1) pop:
-      {
-        if (unlikely(stackPtr == stack)) break;
-
-        STAT3(normal.trav_stack_pop,1,1,1);
-        stackPtr--;
-        /*! pop next node */
-        NodeRef cur = NodeRef(stackPtr->child);
-        size_t m_trav_active = stackPtr->mask;
-        assert(m_trav_active);
-        NodeRef parent = stackPtr->parent;
-
-        while (1)
-        {
-          if (unlikely(cur.isLeaf())) break;
-          const AABBNode* __restrict__ const node = cur.getAABBNode();
-          parent = cur;
-
-          __aligned(64) size_t maskK[N];
-          for (size_t i = 0; i < N; i++)
-            maskK[i] = m_trav_active;
-          vfloat<N> dist;
-          const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist);
-          if (unlikely(m_node_hit == 0)) goto pop;
-
-          BVHNNodeTraverserStreamHitCoherent<N, types>::traverseClosestHit(cur, m_trav_active, vbool<N>((int)m_node_hit), dist, (size_t*)maskK, stackPtr);
-          assert(m_trav_active);
-        }
-
-        /* non-root and leaf => full culling test for all rays */
-        if (unlikely(parent != 0 && cur.isLeaf()))
-        {
-          const AABBNode* __restrict__ const node = parent.getAABBNode();
-          size_t boxID = 0xff;
-          for (size_t i = 0; i < N; i++)
-            if (node->child(i) == cur) { boxID = i; break; }
-          assert(boxID < N);
-          assert(cur == node->child(boxID));
-          m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf);
-        }
-
-        /*! this is a leaf node */
-        assert(cur != BVH::emptyNode);
-        STAT3(normal.trav_leaves, 1, 1, 1);
-        size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num);
-
-        size_t bits = m_trav_active;
-
-        /*! intersect stream of rays with all primitives */
-        size_t lazy_node = 0;
-#if defined(__SSE4_2__)
-        STAT_USER(1,(popcnt(bits)+K-1)/K*4);
-#endif
-        while(bits)
-        {
-          size_t i = bsf(bits) / K;
-          const size_t m_isec = ((((size_t)1 << K)-1) << (i*K));
-          assert(m_isec & bits);
-          bits &= ~m_isec;
-
-          TravRayKStream<K, robust>& p = packets[i];
-          vbool<K> m_valid = p.tnear <= p.tfar;
-          PrimitiveIntersectorK<K>::intersectK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node);
-          p.tfar = min(p.tfar, inputPackets[i]->tfar);
-        };
-
-      } // traversal + intersection
-    }
-
-    template<int N, int types, bool robust, typename PrimitiveIntersector>
-    __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occluded(Accel::Intersectors* __restrict__ This,
-                                                                                                   RayN** inputPackets,
-                                                                                                   size_t numOctantRays,
-                                                                                                   IntersectContext* context)
-    {
-      /* we may traverse an empty BVH in case all geometry was invalid */
-      BVH* __restrict__ bvh = (BVH*) This->ptr;
-      if (bvh->root == BVH::emptyNode)
-        return;
-      
-      if (unlikely(context->isCoherent()))
-        occludedCoherent(This, (RayK<VSIZEL>**)inputPackets, numOctantRays, context);
-      else
-        occludedIncoherent(This, (RayK<VSIZEX>**)inputPackets, numOctantRays, context);
-    }
-
-    template<int N, int types, bool robust, typename PrimitiveIntersector>
-    template<int K>
-    __noinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occludedCoherent(Accel::Intersectors* __restrict__ This,
-                                                                                                        RayK<K>** inputPackets,
-                                                                                                        size_t numOctantRays,
-                                                                                                        IntersectContext* context)
-    {
-      assert(context->isCoherent());
-
-      BVH* __restrict__ bvh = (BVH*)This->ptr;
-      __aligned(64) StackItemMaskCoherent stack[stackSizeSingle];  // stack of nodes
-      assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE);
-
-      /* inactive rays should have been filtered out before */
-      __aligned(64) TravRayKStream<K, robust> packets[MAX_INTERNAL_STREAM_SIZE/K];
-      __aligned(64) Frustum<robust> frustum;
-
-      bool commonOctant = true;
-      size_t m_active = initPacketsAndFrustum(inputPackets, numOctantRays, packets, frustum, commonOctant);
-
-      /* valid rays */
-      if (unlikely(m_active == 0)) return;
-
-      /* case of non-common origin */
-      if (unlikely(!commonOctant))
-      {
-        const size_t numPackets = (numOctantRays+K-1)/K; 
-        for (size_t i = 0; i < numPackets; i++)
-          This->occluded(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context);
-        return;
-      }
-
-      stack[0].mask   = m_active;
-      stack[0].parent = 0;
-      stack[0].child  = bvh->root;
-
-      ///////////////////////////////////////////////////////////////////////////////////
-      ///////////////////////////////////////////////////////////////////////////////////
-      ///////////////////////////////////////////////////////////////////////////////////
-
-      StackItemMaskCoherent* stackPtr = stack + 1;
-
-      while (1) pop:
-      {
-        if (unlikely(stackPtr == stack)) break;
-
-        STAT3(normal.trav_stack_pop,1,1,1);
-        stackPtr--;
-        /*! pop next node */
-        NodeRef cur = NodeRef(stackPtr->child);
-        size_t m_trav_active = stackPtr->mask & m_active;
-        if (unlikely(!m_trav_active)) continue;
-        assert(m_trav_active);
-        NodeRef parent = stackPtr->parent;
-
-        while (1)
-        {
-          if (unlikely(cur.isLeaf())) break;
-          const AABBNode* __restrict__ const node = cur.getAABBNode();
-          parent = cur;
-
-          __aligned(64) size_t maskK[N];
-          for (size_t i = 0; i < N; i++)
-            maskK[i] = m_trav_active;
-
-          vfloat<N> dist;
-          const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist);
-          if (unlikely(m_node_hit == 0)) goto pop;
-
-          BVHNNodeTraverserStreamHitCoherent<N, types>::traverseAnyHit(cur, m_trav_active, vbool<N>((int)m_node_hit), (size_t*)maskK, stackPtr);
-          assert(m_trav_active);
-        }
-
-        /* non-root and leaf => full culling test for all rays */
-        if (unlikely(parent != 0 && cur.isLeaf()))
-        {
-          const AABBNode* __restrict__ const node = parent.getAABBNode();
-          size_t boxID = 0xff;
-          for (size_t i = 0; i < N; i++)
-            if (node->child(i) == cur) { boxID = i; break; }
-          assert(boxID < N);
-          assert(cur == node->child(boxID));
-          m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf);
-        }
-
-        /*! this is a leaf node */
-        assert(cur != BVH::emptyNode);
-        STAT3(normal.trav_leaves, 1, 1, 1);
-        size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num);
-
-        size_t bits = m_trav_active & m_active;
-        /*! intersect stream of rays with all primitives */
-        size_t lazy_node = 0;
-#if defined(__SSE4_2__)
-        STAT_USER(1,(popcnt(bits)+K-1)/K*4);
-#endif
-        while (bits)
-        {
-          size_t i = bsf(bits) / K;
-          const size_t m_isec = ((((size_t)1 << K)-1) << (i*K));
-          assert(m_isec & bits);
-          bits &= ~m_isec;
-          TravRayKStream<K, robust>& p = packets[i];
-          vbool<K> m_valid = p.tnear <= p.tfar;
-          vbool<K> m_hit = PrimitiveIntersectorK<K>::occludedK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node);
-          inputPackets[i]->tfar = select(m_hit & m_valid, vfloat<K>(neg_inf), inputPackets[i]->tfar);
-          m_active &= ~((size_t)movemask(m_hit) << (i*K));
-        }
-
-      } // traversal + intersection
-    }
-
-
-    template<int N, int types, bool robust, typename PrimitiveIntersector>
-    template<int K>
-    __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occludedIncoherent(Accel::Intersectors* __restrict__ This,
-                                                                                                             RayK<K>** inputPackets,
-                                                                                                             size_t numOctantRays,
-                                                                                                             IntersectContext* context)
-    {
-      assert(!context->isCoherent());
-      assert(types & BVH_FLAG_ALIGNED_NODE);
-
-      __aligned(64) TravRayKStream<K,robust> packet[MAX_INTERNAL_STREAM_SIZE/K];
-
-      assert(numOctantRays <= 32);
-      const size_t numPackets = (numOctantRays+K-1)/K;
-      size_t m_active = 0;
-      for (size_t i = 0; i < numPackets; i++)
-      {
-        const vfloat<K> tnear = inputPackets[i]->tnear();
-        const vfloat<K> tfar  = inputPackets[i]->tfar;
-        vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f);
-        m_active |= (size_t)movemask(m_valid) << (K*i);
-        const Vec3vf<K>& org = inputPackets[i]->org;
-        const Vec3vf<K>& dir = inputPackets[i]->dir;
-        vfloat<K> packet_min_dist = max(tnear, 0.0f);
-        vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf);
-        new (&packet[i]) TravRayKStream<K,robust>(org, dir, packet_min_dist, packet_max_dist);
-      }
-
-      BVH* __restrict__ bvh = (BVH*)This->ptr;
-
-      StackItemMaskT<NodeRef> stack[stackSizeSingle]; // stack of nodes
-      StackItemMaskT<NodeRef>* stackPtr = stack + 1;  // current stack pointer
-      stack[0].ptr = bvh->root;
-      stack[0].mask = m_active;
-
-      size_t terminated = ~m_active;
-
-      /* near/far offsets based on first ray */
-      const NearFarPrecalculations nf(Vec3fa(packet[0].rdir.x[0], packet[0].rdir.y[0], packet[0].rdir.z[0]), N);
-
-      while (1) pop:
-      {
-        if (unlikely(stackPtr == stack)) break;
-        STAT3(shadow.trav_stack_pop,1,1,1);
-        stackPtr--;
-        NodeRef cur = NodeRef(stackPtr->ptr);
-        size_t cur_mask = stackPtr->mask & (~terminated);
-        if (unlikely(cur_mask == 0)) continue;
-
-        while (true)
-        {
-          /*! stop if we found a leaf node */
-          if (unlikely(cur.isLeaf())) break;
-          const AABBNode* __restrict__ const node = cur.getAABBNode();
-
-          const vint<N> vmask = traverseIncoherentStream(cur_mask, packet, node, nf, shiftTable);
-
-          size_t mask = movemask(vmask != vint<N>(zero));
-          if (unlikely(mask == 0)) goto pop;
-
-          __aligned(64) unsigned int child_mask[N];
-          vint<N>::storeu(child_mask, vmask); // this explicit store here causes much better code generation
-          
-          /*! one child is hit, continue with that child */
-          size_t r = bscf(mask);
-          assert(r < N);
-          cur = node->child(r);         
-          BVHN<N>::prefetch(cur,types);
-          cur_mask = child_mask[r];
-
-          /* simple in order sequence */
-          assert(cur != BVH::emptyNode);
-          if (likely(mask == 0)) continue;
-          stackPtr->ptr  = cur;
-          stackPtr->mask = cur_mask;
-          stackPtr++;
-
-          for (; ;)
-          {
-            r = bscf(mask);
-            assert(r < N);
-
-            cur = node->child(r);          
-            BVHN<N>::prefetch(cur,types);
-            cur_mask = child_mask[r];            
-            assert(cur != BVH::emptyNode);
-            if (likely(mask == 0)) break;
-            stackPtr->ptr  = cur;
-            stackPtr->mask = cur_mask;
-            stackPtr++;
-          }
-        }
-        
-        /*! this is a leaf node */
-        assert(cur != BVH::emptyNode);
-        STAT3(shadow.trav_leaves,1,1,1);
-        size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num);
-
-        size_t bits = cur_mask;
-        size_t lazy_node = 0;
-
-        for (; bits != 0;)
-        {
-          const size_t rayID = bscf(bits);
-
-          RayK<K> &ray = *inputPackets[rayID / K];
-          const size_t k = rayID % K;
-          if (PrimitiveIntersectorK<K>::occluded(This, ray, k, context, prim, num, lazy_node))
-          {
-            ray.tfar[k] = neg_inf;
-            terminated |= (size_t)1 << rayID;
-          }
-
-          /* lazy node */
-          if (unlikely(lazy_node))
-          {
-            stackPtr->ptr = lazy_node;
-            stackPtr->mask = cur_mask;
-            stackPtr++;
-          }
-        }
-
-        if (unlikely(terminated == (size_t)-1)) break;
-      }
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// ArrayIntersectorKStream Definitions
-    ////////////////////////////////////////////////////////////////////////////////
-
-    template<bool filter>
-    struct Triangle4IntersectorStreamMoeller {
-      template<int K> using Type = ArrayIntersectorKStream<K,TriangleMIntersectorKMoeller<4 COMMA K COMMA true>>;
-    };
-
-    template<bool filter>
-    struct Triangle4vIntersectorStreamPluecker {
-      template<int K> using Type = ArrayIntersectorKStream<K,TriangleMvIntersectorKPluecker<4 COMMA K COMMA true>>;
-    };
-
-    template<bool filter>
-    struct Triangle4iIntersectorStreamMoeller {
-      template<int K> using Type = ArrayIntersectorKStream<K,TriangleMiIntersectorKMoeller<4 COMMA K COMMA true>>;
-    };
-
-    template<bool filter>
-    struct Triangle4iIntersectorStreamPluecker {
-      template<int K> using Type = ArrayIntersectorKStream<K,TriangleMiIntersectorKPluecker<4 COMMA K COMMA true>>;
-    };
-
-    template<bool filter>
-    struct Quad4vIntersectorStreamMoeller {
-      template<int K> using Type = ArrayIntersectorKStream<K,QuadMvIntersectorKMoeller<4 COMMA K COMMA true>>;
-    };
-
-    template<bool filter>
-    struct Quad4iIntersectorStreamMoeller {
-      template<int K> using Type = ArrayIntersectorKStream<K,QuadMiIntersectorKMoeller<4 COMMA K COMMA true>>;
-    };
-
-    template<bool filter>
-    struct Quad4vIntersectorStreamPluecker {
-      template<int K> using Type = ArrayIntersectorKStream<K,QuadMvIntersectorKPluecker<4 COMMA K COMMA true>>;
-    };
-
-    template<bool filter>
-    struct Quad4iIntersectorStreamPluecker {
-      template<int K> using Type = ArrayIntersectorKStream<K,QuadMiIntersectorKPluecker<4 COMMA K COMMA true>>;
-    };
-
-    struct ObjectIntersectorStream {
-      template<int K> using Type = ArrayIntersectorKStream<K,ObjectIntersectorK<K COMMA false>>;
-    };
-
-    struct InstanceIntersectorStream {
-      template<int K> using Type = ArrayIntersectorKStream<K,InstanceIntersectorK<K>>;
-    };
-
-    // =====================================================================================================
-    // =====================================================================================================
-    // =====================================================================================================
-
-    template<int N>
-    void BVHNIntersectorStreamPacketFallback<N>::intersect(Accel::Intersectors* __restrict__ This,
-                                                               RayHitN** inputRays,
-                                                               size_t numTotalRays,
-                                                               IntersectContext* context)
-    {
-      if (unlikely(context->isCoherent()))
-        intersectK(This, (RayHitK<VSIZEL>**)inputRays, numTotalRays, context);
-      else
-        intersectK(This, (RayHitK<VSIZEX>**)inputRays, numTotalRays, context);
-    }
-
-    template<int N>
-    void BVHNIntersectorStreamPacketFallback<N>::occluded(Accel::Intersectors* __restrict__ This,
-                                                              RayN** inputRays,
-                                                              size_t numTotalRays,
-                                                              IntersectContext* context)
-    {
-      if (unlikely(context->isCoherent()))
-        occludedK(This, (RayK<VSIZEL>**)inputRays, numTotalRays, context);
-      else
-        occludedK(This, (RayK<VSIZEX>**)inputRays, numTotalRays, context);
-    }
-
-    template<int N>
-    template<int K>
-    __noinline void BVHNIntersectorStreamPacketFallback<N>::intersectK(Accel::Intersectors* __restrict__ This,
-                                                                              RayHitK<K>** inputRays,
-                                                                              size_t numTotalRays,
-                                                                              IntersectContext* context)
-    {
-      /* fallback to packets */
-      for (size_t i = 0; i < numTotalRays; i += K)
-      {
-        const vint<K> vi = vint<K>(int(i)) + vint<K>(step);
-        vbool<K> valid = vi < vint<K>(int(numTotalRays));
-        RayHitK<K>& ray = *(inputRays[i / K]);
-        valid &= ray.tnear() <= ray.tfar;
-        This->intersect(valid, ray, context);
-      }
-    }
-
-    template<int N>
-    template<int K>
-    __noinline void BVHNIntersectorStreamPacketFallback<N>::occludedK(Accel::Intersectors* __restrict__ This,
-                                                                             RayK<K>** inputRays,
-                                                                             size_t numTotalRays,
-                                                                             IntersectContext* context)
-    {
-      /* fallback to packets */
-      for (size_t i = 0; i < numTotalRays; i += K)
-      {
-        const vint<K> vi = vint<K>(int(i)) + vint<K>(step);
-        vbool<K> valid = vi < vint<K>(int(numTotalRays));
-        RayK<K>& ray = *(inputRays[i / K]);
-        valid &= ray.tnear() <= ray.tfar;
-        This->occluded(valid, ray, context);
-      }
-    }
-  }
-}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h
deleted file mode 100644
index c7e040fadb..0000000000
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h
+++ /dev/null
@@ -1,281 +0,0 @@
-// Copyright 2009-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "node_intersector_packet_stream.h"
-#include "node_intersector_frustum.h"
-#include "bvh_traverser_stream.h"
-
-namespace embree
-{
-  namespace isa 
-  {
-    /*! BVH ray stream intersector. */
-    template<int N, int types, bool robust, typename PrimitiveIntersector>
-    class BVHNIntersectorStream
-    {
-      /* shortcuts for frequently used types */
-      template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>;
-      template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK;
-      typedef BVHN<N> BVH;
-      typedef typename BVH::NodeRef NodeRef;
-      typedef typename BVH::BaseNode BaseNode;
-      typedef typename BVH::AABBNode AABBNode;
-      typedef typename BVH::AABBNodeMB AABBNodeMB;
-
-      template<int K>
-      __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays,
-                                                        TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant)
-      {
-        const size_t numPackets = (numOctantRays+K-1)/K;
-
-        Vec3vf<K> tmp_min_rdir(pos_inf);
-        Vec3vf<K> tmp_max_rdir(neg_inf);
-        Vec3vf<K> tmp_min_org(pos_inf);
-        Vec3vf<K> tmp_max_org(neg_inf);
-        vfloat<K> tmp_min_dist(pos_inf);
-        vfloat<K> tmp_max_dist(neg_inf);
-
-        size_t m_active = 0;
-        for (size_t i = 0; i < numPackets; i++)
-        {
-          const vfloat<K> tnear = inputPackets[i]->tnear();
-          const vfloat<K> tfar  = inputPackets[i]->tfar;
-          vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f);
-
-#if defined(EMBREE_IGNORE_INVALID_RAYS)
-          m_valid &= inputPackets[i]->valid();
-#endif
-
-          m_active |= (size_t)movemask(m_valid) << (i*K);
-
-          vfloat<K> packet_min_dist = max(tnear, 0.0f);
-          vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf);
-          tmp_min_dist = min(tmp_min_dist, packet_min_dist);
-          tmp_max_dist = max(tmp_max_dist, packet_max_dist);
-
-          const Vec3vf<K>& org = inputPackets[i]->org;
-          const Vec3vf<K>& dir = inputPackets[i]->dir;
-
-          new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist);
-
-          tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf)));
-          tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf)));
-          tmp_min_org  = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf)));
-          tmp_max_org  = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf)));
-        }
-
-        m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1);
-
-        
-        const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x),
-                                      reduce_min(tmp_min_rdir.y),
-                                      reduce_min(tmp_min_rdir.z));
-
-        const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x),
-                                      reduce_max(tmp_max_rdir.y),
-                                      reduce_max(tmp_max_rdir.z));
-
-        const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x),
-                                        reduce_min(tmp_min_org.y),
-                                        reduce_min(tmp_min_org.z));
-
-        const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x),
-                                        reduce_max(tmp_max_org.y),
-                                        reduce_max(tmp_max_org.z));
-
-        commonOctant =
-          (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) &&
-          (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) &&
-          (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f);
-        
-        const float frustum_min_dist = reduce_min(tmp_min_dist);
-        const float frustum_max_dist = reduce_max(tmp_max_dist);
-
-        frustum.init(reduced_min_origin, reduced_max_origin,
-                     reduced_min_rdir, reduced_max_rdir,
-                     frustum_min_dist, frustum_max_dist,
-                     N);
-        
-        return m_active;
-      }
-
-      template<int K>
-      __forceinline static size_t intersectAABBNodePacket(size_t m_active,
-                                                             const TravRayKStream<K,robust>* packets,
-                                                             const AABBNode* __restrict__ node,
-                                                             size_t boxID,
-                                                             const NearFarPrecalculations& nf)
-      {
-        assert(m_active);
-        const size_t startPacketID = bsf(m_active) / K;
-        const size_t endPacketID   = bsr(m_active) / K;
-        size_t m_trav_active = 0;
-        for (size_t i = startPacketID; i <= endPacketID; i++)
-        {
-          const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf);
-          m_trav_active |= m_hit << (i*K);
-        } 
-        return m_trav_active;
-      }
-      
-      template<int K>
-      __forceinline static size_t traverseCoherentStream(size_t m_active,
-                                                         TravRayKStream<K, robust>* packets,
-                                                         const AABBNode* __restrict__ node,
-                                                         const Frustum<robust>& frustum,
-                                                         size_t* maskK,
-                                                         vfloat<N>& dist)
-      {
-        size_t m_node_hit = intersectNodeFrustum<N>(node, frustum, dist);
-        const size_t first_index    = bsf(m_active);
-        const size_t first_packetID = first_index / K;
-        const size_t first_rayID    = first_index % K;
-        size_t m_first_hit = intersectNode1<N>(node, packets[first_packetID], first_rayID, frustum.nf);
-
-        /* this make traversal independent of the ordering of rays */
-        size_t m_node = m_node_hit ^ m_first_hit;
-        while (unlikely(m_node))
-        {
-          const size_t boxID = bscf(m_node);
-          const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf);
-          m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID);
-          maskK[boxID] = m_current;
-        }
-        return m_node_hit;
-      }
-      
-      // TODO: explicit 16-wide path for KNL
-      template<int K>
-      __forceinline static vint<N> traverseIncoherentStream(size_t m_active,
-                                                             TravRayKStreamFast<K>* __restrict__ packets,
-                                                             const AABBNode* __restrict__ node,
-                                                             const NearFarPrecalculations& nf,
-                                                             const int shiftTable[32])
-      {
-        const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
-        const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
-        const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
-        const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
-        const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
-        const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
-        assert(m_active);
-        vint<N> vmask(zero);
-        do
-        {   
-          STAT3(shadow.trav_nodes,1,1,1);
-          const size_t rayID = bscf(m_active);
-          assert(rayID < MAX_INTERNAL_STREAM_SIZE);
-          TravRayKStream<K,robust> &p = packets[rayID / K];
-          const size_t i = rayID % K;
-          const vint<N> bitmask(shiftTable[rayID]);
-
-#if defined (__aarch64__)
-          const vfloat<N> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
-          const vfloat<N> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
-          const vfloat<N> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
-          const vfloat<N> tFarX  = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
-          const vfloat<N> tFarY  = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
-          const vfloat<N> tFarZ  = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
-#else
-          const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
-          const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
-          const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
-          const vfloat<N> tFarX  = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
-          const vfloat<N> tFarY  = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
-          const vfloat<N> tFarZ  = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); 
-#endif
-
-          const vfloat<N> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
-          const vfloat<N> tFar   = mini(tFarX , tFarY , tFarZ,  vfloat<N>(p.tfar[i]));      
-
-          const vbool<N> hit_mask = tNear <= tFar;
-#if defined(__AVX2__)
-          vmask = vmask | (bitmask & vint<N>(hit_mask));
-#else
-          vmask = select(hit_mask, vmask | bitmask, vmask);
-#endif
-        } while(m_active);
-        return vmask;        
-      }
-
-      template<int K>
-      __forceinline static vint<N> traverseIncoherentStream(size_t m_active,
-                                                             TravRayKStreamRobust<K>* __restrict__ packets,
-                                                             const AABBNode* __restrict__ node,
-                                                             const NearFarPrecalculations& nf,
-                                                             const int shiftTable[32])
-      {
-        const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
-        const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
-        const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
-        const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
-        const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
-        const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
-        assert(m_active);
-        vint<N> vmask(zero);
-        do
-        {   
-          STAT3(shadow.trav_nodes,1,1,1);
-          const size_t rayID = bscf(m_active);
-          assert(rayID < MAX_INTERNAL_STREAM_SIZE);
-          TravRayKStream<K,robust> &p = packets[rayID / K];
-          const size_t i = rayID % K;
-          const vint<N> bitmask(shiftTable[rayID]);
-          const vfloat<N> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i];
-          const vfloat<N> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i];
-          const vfloat<N> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i];
-          const vfloat<N> tFarX  = (bmaxX - p.org.x[i]) * p.rdir.x[i];
-          const vfloat<N> tFarY  = (bmaxY - p.org.y[i]) * p.rdir.y[i];
-          const vfloat<N> tFarZ  = (bmaxZ - p.org.z[i]) * p.rdir.z[i];
-          const vfloat<N> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
-          const vfloat<N> tFar   = mini(tFarX , tFarY , tFarZ,  vfloat<N>(p.tfar[i]));
-          const float round_down  = 1.0f-2.0f*float(ulp);
-          const float round_up    = 1.0f+2.0f*float(ulp);
-          const vbool<N> hit_mask = round_down*tNear <= round_up*tFar;
-#if defined(__AVX2__)
-          vmask = vmask | (bitmask & vint<N>(hit_mask));
-#else
-          vmask = select(hit_mask, vmask | bitmask, vmask);
-#endif
-        } while(m_active);
-        return vmask;
-      }
-                                                         
-
-      static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth;
-
-    public:
-      static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
-      static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
-
-    private:
-      template<int K>
-      static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
-
-      template<int K>
-      static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
-
-      template<int K>
-      static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
-    };
-
-
-    /*! BVH ray stream intersector with direct fallback to packets. */
-    template<int N>
-    class BVHNIntersectorStreamPacketFallback
-    {
-    public:
-      static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
-      static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
-
-    private:
-      template<int K>
-      static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
-
-      template<int K>
-      static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
-    };
-  }
-}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp
deleted file mode 100644
index c3e5f137b8..0000000000
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2009-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "bvh_intersector_stream.cpp"
-
-namespace embree
-{
-  namespace isa
-  {
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// General BVHIntersectorStreamPacketFallback Intersector
-    ////////////////////////////////////////////////////////////////////////////////
-
-    DEFINE_INTERSECTORN(BVH4IntersectorStreamPacketFallback,BVHNIntersectorStreamPacketFallback<4>);
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// BVH4IntersectorStream Definitions
-    ////////////////////////////////////////////////////////////////////////////////
-
-    IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamMoeller,        BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4iIntersectorStreamMoeller<true>>));
-    IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4vIntersectorStreamPluecker,       BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true  COMMA Triangle4vIntersectorStreamPluecker<true>>));
-    IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamPluecker,       BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true  COMMA Triangle4iIntersectorStreamPluecker<true>>));
-    IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoeller,         BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller<true>>));
-    IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoellerNoFilter, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller<false>>));
-
-    IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoeller,        BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller<true>>));
-    IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoellerNoFilter,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller<false>>));
-    IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamMoeller,        BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4iIntersectorStreamMoeller<true>>));
-    IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamPluecker,       BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true  COMMA Quad4vIntersectorStreamPluecker<true>>));
-    IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamPluecker,       BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true  COMMA Quad4iIntersectorStreamPluecker<true>>));
-
-    IF_ENABLED_USER(DEFINE_INTERSECTORN(BVH4VirtualIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA ObjectIntersectorStream>));
-    IF_ENABLED_INSTANCE(DEFINE_INTERSECTORN(BVH4InstanceIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA InstanceIntersectorStream>));
-  }
-}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp
deleted file mode 100644
index b858eb163f..0000000000
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp
+++ /dev/null
@@ -1,657 +0,0 @@
-// Copyright 2009-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "bvh_intersector_stream_filters.h"
-#include "bvh_intersector_stream.h"
-
-namespace embree
-{
-  namespace isa
-  {
-    template<int K, bool intersect>
-    __noinline void RayStreamFilter::filterAOS(Scene* scene, void* _rayN, size_t N, size_t stride, IntersectContext* context)
-    {
-      RayStreamAOS rayN(_rayN);
-
-      /* use fast path for coherent ray mode */
-      if (unlikely(context->isCoherent()))
-      {
-        __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K];
-        __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
-
-        for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE)
-        {
-          const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE);
-
-          /* convert from AOS to SOA */
-          for (size_t j = 0; j < size; j += K)
-          {
-            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
-            const vbool<K> valid = vij < vint<K>(int(N));
-            const vint<K> offset = vij * int(stride);
-            const size_t packetIndex = j / K;
-
-            RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset);
-            ray.tnear() = select(valid, ray.tnear(), zero);
-            ray.tfar  = select(valid, ray.tfar,  neg_inf);
-
-            rays[packetIndex] = ray;
-            rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN
-          }
-
-          /* trace stream */
-          scene->intersectors.intersectN(rayPtrs, size, context);
-
-          /* convert from SOA to AOS */
-          for (size_t j = 0; j < size; j += K)
-          {
-            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
-            const vbool<K> valid = vij < vint<K>(int(N));
-            const vint<K> offset = vij * int(stride);
-            const size_t packetIndex = j / K;
-            rayN.setHitByOffset(valid, offset, rays[packetIndex]);
-          }
-        }
-      }
-      else if (unlikely(!intersect))
-      {
-        /* octant sorting for occlusion rays */
-        __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE];
-        __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K];
-        __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
-
-        unsigned int raysInOctant[8];
-        for (unsigned int i = 0; i < 8; i++)
-          raysInOctant[i] = 0;
-        size_t inputRayID = 0;
-
-        for (;;)
-        {
-          int curOctant = -1;
-
-          /* sort rays into octants */
-          for (; inputRayID < N;)
-          {
-            const Ray& ray = rayN.getRayByOffset(inputRayID * stride);
-
-            /* skip invalid rays */
-            if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays
-#if defined(EMBREE_IGNORE_INVALID_RAYS)
-            if (unlikely(!ray.valid())) { inputRayID++; continue; }
-#endif
-
-            const unsigned int octantID = movemask(vfloat4(Vec3fa(ray.dir)) < 0.0f) & 0x7;
-
-            assert(octantID < 8);
-            octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID;
-            inputRayID++;
-            if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE))
-            {
-              curOctant = octantID;
-              break;
-            }
-          }
-
-          /* need to flush rays in octant? */
-          if (unlikely(curOctant == -1))
-          {
-            for (unsigned int i = 0; i < 8; i++)
-              if (raysInOctant[i]) { curOctant = i; break; }
-          }
-
-          /* all rays traced? */
-          if (unlikely(curOctant == -1))
-            break;
-        
-          unsigned int* const rayIDs = &octants[curOctant][0];
-          const unsigned int numOctantRays = raysInOctant[curOctant];
-          assert(numOctantRays);
-
-          for (unsigned int j = 0; j < numOctantRays; j += K)
-          {
-            const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
-            const vbool<K> valid = vi < vint<K>(int(numOctantRays));
-            const vint<K> offset = *(vint<K>*)&rayIDs[j] * int(stride);
-            RayK<K>& ray = rays[j/K];
-            rayPtrs[j/K] = &ray;
-            ray = rayN.getRayByOffset<K>(valid, offset);
-            ray.tnear() = select(valid, ray.tnear(), zero);
-            ray.tfar  = select(valid, ray.tfar,  neg_inf);
-          }
-
-          scene->intersectors.occludedN(rayPtrs, numOctantRays, context);
-
-          for (unsigned int j = 0; j < numOctantRays; j += K)
-          {
-            const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
-            const vbool<K> valid = vi < vint<K>(int(numOctantRays));
-            const vint<K> offset = *(vint<K>*)&rayIDs[j] * int(stride);
-            rayN.setHitByOffset<K>(valid, offset, rays[j/K]);
-          }
-
-          raysInOctant[curOctant] = 0;
-        }
-      }
-      else
-      {
-        /* fallback to packets */
-        for (size_t i = 0; i < N; i += K)
-        {
-          const vint<K> vi = vint<K>(int(i)) + vint<K>(step);
-          vbool<K> valid = vi < vint<K>(int(N));
-          const vint<K> offset = vi * int(stride);
-
-          RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset);
-          valid &= ray.tnear() <= ray.tfar;
-
-          scene->intersectors.intersect(valid, ray, context);
-
-          rayN.setHitByOffset<K>(valid, offset, ray);
-        }
-      }
-    }
-
-    template<int K, bool intersect>
-    __noinline void RayStreamFilter::filterAOP(Scene* scene, void** _rayN, size_t N, IntersectContext* context)
-    {
-      RayStreamAOP rayN(_rayN);
-
-      /* use fast path for coherent ray mode */
-      if (unlikely(context->isCoherent()))
-      {
-        __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K];
-        __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
-
-        for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE)
-        {
-          const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE);
-
-          /* convert from AOP to SOA */
-          for (size_t j = 0; j < size; j += K)
-          {
-            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
-            const vbool<K> valid = vij < vint<K>(int(N));
-            const size_t packetIndex = j / K;
-
-            RayTypeK<K, intersect> ray = rayN.getRayByIndex<K>(valid, vij);
-            ray.tnear() = select(valid, ray.tnear(), zero);
-            ray.tfar  = select(valid, ray.tfar,  neg_inf);
-
-            rays[packetIndex] = ray;
-            rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN
-          }
-
-          /* trace stream */
-          scene->intersectors.intersectN(rayPtrs, size, context);
-
-          /* convert from SOA to AOP */
-          for (size_t j = 0; j < size; j += K)
-          {
-            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
-            const vbool<K> valid = vij < vint<K>(int(N));
-            const size_t packetIndex = j / K;
-
-            rayN.setHitByIndex<K>(valid, vij, rays[packetIndex]);
-          }
-        }
-      }
-      else if (unlikely(!intersect))
-      {
-        /* octant sorting for occlusion rays */
-        __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE];
-        __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K];
-        __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
-
-        unsigned int raysInOctant[8];
-        for (unsigned int i = 0; i < 8; i++)
-          raysInOctant[i] = 0;
-        size_t inputRayID = 0;
-
-        for (;;)
-        {
-          int curOctant = -1;
-
-          /* sort rays into octants */
-          for (; inputRayID < N;)
-          {
-            const Ray& ray = rayN.getRayByIndex(inputRayID);
-
-            /* skip invalid rays */
-            if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays
-#if defined(EMBREE_IGNORE_INVALID_RAYS)
-            if (unlikely(!ray.valid())) { inputRayID++; continue; }
-#endif
-
-            const unsigned int octantID = movemask(lt_mask(ray.dir,Vec3fa(0.0f)));
-
-            assert(octantID < 8);
-            octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID;
-            inputRayID++;
-            if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE))
-            {
-              curOctant = octantID;
-              break;
-            }
-          }
-
-          /* need to flush rays in octant? */
-          if (unlikely(curOctant == -1))
-          {
-            for (unsigned int i = 0; i < 8; i++)
-              if (raysInOctant[i]) { curOctant = i; break; }
-          }
-
-          /* all rays traced? */
-          if (unlikely(curOctant == -1))
-            break;
-
-          unsigned int* const rayIDs = &octants[curOctant][0];
-          const unsigned int numOctantRays = raysInOctant[curOctant];
-          assert(numOctantRays);
-
-          for (unsigned int j = 0; j < numOctantRays; j += K)
-          {
-            const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
-            const vbool<K> valid = vi < vint<K>(int(numOctantRays));
-            const vint<K> index = *(vint<K>*)&rayIDs[j];
-            RayK<K>& ray = rays[j/K];
-            rayPtrs[j/K] = &ray;
-            ray = rayN.getRayByIndex<K>(valid, index);
-            ray.tnear() = select(valid, ray.tnear(), zero);
-            ray.tfar  = select(valid, ray.tfar,  neg_inf);
-          }
-
-          scene->intersectors.occludedN(rayPtrs, numOctantRays, context);
-
-          for (unsigned int j = 0; j < numOctantRays; j += K)
-          {
-            const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
-            const vbool<K> valid = vi < vint<K>(int(numOctantRays));
-            const vint<K> index = *(vint<K>*)&rayIDs[j];
-            rayN.setHitByIndex<K>(valid, index, rays[j/K]);
-          }
-
-          raysInOctant[curOctant] = 0;
-        }
-      }
-      else
-      {
-        /* fallback to packets */
-        for (size_t i = 0; i < N; i += K)
-        {
-          const vint<K> vi = vint<K>(int(i)) + vint<K>(step);
-          vbool<K> valid = vi < vint<K>(int(N));
-
-          RayTypeK<K, intersect> ray = rayN.getRayByIndex<K>(valid, vi);
-          valid &= ray.tnear() <= ray.tfar;
-
-          scene->intersectors.intersect(valid, ray, context);
-
-          rayN.setHitByIndex<K>(valid, vi, ray);
-        }
-      }
-    }
-
-    template<int K, bool intersect>
-    __noinline void RayStreamFilter::filterSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context)
-    {
-      const size_t rayDataAlignment = (size_t)rayData % (K*sizeof(float));
-      const size_t offsetAlignment  = (size_t)stride  % (K*sizeof(float));
-
-      /* fast path for packets with the correct width and data alignment */
-      if (likely(N == K &&
-                 !rayDataAlignment &&
-                 !offsetAlignment))
-      {
-        if (unlikely(context->isCoherent()))
-        {
-          __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
-
-          size_t packetIndex = 0;
-          for (size_t i = 0; i < numPackets; i++)
-          {
-            const size_t offset = i * stride;
-            RayTypeK<K, intersect>& ray = *(RayTypeK<K, intersect>*)(rayData + offset);
-            rayPtrs[packetIndex++] = &ray;
-
-            /* trace as stream */
-            if (unlikely(packetIndex == MAX_INTERNAL_STREAM_SIZE / K))
-            {
-              const size_t size = packetIndex*K;
-              scene->intersectors.intersectN(rayPtrs, size, context);
-              packetIndex = 0;
-            }
-          }
-
-          /* flush remaining packets */
-          if (unlikely(packetIndex > 0))
-          {
-            const size_t size = packetIndex*K;
-            scene->intersectors.intersectN(rayPtrs, size, context);
-          }
-        }
-        else if (unlikely(!intersect))
-        {
-          /* octant sorting for occlusion rays */
-          RayStreamSOA rayN(rayData, K);
-
-          __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE];
-          __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K];
-          __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
-
-          unsigned int raysInOctant[8];
-          for (unsigned int i = 0; i < 8; i++)
-            raysInOctant[i] = 0;
-          size_t inputRayID = 0;
-
-          for (;;)
-          {
-            int curOctant = -1;
-
-            /* sort rays into octants */
-            for (; inputRayID < N*numPackets;)
-            {
-              const size_t offset = (inputRayID / K) * stride + (inputRayID % K) * sizeof(float);
-
-              /* skip invalid rays */
-              if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays
-  #if defined(EMBREE_IGNORE_INVALID_RAYS)
-              __aligned(64) Ray ray = rayN.getRayByOffset(offset);
-              if (unlikely(!ray.valid())) { inputRayID++; continue; }
-  #endif
-
-              const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset);
-
-              assert(octantID < 8);
-              octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset;
-              inputRayID++;
-              if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE))
-              {
-                curOctant = octantID;
-                break;
-              }
-            }
-
-            /* need to flush rays in octant? */
-            if (unlikely(curOctant == -1))
-            {
-              for (unsigned int i = 0; i < 8; i++)
-                if (raysInOctant[i]) { curOctant = i; break; }
-            }
-
-            /* all rays traced? */
-            if (unlikely(curOctant == -1))
-              break;
-
-            unsigned int* const rayOffsets = &octants[curOctant][0];
-            const unsigned int numOctantRays = raysInOctant[curOctant];
-            assert(numOctantRays);
-
-            for (unsigned int j = 0; j < numOctantRays; j += K)
-            {
-              const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
-              const vbool<K> valid = vi < vint<K>(int(numOctantRays));
-              const vint<K> offset = *(vint<K>*)&rayOffsets[j];
-              RayK<K>& ray = rays[j/K];
-              rayPtrs[j/K] = &ray;
-              ray = rayN.getRayByOffset<K>(valid, offset);
-              ray.tnear() = select(valid, ray.tnear(), zero);
-              ray.tfar  = select(valid, ray.tfar,  neg_inf);
-            }
-
-            scene->intersectors.occludedN(rayPtrs, numOctantRays, context);
-
-            for (unsigned int j = 0; j < numOctantRays; j += K)
-            {
-              const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
-              const vbool<K> valid = vi < vint<K>(int(numOctantRays));
-              const vint<K> offset = *(vint<K>*)&rayOffsets[j];
-              rayN.setHitByOffset(valid, offset, rays[j/K]);
-            }
-            raysInOctant[curOctant] = 0;
-          }
-        }
-        else
-        {
-          /* fallback to packets */
-          for (size_t i = 0; i < numPackets; i++)
-          {
-            const size_t offset = i * stride;
-            RayTypeK<K, intersect>& ray = *(RayTypeK<K, intersect>*)(rayData + offset);
-            const vbool<K> valid = ray.tnear() <= ray.tfar;
-
-            scene->intersectors.intersect(valid, ray, context);
-          }
-        }
-      }
-      else
-      {
-        /* fallback to packets for arbitrary packet size and alignment */
-        for (size_t i = 0; i < numPackets; i++)
-        {
-          const size_t offsetN = i * stride;
-          RayStreamSOA rayN(rayData + offsetN, N);
-
-          for (size_t j = 0; j < N; j += K)
-          {
-            const size_t offset = j * sizeof(float);
-            vbool<K> valid = (vint<K>(int(j)) + vint<K>(step)) < vint<K>(int(N));
-            RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset);
-            valid &= ray.tnear() <= ray.tfar;
-
-            scene->intersectors.intersect(valid, ray, context);
-
-            rayN.setHitByOffset(valid, offset, ray);
-          }
-        }
-      }
-    }
-
-    template<int K, bool intersect>
-    __noinline void RayStreamFilter::filterSOP(Scene* scene, const void* _rayN, size_t N, IntersectContext* context)
-    { 
-      RayStreamSOP& rayN = *(RayStreamSOP*)_rayN;
-
-      /* use fast path for coherent ray mode */
-      if (unlikely(context->isCoherent()))
-      {
-        __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K];
-        __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
-
-        for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE)
-        {
-          const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE);
-
-          /* convert from SOP to SOA */
-          for (size_t j = 0; j < size; j += K)
-          {
-            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
-            const vbool<K> valid = vij < vint<K>(int(N));
-            const size_t offset = (i+j) * sizeof(float);
-            const size_t packetIndex = j / K;
-
-            RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset);
-            ray.tnear() = select(valid, ray.tnear(), zero);
-            ray.tfar  = select(valid, ray.tfar,  neg_inf);
-
-            rays[packetIndex] = ray;
-            rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN
-          }
-
-          /* trace stream */
-          scene->intersectors.intersectN(rayPtrs, size, context);
-
-          /* convert from SOA to SOP */
-          for (size_t j = 0; j < size; j += K)
-          {
-            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
-            const vbool<K> valid = vij < vint<K>(int(N));
-            const size_t offset = (i+j) * sizeof(float);
-            const size_t packetIndex = j / K;
-
-            rayN.setHitByOffset(valid, offset, rays[packetIndex]);
-          }
-        }
-      }
-      else if (unlikely(!intersect))
-      {
-        /* octant sorting for occlusion rays */
-        __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE];
-        __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K];
-        __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
-
-        unsigned int raysInOctant[8];
-        for (unsigned int i = 0; i < 8; i++)
-          raysInOctant[i] = 0;
-        size_t inputRayID = 0;
-
-        for (;;)
-        {
-          int curOctant = -1;
-
-          /* sort rays into octants */
-          for (; inputRayID < N;)
-          {
-            const size_t offset = inputRayID * sizeof(float);
-            /* skip invalid rays */
-            if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays
-#if defined(EMBREE_IGNORE_INVALID_RAYS)
-            __aligned(64) Ray ray = rayN.getRayByOffset(offset);
-            if (unlikely(!ray.valid())) { inputRayID++; continue; }
-#endif
-
-            const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset);
-
-            assert(octantID < 8);
-            octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset;
-            inputRayID++;
-            if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE))
-            {
-              curOctant = octantID;
-              break;
-            }
-          }
-
-          /* need to flush rays in octant? */
-          if (unlikely(curOctant == -1))
-          {
-            for (unsigned int i = 0; i < 8; i++)
-              if (raysInOctant[i]) { curOctant = i; break; }
-          }
-
-          /* all rays traced? */
-          if (unlikely(curOctant == -1))
-            break;
-
-          unsigned int* const rayOffsets = &octants[curOctant][0];
-          const unsigned int numOctantRays = raysInOctant[curOctant];
-          assert(numOctantRays);
-
-          for (unsigned int j = 0; j < numOctantRays; j += K)
-          {
-            const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
-            const vbool<K> valid = vi < vint<K>(int(numOctantRays));
-            const vint<K> offset = *(vint<K>*)&rayOffsets[j];
-            RayK<K>& ray = rays[j/K];
-            rayPtrs[j/K] = &ray;
-            ray = rayN.getRayByOffset<K>(valid, offset);
-            ray.tnear() = select(valid, ray.tnear(), zero);
-            ray.tfar  = select(valid, ray.tfar,  neg_inf);
-          }
-
-          scene->intersectors.occludedN(rayPtrs, numOctantRays, context);
-
-          for (unsigned int j = 0; j < numOctantRays; j += K)
-          {
-            const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
-            const vbool<K> valid = vi < vint<K>(int(numOctantRays));
-            const vint<K> offset = *(vint<K>*)&rayOffsets[j];
-            rayN.setHitByOffset(valid, offset, rays[j/K]);
-          }
-
-          raysInOctant[curOctant] = 0;
-        }
-      }
-      else
-      {
-        /* fallback to packets */
-        for (size_t i = 0; i < N; i += K)
-        {
-          const vint<K> vi = vint<K>(int(i)) + vint<K>(step);
-          vbool<K> valid = vi < vint<K>(int(N));
-          const size_t offset = i * sizeof(float);
-
-          RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset);
-          valid &= ray.tnear() <= ray.tfar;
-
-          scene->intersectors.intersect(valid, ray, context);
-
-          rayN.setHitByOffset(valid, offset, ray);
-        }
-      }
-    }
-
-
-    void RayStreamFilter::intersectAOS(Scene* scene, RTCRayHit* _rayN, size_t N, size_t stride, IntersectContext* context) {
-      if (unlikely(context->isCoherent()))
-        filterAOS<VSIZEL, true>(scene, _rayN, N, stride, context);
-      else
-        filterAOS<VSIZEX, true>(scene, _rayN, N, stride, context);
-    }
-
-    void RayStreamFilter::occludedAOS(Scene* scene, RTCRay* _rayN, size_t N, size_t stride, IntersectContext* context) {
-      if (unlikely(context->isCoherent()))
-        filterAOS<VSIZEL, false>(scene, _rayN, N, stride, context);
-      else
-        filterAOS<VSIZEX, false>(scene, _rayN, N, stride, context);
-    }
-
-    void RayStreamFilter::intersectAOP(Scene* scene, RTCRayHit** _rayN, size_t N, IntersectContext* context) {
-      if (unlikely(context->isCoherent()))
-        filterAOP<VSIZEL, true>(scene, (void**)_rayN, N, context);
-      else
-        filterAOP<VSIZEX, true>(scene, (void**)_rayN, N, context);
-    }
-
-    void RayStreamFilter::occludedAOP(Scene* scene, RTCRay** _rayN, size_t N, IntersectContext* context) {
-      if (unlikely(context->isCoherent()))
-        filterAOP<VSIZEL, false>(scene, (void**)_rayN, N, context);
-      else
-        filterAOP<VSIZEX, false>(scene, (void**)_rayN, N, context);
-    }
-
-    void RayStreamFilter::intersectSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) {
-      if (unlikely(context->isCoherent()))
-        filterSOA<VSIZEL, true>(scene, rayData, N, numPackets, stride, context);
-      else
-        filterSOA<VSIZEX, true>(scene, rayData, N, numPackets, stride, context);
-    }
-
-    void RayStreamFilter::occludedSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) {
-      if (unlikely(context->isCoherent()))
-        filterSOA<VSIZEL, false>(scene, rayData, N, numPackets, stride, context);
-      else
-        filterSOA<VSIZEX, false>(scene, rayData, N, numPackets, stride, context);
-    }
-
-    void RayStreamFilter::intersectSOP(Scene* scene, const RTCRayHitNp* _rayN, size_t N, IntersectContext* context) {
-      if (unlikely(context->isCoherent()))
-        filterSOP<VSIZEL, true>(scene, _rayN, N, context);
-      else
-        filterSOP<VSIZEX, true>(scene, _rayN, N, context);
-    }
-
-    void RayStreamFilter::occludedSOP(Scene* scene, const RTCRayNp* _rayN, size_t N, IntersectContext* context) {
-      if (unlikely(context->isCoherent()))
-        filterSOP<VSIZEL, false>(scene, _rayN, N, context);
-      else
-        filterSOP<VSIZEX, false>(scene, _rayN, N, context);
-    }
-
-
-    RayStreamFilterFuncs rayStreamFilterFuncs() {
-      return RayStreamFilterFuncs(RayStreamFilter::intersectAOS, RayStreamFilter::intersectAOP, RayStreamFilter::intersectSOA, RayStreamFilter::intersectSOP,
-                                  RayStreamFilter::occludedAOS,  RayStreamFilter::occludedAOP,  RayStreamFilter::occludedSOA,  RayStreamFilter::occludedSOP);
-    }
-  };
-};
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h
deleted file mode 100644
index e7df7c2ae2..0000000000
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2009-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../common/default.h"
-#include "../common/ray.h"
-#include "../common/scene.h"
-
-namespace embree
-{
-  namespace isa
-  {
-    class RayStreamFilter
-    {
-    public:
-      static void intersectAOS(Scene* scene, RTCRayHit* rays, size_t N, size_t stride, IntersectContext* context);
-      static void intersectAOP(Scene* scene, RTCRayHit** rays, size_t N, IntersectContext* context);
-      static void intersectSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
-      static void intersectSOP(Scene* scene, const RTCRayHitNp* rays, size_t N, IntersectContext* context);
-
-      static void occludedAOS(Scene* scene, RTCRay* rays, size_t N, size_t stride, IntersectContext* context);
-      static void occludedAOP(Scene* scene, RTCRay** rays, size_t N, IntersectContext* context);
-      static void occludedSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
-      static void occludedSOP(Scene* scene, const RTCRayNp* rays, size_t N, IntersectContext* context);
-
-    private:
-      template<int K, bool intersect>
-      static void filterAOS(Scene* scene, void* rays, size_t N, size_t stride, IntersectContext* context);
-
-      template<int K, bool intersect>
-      static void filterAOP(Scene* scene, void** rays, size_t N, IntersectContext* context);
-
-      template<int K, bool intersect>
-      static void filterSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
-
-      template<int K, bool intersect>
-      static void filterSOP(Scene* scene, const void* rays, size_t N, IntersectContext* context);
-    };
-  }
-};
diff --git a/thirdparty/embree/kernels/bvh/bvh_refit.cpp b/thirdparty/embree/kernels/bvh/bvh_refit.cpp
index bf5c8538ba..9f8554112b 100644
--- a/thirdparty/embree/kernels/bvh/bvh_refit.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_refit.cpp
@@ -11,6 +11,9 @@
 #include "../geometry/quadv.h"
 #include "../geometry/object.h"
 #include "../geometry/instance.h"
+#include "../geometry/instance_array.h"
+
+#include "../../common/algorithms/parallel_for.h"
 
 namespace embree
 {
@@ -82,7 +85,7 @@ namespace embree
     template<int N>
     BBox3fa BVHNRefitter<N>::refit_toplevel(NodeRef& ref,
                                             size_t &subtrees,
-											const BBox3fa *const subTreeBounds,
+                                            const BBox3fa *const subTreeBounds,
                                             const size_t depth)
     {
       if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) 
@@ -236,12 +239,20 @@ namespace embree
 #if defined(EMBREE_GEOMETRY_INSTANCE)
     Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
     Builder* BVH4InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,Instance,InstancePrimitive>((BVH4*)accel,BVH4InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); }
-
 #if  defined(__AVX__)
     Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
     Builder* BVH8InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,Instance,InstancePrimitive>((BVH8*)accel,BVH8InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); }
 #endif
 #endif
 
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+    Builder* BVH4InstanceArrayMeshBuilderSAH (void* bvh, InstanceArray* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
+    Builder* BVH4InstanceArrayMeshRefitSAH (void* accel, InstanceArray* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,InstanceArray,InstanceArrayPrimitive>((BVH4*)accel,BVH4InstanceArrayMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); }
+
+#if  defined(__AVX__)
+    Builder* BVH8InstanceArrayMeshBuilderSAH (void* bvh, InstanceArray* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
+    Builder* BVH8InstanceArrayMeshRefitSAH (void* accel, InstanceArray* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,InstanceArray,InstanceArrayPrimitive>((BVH8*)accel,BVH8InstanceArrayMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); }
+#endif
+#endif
   }
 }
diff --git a/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h b/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h
deleted file mode 100644
index 852981e69d..0000000000
--- a/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2009-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "bvh.h"
-#include "../common/ray.h"
-#include "../common/stack_item.h"
-
-namespace embree
-{
-  namespace isa
-  {
-    template<int N, int types>
-    class BVHNNodeTraverserStreamHitCoherent
-    {
-      typedef BVHN<N> BVH;
-      typedef typename BVH::NodeRef NodeRef;
-      typedef typename BVH::BaseNode BaseNode;
-
-    public:
-      template<class T>
-      static __forceinline void traverseClosestHit(NodeRef& cur,
-                                                   size_t& m_trav_active,
-                                                   const vbool<N>& vmask,
-                                                   const vfloat<N>& tNear,
-                                                   const T* const tMask,
-                                                   StackItemMaskCoherent*& stackPtr)
-      {
-        const NodeRef parent = cur;
-        size_t mask = movemask(vmask);
-        assert(mask != 0);
-        const BaseNode* node = cur.baseNode();
-
-        /*! one child is hit, continue with that child */
-        const size_t r0 = bscf(mask);
-        assert(r0 < 8);
-        cur = node->child(r0);
-        BVHN<N>::prefetch(cur,types);
-        m_trav_active = tMask[r0];
-        assert(cur != BVH::emptyNode);
-        if (unlikely(mask == 0)) return;
-
-        const unsigned int* const tNear_i = (unsigned int*)&tNear;
-
-        /*! two children are hit, push far child, and continue with closer child */
-        NodeRef c0 = cur;
-        unsigned int d0 = tNear_i[r0];
-        const size_t r1 = bscf(mask);
-        assert(r1 < 8);
-        NodeRef c1 = node->child(r1);
-        BVHN<N>::prefetch(c1,types);
-        unsigned int d1 = tNear_i[r1];
-
-        assert(c0 != BVH::emptyNode);
-        assert(c1 != BVH::emptyNode);
-        if (likely(mask == 0)) {
-          if (d0 < d1) {
-            assert(tNear[r1] >= 0.0f);
-            stackPtr->mask    = tMask[r1];
-            stackPtr->parent  = parent;
-            stackPtr->child   = c1;
-            stackPtr++;
-            cur = c0;
-            m_trav_active = tMask[r0];
-            return;
-          }
-          else {
-            assert(tNear[r0] >= 0.0f);
-            stackPtr->mask    = tMask[r0];
-            stackPtr->parent  = parent;
-            stackPtr->child   = c0;
-            stackPtr++;
-            cur = c1;
-            m_trav_active = tMask[r1];
-            return;
-          }
-        }
-
-        /*! slow path for more than two hits */
-        size_t hits = movemask(vmask);
-        const vint<N> dist_i = select(vmask, (asInt(tNear) & 0xfffffff8) | vint<N>(step), 0);
-        const vint<N> dist_i_sorted = usort_descending(dist_i);
-        const vint<N> sorted_index = dist_i_sorted & 7;
-
-        size_t i = 0;
-        for (;;)
-        {
-          const unsigned int index = sorted_index[i];
-          assert(index < 8);
-          cur = node->child(index);
-          m_trav_active = tMask[index];
-          assert(m_trav_active);
-          BVHN<N>::prefetch(cur,types);
-          bscf(hits);
-          if (unlikely(hits==0)) break;
-          i++;
-          assert(cur != BVH::emptyNode);
-          assert(tNear[index] >= 0.0f);
-          stackPtr->mask    = m_trav_active;
-          stackPtr->parent  = parent;
-          stackPtr->child   = cur;
-          stackPtr++;
-        }
-      }
-
-      template<class T>
-      static __forceinline void traverseAnyHit(NodeRef& cur,
-                                               size_t& m_trav_active,
-                                               const vbool<N>& vmask,
-                                               const T* const tMask,
-                                               StackItemMaskCoherent*& stackPtr)
-      {
-        const NodeRef parent = cur;
-        size_t mask = movemask(vmask);
-        assert(mask != 0);
-        const BaseNode* node = cur.baseNode();
-
-        /*! one child is hit, continue with that child */
-        size_t r = bscf(mask);
-        cur = node->child(r);
-        BVHN<N>::prefetch(cur,types);
-        m_trav_active = tMask[r];
-
-        /* simple in order sequence */
-        assert(cur != BVH::emptyNode);
-        if (likely(mask == 0)) return;
-        stackPtr->mask    = m_trav_active;
-        stackPtr->parent  = parent;
-        stackPtr->child   = cur;
-        stackPtr++;
-
-        for (; ;)
-        {
-          r = bscf(mask);
-          cur = node->child(r);
-          BVHN<N>::prefetch(cur,types);
-          m_trav_active = tMask[r];
-          assert(cur != BVH::emptyNode);
-          if (likely(mask == 0)) return;
-          stackPtr->mask    = m_trav_active;
-          stackPtr->parent  = parent;
-          stackPtr->child   = cur;
-          stackPtr++;
-        }
-      }
-    };
-  }
-}
diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
deleted file mode 100644
index 943fd7043f..0000000000
--- a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
+++ /dev/null
@@ -1,215 +0,0 @@
-// Copyright 2009-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "node_intersector.h"
-
-namespace embree
-{
-  namespace isa
-  {
-    //////////////////////////////////////////////////////////////////////////////////////
-    // Ray packet structure used in stream traversal
-    //////////////////////////////////////////////////////////////////////////////////////
-
-    template<int K, bool robust>
-    struct TravRayKStream;
-
-    /* Fast variant */
-    template<int K>
-    struct TravRayKStream<K, false>
-    {
-      __forceinline TravRayKStream() {}
-
-      __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
-      {
-        init(ray_org, ray_dir);
-        tnear = ray_tnear;
-        tfar = ray_tfar;
-      }
-
-      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
-      {
-        rdir = rcp_safe(ray_dir);
-#if defined(__aarch64__)
-        neg_org_rdir = -(ray_org * rdir);
-#else
-        org_rdir = ray_org * rdir;
-#endif
-      }
-
-      Vec3vf<K> rdir;
-#if defined(__aarch64__)
-      Vec3vf<K> neg_org_rdir;
-#else
-      Vec3vf<K> org_rdir;
-#endif
-      vfloat<K> tnear;
-      vfloat<K> tfar;
-    };
-
-    template<int K>
-    using TravRayKStreamFast = TravRayKStream<K, false>;
-
-    /* Robust variant */
-    template<int K>
-    struct TravRayKStream<K, true>
-    {
-      __forceinline TravRayKStream() {}
-
-      __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
-      {
-        init(ray_org, ray_dir);
-        tnear = ray_tnear;
-        tfar = ray_tfar;
-      }
-
-      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
-      {
-        rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir));
-        org = ray_org;
-      }
-
-      Vec3vf<K> rdir;
-      Vec3vf<K> org;
-      vfloat<K> tnear;
-      vfloat<K> tfar;
-    };
-
-    template<int K>
-    using TravRayKStreamRobust = TravRayKStream<K, true>;
-
-    //////////////////////////////////////////////////////////////////////////////////////
-    // Fast AABBNode intersection
-    //////////////////////////////////////////////////////////////////////////////////////
-
-    template<int N, int K>
-    __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
-                                        const TravRayKStreamFast<K>& ray, size_t k, const NearFarPrecalculations& nf)
-    {
-      const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
-      const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
-      const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
-      const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
-      const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
-      const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
-
-#if defined (__aarch64__)
-      const vfloat<N> rminX = madd(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k]));
-      const vfloat<N> rminY = madd(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k]));
-      const vfloat<N> rminZ = madd(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k]));
-      const vfloat<N> rmaxX = madd(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k]));
-      const vfloat<N> rmaxY = madd(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k]));
-      const vfloat<N> rmaxZ = madd(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k]));
-#else
-      const vfloat<N> rminX = msub(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k]));
-      const vfloat<N> rminY = msub(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k]));
-      const vfloat<N> rminZ = msub(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k]));
-      const vfloat<N> rmaxX = msub(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k]));
-      const vfloat<N> rmaxY = msub(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k]));
-      const vfloat<N> rmaxZ = msub(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k]));
-#endif
-      const vfloat<N> rmin  = maxi(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k]));
-      const vfloat<N> rmax  = mini(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k]));
-
-      const vbool<N> vmask_first_hit = rmin <= rmax;
-
-      return movemask(vmask_first_hit) & (((size_t)1 << N)-1);
-    }
-
-    template<int N, int K>
-    __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
-                                        const TravRayKStreamFast<K>& ray, const NearFarPrecalculations& nf)
-    {
-      char* ptr = (char*)&node->lower_x + i*sizeof(float);
-      const vfloat<K> bminX = *(const float*)(ptr + nf.nearX);
-      const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
-      const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
-      const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
-      const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
-      const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
-
-#if defined (__aarch64__)
-      const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x);
-      const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y);
-      const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z);
-      const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x);
-      const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y);
-      const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z);
-#else
-      const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
-      const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
-      const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
-      const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
-      const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
-      const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
-#endif
-
-      const vfloat<K> rmin  = maxi(rminX, rminY, rminZ, ray.tnear);
-      const vfloat<K> rmax  = mini(rmaxX, rmaxY, rmaxZ, ray.tfar);
-
-      const vbool<K> vmask_first_hit = rmin <= rmax;
-
-      return movemask(vmask_first_hit);
-    }
-
-    //////////////////////////////////////////////////////////////////////////////////////
-    // Robust AABBNode intersection
-    //////////////////////////////////////////////////////////////////////////////////////
-
-    template<int N, int K>
-    __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
-                                        const TravRayKStreamRobust<K>& ray, size_t k, const NearFarPrecalculations& nf)
-    {
-      const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
-      const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
-      const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
-      const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
-      const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
-      const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
-
-      const vfloat<N> rminX = (bminX - vfloat<N>(ray.org.x[k])) * vfloat<N>(ray.rdir.x[k]);
-      const vfloat<N> rminY = (bminY - vfloat<N>(ray.org.y[k])) * vfloat<N>(ray.rdir.y[k]);
-      const vfloat<N> rminZ = (bminZ - vfloat<N>(ray.org.z[k])) * vfloat<N>(ray.rdir.z[k]);
-      const vfloat<N> rmaxX = (bmaxX - vfloat<N>(ray.org.x[k])) * vfloat<N>(ray.rdir.x[k]);
-      const vfloat<N> rmaxY = (bmaxY - vfloat<N>(ray.org.y[k])) * vfloat<N>(ray.rdir.y[k]);
-      const vfloat<N> rmaxZ = (bmaxZ - vfloat<N>(ray.org.z[k])) * vfloat<N>(ray.rdir.z[k]);
-      const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512
-      const vfloat<N> rmin  =            max(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k]));
-      const vfloat<N> rmax  = round_up  *min(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k]));
-
-      const vbool<N> vmask_first_hit = rmin <= rmax;
-
-      return movemask(vmask_first_hit) & (((size_t)1 << N)-1);
-    }
-
-    template<int N, int K>
-    __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
-                                        const TravRayKStreamRobust<K>& ray, const NearFarPrecalculations& nf)
-    {
-      char *ptr = (char*)&node->lower_x + i*sizeof(float);
-      const vfloat<K> bminX = *(const float*)(ptr + nf.nearX);
-      const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
-      const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
-      const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
-      const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
-      const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
-
-      const vfloat<K> rminX = (bminX - ray.org.x) * ray.rdir.x;
-      const vfloat<K> rminY = (bminY - ray.org.y) * ray.rdir.y;
-      const vfloat<K> rminZ = (bminZ - ray.org.z) * ray.rdir.z;
-      const vfloat<K> rmaxX = (bmaxX - ray.org.x) * ray.rdir.x;
-      const vfloat<K> rmaxY = (bmaxY - ray.org.y) * ray.rdir.y;
-      const vfloat<K> rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z;
-
-      const float round_up  = 1.0f+3.0f*float(ulp);
-      const vfloat<K> rmin  =            max(rminX, rminY, rminZ, vfloat<K>(ray.tnear));
-      const vfloat<K> rmax  = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat<K>(ray.tfar));
-
-      const vbool<K> vmask_first_hit = rmin <= rmax;
-
-      return movemask(vmask_first_hit);
-    }
-  }
-}
diff --git a/thirdparty/embree/kernels/common/accel.h b/thirdparty/embree/kernels/common/accel.h
index d24326ce92..7d959377ae 100644
--- a/thirdparty/embree/kernels/common/accel.h
+++ b/thirdparty/embree/kernels/common/accel.h
@@ -17,7 +17,7 @@ namespace embree
   {
     ALIGNED_CLASS_(16);
   public:
-    enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4 };
+    enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4, TY_GPU = 5 };
 
   public:
     AccelData (const Type type) 
@@ -73,61 +73,49 @@ namespace embree
     /*! Type of intersect function pointer for single rays. */
     typedef void (*IntersectFunc)(Intersectors* This,  /*!< this pointer to accel */
                                   RTCRayHit& ray,      /*!< ray to intersect */
-                                  IntersectContext* context);
+                                  RayQueryContext* context);
     
     /*! Type of intersect function pointer for ray packets of size 4. */
     typedef void (*IntersectFunc4)(const void* valid,  /*!< pointer to valid mask */
                                    Intersectors* This, /*!< this pointer to accel */
                                    RTCRayHit4& ray,    /*!< ray packet to intersect */
-                                   IntersectContext* context);
+                                   RayQueryContext* context);
     
     /*! Type of intersect function pointer for ray packets of size 8. */
     typedef void (*IntersectFunc8)(const void* valid,  /*!< pointer to valid mask */
                                    Intersectors* This, /*!< this pointer to accel */
                                    RTCRayHit8& ray,    /*!< ray packet to intersect */
-                                   IntersectContext* context);
+                                   RayQueryContext* context);
     
     /*! Type of intersect function pointer for ray packets of size 16. */
     typedef void (*IntersectFunc16)(const void* valid,  /*!< pointer to valid mask */
                                     Intersectors* This, /*!< this pointer to accel */
                                     RTCRayHit16& ray,   /*!< ray packet to intersect */
-                                    IntersectContext* context);
+                                    RayQueryContext* context);
 
-    /*! Type of intersect function pointer for ray packets of size N. */
-    typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */
-                                   RTCRayHitN** ray,   /*!< ray stream to intersect */
-                                   const size_t N,     /*!< number of rays in stream */
-                                   IntersectContext* context /*!< layout flags */);
-    
-    
     /*! Type of occlusion function pointer for single rays. */
     typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */
                                   RTCRay& ray,        /*!< ray to test occlusion */
-                                  IntersectContext* context);
+                                  RayQueryContext* context);
     
     /*! Type of occlusion function pointer for ray packets of size 4. */
     typedef void (*OccludedFunc4) (const void* valid,  /*!< pointer to valid mask */
                                    Intersectors* This, /*!< this pointer to accel */
                                    RTCRay4& ray,       /*!< ray packet to test occlusion. */
-                                   IntersectContext* context);
+                                   RayQueryContext* context);
     
     /*! Type of occlusion function pointer for ray packets of size 8. */
     typedef void (*OccludedFunc8) (const void* valid,  /*!< pointer to valid mask */
                                    Intersectors* This, /*!< this pointer to accel */
                                    RTCRay8& ray,       /*!< ray packet to test occlusion. */
-                                   IntersectContext* context);
+                                   RayQueryContext* context);
     
     /*! Type of occlusion function pointer for ray packets of size 16. */
     typedef void (*OccludedFunc16) (const void* valid,  /*!< pointer to valid mask */
                                     Intersectors* This, /*!< this pointer to accel */
                                     RTCRay16& ray,      /*!< ray packet to test occlusion. */
-                                    IntersectContext* context);
+                                    RayQueryContext* context);
 
-    /*! Type of intersect function pointer for ray packets of size N. */
-    typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */
-                                  RTCRayN** ray,      /*!< ray stream to test occlusion */
-                                  const size_t N,     /*!< number of rays in stream */
-                                  IntersectContext* context /*!< layout flags */);
     typedef void (*ErrorFunc) ();
 
     struct Collider
@@ -217,30 +205,13 @@ namespace embree
       const char* name;
     };
 
-    struct IntersectorN 
-    {
-      IntersectorN (ErrorFunc error = nullptr)
-      : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {}
-
-      IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name)
-      : intersect(intersect), occluded(occluded), name(name) {}
-
-      operator bool() const { return name; }
-      
-    public:
-      static const char* type;
-      IntersectFuncN intersect;
-      OccludedFuncN occluded;
-      const char* name;
-    };
-   
     struct Intersectors 
     {
       Intersectors() 
-      : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {}
+      : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr) {}
 
       Intersectors (ErrorFunc error) 
-      : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {}
+      : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error) {}
 
       void print(size_t ident) 
       {
@@ -264,10 +235,6 @@ namespace embree
           for (size_t i=0; i<ident; i++) std::cout << " ";
           std::cout << "intersector16 = " << intersector16.name << std::endl;
         }
-        if (intersectorN.name) {
-          for (size_t i=0; i<ident; i++) std::cout << " ";
-          std::cout << "intersectorN = " << intersectorN.name << std::endl;
-        }        
       }
 
       void select(bool filter)
@@ -284,10 +251,6 @@ namespace embree
           if (filter) intersector16 = intersector16_filter;
           else         intersector16 = intersector16_nofilter;
         }
-        if (intersectorN_filter) {
-          if (filter) intersectorN = intersectorN_filter;
-          else        intersectorN = intersectorN_nofilter;
-        }        
       }
 
       __forceinline bool pointQuery (PointQuery* query, PointQueryContext* context) {
@@ -302,133 +265,138 @@ namespace embree
       }
 
       /*! Intersects a single ray with the scene. */
-      __forceinline void intersect (RTCRayHit& ray, IntersectContext* context) {
+      __forceinline void intersect (RTCRayHit& ray, RayQueryContext* context) {
         assert(intersector1.intersect);
         intersector1.intersect(this,ray,context);
       }
 
       /*! Intersects a packet of 4 rays with the scene. */
-      __forceinline void intersect4 (const void* valid, RTCRayHit4& ray, IntersectContext* context) {
+      __forceinline void intersect4 (const void* valid, RTCRayHit4& ray, RayQueryContext* context) {
         assert(intersector4.intersect);
         intersector4.intersect(valid,this,ray,context);
       }
       
       /*! Intersects a packet of 8 rays with the scene. */
-      __forceinline void intersect8 (const void* valid, RTCRayHit8& ray, IntersectContext* context) {
+      __forceinline void intersect8 (const void* valid, RTCRayHit8& ray, RayQueryContext* context) {
         assert(intersector8.intersect);
         intersector8.intersect(valid,this,ray,context);
       }
       
       /*! Intersects a packet of 16 rays with the scene. */
-      __forceinline void intersect16 (const void* valid, RTCRayHit16& ray, IntersectContext* context) {
+      __forceinline void intersect16 (const void* valid, RTCRayHit16& ray, RayQueryContext* context) {
         assert(intersector16.intersect);
         intersector16.intersect(valid,this,ray,context);
       }
+
+      /*! Intersects a packet of 4 rays with the scene. */
+      __forceinline void intersect (const void* valid, RTCRayHit4& ray, RayQueryContext* context) {
+        assert(intersector4.intersect);
+        intersector4.intersect(valid,this,ray,context);
+      }
       
-      /*! Intersects a stream of N rays in SOA layout with the scene. */
-      __forceinline void intersectN (RTCRayHitN** rayN, const size_t N, IntersectContext* context)
-      {
-        assert(intersectorN.intersect);
-        intersectorN.intersect(this,rayN,N,context);
+      /*! Intersects a packet of 8 rays with the scene. */
+      __forceinline void intersect (const void* valid, RTCRayHit8& ray, RayQueryContext* context) {
+        assert(intersector8.intersect);
+        intersector8.intersect(valid,this,ray,context);
+      }
+      
+      /*! Intersects a packet of 16 rays with the scene. */
+      __forceinline void intersect (const void* valid, RTCRayHit16& ray, RayQueryContext* context) {
+        assert(intersector16.intersect);
+        intersector16.intersect(valid,this,ray,context);
       }
       
 #if defined(__SSE__) || defined(__ARM_NEON)
-      __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
+      __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, RayQueryContext* context) {
         const vint<4> mask = valid.mask32();
         intersect4(&mask,(RTCRayHit4&)ray,context);
       }
 #endif
 #if defined(__AVX__)
-      __forceinline void intersect(const vbool8& valid, RayHitK<8>& ray, IntersectContext* context) {
+      __forceinline void intersect(const vbool8& valid, RayHitK<8>& ray, RayQueryContext* context) {
         const vint<8> mask = valid.mask32();
         intersect8(&mask,(RTCRayHit8&)ray,context);
       }
 #endif
 #if defined(__AVX512F__)
-      __forceinline void intersect(const vbool16& valid, RayHitK<16>& ray, IntersectContext* context) {
+      __forceinline void intersect(const vbool16& valid, RayHitK<16>& ray, RayQueryContext* context) {
         const vint<16> mask = valid.mask32();
         intersect16(&mask,(RTCRayHit16&)ray,context);
       }
 #endif
       
-      template<int K>
-      __forceinline void intersectN (RayHitK<K>** rayN, const size_t N, IntersectContext* context)
-      {
-        intersectN((RTCRayHitN**)rayN,N,context);
-      }
-
       /*! Tests if single ray is occluded by the scene. */
-      __forceinline void occluded (RTCRay& ray, IntersectContext* context) {
+      __forceinline void occluded (RTCRay& ray, RayQueryContext* context) {
         assert(intersector1.occluded);
         intersector1.occluded(this,ray,context);
       }
       
       /*! Tests if a packet of 4 rays is occluded by the scene. */
-      __forceinline void occluded4 (const void* valid, RTCRay4& ray, IntersectContext* context) {
+      __forceinline void occluded4 (const void* valid, RTCRay4& ray, RayQueryContext* context) {
         assert(intersector4.occluded);
         intersector4.occluded(valid,this,ray,context);
       }
       
       /*! Tests if a packet of 8 rays is occluded by the scene. */
-      __forceinline void occluded8 (const void* valid, RTCRay8& ray, IntersectContext* context) {
+      __forceinline void occluded8 (const void* valid, RTCRay8& ray, RayQueryContext* context) {
         assert(intersector8.occluded);
         intersector8.occluded(valid,this,ray,context);
       }
       
       /*! Tests if a packet of 16 rays is occluded by the scene. */
-      __forceinline void occluded16 (const void* valid, RTCRay16& ray, IntersectContext* context) {
+      __forceinline void occluded16 (const void* valid, RTCRay16& ray, RayQueryContext* context) {
         assert(intersector16.occluded);
         intersector16.occluded(valid,this,ray,context);
       }
+
+      /*! Tests if a packet of 4 rays is occluded by the scene (same dispatch as occluded4, exposed under the overloaded name occluded). */
+      __forceinline void occluded (const void* valid, RTCRay4& ray, RayQueryContext* context) {
+        assert(intersector4.occluded);
+        intersector4.occluded(valid,this,ray,context);
+      }
       
-      /*! Tests if a stream of N rays in SOA layout is occluded by the scene. */
-      __forceinline void occludedN (RTCRayN** rayN, const size_t N, IntersectContext* context)
-      {
-        assert(intersectorN.occluded);
-        intersectorN.occluded(this,rayN,N,context);
+      /*! Tests if a packet of 8 rays is occluded by the scene (same dispatch as occluded8, exposed under the overloaded name occluded). */
+      __forceinline void occluded (const void* valid, RTCRay8& ray, RayQueryContext* context) {
+        assert(intersector8.occluded);
+        intersector8.occluded(valid,this,ray,context);
+      }
+      
+      /*! Tests if a packet of 16 rays is occluded by the scene (same dispatch as occluded16, exposed under the overloaded name occluded). */
+      __forceinline void occluded (const void* valid, RTCRay16& ray, RayQueryContext* context) {
+        assert(intersector16.occluded);
+        intersector16.occluded(valid,this,ray,context);
       }
       
 #if defined(__SSE__) || defined(__ARM_NEON)
-      __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
+      __forceinline void occluded(const vbool4& valid, RayK<4>& ray, RayQueryContext* context) {
         const vint<4> mask = valid.mask32();
         occluded4(&mask,(RTCRay4&)ray,context);
       }
 #endif
 #if defined(__AVX__)
-      __forceinline void occluded(const vbool8& valid, RayK<8>& ray, IntersectContext* context) {
+      __forceinline void occluded(const vbool8& valid, RayK<8>& ray, RayQueryContext* context) {
         const vint<8> mask = valid.mask32();
         occluded8(&mask,(RTCRay8&)ray,context);
       }
 #endif
 #if defined(__AVX512F__)
-      __forceinline void occluded(const vbool16& valid, RayK<16>& ray, IntersectContext* context) {
+      __forceinline void occluded(const vbool16& valid, RayK<16>& ray, RayQueryContext* context) {
         const vint<16> mask = valid.mask32();
         occluded16(&mask,(RTCRay16&)ray,context);
       }
 #endif
 
-      template<int K>
-      __forceinline void occludedN (RayK<K>** rayN, const size_t N, IntersectContext* context)
-      {
-        occludedN((RTCRayN**)rayN,N,context);
-      }
-
       /*! Tests if single ray is occluded by the scene. */
-      __forceinline void intersect(RTCRay& ray, IntersectContext* context) {
+      __forceinline void intersect(RTCRay& ray, RayQueryContext* context) {
         occluded(ray, context);
       }
 
       /*! Tests if a packet of K rays is occluded by the scene. */
       template<int K>
-      __forceinline void intersect(const vbool<K>& valid, RayK<K>& ray, IntersectContext* context) {
+      __forceinline void intersect(const vbool<K>& valid, RayK<K>& ray, RayQueryContext* context) {
         occluded(valid, ray, context);
       }
 
-      /*! Tests if a packet of N rays in SOA layout is occluded by the scene. */
-      template<int K>
-      __forceinline void intersectN(RayK<K>** rayN, const size_t N, IntersectContext* context) {
-        occludedN(rayN, N, context);
-      }
       
     public:
       AccelData* ptr;
@@ -444,9 +412,6 @@ namespace embree
       Intersector16 intersector16;
       Intersector16 intersector16_filter;
       Intersector16 intersector16_nofilter;
-      IntersectorN intersectorN;
-      IntersectorN intersectorN_filter;
-      IntersectorN intersectorN_nofilter;      
     };
   
   public:
@@ -506,51 +471,4 @@ namespace embree
                                 (Accel::OccludedFunc16)intersector::occluded,   \
                                 TOSTRING(isa) "::" TOSTRING(symbol));           \
   }
-
-#define DEFINE_INTERSECTORN(symbol,intersector)                               \
-  Accel::IntersectorN symbol() {                                              \
-    return Accel::IntersectorN((Accel::IntersectFuncN)intersector::intersect, \
-                               (Accel::OccludedFuncN)intersector::occluded,   \
-                               TOSTRING(isa) "::" TOSTRING(symbol));          \
-  }
-
-  /* ray stream filter interface */
-  typedef void (*intersectStreamAOS_func)(Scene* scene, RTCRayHit*  _rayN, const size_t N, const size_t stride, IntersectContext* context);
-  typedef void (*intersectStreamAOP_func)(Scene* scene, RTCRayHit** _rayN, const size_t N, IntersectContext* context);
-  typedef void (*intersectStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
-  typedef void (*intersectStreamSOP_func)(Scene* scene, const RTCRayHitNp* rayN, const size_t N, IntersectContext* context);
-
-  typedef void (*occludedStreamAOS_func)(Scene* scene, RTCRay*  _rayN, const size_t N, const size_t stride, IntersectContext* context);
-  typedef void (*occludedStreamAOP_func)(Scene* scene, RTCRay** _rayN, const size_t N, IntersectContext* context);
-  typedef void (*occludedStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
-  typedef void (*occludedStreamSOP_func)(Scene* scene, const RTCRayNp* rayN, const size_t N, IntersectContext* context);
-
-  struct RayStreamFilterFuncs
-  {
-    RayStreamFilterFuncs()
-    : intersectAOS(nullptr), intersectAOP(nullptr), intersectSOA(nullptr), intersectSOP(nullptr),
-      occludedAOS(nullptr),  occludedAOP(nullptr),  occludedSOA(nullptr),  occludedSOP(nullptr) {}
-
-    RayStreamFilterFuncs(void (*ptr) ())
-    : intersectAOS((intersectStreamAOS_func) ptr), intersectAOP((intersectStreamAOP_func) ptr), intersectSOA((intersectStreamSOA_func) ptr), intersectSOP((intersectStreamSOP_func) ptr),
-      occludedAOS((occludedStreamAOS_func) ptr),   occludedAOP((occludedStreamAOP_func) ptr),   occludedSOA((occludedStreamSOA_func) ptr),   occludedSOP((occludedStreamSOP_func) ptr) {}
-
-    RayStreamFilterFuncs(intersectStreamAOS_func intersectAOS, intersectStreamAOP_func intersectAOP, intersectStreamSOA_func intersectSOA, intersectStreamSOP_func intersectSOP,
-                         occludedStreamAOS_func  occludedAOS,  occludedStreamAOP_func  occludedAOP,  occludedStreamSOA_func  occludedSOA,  occludedStreamSOP_func  occludedSOP)
-    : intersectAOS(intersectAOS), intersectAOP(intersectAOP), intersectSOA(intersectSOA), intersectSOP(intersectSOP),
-      occludedAOS(occludedAOS),   occludedAOP(occludedAOP),   occludedSOA(occludedSOA),   occludedSOP(occludedSOP) {}
-
-  public:
-    intersectStreamAOS_func intersectAOS;
-    intersectStreamAOP_func intersectAOP;
-    intersectStreamSOA_func intersectSOA;
-    intersectStreamSOP_func intersectSOP;
-
-    occludedStreamAOS_func occludedAOS;
-    occludedStreamAOP_func occludedAOP;
-    occludedStreamSOA_func occludedSOA;
-    occludedStreamSOP_func occludedSOP;
-  }; 
-
-  typedef RayStreamFilterFuncs (*RayStreamFilterFuncsType)();
 }
diff --git a/thirdparty/embree/kernels/common/acceln.cpp b/thirdparty/embree/kernels/common/acceln.cpp
index 111c62083d..9edb684db7 100644
--- a/thirdparty/embree/kernels/common/acceln.cpp
+++ b/thirdparty/embree/kernels/common/acceln.cpp
@@ -3,7 +3,7 @@
 
 #include "acceln.h"
 #include "ray.h"
-#include "../../include/embree3/rtcore_ray.h"
+#include "../../include/embree4/rtcore_ray.h"
 #include "../../common/algorithms/parallel_for.h"
 
 namespace embree
@@ -41,7 +41,7 @@ namespace embree
     return changed;
   }
 
-  void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context) 
+  void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, RayQueryContext* context) 
   {
     AccelN* This = (AccelN*)This_in->ptr;
     for (size_t i=0; i<This->accels.size(); i++)
@@ -49,7 +49,7 @@ namespace embree
         This->accels[i]->intersectors.intersect(ray,context);
   }
 
-  void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context) 
+  void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, RayQueryContext* context) 
   {
     AccelN* This = (AccelN*)This_in->ptr;
     for (size_t i=0; i<This->accels.size(); i++)
@@ -57,7 +57,7 @@ namespace embree
         This->accels[i]->intersectors.intersect4(valid,ray,context);
   }
 
-  void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context) 
+  void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, RayQueryContext* context) 
   {
     AccelN* This = (AccelN*)This_in->ptr;
     for (size_t i=0; i<This->accels.size(); i++)
@@ -65,7 +65,7 @@ namespace embree
         This->accels[i]->intersectors.intersect8(valid,ray,context);
   }
 
-  void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context) 
+  void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, RayQueryContext* context) 
   {
     AccelN* This = (AccelN*)This_in->ptr;
     for (size_t i=0; i<This->accels.size(); i++)
@@ -73,15 +73,7 @@ namespace embree
         This->accels[i]->intersectors.intersect16(valid,ray,context);
   }
 
-  void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context)
-  {
-    AccelN* This = (AccelN*)This_in->ptr;
-    for (size_t i=0; i<This->accels.size(); i++)
-      if (!This->accels[i]->isEmpty())
-        This->accels[i]->intersectors.intersectN(ray,N,context);
-  }
-
-  void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context) 
+  void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, RayQueryContext* context) 
   {
     AccelN* This = (AccelN*)This_in->ptr;
     for (size_t i=0; i<This->accels.size(); i++) {
@@ -91,7 +83,7 @@ namespace embree
     }
   }
 
-  void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context) 
+  void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, RayQueryContext* context) 
   {
     AccelN* This = (AccelN*)This_in->ptr;
     for (size_t i=0; i<This->accels.size(); i++) {
@@ -105,7 +97,7 @@ namespace embree
     }
   }
 
-  void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context) 
+  void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, RayQueryContext* context) 
   {
     AccelN* This = (AccelN*)This_in->ptr;
     for (size_t i=0; i<This->accels.size(); i++) {
@@ -121,7 +113,7 @@ namespace embree
     }
   }
 
-  void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context) 
+  void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, RayQueryContext* context) 
   {
     AccelN* This = (AccelN*)This_in->ptr;
     for (size_t i=0; i<This->accels.size(); i++) {
@@ -141,15 +133,6 @@ namespace embree
     }
   }
 
-  void AccelN::occludedN (Accel::Intersectors* This_in, RTCRayN** ray, const size_t N, IntersectContext* context)
-  {
-    AccelN* This = (AccelN*)This_in->ptr;
-    size_t M = N;
-    for (size_t i=0; i<This->accels.size(); i++)
-      if (!This->accels[i]->isEmpty())
-        This->accels[i]->intersectors.occludedN(ray,M,context);
-  }
-
   void AccelN::accels_print(size_t ident)
   {
     for (size_t i=0; i<accels.size(); i++)
@@ -201,7 +184,6 @@ namespace embree
       intersectors.intersector4  = Intersector4(&intersect4,&occluded4,valid4 ? "AccelN::intersector4" : nullptr);
       intersectors.intersector8  = Intersector8(&intersect8,&occluded8,valid8 ? "AccelN::intersector8" : nullptr);
       intersectors.intersector16 = Intersector16(&intersect16,&occluded16,valid16 ? "AccelN::intersector16": nullptr);
-      intersectors.intersectorN  = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN");
 
       /*! calculate bounds */
       bounds = empty;
diff --git a/thirdparty/embree/kernels/common/acceln.h b/thirdparty/embree/kernels/common/acceln.h
index 0445b2e811..cc3406826c 100644
--- a/thirdparty/embree/kernels/common/acceln.h
+++ b/thirdparty/embree/kernels/common/acceln.h
@@ -22,18 +22,16 @@ namespace embree
     static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context);
 
   public:
-    static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context);
-    static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context);
-    static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context);
-    static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context);
-    static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context);
+    static void intersect (Accel::Intersectors* This, RTCRayHit& ray, RayQueryContext* context);
+    static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, RayQueryContext* context);
+    static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, RayQueryContext* context);
+    static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, RayQueryContext* context);
 
   public:
-    static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context);
-    static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context);
-    static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context);
-    static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context);
-    static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context);
+    static void occluded (Accel::Intersectors* This, RTCRay& ray, RayQueryContext* context);
+    static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, RayQueryContext* context);
+    static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, RayQueryContext* context);
+    static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, RayQueryContext* context);
 
   public:
     void accels_print(size_t ident);
diff --git a/thirdparty/embree/kernels/common/accelset.h b/thirdparty/embree/kernels/common/accelset.h
index 1b67120c97..f78830e397 100644
--- a/thirdparty/embree/kernels/common/accelset.h
+++ b/thirdparty/embree/kernels/common/accelset.h
@@ -17,11 +17,15 @@ namespace embree
   struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments
   {
     Geometry* geometry;
+    RTCScene forward_scene;
+    RTCIntersectArguments* args;
   };
 
   struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments
   {
     Geometry* geometry;
+    RTCScene forward_scene;
+    RTCIntersectArguments* args;
   };
 
   /*! Base class for set of acceleration structures. */
@@ -138,10 +142,9 @@ namespace embree
   public:
 
       /*! Intersects a single ray with the scene. */
-      __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) 
+      __forceinline bool intersect (RayHit& ray, unsigned int geomID, unsigned int primID, RayQueryContext* context) 
       {
         assert(primID < size());
-        assert(intersectorN.intersect);
         
         int mask = -1;
         IntersectFunctionNArguments args;
@@ -153,17 +156,94 @@ namespace embree
         args.geomID = geomID;
         args.primID = primID;
         args.geometry = this;
+        args.forward_scene = nullptr;
+        args.args = context->args;
+
+        IntersectFuncN intersectFunc = nullptr;
+        intersectFunc = intersectorN.intersect;
         
-        intersectorN.intersect(&args);
+        if (context->getIntersectFunction())
+          intersectFunc = context->getIntersectFunction();
+
+        assert(intersectFunc);
+        intersectFunc(&args);
+
+        return mask != 0;
       }
 
       /*! Tests if single ray is occluded by the scene. */
-      __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context)
+      __forceinline bool occluded (Ray& ray, unsigned int geomID, unsigned int primID, RayQueryContext* context)
       {
         assert(primID < size());
-        assert(intersectorN.occluded);
+
+        int mask = -1; // single-lane valid mask, handed to the callback through args.valid
+        OccludedFunctionNArguments args;
+        args.valid = &mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.ray = (RTCRayN*)&ray;
+        args.N = 1;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.geometry = this;
+        args.forward_scene = nullptr; // not read back afterwards: this overload does not report scene forwarding
+        args.args = context->args;
+
+        OccludedFuncN occludedFunc = nullptr;
+        occludedFunc = intersectorN.occluded; // default: callback stored in this object's intersectorN
+
+        if (context->getOccludedFunction()) // a callback provided by the query context overrides the default
+          occludedFunc = context->getOccludedFunction();
+
+        assert(occludedFunc);
+        occludedFunc(&args);
+
+        return mask != 0; // false only if the valid mask is zero after the callback ran
+      }
+
+      /*! Intersects a single ray with primitive primID through the user callback; returns whether the valid mask is still non-zero and stores args.forward_scene (nullptr unless the callback set it) in forward_scene. */
+    __forceinline bool intersect (RayHit& ray, unsigned int geomID, unsigned int primID, RayQueryContext* context, RTCScene& forward_scene) 
+    {
+        assert(primID < size());
         
         int mask = -1;
+        IntersectFunctionNArguments args;
+        args.valid = &mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.rayhit = (RTCRayHitN*)&ray;
+        args.N = 1;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.geometry = this;
+        args.forward_scene = nullptr;
+        args.args = nullptr; // context->args is not handed to the callback in this overload
+
+        typedef void (*RTCIntersectFunctionSYCL)(const void* args);
+        RTCIntersectFunctionSYCL intersectFunc = nullptr;
+        
+#if EMBREE_SYCL_GEOMETRY_CALLBACK
+        if (context->args->feature_mask & RTC_FEATURE_FLAG_USER_GEOMETRY_CALLBACK_IN_GEOMETRY)
+          intersectFunc = (RTCIntersectFunctionSYCL) intersectorN.intersect; // callback stored in intersectorN
+#endif
+        
+        if (context->args->feature_mask & RTC_FEATURE_FLAG_USER_GEOMETRY_CALLBACK_IN_ARGUMENTS)
+          if (context->getIntersectFunction())
+            intersectFunc = (RTCIntersectFunctionSYCL) context->getIntersectFunction(); // assigned last, so it replaces the intersectorN callback when both apply
+
+        if (intersectFunc) // may still be null if no callback was selected above; the call is then skipped
+          intersectFunc(&args);
+        
+        forward_scene = args.forward_scene; // pass the callback's forwarding request (if any) back to the caller
+        return mask != 0;
+      }
+
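+      /*! Tests if a single ray is occluded by the scene; returns whether the valid mask is still non-zero and reports args.forward_scene through forward_scene. */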
+    __forceinline bool occluded (Ray& ray, unsigned int geomID, unsigned int primID, RayQueryContext* context, RTCScene& forward_scene)
+      {
+        assert(primID < size());
+
+        int mask = -1;
         OccludedFunctionNArguments args;
         args.valid = &mask;
         args.geometryUserPtr = userPtr;
@@ -173,16 +253,33 @@ namespace embree
         args.geomID = geomID;
         args.primID = primID;
         args.geometry = this;
+        args.forward_scene = nullptr;
+        args.args = nullptr;
+
+        typedef void (*RTCOccludedFunctionSYCL)(const void* args);
+        RTCOccludedFunctionSYCL occludedFunc = nullptr;
+
+#if EMBREE_SYCL_GEOMETRY_CALLBACK
+        if (context->args->feature_mask & RTC_FEATURE_FLAG_USER_GEOMETRY_CALLBACK_IN_GEOMETRY)
+          occludedFunc = (RTCOccludedFunctionSYCL) intersectorN.occluded;
+#endif
+        
+        if (context->args->feature_mask & RTC_FEATURE_FLAG_USER_GEOMETRY_CALLBACK_IN_ARGUMENTS)
+          if (context->getOccludedFunction())
+            occludedFunc = (RTCOccludedFunctionSYCL) context->getOccludedFunction();
+
+        if (occludedFunc)
+          occludedFunc(&args);
         
-        intersectorN.occluded(&args);
+        forward_scene = args.forward_scene;
+        return mask != 0;
       }
-   
+
       /*! Intersects a packet of K rays with the scene. */
       template<int K>
-        __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) 
+        __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, RayQueryContext* context) 
       {
         assert(primID < size());
-        assert(intersectorN.intersect);
         
         vint<K> mask = valid.mask32();
         IntersectFunctionNArguments args;
@@ -194,16 +291,24 @@ namespace embree
         args.geomID = geomID;
         args.primID = primID;
         args.geometry = this;
-         
-        intersectorN.intersect(&args);
+        args.forward_scene = nullptr;
+        args.args = context->args;
+
+        IntersectFuncN intersectFunc = nullptr;
+        intersectFunc = intersectorN.intersect;
+        
+        if (context->getIntersectFunction())
+          intersectFunc = context->getIntersectFunction();
+
+        assert(intersectFunc);
+        intersectFunc(&args);
       }
 
       /*! Tests if a packet of K rays is occluded by the scene. */
       template<int K>
-        __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context)
+        __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, RayQueryContext* context)
       {
         assert(primID < size());
-        assert(intersectorN.occluded);
         
         vint<K> mask = valid.mask32();
         OccludedFunctionNArguments args;
@@ -215,8 +320,17 @@ namespace embree
         args.geomID = geomID;
         args.primID = primID;
         args.geometry = this;
+        args.forward_scene = nullptr;
+        args.args = context->args;
+
+        OccludedFuncN occludedFunc = nullptr;
+        occludedFunc = intersectorN.occluded;
         
-        intersectorN.occluded(&args);
+        if (context->getOccludedFunction())
+          occludedFunc = context->getOccludedFunction();
+
+        assert(occludedFunc);
+        occludedFunc(&args);
       }
 
     public:
diff --git a/thirdparty/embree/kernels/common/alloc.cpp b/thirdparty/embree/kernels/common/alloc.cpp
index 38a76225f4..cc2f9976f2 100644
--- a/thirdparty/embree/kernels/common/alloc.cpp
+++ b/thirdparty/embree/kernels/common/alloc.cpp
@@ -10,7 +10,7 @@
 namespace embree
 {
   __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr;
-  SpinLock FastAllocator::s_thread_local_allocators_lock;
+  MutexSys FastAllocator::s_thread_local_allocators_lock;
   std::vector<std::unique_ptr<FastAllocator::ThreadLocal2>> FastAllocator::s_thread_local_allocators;
    
   struct fast_allocator_regression_test : public RegressionTest
diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h
index 12769df2c8..840d48c327 100644
--- a/thirdparty/embree/kernels/common/alloc.h
+++ b/thirdparty/embree/kernels/common/alloc.h
@@ -6,11 +6,9 @@
 #include "default.h"
 #include "device.h"
 #include "scene.h"
-#include "primref.h"
+#include "../builders/primref.h"
 
-#if defined(APPLE) && defined(__aarch64__)
-#include <mutex>
-#endif
+#include "../../common/tasking/taskscheduler.h"
 
 namespace embree
 {
@@ -18,7 +16,7 @@ namespace embree
   {
     /*! maximum supported alignment */
     static const size_t maxAlignment = 64;
-
+    
     /*! maximum allocation size */
 
     /* default settings */
@@ -39,14 +37,14 @@ namespace embree
     public:
 
       /*! Constructor for usage with ThreadLocalData */
-      __forceinline ThreadLocal (ThreadLocal2* parent) 
-	: parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}
+      __forceinline ThreadLocal (ThreadLocal2* parent)
+        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}
 
       /*! initialize allocator */
-      void init(FastAllocator* alloc) 
+      void init(FastAllocator* alloc)
       {
         ptr = nullptr;
-	cur = end = 0;
+        cur = end = 0;
         bytesUsed = 0;
         bytesWasted = 0;
         allocBlockSize = 0;
@@ -54,64 +52,62 @@ namespace embree
       }
 
       /* Allocate aligned memory from the threads memory block. */
-      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) 
+      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
       {
         /* bind the thread local allocator to the proper FastAllocator*/
         parent->bind(alloc);
 
         assert(align <= maxAlignment);
-	bytesUsed += bytes;
+        bytesUsed += bytes;
 
         /* try to allocate in local block */
-	size_t ofs = (align - cur) & (align-1);
+        size_t ofs = (align - cur) & (align-1);
         cur += bytes + ofs;
         if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
-	cur -= bytes + ofs;
-        
+        cur -= bytes + ofs;
+
         /* if allocation is too large allocate with parent allocator */
         if (4*bytes > allocBlockSize) {
           return alloc->malloc(bytes,maxAlignment,false);
-	}
+        }
 
         /* get new partial block if allocation failed */
         size_t blockSize = allocBlockSize;
         ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
- 	bytesWasted += end-cur;
-	cur = 0; end = blockSize;
+        bytesWasted += end-cur;
+        cur = 0; end = blockSize;
 
         /* retry allocation */
-	ofs = (align - cur) & (align-1);
+        ofs = (align - cur) & (align-1);
         cur += bytes + ofs;
         if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
-	cur -= bytes + ofs;
+        cur -= bytes + ofs;
 
         /* get new full block if allocation failed */
         blockSize = allocBlockSize;
         ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
-	bytesWasted += end-cur;
-	cur = 0; end = blockSize;
+        bytesWasted += end-cur;
+        cur = 0; end = blockSize;
 
         /* retry allocation */
-	ofs = (align - cur) & (align-1);
+        ofs = (align - cur) & (align-1);
         cur += bytes + ofs;
         if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
-	cur -= bytes + ofs;
+        cur -= bytes + ofs;
 
         /* should never happen as large allocations get handled specially above */
         assert(false);
         return nullptr;
       }
 
-      
-      /*! returns amount of used bytes */
       __forceinline size_t getUsedBytes() const { return bytesUsed; }
-  
+
       /*! returns amount of free bytes */
       __forceinline size_t getFreeBytes() const { return end-cur; }
-      
+
       /*! returns amount of wasted bytes */
       __forceinline size_t getWastedBytes() const { return bytesWasted; }
-  
+
     private:
       ThreadLocal2* parent;
       char*  ptr;            //!< pointer to memory block
@@ -136,11 +132,7 @@ namespace embree
       {
         assert(alloc_i);
         if (alloc.load() == alloc_i) return;
-#if defined(APPLE) && defined(__aarch64__)
-        std::scoped_lock lock(mutex);
-#else
-        Lock<SpinLock> lock(mutex);
-#endif
+        Lock<MutexSys> lock(mutex);
         //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
         if (alloc.load()) {
           alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
@@ -158,11 +150,7 @@ namespace embree
       {
         assert(alloc_i);
         if (alloc.load() != alloc_i) return;
-#if defined(APPLE) && defined(__aarch64__)
-        std::scoped_lock lock(mutex);
-#else
-        Lock<SpinLock> lock(mutex);
-#endif
+        Lock<MutexSys> lock(mutex);
         if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
         alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
         alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
@@ -173,26 +161,47 @@ namespace embree
       }
 
     public:
-#if defined(APPLE) && defined(__aarch64__)
-      std::mutex mutex;
-#else
-      SpinLock mutex;        //!< required as unbind is called from other threads
-#endif
+      MutexSys mutex;
       std::atomic<FastAllocator*> alloc;  //!< parent allocator
       ThreadLocal alloc0;
       ThreadLocal alloc1;
     };
 
-    FastAllocator (Device* device, bool osAllocation) 
-      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
-        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
-        primrefarray(device,0)
+    FastAllocator (Device* device,
+                   bool osAllocation, // selects EMBREE_OS_MALLOC instead of ALIGNED_MALLOC for atype
+                   bool useUSM = false, // stored in the useUSM member; must not be combined with osAllocation (checked below)
+                   bool blockAllocation = true) // stored in the blockAllocation member
+      : device(device)
+      , slotMask(0)
+      , defaultBlockSize(PAGE_SIZE)
+      , estimatedSize(0)
+      , growSize(PAGE_SIZE)
+      , maxGrowSize(maxAllocationSize)
+      , usedBlocks(nullptr)
+      , freeBlocks(nullptr)
+      , useUSM(useUSM)
+      , blockAllocation(blockAllocation)
+      , use_single_mode(false)
+      , log2_grow_size_scale(0)
+      , bytesUsed(0)
+      , bytesFree(0)
+      , bytesWasted(0)
+      , atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC)
+      , primrefarray(device,0)
     {
+      // -- GODOT start --
+      // if (osAllocation && useUSM)
+      //   throw std::runtime_error("USM allocation cannot be combined with OS allocation.");
+      if (osAllocation && useUSM) {
+        abort(); // NOTE(review): stands in for the throw commented out above, presumably because exceptions are unavailable in this build; confirm
+      }
+      // -- GODOT end --
+
       for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
       {
         threadUsedBlocks[i] = nullptr;
         threadBlocks[i] = nullptr;
-        assert(!slotMutex[i].isLocked());
+        //assert(!slotMutex[i].isLocked());
       }
     }
 
@@ -233,11 +242,7 @@ namespace embree
       ThreadLocal2* alloc = thread_local_allocator2;
       if (alloc == nullptr) {
         thread_local_allocator2 = alloc = new ThreadLocal2;
-#if defined(APPLE) && defined(__aarch64__)
-        std::scoped_lock lock(s_thread_local_allocators_lock);
-#else
-        Lock<SpinLock> lock(s_thread_local_allocators_lock);
-#endif
+        Lock<MutexSys> lock(s_thread_local_allocators_lock);
         s_thread_local_allocators.push_back(make_unique(alloc));
       }
       return alloc;
@@ -247,11 +252,7 @@ namespace embree
 
     __forceinline void join(ThreadLocal2* alloc)
     {
-#if defined(APPLE) && defined(__aarch64__)
-      std::scoped_lock lock(s_thread_local_allocators_lock);
-#else
-      Lock<SpinLock> lock(thread_local_allocators_lock);
-#endif
+      Lock<MutexSys> lock(s_thread_local_allocators_lock); // NOTE(review): the removed non-Apple path locked thread_local_allocators_lock; this now takes the s_-prefixed lock while pushing to thread_local_allocators; confirm that is intended
       thread_local_allocators.push_back(alloc);
     }
 
@@ -412,7 +413,7 @@ namespace embree
       slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
       if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
       if (bytesReserve == 0) bytesReserve = bytesAllocate;
-      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
+      freeBlocks = Block::create(device,useUSM,bytesAllocate,bytesReserve,nullptr,atype);
       estimatedSize = bytesEstimate;
       initGrowSizeAndNumSlots(bytesEstimate,true);
     }
@@ -478,8 +479,8 @@ namespace embree
       bytesUsed.store(0);
       bytesFree.store(0);
       bytesWasted.store(0);
-      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr;
-      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr;
+      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device,useUSM); usedBlocks = nullptr;
+      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device,useUSM); freeBlocks = nullptr;
       for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
         threadUsedBlocks[i] = nullptr;
         threadBlocks[i] = nullptr;
@@ -503,9 +504,16 @@ namespace embree
         /* allocate using current block */
         size_t threadID = TaskScheduler::threadID();
         size_t slot = threadID & slotMask;
-	Block* myUsedBlocks = threadUsedBlocks[slot];
+        Block* myUsedBlocks = threadUsedBlocks[slot];
         if (myUsedBlocks) {
           void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
+          // -- GODOT start --
+          // if (ptr == nullptr && !blockAllocation)
+          //   throw std::bad_alloc();
+          if (ptr == nullptr && !blockAllocation) {
+            abort();
+          }
+          // -- GODOT end --
           if (ptr) return ptr;
         }
 
@@ -516,16 +524,12 @@ namespace embree
         /* parallel block creation in case of no freeBlocks, avoids single global mutex */
         if (likely(freeBlocks.load() == nullptr))
         {
-#if defined(APPLE) && defined(__aarch64__)
-          std::scoped_lock lock(slotMutex[slot]);
-#else
-          Lock<SpinLock> lock(slotMutex[slot]);
-#endif
+          Lock<MutexSys> lock(slotMutex[slot]);
           if (myUsedBlocks == threadUsedBlocks[slot]) {
             const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
             const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
             assert(allocSize >= bytes);
-            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
+            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
             // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
           }
           continue;
@@ -533,24 +537,20 @@ namespace embree
 
         /* if this fails allocate new block */
         {
-#if defined(APPLE) && defined(__aarch64__)
-            std::scoped_lock lock(mutex);
-#else
-            Lock<SpinLock> lock(mutex);
-#endif
-	  if (myUsedBlocks == threadUsedBlocks[slot])
-	  {
+          Lock<MutexSys> lock(mutex);
+          if (myUsedBlocks == threadUsedBlocks[slot])
+          {
             if (freeBlocks.load() != nullptr) {
-	      Block* nextFreeBlock = freeBlocks.load()->next;
-	      freeBlocks.load()->next = usedBlocks;
-	      __memory_barrier();
-	      usedBlocks = freeBlocks.load();
+              Block* nextFreeBlock = freeBlocks.load()->next;
+              freeBlocks.load()->next = usedBlocks;
+              __memory_barrier();
+              usedBlocks = freeBlocks.load();
               threadUsedBlocks[slot] = freeBlocks.load();
-	      freeBlocks = nextFreeBlock;
-	    } else {
+              freeBlocks = nextFreeBlock;
+            } else {
               const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
-	      usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
-	    }
+              usedBlocks = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
+            }
           }
         }
       }
@@ -559,11 +559,7 @@ namespace embree
     /*! add new block */
     void addBlock(void* ptr, ssize_t bytes)
     {
-#if defined(APPLE) && defined(__aarch64__)
-      std::scoped_lock lock(mutex);
-#else
-      Lock<SpinLock> lock(mutex);
-#endif
+      Lock<MutexSys> lock(mutex);
       const size_t sizeof_Header = offsetof(Block,data[0]);
       void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
       size_t ofs = (size_t) aptr - (size_t) ptr;
@@ -723,7 +719,12 @@ namespace embree
 
     void print_blocks()
     {
-      std::cout << "  estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;
+      std::cout << "  estimatedSize = " << estimatedSize
+                << ", slotMask = " << slotMask
+                << ", use_single_mode = " << use_single_mode
+                << ", maxGrowSize = " << maxGrowSize
+                << ", defaultBlockSize = " << defaultBlockSize
+                << std::endl;
 
       std::cout << "  used blocks = ";
       if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
@@ -738,7 +739,19 @@ namespace embree
 
     struct Block
     {
-      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
+      __forceinline static void* blockAlignedMalloc(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesAlignment)
+      {
+        if (useUSM) return device->malloc(bytesAllocate, bytesAlignment);
+	else        return alignedMalloc (bytesAllocate, bytesAlignment);
+      }
+
+      __forceinline static void blockAlignedFree(Device* device, bool useUSM, void* ptr)
+      {
+        if (useUSM) return device->free(ptr);
+	else        return alignedFree(ptr);
+      }
+
+      static Block* create(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
       {
         /* We avoid using os_malloc for small blocks as this could
          * cause a risk of fragmenting the virtual address space and
@@ -766,7 +779,7 @@ namespace embree
           {
             const size_t alignment = maxAlignment;
             if (device) device->memoryMonitor(bytesAllocate+alignment,false);
-            ptr = alignedMalloc(bytesAllocate,alignment);
+            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);
 
             /* give hint to transparently convert these pages to 2MB pages */
             const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
@@ -780,7 +793,7 @@ namespace embree
           {
             const size_t alignment = maxAlignment;
             if (device) device->memoryMonitor(bytesAllocate+alignment,false);
-            ptr = alignedMalloc(bytesAllocate,alignment);
+            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);
             return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
           }
         }
@@ -812,23 +825,23 @@ namespace embree
         return head;
       }
 
-      void clear_list(MemoryMonitorInterface* device)
+      void clear_list(Device* device, bool useUSM)
       {
         Block* block = this;
         while (block) {
           Block* next = block->next;
-          block->clear_block(device);
+          block->clear_block(device, useUSM);
           block = next;
         }
       }
 
-      void clear_block (MemoryMonitorInterface* device)
+      void clear_block (Device* device, bool useUSM)
       {
         const size_t sizeof_Header = offsetof(Block,data[0]);
         const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();
 
         if (atype == ALIGNED_MALLOC) {
-          alignedFree(this);
+          blockAlignedFree(device, useUSM, this);
           if (device) device->memoryMonitor(-sizeof_Alloced,true);
         }
 
@@ -847,16 +860,16 @@ namespace embree
         size_t bytes = bytes_in;
         assert(align <= maxAlignment);
         bytes = (bytes+(align-1)) & ~(align-1);
-	if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
-	const size_t i = cur.fetch_add(bytes);
+        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
+        const size_t i = cur.fetch_add(bytes);
         if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
         if (unlikely(i > reserveEnd)) return nullptr;
         bytes_in = bytes = min(bytes,reserveEnd-i);
-        
-	if (i+bytes > allocEnd) {
+
+        if (i+bytes > allocEnd) {
           if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
         }
-	return &data[i];
+        return &data[i];
       }
 
       void* ptr() {
@@ -874,7 +887,7 @@ namespace embree
       }
 
       size_t getBlockFreeBytes() const {
-	return getBlockAllocatedBytes() - getBlockUsedBytes();
+        return getBlockAllocatedBytes() - getBlockUsedBytes();
       }
 
       size_t getBlockAllocatedBytes() const {
@@ -963,40 +976,40 @@ namespace embree
       char data[1];              //!< here starts memory to use for allocations
     };
 
+  public:
+    static const size_t blockHeaderSize = offsetof(Block,data[0]);
+
   private:
     Device* device;
-    SpinLock mutex;
     size_t slotMask;
+    size_t defaultBlockSize;
+    size_t estimatedSize;
+    size_t growSize;
+    size_t maxGrowSize;
+
+    MutexSys mutex;
+    MutexSys slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
     std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
     std::atomic<Block*> usedBlocks;
     std::atomic<Block*> freeBlocks;
 
-    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
-#if defined(APPLE) && defined(__aarch64__)
-    std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
-#else
-    PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
-#endif
-
+    bool useUSM;
+    bool blockAllocation = true;
     bool use_single_mode;
-    size_t defaultBlockSize;
-    size_t estimatedSize;
-    size_t growSize;
-    size_t maxGrowSize;
+
     std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
     std::atomic<size_t> bytesUsed;
     std::atomic<size_t> bytesFree;
     std::atomic<size_t> bytesWasted;
+
     static __thread ThreadLocal2* thread_local_allocator2;
-    static SpinLock s_thread_local_allocators_lock;
+    static MutexSys s_thread_local_allocators_lock;
     static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
-#if defined(APPLE) && defined(__aarch64__)
-    std::mutex thread_local_allocators_lock;
-#else
-    SpinLock thread_local_allocators_lock;
-#endif
+
     std::vector<ThreadLocal2*> thread_local_allocators;
     AllocationType atype;
+
     mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
   };
 }
diff --git a/thirdparty/embree/kernels/common/buffer.h b/thirdparty/embree/kernels/common/buffer.h
index 793012c04d..831f5815e8 100644
--- a/thirdparty/embree/kernels/common/buffer.h
+++ b/thirdparty/embree/kernels/common/buffer.h
@@ -13,8 +13,8 @@ namespace embree
   {
   public:
     /*! Buffer construction */
-    Buffer() 
-      : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {}
+    //Buffer() 
+    //: device(nullptr), ptr(nullptr), numBytes(0), shared(false) {}
 
     /*! Buffer construction */
     Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr)
@@ -77,19 +77,17 @@ namespace embree
     /*! allocated buffer */
     void alloc()
     {
-      if (device)
-        device->memoryMonitor(this->bytes(), false);
+      device->memoryMonitor(this->bytes(), false);
       size_t b = (this->bytes()+15) & ssize_t(-16);
-      ptr = (char*)alignedMalloc(b,16);
+      ptr = (char*)device->malloc(b,16);
     }
     
     /*! frees the buffer */
     void free()
     {
       if (shared) return;
-      alignedFree(ptr); 
-      if (device)
-        device->memoryMonitor(-ssize_t(this->bytes()), true);
+      device->free(ptr); 
+      device->memoryMonitor(-ssize_t(this->bytes()), true);
       ptr = nullptr;
     }
     
@@ -246,6 +244,24 @@ namespace embree
   public:
     typedef Vec3fa value_type;
 
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+
+     /*! access to the ith element of the buffer */
+    __forceinline const Vec3fa operator [](size_t i) const
+    {
+      assert(i<num);
+      return Vec3fa::loadu(ptr_ofs + i*stride);
+    }
+    
+    /*! writes the i'th element */
+    __forceinline void store(size_t i, const Vec3fa& v)
+    {
+      assert(i<num);
+      Vec3fa::storeu(ptr_ofs + i*stride, v);
+    }
+    
+#else
+
     /*! access to the ith element of the buffer */
     __forceinline const Vec3fa operator [](size_t i) const
     {
@@ -259,5 +275,6 @@ namespace embree
       assert(i<num);
       vfloat4::storeu((float*)(ptr_ofs + i*stride), (vfloat4)v);
     }
+#endif
   };
 }
diff --git a/thirdparty/embree/kernels/common/builder.h b/thirdparty/embree/kernels/common/builder.h
index 07fe7b069b..4f6a226810 100644
--- a/thirdparty/embree/kernels/common/builder.h
+++ b/thirdparty/embree/kernels/common/builder.h
@@ -7,7 +7,7 @@
 #include "accel.h"
 
 namespace embree
-{
+{  
 #define MODE_HIGH_QUALITY (1<<8)
 
   /*! virtual interface for all hierarchy builders */
diff --git a/thirdparty/embree/kernels/common/context.h b/thirdparty/embree/kernels/common/context.h
index ccd88bdeac..936d03e54d 100644
--- a/thirdparty/embree/kernels/common/context.h
+++ b/thirdparty/embree/kernels/common/context.h
@@ -11,35 +11,62 @@ namespace embree
 {
   class Scene;
 
-  struct IntersectContext
+  struct RayQueryContext
   {
   public:
-    __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context)
-      : scene(scene), user(user_context) {}
+
+    __forceinline RayQueryContext(Scene* scene, RTCRayQueryContext* user_context, RTCIntersectArguments* args)
+      : scene(scene), user(user_context), args(args) {}
+
+    __forceinline RayQueryContext(Scene* scene, RTCRayQueryContext* user_context, RTCOccludedArguments* args)
+      : scene(scene), user(user_context), args((RTCIntersectArguments*)args) {}
 
     __forceinline bool hasContextFilter() const {
-      return user->filter != nullptr;
+      return args->filter != nullptr;
+    }
+
+    RTCFilterFunctionN getFilter() const {
+      return args->filter;
+    }
+
+    RTCIntersectFunctionN getIntersectFunction() const {
+      return args->intersect;
+    }
+    
+    RTCOccludedFunctionN getOccludedFunction() const {
+      return (RTCOccludedFunctionN) args->intersect;
     }
 
     __forceinline bool isCoherent() const {
-      return embree::isCoherent(user->flags);
+      return embree::isCoherent(args->flags);
     }
 
     __forceinline bool isIncoherent() const {
-      return embree::isIncoherent(user->flags);
+      return embree::isIncoherent(args->flags);
     }
-    
+
+    __forceinline bool enforceArgumentFilterFunction() const {
+      return args->flags & RTC_RAY_QUERY_FLAG_INVOKE_ARGUMENT_FILTER;
+    }
+
+#if RTC_MIN_WIDTH
+    __forceinline float getMinWidthDistanceFactor() const {
+      return args->minWidthDistanceFactor;
+    }
+#endif
+
   public:
-    Scene* scene;
-    RTCIntersectContext* user;
+    Scene* scene = nullptr;
+    RTCRayQueryContext* user = nullptr;
+    RTCIntersectArguments* args = nullptr;
   };
 
   template<int M, typename Geometry>
-      __forceinline Vec4vf<M> enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf<M>& ray_org, const Vec4vf<M>& v)
+      __forceinline Vec4vf<M> enlargeRadiusToMinWidth(const RayQueryContext* context, const Geometry* geom, const Vec3vf<M>& ray_org, const Vec4vf<M>& v)
     {
 #if RTC_MIN_WIDTH
       const vfloat<M> d = length(Vec3vf<M>(v) - ray_org);
-      const vfloat<M> r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w);
+      const vfloat<M> r = clamp(context->getMinWidthDistanceFactor()*d, v.w, geom->maxRadiusScale*v.w);
       return Vec4vf<M>(v.x,v.y,v.z,r);
 #else
       return v;
@@ -47,16 +74,21 @@ namespace embree
     }
 
     template<typename Geometry>
-    __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v)
+    __forceinline Vec3ff enlargeRadiusToMinWidth(const RayQueryContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v)
   {
 #if RTC_MIN_WIDTH
     const float d = length(Vec3fa(v) - ray_org);
-    const float r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w);
+    const float r = clamp(context->getMinWidthDistanceFactor()*d, v.w, geom->maxRadiusScale*v.w);
     return Vec3ff(v.x,v.y,v.z,r);
 #else
     return v;
 #endif
   }
+
+  template<typename Geometry>
+    __forceinline Vec3ff enlargeRadiusToMinWidth(const RayQueryContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec4f& v) {
+    return enlargeRadiusToMinWidth(context,geom,ray_org,Vec3ff(v.x,v.y,v.z,v.w));
+  }
   
   enum PointQueryType
   {
@@ -66,7 +98,7 @@ namespace embree
   };
 
   typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args);
-  
+
   struct PointQueryContext
   {
   public:
@@ -78,6 +110,7 @@ namespace embree
                                     float similarityScale,
                                     void* userPtr)
       : scene(scene)
+      , tstate(nullptr)
       , query_ws(query_ws)
       , query_type(query_type)
       , func(func)
@@ -88,16 +121,24 @@ namespace embree
       , geomID(RTC_INVALID_GEOMETRY_ID)
       , query_radius(query_ws->radius)
     { 
+      update();
+    }
+
+  public:
+    __forceinline void update()
+    {
       if (query_type == POINT_QUERY_TYPE_AABB) {
         assert(similarityScale == 0.f);
         updateAABB();
       }
+      else{
+        query_radius = Vec3fa(query_ws->radius * similarityScale);
+      }
       if (userContext->instStackSize == 0) {
         assert(similarityScale == 1.f);
       }
     }
 
-  public:
     __forceinline void updateAABB() 
     {
       if (likely(query_ws->radius == (float)inf || userContext->instStackSize == 0)) {
@@ -113,12 +154,13 @@ namespace embree
 
 public:
     Scene* scene;
+    void* tstate;
 
     PointQuery* query_ws; // the original world space point query 
     PointQueryType query_type;
     PointQueryFunction func;
     RTCPointQueryContext* userContext;
-    const float similarityScale;
+    float similarityScale;
 
     void* userPtr;
 
diff --git a/thirdparty/embree/kernels/common/default.h b/thirdparty/embree/kernels/common/default.h
index f15d61b768..3b00ad3c88 100644
--- a/thirdparty/embree/kernels/common/default.h
+++ b/thirdparty/embree/kernels/common/default.h
@@ -13,11 +13,11 @@
 #include "../../common/sys/mutex.h"
 #include "../../common/sys/vector.h"
 #include "../../common/sys/array.h"
-#include "../../common/sys/string.h"
+#include "../../common/sys/estring.h"
 #include "../../common/sys/regression.h"
 #include "../../common/sys/vector.h"
 
-#include "../../common/math/math.h"
+#include "../../common/math/emath.h"
 #include "../../common/math/transcendental.h"
 #include "../../common/simd/simd.h"
 #include "../../common/math/vec2.h"
@@ -35,8 +35,6 @@
 #include "../../common/math/range.h"
 #include "../../common/lexers/tokenstream.h"
 
-#include "../../common/tasking/taskscheduler.h"
-
 #define COMMA ,
 
 #include "../config.h"
@@ -217,7 +215,7 @@ namespace embree
   __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime)
   {
     const float timeScaled = time * numTimeSegments;
-    const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f);
+    const float itimef = clamp(floor(timeScaled), 0.0f, numTimeSegments-1.0f);
     ftime = timeScaled - itimef;
     return int(itimef);
   }
@@ -225,7 +223,7 @@ namespace embree
   __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime)
   {
     const float timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments;
-    const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f);
+    const float itimef = clamp(floor(timeScaled), 0.0f, numTimeSegments-1.0f);
     ftime = timeScaled - itimef;
     return int(itimef);
   }
diff --git a/thirdparty/embree/kernels/common/device.cpp b/thirdparty/embree/kernels/common/device.cpp
index 833ec65139..07214532a1 100644
--- a/thirdparty/embree/kernels/common/device.cpp
+++ b/thirdparty/embree/kernels/common/device.cpp
@@ -2,6 +2,9 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "device.h"
+
+#include "../../common/tasking/taskscheduler.h"
+
 #include "../hash.h"
 #include "scene_triangle_mesh.h"
 #include "scene_user_geometry.h"
@@ -19,9 +22,12 @@
 #include "../bvh/bvh4_factory.h"
 #include "../bvh/bvh8_factory.h"
 
-#include "../../common/tasking/taskscheduler.h"
 #include "../../common/sys/alloc.h"
 
+#if defined(EMBREE_SYCL_SUPPORT)
+#  include "../level_zero/ze_wrapper.h"
+#endif
+
 namespace embree
 {
   /*! some global variables that can be set via rtcSetParameter1i for debugging purposes */
@@ -30,13 +36,18 @@ namespace embree
   ssize_t Device::debug_int2 = 0;
   ssize_t Device::debug_int3 = 0;
 
-  DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs);
-
   static MutexSys g_mutex;
   static std::map<Device*,size_t> g_cache_size_map;
   static std::map<Device*,size_t> g_num_threads_map;
+  
+  struct TaskArena
+  {
+#if USE_TASK_ARENA
+    std::unique_ptr<tbb::task_arena> arena;
+#endif
+  };
 
-  Device::Device (const char* cfg)
+  Device::Device (const char* cfg) : arena(new TaskArena())
   {
     /* check that CPU supports lowest ISA */
     if (!hasISA(ISA)) {
@@ -48,12 +59,12 @@ namespace embree
     case CPU::UNKNOWN:         frequency_level = FREQUENCY_SIMD256; break;
     case CPU::XEON_ICE_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
     case CPU::CORE_ICE_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
-    case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break;
-    case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break;
-    case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break;
-    case CPU::CORE_KABY_LAKE:  frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_KABY_LAKE:  frequency_level = FREQUENCY_SIMD256; break;
     case CPU::XEON_SKY_LAKE:   frequency_level = FREQUENCY_SIMD128; break;
-    case CPU::CORE_SKY_LAKE:   frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_SKY_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
     case CPU::XEON_BROADWELL:  frequency_level = FREQUENCY_SIMD256; break;
     case CPU::CORE_BROADWELL:  frequency_level = FREQUENCY_SIMD256; break;
     case CPU::XEON_HASWELL:    frequency_level = FREQUENCY_SIMD256; break;
@@ -66,11 +77,7 @@ namespace embree
     case CPU::CORE1:           frequency_level = FREQUENCY_SIMD128; break;
     case CPU::XEON_PHI_KNIGHTS_MILL   : frequency_level = FREQUENCY_SIMD512; break;
     case CPU::XEON_PHI_KNIGHTS_LANDING: frequency_level = FREQUENCY_SIMD512; break;
-#if defined(__APPLE__)
-    case CPU::ARM:             frequency_level = FREQUENCY_SIMD256; break; // Apple M1 supports high throughput for SIMD4
-#else
-    case CPU::ARM:             frequency_level = FREQUENCY_SIMD128; break;
-#endif
+    case CPU::ARM:             frequency_level = FREQUENCY_SIMD256; break;
     }
 
     /* initialize global state */
@@ -126,13 +133,6 @@ namespace embree
 
     /* setup tasking system */
     initTaskingSystem(numThreads);
-
-    /* ray stream SOA to AOS conversion */
-#if defined(EMBREE_RAY_PACKETS)
-    RayStreamFilterFuncsType rayStreamFilterFuncs;
-    SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(enabled_cpu_features,rayStreamFilterFuncs);
-    rayStreamFilters = rayStreamFilterFuncs();
-#endif
   }
 
   Device::~Device ()
@@ -174,6 +174,9 @@ namespace embree
 #if defined (EMBREE_BACKFACE_CULLING_CURVES)
     v += "backfacecullingcurves ";
 #endif
+#if defined (EMBREE_BACKFACE_CULLING_SPHERES)
+    v += "backfacecullingspheres ";
+#endif
 #if defined(EMBREE_FILTER_FUNCTION)
     v += "intersection_filter ";
 #endif
@@ -367,7 +370,7 @@ namespace embree
 #if USE_TASK_ARENA
     const size_t nThreads = min(maxNumThreads,TaskScheduler::threadCount());
     const size_t uThreads = min(max(numUserThreads,(size_t)1),nThreads);
-    arena = make_unique(new tbb::task_arena((int)nThreads,(unsigned int)uThreads));
+    arena->arena = make_unique(new tbb::task_arena((int)nThreads,(unsigned int)uThreads));
 #endif
   }
 
@@ -386,8 +389,21 @@ namespace embree
       TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads);
     }
 #if USE_TASK_ARENA
-    arena.reset();
+    arena->arena.reset();
+#endif
+  }
+
+  void Device::execute(bool join, const std::function<void()>& func)
+  {
+#if USE_TASK_ARENA
+    if (join) {
+      arena->arena->execute(func);
+    }
+    else
 #endif
+    {
+      func();
+    }
   }
 
   void Device::setProperty(const RTCDeviceProperty prop, ssize_t val)
@@ -450,12 +466,6 @@ namespace embree
     case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0;
 #endif
 
-#if defined(EMBREE_RAY_PACKETS)
-    case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED:  return 1;
-#else
-    case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED:  return 0;
-#endif
-    
 #if defined(EMBREE_RAY_MASK)
     case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1;
 #else
@@ -474,6 +484,12 @@ namespace embree
     case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0;
 #endif
 
+#if defined(EMBREE_BACKFACE_CULLING_SPHERES)
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_SPHERES_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_SPHERES_ENABLED: return 0;
+#endif
+
 #if defined(EMBREE_COMPACT_POLYS)
     case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1;
 #else
@@ -557,4 +573,158 @@ namespace embree
     default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break;
     };
   }
+
+  void* Device::malloc(size_t size, size_t align) {
+    return alignedMalloc(size,align);
+  }
+
+  void Device::free(void* ptr) {
+    alignedFree(ptr);
+  }
+
+
+#if defined(EMBREE_SYCL_SUPPORT)
+
+  DeviceGPU::DeviceGPU(sycl::context sycl_context, const char* cfg)
+    : Device(cfg), gpu_context(sycl_context)
+  {
+    /* initialize ZeWrapper */
+    if (ZeWrapper::init() != ZE_RESULT_SUCCESS)
+       throw_RTCError(RTC_ERROR_UNKNOWN, "cannot initialize ZeWrapper");
+     
+    /* take first device as default device */
+    auto devices = gpu_context.get_devices();
+    if (devices.size() == 0)
+      throw_RTCError(RTC_ERROR_UNKNOWN, "SYCL context contains no device");
+    gpu_device = devices[0];
+
+    /* check if RTAS build extension is available */
+    sycl::platform platform = gpu_device.get_platform();
+    ze_driver_handle_t hDriver = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(platform);
+    
+    uint32_t count = 0;
+    std::vector<ze_driver_extension_properties_t> extensions;
+    ze_result_t result = ZeWrapper::zeDriverGetExtensionProperties(hDriver,&count,extensions.data());
+    if (result != ZE_RESULT_SUCCESS)
+      throw_RTCError(RTC_ERROR_UNKNOWN, "zeDriverGetExtensionProperties failed");
+    
+    extensions.resize(count);
+    result = ZeWrapper::zeDriverGetExtensionProperties(hDriver,&count,extensions.data());
+    if (result != ZE_RESULT_SUCCESS)
+      throw_RTCError(RTC_ERROR_UNKNOWN, "zeDriverGetExtensionProperties failed");
+
+#if defined(EMBREE_SYCL_L0_RTAS_BUILDER)
+    bool ze_rtas_builder = false;
+    for (uint32_t i=0; i<extensions.size(); i++)
+    {
+      if (strncmp("ZE_experimental_rtas_builder",extensions[i].name,sizeof(extensions[i].name)) == 0)
+        ze_rtas_builder = true;
+    }
+    if (!ze_rtas_builder)
+      throw_RTCError(RTC_ERROR_UNKNOWN, "ZE_experimental_rtas_builder extension not found");
+
+    result = ZeWrapper::initRTASBuilder(hDriver,ZeWrapper::LEVEL_ZERO);
+    if (result == ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE)
+      throw_RTCError(RTC_ERROR_UNKNOWN, "cannot load ZE_experimental_rtas_builder extension");
+    if (result != ZE_RESULT_SUCCESS)
+      throw_RTCError(RTC_ERROR_UNKNOWN, "cannot initialize ZE_experimental_rtas_builder extension");
+#else
+    ZeWrapper::initRTASBuilder(hDriver,ZeWrapper::INTERNAL);
+#endif
+
+    if (State::verbosity(1))
+    {
+      if (ZeWrapper::rtas_builder == ZeWrapper::INTERNAL)
+        std::cout << "  Internal RTAS Builder" << std::endl;
+      else
+        std::cout << "  Level Zero RTAS Builder" << std::endl;
+    }
+
+    /* check if extension library can be loaded */
+    ze_rtas_parallel_operation_exp_handle_t hParallelOperation;
+    result = ZeWrapper::zeRTASParallelOperationCreateExp(hDriver, &hParallelOperation);
+    if (result == ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE)
+      throw_RTCError(RTC_ERROR_UNKNOWN, "Level Zero RTAS Build Extension cannot get loaded");
+    if (result == ZE_RESULT_SUCCESS)
+      ZeWrapper::zeRTASParallelOperationDestroyExp(hParallelOperation);
+
+    gpu_maxWorkGroupSize = getGPUDevice().get_info<sycl::info::device::max_work_group_size>();
+    gpu_maxComputeUnits  = getGPUDevice().get_info<sycl::info::device::max_compute_units>();    
+
+    if (State::verbosity(1))
+    {
+      sycl::platform platform = gpu_context.get_platform();
+      std::cout << "  Platform              : " << platform.get_info<sycl::info::platform::name>() << std::endl;
+      std::cout << "    Device              : " << getGPUDevice().get_info<sycl::info::device::name>() << std::endl;
+      std::cout << "    Max Work Group Size : " << gpu_maxWorkGroupSize << std::endl;
+      std::cout << "    Max Compute Units   : " << gpu_maxComputeUnits  << std::endl;
+      std::cout << std::endl;
+    }
+    
+    dispatchGlobalsPtr = zeRTASInitExp(gpu_device, gpu_context);
+  }
+
+  DeviceGPU::~DeviceGPU()
+  {
+    rthwifCleanup(this,dispatchGlobalsPtr,gpu_context);
+  }
+
+  void DeviceGPU::enter() {
+    enableUSMAllocEmbree(&gpu_context,&gpu_device);
+  }
+
+  void DeviceGPU::leave() {
+    disableUSMAllocEmbree();
+  }
+
+  void* DeviceGPU::malloc(size_t size, size_t align) {
+    return alignedSYCLMalloc(&gpu_context,&gpu_device,size,align,EMBREE_USM_SHARED_DEVICE_READ_ONLY);
+  }
+
+  void DeviceGPU::free(void* ptr) {
+    alignedSYCLFree(&gpu_context,ptr);
+  }
+
+  void DeviceGPU::setSYCLDevice(const sycl::device sycl_device_in) {
+    gpu_device = sycl_device_in;
+  }
+  
+#endif
+
+  DeviceEnterLeave::DeviceEnterLeave (RTCDevice hdevice)
+    : device((Device*)hdevice)
+  {
+    assert(device);
+    device->refInc();
+    device->enter();
+  }
+  
+  DeviceEnterLeave::DeviceEnterLeave (RTCScene hscene)
+    : device(((Scene*)hscene)->device)
+  {
+    assert(device);
+    device->refInc();
+    device->enter();
+  }
+  
+  DeviceEnterLeave::DeviceEnterLeave (RTCGeometry hgeometry)
+    : device(((Geometry*)hgeometry)->device)
+  {
+    assert(device);
+    device->refInc();
+    device->enter();
+  }
+  
+  DeviceEnterLeave::DeviceEnterLeave (RTCBuffer hbuffer)
+    : device(((Buffer*)hbuffer)->device)
+  {
+    assert(device);
+    device->refInc();
+    device->enter();
+  }
+  
+  DeviceEnterLeave::~DeviceEnterLeave() {
+    device->leave();
+    device->refDec();
+  }
 }
diff --git a/thirdparty/embree/kernels/common/device.h b/thirdparty/embree/kernels/common/device.h
index 21c42c654d..c9e8888a5a 100644
--- a/thirdparty/embree/kernels/common/device.h
+++ b/thirdparty/embree/kernels/common/device.h
@@ -11,10 +11,57 @@ namespace embree
 {
   class BVH4Factory;
   class BVH8Factory;
+  struct TaskArena;
 
   class Device : public State, public MemoryMonitorInterface
   {
     ALIGNED_CLASS_(16);
+    
+  public:
+    
+    /*! allocator that obtains memory through the virtual Device::malloc/Device::free, so a DeviceGPU serves it from its SYCL USM allocation */
+    template<typename T, size_t alignment>
+    struct allocator
+    {
+      typedef T value_type;
+      typedef T* pointer;
+      typedef const T* const_pointer;
+      typedef T& reference;
+      typedef const T& const_reference;
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+      
+      allocator() {} // leaves device == nullptr: allocate() then asserts and deallocate() does nothing
+      
+      allocator(Device* device)
+        : device(device) {}
+      
+      __forceinline pointer allocate( size_type n ) { // n is an element count; the byte size is n*sizeof(T)
+        assert(device);
+        return (pointer) device->malloc(n*sizeof(T),alignment); // NOTE(review): n*sizeof(T) is not checked for overflow
+      }
+      
+      __forceinline void deallocate( pointer p, size_type n ) { // n is not used
+        if (device) device->free(p);
+      }
+      
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val); // placement copy-construction into the storage at p
+      }
+      
+      __forceinline void destroy( pointer p ) {
+        p->~T(); // runs the destructor only; storage is released separately by deallocate()
+      }
+      
+      Device* device = nullptr; // device that services the allocations; this struct never calls refInc()/refDec() on it
+    };
+
+    /*! vector class that performs aligned allocations from Device object */
+    template<typename T>
+    using vector = vector_t<T,allocator<T,std::alignment_of<T>::value>>;
+
+    template<typename T, size_t alignment>
+    using avector = vector_t<T,allocator<T,alignment>>;
 
   public:
 
@@ -54,6 +101,18 @@ namespace embree
     /*! gets a property */
     ssize_t getProperty(const RTCDeviceProperty prop);
 
+    /*! enter device by setting up some global state */
+    virtual void enter() {}
+
+    /*! leave device by undoing the global state set up by enter() */
+    virtual void leave() {}
+
+    /*! buffer allocation */
+    virtual void* malloc(size_t size, size_t align);
+
+    /*! buffer deallocation */
+    virtual void free(void* ptr);
+
   private:
 
     /*! initializes the tasking system */
@@ -62,6 +121,13 @@ namespace embree
     /*! shuts down the tasking system */
     void exitTaskingSystem();
 
+    std::unique_ptr<TaskArena> arena;
+
+  public:
+
+    // use tasking system arena to execute func
+    void execute(bool join, const std::function<void()>& func);
+
     /*! some variables that can be set via rtcSetParameter1i for debugging purposes */
   public:
     static ssize_t debug_int0;
@@ -74,12 +140,55 @@ namespace embree
 #if defined(EMBREE_TARGET_SIMD8)
     std::unique_ptr<BVH8Factory> bvh8_factory;
 #endif
-    
-#if USE_TASK_ARENA
-    std::unique_ptr<tbb::task_arena> arena;
+  };
+
+#if defined(EMBREE_SYCL_SUPPORT)
+     
+  class DeviceGPU : public Device // Device variant that is only compiled when EMBREE_SYCL_SUPPORT is defined
+  {
+  public:
+
+    DeviceGPU(sycl::context sycl_context, const char* cfg);
+    ~DeviceGPU(); // passes dispatchGlobalsPtr and gpu_context to rthwifCleanup()
+
+    virtual void enter() override; // calls enableUSMAllocEmbree() with gpu_context/gpu_device
+    virtual void leave() override; // calls disableUSMAllocEmbree()
+    virtual void* malloc(size_t size, size_t align) override; // alignedSYCLMalloc() on gpu_context/gpu_device
+    virtual void free(void* ptr) override; // alignedSYCLFree() on gpu_context
+
+    /* set SYCL device (only gpu_device is replaced; the cached limits below are left as they are) */
+    void setSYCLDevice(const sycl::device sycl_device);
+
+  private:
+    sycl::context gpu_context;
+    sycl::device  gpu_device;
+        
+    unsigned int gpu_maxWorkGroupSize; // printed by the constructor as "Max Work Group Size"
+    unsigned int gpu_maxComputeUnits; // printed by the constructor as "Max Compute Units"
+
+  public:
+    void* dispatchGlobalsPtr = nullptr; // assigned from zeRTASInitExp() in the constructor, handed to rthwifCleanup() in the destructor
+
+  public:
+    inline sycl::device  &getGPUDevice()  { return gpu_device; } // mutable reference to the member, not a copy
+    inline sycl::context &getGPUContext() { return gpu_context; } // mutable reference to the member, not a copy
+
+    inline unsigned int getGPUMaxWorkGroupSize() { return gpu_maxWorkGroupSize; }
+
+    void init_rthw_level_zero(); // declaration only; the definition is not part of this section of the patch
+    void init_rthw_opencl(); // declaration only; the definition is not part of this section of the patch
+  };
+
 #endif
-    
-    /* ray streams filter */
-    RayStreamFilterFuncs rayStreamFilters;
+
+  struct DeviceEnterLeave // scope guard: each constructor calls refInc() + enter() on the device, the destructor calls leave() + refDec()
+  {
+    DeviceEnterLeave (RTCDevice hdevice); // guards the device behind the handle itself
+    DeviceEnterLeave (RTCScene hscene); // guards the device the scene belongs to
+    DeviceEnterLeave (RTCGeometry hgeometry); // guards the device the geometry belongs to
+    DeviceEnterLeave (RTCBuffer hbuffer); // guards the device the buffer belongs to
+    ~DeviceEnterLeave(); // NOTE(review): copying is not disabled, so a copy would run leave()/refDec() a second time
+  private:
+    Device* device; // resolved once in the constructor and used again in the destructor
   };
 }
diff --git a/thirdparty/embree/kernels/common/geometry.cpp b/thirdparty/embree/kernels/common/geometry.cpp
index d8d3f65a5c..79a6eb00d7 100644
--- a/thirdparty/embree/kernels/common/geometry.cpp
+++ b/thirdparty/embree/kernels/common/geometry.cpp
@@ -45,12 +45,13 @@ namespace embree
   Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps) 
     : device(device), userPtr(nullptr),
       numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f),
-      mask(-1),
+      mask(1),
       gtype(gtype),
       gsubtype(GTY_SUBTYPE_DEFAULT),
       quality(RTC_BUILD_QUALITY_MEDIUM),
       state((unsigned)State::MODIFIED),
       enabled(true),
+      argumentFilterEnabled(false),
       intersectionFilterN(nullptr), occlusionFilterN(nullptr), pointQueryFunc(nullptr)
   {
     device->refInc();
@@ -88,6 +89,11 @@ namespace embree
     Geometry::update();
   }
   
+  BBox1f Geometry::getTimeRange () const
+  {
+    return time_range; // returned by value; the constructor initialises the member to (0.0f,1.0f)
+  }
+
   void Geometry::update()
   {
     ++modCounter_; // FIXME: required?
@@ -227,11 +233,11 @@ namespace embree
       }
     }
   }
-    
+
   bool Geometry::pointQuery(PointQuery* query, PointQueryContext* context)
   {
     assert(context->primID < size());
-   
+
     RTCPointQueryFunctionArguments args;
     args.query           = (RTCPointQuery*)context->query_ws;
     args.userPtr         = context->userPtr;
@@ -239,7 +245,7 @@ namespace embree
     args.geomID          = context->geomID;
     args.context         = context->userContext;
     args.similarityScale = context->similarityScale;
-    
+
     bool update = false;
     if(context->func)  update |= context->func(&args);
     if(pointQueryFunc) update |= pointQueryFunc(&args);
diff --git a/thirdparty/embree/kernels/common/geometry.h b/thirdparty/embree/kernels/common/geometry.h
index 593990f5b1..00e3c5ede3 100644
--- a/thirdparty/embree/kernels/common/geometry.h
+++ b/thirdparty/embree/kernels/common/geometry.h
@@ -8,6 +8,7 @@
 #include "buffer.h"
 #include "../common/point_query.h"
 #include "../builders/priminfo.h"
+#include "../builders/priminfo_mb.h"
 
 namespace embree
 {
@@ -26,12 +27,14 @@ namespace embree
         numUserGeometries(0), numMBUserGeometries(0), 
         numInstancesCheap(0), numMBInstancesCheap(0), 
         numInstancesExpensive(0), numMBInstancesExpensive(0), 
-        numGrids(0), numMBGrids(0), 
+        numInstanceArrays(0), numMBInstanceArrays(0),
+        numGrids(0), numMBGrids(0),
+        numSubGrids(0), numMBSubGrids(0), 
         numPoints(0), numMBPoints(0) {}
 
     __forceinline size_t size() const {
-      return    numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints
-              + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints;
+      return    numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numInstanceArrays + numGrids + numPoints
+              + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBInstanceArrays + numMBGrids + numMBPoints;
     }
 
     __forceinline unsigned int enabledGeometryTypesMask() const
@@ -44,8 +47,9 @@ namespace embree
       if (numUserGeometries) mask |= 1 << 4;
       if (numInstancesCheap) mask |= 1 << 5;
       if (numInstancesExpensive) mask |= 1 << 6;
-      if (numGrids) mask |= 1 << 7;
-      if (numPoints) mask |= 1 << 8;
+      if (numInstanceArrays) mask |= 1 << 7;
+      if (numGrids) mask |= 1 << 8;
+      if (numPoints) mask |= 1 << 9;
 
       unsigned int maskMB = 0;
       if (numMBTriangles) maskMB |= 1 << 0;
@@ -55,8 +59,9 @@ namespace embree
       if (numMBUserGeometries) maskMB |= 1 << 4;
       if (numMBInstancesCheap) maskMB |= 1 << 5;
       if (numMBInstancesExpensive) maskMB |= 1 << 6;
-      if (numMBGrids) maskMB |= 1 << 7;
-      if (numMBPoints) maskMB |= 1 << 8;
+      if (numMBInstanceArrays) maskMB |= 1 << 7;
+      if (numMBGrids) maskMB |= 1 << 8;
+      if (numMBPoints) maskMB |= 1 << 9;
       
       return (mask<<8) + maskMB;
     }
@@ -81,8 +86,12 @@ namespace embree
       ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap;
       ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive;
       ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive;
+      ret.numInstanceArrays = numInstanceArrays + rhs.numInstanceArrays;
+      ret.numMBInstanceArrays = numMBInstanceArrays + rhs.numMBInstanceArrays;
       ret.numGrids = numGrids + rhs.numGrids;
       ret.numMBGrids = numMBGrids + rhs.numMBGrids;
+      ret.numSubGrids = numSubGrids + rhs.numSubGrids;
+      ret.numMBSubGrids = numMBSubGrids + rhs.numMBSubGrids;
       ret.numPoints = numPoints + rhs.numPoints;
       ret.numMBPoints = numMBPoints + rhs.numMBPoints;
 
@@ -106,8 +115,12 @@ namespace embree
     size_t numMBInstancesCheap;      //!< number of enabled motion blurred cheap instances
     size_t numInstancesExpensive;    //!< number of enabled expensive instances
     size_t numMBInstancesExpensive;  //!< number of enabled motion blurred expensive instances
+    size_t numInstanceArrays;        //!< number of enabled instance arrays
+    size_t numMBInstanceArrays;      //!< number of enabled motion blurred instance arrays
     size_t numGrids;                 //!< number of enabled grid geometries
     size_t numMBGrids;               //!< number of enabled motion blurred grid geometries
+    size_t numSubGrids;              //!< number of enabled subgrids
+    size_t numMBSubGrids;            //!< number of enabled motion blurred subgrids
     size_t numPoints;                //!< number of enabled points
     size_t numMBPoints;              //!< number of enabled motion blurred points
   };
@@ -115,6 +128,8 @@ namespace embree
   /*! Base class all geometries are derived from */
   class Geometry : public RefCount
   {
+    ALIGNED_CLASS_USM_(16);
+    
     friend class Scene;
   public:
 
@@ -154,6 +169,7 @@ namespace embree
       GTY_USER_GEOMETRY = 29,
       GTY_INSTANCE_CHEAP = 30,
       GTY_INSTANCE_EXPENSIVE = 31,
+      GTY_INSTANCE_ARRAY = 24,
       GTY_END = 32,
 
       GTY_BASIS_LINEAR = 0,
@@ -222,7 +238,10 @@ namespace embree
 
       MTY_INSTANCE_CHEAP = 1ul << GTY_INSTANCE_CHEAP,
       MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE,
-      MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE
+      MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE,
+      MTY_INSTANCE_ARRAY = 1ul << GTY_INSTANCE_ARRAY,
+
+      MTY_ALL = -1
     };
 
     static const char* gtype_names[GTY_END];
@@ -248,8 +267,13 @@ namespace embree
     /*! tests if geometry is disabled */
     __forceinline bool isDisabled() const { return !isEnabled(); }
 
+    /*! checks whether the argument version of the filter functions is enabled; the flag is false after construction and set by enableFilterFunctionFromArguments() */
+    __forceinline bool hasArgumentFilterFunctions() const {
+      return argumentFilterEnabled;
+    }
+    
     /*! tests if that geometry has some filter function set */
-    __forceinline bool hasFilterFunctions () const {
+    __forceinline bool hasGeometryFilterFunctions () const {
       return (intersectionFilterN  != nullptr) || (occlusionFilterN  != nullptr);
     }
 
@@ -265,6 +289,11 @@ namespace embree
     /*! returns geometry type mask */
     __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1 << gtype); }
 
+    /*! returns true if the geometry contains motion blur, i.e. it has more than one time step */
+    __forceinline bool hasMotionBlur () const {
+      return numTimeSteps > 1;
+    }
+
     /*! returns number of primitives */
     __forceinline size_t size() const { return numPrimitives; }
 
@@ -277,6 +306,9 @@ namespace embree
     /*! sets motion blur time range */
     void setTimeRange (const BBox1f range);
 
+    /*! gets motion blur time range */
+    BBox1f getTimeRange () const;
+
     /*! sets number of vertex attributes */
     virtual void setVertexAttributeCount (unsigned int N) {
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
@@ -447,6 +479,11 @@ namespace embree
     /*! Set occlusion filter function for ray packets of size N. */
     virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN);
 
+    /*! Enables or disables the argument version of the intersection/occlusion filter function; this only stores the flag reported by hasArgumentFilterFunctions(). */
+    virtual void enableFilterFunctionFromArguments (bool enable) {
+      argumentFilterEnabled = enable;
+    }
+
     /*! for instances only */
   public:
 
@@ -455,6 +492,11 @@ namespace embree
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
     }
 
+    /*! Sets the instanced scenes; this base implementation always raises RTC_ERROR_INVALID_OPERATION and ignores both arguments */
+    virtual void setInstancedScenes(const RTCScene* scenes, size_t numScenes) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+    }
+
     /*! Sets transformation of the instance */
     virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) {
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
@@ -467,7 +509,12 @@ namespace embree
 
     /*! Returns the transformation of the instance */
     virtual AffineSpace3fa getTransform(float time) {
-      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+    }
+
+    /*! Returns the transformation of one instance (`instance` is presumably an index into an instance array - confirm); this base implementation always raises RTC_ERROR_INVALID_OPERATION */
+    virtual AffineSpace3fa getTransform(size_t instance, float time) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
     }
 
     /*! for user geometries only */
@@ -498,18 +545,47 @@ namespace embree
 
   public:
 
-    virtual PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const {
+    virtual PrimInfo createPrimRefArray(PrimRef* prims, const range<size_t>& r, size_t k, unsigned int geomID) const {
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry"); 
     }
 
+    PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const { // non-virtual convenience overload for mvector storage
+      return createPrimRefArray(prims.data(),r,k,geomID); // forwards to the virtual PrimRef* overload
+    }
+
+    PrimInfo createPrimRefArray(avector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const { // non-virtual convenience overload for avector storage
+      return createPrimRefArray(prims.data(),r,k,geomID); // forwards to the virtual PrimRef* overload
+    }
+
+    virtual PrimInfo createPrimRefArray(mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids, const range<size_t>& r, size_t k, unsigned int geomID) const { // default implementation does not touch sgrids
+      return createPrimRefArray(prims,r,k,geomID); // falls back to the overload without sub-grid data
+    }
+
     virtual PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const {
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); 
     }
 
+    /*! Calculates the PrimRef over the complete time interval; this base implementation always raises RTC_ERROR_INVALID_OPERATION */
+    virtual PrimInfo createPrimRefArrayMB(PrimRef* prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); // NOTE(review): the message names createPrimRefMBArray although this function is createPrimRefArrayMB
+    }
+
+    PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const { // non-virtual convenience overload for mvector storage
+      return createPrimRefArrayMB(prims.data(),t0t1,r,k,geomID); // forwards to the virtual PrimRef* overload
+    }
+
+    PrimInfo createPrimRefArrayMB(avector<PrimRef>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const { // non-virtual convenience overload for avector storage
+      return createPrimRefArrayMB(prims.data(),t0t1,r,k,geomID); // forwards to the virtual PrimRef* overload
+    }
+    
     virtual PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const {
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); 
     }
 
+    virtual PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, mvector<SubGridBuildData>& sgrids, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const { // default implementation does not touch sgrids
+      return createPrimRefMBArray(prims,t0t1,r,k,geomID); // falls back to the overload without sub-grid data
+    }
+
     virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const {
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); 
     }
@@ -541,6 +617,10 @@ namespace embree
     virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
     }
+
+    virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range, const SubGridBuildData * const sgrids) const { // default implementation does not read sgrids
+      return vlinearBounds(primID,time_range); // falls back to the overload without sub-grid data
+    }
     
     virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
@@ -566,13 +646,14 @@ namespace embree
     
     unsigned int mask;             //!< for masking out geometry
     unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild scenes when geo is modified
-    
+
     struct {
       GType gtype : 8;                //!< geometry type
       GSubType gsubtype : 8;          //!< geometry subtype
       RTCBuildQuality quality : 3;    //!< build quality for geometry
       unsigned state : 2;
-      bool enabled : 1;              //!< true if geometry is enabled
+      bool enabled : 1;               //!< true if geometry is enabled
+      bool argumentFilterEnabled : 1; //!< true if argument filter functions are enabled for this geometry
     };
        
     RTCFilterFunctionN intersectionFilterN;
diff --git a/thirdparty/embree/kernels/common/hit.h b/thirdparty/embree/kernels/common/hit.h
index fd1a9d6391..cbaeb9b73a 100644
--- a/thirdparty/embree/kernels/common/hit.h
+++ b/thirdparty/embree/kernels/common/hit.h
@@ -17,14 +17,26 @@ namespace embree
     __forceinline HitK() {}
 
     /* Constructs a hit */
-    __forceinline HitK(const RTCIntersectContext* context, const vuint<K>& geomID, const vuint<K>& primID, const vfloat<K>& u, const vfloat<K>& v, const Vec3vf<K>& Ng)
+    __forceinline HitK(const RTCRayQueryContext* context, const vuint<K>& geomID, const vuint<K>& primID, const vfloat<K>& u, const vfloat<K>& v, const Vec3vf<K>& Ng)
       : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID) 
     {
-      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
         instID[l] = RTC_INVALID_GEOMETRY_ID;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instPrimID[l] = RTC_INVALID_GEOMETRY_ID;
+#endif
+      }
+      
       instance_id_stack::copy_UV<K>(context->instID, instID);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+      instance_id_stack::copy_UV<K>(context->instPrimID, instPrimID);
+#endif
     }
 
+    /* Constructs a hit from a packed uv vector; delegates to the (u,v) constructor with uv.x and uv.y */
+    __forceinline HitK(const RTCRayQueryContext* context, const vuint<K>& geomID, const vuint<K>& primID, const Vec2vf<K>& uv, const Vec3vf<K>& Ng)
+      : HitK(context,geomID,primID,uv.x,uv.y,Ng) {}
+
     /* Returns the size of the hit */
     static __forceinline size_t size() { return K; }
 
@@ -35,6 +47,9 @@ namespace embree
     vuint<K> primID;      // primitive ID
     vuint<K> geomID;      // geometry ID
     vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance ID
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    vuint<K> instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance primitive ID
+#endif
   };
 
   /* Specialization for a single hit */
@@ -45,12 +60,19 @@ namespace embree
     __forceinline HitK() {}
 
     /* Constructs a hit */
-    __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng)
+    __forceinline HitK(const RTCRayQueryContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng)
       : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID)
     {
-      instance_id_stack::copy_UU(context->instID, instID);
+      instance_id_stack::copy_UU(context, context->instID, instID);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+      instance_id_stack::copy_UU(context, context->instPrimID, instPrimID);
+#endif
     }
 
+    /* Constructs a hit from a packed uv vector; delegates to the (u,v) constructor with uv.x and uv.y */
+    __forceinline HitK(const RTCRayQueryContext* context, unsigned int geomID, unsigned int primID, const Vec2f& uv, const Vec3fa& Ng)
+      : HitK<1>(context,geomID,primID,uv.x,uv.y,Ng) {}
+
     /* Returns the size of the hit */
     static __forceinline size_t size() { return 1; }
 
@@ -61,6 +83,9 @@ namespace embree
     unsigned int primID;      // primitive ID
     unsigned int geomID;      // geometry ID
     unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance ID
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    unsigned int instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance primitive ID
+#endif
   };
 
   /* Shortcuts */
@@ -68,6 +93,7 @@ namespace embree
   typedef HitK<4>  Hit4;
   typedef HitK<8>  Hit8;
   typedef HitK<16> Hit16;
+  typedef HitK<VSIZEX> Hitx;
 
   /* Outputs hit to stream */
   template<int K>
@@ -84,6 +110,13 @@ namespace embree
     {
       cout << " " << ray.instID[l];
     }
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    cout << "  instPrimID =";
+    for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    {
+      cout << " " << ray.instPrimID[l];
+    }
+#endif
     cout << embree_endl;
     return cout << "}";
   }
@@ -97,10 +130,13 @@ namespace embree
     ray.primID = hit.primID;
     ray.geomID = hit.geomID;
     instance_id_stack::copy_UU(hit.instID, ray.instID);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    instance_id_stack::copy_UU(hit.instPrimID, ray.instPrimID);
+#endif
   }
 
   template<int K>
-    __forceinline void copyHitToRay(const vbool<K> &mask, RayHitK<K> &ray, const HitK<K> &hit)
+    __forceinline void copyHitToRay(const vbool<K>& mask, RayHitK<K>& ray, const HitK<K>& hit)
   {
     vfloat<K>::storeu(mask,&ray.Ng.x, hit.Ng.x);
     vfloat<K>::storeu(mask,&ray.Ng.y, hit.Ng.y);
@@ -110,5 +146,8 @@ namespace embree
     vuint<K>::storeu(mask,&ray.primID, hit.primID);
     vuint<K>::storeu(mask,&ray.geomID, hit.geomID);
     instance_id_stack::copy_VV<K>(hit.instID, ray.instID, mask);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    instance_id_stack::copy_VV<K>(hit.instPrimID, ray.instPrimID, mask);
+#endif
   }
 }
diff --git a/thirdparty/embree/kernels/common/instance_stack.h b/thirdparty/embree/kernels/common/instance_stack.h
index d3c0a643f1..32b57b48a3 100644
--- a/thirdparty/embree/kernels/common/instance_stack.h
+++ b/thirdparty/embree/kernels/common/instance_stack.h
@@ -19,40 +19,108 @@ static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0,
 /* 
  * Push an instance to the stack. 
  */
-RTC_FORCEINLINE bool push(RTCIntersectContext* context, 
-                          unsigned instanceId)
+template<typename Context>
+RTC_FORCEINLINE bool push(Context context,
+                          unsigned instanceId,
+                          unsigned instancePrimId)
 {
 #if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
   const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT;
-  /* We assert here because instances are silently dropped when the stack is full. 
+  /* We assert here because instances are silently dropped when the stack is full.
      This might be quite hard to find in production. */
-  assert(spaceAvailable); 
-  if (likely(spaceAvailable))
-    context->instID[context->instStackSize++] = instanceId;
+  assert(spaceAvailable);
+  if (likely(spaceAvailable)) {
+    context->instID[context->instStackSize] = instanceId;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    context->instPrimID[context->instStackSize] = instancePrimId;
+#endif
+    context->instStackSize++;
+  }
   return spaceAvailable;
 #else
   const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID);
-  assert(spaceAvailable); 
-  if (likely(spaceAvailable))
+  assert(spaceAvailable);
+  if (likely(spaceAvailable)) {
     context->instID[0] = instanceId;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    context->instPrimID[0] = instancePrimId;
+#endif
+  }
   return spaceAvailable;
 #endif
 }
 
-
 /* 
  * Pop the last instance pushed to the stack. 
  * Do not call on an empty stack. 
  */
-RTC_FORCEINLINE void pop(RTCIntersectContext* context)
+template<typename Context>
+RTC_FORCEINLINE void pop(Context context)
 {
   assert(context);
 #if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
   assert(context->instStackSize > 0);
-  context->instID[--context->instStackSize] = RTC_INVALID_GEOMETRY_ID;
+  --context->instStackSize;
+  context->instID[context->instStackSize] = RTC_INVALID_GEOMETRY_ID;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  context->instPrimID[context->instStackSize] = RTC_INVALID_GEOMETRY_ID;
+#endif
 #else
   assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID);
   context->instID[0] = RTC_INVALID_GEOMETRY_ID;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  context->instPrimID[0] = RTC_INVALID_GEOMETRY_ID;
+#endif
+#endif
+}
+
+
+/* Push an instance to the stack. Used for point queries: records the IDs and both transforms at the current stack depth, then increments the depth. Always returns true. */
+RTC_FORCEINLINE bool push(RTCPointQueryContext* context,
+                          unsigned int instanceId,
+                          unsigned int instancePrimId, // only stored when RTC_GEOMETRY_INSTANCE_ARRAY is defined
+                          AffineSpace3fa const& w2i,
+                          AffineSpace3fa const& i2w)
+{
+  assert(context);
+  const size_t stackSize = context->instStackSize; // slot that this push fills
+  assert(stackSize < RTC_MAX_INSTANCE_LEVEL_COUNT); // NOTE(review): assertion only; a full stack is not rejected at runtime
+  context->instID[stackSize] = instanceId;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  context->instPrimID[stackSize] = instancePrimId;
+#endif
+
+  AffineSpace3fa_store_unaligned(w2i,(AffineSpace3fa*)context->world2inst[stackSize]); // stored first, combined with the previous level below
+  AffineSpace3fa_store_unaligned(i2w,(AffineSpace3fa*)context->inst2world[stackSize]);
+
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  if (unlikely(stackSize > 0)) // a previous level exists: replace slot stackSize with its product with slot stackSize-1
+  {
+    const AffineSpace3fa world2inst = AffineSpace3fa_load_unaligned((AffineSpace3fa*)context->world2inst[stackSize  ])
+                                    * AffineSpace3fa_load_unaligned((AffineSpace3fa*)context->world2inst[stackSize-1]);
+    const AffineSpace3fa inst2world = AffineSpace3fa_load_unaligned((AffineSpace3fa*)context->inst2world[stackSize-1])
+                                    * AffineSpace3fa_load_unaligned((AffineSpace3fa*)context->inst2world[stackSize  ]); // operand order is the reverse of world2inst
+    AffineSpace3fa_store_unaligned(world2inst,(AffineSpace3fa*)context->world2inst[stackSize]);
+    AffineSpace3fa_store_unaligned(inst2world,(AffineSpace3fa*)context->inst2world[stackSize]);
+  }
+#endif
+  context->instStackSize++; // incremented for every RTC_MAX_INSTANCE_LEVEL_COUNT, including 1
+  return true; // unconditional; no failure path exists in this overload
+}
+
+template<>
+RTC_FORCEINLINE void pop(RTCPointQueryContext* context) // explicit specialization of the templated pop() for point-query contexts
+{
+  assert(context);
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  assert(context->instStackSize > 0); // must not be called on an empty stack
+#else
+  assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID); // single-level build: emptiness is checked through slot 0
+#endif
+  --context->instStackSize; // done for every RTC_MAX_INSTANCE_LEVEL_COUNT, mirroring the increment in the point-query push()
+  context->instID[context->instStackSize] = RTC_INVALID_GEOMETRY_ID; // the stored world2inst/inst2world matrices are left as they are
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+  context->instPrimID[context->instStackSize] = RTC_INVALID_GEOMETRY_ID;
 #endif
 }
 
@@ -77,6 +145,24 @@ RTC_FORCEINLINE void copy_UU(const unsigned* src, unsigned* tgt)
 #endif
 }
 
+RTC_FORCEINLINE void copy_UU(const RTCRayQueryContext* context, const unsigned* src, unsigned* tgt) // context is only read when RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  tgt[0] = src[0]; // single level: plain copy of the one entry
+  
+#else
+  
+  unsigned int depth = context->instStackSize; // entries [0,depth) are copied from src
+  
+  for (unsigned l = 0; l < depth; ++l)
+    tgt[l] = src[l];
+  
+  for (unsigned l = depth; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    tgt[l] = RTC_INVALID_GEOMETRY_ID; // remaining slots are filled with the invalid ID instead of being read from src
+
+#endif
+}
+  
 template <int K>
 RTC_FORCEINLINE void copy_UV(const unsigned* src, vuint<K>* tgt)
 {
diff --git a/thirdparty/embree/kernels/common/point_query.h b/thirdparty/embree/kernels/common/point_query.h
index 7d55c91fff..e92e22ae36 100644
--- a/thirdparty/embree/kernels/common/point_query.h
+++ b/thirdparty/embree/kernels/common/point_query.h
@@ -120,6 +120,7 @@ namespace embree
   typedef PointQueryK<4>  PointQuery4;
   typedef PointQueryK<8>  PointQuery8;
   typedef PointQueryK<16> PointQuery16;
+  typedef PointQueryK<VSIZEX> PointQueryx;
   struct PointQueryN;
 
   /* Outputs point query to stream */
diff --git a/thirdparty/embree/kernels/common/ray.h b/thirdparty/embree/kernels/common/ray.h
index 3c8ee3989c..c886013354 100644
--- a/thirdparty/embree/kernels/common/ray.h
+++ b/thirdparty/embree/kernels/common/ray.h
@@ -10,8 +10,6 @@
 
 namespace embree
 {
-  static const size_t MAX_INTERNAL_STREAM_SIZE = 32;
-
   /* Ray structure for K rays */
   template<int K>
   struct RayK
@@ -97,16 +95,24 @@ namespace embree
       : RayK<K>(org, dir, tnear, tfar, time, mask, id, flags),
         geomID(RTC_INVALID_GEOMETRY_ID) 
     {
-      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
         instID[l] = RTC_INVALID_GEOMETRY_ID;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instPrimID[l] = RTC_INVALID_GEOMETRY_ID;
+#endif
+      }
     }
 
     __forceinline RayHitK(const RayK<K>& ray)
       : RayK<K>(ray),
         geomID(RTC_INVALID_GEOMETRY_ID) 
     {
-      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
         instID[l] = RTC_INVALID_GEOMETRY_ID;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instPrimID[l] = RTC_INVALID_GEOMETRY_ID;
+#endif
+      }
     }
 
     __forceinline RayHitK<K>& operator =(const RayK<K>& ray)
@@ -121,8 +127,12 @@ namespace embree
       flags  = ray.flags;
 
       geomID = RTC_INVALID_GEOMETRY_ID;
-      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
         instID[l] = RTC_INVALID_GEOMETRY_ID;
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instPrimID[l] = RTC_INVALID_GEOMETRY_ID;
+#endif
+      }
 
       return *this;
     }
@@ -159,7 +169,10 @@ namespace embree
     vuint<K> primID; // primitive ID
     vuint<K> geomID; // geometry ID
     vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
-  };
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    vuint<K> instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance prim ID
+#endif
+};
 
   /* Specialization for a single ray */
   template<>
@@ -178,6 +191,11 @@ namespace embree
       return all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE))) && abs(tnear()) <= float(inf) && abs(tfar) <= float(inf);
     }
 
+    /* checks if occlusion ray is done: true exactly when tfar is negative */
+    __forceinline bool occluded() const {
+      return tfar < 0.0f;
+    }
+
     /* Ray data */
     Vec3ff org;  // 3 floats for ray origin, 1 float for tnear
     //float tnear; // start of ray segment
@@ -251,6 +269,9 @@ namespace embree
     unsigned int primID; // primitive ID
     unsigned int geomID; // geometry ID
     unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    unsigned int instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance primitive ID
+#endif
   };
 
   /* Converts ray packet to single rays */
@@ -293,6 +314,9 @@ namespace embree
     ray.primID = primID[i]; ray.geomID = geomID[i]; 
 
     instance_id_stack::copy_VU<K>(instID, ray.instID, i);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    instance_id_stack::copy_VU<K>(instPrimID, ray.instPrimID, i);
+#endif
   }
 
   /* Converts single rays to ray packet */
@@ -332,6 +356,9 @@ namespace embree
     primID[i] = ray.primID; geomID[i] = ray.geomID;
 
     instance_id_stack::copy_UV<K>(ray.instID, instID, i);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    instance_id_stack::copy_UV<K>(ray.instPrimID, instPrimID, i);
+#endif
   }
 
   /* copies a ray packet element into another element*/
@@ -354,6 +381,9 @@ namespace embree
     primID[dest] = primID[source]; geomID[dest] = geomID[source];  
 
     instance_id_stack::copy_VV<K>(instID, instID, source, dest);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    instance_id_stack::copy_VV<K>(instPrimID, instPrimID, source, dest);
+#endif
   }
 
   /* Shortcuts */
@@ -361,12 +391,14 @@ namespace embree
   typedef RayK<4>  Ray4;
   typedef RayK<8>  Ray8;
   typedef RayK<16> Ray16;
+  typedef RayK<VSIZEX> Rayx;
   struct RayN;
 
   typedef RayHitK<1>  RayHit;
   typedef RayHitK<4>  RayHit4;
   typedef RayHitK<8>  RayHit8;
   typedef RayHitK<16> RayHit16;
+  typedef RayHitK<VSIZEX> RayHitx;
   struct RayHitN;
 
   template<int K, bool intersect>
@@ -428,6 +460,13 @@ namespace embree
     {
       cout << " " << ray.instID[l];
     }
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    cout << "  instPrimID =";
+    for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    {
+      cout << " " << ray.instPrimID[l];
+    }
+#endif
     cout << embree_endl;
     return cout << "}";
   }
@@ -464,6 +503,9 @@ namespace embree
     __forceinline unsigned int* primID(size_t offset = 0) { return (unsigned int*)&ptr[17*4*N+offset]; };   // primitive ID
     __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)&ptr[18*4*N+offset]; };   // geometry ID
     __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+level*4*N+offset]; };   // instance ID
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    __forceinline unsigned int* instPrimID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+RTC_MAX_INSTANCE_LEVEL_COUNT*4*N+level*4*N+offset]; };   // instance primitive ID
+#endif
 
     __forceinline Ray getRayByOffset(size_t offset)
     {
@@ -578,9 +620,16 @@ namespace embree
               geomID(offset)[k] = ray.geomID[k];
 
               instID(0, offset)[k] = ray.instID[0][k];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+              instPrimID(0, offset)[k] = ray.instPrimID[0][k];
+#endif
 #if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
-              for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+              for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) {
                 instID(l, offset)[k] = ray.instID[l][k];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+                instPrimID(l, offset)[k] = ray.instPrimID[l][k];
+#endif
+              }
 #endif
             }
           }
@@ -592,9 +641,16 @@ namespace embree
           vuint<K>::storeu(valid, geomID(offset), ray.geomID);
 
           vuint<K>::storeu(valid, instID(0, offset), ray.instID[0]);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+          vuint<K>::storeu(valid, instPrimID(0, offset), ray.instPrimID[0]);
+#endif
 #if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
-          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) {
             vuint<K>::storeu(valid, instID(l, offset), ray.instID[l]);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+            vuint<K>::storeu(valid, instPrimID(l, offset), ray.instPrimID[l]);
+#endif
+          }
 #endif
         }
       }
@@ -698,9 +754,16 @@ namespace embree
         vuint<K>::template scatter<1>(valid, geomID(), offset, ray.geomID);
 
         vuint<K>::template scatter<1>(valid, instID(0), offset, ray.instID[0]);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        vuint<K>::template scatter<1>(valid, instPrimID(0), offset, ray.instPrimID[0]);
+#endif
 #if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
-        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) {
           vuint<K>::template scatter<1>(valid, instID(l), offset, ray.instID[l]);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+          vuint<K>::template scatter<1>(valid, instPrimID(l), offset, ray.instPrimID[l]);
+#endif
+        }
 #endif
 #else
         size_t valid_bits = movemask(valid);
@@ -720,9 +783,16 @@ namespace embree
           *geomID(ofs) = ray.geomID[k];
 
           *instID(0, ofs) = ray.instID[0][k];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+          *instPrimID(0, ofs) = ray.instPrimID[0][k];
+#endif
 #if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
-          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) {
             *instID(l, ofs) = ray.instID[l][k];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+            *instPrimID(l, ofs) = ray.instPrimID[l][k];
+#endif
+          }
 #endif
         }
 #endif
@@ -792,8 +862,12 @@ namespace embree
       primID = (unsigned int*)&t.primID;
       geomID = (unsigned int*)&t.geomID;
 
-      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) {
         instID[l] = (unsigned int*)&t.instID[l];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instPrimID[l] = (unsigned int*)&t.instPrimID[l];
+#endif
+      }
     }
 
     __forceinline Ray getRayByOffset(size_t offset)
@@ -859,9 +933,16 @@ namespace embree
 
         if (likely(instID[0])) {
           *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+          *(unsigned int* __restrict__)((char*)instPrimID[0] + offset) = ray.instPrimID[0];
+#endif
 #if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
-          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l) {
             *(unsigned int* __restrict__)((char*)instID[l] + offset) = ray.instID[l];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+            *(unsigned int* __restrict__)((char*)instPrimID[l] + offset) = ray.instPrimID[l];
+#endif
+          }
 #endif
         }
       }
@@ -892,9 +973,16 @@ namespace embree
 
         if (likely(instID[0])) {
           vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+          vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instPrimID[0] + offset), ray.instPrimID[0]);
+#endif
 #if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
-          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) {
             vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[l] + offset), ray.instID[l]);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+            vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instPrimID[l] + offset), ray.instPrimID[l]);
+#endif
+          }
 #endif
         }
       }
@@ -1008,9 +1096,16 @@ namespace embree
 
         if (likely(instID[0])) {
           vuint<K>::template scatter<1>(valid, (unsigned int*)instID[0], offset, ray.instID[0]);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+          vuint<K>::template scatter<1>(valid, (unsigned int*)instPrimID[0], offset, ray.instPrimID[0]);
+#endif
 #if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
-          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) {
             vuint<K>::template scatter<1>(valid, (unsigned int*)instID[l], offset, ray.instID[l]);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+            vuint<K>::template scatter<1>(valid, (unsigned int*)instPrimID[l], offset, ray.instPrimID[l]);
+#endif
+          }
 #endif
         }
 #else
@@ -1032,9 +1127,16 @@ namespace embree
 
           if (likely(instID[0])) {
             *(unsigned int* __restrict__)((char*)instID[0] + ofs) = ray.instID[0][k];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+            *(unsigned int* __restrict__)((char*)instPrimID[0] + ofs) = ray.instPrimID[0][k];
+#endif
 #if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
-            for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+            for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) {
               *(unsigned int* __restrict__)((char*)instID[l] + ofs) = ray.instID[l][k];
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+              *(unsigned int* __restrict__)((char*)instPrimID[l] + ofs) = ray.instPrimID[l][k];
+#endif
+            }
 #endif
           }
         }
@@ -1091,7 +1193,10 @@ namespace embree
 
     unsigned int* __restrict__ primID; // primitive ID
     unsigned int* __restrict__ geomID; // geometry ID
-    unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID (optional)
+    unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+    unsigned int* __restrict__ instPrimID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance primitive ID (optional)
+#endif
   };
 
 
@@ -1134,9 +1239,16 @@ namespace embree
         vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID);
 
         vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instPrimID[0], offset, ray.instPrimID[0]);
+#endif
 #if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
-        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) {
           vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+          vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instPrimID[l], offset, ray.instPrimID[l]);
+#endif
+        }
 #endif
 #else
         size_t valid_bits = movemask(valid);
@@ -1154,6 +1266,9 @@ namespace embree
           ray_k->geomID = ray.geomID[k];
 
           instance_id_stack::copy_VU<K>(ray.instID, ray_k->instID, k);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+          instance_id_stack::copy_VU<K>(ray.instPrimID, ray_k->instPrimID, k);
+#endif
         }
 #endif
       }
@@ -1183,7 +1298,7 @@ namespace embree
 
     Ray* __restrict__ ptr;
   };
-
+  
   template<>
   __forceinline Ray4 RayStreamAOS::getRayByOffset<4>(const vint4& offset)
   {
@@ -1219,7 +1334,7 @@ namespace embree
 
     return ray;
   }
-
+  
 #if defined(__AVX__)
   template<>
   __forceinline Ray8 RayStreamAOS::getRayByOffset<8>(const vint8& offset)
@@ -1358,6 +1473,9 @@ namespace embree
           ray_k->primID = ray.primID[k];
           ray_k->geomID = ray.geomID[k];
           instance_id_stack::copy_VU<K>(ray.instID, ray_k->instID, k);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+          instance_id_stack::copy_VU<K>(ray.instPrimID, ray_k->instPrimID, k);
+#endif
         }
       }
     }
@@ -1383,7 +1501,7 @@ namespace embree
 
     Ray** __restrict__ ptr;
   };
-
+  
   template<>
   __forceinline Ray4 RayStreamAOP::getRayByIndex<4>(const vint4& index)
   {
@@ -1419,7 +1537,7 @@ namespace embree
 
     return ray;
   }
-
+  
 #if defined(__AVX__)
   template<>
   __forceinline Ray8 RayStreamAOP::getRayByIndex<8>(const vint8& index)
diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp
index a6ea55bfc4..eb8d2c0a58 100644
--- a/thirdparty/embree/kernels/common/rtcore.cpp
+++ b/thirdparty/embree/kernels/common/rtcore.cpp
@@ -8,11 +8,14 @@
 #include "scene.h"
 #include "context.h"
 #include "../geometry/filter.h"
-#include "../../include/embree3/rtcore_ray.h"
+#include "../../include/embree4/rtcore_ray.h"
 using namespace embree;
 
 RTC_NAMESPACE_BEGIN;
 
+#define RTC_ENTER_DEVICE(arg) \
+  DeviceEnterLeave enterleave(arg);
+
   /* mutex to make API thread safe */
   static MutexSys g_mutex;
 
@@ -27,6 +30,57 @@ RTC_NAMESPACE_BEGIN;
     return (RTCDevice) nullptr;
   }
 
+#if defined(EMBREE_SYCL_SUPPORT)
+
+  RTC_API RTCDevice rtcNewSYCLDeviceInternal(sycl::context sycl_context, const char* config) // creates a DeviceGPU from a SYCL context plus config string and returns it as an RTCDevice handle
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewSYCLDevice);
+    Lock<MutexSys> lock(g_mutex); // hold the global API mutex while the device is constructed
+
+    DeviceGPU* device = new DeviceGPU(sycl_context,config);
+    return (RTCDevice) device->refInc(); // hand the new device out with its reference count incremented
+    RTC_CATCH_END(nullptr);
+    return (RTCDevice) nullptr; // null handle returned on the path that falls through RTC_CATCH_END
+  }
+
+  RTC_API bool rtcIsSYCLDeviceSupported(const sycl::device device) // boolean view of rthwifIsSYCLDeviceSupported for the given SYCL device
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIsSYCLDeviceSupported);
+    return rthwifIsSYCLDeviceSupported(device) > 0; // only a strictly positive result counts as supported
+    RTC_CATCH_END(nullptr);
+    return false; // value returned on the path that falls through RTC_CATCH_END
+  }
+
+  RTC_API int rtcSYCLDeviceSelector(const sycl::device device) // integer result for the given SYCL device, taken straight from rthwifIsSYCLDeviceSupported
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSYCLDeviceSelector);
+    return rthwifIsSYCLDeviceSupported(device); // unlike rtcIsSYCLDeviceSupported, the raw integer is passed through unchanged
+    RTC_CATCH_END(nullptr);
+    return -1; // value returned on the path that falls through RTC_CATCH_END
+  }
+
+  RTC_API void rtcSetDeviceSYCLDevice(RTCDevice hdevice, const sycl::device sycl_device) // passes sycl_device to setSYCLDevice() of the DeviceGPU behind hdevice
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceSYCLDevice);
+    RTC_VERIFY_HANDLE(hdevice);
+
+    Lock<MutexSys> lock(g_mutex); // hold the global API mutex while the device is modified
+    
+    DeviceGPU* device = dynamic_cast<DeviceGPU*>((Device*) hdevice); // yields nullptr when the handle is not a DeviceGPU
+    if (device == nullptr)
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "passed device must be an Embree SYCL device")
+      
+    device->setSYCLDevice(sycl_device);
+    
+    RTC_CATCH_END(nullptr);
+  }
+
+#endif
+
   RTC_API void rtcRetainDevice(RTCDevice hdevice) 
   {
     Device* device = (Device*) hdevice;
@@ -108,6 +162,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcNewBuffer);
     RTC_VERIFY_HANDLE(hdevice);
+    RTC_ENTER_DEVICE(hdevice);
     Buffer* buffer = new Buffer((Device*)hdevice, byteSize);
     return (RTCBuffer)buffer->refInc();
     RTC_CATCH_END((Device*)hdevice);
@@ -119,6 +174,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcNewSharedBuffer);
     RTC_VERIFY_HANDLE(hdevice);
+    RTC_ENTER_DEVICE(hdevice);
     Buffer* buffer = new Buffer((Device*)hdevice, byteSize, ptr);
     return (RTCBuffer)buffer->refInc();
     RTC_CATCH_END((Device*)hdevice);
@@ -131,6 +187,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetBufferData);
     RTC_VERIFY_HANDLE(hbuffer);
+    RTC_ENTER_DEVICE(hbuffer);
     return buffer->data();
     RTC_CATCH_END2(buffer);
     return nullptr;
@@ -142,6 +199,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcRetainBuffer);
     RTC_VERIFY_HANDLE(hbuffer);
+    RTC_ENTER_DEVICE(hbuffer);
     buffer->refInc();
     RTC_CATCH_END2(buffer);
   }
@@ -152,6 +210,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcReleaseBuffer);
     RTC_VERIFY_HANDLE(hbuffer);
+    RTC_ENTER_DEVICE(hbuffer);
     buffer->refDec();
     RTC_CATCH_END2(buffer);
   }
@@ -161,6 +220,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcNewScene);
     RTC_VERIFY_HANDLE(hdevice);
+    RTC_ENTER_DEVICE(hdevice);
     Scene* scene = new Scene((Device*)hdevice);
     return (RTCScene) scene->refInc();
     RTC_CATCH_END((Device*)hdevice);
@@ -184,6 +244,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetSceneProgressMonitorFunction);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hscene);
     Lock<MutexSys> lock(g_mutex);
     scene->setProgressMonitorFunction(progress,ptr);
     RTC_CATCH_END2(scene);
@@ -195,13 +256,18 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetSceneBuildQuality);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hscene);
+    // -- GODOT start --
+    // if (quality != RTC_BUILD_QUALITY_LOW &&
+    //     quality != RTC_BUILD_QUALITY_MEDIUM &&
+    //     quality != RTC_BUILD_QUALITY_HIGH)
+    //   throw std::runtime_error("invalid build quality");
     if (quality != RTC_BUILD_QUALITY_LOW &&
         quality != RTC_BUILD_QUALITY_MEDIUM &&
-        quality != RTC_BUILD_QUALITY_HIGH)
-      // -- GODOT start --
-      // throw std::runtime_error("invalid build quality");
+        quality != RTC_BUILD_QUALITY_HIGH) {
       abort();
-      // -- GODOT end --
+    }
+    // -- GODOT end --
     scene->setBuildQuality(quality);
     RTC_CATCH_END2(scene);
   }
@@ -212,6 +278,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetSceneFlags);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hscene);
     scene->setSceneFlags(flags);
     RTC_CATCH_END2(scene);
   }
@@ -222,6 +289,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetSceneFlags);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hscene);
     return scene->getSceneFlags();
     RTC_CATCH_END2(scene);
     return RTC_SCENE_FLAG_NONE;
@@ -233,6 +301,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcCommitScene);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hscene);
     scene->commit(false);
     RTC_CATCH_END2(scene);
   }
@@ -243,6 +312,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcJoinCommitScene);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hscene);
     scene->commit(true);
     RTC_CATCH_END2(scene);
   }
@@ -253,6 +323,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetSceneBounds);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hscene);
     if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
     BBox3fa bounds = scene->bounds.bounds();
     bounds_o->lower_x = bounds.lower.x;
@@ -272,6 +343,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetSceneBounds);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hscene);
     if (bounds_o == nullptr)
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer");
     if (scene->isModified())
@@ -447,7 +519,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_END2_FALSE(scene);
   }
 
-  RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit) 
+  RTC_API void rtcIntersect1 (RTCScene hscene, RTCRayHit* rayhit, RTCIntersectArguments* args) 
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
@@ -458,7 +530,21 @@ RTC_NAMESPACE_BEGIN;
     if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
 #endif
     STAT3(normal.travs,1,1,1);
-    IntersectContext context(scene,user_context);
+
+    RTCIntersectArguments defaultArgs;
+    if (unlikely(args == nullptr)) {
+      rtcInitIntersectArguments(&defaultArgs);
+      args = &defaultArgs;
+    }
+    RTCRayQueryContext* user_context = args->context;
+    
+    RTCRayQueryContext defaultContext;
+    if (unlikely(user_context == nullptr)) {
+      rtcInitRayQueryContext(&defaultContext);
+      user_context = &defaultContext;
+    }
+    RayQueryContext context(scene,user_context,args);
+    
     scene->intersectors.intersect(*rayhit,&context);
 #if defined(DEBUG)
     ((RayHit*)rayhit)->verifyHit();
@@ -466,7 +552,45 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_END2(scene);
   }
 
-  RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit) 
+  RTC_API void rtcForwardIntersect1 (const RTCIntersectFunctionNArguments* args, RTCScene hscene, RTCRay* iray_, unsigned int instID) // convenience overload of rtcForwardIntersect1Ex without an instance primitive ID
+  {
+    rtcForwardIntersect1Ex(args, hscene, iray_, instID, 0); // instPrimID is fixed to 0
+  }
+
+  RTC_API void rtcForwardIntersect1Ex(const RTCIntersectFunctionNArguments* args, RTCScene hscene, RTCRay* iray_, unsigned int instID, unsigned int instPrimID) // intersects hscene using iray_'s org/dir while keeping the hit record of args->rayhit
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcForwardIntersect1Ex);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)iray_) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");
+#endif
+
+    Ray* iray = (Ray*) iray_; // ray supplying the org/dir used for the forwarded traversal
+    RayHit* oray = (RayHit*)args->rayhit; // ray/hit record from args; this is the object actually traversed
+    RTCRayQueryContext* user_context = args->context;
+    const Vec3ff ray_org_tnear = oray->org; // back up org of the record (Ray stores tnear in the 4th float of org)
+    const Vec3ff ray_dir_time = oray->dir; // back up dir of the record
+    oray->org = iray->org; // temporarily substitute the forwarded ray's org ...
+    oray->dir = iray->dir; // ... and dir; no other field of the record is replaced here
+    STAT3(normal.travs,1,1,1);
+
+    RTCIntersectArguments* iargs = ((IntersectFunctionNArguments*) args)->args; // reuse the intersect arguments carried by the internal argument struct
+    RayQueryContext context(scene,user_context,iargs);
+
+    instance_id_stack::push(user_context, instID, instPrimID); // push instID/instPrimID for the duration of the nested traversal
+    scene->intersectors.intersect(*(RTCRayHit*)oray,&context);
+    instance_id_stack::pop(user_context); // undo the push above
+
+    oray->org = ray_org_tnear; // restore the backed-up org ...
+    oray->dir = ray_dir_time; // ... and dir
+
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCRayHit4* rayhit, RTCIntersectArguments* args) 
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
@@ -481,23 +605,119 @@ RTC_NAMESPACE_BEGIN;
     STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
     STAT3(normal.travs,cnt,cnt,cnt);
 
-    IntersectContext context(scene,user_context);
-#if !defined(EMBREE_RAY_PACKETS)
-    RayHit4* ray4 = (RayHit4*) rayhit;
-    for (size_t i=0; i<4; i++) {
-      if (!valid[i]) continue;
-      RayHit ray1; ray4->get(i,ray1);
-      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
-      ray4->set(i,ray1);
+    RTCIntersectArguments defaultArgs;
+    if (unlikely(args == nullptr)) {
+      rtcInitIntersectArguments(&defaultArgs);
+      args = &defaultArgs;
+    }
+    RTCRayQueryContext* user_context = args->context;
+    
+    RTCRayQueryContext defaultContext;
+    if (unlikely(user_context == nullptr)) {
+      rtcInitRayQueryContext(&defaultContext);
+      user_context = &defaultContext;
+    }
+    RayQueryContext context(scene,user_context,args);
+
+    if (likely(scene->intersectors.intersector4))
+      scene->intersectors.intersect4(valid,*rayhit,&context);
+
+    else {
+      RayHit4* ray4 = (RayHit4*) rayhit;
+      for (size_t i=0; i<4; i++) {
+        if (!valid[i]) continue;
+        RayHit ray1; ray4->get(i,ray1);
+        scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+        ray4->set(i,ray1);
+      }
     }
-#else
-    scene->intersectors.intersect4(valid,*rayhit,&context);
-#endif
     
     RTC_CATCH_END2(scene);
   }
+
+  template<int N> void copy(float* dst, float* src); // copies N floats from src to dst; only the specializations below (4, 8, 16) are defined
+
+  template<>
+  __forceinline void copy<4>(float* dst, float* src) { // N = 4: one vfloat4 loadu/storeu pair
+    vfloat4::storeu(&dst[0],vfloat4::loadu(&src[0]));
+  }
+
+  template<>
+  __forceinline void copy<8>(float* dst, float* src) { // N = 8: two vfloat4 pairs at offsets 0 and 4
+    vfloat4::storeu(&dst[0],vfloat4::loadu(&src[0]));
+    vfloat4::storeu(&dst[4],vfloat4::loadu(&src[4]));
+  }
+
+  template<>
+  __forceinline void copy<16>(float* dst, float* src) { // N = 16: four vfloat4 pairs at offsets 0, 4, 8 and 12
+    vfloat4::storeu(&dst[0],vfloat4::loadu(&src[0]));
+    vfloat4::storeu(&dst[4],vfloat4::loadu(&src[4]));
+    vfloat4::storeu(&dst[8],vfloat4::loadu(&src[8]));
+    vfloat4::storeu(&dst[12],vfloat4::loadu(&src[12]));
+  }
+
+  template<typename RTCRay, typename RTCRayHit, int N> // packet ray type, packet ray/hit type and packet width
+  __forceinline void rtcForwardIntersectN(const int* valid, const RTCIntersectFunctionNArguments* args, RTCScene hscene, RTCRay* iray, unsigned int instID, unsigned int instPrimID) // packet counterpart of rtcForwardIntersect1Ex
+  {
+    Scene* scene = (Scene*) hscene;
+    RTCRayHit* oray = (RTCRayHit*)args->rayhit; // ray/hit packet from args; this is the object actually traversed
+    RTCRayQueryContext* user_context = args->context;
+
+    __aligned(16) float ray_org_x[N]; // backup storage for the packet's org/dir components
+    __aligned(16) float ray_org_y[N];
+    __aligned(16) float ray_org_z[N];
+    __aligned(16) float ray_dir_x[N];
+    __aligned(16) float ray_dir_y[N];
+    __aligned(16) float ray_dir_z[N];
+    
+    copy<N>(ray_org_x,oray->ray.org_x); // step 1: save the packet's current org/dir
+    copy<N>(ray_org_y,oray->ray.org_y);
+    copy<N>(ray_org_z,oray->ray.org_z);
+    copy<N>(ray_dir_x,oray->ray.dir_x);
+    copy<N>(ray_dir_y,oray->ray.dir_y);
+    copy<N>(ray_dir_z,oray->ray.dir_z);
+    
+    copy<N>(oray->ray.org_x,iray->org_x); // step 2: substitute the forwarded rays' org/dir (all N lanes, regardless of valid)
+    copy<N>(oray->ray.org_y,iray->org_y);
+    copy<N>(oray->ray.org_z,iray->org_z);
+    copy<N>(oray->ray.dir_x,iray->dir_x);
+    copy<N>(oray->ray.dir_y,iray->dir_y);
+    copy<N>(oray->ray.dir_z,iray->dir_z);
+    
+    STAT(size_t cnt=0; for (size_t i=0; i<N; i++) cnt += ((int*)valid)[i] == -1;); // statistics only: count lanes whose mask is -1
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    RTCIntersectArguments* iargs = ((IntersectFunctionNArguments*) args)->args; // reuse the intersect arguments carried by the internal argument struct
+    RayQueryContext context(scene,user_context,iargs);
+
+    instance_id_stack::push(user_context, instID, instPrimID); // push instID/instPrimID for the duration of the nested traversal
+    scene->intersectors.intersect(valid,*oray,&context);
+    instance_id_stack::pop(user_context); // undo the push above
+
+    copy<N>(oray->ray.org_x,ray_org_x); // step 3: restore the saved org/dir
+    copy<N>(oray->ray.org_y,ray_org_y);
+    copy<N>(oray->ray.org_z,ray_org_z);
+    copy<N>(oray->ray.dir_x,ray_dir_x);
+    copy<N>(oray->ray.dir_y,ray_dir_y);
+    copy<N>(oray->ray.dir_z,ray_dir_z);
+  }
+
+  RTC_API void rtcForwardIntersect4(const int* valid, const RTCIntersectFunctionNArguments* args, RTCScene hscene, RTCRay4* iray, unsigned int instID) // convenience overload of rtcForwardIntersect4Ex without an instance primitive ID
+  {
+    RTC_TRACE(rtcForwardIntersect4);
+    return rtcForwardIntersect4Ex(valid, args, hscene, iray, instID, 0); // instPrimID is fixed to 0
+  }
+
+  RTC_API void rtcForwardIntersect4Ex(const int* valid, const RTCIntersectFunctionNArguments* args, RTCScene hscene, RTCRay4* iray, unsigned int instID, unsigned int instPrimID) // 4-wide forwarded intersection; valid is the per-lane mask handed on to the traversal
+  {
+    Scene* scene = (Scene*) hscene; // only needed by RTC_CATCH_END2 below
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcForwardIntersect4Ex); // trace under this function's own name, matching the 1Ex and 8Ex variants
+    rtcForwardIntersectN<RTCRay4,RTCRayHit4,4>(valid,args,hscene,iray,instID,instPrimID); // shared packet implementation with N = 4
+    RTC_CATCH_END2(scene);
+  }
   
-  RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit) 
+  RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCRayHit8* rayhit, RTCIntersectArguments* args) 
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
@@ -512,25 +732,53 @@ RTC_NAMESPACE_BEGIN;
     STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
     STAT3(normal.travs,cnt,cnt,cnt);
 
-    IntersectContext context(scene,user_context);
-#if !defined(EMBREE_RAY_PACKETS)
-    RayHit8* ray8 = (RayHit8*) rayhit;
-    for (size_t i=0; i<8; i++) {
-      if (!valid[i]) continue;
-      RayHit ray1; ray8->get(i,ray1);
-      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
-      ray8->set(i,ray1);
+    RTCIntersectArguments defaultArgs;
+    if (unlikely(args == nullptr)) {
+      rtcInitIntersectArguments(&defaultArgs);
+      args = &defaultArgs;
     }
-#else
-    if (likely(scene->intersectors.intersector8))
+    RTCRayQueryContext* user_context = args->context;
+    
+    RTCRayQueryContext defaultContext;
+    if (unlikely(user_context == nullptr)) {
+      rtcInitRayQueryContext(&defaultContext);
+      user_context = &defaultContext;
+    }
+    RayQueryContext context(scene,user_context,args);
+    
+    if (likely(scene->intersectors.intersector8)) 
       scene->intersectors.intersect8(valid,*rayhit,&context);
+    
     else
-      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context);
-#endif
+    {
+      RayHit8* ray8 = (RayHit8*) rayhit;
+      for (size_t i=0; i<8; i++) {
+        if (!valid[i]) continue;
+        RayHit ray1; ray8->get(i,ray1);
+        scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+        ray8->set(i,ray1);
+      }
+    }
+    
     RTC_CATCH_END2(scene);
   }
-  
-  RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit) 
+
+  RTC_API void rtcForwardIntersect8(const int* valid, const RTCIntersectFunctionNArguments* args, RTCScene hscene, RTCRay8* iray, unsigned int instID) // convenience overload of rtcForwardIntersect8Ex without an instance primitive ID
+  {
+    RTC_TRACE(rtcForwardIntersect8);
+    return rtcForwardIntersect8Ex(valid, args, hscene, iray, instID, 0); // instPrimID is fixed to 0
+  }
+
+  RTC_API void rtcForwardIntersect8Ex(const int* valid, const RTCIntersectFunctionNArguments* args, RTCScene hscene, RTCRay8* iray, unsigned int instID, unsigned int instPrimID) // 8-wide forwarded intersection; valid is the per-lane mask handed on to the traversal
+  {
+    Scene* scene = (Scene*) hscene; // only needed by RTC_CATCH_END2 below
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcForwardIntersect8Ex);
+    rtcForwardIntersectN<RTCRay8,RTCRayHit8,8>(valid,args,hscene,iray,instID,instPrimID); // shared packet implementation with N = 8
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCRayHit16* rayhit, RTCIntersectArguments* args) 
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
@@ -545,179 +793,121 @@ RTC_NAMESPACE_BEGIN;
     STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
     STAT3(normal.travs,cnt,cnt,cnt);
 
-    IntersectContext context(scene,user_context);
-#if !defined(EMBREE_RAY_PACKETS)
-    RayHit16* ray16 = (RayHit16*) rayhit;
-    for (size_t i=0; i<16; i++) {
-      if (!valid[i]) continue;
-      RayHit ray1; ray16->get(i,ray1);
-      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
-      ray16->set(i,ray1);
+    RTCIntersectArguments defaultArgs;
+    if (unlikely(args == nullptr)) {
+      rtcInitIntersectArguments(&defaultArgs);
+      args = &defaultArgs;
     }
-#else
+    RTCRayQueryContext* user_context = args->context;
+    
+    RTCRayQueryContext defaultContext;
+    if (unlikely(user_context == nullptr)) {
+      rtcInitRayQueryContext(&defaultContext);
+      user_context = &defaultContext;
+    }
+    RayQueryContext context(scene,user_context,args);
+
     if (likely(scene->intersectors.intersector16))
       scene->intersectors.intersect16(valid,*rayhit,&context);
-    else
-      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context);
-#endif
-    RTC_CATCH_END2(scene);
-  }
-
-  RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride) 
-  {
-    Scene* scene = (Scene*) hscene;
-    RTC_CATCH_BEGIN;
-    RTC_TRACE(rtcIntersect1M);
-
-#if defined (EMBREE_RAY_PACKETS)
-#if defined(DEBUG)
-    RTC_VERIFY_HANDLE(hscene);
-    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
-    if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
-#endif
-    STAT3(normal.travs,M,M,M);
-    IntersectContext context(scene,user_context);
 
-    /* fast codepath for single rays */
-    if (likely(M == 1)) {
-      if (likely(rayhit->ray.tnear <= rayhit->ray.tfar)) 
-        scene->intersectors.intersect(*rayhit,&context);
-    } 
-
-    /* codepath for streams */
     else {
-      scene->device->rayStreamFilters.intersectAOS(scene,rayhit,M,byteStride,&context);   
+      RayHit16* ray16 = (RayHit16*) rayhit;
+      for (size_t i=0; i<16; i++) {
+        if (!valid[i]) continue;
+        RayHit ray1; ray16->get(i,ray1);
+        scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+        ray16->set(i,ray1);
+      }
     }
-#else
-    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported");
-#endif
+
     RTC_CATCH_END2(scene);
   }
 
-  RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M) 
+  RTC_API void rtcForwardIntersect16(const int* valid, const RTCIntersectFunctionNArguments* args, RTCScene hscene, RTCRay16* iray, unsigned int instID)
+  {
+    RTC_TRACE(rtcForwardIntersect16);
+    return rtcForwardIntersect16Ex(valid, args, hscene, iray, instID, 0);
+  }
+
+  RTC_API void rtcForwardIntersect16Ex(const int* valid, const RTCIntersectFunctionNArguments* args, RTCScene hscene, RTCRay16* iray, unsigned int instID, unsigned int instPrimID)
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
-    RTC_TRACE(rtcIntersect1Mp);
-
-#if defined (EMBREE_RAY_PACKETS)
-#if defined(DEBUG)
-    RTC_VERIFY_HANDLE(hscene);
-    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
-    if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
-#endif
-    STAT3(normal.travs,M,M,M);
-    IntersectContext context(scene,user_context);
-
-    /* fast codepath for single rays */
-    if (likely(M == 1)) {
-      if (likely(rn[0]->ray.tnear <= rn[0]->ray.tfar)) 
-        scene->intersectors.intersect(*rn[0],&context);
-    } 
-
-    /* codepath for streams */
-    else {
-      scene->device->rayStreamFilters.intersectAOP(scene,rn,M,&context);
-    }
-#else
-    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported");
-#endif
+    RTC_TRACE(rtcForwardIntersect16Ex);
+    rtcForwardIntersectN<RTCRay16,RTCRayHit16,16>(valid,args,hscene,iray,instID,instPrimID);
     RTC_CATCH_END2(scene);
   }
 
-  RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride) 
+  RTC_API void rtcOccluded1 (RTCScene hscene, RTCRay* ray, RTCOccludedArguments* args) 
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
-    RTC_TRACE(rtcIntersectNM);
-
-#if defined (EMBREE_RAY_PACKETS)
+    RTC_TRACE(rtcOccluded1);
+    STAT3(shadow.travs,1,1,1);
 #if defined(DEBUG)
     RTC_VERIFY_HANDLE(hscene);
     if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
-    if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+    if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
 #endif
-    STAT3(normal.travs,N*M,N*M,N*M);
-    IntersectContext context(scene,user_context);
 
-    /* code path for single ray streams */
-    if (likely(N == 1))
-    {
-      /* fast code path for streams of size 1 */
-      if (likely(M == 1)) {
-        if (likely(((RTCRayHit*)rayhit)->ray.tnear <= ((RTCRayHit*)rayhit)->ray.tfar))
-          scene->intersectors.intersect(*(RTCRayHit*)rayhit,&context);
-      } 
-      /* normal codepath for single ray streams */
-      else {
-        scene->device->rayStreamFilters.intersectAOS(scene,(RTCRayHit*)rayhit,M,byteStride,&context);
-      }
+    RTCOccludedArguments defaultArgs;
+    if (unlikely(args == nullptr)) {
+      rtcInitOccludedArguments(&defaultArgs);
+      args = &defaultArgs;
     }
-    /* code path for ray packet streams */
-    else {
-      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,N,M,byteStride,&context);
+    RTCRayQueryContext* user_context = args->context;
+    
+    RTCRayQueryContext defaultContext;
+    if (unlikely(user_context == nullptr)) {
+      rtcInitRayQueryContext(&defaultContext);
+      user_context = &defaultContext;
     }
-#else
-    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported");
-#endif
+    RayQueryContext context(scene,user_context,args);
+    
+    scene->intersectors.occluded(*ray,&context);
     RTC_CATCH_END2(scene);
   }
 
-  RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N) 
+  RTC_API void rtcForwardOccluded1 (const RTCOccludedFunctionNArguments* args, RTCScene hscene, RTCRay* iray_, unsigned int instID)
   {
-    Scene* scene = (Scene*) hscene;
-    RTC_CATCH_BEGIN;
-    RTC_TRACE(rtcIntersectNp);
-
-#if defined (EMBREE_RAY_PACKETS)
-#if defined(DEBUG)
-    RTC_VERIFY_HANDLE(hscene);
-    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
-    if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes");   
-    if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes");   
-    if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes");   
-    if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes");   
-    if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes");   
-    if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes");   
-    if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes");   
-    if (((size_t)rayhit->ray.tfar  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes");   
-    if (((size_t)rayhit->ray.time  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes");   
-    if (((size_t)rayhit->ray.mask  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes");   
-    if (((size_t)rayhit->hit.Ng_x  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes");   
-    if (((size_t)rayhit->hit.Ng_y  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes");   
-    if (((size_t)rayhit->hit.Ng_z  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes");   
-    if (((size_t)rayhit->hit.u     ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes");   
-    if (((size_t)rayhit->hit.v     ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes");   
-    if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes");   
-    if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes");   
-    if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes");   
-#endif
-    STAT3(normal.travs,N,N,N);
-    IntersectContext context(scene,user_context);
-    scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context);
-#else
-    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported");
-#endif
-    RTC_CATCH_END2(scene);
+    RTC_TRACE(rtcForwardOccluded1);
+    return rtcForwardOccluded1Ex(args, hscene, iray_, instID, 0);
   }
-  
-  RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray) 
+
+  RTC_API void rtcForwardOccluded1Ex(const RTCOccludedFunctionNArguments* args, RTCScene hscene, RTCRay* iray_, unsigned int instID, unsigned int instPrimID)
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
-    RTC_TRACE(rtcOccluded1);
+    RTC_TRACE(rtcForwardOccluded1Ex);
     STAT3(shadow.travs,1,1,1);
 #if defined(DEBUG)
     RTC_VERIFY_HANDLE(hscene);
     if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
-    if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+    if (((size_t)iray_) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
 #endif
-    IntersectContext context(scene,user_context);
-    scene->intersectors.occluded(*ray,&context);
+    
+    Ray* iray = (Ray*)iray_;
+    Ray* oray = (Ray*)args->ray;
+    RTCRayQueryContext* user_context = args->context;
+    const Vec3ff ray_org_tnear = oray->org;
+    const Vec3ff ray_dir_time = oray->dir;
+    oray->org = iray->org;
+    oray->dir = iray->dir;
+
+    RTCIntersectArguments* iargs = ((OccludedFunctionNArguments*) args)->args;
+    RayQueryContext context(scene,user_context,iargs);
+
+    instance_id_stack::push(user_context, instID, instPrimID);
+    scene->intersectors.occluded(*(RTCRay*)oray,&context);
+    instance_id_stack::pop(user_context);
+    
+    oray->org = ray_org_tnear;
+    oray->dir = ray_dir_time;
+
     RTC_CATCH_END2(scene);
   }
-  
-  RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray) 
+
+  RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCRay4* ray, RTCOccludedArguments* args) 
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
@@ -732,23 +922,98 @@ RTC_NAMESPACE_BEGIN;
     STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
     STAT3(shadow.travs,cnt,cnt,cnt);
 
-    IntersectContext context(scene,user_context);
-#if !defined(EMBREE_RAY_PACKETS)
-    RayHit4* ray4 = (RayHit4*) ray;
-    for (size_t i=0; i<4; i++) {
-      if (!valid[i]) continue;
-      RayHit ray1; ray4->get(i,ray1);
-      scene->intersectors.occluded((RTCRay&)ray1,&context);
-      ray4->geomID[i] = ray1.geomID; 
+    RTCOccludedArguments defaultArgs;
+    if (unlikely(args == nullptr)) {
+      rtcInitOccludedArguments(&defaultArgs);
+      args = &defaultArgs;
+    }
+    RTCRayQueryContext* user_context = args->context;
+    
+    RTCRayQueryContext defaultContext;
+    if (unlikely(user_context == nullptr)) {
+      rtcInitRayQueryContext(&defaultContext);
+      user_context = &defaultContext;
+    }
+    RayQueryContext context(scene,user_context,args);
+
+    if (likely(scene->intersectors.intersector4))
+       scene->intersectors.occluded4(valid,*ray,&context);
+
+    else {
+      RayHit4* ray4 = (RayHit4*) ray;
+      for (size_t i=0; i<4; i++) {
+        if (!valid[i]) continue;
+        RayHit ray1; ray4->get(i,ray1);
+        scene->intersectors.occluded((RTCRay&)ray1,&context);
+        ray4->geomID[i] = ray1.geomID; 
+      }
     }
-#else
-    scene->intersectors.occluded4(valid,*ray,&context);
-#endif
     
     RTC_CATCH_END2(scene);
   }
+
+  template<typename RTCRay, int N>
+  __forceinline void rtcForwardOccludedN (const int* valid, const RTCOccludedFunctionNArguments* args, RTCScene hscene, RTCRay* iray, unsigned int instID, unsigned int instPrimID)
+  { /* forwards the N-wide occlusion query described by args into hscene, using origin/direction taken from iray */
+    Scene* scene = (Scene*) hscene;
+    RTCRay* oray = (RTCRay*)args->ray;
+    RTCRayQueryContext* user_context = args->context;
+    /* temporaries holding the origin/direction of the original ray packet while it is forwarded */
+    __aligned(16) float ray_org_x[N];
+    __aligned(16) float ray_org_y[N];
+    __aligned(16) float ray_org_z[N];
+    __aligned(16) float ray_dir_x[N];
+    __aligned(16) float ray_dir_y[N];
+    __aligned(16) float ray_dir_z[N];
+    /* back up origin/direction of the original ray packet */
+    copy<N>(ray_org_x,oray->org_x);
+    copy<N>(ray_org_y,oray->org_y);
+    copy<N>(ray_org_z,oray->org_z);
+    copy<N>(ray_dir_x,oray->dir_x);
+    copy<N>(ray_dir_y,oray->dir_y);
+    copy<N>(ray_dir_z,oray->dir_z);
+    /* overwrite origin/direction of the original ray packet with those of the forwarded packet */
+    copy<N>(oray->org_x,iray->org_x);
+    copy<N>(oray->org_y,iray->org_y);
+    copy<N>(oray->org_z,iray->org_z);
+    copy<N>(oray->dir_x,iray->dir_x);
+    copy<N>(oray->dir_y,iray->dir_y);
+    copy<N>(oray->dir_z,iray->dir_z);
+    /* occlusion queries are accounted as shadow rays, same as in rtcOccluded4/8/16 */
+    STAT(size_t cnt=0; for (size_t i=0; i<N; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    RTCIntersectArguments* iargs = ((OccludedFunctionNArguments*) args)->args; // args is an occluded-arguments struct, same cast as in rtcForwardOccluded1Ex
+    RayQueryContext context(scene,user_context,iargs);
+    /* trace the packet with (instID, instPrimID) pushed onto the instance ID stack of the query context */
+    instance_id_stack::push(user_context, instID, instPrimID);
+    scene->intersectors.occluded(valid,*oray,&context);
+    instance_id_stack::pop(user_context);
+    /* restore origin/direction of the original ray packet */
+    copy<N>(oray->org_x,ray_org_x);
+    copy<N>(oray->org_y,ray_org_y);
+    copy<N>(oray->org_z,ray_org_z);
+    copy<N>(oray->dir_x,ray_dir_x);
+    copy<N>(oray->dir_y,ray_dir_y);
+    copy<N>(oray->dir_z,ray_dir_z);
+  }
+
+  RTC_API void rtcForwardOccluded4(const int* valid, const RTCOccludedFunctionNArguments* args, RTCScene hscene, RTCRay4* iray, unsigned int instID)
+  { /* convenience overload of rtcForwardOccluded4Ex without an instPrimID parameter */
+    RTC_TRACE(rtcForwardOccluded4);
+    return rtcForwardOccluded4Ex(valid, args, hscene, iray, instID, 0); // instPrimID is fixed to 0
+  }
+
+  RTC_API void rtcForwardOccluded4Ex(const int* valid, const RTCOccludedFunctionNArguments* args, RTCScene hscene, RTCRay4* iray, unsigned int instID, unsigned int instPrimID)
+  { /* 4-wide packet variant; the forwarding itself is done by rtcForwardOccludedN */
+    Scene* scene = (Scene*) hscene; // in this function only consumed by RTC_CATCH_END2 below
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcForwardOccluded4Ex); // trace under this function's own name, like the 8Ex/16Ex variants
+    rtcForwardOccludedN<RTCRay4,4>(valid,args,hscene,iray,instID,instPrimID);
+    RTC_CATCH_END2(scene);
+  }
  
-  RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray) 
+  RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCRay8* ray, RTCOccludedArguments* args) 
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
@@ -763,26 +1028,52 @@ RTC_NAMESPACE_BEGIN;
     STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
     STAT3(shadow.travs,cnt,cnt,cnt);
 
-    IntersectContext context(scene,user_context);
-#if !defined(EMBREE_RAY_PACKETS)
-    RayHit8* ray8 = (RayHit8*) ray;
-    for (size_t i=0; i<8; i++) {
-      if (!valid[i]) continue;
-      RayHit ray1; ray8->get(i,ray1);
-      scene->intersectors.occluded((RTCRay&)ray1,&context);
-      ray8->set(i,ray1);
+    RTCOccludedArguments defaultArgs;
+    if (unlikely(args == nullptr)) {
+      rtcInitOccludedArguments(&defaultArgs);
+      args = &defaultArgs;
     }
-#else
+    RTCRayQueryContext* user_context = args->context;
+    
+    RTCRayQueryContext defaultContext;
+    if (unlikely(user_context == nullptr)) {
+      rtcInitRayQueryContext(&defaultContext);
+      user_context = &defaultContext;
+    }
+    RayQueryContext context(scene,user_context,args);
+
     if (likely(scene->intersectors.intersector8))
       scene->intersectors.occluded8(valid,*ray,&context);
-    else
-      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context);
-#endif
+
+    else {
+      RayHit8* ray8 = (RayHit8*) ray;
+      for (size_t i=0; i<8; i++) {
+        if (!valid[i]) continue;
+        RayHit ray1; ray8->get(i,ray1);
+        scene->intersectors.occluded((RTCRay&)ray1,&context);
+        ray8->set(i,ray1);
+      }
+    }
 
     RTC_CATCH_END2(scene);
   }
-  
-  RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray) 
+
+  RTC_API void rtcForwardOccluded8(const int* valid, const RTCOccludedFunctionNArguments* args, RTCScene hscene, RTCRay8* iray, unsigned int instID)
+  { /* convenience overload of rtcForwardOccluded8Ex without an instPrimID parameter */
+    RTC_TRACE(rtcForwardOccluded8);
+    return rtcForwardOccluded8Ex(valid, args, hscene, iray, instID, 0); // instPrimID is fixed to 0
+  }
+
+  RTC_API void rtcForwardOccluded8Ex(const int* valid, const RTCOccludedFunctionNArguments* args, RTCScene hscene, RTCRay8* iray, unsigned int instID, unsigned int instPrimID)
+  { /* 8-wide packet variant; the forwarding itself is done by rtcForwardOccludedN */
+    Scene* scene = (Scene*) hscene; // in this function only consumed by RTC_CATCH_END2 below
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcForwardOccluded8Ex);
+    rtcForwardOccludedN<RTCRay8,8>(valid, args, hscene, iray, instID, instPrimID); // all arguments are passed through unchanged
+    RTC_CATCH_END2(scene);
+  }
+   
+  RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCRay16* ray, RTCOccludedArguments* args) 
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
@@ -797,159 +1088,58 @@ RTC_NAMESPACE_BEGIN;
     STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
     STAT3(shadow.travs,cnt,cnt,cnt);
 
-    IntersectContext context(scene,user_context);
-#if !defined(EMBREE_RAY_PACKETS)
-    RayHit16* ray16 = (RayHit16*) ray;
-    for (size_t i=0; i<16; i++) {
-      if (!valid[i]) continue;
-      RayHit ray1; ray16->get(i,ray1);
-      scene->intersectors.occluded((RTCRay&)ray1,&context);
-      ray16->set(i,ray1);
+    RTCOccludedArguments defaultArgs;
+    if (unlikely(args == nullptr)) {
+      rtcInitOccludedArguments(&defaultArgs);
+      args = &defaultArgs;
     }
-#else
+    RTCRayQueryContext* user_context = args->context;
+    
+    RTCRayQueryContext defaultContext;
+    if (unlikely(user_context == nullptr)) {
+      rtcInitRayQueryContext(&defaultContext);
+      user_context = &defaultContext;
+    }
+    RayQueryContext context(scene,user_context,args);
+
     if (likely(scene->intersectors.intersector16))
       scene->intersectors.occluded16(valid,*ray,&context);
-    else
-      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context);
-#endif
-
-    RTC_CATCH_END2(scene);
-  }
-  
-  RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride) 
-  {
-    Scene* scene = (Scene*) hscene;
-    RTC_CATCH_BEGIN;
-    RTC_TRACE(rtcOccluded1M);
 
-#if defined (EMBREE_RAY_PACKETS)
-#if defined(DEBUG)
-    RTC_VERIFY_HANDLE(hscene);
-    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
-    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
-#endif
-    STAT3(shadow.travs,M,M,M);
-    IntersectContext context(scene,user_context);
-    /* fast codepath for streams of size 1 */
-    if (likely(M == 1)) {
-      if (likely(ray->tnear <= ray->tfar)) 
-        scene->intersectors.occluded (*ray,&context);
-    } 
-    /* codepath for normal streams */
     else {
-      scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context);
+      RayHit16* ray16 = (RayHit16*) ray;
+      for (size_t i=0; i<16; i++) {
+        if (!valid[i]) continue;
+        RayHit ray1; ray16->get(i,ray1);
+        scene->intersectors.occluded((RTCRay&)ray1,&context);
+        ray16->set(i,ray1);
+      }
     }
-#else
-    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported");
-#endif
-    RTC_CATCH_END2(scene);
-  }
 
-  RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M) 
-  {
-    Scene* scene = (Scene*) hscene;
-    RTC_CATCH_BEGIN;
-    RTC_TRACE(rtcOccluded1Mp);
-
-#if defined (EMBREE_RAY_PACKETS)
-#if defined(DEBUG)
-    RTC_VERIFY_HANDLE(hscene);
-    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
-    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
-#endif
-    STAT3(shadow.travs,M,M,M);
-    IntersectContext context(scene,user_context);
-
-    /* fast codepath for streams of size 1 */
-    if (likely(M == 1)) {
-      if (likely(ray[0]->tnear <= ray[0]->tfar)) 
-        scene->intersectors.occluded (*ray[0],&context);
-    } 
-    /* codepath for normal streams */
-    else {
-      scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context);
-    }
-#else
-    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported");
-#endif
     RTC_CATCH_END2(scene);
   }
 
-  RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride)
+  RTC_API void rtcForwardOccluded16(const int* valid, const RTCOccludedFunctionNArguments* args, RTCScene hscene, RTCRay16* iray, unsigned int instID)
   {
-    Scene* scene = (Scene*) hscene;
-    RTC_CATCH_BEGIN;
-    RTC_TRACE(rtcOccludedNM);
-
-#if defined (EMBREE_RAY_PACKETS)
-#if defined(DEBUG)
-    RTC_VERIFY_HANDLE(hscene);
-    if (byteStride < sizeof(RTCRayHit)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small");
-    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
-    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
-#endif
-    STAT3(shadow.travs,N*M,N*N,N*N);
-    IntersectContext context(scene,user_context);
-
-    /* codepath for single rays */
-    if (likely(N == 1))
-    {
-      /* fast path for streams of size 1 */
-      if (likely(M == 1)) {
-        if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar))
-          scene->intersectors.occluded (*(RTCRay*)ray,&context);
-      } 
-      /* codepath for normal ray streams */
-      else {
-        scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context);
-      }
-    }
-    /* code path for ray packet streams */
-    else {
-      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context);
-    }
-#else
-    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported");
-#endif
-    RTC_CATCH_END2(scene);
+    RTC_TRACE(rtcForwardOccluded16);
+    return rtcForwardOccluded16Ex(valid, args, hscene, iray, instID, 0);
   }
 
-  RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N)
+  RTC_API void rtcForwardOccluded16Ex(const int* valid, const RTCOccludedFunctionNArguments* args, RTCScene hscene, RTCRay16* iray, unsigned int instID, unsigned int instPrimID)
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
-    RTC_TRACE(rtcOccludedNp);
-
-#if defined (EMBREE_RAY_PACKETS)
-#if defined(DEBUG)
-    RTC_VERIFY_HANDLE(hscene);
-    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
-    if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes");   
-    if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes");   
-    if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes");   
-    if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes");   
-    if (((size_t)ray->dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes");   
-    if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes");   
-    if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes");   
-    if (((size_t)ray->tfar  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes");   
-    if (((size_t)ray->time  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes");   
-    if (((size_t)ray->mask  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes");   
-#endif
-    STAT3(shadow.travs,N,N,N);
-    IntersectContext context(scene,user_context);
-    scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context);
-#else
-    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported");
-#endif
+    RTC_TRACE(rtcForwardOccluded16Ex);
+    rtcForwardOccludedN<RTCRay16,16>(valid, args, hscene, iray, instID, instPrimID);
     RTC_CATCH_END2(scene);
   }
-
+  
   RTC_API void rtcRetainScene (RTCScene hscene) 
   {
     Scene* scene = (Scene*) hscene;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcRetainScene);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hscene);
     scene->refInc();
     RTC_CATCH_END2(scene);
   }
@@ -960,6 +1150,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcReleaseScene);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hscene);
     scene->refDec();
     RTC_CATCH_END2(scene);
   }
@@ -972,10 +1163,23 @@ RTC_NAMESPACE_BEGIN;
     RTC_TRACE(rtcSetGeometryInstancedScene);
     RTC_VERIFY_HANDLE(hgeometry);
     RTC_VERIFY_HANDLE(hscene);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setInstancedScene(scene);
     RTC_CATCH_END2(geometry);
   }
 
+  RTC_API void rtcSetGeometryInstancedScenes(RTCGeometry hgeometry, RTCScene* scenes, size_t numScenes)
+  { /* hands an array of numScenes scene handles to the geometry via Geometry::setInstancedScenes */
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryInstancedScenes); // trace under this function's own (plural) name
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(scenes); // only the array pointer is verified here, not the individual entries
+    RTC_ENTER_DEVICE(hgeometry);
+    geometry->setInstancedScenes(scenes, numScenes);
+    RTC_CATCH_END2(geometry);
+  }
+
   AffineSpace3fa loadTransform(RTCFormat format, const float* xfm)
   {
     AffineSpace3fa space = one;
@@ -1009,43 +1213,14 @@ RTC_NAMESPACE_BEGIN;
     return space;
   }
 
-  void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm)
-  {
-    switch (format)
-    {
-    case RTC_FORMAT_FLOAT3X4_ROW_MAJOR:
-      xfm[ 0] = space.l.vx.x;  xfm[ 1] = space.l.vy.x;  xfm[ 2] = space.l.vz.x;  xfm[ 3] = space.p.x;
-      xfm[ 4] = space.l.vx.y;  xfm[ 5] = space.l.vy.y;  xfm[ 6] = space.l.vz.y;  xfm[ 7] = space.p.y;
-      xfm[ 8] = space.l.vx.z;  xfm[ 9] = space.l.vy.z;  xfm[10] = space.l.vz.z;  xfm[11] = space.p.z;
-      break;
-
-    case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR:
-      xfm[ 0] = space.l.vx.x;  xfm[ 1] = space.l.vx.y;  xfm[ 2] = space.l.vx.z;
-      xfm[ 3] = space.l.vy.x;  xfm[ 4] = space.l.vy.y;  xfm[ 5] = space.l.vy.z;
-      xfm[ 6] = space.l.vz.x;  xfm[ 7] = space.l.vz.y;  xfm[ 8] = space.l.vz.z;
-      xfm[ 9] = space.p.x;     xfm[10] = space.p.y;     xfm[11] = space.p.z;
-      break;
-
-    case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR:
-      xfm[ 0] = space.l.vx.x;  xfm[ 1] = space.l.vx.y;  xfm[ 2] = space.l.vx.z;  xfm[ 3] = 0.f;
-      xfm[ 4] = space.l.vy.x;  xfm[ 5] = space.l.vy.y;  xfm[ 6] = space.l.vy.z;  xfm[ 7] = 0.f;
-      xfm[ 8] = space.l.vz.x;  xfm[ 9] = space.l.vz.y;  xfm[10] = space.l.vz.z;  xfm[11] = 0.f;
-      xfm[12] = space.p.x;     xfm[13] = space.p.y;     xfm[14] = space.p.z;     xfm[15] = 1.f;
-      break;
-
-    default:
-      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
-      break;
-    }
-  }
-
-  RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm)
+RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm)
   {
     Geometry* geometry = (Geometry*) hgeometry;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryTransform);
     RTC_VERIFY_HANDLE(hgeometry);
     RTC_VERIFY_HANDLE(xfm);
+    RTC_ENTER_DEVICE(hgeometry);
     const AffineSpace3fa transform = loadTransform(format, (const float*)xfm);
     geometry->setTransform(transform, timeStep);
     RTC_CATCH_END2(geometry);
@@ -1058,6 +1233,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_TRACE(rtcSetGeometryTransformQuaternion);
     RTC_VERIFY_HANDLE(hgeometry);
     RTC_VERIFY_HANDLE(qd);
+    RTC_ENTER_DEVICE(hgeometry);
     
     AffineSpace3fx transform;
     transform.l.vx.x = qd->scale_x;
@@ -1090,21 +1266,46 @@ RTC_NAMESPACE_BEGIN;
     Geometry* geometry = (Geometry*) hgeometry;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetGeometryTransform);
+    //RTC_ENTER_DEVICE(hgeometry); // no allocation required
     const AffineSpace3fa transform = geometry->getTransform(time);
     storeTransform(transform, format, (float*)xfm);
     RTC_CATCH_END2(geometry);
   }
 
-  RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+  RTC_API void rtcGetGeometryTransformEx(RTCGeometry hgeometry, unsigned int instPrimID, float time, RTCFormat format, void* xfm)
+  { /* queries the geometry's transform for (instPrimID, time) and passes it to storeTransform with the given format and xfm */
+    Geometry* geometry = (Geometry*) hgeometry; // NOTE(review): no RTC_VERIFY_HANDLE on hgeometry/xfm here — confirm callers pass valid pointers
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryTransformEx);
+    //RTC_ENTER_DEVICE(hgeometry); // no allocation required
+    const AffineSpace3fa transform = geometry->getTransform(instPrimID, time);
+    storeTransform(transform, format, (float*)xfm); // xfm is treated as a float array
+    RTC_CATCH_END2(geometry);
+  }
+
+  RTC_API void rtcGetGeometryTransformFromScene(RTCScene hscene, unsigned int geomID, float time, RTCFormat format, void* xfm)
+  { /* looks up geometry geomID in the scene, queries its transform at the given time and passes it to storeTransform */
+    Scene* scene = (Scene*) hscene; // NOTE(review): no RTC_VERIFY_HANDLE on hscene/xfm here — confirm callers pass valid pointers
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryTransformFromScene);
+    //RTC_ENTER_DEVICE(hscene); // no allocation required
+    const AffineSpace3fa transform = scene->get(geomID)->getTransform(time); // NOTE(review): result of scene->get(geomID) is dereferenced unchecked — confirm geomID is always valid here
+    storeTransform(transform, format, (float*)xfm); // xfm is treated as a float array
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcInvokeIntersectFilterFromGeometry(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
   {
     IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i;
-    isa::reportIntersection1(args, filter_args);
+    if (args->geometry->intersectionFilterN)
+        args->geometry->intersectionFilterN(filter_args);
   }
 
-  RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+  RTC_API void rtcInvokeOccludedFilterFromGeometry(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
   {
     OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i;
-    isa::reportOcclusion1(args,filter_args);
+    if (args->geometry->occlusionFilterN)
+      args->geometry->occlusionFilterN(filter_args);
   }
   
   RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type)
@@ -1112,6 +1313,7 @@ RTC_NAMESPACE_BEGIN;
     Device* device = (Device*) hdevice;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcNewGeometry);
+    RTC_ENTER_DEVICE(hdevice);
     RTC_VERIFY_HANDLE(hdevice);
 
     switch (type)
@@ -1262,6 +1464,18 @@ RTC_NAMESPACE_BEGIN;
 #endif
     }
 
+    case RTC_GEOMETRY_TYPE_INSTANCE_ARRAY:
+    {
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+      createInstanceArrayTy createInstanceArray = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createInstanceArray);
+      Geometry* geom = createInstanceArray(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE_ARRAY is not supported");
+#endif
+    }
+
     case RTC_GEOMETRY_TYPE_GRID:
     {
 #if defined(EMBREE_GEOMETRY_GRID)
@@ -1288,6 +1502,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryUserPrimitiveCount);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     
     if (unlikely(geometry->getType() != Geometry::GTY_USER_GEOMETRY))
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries"); 
@@ -1302,6 +1517,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryTimeStepCount);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
 
     if (timeStepCount > RTC_MAX_TIME_STEP_COUNT)
       throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range");
@@ -1316,6 +1532,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryTimeRange);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
 
     if (startTime > endTime)
       throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime");
@@ -1330,6 +1547,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryVertexAttributeCount);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setVertexAttributeCount(N);
     RTC_CATCH_END2(geometry);
   }
@@ -1340,6 +1558,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryTopologyCount);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setTopologyCount(N);
     RTC_CATCH_END2(geometry);
   }
@@ -1350,14 +1569,20 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryBuildQuality);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
+    // -- GODOT start --
+    // if (quality != RTC_BUILD_QUALITY_LOW &&
+    //     quality != RTC_BUILD_QUALITY_MEDIUM &&
+    //     quality != RTC_BUILD_QUALITY_HIGH &&
+    //     quality != RTC_BUILD_QUALITY_REFIT)
+    //   throw std::runtime_error("invalid build quality");
     if (quality != RTC_BUILD_QUALITY_LOW &&
         quality != RTC_BUILD_QUALITY_MEDIUM &&
         quality != RTC_BUILD_QUALITY_HIGH &&
-        quality != RTC_BUILD_QUALITY_REFIT)
-      // -- GODOT start --
-      // throw std::runtime_error("invalid build quality");
+        quality != RTC_BUILD_QUALITY_REFIT) {
       abort();
-      // -- GODOT end --
+    }
+    // -- GODOT end --
     geometry->setBuildQuality(quality);
     RTC_CATCH_END2(geometry);
   }
@@ -1383,6 +1608,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryMask);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setMask(mask);
     RTC_CATCH_END2(geometry);
   }
@@ -1393,6 +1619,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometrySubdivisionMode);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setSubdivisionMode(topologyID,mode);
     RTC_CATCH_END2(geometry);
   }
@@ -1403,6 +1630,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryVertexAttributeTopology);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setVertexAttributeTopology(vertexAttributeID, topologyID);
     RTC_CATCH_END2(geometry);
   }
@@ -1415,6 +1643,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_TRACE(rtcSetGeometryBuffer);
     RTC_VERIFY_HANDLE(hgeometry);
     RTC_VERIFY_HANDLE(hbuffer);
+    RTC_ENTER_DEVICE(hgeometry);
     
     if (geometry->device != buffer->device)
       throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
@@ -1432,10 +1661,11 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetSharedGeometryBuffer);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     
     if (itemCount > 0xFFFFFFFFu)
       throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
-    
+
     Ref<Buffer> buffer = new Buffer(geometry->device, itemCount*byteStride, (char*)ptr + byteOffset);
     geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount);
     RTC_CATCH_END2(geometry);
@@ -1447,6 +1677,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetNewGeometryBuffer);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
 
     if (itemCount > 0xFFFFFFFFu)
       throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
@@ -1469,6 +1700,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetGeometryBufferData);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     return geometry->getBuffer(type, slot);
     RTC_CATCH_END2(geometry);
     return nullptr;
@@ -1480,6 +1712,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcEnableGeometry);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->enable();
     RTC_CATCH_END2(geometry);
   }
@@ -1490,6 +1723,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcUpdateGeometryBuffer);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->updateBuffer(type, slot);
     RTC_CATCH_END2(geometry);
   }
@@ -1500,6 +1734,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcDisableGeometry);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->disable();
     RTC_CATCH_END2(geometry);
   }
@@ -1510,6 +1745,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryTessellationRate);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setTessellationRate(tessellationRate);
     RTC_CATCH_END2(geometry);
   }
@@ -1520,6 +1756,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryUserData);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setUserData(ptr);
     RTC_CATCH_END2(geometry);
   }
@@ -1530,17 +1767,34 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetGeometryUserData);
     RTC_VERIFY_HANDLE(hgeometry);
+    //RTC_ENTER_DEVICE(hgeometry); // do not enable for performance reasons !
     return geometry->getUserData();
     RTC_CATCH_END2(geometry);
     return nullptr;
   }
 
+  RTC_API void* rtcGetGeometryUserDataFromScene (RTCScene hscene, unsigned int geomID)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryUserDataFromScene);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_GEOMID(geomID);
+#endif
+    //RTC_ENTER_DEVICE(hscene); // do not enable for performance reasons
+    return scene->get(geomID)->getUserData();
+    RTC_CATCH_END2(scene);
+    return nullptr;
+  }
+
   RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr)
   {
     Geometry* geometry = (Geometry*) hgeometry;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryBoundsFunction);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setBoundsFunction(bounds,userPtr);
     RTC_CATCH_END2(geometry);
   }
@@ -1551,6 +1805,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryDisplacementFunction);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setDisplacementFunction(displacement);
     RTC_CATCH_END2(geometry);
   }
@@ -1561,6 +1816,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryIntersectFunction);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setIntersectFunctionN(intersect);
     RTC_CATCH_END2(geometry);
   }
@@ -1571,6 +1827,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryPointQueryFunction);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setPointQueryFunction(pointQuery);
     RTC_CATCH_END2(geometry);
   }
@@ -1580,6 +1837,7 @@ RTC_NAMESPACE_BEGIN;
     Geometry* geometry = (Geometry*) hgeometry;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetGeometryFirstHalfEdge);
+    //RTC_ENTER_DEVICE(hgeometry); // do not enable for performance reasons
     return geometry->getFirstHalfEdge(faceID);
     RTC_CATCH_END2(geometry);
     return -1;
@@ -1590,6 +1848,7 @@ RTC_NAMESPACE_BEGIN;
     Geometry* geometry = (Geometry*) hgeometry;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetGeometryFace);
+    //RTC_ENTER_DEVICE(hgeometry); // do not enable for performance reasons
     return geometry->getFace(edgeID);
     RTC_CATCH_END2(geometry);
     return -1;
@@ -1600,6 +1859,7 @@ RTC_NAMESPACE_BEGIN;
     Geometry* geometry = (Geometry*) hgeometry;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetGeometryNextHalfEdge);
+    //RTC_ENTER_DEVICE(hgeometry); // do not enable for performance reasons
     return geometry->getNextHalfEdge(edgeID);
     RTC_CATCH_END2(geometry);
     return -1;
@@ -1610,6 +1870,7 @@ RTC_NAMESPACE_BEGIN;
     Geometry* geometry = (Geometry*) hgeometry;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetGeometryPreviousHalfEdge);
+    //RTC_ENTER_DEVICE(hgeometry); // do not enable for performance reasons
     return geometry->getPreviousHalfEdge(edgeID);
     RTC_CATCH_END2(geometry);
     return -1;
@@ -1620,6 +1881,7 @@ RTC_NAMESPACE_BEGIN;
     Geometry* geometry = (Geometry*) hgeometry;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcGetGeometryOppositeHalfEdge);
+    //RTC_ENTER_DEVICE(hgeometry); // do not enable for performance reasons
     return geometry->getOppositeHalfEdge(topologyID,edgeID);
     RTC_CATCH_END2(geometry);
     return -1;
@@ -1631,6 +1893,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetOccludedFunctionN);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setOccludedFunctionN(occluded);
     RTC_CATCH_END2(geometry);
   }
@@ -1641,6 +1904,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryIntersectFilterFunction);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setIntersectionFilterFunctionN(filter);
     RTC_CATCH_END2(geometry);
   }
@@ -1651,10 +1915,22 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcSetGeometryOccludedFilterFunction);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->setOcclusionFilterFunctionN(filter);
     RTC_CATCH_END2(geometry);
   }
 
+  RTC_API void rtcSetGeometryEnableFilterFunctionFromArguments (RTCGeometry hgeometry, bool enable) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryEnableFilterFunctionFromArguments);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
+    geometry->enableFilterFunctionFromArguments(enable);
+    RTC_CATCH_END2(geometry);
+  }
+
   RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args)
   {
     Geometry* geometry = (Geometry*) args->geometry;
@@ -1663,6 +1939,7 @@ RTC_NAMESPACE_BEGIN;
 #if defined(DEBUG)
     RTC_VERIFY_HANDLE(args->geometry);
 #endif
+    //RTC_ENTER_DEVICE(hgeometry); // do not enable for performance reasons
     geometry->interpolate(args);
     RTC_CATCH_END2(geometry);
   }
@@ -1675,6 +1952,7 @@ RTC_NAMESPACE_BEGIN;
 #if defined(DEBUG)
     RTC_VERIFY_HANDLE(args->geometry);
 #endif
+    // RTC_ENTER_DEVICE(hgeometry); // do not enable for performance reasons
     geometry->interpolateN(args);
     RTC_CATCH_END2(geometry);
   }
@@ -1685,6 +1963,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcCommitGeometry);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     return geometry->commit();
     RTC_CATCH_END2(geometry);
   }
@@ -1697,6 +1976,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_TRACE(rtcAttachGeometry);
     RTC_VERIFY_HANDLE(hscene);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     if (scene->device != geometry->device)
       throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
     return scene->bind(RTC_INVALID_GEOMETRY_ID,geometry);
@@ -1713,6 +1993,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_VERIFY_HANDLE(hscene);
     RTC_VERIFY_HANDLE(hgeometry);
     RTC_VERIFY_GEOMID(geomID);
+    RTC_ENTER_DEVICE(hscene);
     if (scene->device != geometry->device)
       throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
     scene->bind(geomID,geometry);
@@ -1726,6 +2007,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_TRACE(rtcDetachGeometry);
     RTC_VERIFY_HANDLE(hscene);
     RTC_VERIFY_GEOMID(geomID);
+    RTC_ENTER_DEVICE(hscene);
     scene->detachGeometry(geomID);
     RTC_CATCH_END2(scene);
   }
@@ -1736,6 +2018,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcRetainGeometry);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->refInc();
     RTC_CATCH_END2(geometry);
   }
@@ -1746,6 +2029,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_CATCH_BEGIN;
     RTC_TRACE(rtcReleaseGeometry);
     RTC_VERIFY_HANDLE(hgeometry);
+    RTC_ENTER_DEVICE(hgeometry);
     geometry->refDec();
     RTC_CATCH_END2(geometry);
   }
@@ -1759,6 +2043,7 @@ RTC_NAMESPACE_BEGIN;
     RTC_VERIFY_HANDLE(hscene);
     RTC_VERIFY_GEOMID(geomID);
 #endif
+    //RTC_ENTER_DEVICE(hscene); // do not enable for performance reasons
     return (RTCGeometry) scene->get(geomID);
     RTC_CATCH_END2(scene);
     return nullptr;
diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h
index ac58a84d6f..47526482c1 100644
--- a/thirdparty/embree/kernels/common/rtcore.h
+++ b/thirdparty/embree/kernels/common/rtcore.h
@@ -3,26 +3,14 @@
 
 #pragma once
 
-#include "../../include/embree3/rtcore.h"
+#include "../../include/embree4/rtcore.h"
 RTC_NAMESPACE_USE
 
 namespace embree
 {  
   /*! decoding of intersection flags */
-  __forceinline bool isCoherent  (RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_COHERENT; }
-  __forceinline bool isIncoherent(RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; }
-
-#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8)
-#  define USE_TASK_ARENA 1
-#else
-#  define USE_TASK_ARENA 0
-#endif
-
-#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9
-#  define TASKING_TBB_USE_TASK_ISOLATION 1
-#else
-#  define TASKING_TBB_USE_TASK_ISOLATION 0
-#endif
+  __forceinline bool isCoherent  (RTCRayQueryFlags flags) { return (flags & RTC_RAY_QUERY_FLAG_COHERENT) == RTC_RAY_QUERY_FLAG_COHERENT; }
+  __forceinline bool isIncoherent(RTCRayQueryFlags flags) { return (flags & RTC_RAY_QUERY_FLAG_COHERENT) == RTC_RAY_QUERY_FLAG_INCOHERENT; }
 
 /*! Macros used in the rtcore API implementation */
 // -- GODOT start --
@@ -30,8 +18,8 @@ namespace embree
 #define RTC_CATCH_END(device)
 #define RTC_CATCH_END2(scene)
 #define RTC_CATCH_END2_FALSE(scene) return false;
-
 #if 0
+// -- GODOT end --
 #define RTC_CATCH_BEGIN try {
   
 #define RTC_CATCH_END(device)                                                \
@@ -47,7 +35,7 @@ namespace embree
   
 #define RTC_CATCH_END2(scene)                                                \
   } catch (std::bad_alloc&) {                                                   \
-    Device* device = scene ? scene->device : nullptr;                           \
+    Device* device = scene ? scene->device : nullptr;		\
     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
   } catch (rtcore_error& e) {                                                   \
     Device* device = scene ? scene->device : nullptr;                           \
@@ -78,9 +66,9 @@ namespace embree
     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
     return false;                                                               \
   }
-#endif
-// -- GODOT end --
 
+#endif
+  
 #define RTC_VERIFY_HANDLE(handle)                               \
   if (handle == nullptr) {                                         \
     throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
@@ -137,5 +125,38 @@ namespace embree
 // -- GODOT end --
 
 #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
-  (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) 
+  (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member)))
+
+  
+  inline void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm)
+  {
+    switch (format)
+    {
+    case RTC_FORMAT_FLOAT3X4_ROW_MAJOR:
+      xfm[ 0] = space.l.vx.x;  xfm[ 1] = space.l.vy.x;  xfm[ 2] = space.l.vz.x;  xfm[ 3] = space.p.x;
+      xfm[ 4] = space.l.vx.y;  xfm[ 5] = space.l.vy.y;  xfm[ 6] = space.l.vz.y;  xfm[ 7] = space.p.y;
+      xfm[ 8] = space.l.vx.z;  xfm[ 9] = space.l.vy.z;  xfm[10] = space.l.vz.z;  xfm[11] = space.p.z;
+      break;
+
+    case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR:
+      xfm[ 0] = space.l.vx.x;  xfm[ 1] = space.l.vx.y;  xfm[ 2] = space.l.vx.z;
+      xfm[ 3] = space.l.vy.x;  xfm[ 4] = space.l.vy.y;  xfm[ 5] = space.l.vy.z;
+      xfm[ 6] = space.l.vz.x;  xfm[ 7] = space.l.vz.y;  xfm[ 8] = space.l.vz.z;
+      xfm[ 9] = space.p.x;     xfm[10] = space.p.y;     xfm[11] = space.p.z;
+      break;
+
+    case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR:
+      xfm[ 0] = space.l.vx.x;  xfm[ 1] = space.l.vx.y;  xfm[ 2] = space.l.vx.z;  xfm[ 3] = 0.f;
+      xfm[ 4] = space.l.vy.x;  xfm[ 5] = space.l.vy.y;  xfm[ 6] = space.l.vy.z;  xfm[ 7] = 0.f;
+      xfm[ 8] = space.l.vz.x;  xfm[ 9] = space.l.vz.y;  xfm[10] = space.l.vz.z;  xfm[11] = 0.f;
+      xfm[12] = space.p.x;     xfm[13] = space.p.y;     xfm[14] = space.p.z;     xfm[15] = 1.f;
+      break;
+
+    default:
+#if !defined(__SYCL_DEVICE_ONLY__)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+#endif
+      break;
+    }
+  }
 }
diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp
index 65d31d0f81..10cb3c4bec 100644
--- a/thirdparty/embree/kernels/common/scene.cpp
+++ b/thirdparty/embree/kernels/common/scene.cpp
@@ -3,12 +3,35 @@
 
 #include "scene.h"
 
+#include "../../common/tasking/taskscheduler.h"
+
 #include "../bvh/bvh4_factory.h"
 #include "../bvh/bvh8_factory.h"
+
 #include "../../common/algorithms/parallel_reduce.h"
- 
+
+#if defined(EMBREE_SYCL_SUPPORT)
+#  include "../sycl/rthwif_embree_builder.h"
+#endif
+
+
 namespace embree
 {
+
+  struct TaskGroup {
+    /*! global lock step task scheduler */
+#if defined(TASKING_INTERNAL)
+    MutexSys schedulerMutex;
+    Ref<TaskScheduler> scheduler;
+#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+    tbb::isolated_task_group group;
+#elif defined(TASKING_TBB)
+    tbb::task_group group;
+#elif defined(TASKING_PPL)
+    concurrency::task_group group;
+#endif
+  };
+
   /* error raising rtcIntersect and rtcOccluded functions */
   void missing_rtcCommit()      { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); }
   void invalid_rtcIntersect1()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); }
@@ -22,13 +45,20 @@ namespace embree
       flags_modified(true), enabled_geometry_types(0),
       scene_flags(RTC_SCENE_FLAG_NONE),
       quality_flags(RTC_BUILD_QUALITY_MEDIUM),
-      is_build(false), modified(true),
+      modified(true),
+      taskGroup(new TaskGroup()),
       progressInterface(this), progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0)
   {
     device->refInc();
 
     intersectors = Accel::Intersectors(missing_rtcCommit);
 
+    /* use proper device and context for SYCL allocations */
+#if defined(EMBREE_SYCL_SUPPORT)
+    if (DeviceGPU* gpu_device = dynamic_cast<DeviceGPU*>(device))
+      hwaccel = AccelBuffer(AccelAllocator<char>(device,gpu_device->getGPUDevice(),gpu_device->getGPUContext()),0);
+#endif
+       
     /* one can overwrite flags through device for debugging */
     if (device->quality_flags != -1)
       quality_flags = (RTCBuildQuality) device->quality_flags;
@@ -90,10 +120,11 @@ namespace embree
   void Scene::createTriangleAccel()
   {
 #if defined(EMBREE_GEOMETRY_TRIANGLE)
+
     if (device->tri_accel == "default") 
     {
       if (quality_flags != RTC_BUILD_QUALITY_LOW)
-      {
+      {	
         int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); 
         switch (mode) {
         case /*0b00*/ 0: 
@@ -168,11 +199,13 @@ namespace embree
 #endif
     else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel);
 #endif
+
   }
 
   void Scene::createTriangleMBAccel()
   {
 #if defined(EMBREE_GEOMETRY_TRIANGLE)
+
     if (device->tri_accel_mb == "default")
     {
       int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); 
@@ -211,6 +244,7 @@ namespace embree
   void Scene::createQuadAccel()
   {
 #if defined(EMBREE_GEOMETRY_QUAD)
+    
     if (device->quad_accel == "default") 
     {
       if (quality_flags != RTC_BUILD_QUALITY_LOW)
@@ -292,6 +326,7 @@ namespace embree
   void Scene::createQuadMBAccel()
   {
 #if defined(EMBREE_GEOMETRY_QUAD)
+
     if (device->quad_accel_mb == "default") 
     {
       int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); 
@@ -329,6 +364,7 @@ namespace embree
   void Scene::createHairAccel()
   {
 #if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+
     if (device->hair_accel == "default")
     {
       int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
@@ -366,6 +402,7 @@ namespace embree
   void Scene::createHairMBAccel()
   {
 #if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+
     if (device->hair_accel_mb == "default")
     {
 #if defined (EMBREE_TARGET_SIMD8)
@@ -416,7 +453,8 @@ namespace embree
   void Scene::createUserGeometryAccel()
   {
 #if defined(EMBREE_GEOMETRY_USER)
-    if (device->object_accel == "default") 
+
+    if (device->object_accel == "default")
     {
 #if defined (EMBREE_TARGET_SIMD8)
       if (device->canUseAVX() && !isCompactAccel())
@@ -448,6 +486,7 @@ namespace embree
   void Scene::createUserGeometryMBAccel()
   {
 #if defined(EMBREE_GEOMETRY_USER)
+
     if (device->object_accel_mb == "default"    ) {
 #if defined (EMBREE_TARGET_SIMD8)
       if (device->canUseAVX() && !isCompactAccel())
@@ -467,6 +506,7 @@ namespace embree
   void Scene::createInstanceAccel()
   {
 #if defined(EMBREE_GEOMETRY_INSTANCE)
+
     // if (device->object_accel == "default") 
     {
 #if defined (EMBREE_TARGET_SIMD8)
@@ -494,6 +534,7 @@ namespace embree
   void Scene::createInstanceMBAccel()
   {
 #if defined(EMBREE_GEOMETRY_INSTANCE)
+
     //if (device->instance_accel_mb == "default")
     {
 #if defined (EMBREE_TARGET_SIMD8)
@@ -550,10 +591,58 @@ namespace embree
 #endif
   }
 
+  void Scene::createInstanceArrayAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+
+    // if (device->object_accel == "default") 
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh8_factory->BVH8InstanceArray(this, BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh8_factory->BVH8InstanceArray(this, BVHFactory::BuildVariant::DYNAMIC));
+        }
+      } 
+      else
+#endif
+      {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh4_factory->BVH4InstanceArray(this, BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh4_factory->BVH4InstanceArray(this, BVHFactory::BuildVariant::DYNAMIC));
+        }
+      }
+    }
+    // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel);
+#endif
+  }
+
+  void Scene::createInstanceArrayMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+
+    //if (device->instance_accel_mb == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel())
+        accels_add(device->bvh8_factory->BVH8InstanceArrayMB(this));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4InstanceArrayMB(this));
+    }
+    //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb);
+#endif
+  }
+
+
   void Scene::createGridAccel()
   {
-    BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST;
 #if defined(EMBREE_GEOMETRY_GRID)
+    
+    BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST;
+
     if (device->grid_accel == "default") 
     {
 #if defined (EMBREE_TARGET_SIMD8)
@@ -579,6 +668,7 @@ namespace embree
   void Scene::createGridMBAccel()
   {
 #if defined(EMBREE_GEOMETRY_GRID)
+
     if (device->grid_accel_mb == "default") 
     {
       accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC));
@@ -588,13 +678,13 @@ namespace embree
 #endif
 
   }
-  
+
   void Scene::clear() {
   }
 
   unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry) 
   {
-    Lock<SpinLock> lock(geometriesMutex);
+    Lock<MutexSys> lock(geometriesMutex);
     if (geomID == RTC_INVALID_GEOMETRY_ID) {
       geomID = id_pool.allocate();
       if (geomID == RTC_INVALID_GEOMETRY_ID)
@@ -620,7 +710,7 @@ namespace embree
 
   void Scene::detachGeometry(size_t geomID)
   {
-    Lock<SpinLock> lock(geometriesMutex);
+    Lock<MutexSys> lock(geometriesMutex);
     
     if (geomID >= geometries.size())
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID");
@@ -637,45 +727,11 @@ namespace embree
     geometryModCounters_[geomID] = 0;
   }
 
-  void Scene::updateInterface()
-  {
-    is_build = true;
-  }
-
-  void Scene::commit_task ()
+  void Scene::build_cpu_accels()
   {
-    checkIfModifiedAndSet ();
-    if (!isModified()) {
-      return;
-    }
-    
-    /* print scene statistics */
-    if (device->verbosity(2))
-      printStatistics();
-
-    progress_monitor_counter = 0;
-    
-    /* gather scene stats and call preCommit function of each geometry */
-    this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (), 
-      [this](const range<size_t>& r)->GeometryCounts
-      {
-        GeometryCounts c;
-        for (auto i=r.begin(); i<r.end(); ++i) 
-        {
-          if (geometries[i] && geometries[i]->isEnabled()) 
-          {
-            geometries[i]->preCommit();
-            geometries[i]->addElementsToCount (c);
-            c.numFilterFunctions += (int) geometries[i]->hasFilterFunctions();
-          }
-        }
-        return c;
-      },
-      std::plus<GeometryCounts>()
-    );
-    
     /* select acceleration structures to build */
     unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask();
+
     if (flags_modified || new_enabled_geometry_types != enabled_geometry_types)
     {
       accels_init();
@@ -685,7 +741,7 @@ namespace embree
       parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) {
           geometryModCounters_[i] = 0;
         });
-      
+
       if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel();
       if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel();
       if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel();
@@ -702,7 +758,9 @@ namespace embree
       if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel();
       if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel();
       if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel();
-      
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_ARRAY,false)) createInstanceArrayAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_ARRAY,true)) createInstanceArrayMBAccel();
+
       flags_modified = false;
       enabled_geometry_types = new_enabled_geometry_types;
     }
@@ -719,6 +777,61 @@ namespace embree
       flags_modified = true; // in non-dynamic mode we have to re-create accels
     }
 
+    if (device->verbosity(2)) {
+      std::cout << "created scene intersector" << std::endl;
+      accels_print(2);
+      std::cout << "selected scene intersector" << std::endl;
+      intersectors.print(2);
+    }
+  }
+
+  void Scene::build_gpu_accels()
+  {
+#if defined(EMBREE_SYCL_SUPPORT)
+    const BBox3f aabb = rthwifBuild(this,hwaccel);
+    bounds = LBBox<embree::Vec3fa>(aabb);
+    hwaccel_bounds = aabb;
+#endif
+  }
+
+  void Scene::commit_task ()
+  {
+    checkIfModifiedAndSet();
+    if (!isModified()) return;
+    
+    /* print scene statistics */
+    if (device->verbosity(2))
+      printStatistics();
+
+    progress_monitor_counter = 0;
+    
+    /* gather scene stats and call preCommit function of each geometry */
+    this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (), 
+      [this](const range<size_t>& r)->GeometryCounts
+      {
+        GeometryCounts c;
+        for (auto i=r.begin(); i<r.end(); ++i) 
+        {
+          if (geometries[i] && geometries[i]->isEnabled()) 
+          {
+            geometries[i]->preCommit();
+            geometries[i]->addElementsToCount (c);
+            c.numFilterFunctions += (int) geometries[i]->hasArgumentFilterFunctions();
+            c.numFilterFunctions += (int) geometries[i]->hasGeometryFilterFunctions();
+          }
+        }
+        return c;
+      },
+      std::plus<GeometryCounts>()
+    );
+
+#if defined(EMBREE_SYCL_SUPPORT)
+    if (DeviceGPU* gpu_device = dynamic_cast<DeviceGPU*>(device))
+      build_gpu_accels();
+    else
+#endif
+      build_cpu_accels();
+
     /* call postCommit function of each geometry */
     parallel_for(geometries.size(), [&] ( const size_t i ) {
         if (geometries[i] && geometries[i]->isEnabled()) {
@@ -727,16 +840,7 @@ namespace embree
           geometryModCounters_[i] = geometries[i]->getModCounter();
         }
       });
-      
-    updateInterface();
 
-    if (device->verbosity(2)) {
-      std::cout << "created scene intersector" << std::endl;
-      accels_print(2);
-      std::cout << "selected scene intersector" << std::endl;
-      intersectors.print(2);
-    }
-    
     setModified(false);
   }
 
@@ -771,11 +875,11 @@ namespace embree
     /* allocates own taskscheduler for each build */
     Ref<TaskScheduler> scheduler = nullptr;
     { 
-      Lock<MutexSys> lock(schedulerMutex);
-      scheduler = this->scheduler;
+      Lock<MutexSys> lock(taskGroup->schedulerMutex);
+      scheduler = taskGroup->scheduler;
       if (scheduler == null) {
         buildLock.lock();
-        this->scheduler = scheduler = new TaskScheduler;
+        taskGroup->scheduler = scheduler = new TaskScheduler;
       }
     }
 
@@ -792,13 +896,13 @@ namespace embree
     /* initiate build */
     // -- GODOT start --
     // try {
-      scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
+      TaskScheduler::TaskGroupContext context;
+      scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(taskGroup->schedulerMutex); taskGroup->scheduler = nullptr; }, &context, 1, !join);
     // }
     // catch (...) {
     //   accels_clear();
-    //   updateInterface();
-    //   Lock<MutexSys> lock(schedulerMutex);
-    //   this->scheduler = nullptr;
+    //   Lock<MutexSys> lock(taskGroup->schedulerMutex);
+    //   taskGroup->scheduler = nullptr;
     //   throw;
     // }
     // -- GODOT end --
@@ -809,7 +913,7 @@ namespace embree
 #if defined(TASKING_TBB)
 
   void Scene::commit (bool join) 
-  {
+  {    
 #if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
     if (join)
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version");
@@ -827,16 +931,7 @@ namespace embree
 #endif
       
       do {
-
-#if USE_TASK_ARENA
-        if (join) {
-          device->arena->execute([&]{ group.wait(); });
-        }
-        else
-#endif
-        {
-          group.wait();
-        }
+        device->execute(join, [&](){ taskGroup->group.wait(); });
 
         pause_cpu();
         yield();
@@ -857,26 +952,14 @@ namespace embree
       tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings );
 #endif
       //ctx.set_priority(tbb::priority_high);
-
-#if USE_TASK_ARENA
-      if (join)
-      {
-        device->arena->execute([&]{
-            group.run([&]{
-                tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
-              });
-            group.wait();
-          });
-      }
-      else
-#endif
+      device->execute(join, [&]()
       {
-        group.run([&]{
+        taskGroup->group.run([&]{
             tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
           });
-        group.wait();
-      }
-     
+        taskGroup->group.wait();
+      });
+
       /* reset MXCSR register again */
       _mm_setcsr(mxcsr);
     } 
@@ -886,7 +969,6 @@ namespace embree
       _mm_setcsr(mxcsr);
       
       accels_clear();
-      updateInterface();
       throw;
     }
   }
@@ -915,10 +997,10 @@ namespace embree
     
     try {
 
-      group.run([&]{
+      taskGroup->group.run([&]{
           concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); });
         });
-      group.wait();
+      taskGroup->group.wait();
 
        /* reset MXCSR register again */
       _mm_setcsr(mxcsr);
@@ -929,7 +1011,6 @@ namespace embree
       _mm_setcsr(mxcsr);
       
       accels_clear();
-      updateInterface();
       throw;
     }
   }
diff --git a/thirdparty/embree/kernels/common/scene.h b/thirdparty/embree/kernels/common/scene.h
index 5ed80a63f6..d9acca1065 100644
--- a/thirdparty/embree/kernels/common/scene.h
+++ b/thirdparty/embree/kernels/common/scene.h
@@ -6,11 +6,11 @@
 #include "default.h"
 #include "device.h"
 #include "builder.h"
-#include "../../common/algorithms/parallel_any_of.h"
 #include "scene_triangle_mesh.h"
 #include "scene_quad_mesh.h"
 #include "scene_user_geometry.h"
 #include "scene_instance.h"
+#include "scene_instance_array.h"
 #include "scene_curves.h"
 #include "scene_line_segments.h"
 #include "scene_subdiv_mesh.h"
@@ -21,12 +21,18 @@
 #include "acceln.h"
 #include "geometry.h"
 
+#if defined(EMBREE_SYCL_SUPPORT)
+#include "../sycl/rthwif_embree_builder.h"
+#endif
+
 namespace embree
 {
+  struct TaskGroup;
+
   /*! Base class all scenes are derived from */
   class Scene : public AccelN
   {
-    ALIGNED_CLASS_(std::alignment_of<Scene>::value);
+    ALIGNED_CLASS_USM_(std::alignment_of<Scene>::value);
 
   public:
     template<typename Ty, bool mblur = false>
@@ -140,6 +146,7 @@ namespace embree
     ~Scene () noexcept;
 
   private:
+    
     /*! class is non-copyable */
     Scene (const Scene& other) DELETED; // do not implement
     Scene& operator= (const Scene& other) DELETED; // do not implement
@@ -159,6 +166,8 @@ namespace embree
     void createInstanceMBAccel();
     void createInstanceExpensiveAccel();
     void createInstanceExpensiveMBAccel();
+    void createInstanceArrayAccel();
+    void createInstanceArrayMBAccel();
     void createGridAccel();
     void createGridMBAccel();
 
@@ -176,13 +185,13 @@ namespace embree
     
     void setSceneFlags(RTCSceneFlags scene_flags);
     RTCSceneFlags getSceneFlags() const;
-    
+
+    void build_cpu_accels();
+    void build_gpu_accels();
     void commit (bool join);
     void commit_task ();
     void build () {}
 
-    void updateInterface();
-
     /* return number of geometries */
     __forceinline size_t size() const { return geometries.size(); }
     
@@ -205,20 +214,9 @@ namespace embree
     }
 
   protected:
-    
-    __forceinline void checkIfModifiedAndSet () 
-    {
-      if (isModified ()) return;
-      
-      auto geometryIsModified = [this](size_t geomID)->bool {
-        return isGeometryModified(geomID);
-      };
 
-      if (parallel_any_of (size_t(0), geometries.size (), geometryIsModified)) {
-        setModified ();
-      }
-    }
-    
+    void checkIfModifiedAndSet ();
+
   public:
 
     /* get mesh by ID */
@@ -247,7 +245,7 @@ namespace embree
     }
 
     __forceinline Ref<Geometry> get_locked(size_t i)  {
-      Lock<SpinLock> lock(geometriesMutex);
+      Lock<MutexSys> lock(geometriesMutex);
       assert(i < geometries.size()); 
       return geometries[i]; 
     }
@@ -259,8 +257,8 @@ namespace embree
     __forceinline bool isStaticAccel()  const { return !(scene_flags & RTC_SCENE_FLAG_DYNAMIC); }
     __forceinline bool isDynamicAccel() const { return scene_flags & RTC_SCENE_FLAG_DYNAMIC; }
     
-    __forceinline bool hasContextFilterFunction() const {
-      return scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION;
+    __forceinline bool hasArgumentFilterFunction() const {
+      return scene_flags & RTC_SCENE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS;
     }
     
     __forceinline bool hasGeometryFilterFunction() {
@@ -268,21 +266,21 @@ namespace embree
     }
       
     __forceinline bool hasFilterFunction() {
-      return hasContextFilterFunction() || hasGeometryFilterFunction();
+      return hasArgumentFilterFunction() || hasGeometryFilterFunction();
     }
     
-    /* test if scene got already build */
-    __forceinline bool isBuild() const { return is_build; }
+    void* createQBVH6Accel();
+
+  public:
+    Device* device;
 
   public:
     IDPool<unsigned,0xFFFFFFFE> id_pool;
-    vector<Ref<Geometry>> geometries; //!< list of all user geometries
-    vector<unsigned int> geometryModCounters_;
-    vector<float*> vertices;
+    Device::vector<Ref<Geometry>> geometries = device; //!< list of all user geometries
+    avector<unsigned int> geometryModCounters_;
+    Device::vector<float*> vertices = device;
     
   public:
-    Device* device;
-
     /* these are to detect if we need to recreate the acceleration structures */
     bool flags_modified;
     unsigned int enabled_geometry_types;
@@ -290,24 +288,20 @@ namespace embree
     RTCSceneFlags scene_flags;
     RTCBuildQuality quality_flags;
     MutexSys buildMutex;
-    SpinLock geometriesMutex;
-    bool is_build;
+    MutexSys geometriesMutex;
+
+#if defined(EMBREE_SYCL_SUPPORT)
+  public:
+    BBox3f hwaccel_bounds = empty;
+    AccelBuffer hwaccel;
+#endif
+    
   private:
     bool modified;                   //!< true if scene got modified
 
   public:
-    
-    /*! global lock step task scheduler */
-#if defined(TASKING_INTERNAL) 
-    MutexSys schedulerMutex;
-    Ref<TaskScheduler> scheduler;
-#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
-    tbb::isolated_task_group group;
-#elif defined(TASKING_TBB)
-    tbb::task_group group;
-#elif defined(TASKING_PPL)
-    concurrency::task_group group;
-#endif
+
+    std::unique_ptr<TaskGroup> taskGroup;
     
   public:
     struct BuildProgressMonitorInterface : public BuildProgressMonitor {
@@ -363,12 +357,28 @@ namespace embree
       
       if (mask & Geometry::MTY_INSTANCE_EXPENSIVE)
         count += mblur  ? world.numMBInstancesExpensive : world.numInstancesExpensive;
-      
+
+      if (mask & Geometry::MTY_INSTANCE_ARRAY)
+        count += mblur  ? world.numMBInstanceArrays : world.numInstanceArrays;
+
       if (mask & Geometry::MTY_GRID_MESH)
         count += mblur  ? world.numMBGrids : world.numGrids;
       
       return count;
     }
+
+    __forceinline size_t getNumSubPrimitives(Geometry::GTypeMask mask, bool mblur) const
+    {
+      size_t count = 0;
+      /* grid meshes are counted by their sub-grid counters (motion-blur or static one) */
+      if (mask & Geometry::MTY_GRID_MESH)
+        count += mblur  ? world.numMBSubGrids : world.numSubGrids;
+      /* every other type in the mask is counted by getNumPrimitives; the grid bit is cleared first */
+      Geometry::GTypeMask new_mask = (Geometry::GTypeMask)(mask & ~Geometry::MTY_GRID_MESH);
+      count += getNumPrimitives(new_mask, mblur);
+      
+      return count;
+    }
     
     template<typename Mesh, bool mblur>
     __forceinline unsigned getNumTimeSteps()
diff --git a/thirdparty/embree/kernels/common/scene_curves.h b/thirdparty/embree/kernels/common/scene_curves.h
index a1ea45d3c7..fd6ed81d7d 100644
--- a/thirdparty/embree/kernels/common/scene_curves.h
+++ b/thirdparty/embree/kernels/common/scene_curves.h
@@ -119,6 +119,15 @@ namespace embree
       p3 = vertex(i+3,itime);
     }
 
+    /*! gathers the four consecutive curve normals normal(i) .. normal(i+3) into n0 .. n3 */
+    __forceinline void gather_normals(Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const
+    {
+      n0 = normal(i+0);
+      n1 = normal(i+1);
+      n2 = normal(i+2);
+      n3 = normal(i+3);
+    }
+
     /*! gathers the curve starting with i'th vertex */
     __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const
     {
@@ -178,6 +187,13 @@ namespace embree
     }
 
     /*! loads curve vertices for specified time */
+    __forceinline void gather_safe(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const
+    { // picks the gather overload that matches the geometry
+      if (hasMotionBlur()) gather(p0,p1,p2,p3,i,time); // time-dependent overload
+      else                 gather(p0,p1,p2,p3,i);      // overload without time; 'time' is unused
+    }
+    
+    /*! loads curve vertices for specified time */
     __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const
     {
       float ftime;
@@ -199,8 +215,15 @@ namespace embree
       n3 = madd(Vec3ff(t0),an3,t1*bn3);
     }
 
+    /*! loads curve vertices and normals: time-dependent gather if hasMotionBlur(), else the gather without time ('time' unused) */
+    __forceinline void gather_safe(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const
+    {
+      if (hasMotionBlur()) gather(p0,p1,p2,p3,n0,n1,n2,n3,i,time);
+      else                 gather(p0,p1,p2,p3,n0,n1,n2,n3,i);
+    }
+
     template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
-    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(RayQueryContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
     {
       Vec3ff v0,v1,v2,v3; Vec3fa n0,n1,n2,n3;
       unsigned int vertexID = curve(primID);
@@ -212,7 +235,7 @@ namespace embree
     }
 
     template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
-    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(RayQueryContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
     {
       float ftime;
       const size_t itime = timeSegment(time, ftime);
@@ -221,6 +244,19 @@ namespace embree
       return clerp(curve0,curve1,ftime);
     }
 
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurveSafe(RayQueryContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    { // like getNormalOrientedCurve(..., float time), but also usable when hasMotionBlur() is false
+      float ftime = 0.0f; // keeps this value when there is no motion blur
+      const size_t itime = hasMotionBlur() ? timeSegment(time, ftime) : 0; // time step 0 without motion blur
+      const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+0);
+      if (hasMotionBlur()) { // only in this case is time step itime+1 read
+        const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+1);
+        return clerp(curve0,curve1,ftime);
+      }
+      return curve0; // no second time step: return the single curve unblended
+    }
+
     /*! gathers the hermite curve starting with i'th vertex */
     __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const
     {
@@ -255,6 +291,13 @@ namespace embree
       t1 = madd(Vec3ff(f0),at1,f1*bt1);
     }
 
+    /*! loads hermite curve data (p0,t0,p1,t1): time-dependent gather_hermite if hasMotionBlur(), else the one without time ('time' unused) */
+    __forceinline void gather_hermite_safe(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const
+    {
+      if (hasMotionBlur()) gather_hermite(p0,t0,p1,t1,i,time);
+      else                 gather_hermite(p0,t0,p1,t1,i);
+    }
+
     /*! gathers the hermite curve starting with i'th vertex */
     __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const
     {
@@ -282,7 +325,7 @@ namespace embree
     }
 
     /*! loads curve vertices for specified time */
-    __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const
     {
       float ftime;
       const size_t itime = timeSegment(time, ftime);
@@ -301,8 +344,15 @@ namespace embree
       dn1= madd(Vec3ff(f0),adn1,f1*bdn1);
     }
 
+    /*! loads hermite curve data incl. normals for mblur and non-mblur geometry: time-dependent gather_hermite if hasMotionBlur(), else the one without time */
+    __forceinline void gather_hermite_safe(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const
+    {
+      if (hasMotionBlur()) gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,i,time);
+      else                 gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,i);
+    }
+
     template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
-      __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+      __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(RayQueryContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
     {
       Vec3ff v0,t0,v1,t1; Vec3fa n0,dn0,n1,dn1;
       unsigned int vertexID = curve(primID);
@@ -315,7 +365,7 @@ namespace embree
     }
 
     template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
-    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(RayQueryContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
     {
       float ftime;
       const size_t itime = timeSegment(time, ftime);
@@ -324,6 +374,24 @@ namespace embree
       return clerp(curve0,curve1,ftime);
     }
 
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurveSafe(RayQueryContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    { // like getNormalOrientedHermiteCurve(..., float time), but also usable when hasMotionBlur() is false
+      float ftime = 0.0f; // keeps this value when there is no motion blur
+      const size_t itime = hasMotionBlur() ? timeSegment(time, ftime) : 0; // time step 0 without motion blur
+      const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+0);
+      if (hasMotionBlur()) { // only in this case is time step itime+1 read
+        const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+1);
+        return clerp(curve0,curve1,ftime);
+      }
+      return curve0; // no second time step: return the single curve unblended
+    }
+
+    /* returns the projected area; this implementation ignores i and always returns 1.0f */
+    __forceinline float projectedPrimitiveArea(const size_t i) const {
+      return 1.0f;
+    }
+  
   private:
     void resizeBuffers(unsigned int numSteps);
 
@@ -333,12 +401,12 @@ namespace embree
     BufferView<Vec3fa> normals0;            //!< fast access to first normal buffer
     BufferView<Vec3ff> tangents0;           //!< fast access to first tangent buffer
     BufferView<Vec3fa> dnormals0;           //!< fast access to first normal derivative buffer
-    vector<BufferView<Vec3ff>> vertices;    //!< vertex array for each timestep
-    vector<BufferView<Vec3fa>> normals;     //!< normal array for each timestep
-    vector<BufferView<Vec3ff>> tangents;    //!< tangent array for each timestep
-    vector<BufferView<Vec3fa>> dnormals;    //!< normal derivative array for each timestep
+    Device::vector<BufferView<Vec3ff>> vertices = device;    //!< vertex array for each timestep
+    Device::vector<BufferView<Vec3fa>> normals = device;     //!< normal array for each timestep
+    Device::vector<BufferView<Vec3ff>> tangents = device;    //!< tangent array for each timestep
+    Device::vector<BufferView<Vec3fa>> dnormals = device;    //!< normal derivative array for each timestep
     BufferView<char> flags;                 //!< start, end flag per segment
-    vector<BufferView<char>> vertexAttribs; //!< user buffers
+    Device::vector<BufferView<char>> vertexAttribs = device; //!< user buffers
     int tessellationRate;                   //!< tessellation rate for flat curve
     float maxRadiusScale = 1.0;             //!< maximal min-width scaling of curve radii
   };
@@ -486,7 +554,7 @@ namespace embree
         src    = vertices[bufferSlot].getPtr();
         stride = vertices[bufferSlot].getStride();
       }
-      
+
       for (unsigned int i=0; i<valueCount; i+=N)
       {
         size_t ofs = i*sizeof(float);
diff --git a/thirdparty/embree/kernels/common/scene_grid_mesh.h b/thirdparty/embree/kernels/common/scene_grid_mesh.h
index fb6fed445b..eb2048b286 100644
--- a/thirdparty/embree/kernels/common/scene_grid_mesh.h
+++ b/thirdparty/embree/kernels/common/scene_grid_mesh.h
@@ -133,12 +133,26 @@ namespace embree
         }
       }
     }
-    
+
     void addElementsToCount (GeometryCounts & counts) const;
     
-    __forceinline unsigned int getNumSubGrids(const size_t gridID)
+    __forceinline unsigned int getNumTotalQuads() const
+    { // sums getNumQuads() over all numPrimitives grids
+      size_t quads = 0;
+      for (size_t primID=0; primID<numPrimitives; primID++)
+        quads += getNumQuads(primID);
+      return quads; // NOTE(review): the size_t sum is implicitly narrowed to unsigned int here
+    }
+
+    __forceinline unsigned int getNumQuads(const size_t gridID) const
+    { // quad count of one grid: (resX-1)*(resY-1), clamped to at least 1
+      const Grid& g = grid(gridID);
+      return (unsigned int) max((int)1,((int)g.resX-1) * ((int)g.resY-1));
+    }
+    
+    __forceinline unsigned int getNumSubGrids(const size_t gridID) const
+    {
+      const Grid& g = grid(gridID);
       return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1));
     }
 
@@ -174,6 +188,18 @@ namespace embree
       return vertices[itime][i];
     }
 
+    /*! returns i'th vertex for the specified time, blended from time steps itime and itime+1 */
+    __forceinline const Vec3fa vertex(size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime); // also writes ftime
+      const float t0 = 1.0f - ftime;
+      const float t1 = ftime;
+      Vec3fa v0 = vertex(i, itime+0);
+      Vec3fa v1 = vertex(i, itime+1); // read unconditionally; there is no hasMotionBlur() check in this function
+      return madd(Vec3fa(t0),v0,t1*v1); // presumably t0*v0 + t1*v1 (madd = multiply-add) - confirm
+    }
+
     /*! returns i'th vertex of itime'th timestep */
     __forceinline const char* vertexPtr(size_t i, size_t itime) const {
       return vertices[itime].getPtr(i);
@@ -198,6 +224,56 @@ namespace embree
       return vertex(index,itime);
     }
 
+    /*! returns the vertex at grid coordinates (x,y) of grid g for the specified time */
+    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, float time) const {
+      const size_t index = grid_vertex_index(g,x,y);
+      return vertex(index,time);
+    }
+    
+    /*! gathers the four corners of the quad at (x,y): v0=(x,y), v1=(x+1,y), v2=(x+1,y+1), v3=(x,y+1) */
+    __forceinline void gather_quad_vertices(Vec3fa& v0, Vec3fa& v1, Vec3fa& v2, Vec3fa& v3, const Grid& g, size_t x, size_t y) const
+    {
+      v0 = grid_vertex(g,x+0,y+0);
+      v1 = grid_vertex(g,x+1,y+0);
+      v2 = grid_vertex(g,x+1,y+1);
+      v3 = grid_vertex(g,x+0,y+1);
+    }
+    
+    /*! gathers the four corners of the quad at (x,y) for the specified time, in the order (x,y), (x+1,y), (x+1,y+1), (x,y+1) */
+    __forceinline void gather_quad_vertices(Vec3fa& v0, Vec3fa& v1, Vec3fa& v2, Vec3fa& v3, const Grid& g, size_t x, size_t y, float time) const
+    {
+      v0 = grid_vertex(g,x+0,y+0,time);
+      v1 = grid_vertex(g,x+1,y+0,time);
+      v2 = grid_vertex(g,x+1,y+1,time);
+      v3 = grid_vertex(g,x+0,y+1,time);
+    }
+
+    /*! gathers quad vertices for mblur and non-mblur meshes: the time-dependent overload is used only if hasMotionBlur() */
+    __forceinline void gather_quad_vertices_safe(Vec3fa& v0, Vec3fa& v1, Vec3fa& v2, Vec3fa& v3, const Grid& g, size_t x, size_t y, float time) const
+    {
+      if (hasMotionBlur()) gather_quad_vertices(v0,v1,v2,v3,g,x,y,time);
+      else                 gather_quad_vertices(v0,v1,v2,v3,g,x,y);
+    }
+
+    /*! calculates the bounds of the quad at (sx,sy) of grid g over all time steps; returns false and leaves bbox untouched if a corner is invalid */
+    __forceinline bool buildBoundsQuad(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const
+    {
+      BBox3fa b(empty);
+      for (size_t t=0; t<numTimeSteps; t++)
+      {
+        for (size_t y=sy;y<sy+2;y++) // the 2x2 corner vertices of the quad
+          for (size_t x=sx;x<sx+2;x++)
+          {
+            const Vec3fa v = grid_vertex(g,x,y,t);
+            if (unlikely(!isvalid(v))) return false;
+            b.extend(v);
+          }
+      }
+
+      bbox = b; // written only on success
+      return true;
+    }
+    
     /*! calculates the build bounds of the i'th primitive, if it's valid */
     __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const
     {
@@ -254,7 +330,6 @@ namespace embree
       return true;
     }
 
-
     __forceinline BBox3fa bounds(const Grid& g, size_t sx, size_t sy, size_t itime) const
     {
       BBox3fa box(empty);
@@ -274,11 +349,22 @@ namespace embree
       return LBBox3fa([&] (size_t itime) { return bounds(g,sx,sy,itime); }, dt, time_range, fnumTimeSegments);
     }
 
+    __forceinline float projectedPrimitiveArea(const size_t i) const {
+      return pos_inf; // constant result; i is not used
+    }
+
   public:
     BufferView<Grid> grids;      //!< array of triangles
     BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
-    vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
-    vector<RawBufferView> vertexAttribs; //!< vertex attributes
+    Device::vector<BufferView<Vec3fa>> vertices = device; //!< vertex array for each timestep
+    Device::vector<RawBufferView> vertexAttribs = device; //!< vertex attributes
+
+#if defined(EMBREE_SYCL_SUPPORT)
+    
+  public:
+    struct PrimID_XY { uint32_t primID; uint16_t x,y; };
+    Device::vector<PrimID_XY> quadID_to_primID_xy = device;  //!< maps a quad to the primitive ID and grid coordinates
+#endif
   };
 
   namespace isa
@@ -287,6 +373,94 @@ namespace embree
     {
       GridMeshISA (Device* device)
         : GridMesh(device) {}
+
+      LBBox3fa vlinearBounds(size_t buildID, const BBox1f& time_range, const SubGridBuildData * const sgrids) const override {
+        const SubGridBuildData &subgrid = sgrids[buildID]; // buildID indexes sgrids; the grid ID comes from the record
+        const unsigned int primID = subgrid.primID;
+        const size_t x = subgrid.x();
+        const size_t y = subgrid.y();
+        return linearBounds(grid(primID),x,y,time_range);
+      }
+
+#if defined(EMBREE_SYCL_SUPPORT)
+      PrimInfo createPrimRefArray(PrimRef* prims, const range<size_t>& r, size_t k, unsigned int geomID) const override
+      { // emits one PrimRef per valid quad ID in r, written to prims starting at index k
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          const PrimID_XY& quad = quadID_to_primID_xy[j]; // j is a quad ID; the table yields grid primID and (x,y)
+          if (!buildBoundsQuad(grids[quad.primID],quad.x,quad.y,bounds)) continue; // skip quads with an invalid vertex
+          const PrimRef prim(bounds,geomID,unsigned(j)); // third argument is the quad ID j, not quad.primID
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+#endif
+      
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids, const range<size_t>& r, size_t k, unsigned int geomID) const override 
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          if (!valid(j)) continue;
+          const GridMesh::Grid &g = grid(j);
+          /* walk the grid in steps of 2 in x and y; each position yields one sub-grid entry */
+          for (unsigned int y=0; y<g.resY-1u; y+=2)
+          {
+            for (unsigned int x=0; x<g.resX-1u; x+=2)
+            {
+              BBox3fa bounds = empty;
+              if (!buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
+              const PrimRef prim(bounds,(unsigned)geomID,(unsigned)k);
+              pinfo.add_center2(prim);
+              sgrids[k] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+              prims[k++] = prim; // prims and sgrids share the index k; the PrimRef stores k
+            }
+          }
+        }
+        return pinfo;
+      }
+
+#if defined(EMBREE_SYCL_SUPPORT)
+      PrimInfo createPrimRefArrayMB(PrimRef* prims, const BBox1f& time_range, const range<size_t>& r, size_t k, unsigned int geomID) const override
+      { // NOTE(review): unlike the per-quad createPrimRefArray, no quad is skipped here (no validity test)
+        const BBox1f t0t1 = BBox1f::intersect(getTimeRange(), time_range); // requested range intersected with getTimeRange()
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          const PrimID_XY& quad = quadID_to_primID_xy[j]; // j is a quad ID
+          const LBBox3fa lbounds = linearBounds(grids[quad.primID],quad.x,quad.y,t0t1);
+          const PrimRef prim(lbounds.bounds(), unsigned(geomID), unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+#endif
+
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, mvector<SubGridBuildData>& sgrids, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const override
+      {
+        PrimInfoMB pinfoMB(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          if (!valid(j, timeSegmentRange(t0t1))) continue;
+          const GridMesh::Grid &g = grid(j);
+          /* walk the grid in steps of 2 in x and y; time_range below is not a parameter here (the parameter is t0t1) */
+          for (unsigned int y=0; y<g.resY-1u; y+=2)
+          {
+            for (unsigned int x=0; x<g.resX-1u; x+=2)
+            {
+              const PrimRefMB prim(linearBounds(g,x,y,t0t1),numTimeSegments(),time_range,numTimeSegments(),unsigned(geomID),unsigned(k));
+              pinfoMB.add_primref(prim);
+              sgrids[k] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+              prims[k++] = prim; // prims and sgrids share the index k
+            }
+          }
+        }
+        return pinfoMB;
+      }
     };
   }
 
diff --git a/thirdparty/embree/kernels/common/scene_instance.h b/thirdparty/embree/kernels/common/scene_instance.h
index 773f2b6fec..1176018777 100644
--- a/thirdparty/embree/kernels/common/scene_instance.h
+++ b/thirdparty/embree/kernels/common/scene_instance.h
@@ -13,7 +13,7 @@ namespace embree
   /*! Instanced acceleration structure */
   struct Instance : public Geometry
   {
-    ALIGNED_STRUCT_(16);
+    //ALIGNED_STRUCT_(16);
     static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE;
 
   public:
@@ -50,6 +50,7 @@ namespace embree
     virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override;
     virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override;
     virtual AffineSpace3fa getTransform(float time) override;
+    virtual AffineSpace3fa getTransform(size_t, float time) override;
     virtual void setMask (unsigned mask) override;
     virtual void build() {}
     virtual void addElementsToCount (GeometryCounts & counts) const override;
@@ -132,10 +133,13 @@ namespace embree
 
     __forceinline AffineSpace3fa getLocal2World(float t) const
     {
-      float ftime; const unsigned int itime = timeSegment(t, ftime);
-      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
-        return slerp(local2world[itime+0],local2world[itime+1],ftime);
-      return lerp(local2world[itime+0],local2world[itime+1],ftime);
+      if (numTimeSegments() > 0) {
+        float ftime; const unsigned int itime = timeSegment(t, ftime);
+        if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+          return slerp(local2world[itime+0],local2world[itime+1],ftime);
+        return lerp(local2world[itime+0],local2world[itime+1],ftime);
+      }
+      return getLocal2World();
     }
 
     __forceinline AffineSpace3fa getWorld2Local() const {
@@ -143,7 +147,9 @@ namespace embree
     }
 
     __forceinline AffineSpace3fa getWorld2Local(float t) const {
-      return rcp(getLocal2World(t));
+      if (numTimeSegments() > 0)
+        return rcp(getLocal2World(t));
+      return getWorld2Local();
     }
 
     template<int K>
@@ -154,6 +160,10 @@ namespace embree
       return getWorld2LocalLerp<K>(valid, t);
     }
 
+    __forceinline float projectedPrimitiveArea(const size_t i) const {
+      return area(bounds(i)); // area() of the bounds of primitive i
+    }
+
     private:
 
     template<int K>
@@ -220,7 +230,11 @@ namespace embree
       InstanceISA (Device* device)
         : Instance(device) {}
 
-      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { // NOTE(review): no 'override' here, unlike GridMeshISA::vlinearBounds - confirm
+        return linearBounds(primID,time_range); // plain forwarder
+      }
+
+      PrimInfo createPrimRefArray(PrimRef* prims, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         assert(r.begin() == 0);
         assert(r.end()   == 1);
@@ -252,7 +266,23 @@ namespace embree
         prims[k++] = prim;
         return pinfo;
       }
-      
+
+      PrimInfo createPrimRefArrayMB(PrimRef* prims, const BBox1f& time_range, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        assert(r.begin() == 0); // the range is expected to be exactly [0,1)
+        assert(r.end()   == 1);
+
+        PrimInfo pinfo(empty);
+        const BBox1f t0t1 = intersect(getTimeRange(), time_range);
+        if (t0t1.empty()) return pinfo; // no overlap with getTimeRange(): nothing is written to prims
+        
+        const BBox3fa bounds = linearBounds(0, t0t1).bounds();
+        const PrimRef prim(bounds, geomID, unsigned(0));
+        pinfo.add_center2(prim);
+        prims[k++] = prim;
+        return pinfo;
+      }
+
       PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         assert(r.begin() == 0);
diff --git a/thirdparty/embree/kernels/common/scene_instance_array.h b/thirdparty/embree/kernels/common/scene_instance_array.h
new file mode 100644
index 0000000000..3cf4d68feb
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_instance_array.h
@@ -0,0 +1,385 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "accel.h"
+
+namespace embree
+{
+  struct MotionDerivativeCoefficients;
+
+  /*! Instanced acceleration structure */
+  struct InstanceArray : public Geometry
+  {
+    //ALIGNED_STRUCT_(16);
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE_ARRAY;
+
+  public:
+    InstanceArray (Device* device, unsigned int numTimeSteps = 1);
+    ~InstanceArray();
+
+  private:
+    InstanceArray (const InstanceArray& other) DELETED; // do not implement
+    InstanceArray& operator= (const InstanceArray& other) DELETED; // do not implement
+
+  private:
+    LBBox3fa nonlinearBounds(size_t i,
+                             const BBox1f& time_range_in,
+                             const BBox1f& geom_time_range,
+                             float geom_time_segments) const;
+
+    BBox3fa boundSegment(size_t i, size_t itime,
+      BBox3fa const& obbox0, BBox3fa const& obbox1,
+      BBox3fa const& bbox0, BBox3fa const& bbox1,
+      float t_min, float t_max) const;
+
+    /* calculates the bounds of the i'th instance for interpolation parameter f between time steps itime0 and itime1: the interpolated object bounds are transformed by the interpolated transformation */
+    __forceinline BBox3fa bounds(size_t i, size_t itime0, size_t itime1, float f) const
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) // quaternion instances interpolate the transformation with slerp, all others with lerp
+        return xfmBounds(slerp(l2w(i, itime0), l2w(i, itime1), f),
+                         lerp(getObjectBounds(i, itime0), getObjectBounds(i, itime1), f));
+      return xfmBounds(lerp(l2w(i, itime0), l2w(i, itime1), f),
+                        lerp(getObjectBounds(i, itime0), getObjectBounds(i, itime1), f));
+    }
+
+  public:
+
+    virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) override;
+    virtual void* getBuffer(RTCBufferType type, unsigned int slot) override;
+    virtual void updateBuffer(RTCBufferType type, unsigned int slot) override;
+
+    virtual void setNumTimeSteps (unsigned int numTimeSteps) override;
+    virtual void setInstancedScene(const Ref<Scene>& scene) override;
+    virtual void setInstancedScenes(const RTCScene* scenes, size_t numScenes) override;
+    virtual AffineSpace3fa getTransform(size_t, float time) override;
+    virtual void setMask (unsigned mask) override;
+    virtual void build() {}
+    virtual void addElementsToCount (GeometryCounts & counts) const override;
+    virtual void commit() override;
+
+  public:
+
+     /*! calculates the bounds of the i'th instance from the transformation of time step 0 and the bounds of its instanced object; returns a default-constructed BBox3fa if the instance is not valid */
+    __forceinline BBox3fa bounds(size_t i) const {
+      if (!valid(i))
+        return BBox3fa();
+
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) // quaternion decompositions are converted to an affine space before transforming the bounds
+        return xfmBounds(quaternionDecompositionToAffineSpace(l2w(i, 0)),getObject(i)->bounds.bounds());
+      return xfmBounds(l2w(i, 0),getObject(i)->bounds.bounds());
+    }
+
+    /*! gets the bounds of the object instanced by the i'th instance, queried via getBounds(timeStep(itime)); returns a default-constructed BBox3fa if the instance is not valid */
+    __forceinline BBox3fa getObjectBounds(size_t i, size_t itime) const {
+      if (!valid(i))
+        return BBox3fa();
+
+      return getObject(i)->getBounds(timeStep(itime));
+    }
+
+     /*! calculates the bounds of the i'th instance at time step itime; returns a default-constructed BBox3fa if the instance is not valid */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const {
+      if (!valid(i))
+        return BBox3fa();
+
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) // quaternion decompositions are converted to an affine space before transforming the bounds
+        return xfmBounds(quaternionDecompositionToAffineSpace(l2w(i, itime)),getObjectBounds(i, itime));
+      return xfmBounds(l2w(i, itime),getObjectBounds(i, itime));
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range; returns a default-constructed LBBox3fa if the primitive is not valid */
+    __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const {
+      if (!valid(i))
+        return LBBox3fa();
+
+      LBBox3fa lbbox = nonlinearBounds(i, dt, time_range, fnumTimeSegments); // delegates to nonlinearBounds with the members time_range and fnumTimeSegments
+      return lbbox;
+    }
+
+    /*! calculates the build bounds of the i'th item and stores them in *bbox when bbox is non-null; returns false if the item is not valid or its bounds fail the isvalid test */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      if (!valid(i))
+        return false;
+
+      const BBox3fa b = bounds(i);
+      if (bbox) *bbox = b; // written even when the isvalid test below fails
+      return isvalid(b);
+    }
+
+     /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid; bbox is left untouched when valid(i) fails */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      if (!valid(i))
+        return false;
+
+      const LBBox3fa bounds = linearBounds(i,itime); // NOTE(review): linearBounds takes a BBox1f time range but receives the size_t itime here - confirm the implicit conversion is intended
+      bbox = bounds.bounds ();
+      return isvalid(bounds);
+    }
+
+    /* gets version info of topology (the number of primitives is used as the version) */
+    unsigned int getTopologyVersion() const {
+      return numPrimitives;
+    }
+  
+    /* returns true if topology changed, i.e. the number of primitives differs from otherVersion */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return numPrimitives != otherVersion;
+    }
+
+    /*! check if the i'th primitive is valid: always true when the single-object fast path is set, otherwise the object id of the primitive must not be (unsigned int)(-1) */
+    __forceinline bool valid(size_t i) const
+    {
+      if (object) return true;
+      return (object_ids[i] != (unsigned int)(-1));
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range, i.e. its bounds pass isvalid at every time step of the inclusive range [begin,end] */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) // end() is included
+        if (!isvalid(bounds(i,itime))) return false;
+
+      return true;
+    }
+
+    __forceinline AffineSpace3fa getLocal2World(size_t i) const // local-to-world transformation of the i'th instance at time step 0
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) // quaternion decompositions are converted to an affine space
+        return quaternionDecompositionToAffineSpace(l2w(i,0));
+      return l2w(i, 0);
+    }
+
+    __forceinline AffineSpace3fa getLocal2World(size_t i, float t) const // local-to-world transformation of the i'th instance at time t
+    {
+      if (numTimeSegments() > 0) { // interpolate between time steps itime and itime+1 as reported by timeSegment
+        float ftime; const unsigned int itime = timeSegment(t, ftime);
+        if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) // slerp for quaternion instances, lerp otherwise
+          return slerp(l2w(i, itime+0),l2w(i, itime+1),ftime);
+        return lerp(l2w(i, itime+0),l2w(i, itime+1),ftime);
+      }
+      return getLocal2World(i); // no time segments: use the transformation of time step 0
+    }
+
+    __forceinline AffineSpace3fa getWorld2Local(size_t i) const { // world-to-local transformation of the i'th instance at time step 0
+      return rcp(getLocal2World(i));
+    }
+
+    __forceinline AffineSpace3fa getWorld2Local(size_t i, float t) const { // world-to-local transformation of the i'th instance at time t
+      return rcp(getLocal2World(i, t));
+    }
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2Local(size_t i, const vbool<K>& valid, const vfloat<K>& t) const // K-wide variant taking a lane mask and per-lane times
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) // quaternion instances use the slerp path, all others the lerp path
+        return getWorld2LocalSlerp<K>(i, valid, t);
+      return getWorld2LocalLerp<K>(i, valid, t);
+    }
+
+    __forceinline float projectedPrimitiveArea(const size_t i) const { // returns area() of the time-step-0 bounds of the i'th instance
+      return area(bounds(i));
+    }
+
+    inline Accel* getObject(size_t i) const { // returns the acceleration structure instanced by the i'th primitive, or nullptr if its object id is (unsigned int)(-1)
+      if (object) { // fast path: a single object is shared by all primitives
+        return object;
+      }
+
+      assert(objects);
+      assert(i < numPrimitives);
+      if (object_ids[i] == (unsigned int)(-1)) // this id marks a primitive without object
+        return nullptr;
+
+      assert(object_ids[i] < numObjects);
+      return objects[object_ids[i]]; // object_ids maps the primitive to an entry of the objects array
+    }
+
+    private:
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2LocalSlerp(size_t i, const vbool<K>& valid, const vfloat<K>& t) const // K-wide world-to-local transformation of the i'th instance, slerp-interpolated at the per-lane times t
+    {
+      vfloat<K> ftime;
+      const vint<K> itime_k = timeSegment<K>(t, ftime); // per-lane time segment index (itime_k) and interpolation parameter (ftime)
+      assert(any(valid));
+      const size_t index = bsf(movemask(valid)); // pick one lane that is set in valid
+      const int itime = itime_k[index];
+      if (likely(all(valid, itime_k == vint<K>(itime)))) { // fast path: all valid lanes share the same time segment
+        return rcp(slerp(AffineSpace3vff<K>(l2w(i, itime+0)),
+                         AffineSpace3vff<K>(l2w(i, itime+1)),
+                         ftime));
+      }
+      else {
+        AffineSpace3vff<K> space0,space1;
+        vbool<K> valid1 = valid;
+        while (any(valid1)) { // slow path: one iteration per distinct time segment among the remaining lanes
+          vbool<K> valid2;
+          const int itime = next_unique(valid1, itime_k, valid2); // presumably removes the lanes of the next distinct itime from valid1 and reports them in valid2 - verify against next_unique
+          space0 = select(valid2, AffineSpace3vff<K>(l2w(i, itime+0)), space0);
+          space1 = select(valid2, AffineSpace3vff<K>(l2w(i, itime+1)), space1);
+        }
+        return rcp(slerp(space0, space1, ftime));
+      }
+    }
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2LocalLerp(size_t i, const vbool<K>& valid, const vfloat<K>& t) const // K-wide world-to-local transformation of the i'th instance, linearly interpolated at the per-lane times t
+    {
+      vfloat<K> ftime;
+      const vint<K> itime_k = timeSegment<K>(t, ftime); // per-lane time segment index (itime_k) and interpolation parameter (ftime)
+      assert(any(valid));
+      const size_t index = bsf(movemask(valid)); // pick one lane that is set in valid
+      const int itime = itime_k[index];
+      if (likely(all(valid, itime_k == vint<K>(itime)))) { // fast path: all valid lanes share the same time segment
+        return rcp(lerp(AffineSpace3vf<K>((AffineSpace3fa)l2w(i, itime+0)),
+                        AffineSpace3vf<K>((AffineSpace3fa)l2w(i, itime+1)),
+                        ftime));
+      } else {
+        AffineSpace3vf<K> space0,space1;
+        vbool<K> valid1 = valid;
+        while (any(valid1)) { // slow path: one iteration per distinct time segment among the remaining lanes
+          vbool<K> valid2;
+          const int itime = next_unique(valid1, itime_k, valid2); // presumably removes the lanes of the next distinct itime from valid1 and reports them in valid2 - verify against next_unique
+          space0 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)l2w(i, itime+0)), space0);
+          space1 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)l2w(i, itime+1)), space1);
+        }
+        return rcp(lerp(space0, space1, ftime));
+      }
+    }
+
+  private:
+
+    __forceinline AffineSpace3ff l2w(size_t i, size_t itime) const { // reads the transformation of the i'th instance at time step itime from l2w_buf, converting from the format of that buffer
+      if (l2w_buf[itime].getFormat() == RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR) { // element is reinterpreted as an AffineSpace3ff and copied
+        return *(AffineSpace3ff*)(l2w_buf[itime].getPtr(i));
+      }
+      else if(l2w_buf[itime].getFormat() == RTC_FORMAT_QUATERNION_DECOMPOSITION) { // repack the decomposition fields into the components of an AffineSpace3ff
+        AffineSpace3ff transform;
+        QuaternionDecomposition* qd = (QuaternionDecomposition*)l2w_buf[itime].getPtr(i);
+        transform.l.vx.x = qd->scale_x;
+        transform.l.vy.y = qd->scale_y;
+        transform.l.vz.z = qd->scale_z;
+        transform.l.vy.x = qd->skew_xy;
+        transform.l.vz.x = qd->skew_xz;
+        transform.l.vz.y = qd->skew_yz;
+        transform.l.vx.y = qd->translation_x;
+        transform.l.vx.z = qd->translation_y;
+        transform.l.vy.z = qd->translation_z;
+        transform.p.x    = qd->shift_x;
+        transform.p.y    = qd->shift_y;
+        transform.p.z    = qd->shift_z;
+        // normalize quaternion
+        Quaternion3f q(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k);
+        q = normalize(q);
+        transform.l.vx.w = q.i; // the normalized quaternion is stored in the w components
+        transform.l.vy.w = q.j;
+        transform.l.vz.w = q.k;
+        transform.p.w    = q.r;
+        return transform;
+      }
+      else if (l2w_buf[itime].getFormat() == RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR) { // element is read as an AffineSpace3f and converted to AffineSpace3ff
+        AffineSpace3f* l2w = reinterpret_cast<AffineSpace3f*>(l2w_buf[itime].getPtr(i));
+        return AffineSpace3ff(*l2w);
+      }
+      else if (l2w_buf[itime].getFormat() == RTC_FORMAT_FLOAT3X4_ROW_MAJOR) { // 12 floats stored row by row: each row supplies one component of vx, vy, vz and p
+        float* data = reinterpret_cast<float*>(l2w_buf[itime].getPtr(i));
+        AffineSpace3f l2w;
+        l2w.l.vx.x = data[0]; l2w.l.vy.x = data[1]; l2w.l.vz.x = data[2]; l2w.p.x = data[3];
+        l2w.l.vx.y = data[4]; l2w.l.vy.y = data[5]; l2w.l.vz.y = data[6]; l2w.p.y = data[7];
+        l2w.l.vx.z = data[8]; l2w.l.vy.z = data[9]; l2w.l.vz.z = data[10]; l2w.p.z = data[11];
+        return l2w;
+      }
+      assert(false); // none of the handled buffer formats matched
+      return AffineSpace3ff();
+    }
+
+    inline AffineSpace3ff l2w(size_t i) const { // transformation of the i'th instance at time step 0
+      return l2w(i, 0);
+    }
+
+  private:
+    Accel* object;                   //!< fast path if only one scene is instanced
+    Accel** objects;
+    uint32_t numObjects;
+    Device::vector<RawBufferView> l2w_buf = device; //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition)
+    BufferView<uint32_t> object_ids; //!< array of scene ids per instance array primitive
+  };
+
+  namespace isa
+  {
+    struct InstanceArrayISA : public InstanceArray // adds the PrimRef creation routines on top of InstanceArray
+    {
+      InstanceArrayISA (Device* device)
+        : InstanceArray(device) {}
+
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { // forwards to linearBounds
+        return linearBounds(primID,time_range);
+      }
+
+      PrimInfo createPrimRefArray(PrimRef* prims, const range<size_t>& r, size_t k, unsigned int geomID) const // writes a PrimRef for each instance of r with valid build bounds to prims, starting at index k
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j, &bounds) || !valid(j)) // NOTE(review): buildBounds already tests valid(j), so the second test looks redundant
+            continue;
+          const PrimRef prim(bounds, geomID, unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const // same as above, using buildBounds(j, itime, bounds) for the bounds
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          BBox3fa bounds = empty;
+          if (!buildBounds(j, itime, bounds))
+            continue;
+          const PrimRef prim(bounds, geomID, unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      PrimInfo createPrimRefArrayMB(PrimRef* prims, const BBox1f& time_range, const range<size_t>& r, size_t k, unsigned int geomID) const // writes a PrimRef bounding each instance of r over time_range to prims, starting at index k
+      {
+        PrimInfo pinfo(empty);
+        const BBox1f t0t1 = BBox1f::intersect(getTimeRange(), time_range); // clip the requested range against getTimeRange()
+        if (t0t1.empty()) return pinfo; // no overlap: nothing is written
+
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          LBBox3fa lbounds = linearBounds(j, t0t1);
+          if (!isvalid(lbounds.bounds())) // skip instances whose bounds fail the isvalid test
+            continue;
+          const PrimRef prim(lbounds.bounds(), geomID, unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const // writes a PrimRefMB with the linear bounds over t0t1 for each instance of r that is valid on all time steps of t0t1
+      {
+        PrimInfoMB pinfo(empty);
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          if (!valid(j, timeSegmentRange(t0t1)))
+            continue;
+          const PrimRefMB prim(linearBounds(j, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, unsigned(j));
+          pinfo.add_primref(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(InstanceArray*, createInstanceArray, Device*);
+}
diff --git a/thirdparty/embree/kernels/common/scene_line_segments.h b/thirdparty/embree/kernels/common/scene_line_segments.h
index 3c9fdb39db..e58fd1b7eb 100644
--- a/thirdparty/embree/kernels/common/scene_line_segments.h
+++ b/thirdparty/embree/kernels/common/scene_line_segments.h
@@ -84,6 +84,14 @@ namespace embree
       return segments[i];
     }
 
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+    /*! returns the i'th segment */
+    template<int M>
+    __forceinline const vuint<M> vsegment(const vuint<M>& i) const {
+      return segments[i.v];
+    }
+#endif
+
     /*! returns the segment to the left of the i'th segment */
     __forceinline bool segmentLeftExists(size_t i) const {
       assert (flags);
@@ -136,6 +144,219 @@ namespace embree
       return vertices[itime][i].w;
     }
 
+    /*! gathers the two end points of the segment starting at vertex vid (vertices vid and vid+1) */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, unsigned int vid) const
+    {
+      p0 = vertex(vid+0);
+      p1 = vertex(vid+1);
+    }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+    template<int M>
+    __forceinline void vgather(Vec4vf<M>& p0, Vec4vf<M>& p1, const vuint<M>& vid) const
+    {
+      p0 = vertex(vid.v+0);
+      p1 = vertex(vid.v+1);
+    }
+#endif
+
+    /*! gathers the curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, unsigned int vid, size_t itime) const
+    {
+      p0 = vertex(vid+0,itime);
+      p1 = vertex(vid+1,itime);
+    }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+    template<int M>
+    __forceinline void vgather(Vec4vf<M>& p0, Vec4vf<M>& p1, const vuint<M>& vid, const vint<M>& itime) const
+    {
+      p0 = vertex(vid.v+0,itime.v);
+      p1 = vertex(vid.v+1,itime.v);
+    }
+#endif
+
+     /*! loads the two end points of the segment starting at vertex vid for the specified time by blending time steps itime and itime+1 */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, unsigned int vid, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime); // time step index and interpolation parameter for time
+
+      const float t0 = 1.0f - ftime; // weight of time step itime
+      const float t1 = ftime;        // weight of time step itime+1
+      Vec3ff a0,a1; gather(a0,a1,vid,itime);
+      Vec3ff b0,b1; gather(b0,b1,vid,itime+1);
+      p0 = madd(Vec3ff(t0),a0,t1*b0);
+      p1 = madd(Vec3ff(t0),a1,t1*b1);
+    }
+
+    /*! loads curve vertices for specified time for mblur and non-mblur case: time is ignored when hasMotionBlur() is false */
+    __forceinline void gather_safe(Vec3ff& p0, Vec3ff& p1, unsigned int vid, float time) const
+    {
+      if (hasMotionBlur()) gather(p0,p1,vid,time);
+      else                 gather(p0,p1,vid);
+    }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+    template<int M>
+    __forceinline void vgather(Vec4vf<M>& p0, Vec4vf<M>& p1, const vuint<M>& vid, const vfloat<M>& time) const
+    {
+      vfloat<M> ftime;
+      const vint<M> itime = timeSegment<M>(time, ftime);
+
+      const vfloat<M> t0 = 1.0f - ftime;
+      const vfloat<M> t1 = ftime;
+      Vec4vf<M> a0,a1; vgather<M>(a0,a1,vid,itime);
+      Vec4vf<M> b0,b1; vgather<M>(b0,b1,vid,itime+1);
+      p0 = madd(Vec4vf<M>(t0),a0,t1*b0);
+      p1 = madd(Vec4vf<M>(t0),a1,t1*b1);
+    }
+#endif
+    
+    /*! gathers the two end points of the cone segment starting at vertex vid; cL/cR are set to true when segment primID has no left/right neighboring segment */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, bool& cL, bool& cR, unsigned int primID, unsigned int vid) const
+    {
+      gather(p0,p1,vid);
+      cL = !segmentLeftExists (primID);
+      cR = !segmentRightExists(primID);
+    }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+    template<int M>
+    __forceinline void vgather(Vec4vf<M>& p0, Vec4vf<M>& p1, vbool<M>& cL, vbool<M>& cR, const vuint<M>& primID, const vuint<M>& vid) const
+    {
+      vgather<M>(p0,p1,vid);
+      cL = !segmentLeftExists (primID.v);
+      cR = !segmentRightExists(primID.v);
+    }
+#endif
+
+    /*! gathers the cone curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, bool& cL, bool& cR, unsigned int primID, size_t vid, size_t itime) const
+    {
+      gather(p0,p1,vid,itime);
+      cL = !segmentLeftExists (primID);
+      cR = !segmentRightExists(primID);
+    }
+
+     /*! loads cone curve vertices for specified time */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, bool& cL, bool& cR, unsigned int primID, size_t vid, float time) const
+    {
+      gather(p0,p1,vid,time);
+      cL = !segmentLeftExists (primID);
+      cR = !segmentRightExists(primID);
+    }
+
+    /*! loads cone curve vertices for specified time for mblur and non-mblur geometry */
+    __forceinline void gather_safe(Vec3ff& p0, Vec3ff& p1, bool& cL, bool& cR, unsigned int primID, size_t vid, float time) const
+    {
+      if (hasMotionBlur()) gather(p0,p1,cL,cR,primID,vid,time);
+      else                 gather(p0,p1,cL,cR,primID,vid);
+    }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+    template<int M>
+    __forceinline void vgather(Vec4vf<M>& p0, Vec4vf<M>& p1, vbool<M>& cL, vbool<M>& cR, const vuint<M>& primID, const vuint<M>& vid, const vfloat<M>& time) const
+    {
+      vgather<M>(p0,p1,vid,time);
+      cL = !segmentLeftExists (primID.v);
+      cR = !segmentRightExists(primID.v);
+    }
+#endif
+
+    /*! gathers the end points p0/p1 of the segment starting at vertex vid plus the neighboring vertices p2 (vid-1) and p3 (vid+2); p2/p3 are set to inf when segment primID has no left/right neighbor */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, unsigned int primID, size_t vid) const
+    {
+      p0 = vertex(vid+0);
+      p1 = vertex(vid+1);
+      p2 = segmentLeftExists (primID) ? vertex(vid-1) : Vec3ff(inf);
+      p3 = segmentRightExists(primID) ? vertex(vid+2) : Vec3ff(inf);
+    }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+    template<int M>
+    __forceinline void vgather(Vec4vf<M>& p0, Vec4vf<M>& p1, Vec4vf<M>& p2, Vec4vf<M>& p3, const vuint<M>& primID, const vuint<M>& vid) const // vector-typed counterpart of the 4-point gather, compiled only for the SYCL device
+    {
+      p0 = vertex(vid.v+0);
+      p1 = vertex(vid.v+1);
+      vbool<M> left  = segmentLeftExists (primID.v);
+      vbool<M> right = segmentRightExists(primID.v);
+      vuint<M> i2 = select(left, vid-1,vid+0); // without a left neighbor load vertex vid instead of vid-1
+      vuint<M> i3 = select(right,vid+2,vid+1); // without a right neighbor load vertex vid+1 instead of vid+2
+      p2 = vertex(i2.v);
+      p3 = vertex(i3.v);
+      p2 =  select(left, p2,Vec4vf<M>(inf)); // the substitute loads are then replaced by inf
+      p3 =  select(right,p3,Vec4vf<M>(inf));
+    }
+#endif
+
+     /*! gathers the curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, unsigned int primID, size_t vid, size_t itime) const
+    {
+      p0 = vertex(vid+0,itime);
+      p1 = vertex(vid+1,itime);
+      p2 = segmentLeftExists (primID) ? vertex(vid-1,itime) : Vec3ff(inf);
+      p3 = segmentRightExists(primID) ? vertex(vid+2,itime) : Vec3ff(inf);
+    }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+    template<int M>
+    __forceinline void vgather(Vec4vf<M>& p0, Vec4vf<M>& p1, Vec4vf<M>& p2, Vec4vf<M>& p3, const vuint<M>& primID, const vuint<M>& vid, const vint<M>& itime) const
+    {
+      p0 = vertex(vid.v+0, itime.v);
+      p1 = vertex(vid.v+1, itime.v);
+      vbool<M> left  = segmentLeftExists (primID.v);
+      vbool<M> right = segmentRightExists(primID.v);
+      vuint<M> i2 = select(left, vid-1,vid+0);
+      vuint<M> i3 = select(right,vid+2,vid+1);
+      p2 = vertex(i2.v, itime.v);
+      p3 = vertex(i3.v, itime.v);
+      p2 =  select(left, p2,Vec4vf<M>(inf));
+      p3 =  select(right,p3,Vec4vf<M>(inf));
+    }
+#endif
+    
+     /*! loads the four curve points (end points and left/right neighbors) for the specified time by blending time steps itime and itime+1 */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, unsigned int primID, size_t vid, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime); // time step index and interpolation parameter for time
+
+      const float t0 = 1.0f - ftime; // weight of time step itime
+      const float t1 = ftime;        // weight of time step itime+1
+      Vec3ff a0,a1,a2,a3; gather(a0,a1,a2,a3,primID,vid,itime);
+      Vec3ff b0,b1,b2,b3; gather(b0,b1,b2,b3,primID,vid,itime+1);
+      p0 = madd(Vec3ff(t0),a0,t1*b0);
+      p1 = madd(Vec3ff(t0),a1,t1*b1);
+      p2 = madd(Vec3ff(t0),a2,t1*b2);
+      p3 = madd(Vec3ff(t0),a3,t1*b3);
+    }
+
+    /*! loads curve vertices for specified time for mblur and non-mblur geometry */
+    __forceinline void gather_safe(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, unsigned int primID, size_t vid, float time) const
+    {
+      if (hasMotionBlur()) gather(p0,p1,p2,p3,primID,vid,time);
+      else                 gather(p0,p1,p2,p3,primID,vid);
+    }
+
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+    template<int M>
+    __forceinline void vgather(Vec4vf<M>& p0, Vec4vf<M>& p1, Vec4vf<M>& p2, Vec4vf<M>& p3, const vuint<M>& primID, const vuint<M>& vid, const vfloat<M>& time) const
+    {
+      vfloat<M> ftime;
+      const vint<M> itime = timeSegment<M>(time, ftime);
+
+      const vfloat<M> t0 = 1.0f - ftime;
+      const vfloat<M> t1 = ftime;
+      Vec4vf<M> a0,a1,a2,a3; vgather<M>(a0,a1,a2,a3,primID,vid,itime);
+      Vec4vf<M> b0,b1,b2,b3; vgather<M>(b0,b1,b2,b3,primID,vid,itime+1);
+      p0 = madd(Vec4vf<M>(t0),a0,t1*b0);
+      p1 = madd(Vec4vf<M>(t0),a1,t1*b1);
+      p2 = madd(Vec4vf<M>(t0),a2,t1*b2);
+      p3 = madd(Vec4vf<M>(t0),a3,t1*b3);
+    }
+#endif
+    
     /*! calculates bounding box of i'th line segment */
     __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const
     {
@@ -183,6 +404,18 @@ namespace embree
       return bounds(w0,w1);
     }
 
+    /*! calculates bounding box of i'th segment at time step itime after subtracting ofs, scaling by scale and transforming into space; radii are scaled by maxRadiusScale*r_scale0*scale */
+    __forceinline BBox3fa bounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const
+    {
+      const float r_scale = r_scale0*scale;
+      const unsigned int index = segment(i); // index of the first vertex of the segment
+      const Vec3ff v0 = vertex(index+0,itime);
+      const Vec3ff v1 = vertex(index+1,itime);
+      const Vec3ff w0(xfmVector(space,(v0-ofs)*Vec3fa(scale)),maxRadiusScale*v0.w*r_scale); // the w component carries the scaled radius
+      const Vec3ff w1(xfmVector(space,(v1-ofs)*Vec3fa(scale)),maxRadiusScale*v1.w*r_scale);
+      return bounds(w0,w1);
+    }     
+
     /*! check if the i'th primitive is valid at the itime'th timestep */
     __forceinline bool valid(size_t i, size_t itime) const {
       return valid(i, make_range(itime, itime));
@@ -193,13 +426,17 @@ namespace embree
     {
       const unsigned int index = segment(i);
       if (index+1 >= numVertices()) return false;
-      
+
+#if !defined(__SYCL_DEVICE_ONLY__)
+
       for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
       {
         const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false;
         const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false;
         if (min(v0.w,v1.w) < 0.0f) return false;
       }
+#endif
+      
       return true;
     }
 
@@ -235,6 +472,11 @@ namespace embree
     }
 
     /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(ofs, scale, r_scale0, space, primID, itime); }, dt, this->time_range, fnumTimeSegments);
+    }
+    
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
     __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
     {
       if (!valid(i, timeSegmentRange(time_range))) return false;
@@ -252,9 +494,9 @@ namespace embree
     BufferView<Vec3ff> vertices0;           //!< fast access to first vertex buffer
     BufferView<Vec3fa> normals0;            //!< fast access to first normal buffer
     BufferView<char> flags;                 //!< start, end flag per segment
-    vector<BufferView<Vec3ff>> vertices;    //!< vertex array for each timestep
-    vector<BufferView<Vec3fa>> normals;     //!< normal array for each timestep
-    vector<BufferView<char>> vertexAttribs; //!< user buffers
+    Device::vector<BufferView<Vec3ff>> vertices = device;    //!< vertex array for each timestep
+    Device::vector<BufferView<Vec3fa>> normals = device;     //!< normal array for each timestep
+    Device::vector<BufferView<char>> vertexAttribs = device; //!< user buffers
     int tessellationRate;                   //!< tessellation rate for bezier curve
     float maxRadiusScale = 1.0;             //!< maximal min-width scaling of curve radii
   };
@@ -266,6 +508,28 @@ namespace embree
       LineSegmentsISA (Device* device, Geometry::GType gtype)
         : LineSegments(device,gtype) {}
 
+      LinearSpace3fa computeAlignedSpace(const size_t primID) const // returns frame() of the normalized direction of segment primID
+      {
+        const Vec3fa dir = normalize(computeDirection(primID));
+        if (is_finite(dir)) return frame(dir);
+        else return LinearSpace3fa(one); // fallback when the normalized direction is not finite
+      }
+
+      LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const // returns frame() of the normalized direction of segment primID, taken at the middle of the time segment range of time_range
+      {
+        Vec3fa axisz(0,0,1);
+        Vec3fa axisy(0,1,0); // NOTE(review): axisy is never used in this function
+
+        const range<int> tbounds = this->timeSegmentRange(time_range);
+        if (tbounds.size() == 0) return frame(axisz); // empty time segment range: use the frame of the z axis
+        
+        const size_t itime = (tbounds.begin()+tbounds.end())/2; // midpoint of the time segment range
+
+        const Vec3fa dir = normalize(computeDirection(primID,itime));
+        if (is_finite(dir)) return frame(dir);
+        else return LinearSpace3fa(one); // fallback when the normalized direction is not finite
+      }     
+
       Vec3fa computeDirection(unsigned int primID) const
       {
         const unsigned vtxID = segment(primID);
@@ -282,7 +546,7 @@ namespace embree
         return v1-v0;
       }
 
-      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      PrimInfo createPrimRefArray(PrimRef* prims, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         PrimInfo pinfo(empty);
         for (size_t j=r.begin(); j<r.end(); j++)
@@ -309,7 +573,24 @@ namespace embree
         }
         return pinfo;
       }
-      
+
+      PrimInfo createPrimRefArrayMB(PrimRef* prims, const BBox1f& time_range, const range<size_t>& r, size_t k, unsigned int geomID) const // writes a PrimRef bounding each segment of r over time_range to prims, starting at index k
+      {
+        PrimInfo pinfo(empty);
+        const BBox1f t0t1 = BBox1f::intersect(getTimeRange(), time_range); // clip the requested range against getTimeRange()
+        if (t0t1.empty()) return pinfo; // no overlap: nothing is written
+        
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          LBBox3fa lbounds = empty;
+          if (!linearBounds(j, t0t1, lbounds)) // segments for which linearBounds reports failure are skipped
+            continue;
+          const PrimRef prim(lbounds.bounds(), geomID, unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }
+
       PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         PrimInfoMB pinfo(empty);
@@ -331,6 +612,10 @@ namespace embree
         return bounds(space,i);
       }
 
+       BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const {
+        return bounds(ofs,scale,r_scale0,space,i,itime);
+      }
+
       LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
         return linearBounds(primID,time_range);
       }
@@ -338,6 +623,10 @@ namespace embree
       LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
         return linearBounds(space,primID,time_range);
       }
+
+       LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+        return linearBounds(ofs,scale,r_scale0,space,primID,time_range);
+      }
     };
   }
 
diff --git a/thirdparty/embree/kernels/common/scene_points.h b/thirdparty/embree/kernels/common/scene_points.h
index 017e098a51..937a8f1806 100644
--- a/thirdparty/embree/kernels/common/scene_points.h
+++ b/thirdparty/embree/kernels/common/scene_points.h
@@ -68,6 +68,25 @@ namespace embree
       return vertices[itime][i];
     }
 
+    /*! returns i'th vertex for the specified time by blending time steps itime and itime+1 */
+    __forceinline Vec3ff vertex(size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime); // time step index and interpolation parameter for time
+      const float t0 = 1.0f - ftime; // weight of time step itime
+      const float t1 = ftime;        // weight of time step itime+1
+      Vec3ff v0 = vertex(i, itime+0);
+      Vec3ff v1 = vertex(i, itime+1);
+      return madd(Vec3ff(t0),v0,t1*v1);
+    }
+
+    /*! returns i'th vertex for the specified time for mblur and non-mblur geometry: time is ignored when hasMotionBlur() is false */
+    __forceinline Vec3ff vertex_safe(size_t i, float time) const
+    {
+      if (hasMotionBlur()) return vertex(i,time);
+      else                 return vertex(i);
+    }
+
     /*! returns i'th vertex of itime'th timestep */
     __forceinline const char* vertexPtr(size_t i, size_t itime) const {
       return vertices[itime].getPtr(i);
@@ -78,11 +97,49 @@ namespace embree
       return normals[itime][i];
     }
 
+    /*! returns i'th normal for the specified time, blended between time steps itime and itime+1 chosen by timeSegment() */
+    __forceinline Vec3fa normal(size_t i, float time) const
+    {
+      float ftime; // written by timeSegment(); used below as the weight of time step itime+1
+      const size_t itime = timeSegment(time, ftime);
+      const float t0 = 1.0f - ftime; // weight of time step itime
+      const float t1 = ftime;        // weight of time step itime+1
+      Vec3fa n0 = normal(i, itime+0);
+      Vec3fa n1 = normal(i, itime+1);
+      return madd(Vec3fa(t0),n0,t1*n1); // t0*n0 + t1*n1, assuming madd(a,b,c) == a*b+c; the result is not renormalized here
+    }
+
+    /*! returns i'th normal for the specified time when hasMotionBlur() is true, otherwise returns normal(i) and ignores time */
+    __forceinline Vec3fa normal_safe(size_t i, float time) const
+    {
+      if (hasMotionBlur()) return normal(i,time);
+      else                 return normal(i);
+    }
+
     /*! returns i'th radius of itime'th timestep */
     __forceinline float radius(size_t i, size_t itime) const {
       return vertices[itime][i].w;
     }
 
+    /*! returns i'th radius for the specified time, blended between time steps itime and itime+1 chosen by timeSegment() */
+    __forceinline float radius(size_t i, float time) const
+    {
+      float ftime; // written by timeSegment(); used below as the weight of time step itime+1
+      const size_t itime = timeSegment(time, ftime);
+      const float t0 = 1.0f - ftime; // weight of time step itime
+      const float t1 = ftime;        // weight of time step itime+1
+      float r0 = radius(i, itime+0);
+      float r1 = radius(i, itime+1);
+      return madd(t0,r0,t1*r1); // t0*r0 + t1*r1, assuming madd(a,b,c) == a*b+c
+    }
+
+    /*! returns i'th radius for the specified time when hasMotionBlur() is true, otherwise returns radius(i) and ignores time */
+    __forceinline float radius_safe(size_t i, float time) const
+    {
+      if (hasMotionBlur()) return radius(i,time);
+      else                 return radius(i);
+    }
+
     /*! calculates bounding box of i'th line segment */
     __forceinline BBox3fa bounds(const Vec3ff& v0) const {
       return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w));
@@ -185,13 +242,18 @@ namespace embree
     __forceinline float * getCompactVertexArray () const {
       return (float*) vertices0.getPtr();
     }
+    
+    __forceinline float projectedPrimitiveArea(const size_t i) const { // returns 1 + 2*pi*R*R with R = radius(i)
+      const float R = radius(i);
+      return 1 + 2*M_PI*R*R; // NOTE(review): if M_PI is the usual double macro this evaluates in double before narrowing to float - confirm
+    }
 
    public:
     BufferView<Vec3ff> vertices0;            //!< fast access to first vertex buffer
     BufferView<Vec3fa> normals0;             //!< fast access to first normal buffer
-    vector<BufferView<Vec3ff>> vertices;     //!< vertex array for each timestep
-    vector<BufferView<Vec3fa>> normals;      //!< normal array for each timestep
-    vector<BufferView<char>> vertexAttribs;  //!< user buffers
+    Device::vector<BufferView<Vec3ff>> vertices = device;     //!< vertex array for each timestep
+    Device::vector<BufferView<Vec3fa>> normals = device;      //!< normal array for each timestep
+    Device::vector<BufferView<char>> vertexAttribs = device;  //!< user buffers
     float maxRadiusScale = 1.0;              //!< maximal min-width scaling of curve radii
   };
 
@@ -211,7 +273,7 @@ namespace embree
         return Vec3fa(1, 0, 0);
       }
 
-      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      PrimInfo createPrimRefArray(PrimRef* prims, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         PrimInfo pinfo(empty);
         for (size_t j = r.begin(); j < r.end(); j++) {
@@ -239,6 +301,23 @@ namespace embree
         return pinfo;
       }
 
+      PrimInfo createPrimRefArrayMB(PrimRef* prims, const BBox1f& time_range, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // writes PrimRefs for the primitives of range r into prims starting at index k and returns the PrimInfo accumulated over them
+        PrimInfo pinfo(empty);
+        const BBox1f t0t1 = BBox1f::intersect(getTimeRange(), time_range); // restrict the requested range to this geometry's own time range
+        if (t0t1.empty()) return pinfo; // no overlap: return the empty PrimInfo without touching prims
+        /* emit one PrimRef per primitive in r for which linearBounds() returns true */
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          LBBox3fa lbounds = empty;
+          if (!linearBounds(j, t0t1, lbounds))
+            continue; // primitive rejected by linearBounds(): it gets no PrimRef and does not advance k
+          const PrimRef prim(lbounds.bounds(), geomID, unsigned(j)); // presumably lbounds.bounds() is a single box covering the linear bounds - confirm
+          pinfo.add_center2(prim);
+          prims[k++] = prim; // k is passed by value, so the caller does not see the final write position
+        }
+        return pinfo;
+      }
+
       PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims,
                                       const BBox1f& t0t1,
                                       const range<size_t>& r,
diff --git a/thirdparty/embree/kernels/common/scene_quad_mesh.h b/thirdparty/embree/kernels/common/scene_quad_mesh.h
index bd8eeaaeb7..09a8b8ddd9 100644
--- a/thirdparty/embree/kernels/common/scene_quad_mesh.h
+++ b/thirdparty/embree/kernels/common/scene_quad_mesh.h
@@ -17,12 +17,18 @@ namespace embree
     /*! triangle indices */
     struct Quad
     {
-      uint32_t v[4];
+      Quad() {} // intentionally empty: leaves v[] uninitialized
+
+      Quad (uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3) { // stores the four indices in v[0..3] in argument order
+        v[0] = v0; v[1] = v1; v[2] = v2; v[3] = v3;
+      }
 
       /*! outputs triangle indices */
       __forceinline friend embree_ostream operator<<(embree_ostream cout, const Quad& q) {
         return cout << "Quad {" << q.v[0] << ", " << q.v[1] << ", " << q.v[2] << ", " << q.v[3] << " }";
       }
+
+      uint32_t v[4]; // the four indices of the quad
     };
 
   public:
@@ -135,6 +141,18 @@ namespace embree
       return vertices[itime].getPtr(i);
     }
 
+    /*! returns i'th vertex for the specified time, blended between time steps itime and itime+1 chosen by timeSegment() */
+    __forceinline Vec3fa vertex(size_t i, float time) const
+    {
+      float ftime; // written by timeSegment(); used below as the weight of time step itime+1
+      const size_t itime = timeSegment(time, ftime);
+      const float t0 = 1.0f - ftime; // weight of time step itime
+      const float t1 = ftime;        // weight of time step itime+1
+      Vec3fa v0 = vertex(i, itime+0);
+      Vec3fa v1 = vertex(i, itime+1);
+      return madd(Vec3fa(t0),v0,t1*v1); // t0*v0 + t1*v1, assuming madd(a,b,c) == a*b+c
+    }
+
     /*! calculates the bounds of the i'th quad */
     __forceinline BBox3fa bounds(size_t i) const 
     {
@@ -196,7 +214,7 @@ namespace embree
       if (q.v[2] >= numVertices()) return false;
       if (q.v[3] >= numVertices()) return false;
 
-      for (unsigned int t=0; t<numTimeSteps; t++)
+      for (size_t t=0; t<numTimeSteps; t++)
       {
         const Vec3fa v0 = vertex(q.v[0],t);
         const Vec3fa v1 = vertex(q.v[1],t);
@@ -279,8 +297,8 @@ namespace embree
   public:
     BufferView<Quad> quads;                 //!< array of quads
     BufferView<Vec3fa> vertices0;           //!< fast access to first vertex buffer
-    vector<BufferView<Vec3fa>> vertices;    //!< vertex array for each timestep
-    vector<BufferView<char>> vertexAttribs; //!< vertex attribute buffers
+    Device::vector<BufferView<Vec3fa>> vertices = device; //!< vertex array for each timestep
+    Device::vector<RawBufferView> vertexAttribs = device; //!< vertex attribute buffers
   };
 
   namespace isa
@@ -290,7 +308,11 @@ namespace embree
       QuadMeshISA (Device* device)
         : QuadMesh(device) {}
 
-      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
+        return linearBounds(primID,time_range); // plain forwarder to linearBounds(primID,time_range)
+      }
+
+      PrimInfo createPrimRefArray(PrimRef* prims, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         PrimInfo pinfo(empty);
         for (size_t j=r.begin(); j<r.end(); j++)
@@ -317,7 +339,24 @@ namespace embree
         }
         return pinfo;
       }
-      
+
+      PrimInfo createPrimRefArrayMB(PrimRef* prims, const BBox1f& time_range, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // writes PrimRefs for the primitives of range r into prims starting at index k and returns the PrimInfo accumulated over them
+        PrimInfo pinfo(empty);
+        const BBox1f t0t1 = BBox1f::intersect(getTimeRange(), time_range); // restrict the requested range to this geometry's own time range
+        if (t0t1.empty()) return pinfo; // no overlap: return the empty PrimInfo without touching prims
+        /* emit one PrimRef per primitive in r for which linearBounds() returns true */
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          LBBox3fa lbounds = empty;
+          if (!linearBounds(j, t0t1, lbounds))
+            continue; // primitive rejected by linearBounds(): it gets no PrimRef and does not advance k
+          const PrimRef prim(lbounds.bounds(), geomID, unsigned(j)); // presumably lbounds.bounds() is a single box covering the linear bounds - confirm
+          pinfo.add_center2(prim);
+          prims[k++] = prim; // k is passed by value, so the caller does not see the final write position
+        }
+        return pinfo;
+      }
+
       PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         PrimInfoMB pinfo(empty);
diff --git a/thirdparty/embree/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree/kernels/common/scene_subdiv_mesh.h
index 1db170196d..b213a9b7ba 100644
--- a/thirdparty/embree/kernels/common/scene_subdiv_mesh.h
+++ b/thirdparty/embree/kernels/common/scene_subdiv_mesh.h
@@ -9,11 +9,13 @@
 #include "../subdiv/tessellation_cache.h"
 #include "../subdiv/catmullclark_coefficients.h"
 #include "../subdiv/patch.h"
-#include "../../common/algorithms/parallel_map.h"
-#include "../../common/algorithms/parallel_set.h"
 
 namespace embree
 {
+  struct HoleSet;
+  struct VertexCreaseMap;
+  struct EdgeCreaseMap;
+
   class SubdivMesh : public Geometry
   {
     ALIGNED_CLASS_(16);
@@ -49,6 +51,7 @@ namespace embree
 
     /*! subdiv mesh construction */
     SubdivMesh(Device* device);
+    ~SubdivMesh();
 
   public:
     void setMask (unsigned mask);
@@ -272,7 +275,7 @@ namespace embree
     mvector<uint32_t> halfEdgeFace;
 
     /*! set with all holes */
-    parallel_set<uint32_t> holeSet;
+    std::unique_ptr<HoleSet> holeSet;
 
     /*! fast lookup table to detect invalid faces */
     mvector<char> invalid_face;
@@ -299,10 +302,10 @@ namespace embree
   private:
 
     /*! map with all vertex creases */
-    parallel_map<uint32_t,float> vertexCreaseMap;
+    std::unique_ptr<VertexCreaseMap> vertexCreaseMap;
     
     /*! map with all edge creases */
-    parallel_map<uint64_t,float> edgeCreaseMap;
+    std::unique_ptr<EdgeCreaseMap> edgeCreaseMap;
 
   protected:
     
diff --git a/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp
index 3bbd7e51ae..6cdd542a65 100644
--- a/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp
+++ b/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp
@@ -134,7 +134,7 @@ namespace embree
     Geometry::update();
   }
 
-  void TriangleMesh::commit() 
+  void TriangleMesh::commit()
   {
     /* verify that stride of all time steps are identical */
     for (unsigned int t=0; t<numTimeSteps; t++)
diff --git a/thirdparty/embree/kernels/common/scene_triangle_mesh.h b/thirdparty/embree/kernels/common/scene_triangle_mesh.h
index ad3f602fde..0d28219b96 100644
--- a/thirdparty/embree/kernels/common/scene_triangle_mesh.h
+++ b/thirdparty/embree/kernels/common/scene_triangle_mesh.h
@@ -129,6 +129,18 @@ namespace embree
       return vertices[itime].getPtr(i);
     }
 
+    /*! returns i'th vertex for the specified time, blended between time steps itime and itime+1 chosen by timeSegment() */
+    __forceinline Vec3fa vertex(size_t i, float time) const
+    {
+      float ftime; // written by timeSegment(); used below as the weight of time step itime+1
+      const size_t itime = timeSegment(time, ftime);
+      const float t0 = 1.0f - ftime; // weight of time step itime
+      const float t1 = ftime;        // weight of time step itime+1
+      Vec3fa v0 = vertex(i, itime+0);
+      Vec3fa v1 = vertex(i, itime+1);
+      return madd(Vec3fa(t0),v0,t1*v1); // t0*v0 + t1*v1, assuming madd(a,b,c) == a*b+c
+    }
+
     /*! calculates the bounds of the i'th triangle */
     __forceinline BBox3fa bounds(size_t i) const 
     {
@@ -260,8 +272,8 @@ namespace embree
   public:
     BufferView<Triangle> triangles;      //!< array of triangles
     BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
-    vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
-    vector<RawBufferView> vertexAttribs; //!< vertex attributes
+    Device::vector<BufferView<Vec3fa>> vertices = device; //!< vertex array for each timestep
+    Device::vector<RawBufferView> vertexAttribs = device; //!< vertex attributes
   };
 
   namespace isa
@@ -271,7 +283,11 @@ namespace embree
       TriangleMeshISA (Device* device)
         : TriangleMesh(device) {}
 
-      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
+        return linearBounds(primID,time_range); // plain forwarder to linearBounds(primID,time_range)
+      }
+
+      PrimInfo createPrimRefArray(PrimRef* prims, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         PrimInfo pinfo(empty);
         for (size_t j=r.begin(); j<r.end(); j++)
@@ -298,7 +314,24 @@ namespace embree
         }
         return pinfo;
       }
-      
+
+      PrimInfo createPrimRefArrayMB(PrimRef* prims, const BBox1f& time_range, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // writes PrimRefs for the primitives of range r into prims starting at index k and returns the PrimInfo accumulated over them
+        PrimInfo pinfo(empty);
+        const BBox1f t0t1 = BBox1f::intersect(getTimeRange(), time_range); // restrict the requested range to this geometry's own time range
+        if (t0t1.empty()) return pinfo; // no overlap: return the empty PrimInfo without touching prims
+        /* emit one PrimRef per primitive in r for which linearBounds() returns true */
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          LBBox3fa lbounds = empty;
+          if (!linearBounds(j, t0t1, lbounds))
+            continue; // primitive rejected by linearBounds(): it gets no PrimRef and does not advance k
+          const PrimRef prim(lbounds.bounds(), geomID, unsigned(j)); // presumably lbounds.bounds() is a single box covering the linear bounds - confirm
+          pinfo.add_center2(prim);
+          prims[k++] = prim; // k is passed by value, so the caller does not see the final write position
+        }
+        return pinfo;
+      }
+
       PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         PrimInfoMB pinfo(empty);
diff --git a/thirdparty/embree/kernels/common/scene_user_geometry.h b/thirdparty/embree/kernels/common/scene_user_geometry.h
index 2867b18b79..033476f658 100644
--- a/thirdparty/embree/kernels/common/scene_user_geometry.h
+++ b/thirdparty/embree/kernels/common/scene_user_geometry.h
@@ -21,6 +21,8 @@ namespace embree
     virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded);
     virtual void build() {}
     virtual void addElementsToCount (GeometryCounts & counts) const;
+
+    __forceinline float projectedPrimitiveArea(const size_t i) const { return 0.0f; } // always 0; the primitive index i is not used
   };
 
   namespace isa
@@ -30,7 +32,7 @@ namespace embree
       UserGeometryISA (Device* device)
         : UserGeometry(device) {}
 
-      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      PrimInfo createPrimRefArray(PrimRef* prims, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         PrimInfo pinfo(empty);
         for (size_t j=r.begin(); j<r.end(); j++)
@@ -57,7 +59,24 @@ namespace embree
         }
         return pinfo;
       }
-      
+
+      PrimInfo createPrimRefArrayMB(PrimRef* prims, const BBox1f& time_range, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // writes PrimRefs for the primitives of range r into prims starting at index k and returns the PrimInfo accumulated over them
+        PrimInfo pinfo(empty);
+        const BBox1f t0t1 = BBox1f::intersect(getTimeRange(), time_range); // restrict the requested range to this geometry's own time range
+        if (t0t1.empty()) return pinfo; // no overlap: return the empty PrimInfo without touching prims
+        /* emit one PrimRef per primitive in r for which linearBounds() returns true */
+        for (size_t j = r.begin(); j < r.end(); j++) {
+          LBBox3fa lbounds = empty;
+          if (!linearBounds(j, t0t1, lbounds))
+            continue; // primitive rejected by linearBounds(): it gets no PrimRef and does not advance k
+          const PrimRef prim(lbounds.bounds(), geomID, unsigned(j)); // presumably lbounds.bounds() is a single box covering the linear bounds - confirm
+          pinfo.add_center2(prim);
+          prims[k++] = prim; // k is passed by value, so the caller does not see the final write position
+        }
+        return pinfo;
+      }
+
       PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
       {
         PrimInfoMB pinfo(empty);
diff --git a/thirdparty/embree/kernels/common/scene_verify.cpp b/thirdparty/embree/kernels/common/scene_verify.cpp
new file mode 100644
index 0000000000..1db7844f4f
--- /dev/null
+++ b/thirdparty/embree/kernels/common/scene_verify.cpp
@@ -0,0 +1,24 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene.h"
+
+#include "../../common/algorithms/parallel_any_of.h"
+
+namespace embree
+{
+
+void Scene::checkIfModifiedAndSet ()
+{ // calls setModified() when any geometry index reports isGeometryModified(); does nothing if the scene is already modified
+  if (isModified ()) return; // already flagged, so the per-geometry scan is skipped
+
+  auto geometryIsModified = [this](size_t geomID)->bool { // predicate evaluated once per geometry index
+    return isGeometryModified(geomID);
+  };
+
+  if (parallel_any_of (size_t(0), geometries.size (), geometryIsModified)) { // presumably true if the predicate holds for any index in [0, geometries.size()) - confirm
+    setModified ();
+  }
+}
+
+}
+\ No newline at end of file
diff --git a/thirdparty/embree/kernels/common/stat.cpp b/thirdparty/embree/kernels/common/stat.cpp
index ebb77cd534..9a8c8fac4e 100644
--- a/thirdparty/embree/kernels/common/stat.cpp
+++ b/thirdparty/embree/kernels/common/stat.cpp
@@ -17,7 +17,7 @@ namespace embree
 #endif
   }
 
-  void Stat::print(std::ostream& cout)
+  void Stat::print(embree_ostream cout)
   {
     Counters& cntrs = instance.cntrs;
     Counters::Data& data = instance.cntrs.code;
diff --git a/thirdparty/embree/kernels/common/state.cpp b/thirdparty/embree/kernels/common/state.cpp
index db6b803041..1d73ae9629 100644
--- a/thirdparty/embree/kernels/common/state.cpp
+++ b/thirdparty/embree/kernels/common/state.cpp
@@ -192,10 +192,17 @@ namespace embree
   const char* symbols[3] = { "=", ",", "|" };
 
   bool State::parseFile(const FileName& fileName)
-  {
-    FILE* f = fopen(fileName.c_str(),"r");
-    if (!f) return false;
-    Ref<Stream<int> > file = new FileStream(f,fileName);
+  { 
+    Ref<Stream<int> > file;
+    // -- GODOT start --
+    // try {
+      file = new FileStream(fileName);
+    // }
+    // catch (std::runtime_error& e) {
+    //   (void) e;
+    //   return false;
+    // }
+    // -- GODOT end --
     
     std::vector<std::string> syms;
     for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++) 
@@ -393,7 +400,7 @@ namespace embree
         grid_accel = cin->get().Identifier();
       else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("="))
         grid_accel_mb = cin->get().Identifier();
-      
+
       else if (tok == Token::Id("verbose") && cin->trySymbol("="))
         verbose = cin->get().Int();
       else if (tok == Token::Id("benchmark") && cin->trySymbol("="))
@@ -419,7 +426,7 @@ namespace embree
           } while (cin->trySymbol("|"));
         }
       }
-      
+
       else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("="))
         max_spatial_split_replications = cin->get().Float();
 
diff --git a/thirdparty/embree/kernels/common/state.h b/thirdparty/embree/kernels/common/state.h
index 33bcc843b2..8c34614185 100644
--- a/thirdparty/embree/kernels/common/state.h
+++ b/thirdparty/embree/kernels/common/state.h
@@ -189,7 +189,7 @@ namespace embree
       memory_monitor_function = fptr;
       memory_monitor_userptr = uptr;
     }
-      
+
     RTCMemoryMonitorFunction memory_monitor_function;
     void* memory_monitor_userptr;
   };
diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h
index 84ac27d103..5979b543c9 100644
--- a/thirdparty/embree/kernels/config.h
+++ b/thirdparty/embree/kernels/config.h
@@ -1,22 +1,26 @@
 // Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-/* #undef EMBREE_RAY_MASK */
-/* #undef EMBREE_STAT_COUNTERS */
-/* #undef EMBREE_BACKFACE_CULLING */
-/* #undef EMBREE_BACKFACE_CULLING_CURVES */
+#include "../include/embree4/rtcore_config.h"
+
+// #cmakedefine EMBREE_RAY_MASK
+// #cmakedefine EMBREE_STAT_COUNTERS
+// #cmakedefine EMBREE_BACKFACE_CULLING
+// #cmakedefine EMBREE_BACKFACE_CULLING_CURVES
+// #cmakedefine EMBREE_BACKFACE_CULLING_SPHERES
 #define EMBREE_FILTER_FUNCTION
-/* #undef EMBREE_IGNORE_INVALID_RAYS */
+// #cmakedefine EMBREE_IGNORE_INVALID_RAYS
 #define EMBREE_GEOMETRY_TRIANGLE
-/* #undef EMBREE_GEOMETRY_QUAD */
-/* #undef EMBREE_GEOMETRY_CURVE */
-/* #undef EMBREE_GEOMETRY_SUBDIVISION */
-/* #undef EMBREE_GEOMETRY_USER */
-/* #undef EMBREE_GEOMETRY_INSTANCE */
-/* #undef EMBREE_GEOMETRY_GRID */
-/* #undef EMBREE_GEOMETRY_POINT */
+// #cmakedefine EMBREE_GEOMETRY_QUAD
+// #cmakedefine EMBREE_GEOMETRY_CURVE
+// #cmakedefine EMBREE_GEOMETRY_SUBDIVISION
+// #cmakedefine EMBREE_GEOMETRY_USER
+// #cmakedefine EMBREE_GEOMETRY_INSTANCE
+// EMBREE_GEOMETRY_INSTANCE_ARRAY is defined in rtcore_config.h
+// #cmakedefine EMBREE_GEOMETRY_GRID
+// #cmakedefine EMBREE_GEOMETRY_POINT
 #define EMBREE_RAY_PACKETS
-/* #undef EMBREE_COMPACT_POLYS */
+// #cmakedefine EMBREE_COMPACT_POLYS
 
 #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
 #define EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE
@@ -69,8 +73,18 @@
   #define IF_ENABLED_INSTANCE(x)
 #endif
 
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY) // IF_ENABLED_INSTANCE_ARRAY(x) keeps x only when this symbol is defined
+  #define IF_ENABLED_INSTANCE_ARRAY(x) x
+#else // symbol not defined: the macro expands to nothing and drops its argument
+  #define IF_ENABLED_INSTANCE_ARRAY(x)
+#endif // EMBREE_GEOMETRY_INSTANCE_ARRAY
+
 #if defined(EMBREE_GEOMETRY_GRID)
   #define IF_ENABLED_GRIDS(x) x
 #else
   #define IF_ENABLED_GRIDS(x)
 #endif
+
+
+
+
diff --git a/thirdparty/embree/kernels/geometry/coneline_intersector.h b/thirdparty/embree/kernels/geometry/coneline_intersector.h
index 90f3792eff..696ea41ebc 100644
--- a/thirdparty/embree/kernels/geometry/coneline_intersector.h
+++ b/thirdparty/embree/kernels/geometry/coneline_intersector.h
@@ -159,7 +159,7 @@ namespace embree
         template<typename Epilog>
         static __forceinline bool intersect(const vbool<M>& valid_i,
                                             Ray& ray,
-                                            IntersectContext* context,
+                                            RayQueryContext* context,
                                             const LineSegments* geom,
                                             const Precalculations& pre,
                                             const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
@@ -190,7 +190,7 @@ namespace embree
         template<typename Epilog>
         static __forceinline bool intersect(const vbool<M>& valid_i,
                                             RayK<K>& ray, size_t k,
-                                            IntersectContext* context,
+                                            RayQueryContext* context,
                                             const LineSegments* geom,
                                             const Precalculations& pre,
                                             const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
diff --git a/thirdparty/embree/kernels/geometry/conelinei_intersector.h b/thirdparty/embree/kernels/geometry/conelinei_intersector.h
index 6a985ebcad..c919fe9f7b 100644
--- a/thirdparty/embree/kernels/geometry/conelinei_intersector.h
+++ b/thirdparty/embree/kernels/geometry/conelinei_intersector.h
@@ -16,7 +16,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculations1 Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -27,7 +27,7 @@ namespace embree
         ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -51,7 +51,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculations1 Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -62,7 +62,7 @@ namespace embree
         ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -86,7 +86,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculationsK<K> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -97,7 +97,7 @@ namespace embree
         ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -115,7 +115,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculationsK<K> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context,  const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -126,7 +126,7 @@ namespace embree
         ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
diff --git a/thirdparty/embree/kernels/geometry/curveNi_intersector.h b/thirdparty/embree/kernels/geometry/curveNi_intersector.h
index c0b66515c1..137ec06d0c 100644
--- a/thirdparty/embree/kernels/geometry/curveNi_intersector.h
+++ b/thirdparty/embree/kernels/geometry/curveNi_intersector.h
@@ -5,6 +5,12 @@
 
 #include "curveNi.h"
 
+#include "roundline_intersector.h"
+#include "coneline_intersector.h"
+#include "curve_intersector_ribbon.h"
+#include "curve_intersector_oriented.h"
+#include "curve_intersector_sweep.h"
+
 namespace embree
 {
   namespace isa
@@ -20,9 +26,14 @@ namespace embree
       static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o)
       {
         const size_t N = prim.N;
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+        const Vec3fa offset = *prim.offset(N);
+        const float scale  = *prim.scale(N);
+#else
         const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
         const Vec3fa offset = Vec3fa(offset_scale);
         const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+#endif
         const Vec3fa org1 = (ray.org-offset)*scale;
         const Vec3fa dir1 = ray.dir*scale;
         
@@ -50,7 +61,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -84,7 +95,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -121,7 +132,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -157,7 +168,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -196,7 +207,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -217,7 +228,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -241,7 +252,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -262,7 +273,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -297,10 +308,14 @@ namespace embree
       static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o)
       {
         const size_t N = prim.N;
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+        const Vec3fa offset = *prim.offset(N);
+        const float scale  = *prim.scale(N);
+#else
         const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
         const Vec3fa offset = Vec3fa(offset_scale);
         const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
-
+#endif
         const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
         const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
         const Vec3fa org1 = (ray_org-offset)*scale;
@@ -330,7 +345,7 @@ namespace embree
       }
       
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -364,7 +379,7 @@ namespace embree
       }
       
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -401,7 +416,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -437,7 +452,7 @@ namespace embree
       }
       
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -476,7 +491,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -497,7 +512,7 @@ namespace embree
       }
       
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -521,7 +536,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -542,7 +557,7 @@ namespace embree
       }
       
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -565,5 +580,69 @@ namespace embree
         return false;
       }
     };
+    /* Converts the control points v0..v3 and, for oriented curves only, the normals n0..n3 in place to Bezier control points; B-spline, Hermite and Catmull-Rom bases are converted, any other basis leaves the arguments untouched. */
+     __forceinline void convert_to_bezier(const Geometry::GType gtype,
+                                         Vec3ff& v0, Vec3ff& v1, Vec3ff& v2, Vec3ff& v3,
+                                         Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3)
+    {
+      const Geometry::GType basis = (Geometry::GType)(gtype & Geometry::GTY_BASIS_MASK);
+      const Geometry::GType stype = (Geometry::GType)(gtype & Geometry::GTY_SUBTYPE_MASK);
+      /* vertices: wrap v0..v3 in the curve type of the source basis, convert() it, and write the Bezier points back through the references */
+      if (basis == Geometry::GTY_BASIS_BSPLINE) {
+        BezierCurveT<Vec3ff> bezier;
+        convert(BSplineCurveT<Vec3ff>(v0,v1,v2,v3),bezier);
+        v0 = bezier.v0; v1 = bezier.v1; v2 = bezier.v2; v3 = bezier.v3;
+      }
+      else if (basis == Geometry::GTY_BASIS_HERMITE) {
+        BezierCurveT<Vec3ff> bezier;
+        convert(HermiteCurveT<Vec3ff>(v0,v1,v2,v3),bezier);
+        v0 = bezier.v0; v1 = bezier.v1; v2 = bezier.v2; v3 = bezier.v3;
+      }
+      else if (basis == Geometry::GTY_BASIS_CATMULL_ROM) {
+        BezierCurveT<Vec3ff> bezier;
+        convert(CatmullRomCurveT<Vec3ff>(v0,v1,v2,v3),bezier);
+        v0 = bezier.v0; v1 = bezier.v1; v2 = bezier.v2; v3 = bezier.v3;
+      }
+      /* normals: converted with the same basis dispatch as the vertices, but only when the subtype is an oriented curve */
+      if (stype == Geometry::GTY_SUBTYPE_ORIENTED_CURVE)
+      {
+        if (basis == Geometry::GTY_BASIS_BSPLINE) {
+          BezierCurveT<Vec3fa> bezier;
+          convert(BSplineCurveT<Vec3fa>(n0,n1,n2,n3),bezier);
+          n0 = bezier.v0; n1 = bezier.v1; n2 = bezier.v2; n3 = bezier.v3;
+        }
+        else if (basis == Geometry::GTY_BASIS_HERMITE) {
+          BezierCurveT<Vec3fa> bezier;
+          convert(HermiteCurveT<Vec3fa>(n0,n1,n2,n3),bezier);
+          n0 = bezier.v0; n1 = bezier.v1; n2 = bezier.v2; n3 = bezier.v3;
+        }
+        else if (basis == Geometry::GTY_BASIS_CATMULL_ROM) {
+          BezierCurveT<Vec3fa> bezier;
+          convert(CatmullRomCurveT<Vec3fa>(n0,n1,n2,n3),bezier);
+          n0 = bezier.v0; n1 = bezier.v1; n2 = bezier.v2; n3 = bezier.v3;
+        }
+      }
+    }
+    /* Vertices-only overload: converts the control points v0..v3 in place to Bezier control points for B-spline, Hermite and Catmull-Rom bases; any other basis leaves them untouched. */
+    __forceinline void convert_to_bezier(const Geometry::GType gtype, Vec3ff& v0, Vec3ff& v1, Vec3ff& v2, Vec3ff& v3)
+    {
+      const Geometry::GType basis = (Geometry::GType)(gtype & Geometry::GTY_BASIS_MASK);
+      /* only the basis bits of gtype are inspected here; the subtype is not consulted */
+      if (basis == Geometry::GTY_BASIS_BSPLINE) {
+        BezierCurveT<Vec3ff> bezier;
+        convert(BSplineCurveT<Vec3ff>(v0,v1,v2,v3),bezier);
+        v0 = bezier.v0; v1 = bezier.v1; v2 = bezier.v2; v3 = bezier.v3;
+      }
+      else if (basis == Geometry::GTY_BASIS_HERMITE) {
+        BezierCurveT<Vec3ff> bezier;
+        convert(HermiteCurveT<Vec3ff>(v0,v1,v2,v3),bezier);
+        v0 = bezier.v0; v1 = bezier.v1; v2 = bezier.v2; v3 = bezier.v3;
+      }
+      else if (basis == Geometry::GTY_BASIS_CATMULL_ROM) {
+        BezierCurveT<Vec3ff> bezier;
+        convert(CatmullRomCurveT<Vec3ff>(v0,v1,v2,v3),bezier);
+        v0 = bezier.v0; v1 = bezier.v1; v2 = bezier.v2; v3 = bezier.v3;
+      }
+    }
   }
 }
diff --git a/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h b/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h
index bab796b33b..4c14c2f004 100644
--- a/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h
+++ b/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h
@@ -6,6 +6,12 @@
 #include "curveNi_mb.h"
 #include "../subdiv/linear_bezier_patch.h"
 
+#include "roundline_intersector.h"
+#include "coneline_intersector.h"
+#include "curve_intersector_ribbon.h"
+#include "curve_intersector_oriented.h"
+#include "curve_intersector_sweep.h"
+
 namespace embree
 {
   namespace isa
@@ -21,9 +27,14 @@ namespace embree
       static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o)
       {
         const size_t N = prim.N;
+#if __SYCL_DEVICE_ONLY__
+        const Vec3f offset = *prim.offset(N);
+        const float scale  = *prim.scale(N);
+#else
         const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
         const Vec3fa offset = Vec3fa(offset_scale);
         const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+#endif
         const Vec3fa org1 = (ray.org-offset)*scale;
         const Vec3fa dir1 = ray.dir*scale;
         
@@ -73,7 +84,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -95,7 +106,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -120,7 +131,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -141,7 +152,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -166,7 +177,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -187,7 +198,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -211,7 +222,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -232,7 +243,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,prim,tNear);
@@ -267,10 +278,14 @@ namespace embree
       static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o)
       {
         const size_t N = prim.N;
+#if __SYCL_DEVICE_ONLY__
+        const Vec3f offset = *prim.offset(N);
+        const float scale  = *prim.scale(N);
+#else
         const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
         const Vec3fa offset = Vec3fa(offset_scale);
         const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
-
+#endif
         const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
         const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
         const Vec3fa org1 = (ray_org-offset)*scale;
@@ -322,7 +337,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         
         vfloat<M> tNear;
@@ -345,7 +360,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -370,7 +385,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         
         vfloat<M> tNear;
@@ -393,7 +408,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -419,7 +434,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         
         vfloat<M> tNear;
@@ -441,7 +456,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
@@ -465,7 +480,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         
         vfloat<M> tNear;
@@ -488,7 +503,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = intersect(ray,k,prim,tNear);
diff --git a/thirdparty/embree/kernels/geometry/curveNv_intersector.h b/thirdparty/embree/kernels/geometry/curveNv_intersector.h
index 2742725aec..cfbd1a029c 100644
--- a/thirdparty/embree/kernels/geometry/curveNv_intersector.h
+++ b/thirdparty/embree/kernels/geometry/curveNv_intersector.h
@@ -17,7 +17,7 @@ namespace embree
       typedef CurvePrecalculations1 Precalculations;
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear);
@@ -54,7 +54,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear);
@@ -101,7 +101,7 @@ namespace embree
       typedef CurvePrecalculationsK<K> Precalculations;
 
       template<typename Intersector, typename Epilog>
-        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear);
@@ -138,7 +138,7 @@ namespace embree
       }
 
       template<typename Intersector, typename Epilog>
-        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, RayQueryContext* context, const Primitive& prim)
       {
         vfloat<M> tNear;
         vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear);
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector.h b/thirdparty/embree/kernels/geometry/curve_intersector.h
index 1e8ac26125..a258befb5e 100644
--- a/thirdparty/embree/kernels/geometry/curve_intersector.h
+++ b/thirdparty/embree/kernels/geometry/curve_intersector.h
@@ -24,7 +24,7 @@ namespace embree
       typedef CurvePrecalculations1 Precalculations;
       
       template<int N, bool robust>
-        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         assert(num == 1);
         RTCGeometryType ty = (RTCGeometryType)(*prim);
@@ -34,7 +34,7 @@ namespace embree
       }
       
       template<int N, bool robust>        
-        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         assert(num == 1);
         RTCGeometryType ty = (RTCGeometryType)(*prim);
@@ -50,7 +50,7 @@ namespace embree
         typedef unsigned char Primitive;
         typedef CurvePrecalculationsK<K> Precalculations;
         
-        static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
         {
           assert(num == 1);
           RTCGeometryType ty = (RTCGeometryType)(*prim);
@@ -60,7 +60,7 @@ namespace embree
           while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim);
         }
         
-        static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
         {
           assert(num == 1);
           RTCGeometryType ty = (RTCGeometryType)(*prim);
@@ -76,7 +76,7 @@ namespace embree
           return valid_o;
         }
         
-        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
         {
           assert(num == 1);
           RTCGeometryType ty = (RTCGeometryType)(*prim);
@@ -85,7 +85,7 @@ namespace embree
           leafIntersector.intersect<K>(&pre,&ray,k,context,prim);
         }
         
-        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
         {
           assert(num == 1);
           RTCGeometryType ty = (RTCGeometryType)(*prim);
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_distance.h b/thirdparty/embree/kernels/geometry/curve_intersector_distance.h
index 748a9511a5..80e1760289 100644
--- a/thirdparty/embree/kernels/geometry/curve_intersector_distance.h
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_distance.h
@@ -45,15 +45,56 @@ namespace embree
       vfloat<M> vv;
       vfloat<M> vt;
     };
-    
+    /* Specialization of DistanceCurveHit for width M = 1: holds one candidate hit (U, V, T, segment index i of N, and the 3D curve) and derives the final vu/vv/vt in finalize(). */
     template<typename NativeCurve3fa>
+    struct DistanceCurveHit<NativeCurve3fa,1>
+    {
+      enum { M = 1 };
+      /* the default constructor performs no explicit member initialization */
+      __forceinline DistanceCurveHit() {}
+      /* stores the arguments as given; vu/vv/vt are not set here and are only written by finalize() */
+      __forceinline DistanceCurveHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N,
+                                     const NativeCurve3fa& curve3D)
+        : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {}
+      /* computes vu = (step + U + i) / N from the segment-local U and segment index i; V and T are copied to vv and vt unchanged */
+      __forceinline void finalize() 
+      {
+        vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N));
+        vv = V;
+        vt = T;
+      }
+      /* accessors read vu/vv/vt, so finalize() has to run before they are used; Ng() returns curve3D.eval_du(vu) */
+      __forceinline Vec2f uv () const { return Vec2f(vu,vv); }
+      __forceinline float t  () const { return vt; }
+      __forceinline Vec3fa Ng() const { return curve3D.eval_du(vu); }
+      /* values captured by the constructor */
+    public:
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> T;
+      int i, N;
+      NativeCurve3fa curve3D;
+      /* valid is set by the constructor; vu, vv and vt are written by finalize() */
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+    };
+    
+    template<typename NativeCurve3fa, int W = VSIZEX>
     struct DistanceCurve1Intersector1
     {
+      using vboolx = vbool<W>;
+      using vintx = vint<W>;
+      using vfloatx = vfloat<W>;
+      using Vec4vfx = Vec4vf<W>;
+      
       template<typename Epilog>
-      __forceinline bool intersect(const CurvePrecalculations1& pre,Ray& ray,
-                                   IntersectContext* context,
+      __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                   RayQueryContext* context,
                                    const CurveGeometry* geom, const unsigned int primID,
-                                   const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2, const Vec3fa& v3,
+                                   const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
                                    const Epilog& epilog)
       {
         const int N = geom->tessellationRate;
@@ -65,8 +106,8 @@ namespace embree
       
         /* evaluate the bezier curve */
         vboolx valid = vfloatx(step) < vfloatx(float(N));
-        const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N);
-        const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N);
+        const Vec4vfx p0 = curve2D.template eval0<W>(0,N);
+        const Vec4vfx p1 = curve2D.template eval1<W>(0,N);
 
         /* approximative intersection with cone */
         const Vec4vfx v = p1-p0;
@@ -86,19 +127,19 @@ namespace embree
         /* update hit information */
         bool ishit = false;
         if (unlikely(any(valid))) {
-          DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,0,N,curve3D);
+          DistanceCurveHit<NativeCurve3fa,W> hit(valid,u,0.0f,t,0,N,curve3D);
           ishit = ishit | epilog(valid,hit);
         }
 
-        if (unlikely(VSIZEX < N)) 
+        if (unlikely(W < N)) 
         {
           /* process SIMD-size many segments per iteration */
-          for (int i=VSIZEX; i<N; i+=VSIZEX)
+          for (int i=W; i<N; i+=W)
           {
             /* evaluate the bezier curve */
             vboolx valid = vintx(i)+vintx(step) < vintx(N);
-            const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N);
-            const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N);
+            const Vec4vfx p0 = curve2D.template eval0<W>(i,N);
+            const Vec4vfx p1 = curve2D.template eval1<W>(i,N);
             
             /* approximative intersection with cone */
             const Vec4vfx v = p1-p0;
@@ -117,7 +158,7 @@ namespace embree
 
              /* update hit information */
             if (unlikely(any(valid))) {
-              DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,i,N,curve3D);
+              DistanceCurveHit<NativeCurve3fa,W> hit(valid,u,0.0f,t,i,N,curve3D);
               ishit = ishit | epilog(valid,hit);
             }
           }
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
index 75532f5ae0..cdab06ea5e 100644
--- a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
@@ -14,7 +14,7 @@ namespace embree
 {
   namespace isa
   {
-    template<typename Ray, typename Epilog>
+    template<typename Ray, typename Epilog, int N = VSIZEX-1, int V = VSIZEX>
       struct TensorLinearCubicBezierSurfaceIntersector
       {
         const LinearSpace3fa& ray_space;
@@ -235,7 +235,7 @@ namespace embree
            if (!clip_v(cu,cv)) return;
            return solve_newton_raphson(cu,cv);
         }
-        
+
         __forceinline void solve_newton_raphson_recursion(BBox1f cu, BBox1f cv)
         {
           unsigned int sptr = 0;
@@ -259,8 +259,8 @@ namespace embree
               if (mask) sptr++; // there are still items on the stack
               
               /* process next element recurse into each hit curve segment */
-              const float u0 = float(i+0)*(1.0f/(VSIZEX-1));
-              const float u1 = float(i+1)*(1.0f/(VSIZEX-1));
+              const float u0 = float(i+0)*(1.0f/(N));
+              const float u1 = float(i+1)*(1.0f/(N));
               const BBox1f cui(lerp(cu.lower,cu.upper,u0),lerp(cu.lower,cu.upper,u1));
               cu = cui;
             }
@@ -280,28 +280,37 @@ namespace embree
 #endif
 
           entry:
-          
-            /* split the curve into VSIZEX-1 segments in u-direction */
-            vboolx valid = true;
-            TensorLinearCubicBezierSurface<Vec2vfx> subcurves = curve2d.clip_v(cv).vsplit_u(valid,cu);
-            
-            /* slabs test in u-direction */
-            Vec2vfx ndv = cross(subcurves.axis_v());
-            BBox<vfloatx> boundsv = subcurves.vxfm(ndv).bounds();
-            valid &= boundsv.lower <= eps;
-            valid &= boundsv.upper >= -eps;
-            if (none(valid)) continue;
-
-            /* slabs test in v-direction */
-            Vec2vfx ndu = cross(subcurves.axis_u());
-            BBox<vfloatx> boundsu = subcurves.vxfm(ndu).bounds();
-            valid &= boundsu.lower <= eps;
-            valid &= boundsu.upper >= -eps;
-            if (none(valid)) continue;
+
+            /* split the curve into N segments in u-direction */
+            unsigned int mask = 0;
+            for (int i=0; i<N;)
+            {
+              int i0 = i;
+              vbool<V> valid = true;
+              TensorLinearCubicBezierSurface<Vec2vf<V>> subcurves = curve2d.clip_v(cv).template vsplit_u<V>(valid,cu,i,N);
+              
+              /* slabs test in u-direction */
+              Vec2vf<V> ndv = cross(subcurves.axis_v());
+              BBox<vfloat<V>> boundsv = subcurves.template vxfm<V>(ndv).bounds();
+              valid &= boundsv.lower <= eps;
+              valid &= boundsv.upper >= -eps;
+              if (none(valid)) continue;
+
+              /* slabs test in v-direction */
+              Vec2vf<V> ndu = cross(subcurves.axis_u());
+              BBox<vfloat<V>> boundsu = subcurves.template vxfm<V>(ndu).bounds();
+              valid &= boundsu.lower <= eps;
+              valid &= boundsu.upper >= -eps;
+              if (none(valid)) continue;
+
+              mask |= movemask(valid) << i0;
+            }
+
+            if (!mask) continue;
 
             /* push valid segments to stack */
             assert(sptr < stack_size);
-            mask_stack [sptr] = movemask(valid);
+            mask_stack [sptr] = mask;
             cu_stack   [sptr] = cu;
             cv_stack   [sptr] = cv;
             sptr++;
@@ -318,7 +327,7 @@ namespace embree
       };
 
 
-    template<template<typename Ty> class SourceCurve>
+    template<template<typename Ty> class SourceCurve, int N = VSIZEX-1, int V = VSIZEX>
       struct OrientedCurve1Intersector1
     {
       //template<typename Ty> using Curve = SourceCurve<Ty>;
@@ -329,33 +338,32 @@ namespace embree
       
       __forceinline OrientedCurve1Intersector1(const Ray& ray, const void* ptr) {}
       
-      template<typename Epilog>
-      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
-                                IntersectContext* context,
+      template<typename Ray, typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                RayQueryContext* context,
                                 const CurveGeometry* geom, const unsigned int primID, 
                                 const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i,
                                 const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i,
                                 const Epilog& epilog) const
       {
         STAT3(normal.trav_prims,1,1,1);
-
         SourceCurve3ff ccurve(v0i,v1i,v2i,v3i);
         SourceCurve3fa ncurve(n0i,n1i,n2i,n3i);
         ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve);
         TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
         //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping();
-        return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
+        return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog,N,V>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
       }
 
-      template<typename Epilog>
-      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
-                                IntersectContext* context,
+      template<typename Ray, typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                RayQueryContext* context,
                                 const CurveGeometry* geom, const unsigned int primID,
                                 const TensorLinearCubicBezierSurface3fa& curve, const Epilog& epilog) const
       {
         STAT3(normal.trav_prims,1,1,1);
         //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping();
-        return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
+        return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog,N,V>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
       }
     };
 
@@ -384,7 +392,7 @@ namespace embree
 
       template<typename Epilog>
       __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
-                                   IntersectContext* context,
+                                   RayQueryContext* context,
                                    const CurveGeometry* geom, const unsigned int primID,
                                    const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i,
                                    const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i,
@@ -402,7 +410,7 @@ namespace embree
 
       template<typename Epilog>
       __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
-                                   IntersectContext* context,
+                                   RayQueryContext* context,
                                    const CurveGeometry* geom, const unsigned int primID,
                                    const TensorLinearCubicBezierSurface3fa& curve,
                                    const Epilog& epilog)
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h b/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h
index c3272e99fd..423fd5b08d 100644
--- a/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h
@@ -53,21 +53,23 @@ namespace embree
     };
 
     /* calculate squared distance of point p0 to line p1->p2 */
-    __forceinline std::pair<vfloatx,vfloatx> sqr_point_line_distance(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2)
+    template<int M>
+    __forceinline std::pair<vfloat<M>,vfloat<M>> sqr_point_line_distance(const Vec2vf<M>& p0, const Vec2vf<M>& p1, const Vec2vf<M>& p2)
     {
-      const vfloatx num = det(p2-p1,p1-p0);
-      const vfloatx den2 = dot(p2-p1,p2-p1);
+      const vfloat<M> num = det(p2-p1,p1-p0);
+      const vfloat<M> den2 = dot(p2-p1,p2-p1);
       return std::make_pair(num*num,den2);
     }
     
     /* performs culling against a cylinder */
-    __forceinline vboolx cylinder_culling_test(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2, const vfloatx& r)
+     template<int M>
+     __forceinline vbool<M> cylinder_culling_test(const Vec2vf<M>& p0, const Vec2vf<M>& p1, const Vec2vf<M>& p2, const vfloat<M>& r)
     {
-      const std::pair<vfloatx,vfloatx> d = sqr_point_line_distance(p0,p1,p2);
+      const std::pair<vfloat<M>,vfloat<M>> d = sqr_point_line_distance<M>(p0,p1,p2);
       return d.first <= r*r*d.second;
     }
 
-    template<typename NativeCurve3ff, typename Epilog>
+    template<int M = VSIZEX, typename NativeCurve3ff, typename Epilog>
     __forceinline bool intersect_ribbon(const Vec3fa& ray_org, const Vec3fa& ray_dir, const float ray_tnear, const float& ray_tfar,
                                         const LinearSpace3fa& ray_space, const float& depth_scale,
                                         const NativeCurve3ff& curve3D, const int N,
@@ -76,89 +78,96 @@ namespace embree
       /* transform control points into ray space */
       const NativeCurve3ff curve2D = curve3D.xfm_pr(ray_space,ray_org);
       float eps = 4.0f*float(ulp)*reduce_max(max(abs(curve2D.v0),abs(curve2D.v1),abs(curve2D.v2),abs(curve2D.v3)));
-      
-      /* evaluate the bezier curve */
+
+      int i=0;
       bool ishit = false;
-      vboolx valid = vfloatx(step) < vfloatx(float(N));
-      const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N);
-      const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N);
-      valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w));
       
-      if (any(valid)) 
+#if !defined(__SYCL_DEVICE_ONLY__)
       {
-        Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(0,N);
-        Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(0,N);
-        dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt);
-        dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt);
-        const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f);
-        const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f);
-        const Vec3vfx nn0 = normalize(n0);
-        const Vec3vfx nn1 = normalize(n1);
-        const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0));
-        const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1));
-        const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0));
-        const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1));
+        /* evaluate the bezier curve */
+        vbool<M> valid = vfloat<M>(step) < vfloat<M>(float(N));
+        const Vec4vf<M> p0 = curve2D.template eval0<M>(0,N);
+        const Vec4vf<M> p1 = curve2D.template eval1<M>(0,N);
+        valid &= cylinder_culling_test<M>(zero,Vec2vf<M>(p0.x,p0.y),Vec2vf<M>(p1.x,p1.y),max(p0.w,p1.w));
         
-        vfloatx vu,vv,vt;
-        vboolx valid0 = intersect_quad_backface_culling<VSIZEX>(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
-
-        if (any(valid0))
+        if (any(valid)) 
         {
-          /* ignore self intersections */
-          if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
-            vfloatx r = lerp(p0.w, p1.w, vu);
-            valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
-          }
+          Vec3vf<M> dp0dt = curve2D.template derivative0<M>(0,N);
+          Vec3vf<M> dp1dt = curve2D.template derivative1<M>(0,N);
+          dp0dt = select(reduce_max(abs(dp0dt)) < vfloat<M>(eps),Vec3vf<M>(p1-p0),dp0dt);
+          dp1dt = select(reduce_max(abs(dp1dt)) < vfloat<M>(eps),Vec3vf<M>(p1-p0),dp1dt);
+          const Vec3vf<M> n0(dp0dt.y,-dp0dt.x,0.0f);
+          const Vec3vf<M> n1(dp1dt.y,-dp1dt.x,0.0f);
+          const Vec3vf<M> nn0 = normalize(n0);
+          const Vec3vf<M> nn1 = normalize(n1);
+          const Vec3vf<M> lp0 = madd(p0.w,nn0,Vec3vf<M>(p0));
+          const Vec3vf<M> lp1 = madd(p1.w,nn1,Vec3vf<M>(p1));
+          const Vec3vf<M> up0 = nmadd(p0.w,nn0,Vec3vf<M>(p0));
+          const Vec3vf<M> up1 = nmadd(p1.w,nn1,Vec3vf<M>(p1));
+          
+          vfloat<M> vu,vv,vt;
+          vbool<M> valid0 = intersect_quad_backface_culling<M>(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
           
           if (any(valid0))
           {
-            vv = madd(2.0f,vv,vfloatx(-1.0f));
-            RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,0,N,curve3D);
-            ishit |= epilog(bhit.valid,bhit);
+            /* ignore self intersections */
+            if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
+              vfloat<M> r = lerp(p0.w, p1.w, vu);
+              valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
+            }
+            
+            if (any(valid0))
+            {
+              vv = madd(2.0f,vv,vfloat<M>(-1.0f));
+              RibbonHit<NativeCurve3ff,M> bhit(valid0,vu,vv,vt,0,N,curve3D);
+              ishit |= epilog(bhit.valid,bhit);
+            }
           }
         }
+        i += M;
       }
       
-      if (unlikely(VSIZEX < N)) 
+      if (unlikely(i < N))
+#endif
       {
         /* process SIMD-size many segments per iteration */
-        for (int i=VSIZEX; i<N; i+=VSIZEX)
+        for (; i<N; i+=M)
         {
           /* evaluate the bezier curve */
-          vboolx valid = vintx(i)+vintx(step) < vintx(N);
-          const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N);
-          const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N);
-          valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w));
+          vbool<M> valid = vint<M>(i)+vint<M>(step) < vint<M>(N);
+          const Vec4vf<M> p0 = curve2D.template eval0<M>(i,N);
+          const Vec4vf<M> p1 = curve2D.template eval1<M>(i,N);
+          valid &= cylinder_culling_test<M>(zero,Vec2vf<M>(p0.x,p0.y),Vec2vf<M>(p1.x,p1.y),max(p0.w,p1.w));
           if (none(valid)) continue;
           
-          Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(i,N);
-          Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(i,N);
-          dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt);
-          dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt);
-          const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f);
-          const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f);
-          const Vec3vfx nn0 = normalize(n0);
-          const Vec3vfx nn1 = normalize(n1);
-          const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0));
-          const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1));
-          const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0));
-          const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1));
+          Vec3vf<M> dp0dt = curve2D.template derivative0<M>(i,N);
+          Vec3vf<M> dp1dt = curve2D.template derivative1<M>(i,N);
+          dp0dt = select(reduce_max(abs(dp0dt)) < vfloat<M>(eps),Vec3vf<M>(p1-p0),dp0dt);
+          dp1dt = select(reduce_max(abs(dp1dt)) < vfloat<M>(eps),Vec3vf<M>(p1-p0),dp1dt);
+          const Vec3vf<M> n0(dp0dt.y,-dp0dt.x,0.0f);
+          const Vec3vf<M> n1(dp1dt.y,-dp1dt.x,0.0f);
+          const Vec3vf<M> nn0 = normalize(n0);
+          const Vec3vf<M> nn1 = normalize(n1);
+          const Vec3vf<M> lp0 = madd(p0.w,nn0,Vec3vf<M>(p0));
+          const Vec3vf<M> lp1 = madd(p1.w,nn1,Vec3vf<M>(p1));
+          const Vec3vf<M> up0 = nmadd(p0.w,nn0,Vec3vf<M>(p0));
+          const Vec3vf<M> up1 = nmadd(p1.w,nn1,Vec3vf<M>(p1));
           
-          vfloatx vu,vv,vt;
-          vboolx valid0 = intersect_quad_backface_culling<VSIZEX>(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
+          vfloat<M> vu,vv,vt;
+          vbool<M> valid0 = intersect_quad_backface_culling<M>(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
 
           if (any(valid0))
           {
             /* ignore self intersections */
             if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
-              vfloatx r = lerp(p0.w, p1.w, vu);
+              vfloat<M> r = lerp(p0.w, p1.w, vu);
               valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
             }
             
             if (any(valid0))
             {
-              vv = madd(2.0f,vv,vfloatx(-1.0f));
-              RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,i,N,curve3D);
+              vv = madd(2.0f,vv,vfloat<M>(-1.0f));
+              RibbonHit<NativeCurve3ff,M> bhit(valid0,vu,vv,vt,i,N,curve3D);
               ishit |= epilog(bhit.valid,bhit);
             }
           }
@@ -167,14 +176,14 @@ namespace embree
       return ishit;
     }
         
-    template<template<typename Ty> class NativeCurve>
+    template<template<typename Ty> class NativeCurve, int M = VSIZEX>
     struct RibbonCurve1Intersector1
     {
       typedef NativeCurve<Vec3ff> NativeCurve3ff;
       
-      template<typename Epilog>
+      template<typename Ray, typename Epilog>
       __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
-                                   IntersectContext* context,
+                                   RayQueryContext* context,
                                    const CurveGeometry* geom, const unsigned int primID,
                                    const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
                                    const Epilog& epilog)
@@ -182,21 +191,21 @@ namespace embree
         const int N = geom->tessellationRate;
         NativeCurve3ff curve(v0,v1,v2,v3);
         curve = enlargeRadiusToMinWidth(context,geom,ray.org,curve);
-        return intersect_ribbon<NativeCurve3ff>(ray.org,ray.dir,ray.tnear(),ray.tfar,
+        return intersect_ribbon<M,NativeCurve3ff>(ray.org,ray.dir,ray.tnear(),ray.tfar,
                                                 pre.ray_space,pre.depth_scale,
                                                 curve,N,
                                                 epilog);
       }
     };
     
-    template<template<typename Ty> class NativeCurve, int K>
+    template<template<typename Ty> class NativeCurve, int K, int M = VSIZEX>
     struct RibbonCurve1IntersectorK
     {
       typedef NativeCurve<Vec3ff> NativeCurve3ff;
       
       template<typename Epilog>
       __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& ray, size_t k,
-                                   IntersectContext* context,
+                                   RayQueryContext* context,
                                    const CurveGeometry* geom, const unsigned int primID,
                                    const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
                                    const Epilog& epilog)
@@ -206,7 +215,7 @@ namespace embree
         const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
         NativeCurve3ff curve(v0,v1,v2,v3);
         curve = enlargeRadiusToMinWidth(context,geom,ray_org,curve);
-        return intersect_ribbon<NativeCurve3ff>(ray_org,ray_dir,ray.tnear()[k],ray.tfar[k],
+        return intersect_ribbon<M,NativeCurve3ff>(ray_org,ray_dir,ray.tnear()[k],ray.tfar[k],
                                                 pre.ray_space[k],pre.depth_scale[k],
                                                 curve,N,
                                                 epilog);
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
index ed827d583f..5581822551 100644
--- a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
@@ -14,7 +14,9 @@ namespace embree
   namespace isa
   {
     static const size_t numJacobianIterations = 5;
-#if defined(__AVX__)
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+    static const size_t numBezierSubdivisions = 2;
+#elif defined(__AVX__)
     static const size_t numBezierSubdivisions = 2;
 #else
     static const size_t numBezierSubdivisions = 3;
@@ -132,10 +134,15 @@ namespace embree
       return false;
     }
 
+#if !defined(__SYCL_DEVICE_ONLY__)
+    
     template<typename NativeCurve3ff, typename Ray, typename Epilog>
-    bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve,
-                                             float u0, float u1, unsigned int depth, const Epilog& epilog)
+    __forceinline bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, const Epilog& epilog)
     {
+      float u0 = 0.0f;
+      float u1 = 1.0f;
+      unsigned int depth = 1;
+        
 #if defined(__AVX__)
       enum { VSIZEX_ = 8 };
       typedef vbool8 vboolx; // maximally 8-wide to work around KNL issues
@@ -147,8 +154,6 @@ namespace embree
       typedef vint4 vintx; 
       typedef vfloat4 vfloatx;
 #endif
-      typedef Vec3<vfloatx> Vec3vfx;
-      typedef Vec4<vfloatx> Vec4vfx;
     
       unsigned int maxDepth = numBezierSubdivisions;
       bool found = false;
@@ -295,14 +300,156 @@ namespace embree
       return found;
     }
 
+#else
+    
+     template<typename NativeCurve3ff, typename Ray, typename Epilog>
+     __forceinline bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, const Epilog& epilog)
+    {
+      const Vec3fa org = zero;
+      const Vec3fa dir = ray.dir;
+      const unsigned int max_depth = 7;
+      
+      bool found = false;
+
+      struct ShortStack
+      {
+        /* pushes both children */
+        __forceinline void push() {
+          depth++;
+        }
+
+        /* pops next node */
+        __forceinline void pop() {
+          short_stack += (1<<(31-depth));
+          depth = 31-bsf(short_stack);
+        }
+        
+        unsigned int depth = 0;
+        unsigned int short_stack = 0;
+      };
+
+      ShortStack stack;
+
+      do
+      {
+        const float u0 = (stack.short_stack+0*(1<<(31-stack.depth)))/float(0x80000000);
+        const float u1 = (stack.short_stack+1*(1<<(31-stack.depth)))/float(0x80000000);
+      
+        /* subdivide bezier curve */
+        Vec3ff P0, dP0du; curve.eval(u0,P0,dP0du); dP0du = dP0du * (u1-u0);
+        Vec3ff P3, dP3du; curve.eval(u1,P3,dP3du); dP3du = dP3du * (u1-u0);
+        const Vec3ff P1 = P0 + dP0du*(1.0f/3.0f); 
+        const Vec3ff P2 = P3 - dP3du*(1.0f/3.0f);
+
+        /* check if curve is well behaved, by checking deviation of tangents from straight line */
+        const Vec3ff W = Vec3ff(P3-P0,0.0f);
+        const Vec3ff dQ0 = abs(3.0f*(P1-P0) - W);
+        const Vec3ff dQ1 = abs(3.0f*(P2-P1) - W);
+        const Vec3ff dQ2 = abs(3.0f*(P3-P2) - W);
+        const Vec3ff max_dQ = max(dQ0,dQ1,dQ2);
+        const float m = max(max_dQ.x,max_dQ.y,max_dQ.z); //,max_dQ.w);
+        const float l = length(Vec3f(W));
+        const bool well_behaved = m < 0.2f*l;
+
+        if (!well_behaved && stack.depth < max_depth) {
+          stack.push();
+          continue;
+        }
+        
+        /* calculate bounding cylinders */
+        const float rr1 = sqr_point_to_line_distance(Vec3f(dP0du),Vec3f(P3-P0));
+        const float rr2 = sqr_point_to_line_distance(Vec3f(dP3du),Vec3f(P3-P0));
+        const float maxr12 = sqrt(max(rr1,rr2));
+        const float one_plus_ulp  = 1.0f+2.0f*float(ulp);
+        const float one_minus_ulp = 1.0f-2.0f*float(ulp);
+        float r_outer = max(P0.w,P1.w,P2.w,P3.w)+maxr12;
+        float r_inner = min(P0.w,P1.w,P2.w,P3.w)-maxr12;
+        r_outer = one_plus_ulp*r_outer;
+        r_inner = max(0.0f,one_minus_ulp*r_inner);
+        const Cylinder cylinder_outer(Vec3f(P0),Vec3f(P3),r_outer);
+        const Cylinder cylinder_inner(Vec3f(P0),Vec3f(P3),r_inner);
+        
+        /* intersect with outer cylinder */
+        BBox<float> tc_outer; float u_outer0; Vec3fa Ng_outer0; float u_outer1; Vec3fa Ng_outer1;
+        if (!cylinder_outer.intersect(org,dir,tc_outer,u_outer0,Ng_outer0,u_outer1,Ng_outer1))
+        {
+          stack.pop();
+          continue;
+        }
+                
+        /* intersect with cap-planes */
+        BBox<float> tp(ray.tnear()-dt,ray.tfar-dt);
+        tp = embree::intersect(tp,tc_outer);
+        BBox<float> h0 = HalfPlane(Vec3f(P0),+Vec3f(dP0du)).intersect(org,dir);
+        tp = embree::intersect(tp,h0);
+        BBox<float> h1 = HalfPlane(Vec3f(P3),-Vec3f(dP3du)).intersect(org,dir);
+        tp = embree::intersect(tp,h1);
+        if (tp.lower > tp.upper)
+        {
+          stack.pop();
+          continue;
+        }
+        
+        bool valid = true;
+        
+        /* clamp and correct u parameter */
+        u_outer0 = clamp(u_outer0,float(0.0f),float(1.0f));
+        u_outer1 = clamp(u_outer1,float(0.0f),float(1.0f));
+        u_outer0 = lerp(u0,u1,u_outer0);
+        u_outer1 = lerp(u0,u1,u_outer1);
+        
+        /* intersect with inner cylinder */
+        BBox<float> tc_inner;
+        float u_inner0 = zero; Vec3fa Ng_inner0 = zero; float u_inner1 = zero; Vec3fa Ng_inner1 = zero;
+        const bool valid_inner =  cylinder_inner.intersect(org,dir,tc_inner,u_inner0,Ng_inner0,u_inner1,Ng_inner1);
+
+        /* subtract the inner interval from the current hit interval */
+        BBox<float> tp0, tp1;
+        subtract(tp,tc_inner,tp0,tp1);
+        bool valid0 = valid & (tp0.lower <= tp0.upper);
+        bool valid1 = valid & (tp1.lower <= tp1.upper);
+        if (!(valid0 | valid1))
+        {
+          stack.pop();
+          continue;
+        }
+
+        /* at the unstable area we subdivide deeper */
+        const bool unstable0 = valid0 && ((!valid_inner) | (abs(dot(Vec3fa(ray.dir),Ng_inner0)) < 0.3f));
+        const bool unstable1 = valid1 && ((!valid_inner) | (abs(dot(Vec3fa(ray.dir),Ng_inner1)) < 0.3f));
+    
+        if ((unstable0 | unstable1) && (stack.depth < max_depth)) {
+           stack.push();
+           continue;
+         }
+
+        if (valid0)
+          found |= intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer0,tp0.lower,epilog);
+          
+        /* the far hit cannot be closer, thus skip if we hit entry already */
+        valid1 &= tp1.lower+dt <= ray.tfar;
+        
+        /* iterate over second hit */
+        if (valid1)
+          found |= intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer1,tp1.upper,epilog);
+
+        stack.pop();
+        
+      } while (stack.short_stack != 0x80000000);
+
+      return found;
+    }
+
+#endif
+    
     template<template<typename Ty> class NativeCurve>
     struct SweepCurve1Intersector1
     {
       typedef NativeCurve<Vec3ff> NativeCurve3ff;
       
-      template<typename Epilog>
-      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
-                                IntersectContext* context,
+      template<typename Ray, typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                RayQueryContext* context,
                                 const CurveGeometry* geom, const unsigned int primID,
                                 const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
                                 const Epilog& epilog)
@@ -315,7 +462,7 @@ namespace embree
         const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir));
         const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f);
         const NativeCurve3ff curve1 = curve0-ref;
-        return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog);
+        return intersect_bezier_recursive_jacobian(ray,dt,curve1,epilog);
       }
     };
 
@@ -343,7 +490,7 @@ namespace embree
 
       template<typename Epilog>
       __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
-                                   IntersectContext* context,
+                                   RayQueryContext* context,
                                    const CurveGeometry* geom, const unsigned int primID,
                                    const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
                                    const Epilog& epilog)
@@ -357,7 +504,7 @@ namespace embree
         const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir));
         const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f);
         const NativeCurve3ff curve1 = curve0-ref;
-        return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog);
+        return intersect_bezier_recursive_jacobian(ray,dt,curve1,epilog);
       }
     };
   }
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h b/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h
index cffa8e46ad..cbdcadac0c 100644
--- a/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h
@@ -5,7 +5,7 @@
 
 #include "primitive.h"
 #include "../subdiv/bezier_curve.h"
-#include "../common/primref.h"
+#include "../builders/primref.h"
 #include "curve_intersector_precalculations.h"
 #include "../bvh/node_intersector1.h"
 #include "../bvh/node_intersector_packet.h"
@@ -37,28 +37,28 @@ namespace embree
 {
   struct VirtualCurveIntersector
   {
-    typedef void (*Intersect1Ty)(void* pre, void* ray, IntersectContext* context, const void* primitive);
-    typedef bool (*Occluded1Ty )(void* pre, void* ray, IntersectContext* context, const void* primitive);
+    typedef void (*Intersect1Ty)(void* pre, void* ray, RayQueryContext* context, const void* primitive);
+    typedef bool (*Occluded1Ty )(void* pre, void* ray, RayQueryContext* context, const void* primitive);
     
-    typedef void (*Intersect4Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
-    typedef bool (*Occluded4Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef void (*Intersect4Ty)(void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive);
+    typedef bool (*Occluded4Ty) (void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive);
     
-    typedef void (*Intersect8Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
-    typedef bool (*Occluded8Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef void (*Intersect8Ty)(void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive);
+    typedef bool (*Occluded8Ty) (void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive);
     
-    typedef void (*Intersect16Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
-    typedef bool (*Occluded16Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef void (*Intersect16Ty)(void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive);
+    typedef bool (*Occluded16Ty) (void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive);
 
   public:
     struct Intersectors
     {
       Intersectors() {} // WARNING: Do not zero initialize this, as we otherwise get problems with thread unsafe local static variable initialization (e.g. on VS2013) in curve_intersector_virtual.cpp.
       
-      template<int K> void intersect(void* pre, void* ray, IntersectContext* context, const void* primitive);
-      template<int K> bool occluded (void* pre, void* ray, IntersectContext* context, const void* primitive);
+      template<int K> void intersect(void* pre, void* ray, RayQueryContext* context, const void* primitive);
+      template<int K> bool occluded (void* pre, void* ray, RayQueryContext* context, const void* primitive);
 
-      template<int K> void intersect(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
-      template<int K> bool occluded (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+      template<int K> void intersect(void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive);
+      template<int K> bool occluded (void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive);
 
     public:
       Intersect1Ty intersect1;
@@ -74,20 +74,20 @@ namespace embree
     Intersectors vtbl[Geometry::GTY_END];
   };
 
-  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(intersect1); intersect1(pre,ray,context,primitive); }
-  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<1>  (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(occluded1); return occluded1(pre,ray,context,primitive); }
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<1> (void* pre, void* ray, RayQueryContext* context, const void* primitive) { assert(intersect1); intersect1(pre,ray,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<1>  (void* pre, void* ray, RayQueryContext* context, const void* primitive) { assert(occluded1); return occluded1(pre,ray,context,primitive); }
       
-  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<4>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect4); intersect4(pre,ray,k,context,primitive); }
-  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<4> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded4); return occluded4(pre,ray,k,context,primitive); }
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<4>(void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive) { assert(intersect4); intersect4(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<4> (void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive) { assert(occluded4); return occluded4(pre,ray,k,context,primitive); }
       
 #if defined(__AVX__)
-  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<8>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect8); intersect8(pre,ray,k,context,primitive); }
-  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<8> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded8); return occluded8(pre,ray,k,context,primitive); }
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<8>(void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive) { assert(intersect8); intersect8(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<8> (void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive) { assert(occluded8); return occluded8(pre,ray,k,context,primitive); }
 #endif
   
 #if defined(__AVX512F__)
-  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<16>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect16); intersect16(pre,ray,k,context,primitive); }
-  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<16> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded16); return occluded16(pre,ray,k,context,primitive); }
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<16>(void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive) { assert(intersect16); intersect16(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<16> (void* pre, void* ray, size_t k, RayQueryContext* context, const void* primitive) { assert(occluded16); return occluded16(pre,ray,k,context,primitive); }
 #endif
   
   namespace isa
@@ -98,7 +98,7 @@ namespace embree
       typedef CurvePrecalculations1 Precalculations;
       
       template<int N, bool robust>
-        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         assert(num == 1);
         RTCGeometryType ty = (RTCGeometryType)(*prim);
@@ -108,7 +108,7 @@ namespace embree
       }
 
       template<int N, bool robust>      
-        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         assert(num == 1);
         RTCGeometryType ty = (RTCGeometryType)(*prim);
@@ -125,7 +125,7 @@ namespace embree
         typedef CurvePrecalculationsK<K> Precalculations;
         
         template<bool robust>        
-        static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
         {
           assert(num == 1);
           RTCGeometryType ty = (RTCGeometryType)(*prim);
@@ -136,7 +136,7 @@ namespace embree
         }
         
         template<bool robust>        
-        static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
         {
           assert(num == 1);
           RTCGeometryType ty = (RTCGeometryType)(*prim);
@@ -153,7 +153,7 @@ namespace embree
         }
         
         template<int N, bool robust>              
-        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
         {
           assert(num == 1);
           RTCGeometryType ty = (RTCGeometryType)(*prim);
@@ -163,7 +163,7 @@ namespace embree
         }
         
         template<int N, bool robust>      
-        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
         {
           assert(num == 1);
           RTCGeometryType ty = (RTCGeometryType)(*prim);
diff --git a/thirdparty/embree/kernels/geometry/disc_intersector.h b/thirdparty/embree/kernels/geometry/disc_intersector.h
index ec6fa9c4f3..2997d36202 100644
--- a/thirdparty/embree/kernels/geometry/disc_intersector.h
+++ b/thirdparty/embree/kernels/geometry/disc_intersector.h
@@ -23,18 +23,24 @@ namespace embree
 
       __forceinline void finalize() {}
 
-      __forceinline Vec2f uv(const size_t i) const
-      {
+      __forceinline Vec2f uv(const size_t i) const {
         return Vec2f(vu[i], vv[i]);
       }
-      __forceinline float t(const size_t i) const
-      {
+      __forceinline Vec2vf<M> uv() const {
+        return Vec2vf<M>(vu, vv);
+      }
+      __forceinline float t(const size_t i) const {
         return vt[i];
       }
-      __forceinline Vec3fa Ng(const size_t i) const
-      {
+      __forceinline vfloat<M> t() const {
+        return vt;
+      }
+      __forceinline Vec3fa Ng(const size_t i) const {
         return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]);
       }
+      __forceinline Vec3vf<M> Ng() const { 
+        return vNg;
+      }
 
      public:
       vfloat<M> vu;
@@ -43,16 +49,45 @@ namespace embree
       Vec3vf<M> vNg;
     };
 
+    template<>
+    struct DiscIntersectorHitM<1>
+    {
+      __forceinline DiscIntersectorHitM() {}
+
+      __forceinline DiscIntersectorHitM(const float& u, const float& v, const float& t, const Vec3fa& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng) {}
+
+      __forceinline void finalize() {}
+
+      __forceinline Vec2f uv() const {
+        return Vec2f(vu, vv);
+      }
+
+      __forceinline float t() const {
+        return vt;
+      }
+
+      __forceinline Vec3fa Ng() const { 
+        return vNg;
+      }
+
+     public:
+      float vu;
+      float vv;
+      float vt;
+      Vec3fa vNg;
+    };
+
     template<int M>
     struct DiscIntersector1
     {
       typedef CurvePrecalculations1 Precalculations;
 
-      template<typename Epilog>
+      template<typename Ray, typename Epilog>
       static __forceinline bool intersect(
           const vbool<M>& valid_i,
           Ray& ray,
-          IntersectContext* context,
+          RayQueryContext* context,
           const Points* geom,
           const Precalculations& pre,
           const Vec4vf<M>& v0i,
@@ -97,10 +132,10 @@ namespace embree
         return epilog(valid, hit);
       }
 
-      template<typename Epilog>
+      template<typename Ray, typename Epilog>
       static __forceinline bool intersect(const vbool<M>& valid_i,
                                           Ray& ray,
-                                          IntersectContext* context,
+                                          RayQueryContext* context,
                                           const Points* geom,
                                           const Precalculations& pre,
                                           const Vec4vf<M>& v0i,
@@ -145,7 +180,7 @@ namespace embree
       static __forceinline bool intersect(const vbool<M>& valid_i,
                                           RayK<K>& ray,
                                           size_t k,
-                                          IntersectContext* context,
+                                          RayQueryContext* context,
                                           const Points* geom,
                                           const Precalculations& pre,
                                           const Vec4vf<M>& v0i,
@@ -194,7 +229,7 @@ namespace embree
       static __forceinline bool intersect(const vbool<M>& valid_i,
                                           RayK<K>& ray,
                                           size_t k,
-                                          IntersectContext* context,
+                                          RayQueryContext* context,
                                           const Points* geom,
                                           const Precalculations& pre,
                                           const Vec4vf<M>& v0i,
diff --git a/thirdparty/embree/kernels/geometry/disci_intersector.h b/thirdparty/embree/kernels/geometry/disci_intersector.h
index bb9d396f6e..1ae4a66330 100644
--- a/thirdparty/embree/kernels/geometry/disci_intersector.h
+++ b/thirdparty/embree/kernels/geometry/disci_intersector.h
@@ -19,7 +19,7 @@ namespace embree
 
       static __forceinline void intersect(const Precalculations& pre,
                                           RayHit& ray,
-                                          IntersectContext* context,
+                                          RayQueryContext* context,
                                           const Primitive& Disc)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
@@ -32,7 +32,7 @@ namespace embree
 
       static __forceinline bool occluded(const Precalculations& pre,
                                          Ray& ray,
-                                         IntersectContext* context,
+                                         RayQueryContext* context,
                                          const Primitive& Disc)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
@@ -52,7 +52,7 @@ namespace embree
 
       static __forceinline void intersect(const Precalculations& pre,
                                           RayHit& ray,
-                                          IntersectContext* context,
+                                          RayQueryContext* context,
                                           const Primitive& Disc)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
@@ -65,7 +65,7 @@ namespace embree
 
       static __forceinline bool occluded(const Precalculations& pre,
                                          Ray& ray,
-                                         IntersectContext* context,
+                                         RayQueryContext* context,
                                          const Primitive& Disc)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
@@ -84,7 +84,7 @@ namespace embree
       typedef CurvePrecalculationsK<K> Precalculations;
 
       static __forceinline void intersect(
-          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& Disc)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(Disc.geomID());
@@ -96,7 +96,7 @@ namespace embree
       }
 
       static __forceinline bool occluded(
-          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+          const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& Disc)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(Disc.geomID());
@@ -115,7 +115,7 @@ namespace embree
       typedef CurvePrecalculationsK<K> Precalculations;
 
       static __forceinline void intersect(
-          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& Disc)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(Disc.geomID());
@@ -127,7 +127,7 @@ namespace embree
       }
 
       static __forceinline bool occluded(
-          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+          const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& Disc)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(Disc.geomID());
@@ -146,7 +146,7 @@ namespace embree
 
       static __forceinline void intersect(const Precalculations& pre,
                                           RayHit& ray,
-                                          IntersectContext* context,
+                                          RayQueryContext* context,
                                           const Primitive& Disc)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
@@ -160,7 +160,7 @@ namespace embree
 
       static __forceinline bool occluded(const Precalculations& pre,
                                          Ray& ray,
-                                         IntersectContext* context,
+                                         RayQueryContext* context,
                                          const Primitive& Disc)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
@@ -181,7 +181,7 @@ namespace embree
 
       static __forceinline void intersect(const Precalculations& pre,
                                           RayHit& ray,
-                                          IntersectContext* context,
+                                          RayQueryContext* context,
                                           const Primitive& Disc)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
@@ -195,7 +195,7 @@ namespace embree
 
       static __forceinline bool occluded(const Precalculations& pre,
                                          Ray& ray,
-                                         IntersectContext* context,
+                                         RayQueryContext* context,
                                          const Primitive& Disc)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
@@ -215,7 +215,7 @@ namespace embree
       typedef CurvePrecalculationsK<K> Precalculations;
 
       static __forceinline void intersect(
-          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& Disc)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(Disc.geomID());
@@ -228,7 +228,7 @@ namespace embree
       }
 
       static __forceinline bool occluded(
-          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+          const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& Disc)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(Disc.geomID());
@@ -248,7 +248,7 @@ namespace embree
       typedef CurvePrecalculationsK<K> Precalculations;
 
       static __forceinline void intersect(
-          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& Disc)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(Disc.geomID());
@@ -261,7 +261,7 @@ namespace embree
       }
 
       static __forceinline bool occluded(
-          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+          const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& Disc)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(Disc.geomID());
diff --git a/thirdparty/embree/kernels/geometry/filter.h b/thirdparty/embree/kernels/geometry/filter.h
index d64320bf78..773c2bfeb8 100644
--- a/thirdparty/embree/kernels/geometry/filter.h
+++ b/thirdparty/embree/kernels/geometry/filter.h
@@ -12,20 +12,20 @@ namespace embree
 {
   namespace isa
   {
-    __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, RayQueryContext* context)
     {
       if (geometry->intersectionFilterN)
       {
-        assert(context->scene->hasGeometryFilterFunction());
         geometry->intersectionFilterN(args);
 
         if (args->valid[0] == 0)
           return false;
       }
-            
-      if (context->user->filter) {
-        assert(context->scene->hasContextFilterFunction());
-        context->user->filter(args);
+
+      if (context->getFilter())
+      {
+        if (context->enforceArgumentFilterFunction() || geometry->hasArgumentFilterFunctions())
+          context->getFilter()(args);
 
         if (args->valid[0] == 0)
           return false;
@@ -35,7 +35,7 @@ namespace embree
       return true;
     }
     
-    __forceinline bool runIntersectionFilter1(const Geometry* const geometry, RayHit& ray, IntersectContext* context, Hit& hit)
+    __forceinline bool runIntersectionFilter1(const Geometry* const geometry, RayHit& ray, RayQueryContext* context, Hit& hit)
     {
       RTCFilterFunctionNArguments args;
       int mask = -1;
@@ -48,39 +48,29 @@ namespace embree
       return runIntersectionFilter1Helper(&args,geometry,context);
     }
 
-    __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
-    {
-#if defined(EMBREE_FILTER_FUNCTION)
-      if (args->geometry->intersectionFilterN)
-        args->geometry->intersectionFilterN(filter_args);
-      
-      if (args->context->filter)
-        args->context->filter(filter_args);
-#endif
-    }
-    
-    __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, RayQueryContext* context)
     {
       if (geometry->occlusionFilterN)
       {
-        assert(context->scene->hasGeometryFilterFunction());
         geometry->occlusionFilterN(args);
 
         if (args->valid[0] == 0)
           return false;
       }
-      
-      if (context->user->filter) {
-        assert(context->scene->hasContextFilterFunction());
-        context->user->filter(args);
+
+      if (context->getFilter())
+      {
+        if (context->enforceArgumentFilterFunction() || geometry->hasArgumentFilterFunctions())
+          context->getFilter()(args);
 
         if (args->valid[0] == 0)
           return false;
       }
+
       return true;
     }
 
-    __forceinline bool runOcclusionFilter1(const Geometry* const geometry, Ray& ray, IntersectContext* context, Hit& hit)
+    __forceinline bool runOcclusionFilter1(const Geometry* const geometry, Ray& ray, RayQueryContext* context, Hit& hit)
     {
       RTCFilterFunctionNArguments args;
       int mask = -1;
@@ -93,33 +83,19 @@ namespace embree
       return runOcclusionFilter1Helper(&args,geometry,context);
     }
 
-    __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
-    {
-#if defined(EMBREE_FILTER_FUNCTION)
-      if (args->geometry->occlusionFilterN)
-        args->geometry->occlusionFilterN(filter_args);
-      
-      if (args->context->filter)
-        args->context->filter(filter_args);
-#endif
-    }
-
     template<int K>
-      __forceinline vbool<K> runIntersectionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+      __forceinline vbool<K> runIntersectionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, RayQueryContext* context)
     {
       vint<K>* mask = (vint<K>*) args->valid;
       if (geometry->intersectionFilterN)
-      {
-        assert(context->scene->hasGeometryFilterFunction());
         geometry->intersectionFilterN(args);
-      }
-
+      
       vbool<K> valid_o = *mask != vint<K>(zero);
       if (none(valid_o)) return valid_o;
 
-      if (context->user->filter) {
-        assert(context->scene->hasContextFilterFunction());
-        context->user->filter(args);
+      if (context->getFilter()) {
+        if (context->enforceArgumentFilterFunction() || geometry->hasArgumentFilterFunctions())
+          context->getFilter()(args);
       }
 
       valid_o = *mask != vint<K>(zero);
@@ -130,7 +106,7 @@ namespace embree
     }
     
     template<int K>
-    __forceinline vbool<K> runIntersectionFilter(const vbool<K>& valid, const Geometry* const geometry, RayHitK<K>& ray, IntersectContext* context, HitK<K>& hit)
+    __forceinline vbool<K> runIntersectionFilter(const vbool<K>& valid, const Geometry* const geometry, RayHitK<K>& ray, RayQueryContext* context, HitK<K>& hit)
     {
       RTCFilterFunctionNArguments args;
       vint<K> mask = valid.mask32();
@@ -144,24 +120,19 @@ namespace embree
     }
 
     template<int K>
-      __forceinline vbool<K> runOcclusionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+      __forceinline vbool<K> runOcclusionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, RayQueryContext* context)
     {
       vint<K>* mask = (vint<K>*) args->valid;
       if (geometry->occlusionFilterN)
-      {
-        assert(context->scene->hasGeometryFilterFunction());
         geometry->occlusionFilterN(args);
-      }
-
-      vbool<K> valid_o = *mask != vint<K>(zero);
       
+      vbool<K> valid_o = *mask != vint<K>(zero);
       if (none(valid_o)) return valid_o;
 
-      if (context->user->filter) {
-        assert(context->scene->hasContextFilterFunction());
-        context->user->filter(args);
+      if (context->getFilter()) {
+        if (context->enforceArgumentFilterFunction() || geometry->hasArgumentFilterFunctions())
+          context->getFilter()(args);
       }
-
       valid_o = *mask != vint<K>(zero);
 
       RayK<K>* ray = (RayK<K>*) args->ray;
@@ -170,7 +141,7 @@ namespace embree
     }
 
     template<int K>
-      __forceinline vbool<K> runOcclusionFilter(const vbool<K>& valid, const Geometry* const geometry, RayK<K>& ray, IntersectContext* context, HitK<K>& hit)
+      __forceinline vbool<K> runOcclusionFilter(const vbool<K>& valid, const Geometry* const geometry, RayK<K>& ray, RayQueryContext* context, HitK<K>& hit)
     {
       RTCFilterFunctionNArguments args;
       vint<K> mask = valid.mask32();
diff --git a/thirdparty/embree/kernels/geometry/filter_sycl.h b/thirdparty/embree/kernels/geometry/filter_sycl.h
new file mode 100644
index 0000000000..00f333134b
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/filter_sycl.h
@@ -0,0 +1,109 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+#include "../common/ray.h"
+#include "../common/hit.h"
+#include "../common/context.h"
+
+namespace embree
+{
+  __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, int& mask, const Geometry* const geometry, RayQueryContext* context)
+  {
+    typedef void (*RTCFilterFunctionSYCL)(const void* args);
+    const RTCFeatureFlags feature_mask MAYBE_UNUSED = context->args->feature_mask;
+    
+#if EMBREE_SYCL_GEOMETRY_CALLBACK
+    if (feature_mask & RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_GEOMETRY)
+    {
+      RTCFilterFunctionSYCL gfilter = (RTCFilterFunctionSYCL) geometry->intersectionFilterN;
+      if (gfilter)
+      {
+        gfilter(args);
+        
+        if (mask == 0)
+          return false;
+      }
+    }
+#endif
+
+    if (feature_mask & RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS)
+    {
+      RTCFilterFunctionSYCL cfilter = (RTCFilterFunctionSYCL) context->args->filter;
+      if (cfilter)
+      {
+        if (context->enforceArgumentFilterFunction() || geometry->hasArgumentFilterFunctions())
+          cfilter(args);
+        
+        if (mask == 0)
+          return false;
+      }
+    }
+    
+    return true;
+  }
+
+  __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, int& mask, const Geometry* const geometry, RayQueryContext* context)
+  {
+    typedef void (*RTCFilterFunctionSYCL)(const void* args);
+    const RTCFeatureFlags feature_mask MAYBE_UNUSED = context->args->feature_mask;
+    
+#if EMBREE_SYCL_GEOMETRY_CALLBACK
+    if (feature_mask & RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_GEOMETRY)
+    {
+      RTCFilterFunctionSYCL gfilter = (RTCFilterFunctionSYCL) geometry->occlusionFilterN;
+      if (gfilter)
+      {
+        gfilter(args);
+        
+        if (mask == 0)
+          return false;
+      }
+    }
+#endif
+
+    if (feature_mask & RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS)
+    {
+      RTCFilterFunctionSYCL cfilter = (RTCFilterFunctionSYCL) context->args->filter;
+      if (cfilter)
+      {
+        if (context->enforceArgumentFilterFunction() || geometry->hasArgumentFilterFunctions())
+          cfilter(args);
+        
+        if (mask == 0)
+          return false;
+      }
+    }
+
+    return true;
+  }
+  
+  __forceinline bool runIntersectionFilter1SYCL(Geometry* geometry, RayHit& ray, sycl::private_ptr<RayQueryContext> context, Hit& hit)
+  {
+    RTCFilterFunctionNArguments args;
+    int mask = -1;
+    args.valid = &mask;
+    args.geometryUserPtr = geometry->userPtr;
+    args.context = context->user;
+    args.ray = (RTCRayN*) &ray;
+    args.hit = (RTCHitN*) &hit;
+    args.N = 1;
+    return runIntersectionFilter1Helper(&args,mask,geometry,context);
+  }
+
+
+  __forceinline bool runIntersectionFilter1SYCL(Geometry* geometry, Ray& ray, sycl::private_ptr<RayQueryContext> context, Hit& hit)
+  {
+    RTCFilterFunctionNArguments args;
+    int mask = -1;
+    args.valid = &mask;
+    args.geometryUserPtr = geometry->userPtr;
+    args.context = context->user;
+    args.ray = (RTCRayN*) &ray;
+    args.hit = (RTCHitN*) &hit;
+    args.N = 1;
+    return runOcclusionFilter1Helper(&args,mask,geometry,context);
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/grid_intersector.h b/thirdparty/embree/kernels/geometry/grid_intersector.h
index 9c59cef119..9837205445 100644
--- a/thirdparty/embree/kernels/geometry/grid_intersector.h
+++ b/thirdparty/embree/kernels/geometry/grid_intersector.h
@@ -35,20 +35,20 @@ namespace embree
       typedef Grid1Precalculations<GridSOAIntersector1::Precalculations> Precalculations;
 
       /*! Intersect a ray with the primitive. */
-      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) 
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) 
       {
         GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node);
       }
-      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, RayQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
         intersect(pre,ray,context,prim,ty,lazy_node);
       }
       
       /*! Test if the ray is occluded by the primitive */
-      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
       {
         GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node);
       }
-      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, RayQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
         return occluded(pre,ray,context,prim,ty,lazy_node);
       }
       
@@ -70,22 +70,22 @@ namespace embree
       typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations;
       
       
-      static __forceinline void intersect(const vbool<K>& valid, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      static __forceinline void intersect(const vbool<K>& valid, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
       {
         GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
       }
       
-      static __forceinline vbool<K> occluded(const vbool<K>& valid, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
       {
         GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
       }
       
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
       {
         GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
       }
       
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
       {
         GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
       }
diff --git a/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h b/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h
index 8fbf0d4bdf..6d56bd0404 100644
--- a/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h
+++ b/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h
@@ -30,7 +30,7 @@ namespace embree
       
       template<typename Loader>
         static __forceinline void intersect(RayHit& ray,
-                                            IntersectContext* context, 
+                                            RayQueryContext* context, 
                                             const float* const grid_x,
                                             const size_t line_offset,
                                             const size_t lines,
@@ -50,7 +50,7 @@ namespace embree
       
       template<typename Loader>
         static __forceinline bool occluded(Ray& ray,
-                                           IntersectContext* context, 
+                                           RayQueryContext* context, 
                                            const float* const grid_x,
                                            const size_t line_offset,
                                            const size_t lines,
@@ -71,7 +71,7 @@ namespace embree
       }
       
       /*! Intersect a ray with the primitive. */
-      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) 
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t& lazy_node) 
       {
         const size_t line_offset   = pre.grid->width;
         const size_t lines         = pre.grid->height;
@@ -87,7 +87,7 @@ namespace embree
       }
       
       /*! Test if the ray is occluded by the primitive */
-      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         const size_t line_offset   = pre.grid->width;
         const size_t lines         = pre.grid->height;
@@ -112,7 +112,7 @@ namespace embree
       
       template<typename Loader>
         static __forceinline void intersect(RayHit& ray, const float ftime,
-                                            IntersectContext* context, 
+                                            RayQueryContext* context, 
                                             const float* const grid_x,
                                             const size_t line_offset,
                                             const size_t lines,
@@ -142,7 +142,7 @@ namespace embree
       
       template<typename Loader>
         static __forceinline bool occluded(Ray& ray, const float ftime,
-                                           IntersectContext* context, 
+                                           RayQueryContext* context, 
                                            const float* const grid_x,
                                            const size_t line_offset,
                                            const size_t lines,
@@ -171,7 +171,7 @@ namespace embree
       }
       
       /*! Intersect a ray with the primitive. */
-      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) 
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t& lazy_node) 
       { 
         const size_t line_offset   = pre.grid->width;
         const size_t lines         = pre.grid->height;
@@ -187,7 +187,7 @@ namespace embree
       }
       
       /*! Test if the ray is occluded by the primitive */
-      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         const size_t line_offset   = pre.grid->width;
         const size_t lines         = pre.grid->height;
diff --git a/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h b/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h
index 14cacab5fe..5e5a24b7dd 100644
--- a/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h
+++ b/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h
@@ -79,7 +79,7 @@ namespace embree
       };
 
       /*! Intersect a ray with the primitive. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         const size_t dim_offset    = pre.grid->dim_offset;
         const size_t line_offset   = pre.grid->width;
@@ -110,7 +110,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by the primitive */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         const size_t dim_offset    = pre.grid->dim_offset;
         const size_t line_offset   = pre.grid->width;
@@ -146,7 +146,7 @@ namespace embree
 
       template<typename Loader>
         static __forceinline void intersect(RayHitK<K>& ray, size_t k,
-                                            IntersectContext* context,
+                                            RayQueryContext* context,
                                             const float* const grid_x,
                                             const size_t line_offset,
                                             const size_t lines,
@@ -163,7 +163,7 @@ namespace embree
 
       template<typename Loader>
         static __forceinline bool occluded(RayK<K>& ray, size_t k,
-                                           IntersectContext* context,
+                                           RayQueryContext* context,
                                            const float* const grid_x,
                                            const size_t line_offset,
                                            const size_t lines,
@@ -179,7 +179,7 @@ namespace embree
       }
 
       /*! Intersect a ray with the primitive. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         const size_t line_offset   = pre.grid->width;
         const size_t lines         = pre.grid->height;
@@ -194,7 +194,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by the primitive */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         const size_t line_offset   = pre.grid->width;
         const size_t lines         = pre.grid->height;
@@ -219,7 +219,7 @@ namespace embree
       typedef typename GridSOAIntersectorK<K>::Precalculations Precalculations;
 
       /*! Intersect a ray with the primitive. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         vfloat<K> vftime;
         vint<K> vitime = getTimeSegment<K>(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime);
@@ -235,7 +235,7 @@ namespace embree
       }
 
       /*! Intersect a ray with the primitive. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, const vfloat<K>& ftime, int itime, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         const size_t grid_offset   = pre.grid->gridBytes >> 2;
         const size_t dim_offset    = pre.grid->dim_offset;
@@ -279,7 +279,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by the primitive */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         vfloat<K> vftime;
         vint<K> vitime = getTimeSegment<K>(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime);
@@ -297,7 +297,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by the primitive */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, const vfloat<K>& ftime, int itime, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         const size_t grid_offset   = pre.grid->gridBytes >> 2;
         const size_t dim_offset    = pre.grid->dim_offset;
@@ -347,7 +347,7 @@ namespace embree
       template<typename Loader>
         static __forceinline void intersect(RayHitK<K>& ray, size_t k,
                                             const float ftime,
-                                            IntersectContext* context,
+                                            RayQueryContext* context,
                                             const float* const grid_x,
                                             const size_t line_offset,
                                             const size_t lines,
@@ -376,7 +376,7 @@ namespace embree
       template<typename Loader>
         static __forceinline bool occluded(RayK<K>& ray, size_t k,
                                            const float ftime,
-                                           IntersectContext* context,
+                                           RayQueryContext* context,
                                            const float* const grid_x,
                                            const size_t line_offset,
                                            const size_t lines,
@@ -403,7 +403,7 @@ namespace embree
       }
 
       /*! Intersect a ray with the primitive. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       { 
         float ftime;
         int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime);
@@ -422,7 +422,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by the primitive */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         float ftime;
         int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime);
diff --git a/thirdparty/embree/kernels/geometry/instance_array.h b/thirdparty/embree/kernels/geometry/instance_array.h
new file mode 100644
index 0000000000..fdbe79b3a8
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/instance_array.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene_instance_array.h"
+
+namespace embree
+{
+  //template<int M>
+  struct InstanceArrayPrimitive // record holding one (instID_, primID_) pair; members are declared at the bottom
+  {
+    struct Type : public PrimitiveType // descriptor for this primitive; its methods are only declared here
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored primitives (always 1 for this type) */
+    static __forceinline size_t max_size() { return 1; }
+
+    /* Returns required number of primitive blocks for N primitives (one block each, so N) */
+    static __forceinline size_t blocks(size_t N) { return N; }
+
+  public:
+    /* Stores geomID as instID_ and primID as primID_ */
+    InstanceArrayPrimitive (const uint32_t geomID, const uint32_t primID)
+    : primID_(primID)
+    , instID_(geomID)
+    {}
+    /* True unless primID_ equals -1 converted to unsigned int (all bits set) */
+    __forceinline bool valid() const { 
+      return primID_ != -1;
+    }
+    /* Asserts that exactly one PrimRef is in [i,end), reads prims[i], advances i, and re-constructs *this in place; scene is unused */
+    void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene)
+    {
+      assert(end-i == 1);
+      const PrimRef& prim = prims[i]; i++;
+      const unsigned int geomID = prim.geomID();
+      const unsigned int primID = prim.primID();
+      new (this) InstanceArrayPrimitive(geomID, primID);
+    }
+    /* Same steps as fill(), then returns linearBounds(primID,itime) of the InstanceArray fetched from scene by geomID */
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime)
+    {
+      assert(end-i == 1);
+      const PrimRef& prim = prims[i]; i++;
+      const unsigned int geomID = prim.geomID();
+      const size_t primID = prim.primID(); // size_t here (unsigned int in fill()); converted to uint32_t by the constructor call below
+      new (this) InstanceArrayPrimitive(geomID, primID);
+      const InstanceArray* instanceArray = scene->get<InstanceArray>(geomID);
+      return instanceArray->linearBounds(primID,itime);
+    }
+    /* PrimRefMB variant: same steps, but the returned linear bounds are queried for time_range instead of a time step index */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      assert(end-i == 1);
+      const PrimRefMB& prim = prims[i]; i++;
+      const unsigned int geomID = prim.geomID();
+      const size_t primID = prim.primID(); // size_t here as well; converted to uint32_t by the constructor call below
+      new (this) InstanceArrayPrimitive(geomID, primID);
+      const InstanceArray* instanceArray = scene->get<InstanceArray>(geomID);
+      return instanceArray->linearBounds(primID,time_range);
+    }
+
+    /* Updates the primitive: returns instanceArray->bounds(0). NOTE(review): index 0 is hard-coded and primID_ is not used - confirm intended */
+    __forceinline BBox3fa update(InstanceArray* instanceArray) {
+      return instanceArray->bounds(0);
+    }
+
+  public:
+    unsigned int primID_; // primitive ID taken from the PrimRef; valid() treats (unsigned)-1 as invalid
+    unsigned int instID_; // geometry ID used to look up the InstanceArray in the scene (set from geomID)
+  };
+}
diff --git a/thirdparty/embree/kernels/geometry/instance_array_intersector.h b/thirdparty/embree/kernels/geometry/instance_array_intersector.h
new file mode 100644
index 0000000000..c32c8b4cde
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/instance_array_intersector.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "instance_array.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct InstanceArrayIntersector1 // single-ray (Ray / RayHit) entry points for InstanceArrayPrimitive
+    {
+      typedef InstanceArrayPrimitive Primitive;
+      /* No per-ray state is precomputed: the constructor ignores both arguments */
+      struct Precalculations {
+        __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+      };
+      /* Declarations only; the definitions are not part of this header */
+      static void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim);
+      static bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim);
+      static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
+    };
+
+    struct InstanceArrayIntersector1MB // same declared interface as InstanceArrayIntersector1; "MB" presumably means motion blur - confirm
+    {
+      typedef InstanceArrayPrimitive Primitive;
+      /* No per-ray state is precomputed: the constructor ignores both arguments */
+      struct Precalculations {
+        __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+      };
+      /* Declarations only; the definitions are not part of this header */
+      static void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim);
+      static bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim);
+      static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
+    };
+
+    template<int K>
+      struct InstanceArrayIntersectorK // entry points for K-wide ray packets (RayK<K> / RayHitK<K>) on InstanceArrayPrimitive
+    {
+      typedef InstanceArrayPrimitive Primitive;
+      /* No per-packet state is precomputed: the constructor ignores both arguments */
+      struct Precalculations {
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+      };
+      /* Packet overloads taking a vbool<K> valid_i; declared only, the definitions are not part of this header */
+      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive& prim);
+      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive& prim);
+      /* Ray k of the packet: forwards to the packet overload with vbool<K>(1<<int(k)), presumably a mask selecting only ray k - confirm */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& prim) {
+        intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
+      }
+      /* Ray k: the packet overload's return value is discarded; the result is ray.tfar[k] < 0, presumably set by the packet path on occlusion - confirm */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& prim) {
+        occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
+        return ray.tfar[k] < 0.0f; 
+      }
+    };
+
+    template<int K>
+      struct InstanceArrayIntersectorKMB // same declared interface as InstanceArrayIntersectorK; "MB" presumably means motion blur - confirm
+    {
+      typedef InstanceArrayPrimitive Primitive;
+      /* No per-packet state is precomputed: the constructor ignores both arguments */
+      struct Precalculations {
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+      };
+      /* Packet overloads taking a vbool<K> valid_i; declared only, the definitions are not part of this header */
+      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive& prim);
+      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive& prim);
+      /* Ray k of the packet: forwards to the packet overload with vbool<K>(1<<int(k)), presumably a mask selecting only ray k - confirm */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& prim) {
+        intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
+      }
+      /* Ray k: the packet overload's return value is discarded; the result is ray.tfar[k] < 0, presumably set by the packet path on occlusion - confirm */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& prim) {
+        occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
+        return ray.tfar[k] < 0.0f; 
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/instance_intersector.h b/thirdparty/embree/kernels/geometry/instance_intersector.h
index 28a7b728e5..45c3d36c2d 100644
--- a/thirdparty/embree/kernels/geometry/instance_intersector.h
+++ b/thirdparty/embree/kernels/geometry/instance_intersector.h
@@ -19,8 +19,8 @@ namespace embree
         __forceinline Precalculations (const Ray& ray, const void *ptr) {}
       };
       
-      static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim);
-      static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim);
+      static void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim);
+      static bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim);
       static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
     };
 
@@ -32,8 +32,8 @@ namespace embree
         __forceinline Precalculations (const Ray& ray, const void *ptr) {}
       };
       
-      static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim);
-      static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim);
+      static void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim);
+      static bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim);
       static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
     };
 
@@ -46,14 +46,14 @@ namespace embree
         __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
       };
       
-      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim);
-      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim);
+      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive& prim);
+      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive& prim);
 
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& prim) {
         intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
       }
       
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& prim) {
         occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
         return ray.tfar[k] < 0.0f; 
       }
@@ -68,14 +68,14 @@ namespace embree
         __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
       };
       
-      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim);
-      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim);
+      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive& prim);
+      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive& prim);
 
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& prim) {
         intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
       }
       
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& prim) {
         occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
         return ray.tfar[k] < 0.0f; 
       }
diff --git a/thirdparty/embree/kernels/geometry/intersector_epilog.h b/thirdparty/embree/kernels/geometry/intersector_epilog.h
index 7bf134cc54..af85b0e57d 100644
--- a/thirdparty/embree/kernels/geometry/intersector_epilog.h
+++ b/thirdparty/embree/kernels/geometry/intersector_epilog.h
@@ -21,12 +21,12 @@ namespace embree
     struct Intersect1Epilog1
     {
       RayHit& ray;
-      IntersectContext* context;
+      RayQueryContext* context;
       const unsigned int geomID;
       const unsigned int primID;
 
       __forceinline Intersect1Epilog1(RayHit& ray,
-                                      IntersectContext* context,
+                                      RayQueryContext* context,
                                       const unsigned int geomID,
                                       const unsigned int primID)
         : ray(ray), context(context), geomID(geomID), primID(primID) {}
@@ -64,6 +64,9 @@ namespace embree
         ray.primID = primID;
         ray.geomID = geomID;
         instance_id_stack::copy_UU(context->user->instID, ray.instID);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instance_id_stack::copy_UU(context->user->instPrimID, ray.instPrimID);
+#endif
         return true;
       }
     };
@@ -72,12 +75,12 @@ namespace embree
     struct Occluded1Epilog1
     {
       Ray& ray;
-      IntersectContext* context;
+      RayQueryContext* context;
       const unsigned int geomID;
       const unsigned int primID;
 
       __forceinline Occluded1Epilog1(Ray& ray,
-                                     IntersectContext* context,
+                                     RayQueryContext* context,
                                      const unsigned int geomID,
                                      const unsigned int primID)
         : ray(ray), context(context), geomID(geomID), primID(primID) {}
@@ -117,12 +120,12 @@ namespace embree
     {
       RayHitK<K>& ray;
       size_t k;
-      IntersectContext* context;
+      RayQueryContext* context;
       const unsigned int geomID;
       const unsigned int primID;
 
       __forceinline Intersect1KEpilog1(RayHitK<K>& ray, size_t k,
-                                       IntersectContext* context,
+                                       RayQueryContext* context,
                                        const unsigned int geomID,
                                        const unsigned int primID)
         : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
@@ -163,6 +166,9 @@ namespace embree
         ray.primID[k] = primID;
         ray.geomID[k] = geomID;
         instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instance_id_stack::copy_UV<K>(context->user->instPrimID, ray.instPrimID, k);
+#endif
         return true;
       }
     };
@@ -172,12 +178,12 @@ namespace embree
     {
       RayK<K>& ray;
       size_t k;
-      IntersectContext* context;
+      RayQueryContext* context;
       const unsigned int geomID;
       const unsigned int primID;
 
       __forceinline Occluded1KEpilog1(RayK<K>& ray, size_t k,
-                                      IntersectContext* context,
+                                      RayQueryContext* context,
                                       const unsigned int geomID,
                                       const unsigned int primID)
         : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
@@ -215,12 +221,12 @@ namespace embree
     struct Intersect1EpilogM
     {
       RayHit& ray;
-      IntersectContext* context;
+      RayQueryContext* context;
       const vuint<M>& geomIDs;
       const vuint<M>& primIDs;
 
       __forceinline Intersect1EpilogM(RayHit& ray,
-                                      IntersectContext* context,
+                                      RayQueryContext* context,
                                       const vuint<M>& geomIDs,
                                       const vuint<M>& primIDs)
         : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
@@ -287,6 +293,9 @@ namespace embree
         ray.primID = primIDs[i];
         ray.geomID = geomID;
         instance_id_stack::copy_UU(context->user->instID, ray.instID);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instance_id_stack::copy_UU(context->user->instPrimID, ray.instPrimID);
+#endif
         return true;
 
       }
@@ -296,12 +305,12 @@ namespace embree
     struct Occluded1EpilogM
     {
       Ray& ray;
-      IntersectContext* context;
+      RayQueryContext* context;
       const vuint<M>& geomIDs;
       const vuint<M>& primIDs;
 
       __forceinline Occluded1EpilogM(Ray& ray,
-                                     IntersectContext* context,
+                                     RayQueryContext* context,
                                      const vuint<M>& geomIDs,
                                      const vuint<M>& primIDs)
         : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
@@ -363,12 +372,12 @@ namespace embree
     struct Intersect1EpilogMU
     {
       RayHit& ray;
-      IntersectContext* context;
+      RayQueryContext* context;
       const unsigned int geomID;
       const unsigned int primID;
 
       __forceinline Intersect1EpilogMU(RayHit& ray,
-                                       IntersectContext* context,
+                                       RayQueryContext* context,
                                        const unsigned int geomID,
                                        const unsigned int primID)
         : ray(ray), context(context), geomID(geomID), primID(primID) {}
@@ -424,6 +433,9 @@ namespace embree
         ray.primID = primID;
         ray.geomID = geomID;
         instance_id_stack::copy_UU(context->user->instID, ray.instID);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instance_id_stack::copy_UU(context->user->instPrimID, ray.instPrimID);
+#endif
         return true;
       }
     };
@@ -432,12 +444,12 @@ namespace embree
     struct Occluded1EpilogMU
     {
       Ray& ray;
-      IntersectContext* context;
+      RayQueryContext* context;
       const unsigned int geomID;
       const unsigned int primID;
 
       __forceinline Occluded1EpilogMU(Ray& ray,
-                                      IntersectContext* context,
+                                      RayQueryContext* context,
                                       const unsigned int geomID,
                                       const unsigned int primID)
         : ray(ray), context(context), geomID(geomID), primID(primID) {}
@@ -477,13 +489,13 @@ namespace embree
     struct IntersectKEpilogM
     {
       RayHitK<K>& ray;
-      IntersectContext* context;
+      RayQueryContext* context;
       const vuint<M>& geomIDs;
       const vuint<M>& primIDs;
       const size_t i;
 
       __forceinline IntersectKEpilogM(RayHitK<K>& ray,
-                                      IntersectContext* context,
+                                      RayQueryContext* context,
                                      const vuint<M>& geomIDs,
                                      const vuint<M>& primIDs,
                                      size_t i)
@@ -534,6 +546,9 @@ namespace embree
         vuint<K>::store(valid,&ray.primID,primID);
         vuint<K>::store(valid,&ray.geomID,geomID);
         instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, valid);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instance_id_stack::copy_UV<K>(context->user->instPrimID, ray.instPrimID, valid);
+#endif
         return valid;
       }
     };
@@ -543,14 +558,14 @@ namespace embree
     {
       vbool<K>& valid0;
       RayK<K>& ray;
-      IntersectContext* context;
+      RayQueryContext* context;
       const vuint<M>& geomIDs;
       const vuint<M>& primIDs;
       const size_t i;
 
       __forceinline OccludedKEpilogM(vbool<K>& valid0,
                                      RayK<K>& ray,
-                                     IntersectContext* context,
+                                     RayQueryContext* context,
                                      const vuint<M>& geomIDs,
                                      const vuint<M>& primIDs,
                                      size_t i)
@@ -598,12 +613,12 @@ namespace embree
     struct IntersectKEpilogMU
     {
       RayHitK<K>& ray;
-      IntersectContext* context;
+      RayQueryContext* context;
       const unsigned int geomID;
       const unsigned int primID;
 
       __forceinline IntersectKEpilogMU(RayHitK<K>& ray,
-                                       IntersectContext* context,
+                                       RayQueryContext* context,
                                        const unsigned int geomID,
                                        const unsigned int primID)
         : ray(ray), context(context), geomID(geomID), primID(primID) {}
@@ -649,6 +664,9 @@ namespace embree
         vuint<K>::store(valid,&ray.primID,primID);
         vuint<K>::store(valid,&ray.geomID,geomID);
         instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, valid);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instance_id_stack::copy_UV<K>(context->user->instPrimID, ray.instPrimID, valid);
+#endif
         return valid;
       }
     };
@@ -658,13 +676,13 @@ namespace embree
     {
       vbool<K>& valid0;
       RayK<K>& ray;
-      IntersectContext* context;
+      RayQueryContext* context;
       const unsigned int geomID;
       const unsigned int primID;
 
       __forceinline OccludedKEpilogMU(vbool<K>& valid0,
                                       RayK<K>& ray,
-                                      IntersectContext* context,
+                                      RayQueryContext* context,
                                       const unsigned int geomID,
                                       const unsigned int primID)
         : valid0(valid0), ray(ray), context(context), geomID(geomID), primID(primID) {}
@@ -709,12 +727,12 @@ namespace embree
     {
       RayHitK<K>& ray;
       size_t k;
-      IntersectContext* context;
+      RayQueryContext* context;
       const vuint<M>& geomIDs;
       const vuint<M>& primIDs;
 
       __forceinline Intersect1KEpilogM(RayHitK<K>& ray, size_t k,
-                                       IntersectContext* context,
+                                       RayQueryContext* context,
                                        const vuint<M>& geomIDs,
                                        const vuint<M>& primIDs)
         : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
@@ -783,6 +801,9 @@ namespace embree
         ray.primID[k] = primIDs[i];
         ray.geomID[k] = geomID;
         instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instance_id_stack::copy_UV<K>(context->user->instPrimID, ray.instPrimID, k);
+#endif
         return true;
       }
     };
@@ -792,12 +813,12 @@ namespace embree
     {
       RayK<K>& ray;
       size_t k;
-      IntersectContext* context;
+      RayQueryContext* context;
       const vuint<M>& geomIDs;
       const vuint<M>& primIDs;
 
       __forceinline Occluded1KEpilogM(RayK<K>& ray, size_t k,
-                                      IntersectContext* context,
+                                      RayQueryContext* context,
                                       const vuint<M>& geomIDs,
                                       const vuint<M>& primIDs)
         : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
@@ -860,12 +881,12 @@ namespace embree
     {
       RayHitK<K>& ray;
       size_t k;
-      IntersectContext* context;
+      RayQueryContext* context;
       const unsigned int geomID;
       const unsigned int primID;
 
       __forceinline Intersect1KEpilogMU(RayHitK<K>& ray, size_t k,
-                                        IntersectContext* context,
+                                        RayQueryContext* context,
                                         const unsigned int geomID,
                                         const unsigned int primID)
         : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
@@ -923,6 +944,9 @@ namespace embree
         ray.primID[k] = primID;
         ray.geomID[k] = geomID;
         instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k);
+#if defined(RTC_GEOMETRY_INSTANCE_ARRAY)
+        instance_id_stack::copy_UV<K>(context->user->instPrimID, ray.instPrimID, k);
+#endif
         return true;
       }
     };
@@ -932,12 +956,12 @@ namespace embree
     {
       RayK<K>& ray;
       size_t k;
-      IntersectContext* context;
+      RayQueryContext* context;
       const unsigned int geomID;
       const unsigned int primID;
 
       __forceinline Occluded1KEpilogMU(RayK<K>& ray, size_t k,
-                                       IntersectContext* context,
+                                       RayQueryContext* context,
                                        const unsigned int geomID,
                                        const unsigned int primID)
         : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
diff --git a/thirdparty/embree/kernels/geometry/intersector_epilog_sycl.h b/thirdparty/embree/kernels/geometry/intersector_epilog_sycl.h
new file mode 100644
index 0000000000..9565006738
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/intersector_epilog_sycl.h
@@ -0,0 +1,207 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/context.h"
+#include "filter_sycl.h"
+
+namespace embree
+{
+  template<typename Ray>
+  struct Intersect1Epilog1_HWIF;
+
+  template<>
+  struct Intersect1Epilog1_HWIF<RayHit>
+  {
+    RayHit& ray;
+    sycl::private_ptr<RayQueryContext> context;
+    const unsigned int geomID;
+    const unsigned int primID;
+    const bool filter;
+    
+    __forceinline Intersect1Epilog1_HWIF(RayHit& ray,
+                                         sycl::private_ptr<RayQueryContext> context,
+                                         const unsigned int geomID,
+                                         const unsigned int primID,
+                                         const bool filter)
+      : ray(ray), context(context), geomID(geomID), primID(primID), filter(filter) {}
+    
+    template<typename Hit_i>
+    __forceinline bool operator() (Hit_i& hit_i) const
+    {
+      hit_i.finalize();
+
+      Scene* scene MAYBE_UNUSED = context->scene;
+      Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+      /* ray mask test */
+#if defined(EMBREE_RAY_MASK)
+      if ((geometry->mask & ray.mask) == 0)
+        return false;
+#endif
+
+      /* call intersection filter function */
+#if defined(EMBREE_FILTER_FUNCTION) 
+      if (filter && (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())))
+      {
+        Hit h(context->user,geomID,primID,Vec2f(hit_i.u,hit_i.v),hit_i.Ng);
+        float old_t = ray.tfar;
+        ray.tfar = hit_i.t;
+        bool found = runIntersectionFilter1SYCL(geometry,ray,context,h);
+        if (!found) {
+          ray.tfar = old_t;
+          return false;
+        }
+      }
+#endif
+      
+      ray.tfar = hit_i.t;
+      ray.u = hit_i.u;
+      ray.v = hit_i.v;
+      ray.Ng.x = hit_i.Ng.x;
+      ray.Ng.y = hit_i.Ng.y;
+      ray.Ng.z = hit_i.Ng.z;
+      ray.geomID = geomID;
+      ray.primID = primID;
+      instance_id_stack::copy_UU(context->user, context->user->instID, ray.instID);
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+      instance_id_stack::copy_UU(context->user, context->user->instPrimID, ray.instPrimID);
+#endif
+      return true;
+    }
+    
+    template<typename Hit_i>
+    __forceinline bool operator() (bool, Hit_i& hit_i) const
+    {
+      hit_i.finalize();
+
+      Scene* scene MAYBE_UNUSED = context->scene;
+      Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+      /* ray mask test */
+#if defined(EMBREE_RAY_MASK)
+      if ((geometry->mask & ray.mask) == 0)
+        return false;
+#endif
+
+      const Vec3fa Ng = hit_i.Ng();
+      const Vec2f uv = hit_i.uv();
+      
+      /* call intersection filter function */
+#if defined(EMBREE_FILTER_FUNCTION) 
+      if (filter && (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())))
+      {
+        Hit h(context->user,geomID,primID,uv,Ng);
+        float old_t = ray.tfar;
+        ray.tfar = hit_i.t();
+        bool found = runIntersectionFilter1SYCL(geometry,ray,context,h);
+        if (!found) {
+          ray.tfar = old_t;
+          return false;
+        }
+      }
+#endif
+      
+      ray.tfar = hit_i.t();
+      ray.u = uv.x;
+      ray.v = uv.y;
+      ray.Ng.x = Ng.x;
+      ray.Ng.y = Ng.y;
+      ray.Ng.z = Ng.z;
+      ray.geomID = geomID;
+      ray.primID = primID;
+      instance_id_stack::copy_UU(context->user, context->user->instID, ray.instID);
+#if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+      instance_id_stack::copy_UU(context->user, context->user->instPrimID, ray.instPrimID);
+#endif
+      return true;
+    }
+  };
+
+  template<>
+  struct Intersect1Epilog1_HWIF<Ray>
+  {
+    Ray& ray;
+    sycl::private_ptr<RayQueryContext> context;
+    const unsigned int geomID;
+    const unsigned int primID;
+    const bool filter;
+    
+    __forceinline Intersect1Epilog1_HWIF(Ray& ray,
+                                         sycl::private_ptr<RayQueryContext> context,
+                                         const unsigned int geomID,
+                                         const unsigned int primID,
+                                         const bool filter)
+      : ray(ray), context(context), geomID(geomID), primID(primID), filter(filter) {}
+    
+    template<typename Hit_i>
+    __forceinline bool operator() (Hit_i& hit_i) const
+    {
+      hit_i.finalize();
+
+      Scene* scene MAYBE_UNUSED = context->scene;
+      Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+      /* ray mask test */
+#if defined(EMBREE_RAY_MASK)
+      if ((geometry->mask & ray.mask) == 0)
+        return false;
+#endif
+
+      /* call intersection filter function */
+#if defined(EMBREE_FILTER_FUNCTION) 
+      if (filter && (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())))
+      {
+        Hit h(context->user,geomID,primID,Vec2f(hit_i.u,hit_i.v),hit_i.Ng);
+        float old_t = ray.tfar;
+        ray.tfar = hit_i.t;
+        bool found = runIntersectionFilter1SYCL(geometry,ray,context,h);
+        if (!found) {
+          ray.tfar = old_t;
+          return false;
+        }
+      }
+#endif
+      
+      ray.tfar = neg_inf;
+      return true;
+    }
+    
+    template<typename Hit_i>
+    __forceinline bool operator() (bool, Hit_i& hit_i) const
+    {
+      hit_i.finalize();
+
+      Scene* scene MAYBE_UNUSED = context->scene;
+      Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+      /* ray mask test */
+#if defined(EMBREE_RAY_MASK)
+      if ((geometry->mask & ray.mask) == 0)
+        return false;
+#endif
+
+      /* call intersection filter function */
+#if defined(EMBREE_FILTER_FUNCTION) 
+      if (filter && (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())))
+      {
+        const Vec3fa Ng = hit_i.Ng();
+        const Vec2f uv = hit_i.uv();
+        Hit h(context->user,geomID,primID,uv,Ng);
+        float old_t = ray.tfar;
+        ray.tfar = hit_i.t();
+        bool found = runIntersectionFilter1SYCL(geometry,ray,context,h);
+        if (!found) {
+          ray.tfar = old_t;
+          return false;
+        }
+      }
+#endif
+      
+      ray.tfar = neg_inf;
+      return true;
+    }
+  };
+}
diff --git a/thirdparty/embree/kernels/geometry/intersector_iterators.h b/thirdparty/embree/kernels/geometry/intersector_iterators.h
index 9cac1cd25c..5c6a6a7bb0 100644
--- a/thirdparty/embree/kernels/geometry/intersector_iterators.h
+++ b/thirdparty/embree/kernels/geometry/intersector_iterators.h
@@ -20,14 +20,15 @@ namespace embree
       typedef typename Intersector::Precalculations Precalculations;
 
       template<int N, bool robust>
-      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
-        for (size_t i=0; i<num; i++)
+        for (size_t i=0; i<num; i++) {
           Intersector::intersect(pre,ray,context,prim[i]);
+        }
       }
 
       template<int N, bool robust>
-      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         for (size_t i=0; i<num; i++) {
           if (Intersector::occluded(pre,ray,context,prim[i]))
@@ -35,7 +36,7 @@ namespace embree
         }
         return false;
       }
-      
+
       template<int N>
       static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
       {
@@ -46,12 +47,12 @@ namespace embree
       }
 
       template<int K>
-      static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
       {
       }
 
       template<int K>
-      static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
       {
         return valid;
       }
@@ -64,7 +65,7 @@ namespace embree
       typedef typename Intersector::Precalculations Precalculations;
 
       template<bool robust>
-      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
       {
         for (size_t i=0; i<num; i++) {
           Intersector::intersect(valid,pre,ray,context,prim[i]);
@@ -72,7 +73,7 @@ namespace embree
       }
 
       template<bool robust>
-      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
       {
         vbool<K> valid0 = valid;
         for (size_t i=0; i<num; i++) {
@@ -83,7 +84,7 @@ namespace embree
       }
 
       template<int N, bool robust>
-      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         for (size_t i=0; i<num; i++) {
           Intersector::intersect(pre,ray,k,context,prim[i]);
@@ -91,7 +92,7 @@ namespace embree
       }
 
       template<int N, bool robust>
-      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         for (size_t i=0; i<num; i++) {
           if (Intersector::occluded(pre,ray,k,context,prim[i]))
@@ -109,7 +110,7 @@ namespace embree
       typedef typename IntersectorK::Primitive PrimitiveK;
       typedef typename IntersectorK::Precalculations PrecalculationsK;
 
-      static __forceinline void intersectK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      static __forceinline void intersectK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayHitK<K>& ray, RayQueryContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
       {
         PrecalculationsK pre(valid,ray); // FIXME: might cause trouble
 
@@ -118,7 +119,7 @@ namespace embree
         }
       }
 
-      static __forceinline vbool<K> occludedK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayK<K>& ray, RayQueryContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
       {
         PrecalculationsK pre(valid,ray); // FIXME: might cause trouble
         vbool<K> valid0 = valid;
@@ -129,7 +130,7 @@ namespace embree
         return !valid0;
       }
 
-      static __forceinline void intersect(const Accel::Intersectors* This, RayHitK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      static __forceinline void intersect(const Accel::Intersectors* This, RayHitK<K>& ray, size_t k, RayQueryContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
       {
         PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
         for (size_t i=0; i<num; i++) {
@@ -137,7 +138,7 @@ namespace embree
         }
       }
 
-      static __forceinline bool occluded(const Accel::Intersectors* This, RayK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      static __forceinline bool occluded(const Accel::Intersectors* This, RayK<K>& ray, size_t k, RayQueryContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
       {
         PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
         for (size_t i=0; i<num; i++) {
@@ -147,7 +148,7 @@ namespace embree
         return false;
       }
 
-      static __forceinline size_t occluded(const Accel::Intersectors* This, size_t cur_mask, RayK<K>** __restrict__ inputPackets, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      static __forceinline size_t occluded(const Accel::Intersectors* This, size_t cur_mask, RayK<K>** __restrict__ inputPackets, RayQueryContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
       {
         size_t m_occluded = 0;
         for (size_t i=0; i<num; i++) {
diff --git a/thirdparty/embree/kernels/geometry/line_intersector.h b/thirdparty/embree/kernels/geometry/line_intersector.h
index 41096d8794..e3bad0a3d5 100644
--- a/thirdparty/embree/kernels/geometry/line_intersector.h
+++ b/thirdparty/embree/kernels/geometry/line_intersector.h
@@ -43,7 +43,7 @@ namespace embree
         template<typename Ray, typename Epilog>
         static __forceinline bool intersect(const vbool<M>& valid_i,
                                             Ray& ray,
-                                            IntersectContext* context,
+                                            RayQueryContext* context,
                                             const LineSegments* geom,
                                             const Precalculations& pre,
                                             const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
@@ -96,7 +96,7 @@ namespace embree
         template<typename Epilog>
         static __forceinline bool intersect(const vbool<M>& valid_i,
                                             RayK<K>& ray, size_t k,
-                                            IntersectContext* context,
+                                            RayQueryContext* context,
                                             const LineSegments* geom,
                                             const Precalculations& pre,
                                             const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
diff --git a/thirdparty/embree/kernels/geometry/linei.h b/thirdparty/embree/kernels/geometry/linei.h
index 3ee70ac012..3305025fc9 100644
--- a/thirdparty/embree/kernels/geometry/linei.h
+++ b/thirdparty/embree/kernels/geometry/linei.h
@@ -457,6 +457,9 @@ namespace embree
     p1 = lerp(a1,b1,vfloat4(ftime));
     pL = lerp(aL,bL,vfloat4(ftime));
     pR = lerp(aR,bR,vfloat4(ftime));
+
+    pL = select(vboolf4(leftExists), pL, Vec4vf4(inf));
+    pR = select(vboolf4(rightExists), pR, Vec4vf4(inf));
   }
 
 #if defined(__AVX__)
@@ -647,6 +650,9 @@ namespace embree
     p1 = lerp(a1,b1,vfloat8(ftime));
     pL = lerp(aL,bL,vfloat8(ftime));
     pR = lerp(aR,bR,vfloat8(ftime));
+    
+    pL = select(vboolf4(leftExists), pL, Vec4vf8(inf));
+    pR = select(vboolf4(rightExists), pR, Vec4vf8(inf));
   }
 
   template<>
diff --git a/thirdparty/embree/kernels/geometry/linei_intersector.h b/thirdparty/embree/kernels/geometry/linei_intersector.h
index 5992827f5b..d754cc1269 100644
--- a/thirdparty/embree/kernels/geometry/linei_intersector.h
+++ b/thirdparty/embree/kernels/geometry/linei_intersector.h
@@ -17,7 +17,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculations1 Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -26,7 +26,7 @@ namespace embree
         FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -47,7 +47,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculations1 Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -56,7 +56,7 @@ namespace embree
         FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -77,7 +77,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculationsK<K> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -86,7 +86,7 @@ namespace embree
         FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -102,7 +102,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculationsK<K> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context,  const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -111,7 +111,7 @@ namespace embree
         FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
diff --git a/thirdparty/embree/kernels/geometry/object.h b/thirdparty/embree/kernels/geometry/object.h
index 2a61829ffd..9525b97880 100644
--- a/thirdparty/embree/kernels/geometry/object.h
+++ b/thirdparty/embree/kernels/geometry/object.h
@@ -62,6 +62,17 @@ namespace embree
     }
 
     /*! fill triangle from triangle list */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t i, Scene* scene, const BBox1f time_range)
+    {
+      const PrimRefMB& prim = prims[i]; i++;
+      const unsigned geomID = prim.geomID();
+      const unsigned primID = prim.primID();
+      new (this) Object(geomID, primID);
+      AccelSet* accel = (AccelSet*) scene->get(geomID);
+      return accel->linearBounds(primID,time_range);
+    }
+
+    /*! fill object from PrimRefMB list */
     __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range)
     {
       const PrimRefMB& prim = prims[i]; i++;
diff --git a/thirdparty/embree/kernels/geometry/object_intersector.h b/thirdparty/embree/kernels/geometry/object_intersector.h
index e4ad01852f..b89bc1837d 100644
--- a/thirdparty/embree/kernels/geometry/object_intersector.h
+++ b/thirdparty/embree/kernels/geometry/object_intersector.h
@@ -22,7 +22,7 @@ namespace embree
         __forceinline Precalculations (const Ray& ray, const void *ptr) {}
       };
       
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) 
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim) 
       {
         AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
 
@@ -35,7 +35,7 @@ namespace embree
         accel->intersect(ray,prim.geomID(),prim.primID(),context);
       }
       
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
       {
         AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
         /* perform ray mask test */
@@ -47,6 +47,40 @@ namespace embree
         accel->occluded(ray,prim.geomID(),prim.primID(),context);
         return ray.tfar < 0.0f;
       }
+
+      static __forceinline bool intersect(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim) {
+        return occluded(pre,ray,context,prim);
+      }
+
+      static __forceinline void intersect(unsigned int k, const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& prim) 
+      {
+        AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+
+        /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+        if ((ray.mask & accel->mask) == 0) 
+          return;
+#endif
+
+        accel->intersect(k,ray,prim.geomID(),prim.primID(),context);
+      }
+      
+      static __forceinline bool occluded(unsigned int k, const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim)
+      {
+        AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+        /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+        if ((ray.mask & accel->mask) == 0) 
+          return false;
+#endif
+
+        accel->occluded(k, ray,prim.geomID(),prim.primID(),context);
+        return ray.tfar < 0.0f;
+      }
+
+      static __forceinline bool intersect(unsigned int k, const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& prim) {
+        return occluded(k,pre,ray,context,prim);
+      }
       
       static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim)
       {
@@ -57,13 +91,13 @@ namespace embree
       }
       
       template<int K>
-      static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
       {
         assert(false);
       }
 
       template<int K>
-      static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
       {
         assert(false);
         return valid;
@@ -79,7 +113,7 @@ namespace embree
         __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
       };
       
-      static __forceinline void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim)
+      static __forceinline void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive& prim)
       {
         vbool<K> valid = valid_i;
         AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
@@ -92,7 +126,7 @@ namespace embree
         accel->intersect(valid,ray,prim.geomID(),prim.primID(),context);
       }
 
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive& prim)
       {
         vbool<K> valid = valid_i;
         AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
@@ -106,11 +140,11 @@ namespace embree
         return ray.tfar < 0.0f;
       }
       
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& prim) {
         intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
       }
       
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& prim) {
         occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
         return ray.tfar[k] < 0.0f; 
       }
diff --git a/thirdparty/embree/kernels/geometry/pointi.h b/thirdparty/embree/kernels/geometry/pointi.h
index bed04116b0..f81edb9035 100644
--- a/thirdparty/embree/kernels/geometry/pointi.h
+++ b/thirdparty/embree/kernels/geometry/pointi.h
@@ -267,10 +267,10 @@ namespace embree
     const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
     const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
     transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);
-    const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime));
-    const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime));
-    const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime));
-    const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime));
+    const vfloat4 b0 = vfloat4(geom->normal((size_t)primID(0), (size_t)itime));
+    const vfloat4 b1 = vfloat4(geom->normal((size_t)primID(1), (size_t)itime));
+    const vfloat4 b2 = vfloat4(geom->normal((size_t)primID(2), (size_t)itime));
+    const vfloat4 b3 = vfloat4(geom->normal((size_t)primID(3), (size_t)itime));
     transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z);
   }
 
@@ -364,14 +364,14 @@ namespace embree
     const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime));
     const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime));
     transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);
-    const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime));
-    const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime));
-    const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime));
-    const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime));
-    const vfloat4 b4 = vfloat4(geom->normal(primID(4), itime));
-    const vfloat4 b5 = vfloat4(geom->normal(primID(5), itime));
-    const vfloat4 b6 = vfloat4(geom->normal(primID(6), itime));
-    const vfloat4 b7 = vfloat4(geom->normal(primID(7), itime));
+    const vfloat4 b0 = vfloat4(geom->normal((size_t)primID(0), (size_t)itime));
+    const vfloat4 b1 = vfloat4(geom->normal((size_t)primID(1), (size_t)itime));
+    const vfloat4 b2 = vfloat4(geom->normal((size_t)primID(2), (size_t)itime));
+    const vfloat4 b3 = vfloat4(geom->normal((size_t)primID(3), (size_t)itime));
+    const vfloat4 b4 = vfloat4(geom->normal((size_t)primID(4), (size_t)itime));
+    const vfloat4 b5 = vfloat4(geom->normal((size_t)primID(5), (size_t)itime));
+    const vfloat4 b6 = vfloat4(geom->normal((size_t)primID(6), (size_t)itime));
+    const vfloat4 b7 = vfloat4(geom->normal((size_t)primID(7), (size_t)itime));
     transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z);
   }
 
diff --git a/thirdparty/embree/kernels/geometry/primitive.h b/thirdparty/embree/kernels/geometry/primitive.h
index 608d981dd7..de25ab87ef 100644
--- a/thirdparty/embree/kernels/geometry/primitive.h
+++ b/thirdparty/embree/kernels/geometry/primitive.h
@@ -6,8 +6,8 @@
 #include "../common/default.h"
 #include "../common/scene.h"
 #include "../../common/simd/simd.h"
-#include "../common/primref.h"
-#include "../common/primref_mb.h"
+#include "../builders/primref.h"
+#include "../builders/primref_mb.h"
 
 namespace embree
 {
diff --git a/thirdparty/embree/kernels/geometry/primitive4.cpp b/thirdparty/embree/kernels/geometry/primitive4.cpp
index 9c953c5d35..924e78bbf1 100644
--- a/thirdparty/embree/kernels/geometry/primitive4.cpp
+++ b/thirdparty/embree/kernels/geometry/primitive4.cpp
@@ -15,6 +15,7 @@
 #include "subdivpatch1.h"
 #include "object.h"
 #include "instance.h"
+#include "instance_array.h"
 #include "subgrid.h"
 
 namespace embree
@@ -335,6 +336,26 @@ namespace embree
 
   InstancePrimitive::Type InstancePrimitive::type;
 
+  /********************** InstanceArray4 **************************/
+
+  const char* InstanceArrayPrimitive::Type::name () const {
+    return "instance_array";
+  }
+
+  size_t InstanceArrayPrimitive::Type::sizeActive(const char* This) const {
+    return 1;
+  }
+
+  size_t InstanceArrayPrimitive::Type::sizeTotal(const char* This) const {
+    return 1;
+  }
+
+  size_t InstanceArrayPrimitive::Type::getBytes(const char* This) const {
+    return sizeof(InstanceArrayPrimitive);
+  }
+
+  InstanceArrayPrimitive::Type InstanceArrayPrimitive::type;
+
   /********************** SubGrid **************************/
 
   const char* SubGrid::Type::name () const {
diff --git a/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h b/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h
index 3abc9d6f70..2c574e4e47 100644
--- a/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h
+++ b/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h
@@ -122,7 +122,7 @@ namespace embree
 
       __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
 
-      __forceinline void intersect(RayHit& ray, IntersectContext* context,
+      __forceinline void intersect(RayHit& ray, RayQueryContext* context,
                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                    const vuint<M>& geomID, const vuint<M>& primID) const
       {
@@ -144,7 +144,7 @@ namespace embree
         }
       }
       
-      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+      __forceinline bool occluded(Ray& ray, RayQueryContext* context,
                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                   const vuint<M>& geomID, const vuint<M>& primID) const
       {
@@ -215,14 +215,14 @@ namespace embree
         return false;
       }
       
-      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+      __forceinline bool intersect(RayHit& ray, RayQueryContext* context,
                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                    const vuint4& geomID, const vuint4& primID) const
       {
         return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
       }
       
-      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+      __forceinline bool occluded(Ray& ray, RayQueryContext* context,
                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                   const vuint4& geomID, const vuint4& primID) const
       {
@@ -394,7 +394,7 @@ namespace embree
       __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
         : QuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {}
 
-      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, RayQueryContext* context,
                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                     const vuint<M>& geomID, const vuint<M>& primID) const
       {
@@ -403,7 +403,7 @@ namespace embree
         MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog);
       }
       
-      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, RayQueryContext* context,
                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                    const vuint<M>& geomID, const vuint<M>& primID) const
       {
@@ -440,14 +440,14 @@ namespace embree
         return MoellerTrumboreIntersector1KTriangleM::intersect1<8,K>(ray,k,vtx0,vtx1,vtx2,flags,epilog); 
       }
       
-      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, RayQueryContext* context,
                                     const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                     const vuint4& geomID, const vuint4& primID) const
       {
         return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
       }
       
-      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, RayQueryContext* context,
                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                    const vuint4& geomID, const vuint4& primID) const
       {
diff --git a/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h
index 9873ff76ac..69d6881080 100644
--- a/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h
+++ b/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h
@@ -171,7 +171,7 @@ namespace embree
 
       __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
 
-      __forceinline void intersect(RayHit& ray, IntersectContext* context,
+      __forceinline void intersect(RayHit& ray, RayQueryContext* context,
                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                    const vuint<M>& geomID, const vuint<M>& primID) const
       {
@@ -180,7 +180,7 @@ namespace embree
         PlueckerIntersectorTriangle1::intersect<M>(ray,v2,v3,v1,vbool<M>(true),epilog);
       }
       
-      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+      __forceinline bool occluded(Ray& ray, RayQueryContext* context,
                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                   const vuint<M>& geomID, const vuint<M>& primID) const
       {
@@ -216,13 +216,13 @@ namespace embree
         return PlueckerIntersectorTriangle1::intersect<8>(ray,vtx0,vtx1,vtx2,flags,epilog); 
       }
       
-      __forceinline bool intersect(RayHit& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+      __forceinline bool intersect(RayHit& ray, RayQueryContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                    const vuint4& geomID, const vuint4& primID) const
       {
         return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
       }
       
-      __forceinline bool occluded(Ray& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+      __forceinline bool occluded(Ray& ray, RayQueryContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
                                   const vuint4& geomID, const vuint4& primID) const
       {
         return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
@@ -374,7 +374,7 @@ namespace embree
       __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
         : QuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {}
 
-      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, RayQueryContext* context,
                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                     const vuint<M>& geomID, const vuint<M>& primID) const
       {
@@ -383,7 +383,7 @@ namespace embree
         PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog);
       }
       
-      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, RayQueryContext* context,
                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                    const vuint<M>& geomID, const vuint<M>& primID) const
       {
@@ -418,14 +418,14 @@ namespace embree
         return PlueckerIntersector1KTriangleM::intersect1<8,K>(ray,k,vtx0,vtx1,vtx2,flags,epilog); 
       }
       
-      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, RayQueryContext* context,
                                     const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                     const vuint4& geomID, const vuint4& primID) const
       {
         return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
       }
       
-      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, RayQueryContext* context,
                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                    const vuint4& geomID, const vuint4& primID) const
       {
diff --git a/thirdparty/embree/kernels/geometry/quadi_intersector.h b/thirdparty/embree/kernels/geometry/quadi_intersector.h
index 20a98c3406..96fc77b42d 100644
--- a/thirdparty/embree/kernels/geometry/quadi_intersector.h
+++ b/thirdparty/embree/kernels/geometry/quadi_intersector.h
@@ -19,7 +19,7 @@ namespace embree
       typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
 
       /*! Intersect a ray with the M quads and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
@@ -27,7 +27,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of M quads. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
@@ -48,7 +48,7 @@ namespace embree
       typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const QuadMi<M>& quad)
       {
         Scene* scene = context->scene;
         for (size_t i=0; i<QuadMi<M>::max_size(); i++)
@@ -64,7 +64,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const QuadMi<M>& quad)
       {
         Scene* scene = context->scene;
         vbool<K> valid0 = valid_i;
@@ -83,7 +83,7 @@ namespace embree
       }
       
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const QuadMi<M>& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
@@ -91,7 +91,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const QuadMi<M>& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
@@ -107,7 +107,7 @@ namespace embree
       typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
 
       /*! Intersect a ray with the M quads and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
@@ -115,7 +115,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of M quads. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
@@ -136,7 +136,7 @@ namespace embree
       typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const QuadMi<M>& quad)
       {
         Scene* scene = context->scene;
         for (size_t i=0; i<QuadMi<M>::max_size(); i++)
@@ -152,7 +152,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const QuadMi<M>& quad)
       {
         Scene* scene = context->scene;
         vbool<K> valid0 = valid_i;
@@ -171,7 +171,7 @@ namespace embree
       }
       
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const QuadMi<M>& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
@@ -179,7 +179,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const QuadMi<M>& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
@@ -195,7 +195,7 @@ namespace embree
       typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
 
       /*! Intersect a ray with the M quads and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
@@ -203,7 +203,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of M quads. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
@@ -224,7 +224,7 @@ namespace embree
       typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
 
       /*! Intersects K rays with M quads. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const QuadMi<M>& quad)
       {
         for (size_t i=0; i<QuadMi<M>::max_size(); i++)
         {
@@ -236,7 +236,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M quads. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const QuadMi<M>& quad)
       {
         vbool<K> valid0 = valid_i;
         for (size_t i=0; i<QuadMi<M>::max_size(); i++)
@@ -251,7 +251,7 @@ namespace embree
       }
       
       /*! Intersect a ray with M quads and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const QuadMi<M>& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
@@ -259,7 +259,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of the M quads. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const QuadMi<M>& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
@@ -275,7 +275,7 @@ namespace embree
       typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
 
       /*! Intersect a ray with the M quads and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
@@ -283,7 +283,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of M quads. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
@@ -304,7 +304,7 @@ namespace embree
       typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
 
       /*! Intersects K rays with M quads. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const QuadMi<M>& quad)
       {
         for (size_t i=0; i<QuadMi<M>::max_size(); i++)
         {
@@ -316,7 +316,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M quads. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const QuadMi<M>& quad)
       {
         vbool<K> valid0 = valid_i;
         for (size_t i=0; i<QuadMi<M>::max_size(); i++)
@@ -331,7 +331,7 @@ namespace embree
       }
       
       /*! Intersect a ray with M quads and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const QuadMi<M>& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
@@ -339,7 +339,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of the M quads. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const QuadMi<M>& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
diff --git a/thirdparty/embree/kernels/geometry/quadv_intersector.h b/thirdparty/embree/kernels/geometry/quadv_intersector.h
index 9b28e05614..9eea8275f7 100644
--- a/thirdparty/embree/kernels/geometry/quadv_intersector.h
+++ b/thirdparty/embree/kernels/geometry/quadv_intersector.h
@@ -19,14 +19,14 @@ namespace embree
       typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
         
       /*! Intersect a ray with the M quads and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
       }
         
       /*! Test if the ray is occluded by one of M quads. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
@@ -46,7 +46,7 @@ namespace embree
       typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const QuadMv<M>& quad)
       {
         for (size_t i=0; i<QuadMv<M>::max_size(); i++)
         {
@@ -61,7 +61,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const QuadMv<M>& quad)
       {
         vbool<K> valid0 = valid_i;
 
@@ -80,14 +80,14 @@ namespace embree
       }
       
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const QuadMv<M>& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const QuadMv<M>& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
@@ -102,14 +102,14 @@ namespace embree
       typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
         
       /*! Intersect a ray with the M quads and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
       }
         
       /*! Test if the ray is occluded by one of M quads. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
@@ -129,7 +129,7 @@ namespace embree
       typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const QuadMv<M>& quad)
       {
         for (size_t i=0; i<QuadMv<M>::max_size(); i++)
         {
@@ -144,7 +144,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const QuadMv<M>& quad)
       {
         vbool<K> valid0 = valid_i;
 
@@ -163,14 +163,14 @@ namespace embree
       }
       
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const QuadMv<M>& quad)
       {
         STAT3(normal.trav_prims,1,1,1);
         pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const QuadMv<M>& quad)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
diff --git a/thirdparty/embree/kernels/geometry/roundline_intersector.h b/thirdparty/embree/kernels/geometry/roundline_intersector.h
index 764ff93fec..a83dd72a7f 100644
--- a/thirdparty/embree/kernels/geometry/roundline_intersector.h
+++ b/thirdparty/embree/kernels/geometry/roundline_intersector.h
@@ -661,7 +661,7 @@ namespace embree
         template<typename Ray, typename Epilog>
         static __forceinline bool intersect(const vbool<M>& valid_i,
                                             Ray& ray,
-                                            IntersectContext* context,
+                                            RayQueryContext* context,
                                             const LineSegments* geom,
                                             const Precalculations& pre,
                                             const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
@@ -694,7 +694,7 @@ namespace embree
         template<typename Epilog>
         static __forceinline bool intersect(const vbool<M>& valid_i,
                                             RayK<K>& ray, size_t k,
-                                            IntersectContext* context,
+                                            RayQueryContext* context,
                                             const LineSegments* geom,
                                             const Precalculations& pre,
                                             const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
diff --git a/thirdparty/embree/kernels/geometry/roundlinei_intersector.h b/thirdparty/embree/kernels/geometry/roundlinei_intersector.h
index 29061d6475..02a5d63a3d 100644
--- a/thirdparty/embree/kernels/geometry/roundlinei_intersector.h
+++ b/thirdparty/embree/kernels/geometry/roundlinei_intersector.h
@@ -16,7 +16,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculations1 Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -25,7 +25,7 @@ namespace embree
         RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -46,7 +46,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculations1 Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -55,7 +55,7 @@ namespace embree
         RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -76,7 +76,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculationsK<K> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -85,7 +85,7 @@ namespace embree
         RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -101,7 +101,7 @@ namespace embree
       typedef LineMi<M> Primitive;
       typedef CurvePrecalculationsK<K> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line)
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context,  const Primitive& line)
       {
         STAT3(normal.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
@@ -110,7 +110,7 @@ namespace embree
         RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& line)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
diff --git a/thirdparty/embree/kernels/geometry/sphere_intersector.h b/thirdparty/embree/kernels/geometry/sphere_intersector.h
index 2670f9762d..ba242a88b6 100644
--- a/thirdparty/embree/kernels/geometry/sphere_intersector.h
+++ b/thirdparty/embree/kernels/geometry/sphere_intersector.h
@@ -24,24 +24,60 @@ namespace embree
       __forceinline Vec2f uv(const size_t i) const {
         return Vec2f(0.0f, 0.0f);
       }
+      __forceinline Vec2vf<M> uv() const {
+        return Vec2vf<M>(0.0f, 0.0f);
+      }
       __forceinline float t(const size_t i) const {
         return vt[i];
       }
+      __forceinline vfloat<M> t() const {
+        return vt;
+      }
       __forceinline Vec3fa Ng(const size_t i) const {
         return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]);
       }
+      __forceinline Vec3vf<M> Ng() const { 
+        return vNg;
+      }
 
      public:
       vfloat<M> vt;
       Vec3vf<M> vNg;
     };
 
+    template<>
+    struct SphereIntersectorHitM<1>
+    {
+      __forceinline SphereIntersectorHitM() {}
+
+      __forceinline SphereIntersectorHitM(const float& t, const Vec3f& Ng)
+        : vt(t), vNg(Ng) {}
+
+      __forceinline void finalize() {}
+
+      __forceinline Vec2f uv() const {
+        return Vec2f(0.0f, 0.0f);
+      }
+
+      __forceinline float t() const {
+        return vt;
+      }
+
+      __forceinline Vec3f Ng() const { 
+        return vNg;
+      }
+
+     public:
+      float vt;
+      Vec3f vNg;
+    };
+
     template<int M>
     struct SphereIntersector1
     {
       typedef CurvePrecalculations1 Precalculations;
 
-      template<typename Epilog>
+      template<typename Ray, typename Epilog>
       static __forceinline bool intersect(
           const vbool<M>& valid_i, Ray& ray,
           const Precalculations& pre, const Vec4vf<M>& v0, const Epilog& epilog)
@@ -68,10 +104,15 @@ namespace embree
         const vfloat<M> t_back  = projC0 + td;
 
         const vbool<M> valid_front = valid & (ray.tnear() <= t_front) & (t_front <= ray.tfar);
-        const vbool<M> valid_back  = valid & (ray.tnear() <= t_back ) & (t_back  <= ray.tfar);
 
         /* check if there is a first hit */
+#if defined (EMBREE_BACKFACE_CULLING_SPHERES)
+        /* with backface culling only the front hit can be the first hit */
+        const vbool<M> valid_first = valid_front;
+#else
+        const vbool<M> valid_back  = valid & (ray.tnear() <= t_back ) & (t_back  <= ray.tfar);
         const vbool<M> valid_first = valid_front | valid_back;
+#endif
         if (unlikely(none(valid_first)))
           return false;
 
@@ -84,7 +125,10 @@ namespace embree
 
         /* invoke intersection filter for first hit */
         const bool is_hit_first = epilog(valid_first, hit);
-                
+
+#if defined (EMBREE_BACKFACE_CULLING_SPHERES)
+        return is_hit_first;
+#else
         /* check for possible second hits before potentially accepted hit */
         const vfloat<M> t_second = t_back;
         const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar);
@@ -95,13 +139,13 @@ namespace embree
         const Vec3vf<M> Ng_second = td_back * ray_dir - perp;
         hit = SphereIntersectorHitM<M> (t_second, Ng_second);
         const bool is_hit_second = epilog(valid_second, hit);
-        
         return is_hit_first | is_hit_second;
+#endif
       }
 
       template<typename Epilog>
       static __forceinline bool intersect(
-        const vbool<M>& valid_i, Ray& ray, IntersectContext* context, const Points* geom,
+        const vbool<M>& valid_i, Ray& ray, RayQueryContext* context, const Points* geom,
         const Precalculations& pre, const Vec4vf<M>& v0i, const Epilog& epilog)
       {
         const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
@@ -118,7 +162,7 @@ namespace embree
       template<typename Epilog>
       static __forceinline bool intersect(const vbool<M>& valid_i,
                                           RayK<K>& ray, size_t k,
-                                          IntersectContext* context,
+                                          RayQueryContext* context,
                                           const Points* geom,
                                           const Precalculations& pre,
                                           const Vec4vf<M>& v0i,
@@ -148,10 +192,14 @@ namespace embree
         const vfloat<M> t_back  = projC0 + td;
 
         const vbool<M> valid_front = valid & (ray.tnear()[k] <= t_front) & (t_front <= ray.tfar[k]);
-        const vbool<M> valid_back  = valid & (ray.tnear()[k] <= t_back ) & (t_back  <= ray.tfar[k]);
 
         /* check if there is a first hit */
+#if defined (EMBREE_BACKFACE_CULLING_SPHERES)
+        const vbool<M> valid_first = valid_front;
+#else
+        const vbool<M> valid_back  = valid & (ray.tnear()[k] <= t_back ) & (t_back  <= ray.tfar[k]);
         const vbool<M> valid_first = valid_front | valid_back;
+#endif
         if (unlikely(none(valid_first)))
           return false;
 
@@ -164,7 +212,10 @@ namespace embree
 
         /* invoke intersection filter for first hit */
         const bool is_hit_first = epilog(valid_first, hit);
-                
+
+#if defined (EMBREE_BACKFACE_CULLING_SPHERES)
+        return is_hit_first;
+#else
         /* check for possible second hits before potentially accepted hit */
         const vfloat<M> t_second = t_back;
         const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar[k]);
@@ -177,6 +228,7 @@ namespace embree
         const bool is_hit_second = epilog(valid_second, hit);
         
         return is_hit_first | is_hit_second;
+#endif
       }
     };
   }  // namespace isa
diff --git a/thirdparty/embree/kernels/geometry/spherei_intersector.h b/thirdparty/embree/kernels/geometry/spherei_intersector.h
index 7a0b428117..7072ea81e8 100644
--- a/thirdparty/embree/kernels/geometry/spherei_intersector.h
+++ b/thirdparty/embree/kernels/geometry/spherei_intersector.h
@@ -19,7 +19,7 @@ namespace embree
 
       static __forceinline void intersect(const Precalculations& pre,
                                           RayHit& ray,
-                                          IntersectContext* context,
+                                          RayQueryContext* context,
                                           const Primitive& sphere)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
@@ -32,7 +32,7 @@ namespace embree
 
       static __forceinline bool occluded(const Precalculations& pre,
                                          Ray& ray,
-                                         IntersectContext* context,
+                                         RayQueryContext* context,
                                          const Primitive& sphere)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
@@ -59,7 +59,7 @@ namespace embree
 
       static __forceinline void intersect(const Precalculations& pre,
                                           RayHit& ray,
-                                          IntersectContext* context,
+                                          RayQueryContext* context,
                                           const Primitive& sphere)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
@@ -72,7 +72,7 @@ namespace embree
 
       static __forceinline bool occluded(const Precalculations& pre,
                                          Ray& ray,
-                                         IntersectContext* context,
+                                         RayQueryContext* context,
                                          const Primitive& sphere)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
@@ -98,7 +98,7 @@ namespace embree
       typedef CurvePrecalculationsK<K> Precalculations;
 
       static __forceinline void intersect(
-          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& sphere)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(sphere.geomID());
@@ -110,7 +110,7 @@ namespace embree
       }
 
       static __forceinline bool occluded(
-          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+          const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& sphere)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(sphere.geomID());
@@ -129,7 +129,7 @@ namespace embree
       typedef CurvePrecalculationsK<K> Precalculations;
 
       static __forceinline void intersect(
-          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& sphere)
       {
         STAT3(normal.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(sphere.geomID());
@@ -141,7 +141,7 @@ namespace embree
       }
 
       static __forceinline bool occluded(
-          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+          const Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& sphere)
       {
         STAT3(shadow.trav_prims, 1, 1, 1);
         const Points* geom = context->scene->get<Points>(sphere.geomID());
diff --git a/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h b/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h
index b4b15a1210..cf17127169 100644
--- a/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h
+++ b/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h
@@ -35,7 +35,7 @@ namespace embree
       typedef GridSOA Primitive;
       typedef SubdivPatch1Precalculations<GridSOAIntersector1::Precalculations> Precalculations;
 
-      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline bool processLazyNode(Precalculations& pre, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         lazy_node = prim->root(0);
         pre.grid = (Primitive*)prim;
@@ -44,27 +44,27 @@ namespace embree
 
       /*! Intersect a ray with the primitive. */
       template<int N, bool robust>
-        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) 
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) 
       {
         if (likely(ty == 0)) GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node);
         else                 processLazyNode(pre,context,prim,lazy_node);
       }
 
       template<int N, bool robust>
-      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, RayQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
         intersect(This,pre,ray,context,prim,ty,tray,lazy_node);
       }
       
       /*! Test if the ray is occluded by the primitive */
       template<int N, bool robust>
-      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         if (likely(ty == 0)) return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node);
         else                 return processLazyNode(pre,context,prim,lazy_node);
       }
 
       template<int N, bool robust>
-      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, RayQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
         return occluded(This,pre,ray,context,prim,ty,tray,lazy_node);
       }
       
@@ -88,7 +88,7 @@ namespace embree
       typedef SubdivPatch1 Primitive;
       typedef GridSOAMBIntersector1::Precalculations Precalculations;
       
-      static __forceinline bool processLazyNode(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node)
+      static __forceinline bool processLazyNode(Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim_i, size_t& lazy_node)
       {
         Primitive* prim = (Primitive*) prim_i;
         GridSOA* grid = nullptr;
@@ -101,27 +101,27 @@ namespace embree
 
       /*! Intersect a ray with the primitive. */
       template<int N, bool robust>
-      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) 
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) 
       {
         if (likely(ty == 0)) GridSOAMBIntersector1::intersect(pre,ray,context,prim,lazy_node);
         else                 processLazyNode(pre,ray,context,prim,lazy_node);
       }
 
       template<int N, bool robust>
-      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, RayQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
         intersect(This,pre,ray,context,prim,ty,tray,lazy_node);
       }
       
       /*! Test if the ray is occluded by the primitive */
       template<int N, bool robust>
-      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         if (likely(ty == 0)) return GridSOAMBIntersector1::occluded(pre,ray,context,prim,lazy_node);
         else                 return processLazyNode(pre,ray,context,prim,lazy_node);
       }
 
       template<int N, bool robust>
-      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, RayQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) {
         return occluded(This,pre,ray,context,prim,ty,tray,lazy_node);
       }
       
@@ -145,7 +145,7 @@ namespace embree
       typedef GridSOA Primitive;
       typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations;
       
-      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      static __forceinline bool processLazyNode(Precalculations& pre, RayQueryContext* context, const Primitive* prim, size_t& lazy_node)
       {
         lazy_node = prim->root(0);
         pre.grid = (Primitive*)prim;
@@ -153,28 +153,28 @@ namespace embree
       }
       
       template<bool robust>        
-      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
       {
         if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
         else                 processLazyNode(pre,context,prim,lazy_node);
       }
       
       template<bool robust>        
-      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
       {
         if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
         else                 return processLazyNode(pre,context,prim,lazy_node);
       }
       
       template<int N, bool robust>              
-        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
         else                 processLazyNode(pre,context,prim,lazy_node);
       }
       
       template<int N, bool robust>              
-      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
         else                 return processLazyNode(pre,context,prim,lazy_node);
@@ -192,7 +192,7 @@ namespace embree
       //typedef GridSOAMBIntersectorK<K>::Precalculations Precalculations;
       typedef SubdivPatch1PrecalculationsK<K,typename GridSOAMBIntersectorK<K>::Precalculations> Precalculations;
       
-      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node)
+      static __forceinline bool processLazyNode(Precalculations& pre, RayQueryContext* context, const Primitive* prim_i, size_t& lazy_node)
       {
         Primitive* prim = (Primitive*) prim_i;
         GridSOA* grid = (GridSOA*) prim->root_ref.get();
@@ -202,28 +202,28 @@ namespace embree
       }
 
       template<bool robust>
-      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
       {
         if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
         else                 processLazyNode(pre,context,prim,lazy_node);
       }
 
       template<bool robust>
-      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
       {
         if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
         else                 return processLazyNode(pre,context,prim,lazy_node);
       }
       
       template<int N, bool robust>      
-      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
         else                 processLazyNode(pre,context,prim,lazy_node);
       }
       
       template<int N, bool robust>      
-      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
         else                 return processLazyNode(pre,context,prim,lazy_node);
diff --git a/thirdparty/embree/kernels/geometry/subgrid.h b/thirdparty/embree/kernels/geometry/subgrid.h
index ce54421cab..f57e55db78 100644
--- a/thirdparty/embree/kernels/geometry/subgrid.h
+++ b/thirdparty/embree/kernels/geometry/subgrid.h
@@ -283,7 +283,7 @@ namespace embree
 
 
         friend embree_ostream operator<<(embree_ostream cout, const SubGrid& sg) {
-          return cout << "SubGrid " << " ( x " << sg.x() << ", y = " << sg.y() << ", geomID = " << sg.geomID() << ", primID = " << sg.primID() << " )";
+          return cout << "SubGrid " << " ( x = " << sg.x() << ", y = " << sg.y() << ", geomID = " << sg.geomID() << ", primID = " << sg.primID() << ", invalid3x3X() " << (int)sg.invalid3x3X() << ", invalid3x3Y() " << (int)sg.invalid3x3Y() << " )"; // balance the opening " ( "
         }
 
         __forceinline unsigned int geomID() const { return _geomID; }
@@ -305,9 +305,10 @@ namespace embree
         
         __forceinline SubGridID() {}
         __forceinline SubGridID(const unsigned int x, const unsigned int y, const unsigned int primID) :
-        x(x), y(y), primID(primID) {}        
+        x(x), y(y), primID(primID) {}
+        
       };
-
+      
       /* QuantizedBaseNode as large subgrid leaf */
       template<int N>
       struct SubGridQBVHN
@@ -403,6 +404,9 @@ namespace embree
       typedef SubGridQBVHN<8> SubGridQBVH8;
 
 
+      
+
+
       /* QuantizedBaseNode as large subgrid leaf */
       template<int N>
       struct SubGridMBQBVHN
@@ -513,5 +517,4 @@ namespace embree
         }
 
       };
-
 }
diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_intersector.h
index e241073812..561d3d43f2 100644
--- a/thirdparty/embree/kernels/geometry/subgrid_intersector.h
+++ b/thirdparty/embree/kernels/geometry/subgrid_intersector.h
@@ -23,7 +23,7 @@ namespace embree
       typedef SubGridQBVHN<N> Primitive;
       typedef SubGridQuadMIntersector1MoellerTrumbore<4,filter> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(normal.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -33,7 +33,7 @@ namespace embree
         pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -54,7 +54,7 @@ namespace embree
       }
 
       template<bool robust>
-        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
 
@@ -76,7 +76,7 @@ namespace embree
         }
       }
       template<bool robust>        
-        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
 
       {
         BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
@@ -126,7 +126,7 @@ namespace embree
       typedef SubGridQBVHN<N> Primitive;
       typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(normal.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -136,7 +136,7 @@ namespace embree
         pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -156,7 +156,7 @@ namespace embree
       }
 
       template<bool robust>
-        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
 
@@ -179,7 +179,7 @@ namespace embree
       }
 
       template<bool robust>        
-        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
 
@@ -231,7 +231,7 @@ namespace embree
       typedef SubGridQBVHN<N> Primitive;
       typedef SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> Precalculations;
 
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         Vec3fa vtx[16];
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -249,7 +249,7 @@ namespace embree
         }
       }
 
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         vbool<K> valid0 = valid_i;
         Vec3fa vtx[16];
@@ -270,7 +270,7 @@ namespace embree
         return !valid0;
       }
       
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(normal.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -280,7 +280,7 @@ namespace embree
         pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
       }
 
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -290,7 +290,7 @@ namespace embree
       }
 
         template<bool robust>
-          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
           for (size_t j=0;j<num;j++)
@@ -307,7 +307,7 @@ namespace embree
         }
 
         template<bool robust>        
-        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
           vbool<K> valid0 = valid;
@@ -327,7 +327,7 @@ namespace embree
         }
         
         template<bool robust>        
-          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
 
@@ -347,7 +347,7 @@ namespace embree
         }
         
         template<bool robust>
-        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
 
@@ -375,7 +375,7 @@ namespace embree
       typedef SubGridQBVHN<N> Primitive;
       typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations;
 
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         Vec3fa vtx[16];
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -393,7 +393,7 @@ namespace embree
         }
       }
 
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         vbool<K> valid0 = valid_i;
         Vec3fa vtx[16];
@@ -414,7 +414,7 @@ namespace embree
         return !valid0;
       }
       
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(normal.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -424,7 +424,7 @@ namespace embree
         pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
       }
 
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -434,7 +434,7 @@ namespace embree
       }
       
         template<bool robust>
-          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
           for (size_t j=0;j<num;j++)
@@ -451,7 +451,7 @@ namespace embree
         }
 
         template<bool robust>        
-        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
           vbool<K> valid0 = valid;
@@ -471,7 +471,7 @@ namespace embree
         }
         
         template<bool robust>        
-          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
 
@@ -491,7 +491,7 @@ namespace embree
         }
         
         template<bool robust>
-        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
 
diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h b/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h
index 64937d34fe..2666847333 100644
--- a/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h
+++ b/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h
@@ -39,7 +39,7 @@ namespace embree
 
         __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
 
-        __forceinline void intersect(RayHit& ray, IntersectContext* context,
+        __forceinline void intersect(RayHit& ray, RayQueryContext* context,
                                      const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                      const GridMesh::Grid &g, const SubGrid& subgrid) const
         {
@@ -65,7 +65,7 @@ namespace embree
           }
         }
       
-        __forceinline bool occluded(Ray& ray, IntersectContext* context,
+        __forceinline bool occluded(Ray& ray, RayQueryContext* context,
                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                     const GridMesh::Grid &g, const SubGrid& subgrid) const
         {
@@ -135,14 +135,14 @@ namespace embree
         return false;
       }
       
-      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+      __forceinline bool intersect(RayHit& ray, RayQueryContext* context,
                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                    const GridMesh::Grid &g, const SubGrid& subgrid) const
       {
           return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
       }
       
-      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+      __forceinline bool occluded(Ray& ray, RayQueryContext* context,
                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                   const GridMesh::Grid &g, const SubGrid& subgrid) const
       {
@@ -270,7 +270,7 @@ namespace embree
       __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
         : SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {}
 
-      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, RayQueryContext* context,
                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
       {
 	UVIdentity<M> mapUV;
@@ -294,7 +294,7 @@ namespace embree
           }
       }
       
-      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, RayQueryContext* context,
                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
       {
 	UVIdentity<M> mapUV;
@@ -361,13 +361,13 @@ namespace embree
         return false;
       }
       
-      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, RayQueryContext* context,
                                     const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
       {
         return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
       }
       
-      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, RayQueryContext* context,
                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
       {
         return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h
index 5ded56e1f7..4919f927ae 100644
--- a/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h
+++ b/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h
@@ -36,7 +36,7 @@ namespace embree
 
         __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
 
-        __forceinline void intersect(RayHit& ray, IntersectContext* context,
+        __forceinline void intersect(RayHit& ray, RayQueryContext* context,
                                      const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                      const GridMesh::Grid &g, const SubGrid& subgrid) const
         {
@@ -63,7 +63,7 @@ namespace embree
           }
         }
       
-        __forceinline bool occluded(Ray& ray, IntersectContext* context,
+        __forceinline bool occluded(Ray& ray, RayQueryContext* context,
                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
                                     const GridMesh::Grid &g, const SubGrid& subgrid) const
         {
@@ -134,14 +134,14 @@ namespace embree
         return false;
       }
       
-      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+      __forceinline bool intersect(RayHit& ray, RayQueryContext* context,
                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                    const GridMesh::Grid &g, const SubGrid& subgrid) const
       {
           return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
       }
       
-      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+      __forceinline bool occluded(Ray& ray, RayQueryContext* context,
                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
                                   const GridMesh::Grid &g, const SubGrid& subgrid) const
       {
@@ -255,7 +255,7 @@ namespace embree
       __forceinline SubGridQuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
         : SubGridQuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {}
 
-      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, RayQueryContext* context,
                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
       {
 	UVIdentity<M> mapUV;
@@ -280,7 +280,7 @@ namespace embree
           }
       }
       
-      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, RayQueryContext* context,
                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
       {
 	UVIdentity<M> mapUV;
@@ -348,13 +348,13 @@ namespace embree
         return false;
       }
       
-      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, RayQueryContext* context,
                                     const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
       {
         return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
       }
       
-      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, RayQueryContext* context,
                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
       {
         return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
diff --git a/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h
index 473d656e24..0986eea134 100644
--- a/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h
+++ b/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h
@@ -15,7 +15,7 @@ namespace embree
       typedef SubGridMBQBVHN<N> Primitive;
       typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(normal.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -27,7 +27,7 @@ namespace embree
         pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -46,7 +46,7 @@ namespace embree
       }
 
       template<bool robust>
-        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
         for (size_t i=0;i<num;i++)
@@ -69,7 +69,7 @@ namespace embree
       }
 
       template<bool robust>        
-        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
       {
         BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
         for (size_t i=0;i<num;i++)
@@ -102,7 +102,7 @@ namespace embree
       typedef SubGridMBQBVHN<N> Primitive;
       typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations;
 
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         size_t m_valid = movemask(valid_i);
         while(m_valid)
@@ -112,7 +112,7 @@ namespace embree
         }
       }
 
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const SubGrid& subgrid)
       {
         vbool<K> valid0 = valid_i;
         size_t m_valid = movemask(valid_i);
@@ -125,7 +125,7 @@ namespace embree
         return !valid0;
       }
       
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(normal.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -137,7 +137,7 @@ namespace embree
         pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
       }
 
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const SubGrid& subgrid)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
@@ -150,7 +150,7 @@ namespace embree
       }
 
         template<bool robust>
-          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
           for (size_t j=0;j<num;j++)
@@ -169,7 +169,7 @@ namespace embree
         }
 
         template<bool robust>        
-        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
 
@@ -191,7 +191,7 @@ namespace embree
         }
         
         template<bool robust>        
-          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
           for (size_t i=0;i<num;i++)
@@ -211,7 +211,7 @@ namespace embree
         }
         
         template<bool robust>
-        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node)
         {
           BVHNQuantizedBaseNodeIntersector1<N,robust> isec1;
           
diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector.h b/thirdparty/embree/kernels/geometry/triangle_intersector.h
index 2cdff78ec8..9d9ddc7cad 100644
--- a/thirdparty/embree/kernels/geometry/triangle_intersector.h
+++ b/thirdparty/embree/kernels/geometry/triangle_intersector.h
@@ -18,14 +18,14 @@ namespace embree
       typedef MoellerTrumboreIntersector1<M> Precalculations;
 
       /*! Intersect a ray with the M triangles and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleM<M>& tri)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const TriangleM<M>& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
       }
 
       /*! Test if the ray is occluded by one of M triangles. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleM<M>& tri)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const TriangleM<M>& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
@@ -46,7 +46,7 @@ namespace embree
       typedef MoellerTrumboreIntersectorK<M,K> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleM<M>& tri)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const TriangleM<M>& tri)
       {
         STAT_USER(0,TriangleM<M>::max_size());
         for (size_t i=0; i<TriangleM<M>::max_size(); i++)
@@ -61,7 +61,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleM<M>& tri)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const TriangleM<M>& tri)
       {
         vbool<K> valid0 = valid_i;
 
@@ -79,14 +79,14 @@ namespace embree
       }
       
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const TriangleM<M>& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const TriangleM<M>& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h b/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h
index 0a42d8f08b..87bf0cd069 100644
--- a/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h
+++ b/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h
@@ -77,18 +77,18 @@ namespace embree
       {
         /* calculate denominator */
         vbool<M> valid = valid0;
+
         const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
         const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
         const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
         const Vec3vf<M> R = cross(C,D);
         const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
-
         const vfloat<M> absDen = abs(den);
         const vfloat<M> sgnDen = signmsk(den);
-        
+
         /* perform edge tests */
-        const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
-        const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+        const vfloat<M> U = asFloat(asInt(dot(R,Vec3vf<M>(tri_e2))) ^ asInt(sgnDen));
+        const vfloat<M> V = asFloat(asInt(dot(R,Vec3vf<M>(tri_e1))) ^ asInt(sgnDen));
         
         /* perform backface culling */        
 #if defined(EMBREE_BACKFACE_CULLING)
@@ -99,14 +99,15 @@ namespace embree
         if (likely(early_out && none(valid))) return false;
 
         /* perform depth test */
-        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        const vfloat<M> T = asFloat(asInt(dot(Vec3vf<M>(tri_Ng),C)) ^ asInt(sgnDen));
+
         valid &= (absDen*vfloat<M>(ray.tnear()) < T) & (T <= absDen*vfloat<M>(ray.tfar));
         if (likely(early_out && none(valid))) return false;
            
         /* update hit information */
         new (&hit) MoellerTrumboreHitM<M,UVMapper>(valid,U,V,T,absDen,tri_Ng,mapUV);
 
-        return true;
+        return early_out || any(valid);
       }
 
       template<typename UVMapper>
@@ -320,8 +321,7 @@ namespace embree
         const Vec3vf<K> Ng = cross(e2,e1);
         return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,mapUV,hit);
       }
-      
-      
+
       /*! Intersects K rays with one of M triangles. */
       template<typename UVMapper, typename Epilog>
       __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h
index 8fbefcea88..e21f1fa4c8 100644
--- a/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h
+++ b/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h
@@ -78,7 +78,7 @@ namespace embree
         
         /* calculate vertices relative to ray origin */
         const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
-	const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
         const Vec3vf<M> v0 = tri_v0-O;
         const Vec3vf<M> v1 = tri_v1-O;
         const Vec3vf<M> v2 = tri_v2-O;
@@ -114,7 +114,7 @@ namespace embree
 
         /* update hit information */
         new (&hit) PlueckerHitM<M,UVMapper>(valid,U,V,UVW,t,Ng,mapUV);
-        return true;
+        return early_out || any(valid);
       }
 
       template<typename UVMapper>
diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h b/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h
index f05dcc4537..03d23adc3d 100644
--- a/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h
+++ b/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h
@@ -64,7 +64,7 @@ namespace embree
         kx = (kz+1) % 3;
         ky = (kx+1) % 3;
         const float inv_dir_kz = rcp(ray.dir[kz]);
-        if (ray.dir[kz]) std::swap(kx,ky);
+        if (ray.dir[kz] < 0.0f) std::swap(kx,ky);
         S.x = ray.dir[kx] * inv_dir_kz;
         S.y = ray.dir[ky] * inv_dir_kz;
         S.z = inv_dir_kz;
diff --git a/thirdparty/embree/kernels/geometry/trianglei_intersector.h b/thirdparty/embree/kernels/geometry/trianglei_intersector.h
index f7deb9e72d..fa7f3ae43d 100644
--- a/thirdparty/embree/kernels/geometry/trianglei_intersector.h
+++ b/thirdparty/embree/kernels/geometry/trianglei_intersector.h
@@ -18,14 +18,14 @@ namespace embree
       typedef TriangleMi<M> Primitive;
       typedef MoellerTrumboreIntersector1<M> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
         pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
@@ -45,7 +45,7 @@ namespace embree
       typedef TriangleMi<M> Primitive;
       typedef MoellerTrumboreIntersectorK<M,K> Precalculations;
 
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive& tri)
       {
         const Scene* scene = context->scene;
         for (size_t i=0; i<Primitive::max_size(); i++)
@@ -59,7 +59,7 @@ namespace embree
         }
       }
 
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive& tri)
       {
         vbool<K> valid0 = valid_i;
         const Scene* scene = context->scene;
@@ -77,14 +77,14 @@ namespace embree
         return !valid0;
       }
       
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
         pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
       }
 
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
@@ -99,14 +99,14 @@ namespace embree
       typedef TriangleMi<M> Primitive;
       typedef PlueckerIntersector1<M> Precalculations;
 
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
         pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
       }
 
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
@@ -126,7 +126,7 @@ namespace embree
       typedef TriangleMi<M> Primitive;
       typedef PlueckerIntersectorK<M,K> Precalculations;
 
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive& tri)
       {
         const Scene* scene = context->scene;
         for (size_t i=0; i<Primitive::max_size(); i++)
@@ -140,7 +140,7 @@ namespace embree
         }
       }
 
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive& tri)
       {
         vbool<K> valid0 = valid_i;
         const Scene* scene = context->scene;
@@ -158,14 +158,14 @@ namespace embree
         return !valid0;
       }
 
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
         pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
       }
 
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
@@ -181,7 +181,7 @@ namespace embree
       typedef MoellerTrumboreIntersector1<M> Precalculations;
 
       /*! Intersect a ray with the M triangles and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
@@ -189,7 +189,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of M triangles. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
@@ -210,7 +210,7 @@ namespace embree
       typedef MoellerTrumboreIntersectorK<M,K> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const TriangleMi<M>& tri)
       {
         for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
         {
@@ -222,7 +222,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const TriangleMi<M>& tri)
       {
         vbool<K> valid0 = valid_i;
         for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
@@ -237,7 +237,7 @@ namespace embree
       }
 
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const TriangleMi<M>& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
@@ -245,7 +245,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const TriangleMi<M>& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
@@ -261,7 +261,7 @@ namespace embree
       typedef PlueckerIntersector1<M> Precalculations;
 
       /*! Intersect a ray with the M triangles and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
@@ -269,7 +269,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of M triangles. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
@@ -290,7 +290,7 @@ namespace embree
       typedef PlueckerIntersectorK<M,K> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const TriangleMi<M>& tri)
       {
         for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
         {
@@ -302,7 +302,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const TriangleMi<M>& tri)
       {
         vbool<K> valid0 = valid_i;
         for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
@@ -317,7 +317,7 @@ namespace embree
       }
       
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const TriangleMi<M>& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
@@ -325,7 +325,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const TriangleMi<M>& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
diff --git a/thirdparty/embree/kernels/geometry/trianglev_intersector.h b/thirdparty/embree/kernels/geometry/trianglev_intersector.h
index 3abb7f8e32..0575bad388 100644
--- a/thirdparty/embree/kernels/geometry/trianglev_intersector.h
+++ b/thirdparty/embree/kernels/geometry/trianglev_intersector.h
@@ -20,14 +20,14 @@ namespace embree
       typedef MoellerTrumboreIntersector1<M> Precalculations;
 
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
@@ -48,14 +48,14 @@ namespace embree
       typedef WoopPrecalculations1<M> Precalculations;
 
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
@@ -76,7 +76,7 @@ namespace embree
       typedef MoellerTrumboreIntersectorK<M,K> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive& tri)
       {
         for (size_t i=0; i<M; i++)
         {
@@ -90,7 +90,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive& tri)
       {
         vbool<K> valid0 = valid_i;
 
@@ -108,14 +108,14 @@ namespace embree
       }
       
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M
@@ -130,14 +130,14 @@ namespace embree
       typedef PlueckerIntersector1<M> Precalculations;
 
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID()));
@@ -157,7 +157,7 @@ namespace embree
       typedef PlueckerIntersectorK<M,K> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const Primitive& tri)
       {
         for (size_t i=0; i<M; i++)
         {
@@ -171,7 +171,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const Primitive& tri)
       {
         vbool<K> valid0 = valid_i;
 
@@ -189,14 +189,14 @@ namespace embree
       }
       
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const Primitive& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
diff --git a/thirdparty/embree/kernels/geometry/trianglev_mb.h b/thirdparty/embree/kernels/geometry/trianglev_mb.h
index b550a29fd5..c15313f209 100644
--- a/thirdparty/embree/kernels/geometry/trianglev_mb.h
+++ b/thirdparty/embree/kernels/geometry/trianglev_mb.h
@@ -113,7 +113,6 @@ namespace embree
 
       BBox3fa bounds0 = empty;
       BBox3fa bounds1 = empty;
-      
       for (size_t i=0; i<M && begin<end; i++, begin++)
       {
 	const PrimRef& prim = prims[begin];
@@ -121,12 +120,12 @@ namespace embree
         const unsigned primID = prim.primID();
         const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
         const TriangleMesh::Triangle& tri = mesh->triangle(primID);
-        const Vec3fa& a0 = mesh->vertex(tri.v[0],itime+0); bounds0.extend(a0);
-        const Vec3fa& a1 = mesh->vertex(tri.v[0],itime+1); bounds1.extend(a1);
-        const Vec3fa& b0 = mesh->vertex(tri.v[1],itime+0); bounds0.extend(b0);
-        const Vec3fa& b1 = mesh->vertex(tri.v[1],itime+1); bounds1.extend(b1);
-        const Vec3fa& c0 = mesh->vertex(tri.v[2],itime+0); bounds0.extend(c0);
-        const Vec3fa& c1 = mesh->vertex(tri.v[2],itime+1); bounds1.extend(c1);
+        const Vec3fa& a0 = mesh->vertex(tri.v[0],size_t(itime+0)); bounds0.extend(a0);
+        const Vec3fa& a1 = mesh->vertex(tri.v[0],size_t(itime+1)); bounds1.extend(a1);
+        const Vec3fa& b0 = mesh->vertex(tri.v[1],size_t(itime+0)); bounds0.extend(b0);
+        const Vec3fa& b1 = mesh->vertex(tri.v[1],size_t(itime+1)); bounds1.extend(b1);
+        const Vec3fa& c0 = mesh->vertex(tri.v[2],size_t(itime+0)); bounds0.extend(c0);
+        const Vec3fa& c1 = mesh->vertex(tri.v[2],size_t(itime+1)); bounds1.extend(c1);
         vgeomID [i] = geomID;
         vprimID [i] = primID;
         va0.x[i] = a0.x; va0.y[i] = a0.y; va0.z[i] = a0.z;
@@ -159,12 +158,12 @@ namespace embree
         const int ilower = itime_range.begin();
         const TriangleMesh::Triangle& tri = mesh->triangle(primID);
         allBounds.extend(mesh->linearBounds(primID, time_range));
-        const Vec3fa& a0 = mesh->vertex(tri.v[0],ilower+0);
-        const Vec3fa& a1 = mesh->vertex(tri.v[0],ilower+1);
-        const Vec3fa& b0 = mesh->vertex(tri.v[1],ilower+0);
-        const Vec3fa& b1 = mesh->vertex(tri.v[1],ilower+1);
-        const Vec3fa& c0 = mesh->vertex(tri.v[2],ilower+0);
-        const Vec3fa& c1 = mesh->vertex(tri.v[2],ilower+1);
+        const Vec3fa& a0 = mesh->vertex(tri.v[0],size_t(ilower+0));
+        const Vec3fa& a1 = mesh->vertex(tri.v[0],size_t(ilower+1));
+        const Vec3fa& b0 = mesh->vertex(tri.v[1],size_t(ilower+0));
+        const Vec3fa& b1 = mesh->vertex(tri.v[1],size_t(ilower+1));
+        const Vec3fa& c0 = mesh->vertex(tri.v[2],size_t(ilower+0));
+        const Vec3fa& c1 = mesh->vertex(tri.v[2],size_t(ilower+1));
         const BBox1f time_range_v(mesh->timeStep(ilower+0),mesh->timeStep(ilower+1));
         auto a01 = globalLinear(std::make_pair(a0,a1),time_range_v);
         auto b01 = globalLinear(std::make_pair(b0,b1),time_range_v);
diff --git a/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h b/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h
index 38cd52e85d..c9042ba323 100644
--- a/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h
+++ b/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h
@@ -18,7 +18,7 @@ namespace embree
       typedef MoellerTrumboreIntersector1<M> Precalculations;
 
       /*! Intersect a ray with the M triangles and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         const Vec3vf<M> time(ray.time());
@@ -29,7 +29,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of M triangles. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const Vec3vf<M> time(ray.time());
@@ -53,7 +53,7 @@ namespace embree
       typedef MoellerTrumboreIntersectorK<M,K> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
         {
@@ -68,7 +68,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         vbool<K> valid0 = valid_i;
 
@@ -87,7 +87,7 @@ namespace embree
       }
       
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         const Vec3vf<M> time(ray.time()[k]);
@@ -98,7 +98,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const Vec3vf<M> time(ray.time()[k]);
@@ -117,7 +117,7 @@ namespace embree
       typedef PlueckerIntersector1<M> Precalculations;
 
       /*! Intersect a ray with the M triangles and updates the hit. */
-      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         const Vec3vf<M> time(ray.time());
@@ -128,7 +128,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of M triangles. */
-      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const Vec3vf<M> time(ray.time());
@@ -152,7 +152,7 @@ namespace embree
       typedef PlueckerIntersectorK<M,K> Precalculations;
 
       /*! Intersects K rays with M triangles. */
-      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
         {
@@ -167,7 +167,7 @@ namespace embree
       }
 
       /*! Test for K rays if they are occluded by any of the M triangles. */
-      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         vbool<K> valid0 = valid_i;
 
@@ -186,7 +186,7 @@ namespace embree
       }
 
       /*! Intersect a ray with M triangles and updates the hit. */
-      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         STAT3(normal.trav_prims,1,1,1);
         const Vec3vf<M> time(ray.time()[k]);
@@ -197,7 +197,7 @@ namespace embree
       }
 
       /*! Test if the ray is occluded by one of the M triangles. */
-      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, RayQueryContext* context, const TriangleMvMB<M>& tri)
       {
         STAT3(shadow.trav_prims,1,1,1);
         const Vec3vf<M> time(ray.time()[k]);
diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h
index 39d50e2354..ec9759ee35 100644
--- a/thirdparty/embree/kernels/hash.h
+++ b/thirdparty/embree/kernels/hash.h
@@ -1,4 +1,4 @@
 // Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#define RTC_HASH "698442324ccddd11725fb8875275dc1384f7fb40"
+#define RTC_HASH "daa8de0e714e18ad5e5c9841b67c1950d9c91c51"
diff --git a/thirdparty/embree/kernels/subdiv/bezier_curve.h b/thirdparty/embree/kernels/subdiv/bezier_curve.h
index a5adad5cc9..257e0afd40 100644
--- a/thirdparty/embree/kernels/subdiv/bezier_curve.h
+++ b/thirdparty/embree/kernels/subdiv/bezier_curve.h
@@ -185,9 +185,10 @@ namespace embree
       __forceinline CubicBezierCurve<float> xfm(const Vertex& dx) const {
         return CubicBezierCurve<float>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx));
       }
-      
-      __forceinline CubicBezierCurve<vfloatx> vxfm(const Vertex& dx) const {
-        return CubicBezierCurve<vfloatx>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx));
+
+      template<int W>
+      __forceinline CubicBezierCurve<vfloat<W>> vxfm(const Vertex& dx) const {
+        return CubicBezierCurve<vfloat<W>>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx));
       }
       
       __forceinline CubicBezierCurve<float> xfm(const Vertex& dx, const Vertex& p) const {
@@ -286,7 +287,7 @@ namespace embree
       {
         const float u0 = 0.0f, u1 = 1.0f;
         const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1)));
-        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1)));
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(StepTy())*(1.0f/(VSIZEX-1)));
         Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale);
         const Vec2vfx P3 = shift_right_1(P0);
         const Vec2vfx dP3du = shift_right_1(dP0du); 
@@ -299,7 +300,7 @@ namespace embree
       {
         const float u0 = u.lower, u1 = u.upper;
         const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1)));
-        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1)));
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(StepTy())*(1.0f/(VSIZEX-1)));
         Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale);
         const Vec2vfx P3 = shift_right_1(P0);
         const Vec2vfx dP3du = shift_right_1(dP0du); 
@@ -307,6 +308,33 @@ namespace embree
         const Vec2vfx P2 = P3 - dP3du;
         return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3);
       }
+
+      template<int W>
+      __forceinline CubicBezierCurve<Vec2vf<W>> split(const BBox1f& u, int i, int N) const
+      {
+        const float u0 = u.lower, u1 = u.upper;
+        const float dscale = (u1-u0)*(1.0f/(3.0f*N));
+        const vfloat<W> vu0 = lerp(u0,u1,(vfloat<W>(i)+vfloat<W>(StepTy()))*(1.0f/N));
+        Vec2vf<W> P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vf<W>(dscale);
+        const Vec2vf<W> P3 = shift_right_1(P0);
+        const Vec2vf<W> dP3du = shift_right_1(dP0du); 
+        const Vec2vf<W> P1 = P0 + dP0du; 
+        const Vec2vf<W> P2 = P3 - dP3du;
+        return CubicBezierCurve<Vec2vf<W>>(P0,P1,P2,P3);
+      }
+
+       __forceinline CubicBezierCurve<Vec2f> split1(const BBox1f& u, int i, int N) const
+      {
+        const float u0 = u.lower, u1 = u.upper;
+        const float dscale = (u1-u0)*(1.0f/(3.0f*N));
+        const float vu0 = lerp(u0,u1,(float(i)+0)*(1.0f/N));
+        const float vu1 = lerp(u0,u1,(float(i)+1)*(1.0f/N));
+        Vec2fa P0, dP0du; eval(vu0,P0,dP0du); dP0du = dP0du * Vec2fa(dscale);
+        Vec2fa P3, dP3du; eval(vu1,P3,dP3du); dP3du = dP3du * Vec2fa(dscale);
+        const Vec2fa P1 = P0 + dP0du; 
+        const Vec2fa P2 = P3 - dP3du;
+        return CubicBezierCurve<Vec2f>(P0,P1,P2,P3);
+      }
       
       __forceinline void eval(float t, Vertex& p, Vertex& dp) const
       {
@@ -461,20 +489,20 @@ namespace embree
         return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
       }
       
-      template<int M>
-      __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const
+      template<int M, typename Vec>
+      __forceinline void veval(const vfloat<M>& t, Vec& p, Vec& dp) const
       {
-        const Vec4vf<M> p00 = v0;
-        const Vec4vf<M> p01 = v1;
-        const Vec4vf<M> p02 = v2;
-        const Vec4vf<M> p03 = v3;
+        const Vec p00 = v0;
+        const Vec p01 = v1;
+        const Vec p02 = v2;
+        const Vec p03 = v3;
         
-        const Vec4vf<M> p10 = lerp(p00,p01,t);
-        const Vec4vf<M> p11 = lerp(p01,p02,t);
-        const Vec4vf<M> p12 = lerp(p02,p03,t);
-        const Vec4vf<M> p20 = lerp(p10,p11,t);
-        const Vec4vf<M> p21 = lerp(p11,p12,t);
-        const Vec4vf<M> p30 = lerp(p20,p21,t);
+        const Vec p10 = lerp(p00,p01,t);
+        const Vec p11 = lerp(p01,p02,t);
+        const Vec p12 = lerp(p02,p03,t);
+        const Vec p20 = lerp(p10,p11,t);
+        const Vec p21 = lerp(p11,p12,t);
+        const Vec p30 = lerp(p20,p21,t);
         
         p = p30;
         dp = vfloat<M>(3.0f)*(p21-p20);
@@ -485,10 +513,17 @@ namespace embree
       {
         assert(size <= PrecomputedBezierBasis::N);
         assert(ofs <= size);
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+        assert(size > 0);
+        const vfloat<M> t = (vfloat<M>(step) + vfloat<M>(ofs+0))*rcp(float(size));
+        Vec p,dp; veval<M>(t,p,dp);
+        return p;
+#else
         return madd(vfloat<M>::loadu(&bezier_basis0.c0[size][ofs]), Vec(v0),
                     madd(vfloat<M>::loadu(&bezier_basis0.c1[size][ofs]), Vec(v1),
                          madd(vfloat<M>::loadu(&bezier_basis0.c2[size][ofs]), Vec(v2),
                               vfloat<M>::loadu(&bezier_basis0.c3[size][ofs]) * Vec(v3))));
+#endif
       }
       
       template<int M, typename Vec = Vec4vf<M>>
@@ -496,10 +531,17 @@ namespace embree
       {
         assert(size <= PrecomputedBezierBasis::N);
         assert(ofs <= size);
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+        assert(size > 0);
+        const vfloat<M> t = (vfloat<M>(step) + vfloat<M>(ofs+1))*rcp(float(size));
+        Vec p,dp; veval<M>(t,p,dp);
+        return p;
+#else
         return madd(vfloat<M>::loadu(&bezier_basis1.c0[size][ofs]), Vec(v0), 
                     madd(vfloat<M>::loadu(&bezier_basis1.c1[size][ofs]), Vec(v1),
                          madd(vfloat<M>::loadu(&bezier_basis1.c2[size][ofs]), Vec(v2),
                               vfloat<M>::loadu(&bezier_basis1.c3[size][ofs]) * Vec(v3))));
+#endif
       }
       
       template<int M, typename Vec = Vec4vf<M>>
@@ -507,10 +549,17 @@ namespace embree
       {
         assert(size <= PrecomputedBezierBasis::N);
         assert(ofs <= size);
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+        assert(size > 0);
+        const vfloat<M> t = (vfloat<M>(step) + vfloat<M>(ofs+0))*rcp(float(size));
+        Vec p,dp; veval<M>(t,p,dp);
+        return dp;
+#else
         return madd(vfloat<M>::loadu(&bezier_basis0.d0[size][ofs]), Vec(v0),
                     madd(vfloat<M>::loadu(&bezier_basis0.d1[size][ofs]), Vec(v1),
                          madd(vfloat<M>::loadu(&bezier_basis0.d2[size][ofs]), Vec(v2),
                               vfloat<M>::loadu(&bezier_basis0.d3[size][ofs]) * Vec(v3))));
+#endif
       }
       
       template<int M, typename Vec = Vec4vf<M>>
@@ -518,10 +567,17 @@ namespace embree
       {
         assert(size <= PrecomputedBezierBasis::N);
         assert(ofs <= size);
+#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
+        assert(size > 0);
+        const vfloat<M> t = (vfloat<M>(step) + vfloat<M>(ofs+1))*rcp(float(size));
+        Vec p,dp; veval<M>(t,p,dp);
+        return dp;
+#else
         return madd(vfloat<M>::loadu(&bezier_basis1.d0[size][ofs]), Vec(v0),
                     madd(vfloat<M>::loadu(&bezier_basis1.d1[size][ofs]), Vec(v1),
                          madd(vfloat<M>::loadu(&bezier_basis1.d2[size][ofs]), Vec(v2),
                               vfloat<M>::loadu(&bezier_basis1.d3[size][ofs]) * Vec(v3))));
+#endif
       }
 
       /* calculates bounds of bezier curve geometry */
@@ -532,7 +588,7 @@ namespace embree
         Vec3vfx pl(pos_inf), pu(neg_inf);
         for (int i=0; i<=N; i+=VSIZEX)
         {
-          vintx vi = vintx(i)+vintx(step);
+          vintx vi = vintx(i)+vintx(StepTy());
           vboolx valid = vi <= vintx(N);
           const Vec3vfx p  = eval0<VSIZEX,Vec3vf<VSIZEX>>(i,N);
           const Vec3vfx dp = derivative0<VSIZEX,Vec3vf<VSIZEX>>(i,N);
@@ -554,7 +610,7 @@ namespace embree
         Vec4vfx pl(pos_inf), pu(neg_inf);
         for (int i=0; i<=N; i+=VSIZEX)
         {
-          vintx vi = vintx(i)+vintx(step);
+          vintx vi = vintx(i)+vintx(StepTy());
           vboolx valid = vi <= vintx(N);
           const Vec4vfx p  = eval0<VSIZEX>(i,N);
           const Vec4vfx dp = derivative0<VSIZEX>(i,N);
@@ -587,7 +643,7 @@ namespace embree
           Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f);
           for (int i=0; i<N; i+=VSIZEX)
           {
-            vboolx valid = vintx(i)+vintx(step) < vintx(N);
+            vboolx valid = vintx(i)+vintx(StepTy()) < vintx(N);
             const Vec4vfx pi = eval0<VSIZEX>(i,N);
             
             pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min
@@ -645,6 +701,7 @@ namespace embree
   typedef CubicBezierCurve<Vec2fa> CubicBezierCurve2fa;
   typedef CubicBezierCurve<Vec3fa> CubicBezierCurve3fa;
   typedef CubicBezierCurve<Vec3fa> BezierCurve3fa;
+  typedef CubicBezierCurve<Vec3ff> BezierCurve3ff;
   
   template<> __forceinline int CubicBezierCurve<float>::maxRoots() const
   {
@@ -660,8 +717,9 @@ namespace embree
     return numRoots(v0,v1) + numRoots(v1,v2) + numRoots(v2,v3);
   }
 
+  struct CurveGeometry; // FIXME: this code should move !
   template<typename CurveGeometry>
-  __forceinline CubicBezierCurve<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CubicBezierCurve<Vec3ff>& curve)
+  __forceinline CubicBezierCurve<Vec3ff> enlargeRadiusToMinWidth(const RayQueryContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CubicBezierCurve<Vec3ff>& curve)
   {
     return CubicBezierCurve<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0),
                                     enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1),
diff --git a/thirdparty/embree/kernels/subdiv/bspline_curve.h b/thirdparty/embree/kernels/subdiv/bspline_curve.h
index 51489ef37c..5d25ebb8e4 100644
--- a/thirdparty/embree/kernels/subdiv/bspline_curve.h
+++ b/thirdparty/embree/kernels/subdiv/bspline_curve.h
@@ -129,6 +129,12 @@ namespace embree
         const Vec4<float> b = BSplineBasis::derivative2(t);
         return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
       }
+
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp) const
+      {
+        p = eval(t);
+        dp = eval_du(t);
+      }
       
       __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const
       {
@@ -307,7 +313,7 @@ namespace embree
   }
 
   template<typename CurveGeometry>
-  __forceinline BSplineCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const BSplineCurveT<Vec3ff>& curve)
+  __forceinline BSplineCurveT<Vec3ff> enlargeRadiusToMinWidth(const RayQueryContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const BSplineCurveT<Vec3ff>& curve)
   {
     return BSplineCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0),
                                  enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1),
diff --git a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
index 9532287d98..c42435b9d7 100644
--- a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
+++ b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
@@ -4,12 +4,13 @@
 #pragma once
 
 #include "../common/default.h"
+#include "bezier_curve.h"
 #include "../common/scene_curves.h"
 
 /*
 
-  Implements Catmull-Rom curves with control points p0, p1, p2, p3. At
-  t=0 the curve goes through p1, with tangent (p2-p0)/3, and for t=1
+  Implements Catmull-Rom curves with control points p0, p1, p2, p3. At
+  t=0 the curve goes through p1, with tangent (p2-p0)/2, and for t=1
   the curve goes through p2 with tangent (p3-p2)/2.
 
  */
@@ -99,7 +100,7 @@ namespace embree
       }
 
       __forceinline Vertex center() const {
-        return 0.25f*(v0+v1+v2+v3);
+        return 0.5f*(v0+v1);
       }
 
       __forceinline BBox<Vertex> bounds() const {
@@ -136,6 +137,12 @@ namespace embree
         const Vec4<float> b = CatmullRomBasis::derivative2(t);
         return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
       }
+
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp) const
+      {
+        p = eval(t);
+        dp = eval_du(t);
+      }
       
       __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const
       {
@@ -283,8 +290,18 @@ namespace embree
       }
     };
 
+  template<typename Vertex>
+    __forceinline void convert(const CatmullRomCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve)
+  {
+    const Vertex v0 = icurve.v1;
+    const Vertex v1 = icurve.v1+(icurve.v2-icurve.v0)*(1.0f/6.0f);
+    const Vertex v2 = icurve.v2+(icurve.v1-icurve.v3)*(1.0f/6.0f);
+    const Vertex v3 = icurve.v2;
+    ocurve = BezierCurveT<Vertex>(v0,v1,v2,v3);
+  }
+
   template<typename CurveGeometry>
-  __forceinline CatmullRomCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CatmullRomCurveT<Vec3ff>& curve)
+  __forceinline CatmullRomCurveT<Vec3ff> enlargeRadiusToMinWidth(const RayQueryContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CatmullRomCurveT<Vec3ff>& curve)
   {
     return CatmullRomCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0),
                                     enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1),
diff --git a/thirdparty/embree/kernels/subdiv/hermite_curve.h b/thirdparty/embree/kernels/subdiv/hermite_curve.h
index ffef5a4315..09ad4cd8b4 100644
--- a/thirdparty/embree/kernels/subdiv/hermite_curve.h
+++ b/thirdparty/embree/kernels/subdiv/hermite_curve.h
@@ -29,8 +29,14 @@ namespace embree
       }
     };
 
+  template<typename Vertex>
+    __forceinline void convert(const HermiteCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve)
+  {
+    ocurve = BezierCurveT<Vertex>(icurve.v0,icurve.v1,icurve.v2,icurve.v3);
+  }
+  
   template<typename CurveGeometry>
-  __forceinline HermiteCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const HermiteCurveT<Vec3ff>& curve) {
+  __forceinline HermiteCurveT<Vec3ff> enlargeRadiusToMinWidth(const RayQueryContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const HermiteCurveT<Vec3ff>& curve) {
     return HermiteCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,BezierCurveT<Vec3ff>(curve)));
   }
   
diff --git a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
index dcdb101d7c..e93a86d7a3 100644
--- a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
+++ b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
@@ -31,7 +31,8 @@ namespace embree
           return merge(L.bounds(),R.bounds());
         }
       };
-    
+
+#if !defined(__SYCL_DEVICE_ONLY__)   
     template<>
       struct TensorLinearQuadraticBezierSurface<Vec2fa>
     {
@@ -57,6 +58,7 @@ namespace embree
         return merge(bl,br);
       }
     };
+#endif
     
     template<typename V>
       struct TensorLinearCubicBezierSurface
@@ -148,9 +150,10 @@ namespace embree
         __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx) const {
           return TensorLinearCubicBezierSurface<float>(L.xfm(dx),R.xfm(dx));
         }
-        
-        __forceinline TensorLinearCubicBezierSurface<vfloatx> vxfm(const V& dx) const {
-          return TensorLinearCubicBezierSurface<vfloatx>(L.vxfm(dx),R.vxfm(dx));
+
+        template<int W>
+        __forceinline TensorLinearCubicBezierSurface<vfloat<W>> vxfm(const V& dx) const {
+          return TensorLinearCubicBezierSurface<vfloat<W>>(L.template vxfm<W>(dx),R.template vxfm<W>(dx));
         }
         
         __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx, const V& p) const {
@@ -188,11 +191,20 @@ namespace embree
           new (&left ) TensorLinearCubicBezierSurface(L0,R0);
           new (&right) TensorLinearCubicBezierSurface(L1,R1);
         }
-        
+
         __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const {
           valid = true; clear(valid,VSIZEX-1);
           return TensorLinearCubicBezierSurface<Vec2vfx>(L.split(u),R.split(u));
         }
+
+        template<int W>
+        __forceinline TensorLinearCubicBezierSurface<Vec2vf<W>> vsplit_u(vbool<W>& valid, const BBox1f& u, int& i, int N) const
+        {
+          valid = true; clear(valid,W-1); 
+          auto r = TensorLinearCubicBezierSurface<Vec2vf<W>>(L.template split<W>(u,i,N),R.template split<W>(u,i,N));
+          i += W-1;
+          return r;
+        }
         
         __forceinline V eval(const float u, const float v) const {
           return clerp(L,R,V(v)).eval(u);
@@ -244,6 +256,8 @@ namespace embree
           return TensorLinearCubicBezierSurface(clerp(a.L,b.L,V(t)), clerp(a.R,b.R,V(t)));
         }
       };
+
+#if !defined(__SYCL_DEVICE_ONLY__)
     
     template<>
       struct TensorLinearCubicBezierSurface<Vec2fa>
@@ -332,12 +346,20 @@ namespace embree
         new (&left ) TensorLinearCubicBezierSurface(LR0);
         new (&right) TensorLinearCubicBezierSurface(LR1);
       }
-      
+
       __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const {
         valid = true; clear(valid,VSIZEX-1);
         return TensorLinearCubicBezierSurface<Vec2vfx>(getL().split(u),getR().split(u));
       }
       
+      template<int W>
+      __forceinline TensorLinearCubicBezierSurface<Vec2vf<W>> vsplit_u(vbool<W>& valid, const BBox1f& u, int& i, int N) const {
+        valid = true; clear(valid,W-1); 
+        auto r = TensorLinearCubicBezierSurface<Vec2vf<W>>(getL().split<W>(u,i,N),getR().split<W>(u,i,N));
+        i += W-1;
+        return r;
+      }
+      
       __forceinline Vec2fa eval(const float u, const float v) const
       {
         const vfloat4 p = LR.eval(u);
@@ -396,6 +418,24 @@ namespace embree
       }
     };
 
+    template<>
+    __forceinline TensorLinearCubicBezierSurface<Vec2f> TensorLinearCubicBezierSurface<Vec2fa>::vsplit_u<1>(bool& valid, const BBox1f& u, int& i, int N) const {
+      auto r = TensorLinearCubicBezierSurface<Vec2f>(getL().split1(u,i,N),getR().split1(u,i,N));
+      valid = true; i += 1;
+      return r;
+    }
+    
+#else
+
+    template<> template<>
+    __forceinline TensorLinearCubicBezierSurface<Vec2f> TensorLinearCubicBezierSurface<Vec2fa>::vsplit_u<1>(bool& valid, const BBox1f& u, int& i, int N) const {
+      auto r = TensorLinearCubicBezierSurface<Vec2f>(L.split1(u,i,N),R.split1(u,i,N));
+      valid = true; i += 1;
+      return r;
+    }
+
+#endif
+
     typedef TensorLinearCubicBezierSurface<float> TensorLinearCubicBezierSurface1f;
     typedef TensorLinearCubicBezierSurface<Vec2fa> TensorLinearCubicBezierSurface2fa;
     typedef TensorLinearCubicBezierSurface<Vec3fa> TensorLinearCubicBezierSurface3fa;
diff --git a/thirdparty/embree/patches/godot-changes-noexcept.patch b/thirdparty/embree/patches/godot-changes-noexcept.patch
index 84169c36e4..bf9973a764 100644
--- a/thirdparty/embree/patches/godot-changes-noexcept.patch
+++ b/thirdparty/embree/patches/godot-changes-noexcept.patch
@@ -1,121 +1,5 @@
-diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
-index f2969a88f1..6d411e4852 100644
---- a/thirdparty/embree/common/algorithms/parallel_for.h
-+++ b/thirdparty/embree/common/algorithms/parallel_for.h
-@@ -21,7 +21,10 @@ namespace embree
-           func(r.begin());
-         });
-       if (!TaskScheduler::wait())
--        throw std::runtime_error("task cancelled");
-+        // -- GODOT start --
-+        // throw std::runtime_error("task cancelled");
-+        abort();
-+        // -- GODOT end --
-     }
- #elif defined(TASKING_TBB)
-   #if TBB_INTERFACE_VERSION >= 12002
-@@ -30,13 +33,19 @@ namespace embree
-         func(i);
-       },context);
-     if (context.is_group_execution_cancelled())
--      throw std::runtime_error("task cancelled");
-+      // -- GODOT start --
-+      // throw std::runtime_error("task cancelled");
-+      abort();
-+      // -- GODOT end --
-   #else
-     tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
-         func(i);
-       });
-     if (tbb::task::self().is_cancelled())
--      throw std::runtime_error("task cancelled");
-+      // -- GODOT start --
-+      // throw std::runtime_error("task cancelled");
-+      abort();
-+      // -- GODOT end --
-   #endif
- 
- #elif defined(TASKING_PPL)
-@@ -56,7 +65,10 @@ namespace embree
- #if defined(TASKING_INTERNAL)
-     TaskScheduler::spawn(first,last,minStepSize,func);
-     if (!TaskScheduler::wait())
--      throw std::runtime_error("task cancelled");
-+      // -- GODOT start --
-+      // throw std::runtime_error("task cancelled");
-+      abort();
-+      // -- GODOT end --
- 
- #elif defined(TASKING_TBB)
-   #if TBB_INTERFACE_VERSION >= 12002
-@@ -65,13 +77,19 @@ namespace embree
-         func(range<Index>(r.begin(),r.end()));
-       },context);
-     if (context.is_group_execution_cancelled())
--      throw std::runtime_error("task cancelled");
-+      // -- GODOT start --
-+      // throw std::runtime_error("task cancelled");
-+      abort();
-+      // -- GODOT end --
-   #else
-     tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
-         func(range<Index>(r.begin(),r.end()));
-       });
-     if (tbb::task::self().is_cancelled())
--      throw std::runtime_error("task cancelled");
-+      // -- GODOT start --
-+      // throw std::runtime_error("task cancelled");
-+      abort();
-+      // -- GODOT end --
-   #endif
- 
- #elif defined(TASKING_PPL)
-@@ -103,13 +121,19 @@ namespace embree
-           func(i);
-         },tbb::simple_partitioner(),context);
-       if (context.is_group_execution_cancelled())
--        throw std::runtime_error("task cancelled");
-+        // -- GODOT start --
-+        // throw std::runtime_error("task cancelled");
-+        abort();
-+        // -- GODOT end --
-     #else
-       tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
-           func(i);
-         },tbb::simple_partitioner());
-       if (tbb::task::self().is_cancelled())
--        throw std::runtime_error("task cancelled");
-+        // -- GODOT start --
-+        // throw std::runtime_error("task cancelled");
-+        abort();
-+        // -- GODOT end --
-     #endif
-   }
- 
-@@ -124,13 +148,19 @@ namespace embree
-           func(i);
-         },ap,context);
-       if (context.is_group_execution_cancelled())
--        throw std::runtime_error("task cancelled");
-+        // -- GODOT start --
-+        // throw std::runtime_error("task cancelled");
-+        abort();
-+        // -- GODOT end --
-     #else
-       tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
-           func(i);
-         },ap);
-       if (tbb::task::self().is_cancelled())
--        throw std::runtime_error("task cancelled");
-+        // -- GODOT start --
-+        // throw std::runtime_error("task cancelled");
-+        abort();
-+        // -- GODOT end --
-     #endif
-   }
- 
 diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
-index 1a94aad8c4..cd0078f2e6 100644
+index b52b1e2e13..51ec0a6405 100644
 --- a/thirdparty/embree/common/algorithms/parallel_reduce.h
 +++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
 @@ -58,15 +58,19 @@ namespace embree
@@ -143,119 +27,233 @@ index 1a94aad8c4..cd0078f2e6 100644
    #endif
  #else // TASKING_PPL
 diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp
-index 42ffb10176..a037869506 100644
+index 42ffb10176..c93da0b420 100644
 --- a/thirdparty/embree/common/lexers/stringstream.cpp
 +++ b/thirdparty/embree/common/lexers/stringstream.cpp
-@@ -39,7 +39,10 @@ namespace embree
+@@ -39,7 +39,12 @@ namespace embree
      std::vector<char> str; str.reserve(64);
      while (cin->peek() != EOF && !isSeparator(cin->peek())) {
        int c = cin->get();
 -      if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
 +      // -- GODOT start --
 +      // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
-+      if (!isValidChar(c)) abort();
++      if (!isValidChar(c)) {
++        abort();
++      }
 +      // -- GODOT end --
        str.push_back((char)c);
      }
      str.push_back(0);
 diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp
-index 1bc30fe9a5..abdd269069 100644
+index de225fafc6..71616a3982 100644
 --- a/thirdparty/embree/common/sys/alloc.cpp
 +++ b/thirdparty/embree/common/sys/alloc.cpp
-@@ -21,7 +21,10 @@ namespace embree
+@@ -24,16 +24,32 @@ namespace embree
+   
+   void enableUSMAllocEmbree(sycl::context* context, sycl::device* device)
+   {
+-    if (tls_context_embree != nullptr) throw std::runtime_error("USM allocation already enabled");
+-    if (tls_device_embree != nullptr) throw std::runtime_error("USM allocation already enabled");
++    // -- GODOT start --
++    // if (tls_context_embree != nullptr) throw std::runtime_error("USM allocation already enabled");
++    // if (tls_device_embree != nullptr) throw std::runtime_error("USM allocation already enabled");
++    if (tls_context_embree != nullptr) {
++      abort();
++    }
++    if (tls_device_embree != nullptr) {
++      abort();
++    }
++    // -- GODOT end --
+     tls_context_embree = context;
+     tls_device_embree = device;
+   }
+ 
+   void disableUSMAllocEmbree()
+   {
+-    if (tls_context_embree  == nullptr) throw std::runtime_error("USM allocation not enabled");
+-    if (tls_device_embree  == nullptr) throw std::runtime_error("USM allocation not enabled");
++    // -- GODOT start --
++    // if (tls_context_embree  == nullptr) throw std::runtime_error("USM allocation not enabled");
++    // if (tls_device_embree  == nullptr) throw std::runtime_error("USM allocation not enabled");
++    if (tls_context_embree  == nullptr) {
++      abort();
++    }
++    if (tls_device_embree  == nullptr) {
++      abort();
++    }
++    // -- GODOT end --
+     tls_context_embree = nullptr;
+     tls_device_embree = nullptr;
+   }
+@@ -48,8 +64,16 @@ namespace embree
+ 
+   void disableUSMAllocTutorial()
+   {
+-    if (tls_context_tutorial  == nullptr) throw std::runtime_error("USM allocation not enabled");
+-    if (tls_device_tutorial  == nullptr) throw std::runtime_error("USM allocation not enabled");
++    // -- GODOT start --
++    // if (tls_context_tutorial  == nullptr) throw std::runtime_error("USM allocation not enabled");
++    // if (tls_device_tutorial  == nullptr) throw std::runtime_error("USM allocation not enabled");
++    if (tls_context_tutorial  == nullptr) {
++      abort();
++    }
++    if (tls_device_tutorial  == nullptr) {
++      abort();
++    }
++    // -- GODOT end --
+     
+     tls_context_tutorial = nullptr;
+     tls_device_tutorial = nullptr;
+@@ -64,8 +88,13 @@ namespace embree
+ 
+     assert((align & (align-1)) == 0);
      void* ptr = _mm_malloc(size,align);
+-    if (size != 0 && ptr == nullptr)
+-      throw std::bad_alloc();
++    // -- GODOT start --
++    // if (size != 0 && ptr == nullptr)
++    //   throw std::bad_alloc();
++    if (size != 0 && ptr == nullptr) {
++      abort();
++    }
++    // -- GODOT end --
+     return ptr;
+   }
  
-     if (size != 0 && ptr == nullptr)
+@@ -94,8 +123,13 @@ namespace embree
+     else
+       ptr = sycl::aligned_alloc_shared(align,size,*device,*context);
+       
+-    if (size != 0 && ptr == nullptr)
 -      throw std::bad_alloc();
-+      // -- GODOT start --
-+      // throw std::bad_alloc();
++    // -- GODOT start --
++    // if (size != 0 && ptr == nullptr)
++    //   throw std::bad_alloc();
++    if (size != 0 && ptr == nullptr) {
 +      abort();
-+      // -- GODOT end --
-     
++    }
++    // -- GODOT end --
+ 
      return ptr;
    }
-@@ -128,7 +131,10 @@ namespace embree
+@@ -241,7 +275,12 @@ namespace embree
      /* fall back to 4k pages */
      int flags = MEM_COMMIT | MEM_RESERVE;
      char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
 -    if (ptr == nullptr) throw std::bad_alloc();
 +    // -- GODOT start --
 +    // if (ptr == nullptr) throw std::bad_alloc();
-+    if (ptr == nullptr) abort();
++    if (ptr == nullptr) {
++      abort();
++    }
 +    // -- GODOT end --
      hugepages = false;
      return ptr;
    }
-@@ -145,7 +151,10 @@ namespace embree
+@@ -257,8 +296,13 @@ namespace embree
+     if (bytesNew >= bytesOld)
        return bytesOld;
  
-     if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
+-    if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
 -      throw std::bad_alloc();
-+      // -- GODOT start --
-+      // throw std::bad_alloc();
++    // -- GODOT start --
++    // if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
++    //   throw std::bad_alloc();
++    if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) {
 +      abort();
-+      // -- GODOT end --
++    }
++    // -- GODOT end --
  
      return bytesNew;
    }
-@@ -156,7 +165,10 @@ namespace embree
+@@ -268,8 +312,13 @@ namespace embree
+     if (bytes == 0) 
        return;
  
-     if (!VirtualFree(ptr,0,MEM_RELEASE))
+-    if (!VirtualFree(ptr,0,MEM_RELEASE))
 -      throw std::bad_alloc();
-+      // -- GODOT start --
-+      // throw std::bad_alloc();
++    // -- GODOT start --
++    // if (!VirtualFree(ptr,0,MEM_RELEASE))
++    //   throw std::bad_alloc();
++    if (!VirtualFree(ptr,0,MEM_RELEASE)) {
 +      abort();
-+      // -- GODOT end --
++    }
++    // -- GODOT end --
    }
  
    void os_advise(void *ptr, size_t bytes)
-@@ -260,7 +272,10 @@ namespace embree
+@@ -373,7 +422,12 @@ namespace embree
  
      /* fallback to 4k pages */
      void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
 -    if (ptr == MAP_FAILED) throw std::bad_alloc();
 +    // -- GODOT start --
 +    // if (ptr == MAP_FAILED) throw std::bad_alloc();
-+    if (ptr == MAP_FAILED) abort();
++    if (ptr == MAP_FAILED) {
++      abort();
++    }
 +    // -- GODOT end --
      hugepages = false;
  
      /* advise huge page hint for THP */
-@@ -277,7 +292,10 @@ namespace embree
+@@ -389,8 +443,13 @@ namespace embree
+     if (bytesNew >= bytesOld)
        return bytesOld;
  
-     if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
+-    if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
 -      throw std::bad_alloc();
-+      // -- GODOT start --
-+      // throw std::bad_alloc();
++    // -- GODOT start --
++    // if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
++    //   throw std::bad_alloc();
++    if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) {
 +      abort();
-+      // -- GODOT end --
++    }
++    // -- GODOT end --
  
      return bytesNew;
    }
-@@ -291,7 +309,10 @@ namespace embree
+@@ -403,8 +462,13 @@ namespace embree
+     /* for hugepages we need to also align the size */
      const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
      bytes = (bytes+pageSize-1) & ~(pageSize-1);
-     if (munmap(ptr,bytes) == -1)
+-    if (munmap(ptr,bytes) == -1)
 -      throw std::bad_alloc();
-+      // -- GODOT start --
-+      // throw std::bad_alloc();
++    // -- GODOT start --
++    // if (munmap(ptr,bytes) == -1)
++    //   throw std::bad_alloc();
++    if (munmap(ptr,bytes) == -1) {
 +      abort();
-+      // -- GODOT end --
++    }
++    // -- GODOT end --
    }
  
    /* hint for transparent huge pages (THP) */
+diff --git a/thirdparty/embree/common/sys/alloc.h b/thirdparty/embree/common/sys/alloc.h
+index e19c2c221a..28b17f988d 100644
+--- a/thirdparty/embree/common/sys/alloc.h
++++ b/thirdparty/embree/common/sys/alloc.h
+@@ -160,7 +160,10 @@ namespace embree
+       typedef std::ptrdiff_t difference_type;
+ 
+       __forceinline pointer allocate( size_type n ) {
+-        throw std::runtime_error("no allocation supported");
++        // -- GODOT start --
++        // throw std::runtime_error("no allocation supported");
++        abort();
++        // -- GODOT end --
+       }
+ 
+       __forceinline void deallocate( pointer p, size_type n ) {
 diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h
-index be3ec36436..728bf6ed7d 100644
+index 6dc0cf3318..d4a9b9e119 100644
 --- a/thirdparty/embree/common/sys/platform.h
 +++ b/thirdparty/embree/common/sys/platform.h
-@@ -178,11 +178,19 @@
- #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+@@ -213,11 +213,19 @@
+ #define UPRINT4(x,y,z,w) embree_cout_uniform << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
  
  #if defined(DEBUG) // only report file and line in debug mode
 +  // -- GODOT start --
-+  // #define THROW_RUNTIME_ERROR(str)
++  // #define THROW_RUNTIME_ERROR(str) \
 +  //   throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
    #define THROW_RUNTIME_ERROR(str) \
 -    throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
@@ -263,7 +261,7 @@ index be3ec36436..728bf6ed7d 100644
 +  // -- GODOT end --
  #else
 +  // -- GODOT start --
-+  // #define THROW_RUNTIME_ERROR(str)
++  // #define THROW_RUNTIME_ERROR(str) \
 +  //   throw std::runtime_error(str);
    #define THROW_RUNTIME_ERROR(str) \
 -    throw std::runtime_error(str);
@@ -273,7 +271,7 @@ index be3ec36436..728bf6ed7d 100644
  
  #define FATAL(x)   THROW_RUNTIME_ERROR(x)
 diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
-index dca835a716..ad438588a3 100644
+index 83ead95122..88b88a30ec 100644
 --- a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
 +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp
 @@ -48,13 +48,15 @@ namespace embree
@@ -281,115 +279,59 @@ index dca835a716..ad438588a3 100644
        Task* prevTask = thread.task;
        thread.task = this;
 -      try {
--        if (thread.scheduler->cancellingException == nullptr)
+-        if (context->cancellingException == nullptr)
 +      // -- GODOT start --
 +      // try {
-+      // if (thread.scheduler->cancellingException == nullptr)
++      //   if (context->cancellingException == nullptr)
            closure->execute();
 -      } catch (...) {
--        if (thread.scheduler->cancellingException == nullptr)
--          thread.scheduler->cancellingException = std::current_exception();
+-        if (context->cancellingException == nullptr)
+-          context->cancellingException = std::current_exception();
 -      }
 +      // } catch (...) {
-+      //   if (thread.scheduler->cancellingException == nullptr)
-+      //     thread.scheduler->cancellingException = std::current_exception();
++      //   if (context->cancellingException == nullptr)
++      //     context->cancellingException = std::current_exception();
 +      // }
 +      // -- GODOT end --
        thread.task = prevTask;
        add_dependencies(-1);
      }
-@@ -291,8 +293,11 @@ namespace embree
-     size_t threadIndex = allocThreadIndex();
-     condition.wait(mutex, [&] () { return hasRootTask.load(); });
-     mutex.unlock();
--    std::exception_ptr except = thread_loop(threadIndex);
--    if (except != nullptr) std::rethrow_exception(except);
-+    // -- GODOT start --
-+    // std::exception_ptr except = thread_loop(threadIndex);
-+    // if (except != nullptr) std::rethrow_exception(except);
-+    thread_loop(threadIndex);
-+    // -- GODOT end --
-   }
- 
-   void TaskScheduler::reset() {
-@@ -324,7 +329,10 @@ namespace embree
-     return thread->scheduler->cancellingException == nullptr;
-   }
- 
--  std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
-+// -- GODOT start --
-+//   std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
-+  void TaskScheduler::thread_loop(size_t threadIndex)
-+// -- GODOT end --
-   {
-     /* allocate thread structure */
-     std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
-@@ -347,9 +355,10 @@ namespace embree
-     swapThread(oldThread);
- 
-     /* remember exception to throw */
--    std::exception_ptr except = nullptr;
--    if (cancellingException != nullptr) except = cancellingException;
--
-+    // -- GODOT start --
-+    // std::exception_ptr except = nullptr;
-+    // if (cancellingException != nullptr) except = cancellingException;
-+    // -- GODOT end --
-     /* wait for all threads to terminate */
-     threadCounter--;
- #if defined(__WIN32__)
-@@ -367,7 +376,10 @@ namespace embree
-           yield();
- #endif
- 	}
--    return except;
-+     // -- GODOT start --
-+     // return except;
-+     return;
-+     // -- GODOT end --
-   }
- 
-   bool TaskScheduler::steal_from_other_threads(Thread& thread)
 diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h
-index 61a0e57c5b..6cc2495195 100644
+index 355648b3f8..e72d3b72ba 100644
 --- a/thirdparty/embree/common/tasking/taskschedulerinternal.h
 +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h
-@@ -123,7 +123,10 @@ namespace embree
+@@ -130,8 +130,13 @@ namespace embree
+       __forceinline void* alloc(size_t bytes, size_t align = 64)
        {
          size_t ofs = bytes + ((align - stackPtr) & (align-1));
-         if (stackPtr + ofs > CLOSURE_STACK_SIZE)
+-        if (stackPtr + ofs > CLOSURE_STACK_SIZE)
 -          throw std::runtime_error("closure stack overflow");
-+          // -- GODOT start --
-+          // throw std::runtime_error("closure stack overflow");
++        // -- GODOT start --
++        // if (stackPtr + ofs > CLOSURE_STACK_SIZE)
++        //   throw std::runtime_error("closure stack overflow");
++        if (stackPtr + ofs > CLOSURE_STACK_SIZE) {
 +          abort();
-+          // -- GODOT end --
++        }
++        // -- GODOT end --
          stackPtr += ofs;
          return &stack[stackPtr-bytes];
        }
-@@ -132,7 +135,10 @@ namespace embree
-       __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
+@@ -139,8 +144,13 @@ namespace embree
+       template<typename Closure>
+       __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure, TaskGroupContext* context)
        {
-         if (right >= TASK_STACK_SIZE)
+-        if (right >= TASK_STACK_SIZE)
 -          throw std::runtime_error("task stack overflow");
-+           // -- GODOT start --
-+           // throw std::runtime_error("task stack overflow");
-+           abort();
-+           // -- GODOT end --
++        // -- GODOT start --
++        // if (right >= TASK_STACK_SIZE)
++        //   throw std::runtime_error("task stack overflow");
++        if (right >= TASK_STACK_SIZE) {
++          abort();
++        }
++        // -- GODOT end --
  
  	/* allocate new task on right side of stack */
          size_t oldStackPtr = stackPtr;
-@@ -238,7 +244,10 @@ namespace embree
-     void wait_for_threads(size_t threadCount);
- 
-     /*! thread loop for all worker threads */
--    std::exception_ptr thread_loop(size_t threadIndex);
-+    // -- GODOT start --
-+    // std::exception_ptr thread_loop(size_t threadIndex);
-+    void thread_loop(size_t threadIndex);
-+    // -- GODOT end --
- 
-     /*! steals a task from a different thread */
-     bool steal_from_other_threads(Thread& thread);
 diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
 index 40f9043736..57f75bfd7e 100644
 --- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
@@ -406,40 +348,94 @@ index 40f9043736..57f75bfd7e 100644
      }
      return s;
    } 
+diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h
+index 2bd292de4d..840d48c327 100644
+--- a/thirdparty/embree/kernels/common/alloc.h
++++ b/thirdparty/embree/kernels/common/alloc.h
+@@ -189,8 +189,13 @@ namespace embree
+       , atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC)
+       , primrefarray(device,0)
+     {
+-      if (osAllocation && useUSM)
+-        throw std::runtime_error("USM allocation cannot be combined with OS allocation.");
++      // -- GODOT start --
++      // if (osAllocation && useUSM)
++      //   throw std::runtime_error("USM allocation cannot be combined with OS allocation.");
++      if (osAllocation && useUSM) {
++        abort();
++      }
++      // -- GODOT end --
+ 
+       for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
+       {
+@@ -502,8 +507,13 @@ namespace embree
+         Block* myUsedBlocks = threadUsedBlocks[slot];
+         if (myUsedBlocks) {
+           void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
+-          if (ptr == nullptr && !blockAllocation)
+-            throw std::bad_alloc();
++          // -- GODOT start --
++          // if (ptr == nullptr && !blockAllocation)
++          //   throw std::bad_alloc();
++          if (ptr == nullptr && !blockAllocation) {
++            abort();
++          }
++          // -- GODOT end --
+           if (ptr) return ptr;
+         }
+ 
 diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp
-index 95a94319ec..a6ea55bfc4 100644
+index 8dc5d7045b..eb8d2c0a58 100644
 --- a/thirdparty/embree/kernels/common/rtcore.cpp
 +++ b/thirdparty/embree/kernels/common/rtcore.cpp
-@@ -198,7 +198,10 @@ RTC_NAMESPACE_BEGIN;
+@@ -257,10 +257,17 @@ RTC_NAMESPACE_BEGIN;
+     RTC_TRACE(rtcSetSceneBuildQuality);
+     RTC_VERIFY_HANDLE(hscene);
+     RTC_ENTER_DEVICE(hscene);
++    // -- GODOT start --
++    // if (quality != RTC_BUILD_QUALITY_LOW &&
++    //     quality != RTC_BUILD_QUALITY_MEDIUM &&
++    //     quality != RTC_BUILD_QUALITY_HIGH)
++    //   throw std::runtime_error("invalid build quality");
      if (quality != RTC_BUILD_QUALITY_LOW &&
          quality != RTC_BUILD_QUALITY_MEDIUM &&
-         quality != RTC_BUILD_QUALITY_HIGH)
+-        quality != RTC_BUILD_QUALITY_HIGH)
 -      throw std::runtime_error("invalid build quality");
-+      // -- GODOT start --
-+      // throw std::runtime_error("invalid build quality");
++        quality != RTC_BUILD_QUALITY_HIGH) {
 +      abort();
-+      // -- GODOT end --
++    }
++    // -- GODOT end --
      scene->setBuildQuality(quality);
      RTC_CATCH_END2(scene);
    }
-@@ -1351,7 +1354,10 @@ RTC_NAMESPACE_BEGIN;
+@@ -1563,11 +1570,19 @@ RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeSte
+     RTC_TRACE(rtcSetGeometryBuildQuality);
+     RTC_VERIFY_HANDLE(hgeometry);
+     RTC_ENTER_DEVICE(hgeometry);
++    // -- GODOT start --
++    // if (quality != RTC_BUILD_QUALITY_LOW &&
++    //     quality != RTC_BUILD_QUALITY_MEDIUM &&
++    //     quality != RTC_BUILD_QUALITY_HIGH &&
++    //     quality != RTC_BUILD_QUALITY_REFIT)
++    //   throw std::runtime_error("invalid build quality");
+     if (quality != RTC_BUILD_QUALITY_LOW &&
          quality != RTC_BUILD_QUALITY_MEDIUM &&
          quality != RTC_BUILD_QUALITY_HIGH &&
-         quality != RTC_BUILD_QUALITY_REFIT)
+-        quality != RTC_BUILD_QUALITY_REFIT)
 -      throw std::runtime_error("invalid build quality");
-+      // -- GODOT start --
-+      // throw std::runtime_error("invalid build quality");
++        quality != RTC_BUILD_QUALITY_REFIT) {
 +      abort();
-+      // -- GODOT end --
++    }
++    // -- GODOT end --
      geometry->setBuildQuality(quality);
      RTC_CATCH_END2(geometry);
    }
 diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h
-index 4e4b24e9c2..ac58a84d6f 100644
+index 73a061de11..47526482c1 100644
 --- a/thirdparty/embree/kernels/common/rtcore.h
 +++ b/thirdparty/embree/kernels/common/rtcore.h
-@@ -25,6 +25,13 @@ namespace embree
- #endif
+@@ -13,13 +13,13 @@ namespace embree
+   __forceinline bool isIncoherent(RTCRayQueryFlags flags) { return (flags & RTC_RAY_QUERY_FLAG_COHERENT) == RTC_RAY_QUERY_FLAG_INCOHERENT; }
  
  /*! Macros used in the rtcore API implementation */
 +// -- GODOT start --
@@ -447,21 +443,18 @@ index 4e4b24e9c2..ac58a84d6f 100644
 +#define RTC_CATCH_END(device)
 +#define RTC_CATCH_END2(scene)
 +#define RTC_CATCH_END2_FALSE(scene) return false;
-+
-+#if 0
+ #if 0
+-#  define RTC_CATCH_BEGIN
+-#  define RTC_CATCH_END(device)
+-#  define RTC_CATCH_END2(scene)
+-#  define RTC_CATCH_END2_FALSE(scene) return false;
+-#else
+-  
++// -- GODOT end --
  #define RTC_CATCH_BEGIN try {
    
  #define RTC_CATCH_END(device)                                                \
-@@ -71,6 +78,8 @@ namespace embree
-     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-     return false;                                                               \
-   }
-+#endif
-+// -- GODOT end --
- 
- #define RTC_VERIFY_HANDLE(handle)                               \
-   if (handle == nullptr) {                                         \
-@@ -97,6 +106,8 @@ namespace embree
+@@ -94,6 +94,8 @@ namespace embree
  #define RTC_TRACE(x) 
  #endif
  
@@ -470,7 +463,7 @@ index 4e4b24e9c2..ac58a84d6f 100644
    /*! used to throw embree API errors */
    struct rtcore_error : public std::exception
    {
-@@ -112,14 +123,18 @@ namespace embree
+@@ -109,14 +111,18 @@ namespace embree
      RTCError error;
      std::string str;
    };
@@ -490,36 +483,61 @@ index 4e4b24e9c2..ac58a84d6f 100644
 +// -- GODOT end --
  
  #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
-   (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) 
+   (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member)))
 diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp
-index ad1916c54e..65d31d0f81 100644
+index fda8dd938a..10cb3c4bec 100644
 --- a/thirdparty/embree/kernels/common/scene.cpp
 +++ b/thirdparty/embree/kernels/common/scene.cpp
-@@ -790,16 +790,18 @@ namespace embree
+@@ -894,16 +894,18 @@ namespace embree
      }
  
      /* initiate build */
 -    try {
 +    // -- GODOT start --
 +    // try {
-       scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
+       TaskScheduler::TaskGroupContext context;
+       scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(taskGroup->schedulerMutex); taskGroup->scheduler = nullptr; }, &context, 1, !join);
 -    }
 -    catch (...) {
 -      accels_clear();
--      updateInterface();
--      Lock<MutexSys> lock(schedulerMutex);
--      this->scheduler = nullptr;
+-      Lock<MutexSys> lock(taskGroup->schedulerMutex);
+-      taskGroup->scheduler = nullptr;
 -      throw;
 -    }
 +    // }
 +    // catch (...) {
 +    //   accels_clear();
-+    //   updateInterface();
-+    //   Lock<MutexSys> lock(schedulerMutex);
-+    //   this->scheduler = nullptr;
++    //   Lock<MutexSys> lock(taskGroup->schedulerMutex);
++    //   taskGroup->scheduler = nullptr;
 +    //   throw;
 +    // }
 +    // -- GODOT end --
    }
  
  #endif
+diff --git a/thirdparty/embree/kernels/common/state.cpp b/thirdparty/embree/kernels/common/state.cpp
+index 4e3ab6ddfb..1d73ae9629 100644
+--- a/thirdparty/embree/kernels/common/state.cpp
++++ b/thirdparty/embree/kernels/common/state.cpp
+@@ -194,13 +194,15 @@ namespace embree
+   bool State::parseFile(const FileName& fileName)
+   { 
+     Ref<Stream<int> > file;
+-    try {
++    // -- GODOT start --
++    // try {
+       file = new FileStream(fileName);
+-    }
+-    catch (std::runtime_error& e) {
+-      (void) e;
+-      return false;
+-    }
++    // }
++    // catch (std::runtime_error& e) {
++    //   (void) e;
++    //   return false;
++    // }
++    // -- GODOT end --
+     
+     std::vector<std::string> syms;
+     for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++) 
diff --git a/thirdparty/embree/patches/godot-config-changes.patch b/thirdparty/embree/patches/godot-config-changes.patch
new file mode 100644
index 0000000000..88e15c282c
--- /dev/null
+++ b/thirdparty/embree/patches/godot-config-changes.patch
@@ -0,0 +1,102 @@
+diff --git a/thirdparty/embree/include/embree4/rtcore_config.h b/thirdparty/embree/include/embree4/rtcore_config.h
+index 8abd6954c3..cb3a8678a7 100644
+--- a/thirdparty/embree/include/embree4/rtcore_config.h
++++ b/thirdparty/embree/include/embree4/rtcore_config.h
+@@ -4,7 +4,7 @@
+ #pragma once
+ 
+ #if !defined(EMBREE_SYCL_SUPPORT)
+-#cmakedefine EMBREE_SYCL_SUPPORT
++// #cmakedefine EMBREE_SYCL_SUPPORT
+ #endif
+ 
+ #define RTC_VERSION_MAJOR 4
+@@ -13,28 +13,28 @@
+ #define RTC_VERSION 40301
+ #define RTC_VERSION_STRING "4.3.1"
+ 
+-#define RTC_MAX_INSTANCE_LEVEL_COUNT @EMBREE_MAX_INSTANCE_LEVEL_COUNT@
++#define RTC_MAX_INSTANCE_LEVEL_COUNT 1
+ 
+-#cmakedefine EMBREE_GEOMETRY_INSTANCE_ARRAY
++// #cmakedefine EMBREE_GEOMETRY_INSTANCE_ARRAY
+ #if defined(EMBREE_GEOMETRY_INSTANCE_ARRAY)
+   #define RTC_GEOMETRY_INSTANCE_ARRAY
+ #endif
+ 
+-#cmakedefine01 EMBREE_SYCL_GEOMETRY_CALLBACK
++// #cmakedefine01 EMBREE_SYCL_GEOMETRY_CALLBACK
+ 
+-#cmakedefine01 EMBREE_MIN_WIDTH
++#define EMBREE_MIN_WIDTH 0
+ #define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
+ 
+ #if !defined(EMBREE_STATIC_LIB)
+-#cmakedefine EMBREE_STATIC_LIB
++#define EMBREE_STATIC_LIB
+ #endif
+-#cmakedefine EMBREE_API_NAMESPACE
++// #cmakedefine EMBREE_API_NAMESPACE
+ 
+ #if defined(EMBREE_API_NAMESPACE)
+-#  define RTC_NAMESPACE @EMBREE_API_NAMESPACE@
+-#  define RTC_NAMESPACE_BEGIN namespace @EMBREE_API_NAMESPACE@ {
++#  define RTC_NAMESPACE
++#  define RTC_NAMESPACE_BEGIN namespace {
+ #  define RTC_NAMESPACE_END }
+-#  define RTC_NAMESPACE_USE using namespace @EMBREE_API_NAMESPACE@;
++#  define RTC_NAMESPACE_USE using namespace;
+ #  define RTC_API_EXTERN_C
+ #  undef EMBREE_API_NAMESPACE
+ #else
+diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h
+index 1669c4af72..5979b543c9 100644
+--- a/thirdparty/embree/kernels/config.h
++++ b/thirdparty/embree/kernels/config.h
+@@ -3,27 +3,27 @@
+ 
+ #include "../include/embree4/rtcore_config.h"
+ 
+-#cmakedefine EMBREE_RAY_MASK
+-#cmakedefine EMBREE_STAT_COUNTERS
+-#cmakedefine EMBREE_BACKFACE_CULLING
+-#cmakedefine EMBREE_BACKFACE_CULLING_CURVES
+-#cmakedefine EMBREE_BACKFACE_CULLING_SPHERES
+-#cmakedefine EMBREE_FILTER_FUNCTION
+-#cmakedefine EMBREE_IGNORE_INVALID_RAYS
+-#cmakedefine EMBREE_GEOMETRY_TRIANGLE
+-#cmakedefine EMBREE_GEOMETRY_QUAD
+-#cmakedefine EMBREE_GEOMETRY_CURVE
+-#cmakedefine EMBREE_GEOMETRY_SUBDIVISION
+-#cmakedefine EMBREE_GEOMETRY_USER
+-#cmakedefine EMBREE_GEOMETRY_INSTANCE
++// #cmakedefine EMBREE_RAY_MASK
++// #cmakedefine EMBREE_STAT_COUNTERS
++// #cmakedefine EMBREE_BACKFACE_CULLING
++// #cmakedefine EMBREE_BACKFACE_CULLING_CURVES
++// #cmakedefine EMBREE_BACKFACE_CULLING_SPHERES
++#define EMBREE_FILTER_FUNCTION
++// #cmakedefine EMBREE_IGNORE_INVALID_RAYS
++#define EMBREE_GEOMETRY_TRIANGLE
++// #cmakedefine EMBREE_GEOMETRY_QUAD
++// #cmakedefine EMBREE_GEOMETRY_CURVE
++// #cmakedefine EMBREE_GEOMETRY_SUBDIVISION
++// #cmakedefine EMBREE_GEOMETRY_USER
++// #cmakedefine EMBREE_GEOMETRY_INSTANCE
+ // EMBREE_GEOMETRY_INSTANCE_ARRAY is defined in rtcore_config.h
+-#cmakedefine EMBREE_GEOMETRY_GRID
+-#cmakedefine EMBREE_GEOMETRY_POINT
+-#cmakedefine EMBREE_RAY_PACKETS
+-#cmakedefine EMBREE_COMPACT_POLYS
++// #cmakedefine EMBREE_GEOMETRY_GRID
++// #cmakedefine EMBREE_GEOMETRY_POINT
++#define EMBREE_RAY_PACKETS
++// #cmakedefine EMBREE_COMPACT_POLYS
+ 
+-#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR @EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR@
+-#cmakedefine EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE
++#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
++#define EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE
+ 
+ #if defined(EMBREE_GEOMETRY_TRIANGLE)
+   #define IF_ENABLED_TRIS(x) x