Diffstat (limited to 'servers/rendering/rendering_device.cpp')
-rw-r--r-- servers/rendering/rendering_device.cpp | 1263
1 file changed, 929 insertions(+), 334 deletions(-)
diff --git a/servers/rendering/rendering_device.cpp b/servers/rendering/rendering_device.cpp
index e322bba768..b287550986 100644
--- a/servers/rendering/rendering_device.cpp
+++ b/servers/rendering/rendering_device.cpp
@@ -36,7 +36,15 @@
#include "core/config/project_settings.h"
#include "core/io/dir_access.h"
+// TODO: Thread safety
+// - Roll back the thread-safe attribute for RID_Owner members once the read-only/atomic update scheme is implemented.
+
#define FORCE_SEPARATE_PRESENT_QUEUE 0
+#define PRINT_FRAMEBUFFER_FORMAT 0
+
+#define ERR_RENDER_THREAD_MSG String("This function (") + String(__func__) + String(") can only be called from the render thread. ")
+#define ERR_RENDER_THREAD_GUARD() ERR_FAIL_COND_MSG(render_thread_id != Thread::get_caller_id(), ERR_RENDER_THREAD_MSG);
+#define ERR_RENDER_THREAD_GUARD_V(m_ret) ERR_FAIL_COND_V_MSG(render_thread_id != Thread::get_caller_id(), (m_ret), ERR_RENDER_THREAD_MSG);
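Aside: the two guard macros above replace whole-function locking for entry points that are now render-thread-only. A minimal usage sketch with a hypothetical getter, assuming render_thread_id is assigned when the render thread is registered:

uint64_t RenderingDevice::example_get_copy_count() {
	ERR_RENDER_THREAD_GUARD_V(0); // Errors out and returns 0 on any other thread.
	return gpu_copy_count; // Only the render thread reaches this point, so no lock is needed.
}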
/**************************/
/**** HELPER FUNCTIONS ****/
@@ -138,6 +146,8 @@ RenderingDevice::ShaderSPIRVGetCacheKeyFunction RenderingDevice::get_spirv_cache
/***************************/
void RenderingDevice::_add_dependency(RID p_id, RID p_depends_on) {
+ _THREAD_SAFE_METHOD_
+
HashSet<RID> *set = dependency_map.getptr(p_depends_on);
if (set == nullptr) {
set = &dependency_map.insert(p_depends_on, HashSet<RID>())->value;
@@ -152,6 +162,8 @@ void RenderingDevice::_add_dependency(RID p_id, RID p_depends_on) {
}
void RenderingDevice::_free_dependencies(RID p_id) {
+ _THREAD_SAFE_METHOD_
+
// Direct dependencies must be freed.
HashMap<RID, HashSet<RID>>::Iterator E = dependency_map.find(p_id);
@@ -236,6 +248,35 @@ RenderingDevice::Buffer *RenderingDevice::_get_buffer_from_owner(RID p_buffer) {
return buffer;
}
+Error RenderingDevice::_buffer_initialize(Buffer *p_buffer, const uint8_t *p_data, size_t p_data_size, uint32_t p_required_align) {
+ uint32_t transfer_worker_offset;
+ TransferWorker *transfer_worker = _acquire_transfer_worker(p_data_size, p_required_align, transfer_worker_offset);
+ p_buffer->transfer_worker_index = transfer_worker->index;
+
+ {
+ MutexLock lock(transfer_worker->operations_mutex);
+ p_buffer->transfer_worker_operation = ++transfer_worker->operations_counter;
+ }
+
+ // Copy to the worker's staging buffer.
+ uint8_t *data_ptr = driver->buffer_map(transfer_worker->staging_buffer);
+ ERR_FAIL_NULL_V(data_ptr, ERR_CANT_CREATE);
+
+ memcpy(data_ptr + transfer_worker_offset, p_data, p_data_size);
+ driver->buffer_unmap(transfer_worker->staging_buffer);
+
+ // Copy from the staging buffer to the real buffer.
+ RDD::BufferCopyRegion region;
+ region.src_offset = transfer_worker_offset;
+ region.dst_offset = 0;
+ region.size = p_data_size;
+ driver->command_copy_buffer(transfer_worker->command_buffer, transfer_worker->staging_buffer, p_buffer->driver_id, region);
+
+ _release_transfer_worker(transfer_worker);
+
+ return OK;
+}
+
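Note: _buffer_initialize is the initialization path used by the buffer creation functions later in this diff. A sketch of a typical call site; size, usage, and initial_data are placeholders:

Buffer buffer;
buffer.size = size;
buffer.driver_id = driver->buffer_create(size, usage, RDD::MEMORY_ALLOCATION_TYPE_GPU);
if (initial_data != nullptr) {
	// Stages through a transfer worker rather than the frame's setup command buffer.
	_buffer_initialize(&buffer, initial_data, size);
}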
Error RenderingDevice::_insert_staging_block() {
StagingBufferBlock block;
@@ -386,32 +427,89 @@ void RenderingDevice::_staging_buffer_execute_required_action(StagingRequiredAct
}
}
-Error RenderingDevice::_buffer_update(Buffer *p_buffer, RID p_buffer_id, size_t p_offset, const uint8_t *p_data, size_t p_data_size, bool p_use_draw_queue, uint32_t p_required_align) {
+Error RenderingDevice::buffer_copy(RID p_src_buffer, RID p_dst_buffer, uint32_t p_src_offset, uint32_t p_dst_offset, uint32_t p_size) {
+ ERR_RENDER_THREAD_GUARD_V(ERR_UNAVAILABLE);
+
+ ERR_FAIL_COND_V_MSG(draw_list, ERR_INVALID_PARAMETER,
+ "Copying buffers is forbidden during creation of a draw list");
+ ERR_FAIL_COND_V_MSG(compute_list, ERR_INVALID_PARAMETER,
+ "Copying buffers is forbidden during creation of a compute list");
+
+ Buffer *src_buffer = _get_buffer_from_owner(p_src_buffer);
+ if (!src_buffer) {
+ ERR_FAIL_V_MSG(ERR_INVALID_PARAMETER, "Source buffer argument is not a valid buffer of any type.");
+ }
+
+ Buffer *dst_buffer = _get_buffer_from_owner(p_dst_buffer);
+ if (!dst_buffer) {
+ ERR_FAIL_V_MSG(ERR_INVALID_PARAMETER, "Destination buffer argument is not a valid buffer of any type.");
+ }
+
+ // Validate the copy's dimensions for both buffers.
+ ERR_FAIL_COND_V_MSG((p_size + p_src_offset) > src_buffer->size, ERR_INVALID_PARAMETER, "Size is larger than the source buffer.");
+ ERR_FAIL_COND_V_MSG((p_size + p_dst_offset) > dst_buffer->size, ERR_INVALID_PARAMETER, "Size is larger than the destination buffer.");
+
+ _check_transfer_worker_buffer(src_buffer);
+ _check_transfer_worker_buffer(dst_buffer);
+
+ // Perform the copy.
+ RDD::BufferCopyRegion region;
+ region.src_offset = p_src_offset;
+ region.dst_offset = p_dst_offset;
+ region.size = p_size;
+
+ if (_buffer_make_mutable(dst_buffer, p_dst_buffer)) {
+ // The destination buffer must be mutable to be used as a copy destination.
+ draw_graph.add_synchronization();
+ }
+
+ draw_graph.add_buffer_copy(src_buffer->driver_id, src_buffer->draw_tracker, dst_buffer->driver_id, dst_buffer->draw_tracker, region);
+
+ return OK;
+}
+
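A hypothetical call site for the reworked buffer_copy; both RIDs are placeholders, and the call must come from the render thread, outside any draw or compute list:

// Copy 256 bytes from the start of src_buffer into dst_buffer at offset 64.
Error err = rd->buffer_copy(src_buffer, dst_buffer, 0, 64, 256);
ERR_FAIL_COND_V_MSG(err != OK, err, "Buffer copy failed.");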
+Error RenderingDevice::buffer_update(RID p_buffer, uint32_t p_offset, uint32_t p_size, const void *p_data) {
+ ERR_RENDER_THREAD_GUARD_V(ERR_UNAVAILABLE);
+
+ copy_bytes_count += p_size;
+
+ ERR_FAIL_COND_V_MSG(draw_list, ERR_INVALID_PARAMETER,
+ "Updating buffers is forbidden during creation of a draw list");
+ ERR_FAIL_COND_V_MSG(compute_list, ERR_INVALID_PARAMETER,
+ "Updating buffers is forbidden during creation of a compute list");
+
+ Buffer *buffer = _get_buffer_from_owner(p_buffer);
+ ERR_FAIL_NULL_V_MSG(buffer, ERR_INVALID_PARAMETER, "Buffer argument is not a valid buffer of any type.");
+ ERR_FAIL_COND_V_MSG(p_offset + p_size > buffer->size, ERR_INVALID_PARAMETER, "Attempted to write buffer (" + itos((p_offset + p_size) - buffer->size) + " bytes) past the end.");
+
+ _check_transfer_worker_buffer(buffer);
+
// Submitting may get chunked for various reasons, so convert this to a task.
- size_t to_submit = p_data_size;
+ size_t to_submit = p_size;
size_t submit_from = 0;
thread_local LocalVector<RDG::RecordedBufferCopy> command_buffer_copies_vector;
command_buffer_copies_vector.clear();
+ const uint8_t *src_data = reinterpret_cast<const uint8_t *>(p_data);
+ const uint32_t required_align = 32;
while (to_submit > 0) {
uint32_t block_write_offset;
uint32_t block_write_amount;
StagingRequiredAction required_action;
- Error err = _staging_buffer_allocate(MIN(to_submit, staging_buffer_block_size), p_required_align, block_write_offset, block_write_amount, required_action);
+ Error err = _staging_buffer_allocate(MIN(to_submit, staging_buffer_block_size), required_align, block_write_offset, block_write_amount, required_action);
if (err) {
return err;
}
- if (p_use_draw_queue && !command_buffer_copies_vector.is_empty() && required_action == STAGING_REQUIRED_ACTION_FLUSH_AND_STALL_ALL) {
- if (_buffer_make_mutable(p_buffer, p_buffer_id)) {
+ if (!command_buffer_copies_vector.is_empty() && required_action == STAGING_REQUIRED_ACTION_FLUSH_AND_STALL_ALL) {
+ if (_buffer_make_mutable(buffer, p_buffer)) {
// The buffer must be mutable to be used as a copy destination.
draw_graph.add_synchronization();
}
- // If we're using the draw queue and the staging buffer requires flushing everything, we submit the command early and clear the current vector.
- draw_graph.add_buffer_update(p_buffer->driver_id, p_buffer->draw_tracker, command_buffer_copies_vector);
+ draw_graph.add_buffer_update(buffer->driver_id, buffer->draw_tracker, command_buffer_copies_vector);
command_buffer_copies_vector.clear();
}
@@ -422,7 +520,7 @@ Error RenderingDevice::_buffer_update(Buffer *p_buffer, RID p_buffer_id, size_t
ERR_FAIL_NULL_V(data_ptr, ERR_CANT_CREATE);
// Copy to staging buffer.
- memcpy(data_ptr + block_write_offset, p_data + submit_from, block_write_amount);
+ memcpy(data_ptr + block_write_offset, src_data + submit_from, block_write_amount);
// Unmap.
driver->buffer_unmap(staging_buffer_blocks[staging_buffer_current].driver_id);
@@ -433,14 +531,10 @@ Error RenderingDevice::_buffer_update(Buffer *p_buffer, RID p_buffer_id, size_t
region.dst_offset = submit_from + p_offset;
region.size = block_write_amount;
- if (p_use_draw_queue) {
- RDG::RecordedBufferCopy buffer_copy;
- buffer_copy.source = staging_buffer_blocks[staging_buffer_current].driver_id;
- buffer_copy.region = region;
- command_buffer_copies_vector.push_back(buffer_copy);
- } else {
- driver->command_copy_buffer(frames[frame].setup_command_buffer, staging_buffer_blocks[staging_buffer_current].driver_id, p_buffer->driver_id, region);
- }
+ RDG::RecordedBufferCopy buffer_copy;
+ buffer_copy.source = staging_buffer_blocks[staging_buffer_current].driver_id;
+ buffer_copy.region = region;
+ command_buffer_copies_vector.push_back(buffer_copy);
staging_buffer_blocks.write[staging_buffer_current].fill_amount = block_write_offset + block_write_amount;
@@ -448,77 +542,18 @@ Error RenderingDevice::_buffer_update(Buffer *p_buffer, RID p_buffer_id, size_t
submit_from += block_write_amount;
}
- if (p_use_draw_queue && !command_buffer_copies_vector.is_empty()) {
- if (_buffer_make_mutable(p_buffer, p_buffer_id)) {
+ if (!command_buffer_copies_vector.is_empty()) {
+ if (_buffer_make_mutable(buffer, p_buffer)) {
// The buffer must be mutable to be used as a copy destination.
draw_graph.add_synchronization();
}
- draw_graph.add_buffer_update(p_buffer->driver_id, p_buffer->draw_tracker, command_buffer_copies_vector);
+ draw_graph.add_buffer_update(buffer->driver_id, buffer->draw_tracker, command_buffer_copies_vector);
}
- return OK;
-}
-
-Error RenderingDevice::buffer_copy(RID p_src_buffer, RID p_dst_buffer, uint32_t p_src_offset, uint32_t p_dst_offset, uint32_t p_size) {
- _THREAD_SAFE_METHOD_
-
- ERR_FAIL_COND_V_MSG(draw_list, ERR_INVALID_PARAMETER,
- "Copying buffers is forbidden during creation of a draw list");
- ERR_FAIL_COND_V_MSG(compute_list, ERR_INVALID_PARAMETER,
- "Copying buffers is forbidden during creation of a compute list");
-
- Buffer *src_buffer = _get_buffer_from_owner(p_src_buffer);
- if (!src_buffer) {
- ERR_FAIL_V_MSG(ERR_INVALID_PARAMETER, "Source buffer argument is not a valid buffer of any type.");
- }
-
- Buffer *dst_buffer = _get_buffer_from_owner(p_dst_buffer);
- if (!dst_buffer) {
- ERR_FAIL_V_MSG(ERR_INVALID_PARAMETER, "Destination buffer argument is not a valid buffer of any type.");
- }
-
- // Validate the copy's dimensions for both buffers.
- ERR_FAIL_COND_V_MSG((p_size + p_src_offset) > src_buffer->size, ERR_INVALID_PARAMETER, "Size is larger than the source buffer.");
- ERR_FAIL_COND_V_MSG((p_size + p_dst_offset) > dst_buffer->size, ERR_INVALID_PARAMETER, "Size is larger than the destination buffer.");
-
- // Perform the copy.
- RDD::BufferCopyRegion region;
- region.src_offset = p_src_offset;
- region.dst_offset = p_dst_offset;
- region.size = p_size;
-
- if (_buffer_make_mutable(dst_buffer, p_dst_buffer)) {
- // The destination buffer must be mutable to be used as a copy destination.
- draw_graph.add_synchronization();
- }
-
- draw_graph.add_buffer_copy(src_buffer->driver_id, src_buffer->draw_tracker, dst_buffer->driver_id, dst_buffer->draw_tracker, region);
-
- return OK;
-}
-
-Error RenderingDevice::buffer_update(RID p_buffer, uint32_t p_offset, uint32_t p_size, const void *p_data) {
- _THREAD_SAFE_METHOD_
-
- copy_bytes_count += p_size;
-
- ERR_FAIL_COND_V_MSG(draw_list, ERR_INVALID_PARAMETER,
- "Updating buffers is forbidden during creation of a draw list");
- ERR_FAIL_COND_V_MSG(compute_list, ERR_INVALID_PARAMETER,
- "Updating buffers is forbidden during creation of a compute list");
-
- Buffer *buffer = _get_buffer_from_owner(p_buffer);
- if (!buffer) {
- ERR_FAIL_V_MSG(ERR_INVALID_PARAMETER, "Buffer argument is not a valid buffer of any type.");
- }
-
- ERR_FAIL_COND_V_MSG(p_offset + p_size > buffer->size, ERR_INVALID_PARAMETER,
- "Attempted to write buffer (" + itos((p_offset + p_size) - buffer->size) + " bytes) past the end.");
-
gpu_copy_count++;
- return _buffer_update(buffer, p_buffer, p_offset, (uint8_t *)p_data, p_size, true);
+ return OK;
}
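For reference, the loop above walks the payload in staging-block-sized pieces; a stripped-down model of just the chunk arithmetic, with a hypothetical block size standing in for staging_buffer_block_size (the real allocator may also return less than requested):

size_t to_submit = p_size;
size_t submit_from = 0;
const size_t block = 256 * 1024; // Placeholder for staging_buffer_block_size.
while (to_submit > 0) {
	size_t chunk = MIN(to_submit, block);
	// Stage `chunk` bytes from src_data + submit_from and record one copy region
	// targeting destination offset p_offset + submit_from.
	to_submit -= chunk;
	submit_from += chunk;
}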
String RenderingDevice::get_perf_report() const {
@@ -534,7 +569,7 @@ void RenderingDevice::update_perf_report() {
}
Error RenderingDevice::buffer_clear(RID p_buffer, uint32_t p_offset, uint32_t p_size) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(ERR_UNAVAILABLE);
ERR_FAIL_COND_V_MSG((p_size % 4) != 0, ERR_INVALID_PARAMETER,
"Size must be a multiple of four");
@@ -551,6 +586,8 @@ Error RenderingDevice::buffer_clear(RID p_buffer, uint32_t p_offset, uint32_t p_
ERR_FAIL_COND_V_MSG(p_offset + p_size > buffer->size, ERR_INVALID_PARAMETER,
"Attempted to write buffer (" + itos((p_offset + p_size) - buffer->size) + " bytes) past the end.");
+ _check_transfer_worker_buffer(buffer);
+
if (_buffer_make_mutable(buffer, p_buffer)) {
// The destination buffer must be mutable to be used as a clear destination.
draw_graph.add_synchronization();
@@ -562,7 +599,7 @@ Error RenderingDevice::buffer_clear(RID p_buffer, uint32_t p_offset, uint32_t p_
}
Vector<uint8_t> RenderingDevice::buffer_get_data(RID p_buffer, uint32_t p_offset, uint32_t p_size) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(Vector<uint8_t>());
Buffer *buffer = _get_buffer_from_owner(p_buffer);
if (!buffer) {
@@ -577,6 +614,8 @@ Vector<uint8_t> RenderingDevice::buffer_get_data(RID p_buffer, uint32_t p_offset
"Size is larger than the buffer.");
}
+ _check_transfer_worker_buffer(buffer);
+
RDD::BufferID tmp_buffer = driver->buffer_create(buffer->size, RDD::BUFFER_USAGE_TRANSFER_TO_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU);
ERR_FAIL_COND_V(!tmp_buffer, Vector<uint8_t>());
@@ -607,8 +646,6 @@ Vector<uint8_t> RenderingDevice::buffer_get_data(RID p_buffer, uint32_t p_offset
}
RID RenderingDevice::storage_buffer_create(uint32_t p_size_bytes, const Vector<uint8_t> &p_data, BitField<StorageBufferUsage> p_usage) {
- _THREAD_SAFE_METHOD_
-
ERR_FAIL_COND_V(p_data.size() && (uint32_t)p_data.size() != p_size_bytes, RID());
Buffer buffer;
@@ -625,10 +662,12 @@ RID RenderingDevice::storage_buffer_create(uint32_t p_size_bytes, const Vector<u
buffer.draw_tracker->buffer_driver_id = buffer.driver_id;
if (p_data.size()) {
- _buffer_update(&buffer, RID(), 0, p_data.ptr(), p_data.size());
+ _buffer_initialize(&buffer, p_data.ptr(), p_data.size());
}
+ _THREAD_SAFE_LOCK_
buffer_memory += buffer.size;
+ _THREAD_SAFE_UNLOCK_
RID id = storage_buffer_owner.make_rid(buffer);
#ifdef DEV_ENABLED
@@ -638,8 +677,6 @@ RID RenderingDevice::storage_buffer_create(uint32_t p_size_bytes, const Vector<u
}
RID RenderingDevice::texture_buffer_create(uint32_t p_size_elements, DataFormat p_format, const Vector<uint8_t> &p_data) {
- _THREAD_SAFE_METHOD_
-
uint32_t element_size = get_format_vertex_size(p_format);
ERR_FAIL_COND_V_MSG(element_size == 0, RID(), "Format requested is not supported for texture buffers");
uint64_t size_bytes = uint64_t(element_size) * p_size_elements;
@@ -665,10 +702,12 @@ RID RenderingDevice::texture_buffer_create(uint32_t p_size_elements, DataFormat
}
if (p_data.size()) {
- _buffer_update(&texture_buffer, RID(), 0, p_data.ptr(), p_data.size());
+ _buffer_initialize(&texture_buffer, p_data.ptr(), p_data.size());
}
+ _THREAD_SAFE_LOCK_
buffer_memory += size_bytes;
+ _THREAD_SAFE_UNLOCK_
RID id = texture_buffer_owner.make_rid(texture_buffer);
#ifdef DEV_ENABLED
@@ -682,8 +721,6 @@ RID RenderingDevice::texture_buffer_create(uint32_t p_size_elements, DataFormat
/*****************/
RID RenderingDevice::texture_create(const TextureFormat &p_format, const TextureView &p_view, const Vector<Vector<uint8_t>> &p_data) {
- _THREAD_SAFE_METHOD_
-
// Some adjustments will happen.
TextureFormat format = p_format;
@@ -740,6 +777,9 @@ RID RenderingDevice::texture_create(const TextureFormat &p_format, const Texture
"Data for slice index " + itos(i) + " (mapped to layer " + itos(i) + ") differs in size (supplied: " + itos(p_data[i].size()) + ") than what is required by the format (" + itos(required_size) + ").");
}
+ ERR_FAIL_COND_V_MSG(format.usage_bits & TEXTURE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, RID(),
+ "Textures created as depth attachments can't be initialized with data directly. Use RenderingDevice::texture_update() instead.");
+
if (!(format.usage_bits & TEXTURE_USAGE_CAN_UPDATE_BIT)) {
forced_usage_bits = TEXTURE_USAGE_CAN_UPDATE_BIT;
}
@@ -839,7 +879,7 @@ RID RenderingDevice::texture_create(const TextureFormat &p_format, const Texture
if (p_data.size()) {
for (uint32_t i = 0; i < p_format.array_layers; i++) {
- _texture_update(id, i, p_data[i], true, false);
+ _texture_initialize(id, i, p_data[i]);
}
if (texture.draw_tracker != nullptr) {
@@ -852,8 +892,6 @@ RID RenderingDevice::texture_create(const TextureFormat &p_format, const Texture
}
RID RenderingDevice::texture_create_shared(const TextureView &p_view, RID p_with_texture) {
- _THREAD_SAFE_METHOD_
-
Texture *src_texture = texture_owner.get_or_null(p_with_texture);
ERR_FAIL_NULL_V(src_texture, RID());
@@ -939,7 +977,6 @@ RID RenderingDevice::texture_create_shared(const TextureView &p_view, RID p_with
}
RID RenderingDevice::texture_create_from_extension(TextureType p_type, DataFormat p_format, TextureSamples p_samples, BitField<RenderingDevice::TextureUsageBits> p_usage, uint64_t p_image, uint64_t p_width, uint64_t p_height, uint64_t p_depth, uint64_t p_layers) {
- _THREAD_SAFE_METHOD_
// This method creates a texture object using a VkImage created by an extension, module or other external source (OpenXR uses this).
Texture texture;
@@ -982,8 +1019,6 @@ RID RenderingDevice::texture_create_from_extension(TextureType p_type, DataForma
}
RID RenderingDevice::texture_create_shared_from_slice(const TextureView &p_view, RID p_with_texture, uint32_t p_layer, uint32_t p_mipmap, uint32_t p_mipmaps, TextureSliceType p_slice_type, uint32_t p_layers) {
- _THREAD_SAFE_METHOD_
-
Texture *src_texture = texture_owner.get_or_null(p_with_texture);
ERR_FAIL_NULL_V(src_texture, RID());
@@ -1124,10 +1159,6 @@ RID RenderingDevice::texture_create_shared_from_slice(const TextureView &p_view,
return id;
}
-Error RenderingDevice::texture_update(RID p_texture, uint32_t p_layer, const Vector<uint8_t> &p_data) {
- return _texture_update(p_texture, p_layer, p_data, false, true);
-}
-
static _ALWAYS_INLINE_ void _copy_region(uint8_t const *__restrict p_src, uint8_t *__restrict p_dst, uint32_t p_src_x, uint32_t p_src_y, uint32_t p_src_w, uint32_t p_src_h, uint32_t p_src_full_w, uint32_t p_dst_pitch, uint32_t p_unit_size) {
uint32_t src_offset = (p_src_y * p_src_full_w + p_src_x) * p_unit_size;
uint32_t dst_offset = 0;
@@ -1144,11 +1175,184 @@ static _ALWAYS_INLINE_ void _copy_region(uint8_t const *__restrict p_src, uint8_
}
}
-Error RenderingDevice::_texture_update(RID p_texture, uint32_t p_layer, const Vector<uint8_t> &p_data, bool p_use_setup_queue, bool p_validate_can_update) {
- _THREAD_SAFE_METHOD_
+static _ALWAYS_INLINE_ void _copy_region_block_or_regular(const uint8_t *p_read_ptr, uint8_t *p_write_ptr, uint32_t p_x, uint32_t p_y, uint32_t p_width, uint32_t p_region_w, uint32_t p_region_h, uint32_t p_block_w, uint32_t p_block_h, uint32_t p_dst_pitch, uint32_t p_pixel_size, uint32_t p_block_size) {
+ if (p_block_w != 1 || p_block_h != 1) {
+ // Block format.
+ uint32_t xb = p_x / p_block_w;
+ uint32_t yb = p_y / p_block_h;
+ uint32_t wb = p_width / p_block_w;
+ uint32_t region_wb = p_region_w / p_block_w;
+ uint32_t region_hb = p_region_h / p_block_h;
+ _copy_region(p_read_ptr, p_write_ptr, xb, yb, region_wb, region_hb, wb, p_dst_pitch, p_block_size);
+ } else {
+ // Regular format.
+ _copy_region(p_read_ptr, p_write_ptr, p_x, p_y, p_region_w, p_region_h, p_width, p_dst_pitch, p_pixel_size);
+ }
+}
+
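A worked example of the block branch, assuming a BC1-style format with 4x4 blocks of 8 bytes each: a 64x64 region at (x=16, y=32) of a 128-pixel-wide image becomes xb = 4, yb = 8, wb = 32 blocks per source row, and a 16x16 block region, so _copy_region moves 16 rows of 16 * 8 = 128 bytes each.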
+uint32_t RenderingDevice::_texture_layer_count(Texture *p_texture) const {
+ switch (p_texture->type) {
+ case TEXTURE_TYPE_CUBE:
+ case TEXTURE_TYPE_CUBE_ARRAY:
+ return p_texture->layers * 6;
+ default:
+ return p_texture->layers;
+ }
+}
+
+uint32_t RenderingDevice::_texture_alignment(Texture *p_texture) const {
+ uint32_t alignment = get_compressed_image_format_block_byte_size(p_texture->format);
+ if (alignment == 1) {
+ alignment = get_image_format_pixel_size(p_texture->format);
+ }
+
+ return STEPIFY(alignment, driver->api_trait_get(RDD::API_TRAIT_TEXTURE_TRANSFER_ALIGNMENT));
+}
+
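Worked example: an uncompressed RGBA8 texture has a block byte size of 1, so the alignment falls back to its 4-byte pixel size; if the driver reports a transfer alignment of, say, 16, STEPIFY(4, 16) rounds the result up to 16. BC-compressed formats start from their 8- or 16-byte block size instead.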
+Error RenderingDevice::_texture_initialize(RID p_texture, uint32_t p_layer, const Vector<uint8_t> &p_data) {
+ Texture *texture = texture_owner.get_or_null(p_texture);
+ ERR_FAIL_NULL_V(texture, ERR_INVALID_PARAMETER);
+
+ if (texture->owner != RID()) {
+ p_texture = texture->owner;
+ texture = texture_owner.get_or_null(texture->owner);
+ ERR_FAIL_NULL_V(texture, ERR_BUG); // This is a bug.
+ }
+
+ uint32_t layer_count = _texture_layer_count(texture);
+ ERR_FAIL_COND_V(p_layer >= layer_count, ERR_INVALID_PARAMETER);
+
+ uint32_t width, height;
+ uint32_t tight_mip_size = get_image_format_required_size(texture->format, texture->width, texture->height, texture->depth, texture->mipmaps, &width, &height);
+ uint32_t required_size = tight_mip_size;
+ uint32_t required_align = _texture_alignment(texture);
+
+ ERR_FAIL_COND_V_MSG(required_size != (uint32_t)p_data.size(), ERR_INVALID_PARAMETER,
+ "Required size for texture update (" + itos(required_size) + ") does not match data supplied size (" + itos(p_data.size()) + ").");
+
+ uint32_t block_w, block_h;
+ get_compressed_image_format_block_dimensions(texture->format, block_w, block_h);
+
+ uint32_t pixel_size = get_image_format_pixel_size(texture->format);
+ uint32_t pixel_rshift = get_compressed_image_format_pixel_rshift(texture->format);
+ uint32_t block_size = get_compressed_image_format_block_byte_size(texture->format);
+
+ // The algorithm operates in two passes: one to figure out the total size the staging buffer will need to allocate, and another where the copy is actually performed.
+ uint32_t staging_worker_offset = 0;
+ uint32_t staging_local_offset = 0;
+ TransferWorker *transfer_worker = nullptr;
+ const uint8_t *read_ptr = p_data.ptr();
+ uint8_t *write_ptr = nullptr;
+ for (uint32_t pass = 0; pass < 2; pass++) {
+ const bool copy_pass = (pass == 1);
+ if (copy_pass) {
+ transfer_worker = _acquire_transfer_worker(staging_local_offset, required_align, staging_worker_offset);
+ texture->transfer_worker_index = transfer_worker->index;
+
+ {
+ MutexLock lock(transfer_worker->operations_mutex);
+ texture->transfer_worker_operation = ++transfer_worker->operations_counter;
+ }
+
+ staging_local_offset = 0;
+
+ write_ptr = driver->buffer_map(transfer_worker->staging_buffer);
+ ERR_FAIL_NULL_V(write_ptr, ERR_CANT_CREATE);
+
+ write_ptr += staging_worker_offset;
+
+ if (driver->api_trait_get(RDD::API_TRAIT_HONORS_PIPELINE_BARRIERS)) {
+ // Transition the texture to the optimal layout.
+ RDD::TextureBarrier tb;
+ tb.texture = texture->driver_id;
+ tb.dst_access = RDD::BARRIER_ACCESS_COPY_WRITE_BIT;
+ tb.prev_layout = RDD::TEXTURE_LAYOUT_UNDEFINED;
+ tb.next_layout = RDD::TEXTURE_LAYOUT_COPY_DST_OPTIMAL;
+ tb.subresources.aspect = texture->barrier_aspect_flags;
+ tb.subresources.mipmap_count = texture->mipmaps;
+ tb.subresources.base_layer = p_layer;
+ tb.subresources.layer_count = 1;
+ driver->command_pipeline_barrier(transfer_worker->command_buffer, RDD::PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, RDD::PIPELINE_STAGE_COPY_BIT, {}, {}, tb);
+ }
+ }
+
+ uint32_t mipmap_offset = 0;
+ uint32_t logic_width = texture->width;
+ uint32_t logic_height = texture->height;
+ for (uint32_t mm_i = 0; mm_i < texture->mipmaps; mm_i++) {
+ uint32_t depth = 0;
+ uint32_t image_total = get_image_format_required_size(texture->format, texture->width, texture->height, texture->depth, mm_i + 1, &width, &height, &depth);
+
+ const uint8_t *read_ptr_mipmap = read_ptr + mipmap_offset;
+ tight_mip_size = image_total - mipmap_offset;
+
+ for (uint32_t z = 0; z < depth; z++) {
+ if (required_align > 0) {
+ uint32_t align_offset = staging_local_offset % required_align;
+ if (align_offset != 0) {
+ staging_local_offset += required_align - align_offset;
+ }
+ }
+
+ uint32_t pitch = (width * pixel_size * block_w) >> pixel_rshift;
+ uint32_t pitch_step = driver->api_trait_get(RDD::API_TRAIT_TEXTURE_DATA_ROW_PITCH_STEP);
+ pitch = STEPIFY(pitch, pitch_step);
+ uint32_t to_allocate = pitch * height;
+ to_allocate >>= pixel_rshift;
+
+ if (copy_pass) {
+ const uint8_t *read_ptr_mipmap_layer = read_ptr_mipmap + (tight_mip_size / depth) * z;
+ _copy_region_block_or_regular(read_ptr_mipmap_layer, write_ptr, 0, 0, width, width, height, block_w, block_h, pitch, pixel_size, block_size);
+ write_ptr += to_allocate;
- ERR_FAIL_COND_V_MSG((draw_list || compute_list) && !p_use_setup_queue, ERR_INVALID_PARAMETER,
- "Updating textures is forbidden during creation of a draw or compute list");
+ RDD::BufferTextureCopyRegion copy_region;
+ copy_region.buffer_offset = staging_worker_offset + staging_local_offset;
+ copy_region.texture_subresources.aspect = texture->read_aspect_flags;
+ copy_region.texture_subresources.mipmap = mm_i;
+ copy_region.texture_subresources.base_layer = p_layer;
+ copy_region.texture_subresources.layer_count = 1;
+ copy_region.texture_offset = Vector3i(0, 0, z);
+ copy_region.texture_region_size = Vector3i(logic_width, logic_height, 1);
+ driver->command_copy_buffer_to_texture(transfer_worker->command_buffer, transfer_worker->staging_buffer, texture->driver_id, RDD::TEXTURE_LAYOUT_COPY_DST_OPTIMAL, copy_region);
+ }
+
+ staging_local_offset += to_allocate;
+ }
+
+ mipmap_offset = image_total;
+ logic_width = MAX(1u, logic_width >> 1);
+ logic_height = MAX(1u, logic_height >> 1);
+ }
+
+ if (copy_pass) {
+ driver->buffer_unmap(transfer_worker->staging_buffer);
+
+ // If the texture does not have a tracker, it means it must be transitioned to the sampling state.
+ if (texture->draw_tracker == nullptr && driver->api_trait_get(RDD::API_TRAIT_HONORS_PIPELINE_BARRIERS)) {
+ RDD::TextureBarrier tb;
+ tb.texture = texture->driver_id;
+ tb.src_access = RDD::BARRIER_ACCESS_COPY_WRITE_BIT;
+ tb.prev_layout = RDD::TEXTURE_LAYOUT_COPY_DST_OPTIMAL;
+ tb.next_layout = RDD::TEXTURE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+ tb.subresources.aspect = texture->barrier_aspect_flags;
+ tb.subresources.mipmap_count = texture->mipmaps;
+ tb.subresources.base_layer = p_layer;
+ tb.subresources.layer_count = 1;
+
+ driver->command_pipeline_barrier(transfer_worker->command_buffer, RDD::PIPELINE_STAGE_COPY_BIT, RDD::PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, {}, {}, tb);
+ }
+
+ _release_transfer_worker(transfer_worker);
+ }
+ }
+
+ return OK;
+}
+
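The two passes above (measure, then copy) let the worker make a single, exactly-sized staging allocation. A self-contained model of the same shape, with the GPU copy recording elided:

#include <cstdint>
#include <cstring>
#include <vector>

// Pass 0 measures the aligned total; pass 1 allocates once and retraces the
// exact same offsets while copying (mirrors _texture_initialize's structure).
std::vector<uint8_t> pack_slices(const std::vector<std::vector<uint8_t>> &slices, uint32_t align) {
	std::vector<uint8_t> staging;
	uint32_t cursor = 0;
	for (uint32_t pass = 0; pass < 2; pass++) {
		const bool copy_pass = (pass == 1);
		if (copy_pass) {
			staging.resize(cursor); // One allocation, sized by the measuring pass.
			cursor = 0; // Restart; this pass revisits identical offsets.
		}
		for (const std::vector<uint8_t> &slice : slices) {
			cursor = (cursor + align - 1) / align * align; // Same role as STEPIFY.
			if (copy_pass) {
				memcpy(staging.data() + cursor, slice.data(), slice.size());
			}
			cursor += uint32_t(slice.size());
		}
	}
	return staging;
}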
+Error RenderingDevice::texture_update(RID p_texture, uint32_t p_layer, const Vector<uint8_t> &p_data) {
+ ERR_RENDER_THREAD_GUARD_V(ERR_UNAVAILABLE);
+
+ ERR_FAIL_COND_V_MSG(draw_list || compute_list, ERR_INVALID_PARAMETER, "Updating textures is forbidden during creation of a draw or compute list");
Texture *texture = texture_owner.get_or_null(p_texture);
ERR_FAIL_NULL_V(texture, ERR_INVALID_PARAMETER);
@@ -1162,47 +1366,37 @@ Error RenderingDevice::_texture_update(RID p_texture, uint32_t p_layer, const Ve
ERR_FAIL_COND_V_MSG(texture->bound, ERR_CANT_ACQUIRE_RESOURCE,
"Texture can't be updated while a draw list that uses it as part of a framebuffer is being created. Ensure the draw list is finalized (and that the color/depth texture using it is not set to `RenderingDevice.FINAL_ACTION_CONTINUE`) to update this texture.");
- ERR_FAIL_COND_V_MSG(p_validate_can_update && !(texture->usage_flags & TEXTURE_USAGE_CAN_UPDATE_BIT), ERR_INVALID_PARAMETER,
- "Texture requires the `RenderingDevice.TEXTURE_USAGE_CAN_UPDATE_BIT` to be set to be updatable.");
+ ERR_FAIL_COND_V_MSG(!(texture->usage_flags & TEXTURE_USAGE_CAN_UPDATE_BIT), ERR_INVALID_PARAMETER, "Texture requires the `RenderingDevice.TEXTURE_USAGE_CAN_UPDATE_BIT` to be set to be updatable.");
- ERR_FAIL_COND_V(p_layer >= texture->layers, ERR_INVALID_PARAMETER);
+ uint32_t layer_count = _texture_layer_count(texture);
+ ERR_FAIL_COND_V(p_layer >= layer_count, ERR_INVALID_PARAMETER);
uint32_t width, height;
uint32_t tight_mip_size = get_image_format_required_size(texture->format, texture->width, texture->height, texture->depth, texture->mipmaps, &width, &height);
uint32_t required_size = tight_mip_size;
- uint32_t required_align = get_compressed_image_format_block_byte_size(texture->format);
- if (required_align == 1) {
- required_align = get_image_format_pixel_size(texture->format);
- }
- required_align = STEPIFY(required_align, driver->api_trait_get(RDD::API_TRAIT_TEXTURE_TRANSFER_ALIGNMENT));
+ uint32_t required_align = _texture_alignment(texture);
ERR_FAIL_COND_V_MSG(required_size != (uint32_t)p_data.size(), ERR_INVALID_PARAMETER,
"Required size for texture update (" + itos(required_size) + ") does not match data supplied size (" + itos(p_data.size()) + ").");
+ _check_transfer_worker_texture(texture);
+
+ uint32_t block_w, block_h;
+ get_compressed_image_format_block_dimensions(texture->format, block_w, block_h);
+
+ uint32_t pixel_size = get_image_format_pixel_size(texture->format);
+ uint32_t pixel_rshift = get_compressed_image_format_pixel_rshift(texture->format);
+ uint32_t block_size = get_compressed_image_format_block_byte_size(texture->format);
+
uint32_t region_size = texture_upload_region_size_px;
- const uint8_t *r = p_data.ptr();
+ const uint8_t *read_ptr = p_data.ptr();
thread_local LocalVector<RDG::RecordedBufferToTextureCopy> command_buffer_to_texture_copies_vector;
command_buffer_to_texture_copies_vector.clear();
- if (p_use_setup_queue && driver->api_trait_get(RDD::API_TRAIT_HONORS_PIPELINE_BARRIERS)) {
- // When using the setup queue directly, we transition the texture to the optimal layout.
- RDD::TextureBarrier tb;
- tb.texture = texture->driver_id;
- tb.dst_access = RDD::BARRIER_ACCESS_COPY_WRITE_BIT;
- tb.prev_layout = RDD::TEXTURE_LAYOUT_UNDEFINED;
- tb.next_layout = RDD::TEXTURE_LAYOUT_COPY_DST_OPTIMAL;
- tb.subresources.aspect = texture->barrier_aspect_flags;
- tb.subresources.mipmap_count = texture->mipmaps;
- tb.subresources.base_layer = p_layer;
- tb.subresources.layer_count = 1;
-
- driver->command_pipeline_barrier(frames[frame].setup_command_buffer, RDD::PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, RDD::PIPELINE_STAGE_COPY_BIT, {}, {}, tb);
- } else if (!p_use_setup_queue) {
- // Indicate the texture will get modified for the shared texture fallback.
- _texture_update_shared_fallback(p_texture, texture, true);
- }
+ // Indicate the texture will get modified for the shared texture fallback.
+ _texture_update_shared_fallback(p_texture, texture, true);
uint32_t mipmap_offset = 0;
@@ -1213,13 +1407,11 @@ Error RenderingDevice::_texture_update(RID p_texture, uint32_t p_layer, const Ve
uint32_t depth = 0;
uint32_t image_total = get_image_format_required_size(texture->format, texture->width, texture->height, texture->depth, mm_i + 1, &width, &height, &depth);
- const uint8_t *read_ptr_mipmap = r + mipmap_offset;
+ const uint8_t *read_ptr_mipmap = read_ptr + mipmap_offset;
tight_mip_size = image_total - mipmap_offset;
- for (uint32_t z = 0; z < depth; z++) { // For 3D textures, depth may be > 0.
-
- const uint8_t *read_ptr = read_ptr_mipmap + (tight_mip_size / depth) * z;
-
+ for (uint32_t z = 0; z < depth; z++) {
+ const uint8_t *read_ptr_mipmap_layer = read_ptr_mipmap + (tight_mip_size / depth) * z;
for (uint32_t y = 0; y < height; y += region_size) {
for (uint32_t x = 0; x < width; x += region_size) {
uint32_t region_w = MIN(region_size, width - x);
@@ -1228,11 +1420,7 @@ Error RenderingDevice::_texture_update(RID p_texture, uint32_t p_layer, const Ve
uint32_t region_logic_w = MIN(region_size, logic_width - x);
uint32_t region_logic_h = MIN(region_size, logic_height - y);
- uint32_t pixel_size = get_image_format_pixel_size(texture->format);
- uint32_t block_w = 0, block_h = 0;
- get_compressed_image_format_block_dimensions(texture->format, block_w, block_h);
-
- uint32_t region_pitch = (region_w * pixel_size * block_w) >> get_compressed_image_format_pixel_rshift(texture->format);
+ uint32_t region_pitch = (region_w * pixel_size * block_w) >> pixel_rshift;
uint32_t pitch_step = driver->api_trait_get(RDD::API_TRAIT_TEXTURE_DATA_ROW_PITCH_STEP);
region_pitch = STEPIFY(region_pitch, pitch_step);
uint32_t to_allocate = region_pitch * region_h;
@@ -1241,13 +1429,13 @@ Error RenderingDevice::_texture_update(RID p_texture, uint32_t p_layer, const Ve
Error err = _staging_buffer_allocate(to_allocate, required_align, alloc_offset, alloc_size, required_action, false);
ERR_FAIL_COND_V(err, ERR_CANT_CREATE);
- if (!p_use_setup_queue && !command_buffer_to_texture_copies_vector.is_empty() && required_action == STAGING_REQUIRED_ACTION_FLUSH_AND_STALL_ALL) {
+ if (!command_buffer_to_texture_copies_vector.is_empty() && required_action == STAGING_REQUIRED_ACTION_FLUSH_AND_STALL_ALL) {
if (_texture_make_mutable(texture, p_texture)) {
// The texture must be mutable to be used as a copy destination.
draw_graph.add_synchronization();
}
- // If we're using the draw queue and the staging buffer requires flushing everything, we submit the command early and clear the current vector.
+ // If the staging buffer requires flushing everything, we submit the command early and clear the current vector.
draw_graph.add_texture_update(texture->driver_id, texture->draw_tracker, command_buffer_to_texture_copies_vector);
command_buffer_to_texture_copies_vector.clear();
}
@@ -1266,24 +1454,7 @@ Error RenderingDevice::_texture_update(RID p_texture, uint32_t p_layer, const Ve
ERR_FAIL_COND_V(region_w % block_w, ERR_BUG);
ERR_FAIL_COND_V(region_h % block_h, ERR_BUG);
- if (block_w != 1 || block_h != 1) {
- // Compressed image (blocks).
- // Must copy a block region.
-
- uint32_t block_size = get_compressed_image_format_block_byte_size(texture->format);
- // Re-create current variables in blocky format.
- uint32_t xb = x / block_w;
- uint32_t yb = y / block_h;
- uint32_t wb = width / block_w;
- //uint32_t hb = height / block_h;
- uint32_t region_wb = region_w / block_w;
- uint32_t region_hb = region_h / block_h;
- _copy_region(read_ptr, write_ptr, xb, yb, region_wb, region_hb, wb, region_pitch, block_size);
- } else {
- // Regular image (pixels).
- // Must copy a pixel region.
- _copy_region(read_ptr, write_ptr, x, y, region_w, region_h, width, region_pitch, pixel_size);
- }
+ _copy_region_block_or_regular(read_ptr_mipmap_layer, write_ptr, x, y, width, region_w, region_h, block_w, block_h, region_pitch, pixel_size, block_size);
{ // Unmap.
driver->buffer_unmap(staging_buffer_blocks[staging_buffer_current].driver_id);
@@ -1298,14 +1469,10 @@ Error RenderingDevice::_texture_update(RID p_texture, uint32_t p_layer, const Ve
copy_region.texture_offset = Vector3i(x, y, z);
copy_region.texture_region_size = Vector3i(region_logic_w, region_logic_h, 1);
- if (p_use_setup_queue) {
- driver->command_copy_buffer_to_texture(frames[frame].setup_command_buffer, staging_buffer_blocks[staging_buffer_current].driver_id, texture->driver_id, RDD::TEXTURE_LAYOUT_COPY_DST_OPTIMAL, copy_region);
- } else {
- RDG::RecordedBufferToTextureCopy buffer_to_texture_copy;
- buffer_to_texture_copy.from_buffer = staging_buffer_blocks[staging_buffer_current].driver_id;
- buffer_to_texture_copy.region = copy_region;
- command_buffer_to_texture_copies_vector.push_back(buffer_to_texture_copy);
- }
+ RDG::RecordedBufferToTextureCopy buffer_to_texture_copy;
+ buffer_to_texture_copy.from_buffer = staging_buffer_blocks[staging_buffer_current].driver_id;
+ buffer_to_texture_copy.region = copy_region;
+ command_buffer_to_texture_copies_vector.push_back(buffer_to_texture_copy);
staging_buffer_blocks.write[staging_buffer_current].fill_amount = alloc_offset + alloc_size;
}
@@ -1317,27 +1484,13 @@ Error RenderingDevice::_texture_update(RID p_texture, uint32_t p_layer, const Ve
logic_height = MAX(1u, logic_height >> 1);
}
- if (p_use_setup_queue && (texture->draw_tracker == nullptr) && driver->api_trait_get(RDD::API_TRAIT_HONORS_PIPELINE_BARRIERS)) {
- // If the texture does not have a tracker, it means it must be transitioned to the sampling state.
- RDD::TextureBarrier tb;
- tb.texture = texture->driver_id;
- tb.src_access = RDD::BARRIER_ACCESS_COPY_WRITE_BIT;
- tb.prev_layout = RDD::TEXTURE_LAYOUT_COPY_DST_OPTIMAL;
- tb.next_layout = RDD::TEXTURE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
- tb.subresources.aspect = texture->barrier_aspect_flags;
- tb.subresources.mipmap_count = texture->mipmaps;
- tb.subresources.base_layer = p_layer;
- tb.subresources.layer_count = 1;
- driver->command_pipeline_barrier(frames[frame].setup_command_buffer, RDD::PIPELINE_STAGE_COPY_BIT, RDD::PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, {}, {}, tb);
- } else if (!p_use_setup_queue && !command_buffer_to_texture_copies_vector.is_empty()) {
- if (_texture_make_mutable(texture, p_texture)) {
- // The texture must be mutable to be used as a copy destination.
- draw_graph.add_synchronization();
- }
-
- draw_graph.add_texture_update(texture->driver_id, texture->draw_tracker, command_buffer_to_texture_copies_vector);
+ if (_texture_make_mutable(texture, p_texture)) {
+ // The texture must be mutable to be used as a copy destination.
+ draw_graph.add_synchronization();
}
+ draw_graph.add_texture_update(texture->driver_id, texture->draw_tracker, command_buffer_to_texture_copies_vector);
+
return OK;
}
@@ -1587,7 +1740,7 @@ Vector<uint8_t> RenderingDevice::_texture_get_data(Texture *tex, uint32_t p_laye
}
Vector<uint8_t> RenderingDevice::texture_get_data(RID p_texture, uint32_t p_layer) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(Vector<uint8_t>());
Texture *tex = texture_owner.get_or_null(p_texture);
ERR_FAIL_NULL_V(tex, Vector<uint8_t>());
@@ -1599,7 +1752,9 @@ Vector<uint8_t> RenderingDevice::texture_get_data(RID p_texture, uint32_t p_laye
ERR_FAIL_COND_V(p_layer >= tex->layers, Vector<uint8_t>());
- if ((tex->usage_flags & TEXTURE_USAGE_CPU_READ_BIT)) {
+ _check_transfer_worker_texture(tex);
+
+ if (tex->usage_flags & TEXTURE_USAGE_CPU_READ_BIT) {
// Does not need anything fancy, map and read.
return _texture_get_data(tex, p_layer);
} else {
@@ -1701,7 +1856,7 @@ Vector<uint8_t> RenderingDevice::texture_get_data(RID p_texture, uint32_t p_laye
}
bool RenderingDevice::texture_is_shared(RID p_texture) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(false);
Texture *tex = texture_owner.get_or_null(p_texture);
ERR_FAIL_NULL_V(tex, false);
@@ -1709,11 +1864,13 @@ bool RenderingDevice::texture_is_shared(RID p_texture) {
}
bool RenderingDevice::texture_is_valid(RID p_texture) {
+ ERR_RENDER_THREAD_GUARD_V(false);
+
return texture_owner.owns(p_texture);
}
RD::TextureFormat RenderingDevice::texture_get_format(RID p_texture) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(TextureFormat());
Texture *tex = texture_owner.get_or_null(p_texture);
ERR_FAIL_NULL_V(tex, TextureFormat());
@@ -1736,7 +1893,7 @@ RD::TextureFormat RenderingDevice::texture_get_format(RID p_texture) {
}
Size2i RenderingDevice::texture_size(RID p_texture) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(Size2i());
Texture *tex = texture_owner.get_or_null(p_texture);
ERR_FAIL_NULL_V(tex, Size2i());
@@ -1750,7 +1907,7 @@ uint64_t RenderingDevice::texture_get_native_handle(RID p_texture) {
#endif
Error RenderingDevice::texture_copy(RID p_from_texture, RID p_to_texture, const Vector3 &p_from, const Vector3 &p_to, const Vector3 &p_size, uint32_t p_src_mipmap, uint32_t p_dst_mipmap, uint32_t p_src_layer, uint32_t p_dst_layer) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(ERR_UNAVAILABLE);
Texture *src_tex = texture_owner.get_or_null(p_from_texture);
ERR_FAIL_NULL_V(src_tex, ERR_INVALID_PARAMETER);
@@ -1789,6 +1946,9 @@ Error RenderingDevice::texture_copy(RID p_from_texture, RID p_to_texture, const
ERR_FAIL_COND_V_MSG(src_tex->read_aspect_flags != dst_tex->read_aspect_flags, ERR_INVALID_PARAMETER,
"Source and destination texture must be of the same type (color or depth).");
+ _check_transfer_worker_texture(src_tex);
+ _check_transfer_worker_texture(dst_tex);
+
RDD::TextureCopyRegion copy_region;
copy_region.src_subresources.aspect = src_tex->read_aspect_flags;
copy_region.src_subresources.mipmap = p_src_mipmap;
@@ -1820,7 +1980,7 @@ Error RenderingDevice::texture_copy(RID p_from_texture, RID p_to_texture, const
}
Error RenderingDevice::texture_resolve_multisample(RID p_from_texture, RID p_to_texture) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(ERR_UNAVAILABLE);
Texture *src_tex = texture_owner.get_or_null(p_from_texture);
ERR_FAIL_NULL_V(src_tex, ERR_INVALID_PARAMETER);
@@ -1853,6 +2013,9 @@ Error RenderingDevice::texture_resolve_multisample(RID p_from_texture, RID p_to_
// Indicate the texture will get modified for the shared texture fallback.
_texture_update_shared_fallback(p_to_texture, dst_tex, true);
+ _check_transfer_worker_texture(src_tex);
+ _check_transfer_worker_texture(dst_tex);
+
// The textures must be mutable to be used in the resolve operation.
bool src_made_mutable = _texture_make_mutable(src_tex, p_from_texture);
bool dst_made_mutable = _texture_make_mutable(dst_tex, p_to_texture);
@@ -1866,7 +2029,7 @@ Error RenderingDevice::texture_resolve_multisample(RID p_from_texture, RID p_to_
}
Error RenderingDevice::texture_clear(RID p_texture, const Color &p_color, uint32_t p_base_mipmap, uint32_t p_mipmaps, uint32_t p_base_layer, uint32_t p_layers) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(ERR_UNAVAILABLE);
Texture *src_tex = texture_owner.get_or_null(p_texture);
ERR_FAIL_NULL_V(src_tex, ERR_INVALID_PARAMETER);
@@ -1883,6 +2046,8 @@ Error RenderingDevice::texture_clear(RID p_texture, const Color &p_color, uint32
ERR_FAIL_COND_V(p_base_mipmap + p_mipmaps > src_tex->mipmaps, ERR_INVALID_PARAMETER);
ERR_FAIL_COND_V(p_base_layer + p_layers > src_tex->layers, ERR_INVALID_PARAMETER);
+ _check_transfer_worker_texture(src_tex);
+
RDD::TextureSubresourceRange range;
range.aspect = src_tex->read_aspect_flags;
range.base_mipmap = src_tex->base_mipmap + p_base_mipmap;
@@ -1906,8 +2071,6 @@ Error RenderingDevice::texture_clear(RID p_texture, const Color &p_color, uint32
bool RenderingDevice::texture_is_format_supported_for_usage(DataFormat p_format, BitField<RenderingDevice::TextureUsageBits> p_usage) const {
ERR_FAIL_INDEX_V(p_format, DATA_FORMAT_MAX, false);
- _THREAD_SAFE_METHOD_
-
bool cpu_readable = (p_usage & RDD::TEXTURE_USAGE_CPU_READ_BIT);
BitField<TextureUsageBits> supported = driver->texture_get_usages_supported_by_format(p_format, cpu_readable);
bool any_unsupported = (((int64_t)supported) | ((int64_t)p_usage)) != ((int64_t)supported);
@@ -2081,6 +2244,8 @@ RDD::RenderPassID RenderingDevice::_render_pass_create(const Vector<AttachmentFo
}
for (int j = 0; j < pass->resolve_attachments.size(); j++) {
int32_t attachment = pass->resolve_attachments[j];
+ attachments[attachment].load_op = RDD::ATTACHMENT_LOAD_OP_DONT_CARE;
+
RDD::AttachmentReference reference;
if (attachment == ATTACHMENT_UNUSED) {
reference.attachment = RDD::AttachmentReference::UNUSED;
@@ -2212,10 +2377,20 @@ RenderingDevice::FramebufferFormatID RenderingDevice::framebuffer_format_create_
fb_format.pass_samples = samples;
fb_format.view_count = p_view_count;
framebuffer_formats[id] = fb_format;
+
+#if PRINT_FRAMEBUFFER_FORMAT
+ print_line("FRAMEBUFFER FORMAT:", id, "ATTACHMENTS:", p_attachments.size(), "PASSES:", p_passes.size());
+ for (RD::AttachmentFormat attachment : p_attachments) {
+ print_line("FORMAT:", attachment.format, "SAMPLES:", attachment.samples, "USAGE FLAGS:", attachment.usage_flags);
+ }
+#endif
+
return id;
}
RenderingDevice::FramebufferFormatID RenderingDevice::framebuffer_format_create_empty(TextureSamples p_samples) {
+ _THREAD_SAFE_METHOD_
+
FramebufferFormatKey key;
key.passes.push_back(FramebufferPass());
@@ -2240,10 +2415,17 @@ RenderingDevice::FramebufferFormatID RenderingDevice::framebuffer_format_create_
fb_format.render_pass = render_pass;
fb_format.pass_samples.push_back(p_samples);
framebuffer_formats[id] = fb_format;
+
+#if PRINT_FRAMEBUFFER_FORMAT
+ print_line("FRAMEBUFFER FORMAT:", id, "ATTACHMENTS: EMPTY");
+#endif
+
return id;
}
RenderingDevice::TextureSamples RenderingDevice::framebuffer_format_get_texture_samples(FramebufferFormatID p_format, uint32_t p_pass) {
+ _THREAD_SAFE_METHOD_
+
HashMap<FramebufferFormatID, FramebufferFormat>::Iterator E = framebuffer_formats.find(p_format);
ERR_FAIL_COND_V(!E, TEXTURE_SAMPLES_1);
ERR_FAIL_COND_V(p_pass >= uint32_t(E->value.pass_samples.size()), TEXTURE_SAMPLES_1);
@@ -2253,6 +2435,7 @@ RenderingDevice::TextureSamples RenderingDevice::framebuffer_format_get_texture_
RID RenderingDevice::framebuffer_create_empty(const Size2i &p_size, TextureSamples p_samples, FramebufferFormatID p_format_check) {
_THREAD_SAFE_METHOD_
+
Framebuffer framebuffer;
framebuffer.format_id = framebuffer_format_create_empty(p_samples);
ERR_FAIL_COND_V(p_format_check != INVALID_FORMAT_ID && framebuffer.format_id != p_format_check, RID());
@@ -2276,6 +2459,10 @@ RID RenderingDevice::framebuffer_create(const Vector<RID> &p_texture_attachments
ERR_FAIL_COND_V_MSG(texture && texture->layers != p_view_count, RID(), "Layers of the texture don't match the view count for this framebuffer");
+ if (texture != nullptr) {
+ _check_transfer_worker_texture(texture);
+ }
+
if (texture && texture->usage_flags & TEXTURE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
pass.depth_attachment = i;
} else if (texture && texture->usage_flags & TEXTURE_USAGE_VRS_ATTACHMENT_BIT) {
@@ -2310,6 +2497,8 @@ RID RenderingDevice::framebuffer_create_multipass(const Vector<RID> &p_texture_a
} else {
ERR_FAIL_COND_V_MSG(texture->layers != p_view_count, RID(), "Layers of the texture don't match the view count for this framebuffer");
+ _check_transfer_worker_texture(texture);
+
if (!size_set) {
size.width = texture->width;
size.height = texture->height;
@@ -2409,10 +2598,10 @@ RID RenderingDevice::sampler_create(const SamplerState &p_state) {
}
bool RenderingDevice::sampler_is_format_supported_for_filter(DataFormat p_format, SamplerFilter p_sampler_filter) const {
- ERR_FAIL_INDEX_V(p_format, DATA_FORMAT_MAX, false);
-
_THREAD_SAFE_METHOD_
+ ERR_FAIL_INDEX_V(p_format, DATA_FORMAT_MAX, false);
+
return driver->sampler_is_format_supported_for_filter(p_format, p_sampler_filter);
}
@@ -2421,8 +2610,6 @@ bool RenderingDevice::sampler_is_format_supported_for_filter(DataFormat p_format
/***********************/
RID RenderingDevice::vertex_buffer_create(uint32_t p_size_bytes, const Vector<uint8_t> &p_data, bool p_use_as_storage) {
- _THREAD_SAFE_METHOD_
-
ERR_FAIL_COND_V(p_data.size() && (uint32_t)p_data.size() != p_size_bytes, RID());
Buffer buffer;
@@ -2441,10 +2628,12 @@ RID RenderingDevice::vertex_buffer_create(uint32_t p_size_bytes, const Vector<ui
}
if (p_data.size()) {
- _buffer_update(&buffer, RID(), 0, p_data.ptr(), p_data.size());
+ _buffer_initialize(&buffer, p_data.ptr(), p_data.size());
}
+ _THREAD_SAFE_LOCK_
buffer_memory += buffer.size;
+ _THREAD_SAFE_UNLOCK_
RID id = vertex_buffer_owner.make_rid(buffer);
#ifdef DEV_ENABLED
@@ -2544,6 +2733,11 @@ RID RenderingDevice::vertex_array_create(uint32_t p_vertex_count, VertexFormatID
} else {
vertex_array.untracked_buffers.insert(p_src_buffers[i]);
}
+
+ if (buffer->transfer_worker_index >= 0) {
+ vertex_array.transfer_worker_indices.push_back(buffer->transfer_worker_index);
+ vertex_array.transfer_worker_operations.push_back(buffer->transfer_worker_operation);
+ }
}
RID id = vertex_array_owner.make_rid(vertex_array);
@@ -2555,8 +2749,6 @@ RID RenderingDevice::vertex_array_create(uint32_t p_vertex_count, VertexFormatID
}
RID RenderingDevice::index_buffer_create(uint32_t p_index_count, IndexBufferFormat p_format, const Vector<uint8_t> &p_data, bool p_use_restart_indices) {
- _THREAD_SAFE_METHOD_
-
ERR_FAIL_COND_V(p_index_count == 0, RID());
IndexBuffer index_buffer;
@@ -2605,10 +2797,12 @@ RID RenderingDevice::index_buffer_create(uint32_t p_index_count, IndexBufferForm
}
if (p_data.size()) {
- _buffer_update(&index_buffer, RID(), 0, p_data.ptr(), p_data.size());
+ _buffer_initialize(&index_buffer, p_data.ptr(), p_data.size());
}
+ _THREAD_SAFE_LOCK_
buffer_memory += index_buffer.size;
+ _THREAD_SAFE_UNLOCK_
RID id = index_buffer_owner.make_rid(index_buffer);
#ifdef DEV_ENABLED
@@ -2635,6 +2829,8 @@ RID RenderingDevice::index_array_create(RID p_index_buffer, uint32_t p_index_off
index_array.indices = p_index_count;
index_array.format = index_buffer->format;
index_array.supports_restart_indices = index_buffer->supports_restart_indices;
+ index_array.transfer_worker_index = index_buffer->transfer_worker_index;
+ index_array.transfer_worker_operation = index_buffer->transfer_worker_operation;
RID id = index_array_owner.make_rid(index_array);
_add_dependency(id, p_index_buffer);
@@ -2753,6 +2949,8 @@ RID RenderingDevice::shader_create_from_bytecode(const Vector<uint8_t> &p_shader
}
RID RenderingDevice::shader_create_placeholder() {
+ _THREAD_SAFE_METHOD_
+
Shader shader;
return shader_owner.make_rid(shader);
}
@@ -2770,8 +2968,6 @@ uint64_t RenderingDevice::shader_get_vertex_input_attribute_mask(RID p_shader) {
/******************/
RID RenderingDevice::uniform_buffer_create(uint32_t p_size_bytes, const Vector<uint8_t> &p_data) {
- _THREAD_SAFE_METHOD_
-
ERR_FAIL_COND_V(p_data.size() && (uint32_t)p_data.size() != p_size_bytes, RID());
Buffer buffer;
@@ -2787,10 +2983,12 @@ RID RenderingDevice::uniform_buffer_create(uint32_t p_size_bytes, const Vector<u
}
if (p_data.size()) {
- _buffer_update(&buffer, RID(), 0, p_data.ptr(), p_data.size());
+ _buffer_initialize(&buffer, p_data.ptr(), p_data.size());
}
+ _THREAD_SAFE_LOCK_
buffer_memory += buffer.size;
+ _THREAD_SAFE_UNLOCK_
RID id = uniform_buffer_owner.make_rid(buffer);
#ifdef DEV_ENABLED
@@ -2921,6 +3119,7 @@ RID RenderingDevice::uniform_set_create(const Vector<Uniform> &p_uniforms, RID p
driver_uniform.ids.push_back(*sampler_driver_id);
driver_uniform.ids.push_back(driver_id);
+ _check_transfer_worker_texture(texture);
}
} break;
case UNIFORM_TYPE_TEXTURE: {
@@ -2965,6 +3164,7 @@ RID RenderingDevice::uniform_set_create(const Vector<Uniform> &p_uniforms, RID p
DEV_ASSERT(!texture->owner.is_valid() || texture_owner.get_or_null(texture->owner));
driver_uniform.ids.push_back(driver_id);
+ _check_transfer_worker_texture(texture);
}
} break;
case UNIFORM_TYPE_IMAGE: {
@@ -3008,6 +3208,7 @@ RID RenderingDevice::uniform_set_create(const Vector<Uniform> &p_uniforms, RID p
DEV_ASSERT(!texture->owner.is_valid() || texture_owner.get_or_null(texture->owner));
driver_uniform.ids.push_back(texture->driver_id);
+ _check_transfer_worker_texture(texture);
}
} break;
case UNIFORM_TYPE_TEXTURE_BUFFER: {
@@ -3042,6 +3243,7 @@ RID RenderingDevice::uniform_set_create(const Vector<Uniform> &p_uniforms, RID p
}
driver_uniform.ids.push_back(buffer->driver_id);
+ _check_transfer_worker_buffer(buffer);
}
} break;
case UNIFORM_TYPE_SAMPLER_WITH_TEXTURE_BUFFER: {
@@ -3070,6 +3272,7 @@ RID RenderingDevice::uniform_set_create(const Vector<Uniform> &p_uniforms, RID p
driver_uniform.ids.push_back(*sampler_driver_id);
driver_uniform.ids.push_back(buffer->driver_id);
+ _check_transfer_worker_buffer(buffer);
}
} break;
case UNIFORM_TYPE_IMAGE_BUFFER: {
@@ -3094,6 +3297,7 @@ RID RenderingDevice::uniform_set_create(const Vector<Uniform> &p_uniforms, RID p
}
driver_uniform.ids.push_back(buffer->driver_id);
+ _check_transfer_worker_buffer(buffer);
} break;
case UNIFORM_TYPE_STORAGE_BUFFER: {
ERR_FAIL_COND_V_MSG(uniform.get_id_count() != 1, RID(),
@@ -3133,6 +3337,7 @@ RID RenderingDevice::uniform_set_create(const Vector<Uniform> &p_uniforms, RID p
}
driver_uniform.ids.push_back(buffer->driver_id);
+ _check_transfer_worker_buffer(buffer);
} break;
case UNIFORM_TYPE_INPUT_ATTACHMENT: {
ERR_FAIL_COND_V_MSG(shader->is_compute, RID(), "InputAttachment (binding: " + itos(uniform.binding) + ") supplied for compute shader (this is not allowed).");
@@ -3158,6 +3363,7 @@ RID RenderingDevice::uniform_set_create(const Vector<Uniform> &p_uniforms, RID p
DEV_ASSERT(!texture->owner.is_valid() || texture_owner.get_or_null(texture->owner));
driver_uniform.ids.push_back(texture->driver_id);
+ _check_transfer_worker_texture(texture);
}
} break;
default: {
@@ -3197,10 +3403,14 @@ RID RenderingDevice::uniform_set_create(const Vector<Uniform> &p_uniforms, RID p
}
bool RenderingDevice::uniform_set_is_valid(RID p_uniform_set) {
+ _THREAD_SAFE_METHOD_
+
return uniform_set_owner.owns(p_uniform_set);
}
void RenderingDevice::uniform_set_set_invalidation_callback(RID p_uniform_set, InvalidationCallback p_callback, void *p_userdata) {
+ _THREAD_SAFE_METHOD_
+
UniformSet *us = uniform_set_owner.get_or_null(p_uniform_set);
ERR_FAIL_NULL(us);
us->invalidated_callback = p_callback;
@@ -3212,21 +3422,22 @@ void RenderingDevice::uniform_set_set_invalidation_callback(RID p_uniform_set, I
/*******************/
RID RenderingDevice::render_pipeline_create(RID p_shader, FramebufferFormatID p_framebuffer_format, VertexFormatID p_vertex_format, RenderPrimitive p_render_primitive, const PipelineRasterizationState &p_rasterization_state, const PipelineMultisampleState &p_multisample_state, const PipelineDepthStencilState &p_depth_stencil_state, const PipelineColorBlendState &p_blend_state, BitField<PipelineDynamicStateFlags> p_dynamic_state_flags, uint32_t p_for_render_pass, const Vector<PipelineSpecializationConstant> &p_specialization_constants) {
- _THREAD_SAFE_METHOD_
-
// Needs a shader.
Shader *shader = shader_owner.get_or_null(p_shader);
ERR_FAIL_NULL_V(shader, RID());
+ ERR_FAIL_COND_V_MSG(shader->is_compute, RID(), "Compute shaders can't be used in render pipelines");
- ERR_FAIL_COND_V_MSG(shader->is_compute, RID(),
- "Compute shaders can't be used in render pipelines");
+ FramebufferFormat fb_format;
+ {
+ _THREAD_SAFE_METHOD_
- if (p_framebuffer_format == INVALID_ID) {
- // If nothing provided, use an empty one (no attachments).
- p_framebuffer_format = framebuffer_format_create(Vector<AttachmentFormat>());
+ if (p_framebuffer_format == INVALID_ID) {
+ // If nothing provided, use an empty one (no attachments).
+ p_framebuffer_format = framebuffer_format_create(Vector<AttachmentFormat>());
+ }
+ ERR_FAIL_COND_V(!framebuffer_formats.has(p_framebuffer_format), RID());
+ fb_format = framebuffer_formats[p_framebuffer_format];
}
- ERR_FAIL_COND_V(!framebuffer_formats.has(p_framebuffer_format), RID());
- const FramebufferFormat &fb_format = framebuffer_formats[p_framebuffer_format];
// Validate shader vs. framebuffer.
{
@@ -3372,30 +3583,41 @@ RID RenderingDevice::render_pipeline_create(RID p_shader, FramebufferFormatID p_
};
pipeline.validation.primitive_minimum = primitive_minimum[p_render_primitive];
#endif
+
// Create ID to associate with this pipeline.
RID id = render_pipeline_owner.make_rid(pipeline);
+ {
+ _THREAD_SAFE_METHOD_
+
#ifdef DEV_ENABLED
- set_resource_name(id, "RID:" + itos(id.get_id()));
+ set_resource_name(id, "RID:" + itos(id.get_id()));
#endif
- // Now add all the dependencies.
- _add_dependency(id, p_shader);
+ // Now add all the dependencies.
+ _add_dependency(id, p_shader);
+ }
+
return id;
}
bool RenderingDevice::render_pipeline_is_valid(RID p_pipeline) {
_THREAD_SAFE_METHOD_
+
return render_pipeline_owner.owns(p_pipeline);
}
RID RenderingDevice::compute_pipeline_create(RID p_shader, const Vector<PipelineSpecializationConstant> &p_specialization_constants) {
- _THREAD_SAFE_METHOD_
+ Shader *shader;
- // Needs a shader.
- Shader *shader = shader_owner.get_or_null(p_shader);
- ERR_FAIL_NULL_V(shader, RID());
+ {
+ _THREAD_SAFE_METHOD_
- ERR_FAIL_COND_V_MSG(!shader->is_compute, RID(),
- "Non-compute shaders can't be used in compute pipelines");
+ // Needs a shader.
+ shader = shader_owner.get_or_null(p_shader);
+ ERR_FAIL_NULL_V(shader, RID());
+
+ ERR_FAIL_COND_V_MSG(!shader->is_compute, RID(),
+ "Non-compute shaders can't be used in compute pipelines");
+ }
for (int i = 0; i < shader->specialization_constants.size(); i++) {
const ShaderSpecializationConstant &sc = shader->specialization_constants[i];
@@ -3427,15 +3649,22 @@ RID RenderingDevice::compute_pipeline_create(RID p_shader, const Vector<Pipeline
// Create ID to associate with this pipeline.
RID id = compute_pipeline_owner.make_rid(pipeline);
+ {
+ _THREAD_SAFE_METHOD_
+
#ifdef DEV_ENABLED
- set_resource_name(id, "RID:" + itos(id.get_id()));
+ set_resource_name(id, "RID:" + itos(id.get_id()));
#endif
- // Now add all the dependencies.
- _add_dependency(id, p_shader);
+ // Now add all the dependencies.
+ _add_dependency(id, p_shader);
+ }
+
return id;
}
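Both pipeline creation functions now follow the same idiom: the device mutex is narrowed to scoped blocks around the shared-map accesses, so driver-side pipeline compilation runs unlocked (Godot's _THREAD_SAFE_METHOD_ mutex is recursive, which is why the nested lock taken inside _add_dependency stays safe). A sketch of the idiom with a hypothetical function:

RID RenderingDevice::example_pipeline_create(RID p_shader) {
	Shader *shader;
	{
		_THREAD_SAFE_METHOD_ // RAII lock, held only for the shared-map lookup.
		shader = shader_owner.get_or_null(p_shader);
		ERR_FAIL_NULL_V(shader, RID());
	}

	// Expensive driver-side compilation happens here without the lock held.
	RID id = /* ... create and register the pipeline ... */ RID();

	{
		_THREAD_SAFE_METHOD_ // Re-taken only to mutate the dependency maps.
		_add_dependency(id, p_shader);
	}
	return id;
}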
bool RenderingDevice::compute_pipeline_is_valid(RID p_pipeline) {
+ _THREAD_SAFE_METHOD_
+
return compute_pipeline_owner.owns(p_pipeline);
}
@@ -3515,6 +3744,7 @@ Error RenderingDevice::screen_prepare_for_drawing(DisplayServer::WindowID p_scre
int RenderingDevice::screen_get_width(DisplayServer::WindowID p_screen) const {
_THREAD_SAFE_METHOD_
+
RenderingContextDriver::SurfaceID surface = context->surface_get_from_window(p_screen);
ERR_FAIL_COND_V_MSG(surface == 0, 0, "A surface was not created for the screen.");
return context->surface_get_width(surface);
@@ -3522,6 +3752,7 @@ int RenderingDevice::screen_get_width(DisplayServer::WindowID p_screen) const {
int RenderingDevice::screen_get_height(DisplayServer::WindowID p_screen) const {
_THREAD_SAFE_METHOD_
+
RenderingContextDriver::SurfaceID surface = context->surface_get_from_window(p_screen);
ERR_FAIL_COND_V_MSG(surface == 0, 0, "A surface was not created for the screen.");
return context->surface_get_height(surface);
@@ -3568,7 +3799,7 @@ Error RenderingDevice::screen_free(DisplayServer::WindowID p_screen) {
/*******************/
RenderingDevice::DrawListID RenderingDevice::draw_list_begin_for_screen(DisplayServer::WindowID p_screen, const Color &p_clear_color) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(INVALID_ID);
ERR_FAIL_COND_V_MSG(draw_list != nullptr, INVALID_ID, "Only one draw list can be active at the same time.");
ERR_FAIL_COND_V_MSG(compute_list != nullptr, INVALID_ID, "Only one draw/compute list can be active at the same time.");
@@ -3594,8 +3825,8 @@ RenderingDevice::DrawListID RenderingDevice::draw_list_begin_for_screen(DisplayS
RDD::RenderPassID render_pass = driver->swap_chain_get_render_pass(sc_it->value);
draw_graph.add_draw_list_begin(render_pass, fb_it->value, viewport, clear_value, true, false, RDD::BreadcrumbMarker::BLIT_PASS);
- _draw_list_set_viewport(viewport);
- _draw_list_set_scissor(viewport);
+ draw_graph.add_draw_list_set_viewport(viewport);
+ draw_graph.add_draw_list_set_scissor(viewport);
return int64_t(ID_TYPE_DRAW_LIST) << ID_BASE_SHIFT;
}
@@ -3707,14 +3938,6 @@ Error RenderingDevice::_draw_list_render_pass_begin(Framebuffer *p_framebuffer,
return OK;
}
-void RenderingDevice::_draw_list_set_viewport(Rect2i p_rect) {
- draw_graph.add_draw_list_set_viewport(p_rect);
-}
-
-void RenderingDevice::_draw_list_set_scissor(Rect2i p_rect) {
- draw_graph.add_draw_list_set_scissor(p_rect);
-}
-
void RenderingDevice::_draw_list_insert_clear_region(DrawList *p_draw_list, Framebuffer *p_framebuffer, Point2i p_viewport_offset, Point2i p_viewport_size, bool p_clear_color, const Vector<Color> &p_clear_colors, bool p_clear_depth, float p_depth, uint32_t p_stencil) {
LocalVector<RDD::AttachmentClear> clear_attachments;
int color_index = 0;
@@ -3752,7 +3975,7 @@ void RenderingDevice::_draw_list_insert_clear_region(DrawList *p_draw_list, Fram
}
RenderingDevice::DrawListID RenderingDevice::draw_list_begin(RID p_framebuffer, InitialAction p_initial_color_action, FinalAction p_final_color_action, InitialAction p_initial_depth_action, FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values, float p_clear_depth, uint32_t p_clear_stencil, const Rect2 &p_region, uint32_t p_breadcrumb) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(INVALID_ID);
ERR_FAIL_COND_V_MSG(draw_list != nullptr, INVALID_ID, "Only one draw list can be active at the same time.");
@@ -3812,8 +4035,9 @@ RenderingDevice::DrawListID RenderingDevice::draw_list_begin(RID p_framebuffer,
#endif
draw_list_current_subpass = 0;
- _draw_list_set_viewport(Rect2i(viewport_offset, viewport_size));
- _draw_list_set_scissor(Rect2i(viewport_offset, viewport_size));
+ Rect2i viewport_rect(viewport_offset, viewport_size);
+ draw_graph.add_draw_list_set_viewport(viewport_rect);
+ draw_graph.add_draw_list_set_scissor(viewport_rect);
return int64_t(ID_TYPE_DRAW_LIST) << ID_BASE_SHIFT;
}
@@ -3839,6 +4063,8 @@ RenderingDevice::DrawList *RenderingDevice::_get_draw_list_ptr(DrawListID p_id)
}
void RenderingDevice::draw_list_set_blend_constants(DrawListID p_list, const Color &p_color) {
+ ERR_RENDER_THREAD_GUARD();
+
DrawList *dl = _get_draw_list_ptr(p_list);
ERR_FAIL_NULL(dl);
#ifdef DEBUG_ENABLED
@@ -3849,6 +4075,8 @@ void RenderingDevice::draw_list_set_blend_constants(DrawListID p_list, const Col
}
void RenderingDevice::draw_list_bind_render_pipeline(DrawListID p_list, RID p_render_pipeline) {
+ ERR_RENDER_THREAD_GUARD();
+
DrawList *dl = _get_draw_list_ptr(p_list);
ERR_FAIL_NULL(dl);
#ifdef DEBUG_ENABLED
@@ -3877,23 +4105,35 @@ void RenderingDevice::draw_list_bind_render_pipeline(DrawListID p_list, RID p_re
const uint32_t *pformats = pipeline->set_formats.ptr(); // Pipeline set formats.
uint32_t first_invalid_set = UINT32_MAX; // All valid by default.
- switch (driver->api_trait_get(RDD::API_TRAIT_SHADER_CHANGE_INVALIDATION)) {
- case RDD::SHADER_CHANGE_INVALIDATION_ALL_BOUND_UNIFORM_SETS: {
- first_invalid_set = 0;
- } break;
- case RDD::SHADER_CHANGE_INVALIDATION_INCOMPATIBLE_SETS_PLUS_CASCADE: {
- for (uint32_t i = 0; i < pcount; i++) {
- if (dl->state.sets[i].pipeline_expected_format != pformats[i]) {
- first_invalid_set = i;
- break;
- }
- }
- } break;
- case RDD::SHADER_CHANGE_INVALIDATION_ALL_OR_NONE_ACCORDING_TO_LAYOUT_HASH: {
- if (dl->state.pipeline_shader_layout_hash != pipeline->shader_layout_hash) {
+ if (pipeline->push_constant_size != dl->state.pipeline_push_constant_size) {
+ // All sets must be invalidated as the pipeline layout is not compatible if the push constant range is different.
+ dl->state.pipeline_push_constant_size = pipeline->push_constant_size;
+ first_invalid_set = 0;
+ } else {
+ switch (driver->api_trait_get(RDD::API_TRAIT_SHADER_CHANGE_INVALIDATION)) {
+ case RDD::SHADER_CHANGE_INVALIDATION_ALL_BOUND_UNIFORM_SETS: {
first_invalid_set = 0;
- }
- } break;
+ } break;
+ case RDD::SHADER_CHANGE_INVALIDATION_INCOMPATIBLE_SETS_PLUS_CASCADE: {
+ for (uint32_t i = 0; i < pcount; i++) {
+ if (dl->state.sets[i].pipeline_expected_format != pformats[i]) {
+ first_invalid_set = i;
+ break;
+ }
+ }
+ } break;
+ case RDD::SHADER_CHANGE_INVALIDATION_ALL_OR_NONE_ACCORDING_TO_LAYOUT_HASH: {
+ if (dl->state.pipeline_shader_layout_hash != pipeline->shader_layout_hash) {
+ first_invalid_set = 0;
+ }
+ } break;
+ }
+ }
+
+ if (pipeline->push_constant_size) {
+#ifdef DEBUG_ENABLED
+ dl->validation.pipeline_push_constant_supplied = false;
+#endif
}
for (uint32_t i = 0; i < pcount; i++) {
@@ -3908,12 +4148,6 @@ void RenderingDevice::draw_list_bind_render_pipeline(DrawListID p_list, RID p_re
dl->state.set_count = pcount; // Update set count.
- if (pipeline->push_constant_size) {
-#ifdef DEBUG_ENABLED
- dl->validation.pipeline_push_constant_supplied = false;
-#endif
- }
-
dl->state.pipeline_shader = pipeline->shader;
dl->state.pipeline_shader_driver_id = pipeline->shader_driver_id;
dl->state.pipeline_shader_layout_hash = pipeline->shader_layout_hash;
@@ -3932,6 +4166,8 @@ void RenderingDevice::draw_list_bind_render_pipeline(DrawListID p_list, RID p_re
}
void RenderingDevice::draw_list_bind_uniform_set(DrawListID p_list, RID p_uniform_set, uint32_t p_index) {
+ ERR_RENDER_THREAD_GUARD();
+
#ifdef DEBUG_ENABLED
ERR_FAIL_COND_MSG(p_index >= driver->limit_get(LIMIT_MAX_BOUND_UNIFORM_SETS) || p_index >= MAX_UNIFORM_SETS,
"Attempting to bind a descriptor set (" + itos(p_index) + ") greater than what the hardware supports (" + itos(driver->limit_get(LIMIT_MAX_BOUND_UNIFORM_SETS)) + ").");
@@ -3972,19 +4208,23 @@ void RenderingDevice::draw_list_bind_uniform_set(DrawListID p_list, RID p_unifor
}
void RenderingDevice::draw_list_bind_vertex_array(DrawListID p_list, RID p_vertex_array) {
+ ERR_RENDER_THREAD_GUARD();
+
DrawList *dl = _get_draw_list_ptr(p_list);
ERR_FAIL_NULL(dl);
#ifdef DEBUG_ENABLED
ERR_FAIL_COND_MSG(!dl->validation.active, "Submitted Draw Lists can no longer be modified.");
#endif
- const VertexArray *vertex_array = vertex_array_owner.get_or_null(p_vertex_array);
+ VertexArray *vertex_array = vertex_array_owner.get_or_null(p_vertex_array);
ERR_FAIL_NULL(vertex_array);
if (dl->state.vertex_array == p_vertex_array) {
return; // Already set.
}
+ _check_transfer_worker_vertex_array(vertex_array);
+
dl->state.vertex_array = p_vertex_array;
#ifdef DEBUG_ENABLED
@@ -4001,19 +4241,23 @@ void RenderingDevice::draw_list_bind_vertex_array(DrawListID p_list, RID p_verte
}
void RenderingDevice::draw_list_bind_index_array(DrawListID p_list, RID p_index_array) {
+ ERR_RENDER_THREAD_GUARD();
+
DrawList *dl = _get_draw_list_ptr(p_list);
ERR_FAIL_NULL(dl);
#ifdef DEBUG_ENABLED
ERR_FAIL_COND_MSG(!dl->validation.active, "Submitted Draw Lists can no longer be modified.");
#endif
- const IndexArray *index_array = index_array_owner.get_or_null(p_index_array);
+ IndexArray *index_array = index_array_owner.get_or_null(p_index_array);
ERR_FAIL_NULL(index_array);
if (dl->state.index_array == p_index_array) {
return; // Already set.
}
+ _check_transfer_worker_index_array(index_array);
+
dl->state.index_array = p_index_array;
#ifdef DEBUG_ENABLED
dl->validation.index_array_max_index = index_array->max_index;
@@ -4029,6 +4273,8 @@ void RenderingDevice::draw_list_bind_index_array(DrawListID p_list, RID p_index_
}
void RenderingDevice::draw_list_set_line_width(DrawListID p_list, float p_width) {
+ ERR_RENDER_THREAD_GUARD();
+
DrawList *dl = _get_draw_list_ptr(p_list);
ERR_FAIL_NULL(dl);
#ifdef DEBUG_ENABLED
@@ -4039,6 +4285,8 @@ void RenderingDevice::draw_list_set_line_width(DrawListID p_list, float p_width)
}
void RenderingDevice::draw_list_set_push_constant(DrawListID p_list, const void *p_data, uint32_t p_data_size) {
+ ERR_RENDER_THREAD_GUARD();
+
DrawList *dl = _get_draw_list_ptr(p_list);
ERR_FAIL_NULL(dl);
@@ -4059,6 +4307,8 @@ void RenderingDevice::draw_list_set_push_constant(DrawListID p_list, const void
}
void RenderingDevice::draw_list_draw(DrawListID p_list, bool p_use_indices, uint32_t p_instances, uint32_t p_procedural_vertices) {
+ ERR_RENDER_THREAD_GUARD();
+
DrawList *dl = _get_draw_list_ptr(p_list);
ERR_FAIL_NULL(dl);
#ifdef DEBUG_ENABLED
@@ -4192,6 +4442,8 @@ void RenderingDevice::draw_list_draw(DrawListID p_list, bool p_use_indices, uint
}
void RenderingDevice::draw_list_enable_scissor(DrawListID p_list, const Rect2 &p_rect) {
+ ERR_RENDER_THREAD_GUARD();
+
DrawList *dl = _get_draw_list_ptr(p_list);
ERR_FAIL_NULL(dl);
@@ -4207,25 +4459,30 @@ void RenderingDevice::draw_list_enable_scissor(DrawListID p_list, const Rect2 &p
return;
}
- _draw_list_set_scissor(rect);
+ draw_graph.add_draw_list_set_scissor(rect);
}
void RenderingDevice::draw_list_disable_scissor(DrawListID p_list) {
+ ERR_RENDER_THREAD_GUARD();
+
DrawList *dl = _get_draw_list_ptr(p_list);
ERR_FAIL_NULL(dl);
#ifdef DEBUG_ENABLED
ERR_FAIL_COND_MSG(!dl->validation.active, "Submitted Draw Lists can no longer be modified.");
#endif
- _draw_list_set_scissor(dl->viewport);
+ draw_graph.add_draw_list_set_scissor(dl->viewport);
}
uint32_t RenderingDevice::draw_list_get_current_pass() {
+ ERR_RENDER_THREAD_GUARD_V(0);
+
return draw_list_current_subpass;
}
RenderingDevice::DrawListID RenderingDevice::draw_list_switch_to_next_pass() {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(INVALID_ID);
+
ERR_FAIL_NULL_V(draw_list, INVALID_ID);
ERR_FAIL_COND_V(draw_list_current_subpass >= draw_list_subpass_count - 1, INVALID_FORMAT_ID);
@@ -4248,9 +4505,6 @@ Error RenderingDevice::draw_list_switch_to_next_pass_split(uint32_t p_splits, Dr
#endif
Error RenderingDevice::_draw_list_allocate(const Rect2i &p_viewport, uint32_t p_subpass) {
- // Lock while draw_list is active.
- _THREAD_SAFE_LOCK_
-
draw_list = memnew(DrawList);
draw_list->viewport = p_viewport;
@@ -4264,13 +4518,10 @@ void RenderingDevice::_draw_list_free(Rect2i *r_last_viewport) {
// Just end the list.
memdelete(draw_list);
draw_list = nullptr;
-
- // Draw_list is no longer active.
- _THREAD_SAFE_UNLOCK_
}
void RenderingDevice::draw_list_end() {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD();
ERR_FAIL_NULL_MSG(draw_list, "Immediate draw list is already inactive.");
@@ -4297,13 +4548,10 @@ void RenderingDevice::draw_list_end() {
/***********************/
RenderingDevice::ComputeListID RenderingDevice::compute_list_begin() {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(INVALID_ID);
ERR_FAIL_COND_V_MSG(compute_list != nullptr, INVALID_ID, "Only one draw/compute list can be active at the same time.");
- // Lock while compute_list is active.
- _THREAD_SAFE_LOCK_
-
compute_list = memnew(ComputeList);
draw_graph.add_compute_list_begin();
@@ -4312,7 +4560,7 @@ RenderingDevice::ComputeListID RenderingDevice::compute_list_begin() {
}
void RenderingDevice::compute_list_bind_compute_pipeline(ComputeListID p_list, RID p_compute_pipeline) {
- // Must be called within a compute list, the class mutex is locked during that time
+ ERR_RENDER_THREAD_GUARD();
ERR_FAIL_COND(p_list != ID_TYPE_COMPUTE_LIST);
ERR_FAIL_NULL(compute_list);
@@ -4391,7 +4639,7 @@ void RenderingDevice::compute_list_bind_compute_pipeline(ComputeListID p_list, R
}
void RenderingDevice::compute_list_bind_uniform_set(ComputeListID p_list, RID p_uniform_set, uint32_t p_index) {
- // Must be called within a compute list, the class mutex is locked during that time
+ ERR_RENDER_THREAD_GUARD();
ERR_FAIL_COND(p_list != ID_TYPE_COMPUTE_LIST);
ERR_FAIL_NULL(compute_list);
@@ -4436,6 +4684,8 @@ void RenderingDevice::compute_list_bind_uniform_set(ComputeListID p_list, RID p_
}
void RenderingDevice::compute_list_set_push_constant(ComputeListID p_list, const void *p_data, uint32_t p_data_size) {
+ ERR_RENDER_THREAD_GUARD();
+
ERR_FAIL_COND(p_list != ID_TYPE_COMPUTE_LIST);
ERR_FAIL_NULL(compute_list);
ERR_FAIL_COND_MSG(p_data_size > MAX_PUSH_CONSTANT_SIZE, "Push constants can't be bigger than 128 bytes to maintain compatibility.");
@@ -4463,7 +4713,7 @@ void RenderingDevice::compute_list_set_push_constant(ComputeListID p_list, const
}
void RenderingDevice::compute_list_dispatch(ComputeListID p_list, uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) {
- // Must be called within a compute list, the class mutex is locked during that time
+ ERR_RENDER_THREAD_GUARD();
ERR_FAIL_COND(p_list != ID_TYPE_COMPUTE_LIST);
ERR_FAIL_NULL(compute_list);
@@ -4551,6 +4801,8 @@ void RenderingDevice::compute_list_dispatch(ComputeListID p_list, uint32_t p_x_g
}
void RenderingDevice::compute_list_dispatch_threads(ComputeListID p_list, uint32_t p_x_threads, uint32_t p_y_threads, uint32_t p_z_threads) {
+ ERR_RENDER_THREAD_GUARD();
+
ERR_FAIL_COND(p_list != ID_TYPE_COMPUTE_LIST);
ERR_FAIL_NULL(compute_list);
@@ -4578,6 +4830,8 @@ void RenderingDevice::compute_list_dispatch_threads(ComputeListID p_list, uint32
}
void RenderingDevice::compute_list_dispatch_indirect(ComputeListID p_list, RID p_buffer, uint32_t p_offset) {
+ ERR_RENDER_THREAD_GUARD();
+
ERR_FAIL_COND(p_list != ID_TYPE_COMPUTE_LIST);
ERR_FAIL_NULL(compute_list);
@@ -4661,10 +4915,12 @@ void RenderingDevice::compute_list_dispatch_indirect(ComputeListID p_list, RID p
if (buffer->draw_tracker != nullptr) {
draw_graph.add_compute_list_usage(buffer->draw_tracker, RDG::RESOURCE_USAGE_INDIRECT_BUFFER_READ);
}
+
+ _check_transfer_worker_buffer(buffer);
}
void RenderingDevice::compute_list_add_barrier(ComputeListID p_list) {
- // Must be called within a compute list, the class mutex is locked during that time
+ ERR_RENDER_THREAD_GUARD();
compute_list_barrier_state = compute_list->state;
compute_list_end();
@@ -4686,15 +4942,14 @@ void RenderingDevice::compute_list_add_barrier(ComputeListID p_list) {
}
void RenderingDevice::compute_list_end() {
+ ERR_RENDER_THREAD_GUARD();
+
ERR_FAIL_NULL(compute_list);
draw_graph.add_compute_list_end();
memdelete(compute_list);
compute_list = nullptr;
-
- // Compute_list is no longer active.
- _THREAD_SAFE_UNLOCK_
}
#ifndef DISABLE_DEPRECATED
@@ -4707,6 +4962,282 @@ void RenderingDevice::full_barrier() {
}
#endif
+/*************************/
+/**** TRANSFER WORKER ****/
+/*************************/
+
+static uint32_t _get_alignment_offset(uint32_t p_offset, uint32_t p_required_align) {
+ uint32_t alignment_offset = (p_required_align > 0) ? (p_offset % p_required_align) : 0;
+ if (alignment_offset != 0) {
+ // If a particular alignment is required, add the offset as part of the required size.
+ alignment_offset = p_required_align - alignment_offset;
+ }
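+ // E.g. p_offset = 10 with p_required_align = 8 gives a misalignment of 2, so the
+ // returned padding is 8 - 2 = 6, bumping the offset up to the next boundary at 16.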
+
+ return alignment_offset;
+}
+
+RenderingDevice::TransferWorker *RenderingDevice::_acquire_transfer_worker(uint32_t p_transfer_size, uint32_t p_required_align, uint32_t &r_staging_offset) {
+ // Find the first worker that is not currently executing anything and has enough size for the transfer.
+ // If no workers are available, we make a new one. If we're not allowed to make new ones, we wait until one of them is available.
+ TransferWorker *transfer_worker = nullptr;
+ uint32_t available_list_index = 0;
+ bool transfer_worker_busy = true;
+ bool transfer_worker_full = true;
+ {
+ MutexLock pool_lock(transfer_worker_pool_mutex);
+
+ // If no workers are available and we've reached the max pool capacity, wait until one of them becomes available.
+ bool transfer_worker_pool_full = transfer_worker_pool.size() >= transfer_worker_pool_max_size;
+ while (transfer_worker_pool_available_list.is_empty() && transfer_worker_pool_full) {
+ transfer_worker_pool_condition.wait(pool_lock);
+ }
+
+ // Look at all available workers first.
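+ // Preference order, best to worst: idle with enough space, idle without enough
+ // space, busy with enough space, and finally busy without enough space.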
+ for (uint32_t i = 0; i < transfer_worker_pool_available_list.size(); i++) {
+ uint32_t worker_index = transfer_worker_pool_available_list[i];
+ TransferWorker *candidate_worker = transfer_worker_pool[worker_index];
+ candidate_worker->thread_mutex.lock();
+
+ // Figure out if the worker can fit the transfer.
+ uint32_t alignment_offset = _get_alignment_offset(candidate_worker->staging_buffer_size_in_use, p_required_align);
+ uint32_t required_size = candidate_worker->staging_buffer_size_in_use + p_transfer_size + alignment_offset;
+ bool candidate_worker_busy = candidate_worker->submitted;
+ bool candidate_worker_full = required_size > candidate_worker->staging_buffer_size_allocated;
+ bool pick_candidate = false;
+ if (!candidate_worker_busy && !candidate_worker_full) {
+ // A worker that can fit the transfer and is not waiting for a previous execution is the best possible candidate.
+ pick_candidate = true;
+ } else if (!candidate_worker_busy) {
+ // The worker can't fit the transfer but it's not currently doing anything.
+ // We pick it as a possible candidate if the current one is busy.
+ pick_candidate = transfer_worker_busy;
+ } else if (!candidate_worker_full) {
+ // The worker can fit the transfer but it's currently executing previous work.
+ // We pick it as a possible candidate if the current one is both busy and full.
+ pick_candidate = transfer_worker_busy && transfer_worker_full;
+ } else if (transfer_worker == nullptr) {
+ // The worker can't fit the transfer and it's currently executing work, so it's the worst candidate.
+ // We only pick if no candidate has been picked yet.
+ pick_candidate = true;
+ }
+
+ if (pick_candidate) {
+ if (transfer_worker != nullptr) {
+ // Release the lock for the worker that was picked previously.
+ transfer_worker->thread_mutex.unlock();
+ }
+
+ // Keep the lock active for this worker.
+ transfer_worker = candidate_worker;
+ transfer_worker_busy = candidate_worker_busy;
+ transfer_worker_full = candidate_worker_full;
+ available_list_index = i;
+
+ if (!transfer_worker_busy && !transfer_worker_full) {
+ // Best possible candidate, stop searching early.
+ break;
+ }
+ } else {
+ // Release the lock for the candidate.
+ candidate_worker->thread_mutex.unlock();
+ }
+ }
+
+ if (transfer_worker != nullptr) {
+ // A worker was picked, remove it from the available list.
+ transfer_worker_pool_available_list.remove_at(available_list_index);
+ } else {
+ DEV_ASSERT(!transfer_worker_pool_full && "A transfer worker should never be created when the pool is full.");
+
+ // No existing worker was picked, we create a new one.
+ transfer_worker = memnew(TransferWorker);
+ transfer_worker->command_fence = driver->fence_create();
+ transfer_worker->command_semaphore = driver->semaphore_create();
+ transfer_worker->command_pool = driver->command_pool_create(transfer_queue_family, RDD::COMMAND_BUFFER_TYPE_PRIMARY);
+ transfer_worker->command_buffer = driver->command_buffer_create(transfer_worker->command_pool);
+ transfer_worker->index = transfer_worker_pool.size();
+ transfer_worker_pool.push_back(transfer_worker);
+ transfer_worker_operation_used_by_draw.push_back(0);
+ transfer_worker->thread_mutex.lock();
+ }
+ }
+
+ if (transfer_worker->submitted) {
+ // Wait for the worker if the command buffer was submitted but it hasn't finished processing yet.
+ _wait_for_transfer_worker(transfer_worker);
+ }
+
+ uint32_t alignment_offset = _get_alignment_offset(transfer_worker->staging_buffer_size_in_use, p_required_align);
+ transfer_worker->max_transfer_size = MAX(transfer_worker->max_transfer_size, p_transfer_size);
+
+ uint32_t required_size = transfer_worker->staging_buffer_size_in_use + p_transfer_size + alignment_offset;
+ if (required_size > transfer_worker->staging_buffer_size_allocated) {
+ // If there's not enough bytes to use on the staging buffer, we submit everything pending from the worker and wait for the work to be finished.
+ if (transfer_worker->recording) {
+ _end_transfer_worker(transfer_worker);
+ _submit_transfer_worker(transfer_worker, false);
+ }
+
+ if (transfer_worker->submitted) {
+ _wait_for_transfer_worker(transfer_worker);
+ }
+
+ alignment_offset = 0;
+
+ // If the staging buffer can't fit the transfer, we recreate the buffer.
+ const uint32_t expected_buffer_size_minimum = 16 * 1024;
+ uint32_t expected_buffer_size = MAX(transfer_worker->max_transfer_size, expected_buffer_size_minimum);
+ if (expected_buffer_size > transfer_worker->staging_buffer_size_allocated) {
+ if (transfer_worker->staging_buffer.id != 0) {
+ driver->buffer_free(transfer_worker->staging_buffer);
+ }
+
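+ // Rounding up to the next power of two keeps the number of reallocations logarithmic;
+ // e.g. a 20 KB transfer over a 16 KB staging buffer reallocates it to 32 KB.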
+ uint32_t new_staging_buffer_size = next_power_of_2(expected_buffer_size);
+ transfer_worker->staging_buffer_size_allocated = new_staging_buffer_size;
+ transfer_worker->staging_buffer = driver->buffer_create(new_staging_buffer_size, RDD::BUFFER_USAGE_TRANSFER_FROM_BIT, RDD::MEMORY_ALLOCATION_TYPE_CPU);
+ }
+ }
+
+ // Add the alignment before storing the offset that will be returned.
+ transfer_worker->staging_buffer_size_in_use += alignment_offset;
+
+ // Store the offset to return and increment the current size.
+ r_staging_offset = transfer_worker->staging_buffer_size_in_use;
+ transfer_worker->staging_buffer_size_in_use += p_transfer_size;
+
+ if (!transfer_worker->recording) {
+ // Begin the command buffer if the worker wasn't recording yet.
+ driver->command_buffer_begin(transfer_worker->command_buffer);
+ transfer_worker->recording = true;
+ }
+
+ return transfer_worker;
+}
+
+void RenderingDevice::_release_transfer_worker(TransferWorker *p_transfer_worker) {
+ p_transfer_worker->thread_mutex.unlock();
+
+ transfer_worker_pool_mutex.lock();
+ transfer_worker_pool_available_list.push_back(p_transfer_worker->index);
+ transfer_worker_pool_mutex.unlock();
+ transfer_worker_pool_condition.notify_one();
+}
+
+void RenderingDevice::_end_transfer_worker(TransferWorker *p_transfer_worker) {
+ driver->command_buffer_end(p_transfer_worker->command_buffer);
+ p_transfer_worker->recording = false;
+}
+
+void RenderingDevice::_submit_transfer_worker(TransferWorker *p_transfer_worker, bool p_signal_semaphore) {
+ const VectorView<RDD::SemaphoreID> execute_semaphore = p_signal_semaphore ? p_transfer_worker->command_semaphore : VectorView<RDD::SemaphoreID>();
+ driver->command_queue_execute_and_present(transfer_queue, {}, p_transfer_worker->command_buffer, execute_semaphore, p_transfer_worker->command_fence, {});
+ if (p_signal_semaphore) {
+ // Indicate the frame should wait on these semaphores before executing the main command buffer.
+ frames[frame].semaphores_to_wait_on.push_back(p_transfer_worker->command_semaphore);
+ }
+
+ p_transfer_worker->submitted = true;
+
+ {
+ MutexLock lock(p_transfer_worker->operations_mutex);
+ p_transfer_worker->operations_submitted = p_transfer_worker->operations_counter;
+ }
+}
+
+void RenderingDevice::_wait_for_transfer_worker(TransferWorker *p_transfer_worker) {
+ driver->fence_wait(p_transfer_worker->command_fence);
+ p_transfer_worker->staging_buffer_size_in_use = 0;
+ p_transfer_worker->submitted = false;
+
+ {
+ MutexLock lock(p_transfer_worker->operations_mutex);
+ p_transfer_worker->operations_processed = p_transfer_worker->operations_submitted;
+ }
+}
+
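+// Record the newest transfer operation that resources used by the frame's draws depend on.
+// _submit_transfer_workers() compares this against each worker's processed counter to decide
+// whether that worker still needs to be flushed and waited on before the frame executes.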
+void RenderingDevice::_check_transfer_worker_operation(uint32_t p_transfer_worker_index, uint64_t p_transfer_worker_operation) {
+ TransferWorker *transfer_worker = transfer_worker_pool[p_transfer_worker_index];
+ MutexLock lock(transfer_worker->operations_mutex);
+ uint64_t &dst_operation = transfer_worker_operation_used_by_draw[transfer_worker->index];
+ dst_operation = MAX(dst_operation, p_transfer_worker_operation);
+}
+
+void RenderingDevice::_check_transfer_worker_buffer(Buffer *p_buffer) {
+ if (p_buffer->transfer_worker_index >= 0) {
+ _check_transfer_worker_operation(p_buffer->transfer_worker_index, p_buffer->transfer_worker_operation);
+ p_buffer->transfer_worker_index = -1;
+ }
+}
+
+void RenderingDevice::_check_transfer_worker_texture(Texture *p_texture) {
+ if (p_texture->transfer_worker_index >= 0) {
+ _check_transfer_worker_operation(p_texture->transfer_worker_index, p_texture->transfer_worker_operation);
+ p_texture->transfer_worker_index = -1;
+ }
+}
+
+void RenderingDevice::_check_transfer_worker_vertex_array(VertexArray *p_vertex_array) {
+ if (!p_vertex_array->transfer_worker_indices.is_empty()) {
+ for (int i = 0; i < p_vertex_array->transfer_worker_indices.size(); i++) {
+ _check_transfer_worker_operation(p_vertex_array->transfer_worker_indices[i], p_vertex_array->transfer_worker_operations[i]);
+ }
+
+ p_vertex_array->transfer_worker_indices.clear();
+ p_vertex_array->transfer_worker_operations.clear();
+ }
+}
+
+void RenderingDevice::_check_transfer_worker_index_array(IndexArray *p_index_array) {
+ if (p_index_array->transfer_worker_index >= 0) {
+ _check_transfer_worker_operation(p_index_array->transfer_worker_index, p_index_array->transfer_worker_operation);
+ p_index_array->transfer_worker_index = -1;
+ }
+}
+
+void RenderingDevice::_submit_transfer_workers(bool p_operations_used_by_draw) {
+ MutexLock transfer_worker_lock(transfer_worker_pool_mutex);
+ for (TransferWorker *worker : transfer_worker_pool) {
+ if (p_operations_used_by_draw) {
+ MutexLock lock(worker->operations_mutex);
+ if (worker->operations_processed >= transfer_worker_operation_used_by_draw[worker->index]) {
+ // The operation used by the draw has already been processed, we don't need to wait on the worker.
+ continue;
+ }
+ }
+
+ {
+ MutexLock lock(worker->thread_mutex);
+ if (worker->recording) {
+ _end_transfer_worker(worker);
+ _submit_transfer_worker(worker, true);
+ }
+ }
+ }
+}
+
+void RenderingDevice::_wait_for_transfer_workers() {
+ MutexLock transfer_worker_lock(transfer_worker_pool_mutex);
+ for (TransferWorker *worker : transfer_worker_pool) {
+ MutexLock lock(worker->thread_mutex);
+ if (worker->submitted) {
+ _wait_for_transfer_worker(worker);
+ }
+ }
+}
+
+void RenderingDevice::_free_transfer_workers() {
+ MutexLock transfer_worker_lock(transfer_worker_pool_mutex);
+ for (TransferWorker *worker : transfer_worker_pool) {
+ driver->semaphore_free(worker->command_semaphore);
+ driver->fence_free(worker->command_fence);
+ driver->buffer_free(worker->staging_buffer);
+ driver->command_pool_free(worker->command_pool);
+ memdelete(worker);
+ }
+
+ transfer_worker_pool.clear();
+}
+
/***********************/
/**** COMMAND GRAPH ****/
/***********************/
@@ -4851,7 +5382,7 @@ bool RenderingDevice::_dependency_make_mutable(RID p_id, RID p_resource_id, RDG:
}
}
-bool RenderingDevice::_dependencies_make_mutable(RID p_id, RDG::ResourceTracker *p_resource_tracker) {
+bool RenderingDevice::_dependencies_make_mutable_recursive(RID p_id, RDG::ResourceTracker *p_resource_tracker) {
bool made_mutable = false;
HashMap<RID, HashSet<RID>>::Iterator E = dependency_map.find(p_id);
if (E) {
@@ -4863,12 +5394,17 @@ bool RenderingDevice::_dependencies_make_mutable(RID p_id, RDG::ResourceTracker
return made_mutable;
}
+bool RenderingDevice::_dependencies_make_mutable(RID p_id, RDG::ResourceTracker *p_resource_tracker) {
+ _THREAD_SAFE_METHOD_
+ return _dependencies_make_mutable_recursive(p_id, p_resource_tracker);
+}
+
/**************************/
/**** FRAME MANAGEMENT ****/
/**************************/
void RenderingDevice::free(RID p_id) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD();
_free_dependencies(p_id); // Recursively erase dependencies first, to avoid potential API problems.
_free_internal(p_id);
@@ -4886,6 +5422,8 @@ void RenderingDevice::_free_internal(RID p_id) {
// Push everything so it's disposed of next time this frame index is processed (means, it's safe to do it).
if (texture_owner.owns(p_id)) {
Texture *texture = texture_owner.get_or_null(p_id);
+ _check_transfer_worker_texture(texture);
+
RDG::ResourceTracker *draw_tracker = texture->draw_tracker;
if (draw_tracker != nullptr) {
draw_tracker->reference_count--;
@@ -4919,6 +5457,8 @@ void RenderingDevice::_free_internal(RID p_id) {
sampler_owner.free(p_id);
} else if (vertex_buffer_owner.owns(p_id)) {
Buffer *vertex_buffer = vertex_buffer_owner.get_or_null(p_id);
+ _check_transfer_worker_buffer(vertex_buffer);
+
RDG::resource_tracker_free(vertex_buffer->draw_tracker);
frames[frame].buffers_to_dispose_of.push_back(*vertex_buffer);
vertex_buffer_owner.free(p_id);
@@ -4926,6 +5466,8 @@ void RenderingDevice::_free_internal(RID p_id) {
vertex_array_owner.free(p_id);
} else if (index_buffer_owner.owns(p_id)) {
IndexBuffer *index_buffer = index_buffer_owner.get_or_null(p_id);
+ _check_transfer_worker_buffer(index_buffer);
+
RDG::resource_tracker_free(index_buffer->draw_tracker);
frames[frame].buffers_to_dispose_of.push_back(*index_buffer);
index_buffer_owner.free(p_id);
@@ -4939,16 +5481,22 @@ void RenderingDevice::_free_internal(RID p_id) {
shader_owner.free(p_id);
} else if (uniform_buffer_owner.owns(p_id)) {
Buffer *uniform_buffer = uniform_buffer_owner.get_or_null(p_id);
+ _check_transfer_worker_buffer(uniform_buffer);
+
RDG::resource_tracker_free(uniform_buffer->draw_tracker);
frames[frame].buffers_to_dispose_of.push_back(*uniform_buffer);
uniform_buffer_owner.free(p_id);
} else if (texture_buffer_owner.owns(p_id)) {
Buffer *texture_buffer = texture_buffer_owner.get_or_null(p_id);
+ _check_transfer_worker_buffer(texture_buffer);
+
RDG::resource_tracker_free(texture_buffer->draw_tracker);
frames[frame].buffers_to_dispose_of.push_back(*texture_buffer);
texture_buffer_owner.free(p_id);
} else if (storage_buffer_owner.owns(p_id)) {
Buffer *storage_buffer = storage_buffer_owner.get_or_null(p_id);
+ _check_transfer_worker_buffer(storage_buffer);
+
RDG::resource_tracker_free(storage_buffer->draw_tracker);
frames[frame].buffers_to_dispose_of.push_back(*storage_buffer);
storage_buffer_owner.free(p_id);
@@ -4980,6 +5528,8 @@ void RenderingDevice::_free_internal(RID p_id) {
// The full list of resources that can be named is in the VkObjectType enum.
// We just expose the resources that are owned and can be accessed easily.
void RenderingDevice::set_resource_name(RID p_id, const String &p_name) {
+ _THREAD_SAFE_METHOD_
+
if (texture_owner.owns(p_id)) {
Texture *texture = texture_owner.get_or_null(p_id);
driver->set_object_name(RDD::OBJECT_TYPE_TEXTURE, texture->driver_id, p_name);
@@ -5026,6 +5576,8 @@ void RenderingDevice::set_resource_name(RID p_id, const String &p_name) {
}
void RenderingDevice::draw_command_begin_label(String p_label_name, const Color &p_color) {
+ ERR_RENDER_THREAD_GUARD();
+
if (!context->is_debug_utils_enabled()) {
return;
}
@@ -5040,6 +5592,8 @@ void RenderingDevice::draw_command_insert_label(String p_label_name, const Color
#endif
void RenderingDevice::draw_command_end_label() {
+ ERR_RENDER_THREAD_GUARD();
+
draw_graph.end_label();
}
@@ -5072,7 +5626,7 @@ String RenderingDevice::get_device_pipeline_cache_uuid() const {
}
void RenderingDevice::swap_buffers() {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD();
_end_frame();
_execute_frame(true);
@@ -5083,18 +5637,20 @@ void RenderingDevice::swap_buffers() {
}
void RenderingDevice::submit() {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD();
ERR_FAIL_COND_MSG(is_main_instance, "Only local devices can submit and sync.");
ERR_FAIL_COND_MSG(local_device_processing, "device already submitted, call sync to wait until done.");
+
_end_frame();
_execute_frame(false);
local_device_processing = true;
}
void RenderingDevice::sync() {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD();
ERR_FAIL_COND_MSG(is_main_instance, "Only local devices can submit and sync.");
ERR_FAIL_COND_MSG(!local_device_processing, "sync can only be called after a submit");
+
_begin_frame();
local_device_processing = false;
}
@@ -5207,17 +5763,16 @@ uint64_t RenderingDevice::get_memory_usage(MemoryType p_type) const {
void RenderingDevice::_begin_frame() {
// Before beginning this frame, wait on the fence if it was signaled to make sure its work is finished.
- if (frames[frame].draw_fence_signaled) {
- driver->fence_wait(frames[frame].draw_fence);
- frames[frame].draw_fence_signaled = false;
+ if (frames[frame].fence_signaled) {
+ driver->fence_wait(frames[frame].fence);
+ frames[frame].fence_signaled = false;
}
update_perf_report();
// Begin recording on the frame's command buffers.
driver->begin_segment(frame, frames_drawn++);
- driver->command_buffer_begin(frames[frame].setup_command_buffer);
- driver->command_buffer_begin(frames[frame].draw_command_buffer);
+ driver->command_buffer_begin(frames[frame].command_buffer);
// Reset the graph.
draw_graph.begin();
@@ -5233,7 +5788,7 @@ void RenderingDevice::_begin_frame() {
if (frames[frame].timestamp_count) {
driver->timestamp_query_pool_get_results(frames[frame].timestamp_pool, frames[frame].timestamp_count, frames[frame].timestamp_result_values.ptr());
- driver->command_timestamp_query_pool_reset(frames[frame].setup_command_buffer, frames[frame].timestamp_pool, frames[frame].timestamp_count);
+ driver->command_timestamp_query_pool_reset(frames[frame].command_buffer, frames[frame].timestamp_pool, frames[frame].timestamp_count);
SWAP(frames[frame].timestamp_names, frames[frame].timestamp_result_names);
SWAP(frames[frame].timestamp_cpu_values, frames[frame].timestamp_cpu_result_values);
}
@@ -5252,10 +5807,10 @@ void RenderingDevice::_end_frame() {
ERR_PRINT("Found open compute list at the end of the frame, this should never happen (further compute will likely not work).");
}
- driver->command_buffer_end(frames[frame].setup_command_buffer);
+ _submit_transfer_workers(true);
// The command buffer must be copied into a stack variable as the driver workarounds can change the command buffer in use.
- RDD::CommandBufferID command_buffer = frames[frame].draw_command_buffer;
+ RDD::CommandBufferID command_buffer = frames[frame].command_buffer;
draw_graph.end(RENDER_GRAPH_REORDER, RENDER_GRAPH_FULL_BARRIERS, command_buffer, frames[frame].command_buffer_pool);
driver->command_buffer_end(command_buffer);
driver->end_segment();
@@ -5268,9 +5823,6 @@ void RenderingDevice::_execute_frame(bool p_present) {
thread_local LocalVector<RDD::SwapChainID> swap_chains;
swap_chains.clear();
- // Execute the setup command buffer.
- driver->command_queue_execute_and_present(main_queue, {}, frames[frame].setup_command_buffer, frames[frame].setup_semaphore, {}, {});
-
// Execute command buffers and use semaphores to wait on the execution of the previous one. Normally there's only one command buffer,
// but driver workarounds can force situations where there'll be more.
uint32_t command_buffer_count = 1;
@@ -5280,7 +5832,9 @@ void RenderingDevice::_execute_frame(bool p_present) {
buffer_pool.buffers_used = 0;
}
- RDD::SemaphoreID wait_semaphore = frames[frame].setup_semaphore;
+ thread_local LocalVector<RDD::SemaphoreID> wait_semaphores;
+ wait_semaphores = frames[frame].semaphores_to_wait_on;
+
for (uint32_t i = 0; i < command_buffer_count; i++) {
RDD::CommandBufferID command_buffer;
RDD::SemaphoreID signal_semaphore;
@@ -5289,14 +5843,14 @@ void RenderingDevice::_execute_frame(bool p_present) {
command_buffer = buffer_pool.buffers[i - 1];
signal_semaphore = buffer_pool.semaphores[i - 1];
} else {
- command_buffer = frames[frame].draw_command_buffer;
- signal_semaphore = frames[frame].draw_semaphore;
+ command_buffer = frames[frame].command_buffer;
+ signal_semaphore = frames[frame].semaphore;
}
bool signal_semaphore_valid;
if (i == (command_buffer_count - 1)) {
// This is the last command buffer, it should signal the fence.
- signal_fence = frames[frame].draw_fence;
+ signal_fence = frames[frame].fence;
signal_semaphore_valid = false;
if (frame_can_present && separate_present_queue) {
@@ -5311,19 +5865,21 @@ void RenderingDevice::_execute_frame(bool p_present) {
signal_semaphore_valid = true;
}
- driver->command_queue_execute_and_present(main_queue, wait_semaphore, command_buffer, signal_semaphore_valid ? signal_semaphore : VectorView<RDD::SemaphoreID>(), signal_fence, swap_chains);
+ driver->command_queue_execute_and_present(main_queue, wait_semaphores, command_buffer, signal_semaphore_valid ? signal_semaphore : VectorView<RDD::SemaphoreID>(), signal_fence, swap_chains);
// Make the next command buffer wait on the semaphore signaled by this one.
- wait_semaphore = signal_semaphore;
+ wait_semaphores.resize(1);
+ wait_semaphores[0] = signal_semaphore;
}
// Indicate the fence has been signaled so the next time the frame's contents need to be used, the CPU needs to wait on the work to be completed.
- frames[frame].draw_fence_signaled = true;
+ frames[frame].semaphores_to_wait_on.clear();
+ frames[frame].fence_signaled = true;
if (frame_can_present) {
if (separate_present_queue) {
// Issue the presentation separately if the presentation queue is different from the main queue.
- driver->command_queue_execute_and_present(present_queue, wait_semaphore, {}, {}, {}, frames[frame].swap_chains_to_present);
+ driver->command_queue_execute_and_present(present_queue, wait_semaphores, {}, {}, {}, frames[frame].swap_chains_to_present);
}
frames[frame].swap_chains_to_present.clear();
@@ -5332,9 +5888,9 @@ void RenderingDevice::_execute_frame(bool p_present) {
void RenderingDevice::_stall_for_previous_frames() {
for (uint32_t i = 0; i < frames.size(); i++) {
- if (frames[i].draw_fence_signaled) {
- driver->fence_wait(frames[i].draw_fence);
- frames[i].draw_fence_signaled = false;
+ if (frames[i].fence_signaled) {
+ driver->fence_wait(frames[i].fence);
+ frames[i].fence_signaled = false;
}
}
}
@@ -5347,8 +5903,9 @@ void RenderingDevice::_flush_and_stall_for_all_frames() {
}
Error RenderingDevice::initialize(RenderingContextDriver *p_context, DisplayServer::WindowID p_main_window) {
- Error err;
+ ERR_RENDER_THREAD_GUARD_V(ERR_UNAVAILABLE);
+ Error err;
RenderingContextDriver::SurfaceID main_surface = 0;
is_main_instance = (singleton == this) && (p_main_window != DisplayServer::INVALID_WINDOW_ID);
if (p_main_window != DisplayServer::INVALID_WINDOW_ID) {
@@ -5436,12 +5993,25 @@ Error RenderingDevice::initialize(RenderingContextDriver *p_context, DisplayServ
main_queue = driver->command_queue_create(main_queue_family, true);
ERR_FAIL_COND_V(!main_queue, FAILED);
+ transfer_queue_family = driver->command_queue_family_get(RDD::COMMAND_QUEUE_FAMILY_TRANSFER_BIT);
+ if (transfer_queue_family) {
+ // Create the transfer queue.
+ transfer_queue = driver->command_queue_create(transfer_queue_family);
+ ERR_FAIL_COND_V(!transfer_queue, FAILED);
+ } else {
+ // Use main queue as the transfer queue.
+ transfer_queue = main_queue;
+ transfer_queue_family = main_queue_family;
+ }
+
if (present_queue_family) {
- // Create the presentation queue.
+ // Create the present queue.
present_queue = driver->command_queue_create(present_queue_family);
ERR_FAIL_COND_V(!present_queue, FAILED);
} else {
+ // Use main queue as the present queue.
present_queue = main_queue;
+ present_queue_family = main_queue_family;
}
// Create data for all the frames.
@@ -5451,17 +6021,13 @@ Error RenderingDevice::initialize(RenderingContextDriver *p_context, DisplayServ
// Create command pool, command buffers, semaphores and fences.
frames[i].command_pool = driver->command_pool_create(main_queue_family, RDD::COMMAND_BUFFER_TYPE_PRIMARY);
ERR_FAIL_COND_V(!frames[i].command_pool, FAILED);
- frames[i].setup_command_buffer = driver->command_buffer_create(frames[i].command_pool);
- ERR_FAIL_COND_V(!frames[i].setup_command_buffer, FAILED);
- frames[i].draw_command_buffer = driver->command_buffer_create(frames[i].command_pool);
- ERR_FAIL_COND_V(!frames[i].draw_command_buffer, FAILED);
- frames[i].setup_semaphore = driver->semaphore_create();
- ERR_FAIL_COND_V(!frames[i].setup_semaphore, FAILED);
- frames[i].draw_semaphore = driver->semaphore_create();
- ERR_FAIL_COND_V(!frames[i].draw_semaphore, FAILED);
- frames[i].draw_fence = driver->fence_create();
- ERR_FAIL_COND_V(!frames[i].draw_fence, FAILED);
- frames[i].draw_fence_signaled = false;
+ frames[i].command_buffer = driver->command_buffer_create(frames[i].command_pool);
+ ERR_FAIL_COND_V(!frames[i].command_buffer, FAILED);
+ frames[i].semaphore = driver->semaphore_create();
+ ERR_FAIL_COND_V(!frames[i].semaphore, FAILED);
+ frames[i].fence = driver->fence_create();
+ ERR_FAIL_COND_V(!frames[i].fence, FAILED);
+ frames[i].fence_signaled = false;
// Create query pool.
frames[i].timestamp_pool = driver->timestamp_query_pool_create(max_timestamp_query_elements);
@@ -5482,8 +6048,7 @@ Error RenderingDevice::initialize(RenderingContextDriver *p_context, DisplayServ
// Initialize recording on the first frame.
driver->begin_segment(frame, frames_drawn++);
- driver->command_buffer_begin(frames[0].setup_command_buffer);
- driver->command_buffer_begin(frames[0].draw_command_buffer);
+ driver->command_buffer_begin(frames[0].command_buffer);
// Create draw graph and start it initialized as well.
draw_graph.initialize(driver, device, frames.size(), main_queue_family, SECONDARY_COMMAND_BUFFERS_PER_FRAME);
@@ -5491,7 +6056,7 @@ Error RenderingDevice::initialize(RenderingContextDriver *p_context, DisplayServ
for (uint32_t i = 0; i < frames.size(); i++) {
// Reset all queries in a query pool before doing any operations with them.
- driver->command_timestamp_query_pool_reset(frames[0].setup_command_buffer, frames[i].timestamp_pool, max_timestamp_query_elements);
+ driver->command_timestamp_query_pool_reset(frames[0].command_buffer, frames[i].timestamp_pool, max_timestamp_query_elements);
}
// Convert block size from KB.
@@ -5522,6 +6087,9 @@ Error RenderingDevice::initialize(RenderingContextDriver *p_context, DisplayServ
ERR_FAIL_COND_V(err, FAILED);
}
+ // TODO: How should this max size be determined?
+ transfer_worker_pool_max_size = OS::get_singleton()->get_processor_count();
+
draw_list = nullptr;
compute_list = nullptr;
@@ -5560,6 +6128,8 @@ Vector<uint8_t> RenderingDevice::_load_pipeline_cache() {
}
void RenderingDevice::_update_pipeline_cache(bool p_closing) {
+ _THREAD_SAFE_METHOD_
+
{
bool still_saving = pipeline_cache_save_task != WorkerThreadPool::INVALID_TASK_ID && !WorkerThreadPool::get_singleton()->is_task_completed(pipeline_cache_save_task);
if (still_saving) {
@@ -5641,6 +6211,8 @@ void RenderingDevice::_free_rids(T &p_owner, const char *p_type) {
}
void RenderingDevice::capture_timestamp(const String &p_name) {
+ ERR_RENDER_THREAD_GUARD();
+
ERR_FAIL_COND_MSG(draw_list != nullptr && draw_list->state.draw_count > 0, "Capturing timestamps during draw list creation is not allowed. Offending timestamp was: " + p_name);
ERR_FAIL_COND_MSG(compute_list != nullptr && compute_list->state.dispatch_count > 0, "Capturing timestamps during compute list creation is not allowed. Offending timestamp was: " + p_name);
ERR_FAIL_COND_MSG(frames[frame].timestamp_count >= max_timestamp_query_elements, vformat("Tried capturing more timestamps than the configured maximum (%d). You can increase this limit in the project settings under 'Debug/Settings' called 'Max Timestamp Query Elements'.", max_timestamp_query_elements));
@@ -5653,7 +6225,7 @@ void RenderingDevice::capture_timestamp(const String &p_name) {
}
uint64_t RenderingDevice::get_driver_resource(DriverResource p_resource, RID p_rid, uint64_t p_index) {
- _THREAD_SAFE_METHOD_
+ ERR_RENDER_THREAD_GUARD_V(0);
uint64_t driver_id = 0;
switch (p_resource) {
@@ -5769,19 +6341,23 @@ uint64_t RenderingDevice::get_device_allocs_by_object_type(uint32_t type) const
}
uint32_t RenderingDevice::get_captured_timestamps_count() const {
+ ERR_RENDER_THREAD_GUARD_V(0);
return frames[frame].timestamp_result_count;
}
uint64_t RenderingDevice::get_captured_timestamps_frame() const {
+ ERR_RENDER_THREAD_GUARD_V(0);
return frames[frame].index;
}
uint64_t RenderingDevice::get_captured_timestamp_gpu_time(uint32_t p_index) const {
+ ERR_RENDER_THREAD_GUARD_V(0);
ERR_FAIL_UNSIGNED_INDEX_V(p_index, frames[frame].timestamp_result_count, 0);
return driver->timestamp_query_result_to_time(frames[frame].timestamp_result_values[p_index]);
}
uint64_t RenderingDevice::get_captured_timestamp_cpu_time(uint32_t p_index) const {
+ ERR_RENDER_THREAD_GUARD_V(0);
ERR_FAIL_UNSIGNED_INDEX_V(p_index, frames[frame].timestamp_result_count, 0);
return frames[frame].timestamp_cpu_result_values[p_index];
}
@@ -5796,11 +6372,17 @@ uint64_t RenderingDevice::limit_get(Limit p_limit) const {
}
void RenderingDevice::finalize() {
+ ERR_RENDER_THREAD_GUARD();
+
if (!frames.is_empty()) {
// Wait for all frames to have finished rendering.
_flush_and_stall_for_all_frames();
}
+ // Wait for transfer workers to finish.
+ _submit_transfer_workers(false);
+ _wait_for_transfer_workers();
+
// Delete everything the graph has created.
draw_graph.finalize();
@@ -5854,15 +6436,17 @@ void RenderingDevice::finalize() {
}
}
+ // Erase the transfer workers after all resources have been freed.
+ _free_transfer_workers();
+
// Free everything pending.
for (uint32_t i = 0; i < frames.size(); i++) {
int f = (frame + i) % frames.size();
_free_pending_resources(f);
driver->command_pool_free(frames[i].command_pool);
driver->timestamp_query_pool_free(frames[i].timestamp_pool);
- driver->semaphore_free(frames[i].setup_semaphore);
- driver->semaphore_free(frames[i].draw_semaphore);
- driver->fence_free(frames[i].draw_fence);
+ driver->semaphore_free(frames[i].semaphore);
+ driver->fence_free(frames[i].fence);
RDG::CommandBufferPool &buffer_pool = frames[i].command_buffer_pool;
for (uint32_t j = 0; j < buffer_pool.buffers.size(); j++) {
@@ -5909,6 +6493,15 @@ void RenderingDevice::finalize() {
present_queue = RDD::CommandQueueID();
}
+ if (transfer_queue) {
+ if (main_queue != transfer_queue) {
+ // Only delete the transfer queue if it's unique.
+ driver->command_queue_free(transfer_queue);
+ }
+
+ transfer_queue = RDD::CommandQueueID();
+ }
+
if (main_queue) {
driver->command_queue_free(main_queue);
main_queue = RDD::CommandQueueID();
@@ -6644,6 +7237,8 @@ RenderingDevice::RenderingDevice() {
if (singleton == nullptr) {
singleton = this;
}
+
+ render_thread_id = Thread::get_caller_id();
}
/*****************/