5 files changed, 132 insertions, 27 deletions
diff --git a/core/io/resource_loader.cpp b/core/io/resource_loader.cpp
index ed5e482296..58ad61b621 100644
--- a/core/io/resource_loader.cpp
+++ b/core/io/resource_loader.cpp
@@ -40,6 +40,7 @@
 #include "core/string/print_string.h"
 #include "core/string/translation.h"
 #include "core/variant/variant_parser.h"
+#include "servers/rendering_server.h"
 
 #ifdef DEBUG_LOAD_THREADED
 #define print_lt(m_text) print_line(m_text)
@@ -585,6 +586,16 @@ ResourceLoader::ThreadLoadStatus ResourceLoader::load_threaded_get_status(const
 		*r_progress = _dependency_get_progress(local_path);
 	}
 
+	// Support userland polling in a loop on the main thread.
+	if (Thread::is_main_thread() && status == THREAD_LOAD_IN_PROGRESS) {
+		uint64_t frame = Engine::get_singleton()->get_process_frames();
+		if (frame == load_task.last_progress_check_main_thread_frame) {
+			_ensure_load_progress();
+		} else {
+			load_task.last_progress_check_main_thread_frame = frame;
+		}
+	}
+
 	return status;
 }
 
@@ -613,6 +624,21 @@ Ref<Resource> ResourceLoader::load_threaded_get(const String &p_path, Error *r_e
 			}
 			return Ref<Resource>();
 		}
+
+		// Support userland requesting on the main thread before the load is reported to be complete.
+		if (Thread::is_main_thread() && !load_token->local_path.is_empty()) {
+			const ThreadLoadTask &load_task = thread_load_tasks[load_token->local_path];
+			while (load_task.status == THREAD_LOAD_IN_PROGRESS) {
+				if (!_ensure_load_progress()) {
+					// This local poll loop is not needed.
+					break;
+				}
+				thread_load_lock.~MutexLock();
+				OS::get_singleton()->delay_usec(1000);
+				new (&thread_load_lock) MutexLock(thread_load_mutex);
+			}
+		}
+
 		res = _load_complete_inner(*load_token, r_error, thread_load_lock);
 		if (load_token->unreference()) {
 			memdelete(load_token);
@@ -731,6 +757,17 @@ Ref<Resource> ResourceLoader::_load_complete_inner(LoadToken &p_load_token, Erro
 	}
 }
 
+bool ResourceLoader::_ensure_load_progress() {
+	// Some servers may need a new engine iteration to allow the load to progress.
+	// Since the only known one is the rendering server (in single thread mode), let's keep it simple and just sync it.
+	// This may be refactored in the future to support other servers and have less coupling.
+	if (OS::get_singleton()->get_render_thread_mode() == OS::RENDER_SEPARATE_THREAD) {
+		return false; // Not needed.
+	}
+	RenderingServer::get_singleton()->sync();
+	return true;
+}
+
 Ref<Resource> ResourceLoader::ensure_resource_ref_override_for_outer_load(const String &p_path, const String &p_res_type) {
 	ERR_FAIL_COND_V(load_nesting == 0, Ref<Resource>()); // It makes no sense to use this from nesting level 0.
 	const String &local_path = _validate_local_path(p_path);
diff --git a/core/io/resource_loader.h b/core/io/resource_loader.h
index c48f39b5cc..46df79ea22 100644
--- a/core/io/resource_loader.h
+++ b/core/io/resource_loader.h
@@ -174,6 +174,7 @@ private:
 		String type_hint;
 		float progress = 0.0f;
 		float max_reported_progress = 0.0f;
+		uint64_t last_progress_check_main_thread_frame = UINT64_MAX;
 		ThreadLoadStatus status = THREAD_LOAD_IN_PROGRESS;
 		ResourceFormatLoader::CacheMode cache_mode = ResourceFormatLoader::CACHE_MODE_REUSE;
 		Error error = OK;
@@ -197,6 +198,8 @@ private:
 
 	static float _dependency_get_progress(const String &p_path);
 
+	static bool _ensure_load_progress();
+
 public:
 	static Error load_threaded_request(const String &p_path, const String &p_type_hint = "", bool p_use_sub_threads = false, ResourceFormatLoader::CacheMode p_cache_mode = ResourceFormatLoader::CACHE_MODE_REUSE);
 	static ThreadLoadStatus load_threaded_get_status(const String &p_path, float *r_progress = nullptr);
diff --git a/core/object/worker_thread_pool.cpp b/core/object/worker_thread_pool.cpp
index a7c0a0353e..8f56ca37de 100644
--- a/core/object/worker_thread_pool.cpp
+++ b/core/object/worker_thread_pool.cpp
@@ -33,7 +33,6 @@
 #include "core/object/script_language.h"
 #include "core/os/os.h"
 #include "core/os/thread_safe.h"
-#include "core/templates/command_queue_mt.h"
 
 WorkerThreadPool::Task *const WorkerThreadPool::ThreadData::YIELDING = (Task *)1;
 
@@ -46,7 +45,9 @@ void WorkerThreadPool::Task::free_template_userdata() {
 
 WorkerThreadPool *WorkerThreadPool::singleton = nullptr;
 
-thread_local CommandQueueMT *WorkerThreadPool::flushing_cmd_queue = nullptr;
+#ifdef THREADS_ENABLED
+thread_local uintptr_t WorkerThreadPool::unlockable_mutexes[MAX_UNLOCKABLE_MUTEXES] = {};
+#endif
 
 void WorkerThreadPool::_process_task(Task *p_task) {
 #ifdef THREADS_ENABLED
@@ -419,6 +420,34 @@ Error WorkerThreadPool::wait_for_task_completion(TaskID p_task_id) {
 	return OK;
 }
 
+void WorkerThreadPool::_lock_unlockable_mutexes() {
+#ifdef THREADS_ENABLED
+	for (uint32_t i = 0; i < MAX_UNLOCKABLE_MUTEXES; i++) {
+		if (unlockable_mutexes[i]) {
+			if ((((uintptr_t)unlockable_mutexes[i]) & 1) == 0) {
+				((Mutex *)unlockable_mutexes[i])->lock();
+			} else {
+				((BinaryMutex *)unlockable_mutexes[i])->lock();
+			}
+		}
+	}
+#endif
+}
+
+void WorkerThreadPool::_unlock_unlockable_mutexes() {
+#ifdef THREADS_ENABLED
+	for (uint32_t i = 0; i < MAX_UNLOCKABLE_MUTEXES; i++) {
+		if (unlockable_mutexes[i]) {
+			if ((((uintptr_t)unlockable_mutexes[i]) & 1) == 0) {
+				((Mutex *)unlockable_mutexes[i])->unlock();
+			} else {
+				((BinaryMutex *)unlockable_mutexes[i])->unlock();
+			}
+		}
+	}
+#endif
+}
+
 void WorkerThreadPool::_wait_collaboratively(ThreadData *p_caller_pool_thread, Task *p_task) {
 	// Keep processing tasks until the condition to stop waiting is met.
 
@@ -426,6 +455,7 @@ void WorkerThreadPool::_wait_collaboratively(ThreadData *p_caller_pool_thread, T
 
 	while (true) {
 		Task *task_to_process = nullptr;
+		bool relock_unlockables = false;
 		{
 			MutexLock lock(task_mutex);
 			bool was_signaled = p_caller_pool_thread->signaled;
@@ -463,13 +493,9 @@ void WorkerThreadPool::_wait_collaboratively(ThreadData *p_caller_pool_thread, T
 				if (!task_to_process) {
 					p_caller_pool_thread->awaited_task = p_task;
 
-					if (flushing_cmd_queue) {
-						flushing_cmd_queue->unlock();
-					}
+					_unlock_unlockable_mutexes();
+					relock_unlockables = true;
 					p_caller_pool_thread->cond_var.wait(lock);
-					if (flushing_cmd_queue) {
-						flushing_cmd_queue->lock();
-					}
 
 					DEV_ASSERT(exit_threads || p_caller_pool_thread->signaled || IS_WAIT_OVER);
 					p_caller_pool_thread->awaited_task = nullptr;
@@ -477,6 +503,10 @@ void WorkerThreadPool::_wait_collaboratively(ThreadData *p_caller_pool_thread, T
 			}
 		}
 
+		if (relock_unlockables) {
+			_lock_unlockable_mutexes();
+		}
+
 		if (task_to_process) {
 			_process_task(task_to_process);
 		}
@@ -603,13 +633,9 @@ void WorkerThreadPool::wait_for_group_task_completion(GroupID p_group) {
 	{
 		Group *group = *groupp;
 
-		if (flushing_cmd_queue) {
-			flushing_cmd_queue->unlock();
-		}
+		_unlock_unlockable_mutexes();
 		group->done_semaphore.wait();
-		if (flushing_cmd_queue) {
-			flushing_cmd_queue->lock();
-		}
+		_lock_unlockable_mutexes();
 
 		uint32_t max_users = group->tasks_used + 1; // Add 1 because the thread waiting for it is also user. Read before to avoid another thread freeing task after increment.
 		uint32_t finished_users = group->finished.increment(); // fetch happens before inc, so increment later.
@@ -633,16 +659,41 @@ int WorkerThreadPool::get_thread_index() {
 	return singleton->thread_ids.has(tid) ? singleton->thread_ids[tid] : -1;
 }
 
-void WorkerThreadPool::thread_enter_command_queue_mt_flush(CommandQueueMT *p_queue) {
-	ERR_FAIL_COND(flushing_cmd_queue != nullptr);
-	flushing_cmd_queue = p_queue;
+#ifdef THREADS_ENABLED
+uint32_t WorkerThreadPool::thread_enter_unlock_allowance_zone(Mutex *p_mutex) {
+	return _thread_enter_unlock_allowance_zone(p_mutex, false);
+}
+
+uint32_t WorkerThreadPool::thread_enter_unlock_allowance_zone(BinaryMutex *p_mutex) {
+	return _thread_enter_unlock_allowance_zone(p_mutex, true);
 }
 
-void WorkerThreadPool::thread_exit_command_queue_mt_flush() {
-	ERR_FAIL_NULL(flushing_cmd_queue);
-	flushing_cmd_queue = nullptr;
+uint32_t WorkerThreadPool::_thread_enter_unlock_allowance_zone(void *p_mutex, bool p_is_binary) {
+	for (uint32_t i = 0; i < MAX_UNLOCKABLE_MUTEXES; i++) {
+		if (unlikely(unlockable_mutexes[i] == (uintptr_t)p_mutex)) {
+			// Already registered in the current thread.
+			return UINT32_MAX;
+		}
+		if (!unlockable_mutexes[i]) {
+			unlockable_mutexes[i] = (uintptr_t)p_mutex;
+			if (p_is_binary) {
+				unlockable_mutexes[i] |= 1;
+			}
+			return i;
+		}
+	}
+	ERR_FAIL_V_MSG(UINT32_MAX, "No more unlockable mutex slots available. Engine bug.");
 }
 
+void WorkerThreadPool::thread_exit_unlock_allowance_zone(uint32_t p_zone_id) {
+	if (p_zone_id == UINT32_MAX) {
+		return;
+	}
+	DEV_ASSERT(unlockable_mutexes[p_zone_id]);
+	unlockable_mutexes[p_zone_id] = 0;
+}
+#endif
+
 void WorkerThreadPool::init(int p_thread_count, float p_low_priority_task_ratio) {
 	ERR_FAIL_COND(threads.size() > 0);
 	if (p_thread_count < 0) {
diff --git a/core/object/worker_thread_pool.h b/core/object/worker_thread_pool.h
index a9cf260a0f..8774143abf 100644
--- a/core/object/worker_thread_pool.h
+++ b/core/object/worker_thread_pool.h
@@ -41,8 +41,6 @@
 #include "core/templates/rid.h"
 #include "core/templates/safe_refcount.h"
 
-class CommandQueueMT;
-
 class WorkerThreadPool : public Object {
 	GDCLASS(WorkerThreadPool, Object)
 public:
@@ -163,7 +161,10 @@ private:
 
 	static WorkerThreadPool *singleton;
 
-	static thread_local CommandQueueMT *flushing_cmd_queue;
+#ifdef THREADS_ENABLED
+	static const uint32_t MAX_UNLOCKABLE_MUTEXES = 2;
+	static thread_local uintptr_t unlockable_mutexes[MAX_UNLOCKABLE_MUTEXES];
+#endif
 
 	TaskID _add_task(const Callable &p_callable, void (*p_func)(void *), void *p_userdata, BaseTemplateUserdata *p_template_userdata, bool p_high_priority, const String &p_description);
 	GroupID _add_group_task(const Callable &p_callable, void (*p_func)(void *, uint32_t), void *p_userdata, BaseTemplateUserdata *p_template_userdata, int p_elements, int p_tasks, bool p_high_priority, const String &p_description);
@@ -190,6 +191,13 @@ private:
 
 	void _wait_collaboratively(ThreadData *p_caller_pool_thread, Task *p_task);
 
+#ifdef THREADS_ENABLED
+	static uint32_t _thread_enter_unlock_allowance_zone(void *p_mutex, bool p_is_binary);
+#endif
+
+	void _lock_unlockable_mutexes();
+	void _unlock_unlockable_mutexes();
+
 protected:
 	static void _bind_methods();
 
@@ -232,8 +240,14 @@ public:
 	static WorkerThreadPool *get_singleton() { return singleton; }
 	static int get_thread_index();
 
-	static void thread_enter_command_queue_mt_flush(CommandQueueMT *p_queue);
-	static void thread_exit_command_queue_mt_flush();
+#ifdef THREADS_ENABLED
+	static uint32_t thread_enter_unlock_allowance_zone(Mutex *p_mutex);
+	static uint32_t thread_enter_unlock_allowance_zone(BinaryMutex *p_mutex);
+	static void thread_exit_unlock_allowance_zone(uint32_t p_zone_id);
+#else
+	static uint32_t thread_enter_unlock_allowance_zone(void *p_mutex) { return UINT32_MAX; }
+	static void thread_exit_unlock_allowance_zone(uint32_t p_zone_id) {}
+#endif
 
 	void init(int p_thread_count = -1, float p_low_priority_task_ratio = 0.3);
 	void finish();
diff --git a/core/templates/command_queue_mt.h b/core/templates/command_queue_mt.h
index 349404d75b..0748e9cb83 100644
--- a/core/templates/command_queue_mt.h
+++ b/core/templates/command_queue_mt.h
@@ -364,7 +364,7 @@ class CommandQueueMT {
 
 		lock();
 
-		WorkerThreadPool::thread_enter_command_queue_mt_flush(this);
+		uint32_t allowance_id = WorkerThreadPool::thread_enter_unlock_allowance_zone(&mutex);
 		while (flush_read_ptr < command_mem.size()) {
 			uint64_t size = *(uint64_t *)&command_mem[flush_read_ptr];
 			flush_read_ptr += 8;
@@ -383,7 +383,7 @@ class CommandQueueMT {
 
 			flush_read_ptr += size;
 		}
-		WorkerThreadPool::thread_exit_command_queue_mt_flush();
+		WorkerThreadPool::thread_exit_unlock_allowance_zone(allowance_id);
 
 		command_mem.clear();
 		flush_read_ptr = 0;