Diffstat (limited to 'thirdparty/embree/kernels/common/alloc.h')
-rw-r--r--  thirdparty/embree/kernels/common/alloc.h  253
1 file changed, 133 insertions(+), 120 deletions(-)
diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h
index 12769df2c8..840d48c327 100644
--- a/thirdparty/embree/kernels/common/alloc.h
+++ b/thirdparty/embree/kernels/common/alloc.h
@@ -6,11 +6,9 @@
#include "default.h"
#include "device.h"
#include "scene.h"
-#include "primref.h"
+#include "../builders/primref.h"
-#if defined(APPLE) && defined(__aarch64__)
-#include <mutex>
-#endif
+#include "../../common/tasking/taskscheduler.h"
namespace embree
{
@@ -18,7 +16,7 @@ namespace embree
{
/*! maximum supported alignment */
static const size_t maxAlignment = 64;
-
+
/*! maximum allocation size */
/* default settings */
@@ -39,14 +37,14 @@ namespace embree
public:
/*! Constructor for usage with ThreadLocalData */
- __forceinline ThreadLocal (ThreadLocal2* parent)
- : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}
+ __forceinline ThreadLocal (ThreadLocal2* parent)
+ : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}
/*! initialize allocator */
- void init(FastAllocator* alloc)
+ void init(FastAllocator* alloc)
{
ptr = nullptr;
- cur = end = 0;
+ cur = end = 0;
bytesUsed = 0;
bytesWasted = 0;
allocBlockSize = 0;
@@ -54,64 +52,62 @@ namespace embree
}
/* Allocate aligned memory from the thread's memory block. */
- __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
+ __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
{
/* bind the thread local allocator to the proper FastAllocator */
parent->bind(alloc);
assert(align <= maxAlignment);
- bytesUsed += bytes;
+ bytesUsed += bytes;
/* try to allocate in local block */
- size_t ofs = (align - cur) & (align-1);
+ size_t ofs = (align - cur) & (align-1);
cur += bytes + ofs;
if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
- cur -= bytes + ofs;
-
+ cur -= bytes + ofs;
+
/* if allocation is too large allocate with parent allocator */
if (4*bytes > allocBlockSize) {
return alloc->malloc(bytes,maxAlignment,false);
- }
+ }
/* get new partial block if allocation failed */
size_t blockSize = allocBlockSize;
ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
- bytesWasted += end-cur;
- cur = 0; end = blockSize;
+ bytesWasted += end-cur;
+ cur = 0; end = blockSize;
/* retry allocation */
- ofs = (align - cur) & (align-1);
+ ofs = (align - cur) & (align-1);
cur += bytes + ofs;
if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
- cur -= bytes + ofs;
+ cur -= bytes + ofs;
/* get new full block if allocation failed */
blockSize = allocBlockSize;
ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
- bytesWasted += end-cur;
- cur = 0; end = blockSize;
+ bytesWasted += end-cur;
+ cur = 0; end = blockSize;
/* retry allocation */
- ofs = (align - cur) & (align-1);
+ ofs = (align - cur) & (align-1);
cur += bytes + ofs;
if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
- cur -= bytes + ofs;
+ cur -= bytes + ofs;
/* should never happen as large allocations get handled specially above */
assert(false);
return nullptr;
}
-
- /*! returns amount of used bytes */
__forceinline size_t getUsedBytes() const { return bytesUsed; }
-
+
/*! returns amount of free bytes */
__forceinline size_t getFreeBytes() const { return end-cur; }
-
+
/*! returns amount of wasted bytes */
__forceinline size_t getWastedBytes() const { return bytesWasted; }
-
+
private:
ThreadLocal2* parent;
char* ptr; //!< pointer to memory block
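
The reindented allocation path above hinges on one line of bit arithmetic: with a power-of-two align, (align - cur) & (align - 1) is exactly the padding that moves the bump offset cur up to the next aligned position; if the padded allocation overruns end, the code falls back to a fresh partial block, then a fresh full block. A standalone sketch of the alignment arithmetic (the helper name is illustrative, not from the patch):

    #include <cassert>
    #include <cstddef>

    // Padding needed so that (cur + padding) is a multiple of align;
    // requires align to be a power of two.
    static size_t alignmentPadding(size_t cur, size_t align) {
      assert((align & (align - 1)) == 0);
      return (align - cur) & (align - 1);
    }

    int main() {
      assert(alignmentPadding(13, 16) == 3); // 13 + 3 == 16
      assert(alignmentPadding(32, 16) == 0); // already aligned
      return 0;
    }
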
@@ -136,11 +132,7 @@ namespace embree
{
assert(alloc_i);
if (alloc.load() == alloc_i) return;
-#if defined(APPLE) && defined(__aarch64__)
- std::scoped_lock lock(mutex);
-#else
- Lock<SpinLock> lock(mutex);
-#endif
+ Lock<MutexSys> lock(mutex);
//if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
if (alloc.load()) {
alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
@@ -158,11 +150,7 @@ namespace embree
{
assert(alloc_i);
if (alloc.load() != alloc_i) return;
-#if defined(APPLE) && defined(__aarch64__)
- std::scoped_lock lock(mutex);
-#else
- Lock<SpinLock> lock(mutex);
-#endif
+ Lock<MutexSys> lock(mutex);
if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
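
A note on why the recheck survives the lock swap: unbind can race with another thread's unbind, so the unlocked comparison is only a fast path and must be repeated once the mutex is held. The shape of the pattern in isolation (a sketch, not new code in this patch):

    if (alloc.load() != alloc_i) return; // unlocked fast path
    Lock<MutexSys> lock(mutex);
    if (alloc.load() != alloc_i) return; // recheck: another thread may have
                                         // unbound this allocator meanwhile
    // ... transfer the byte counters, then clear alloc ...
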
@@ -173,26 +161,47 @@ namespace embree
}
public:
-#if defined(APPLE) && defined(__aarch64__)
- std::mutex mutex;
-#else
- SpinLock mutex; //!< required as unbind is called from other threads
-#endif
+ MutexSys mutex;
std::atomic<FastAllocator*> alloc; //!< parent allocator
ThreadLocal alloc0;
ThreadLocal alloc1;
};
- FastAllocator (Device* device, bool osAllocation)
- : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
- growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
- primrefarray(device,0)
+ FastAllocator (Device* device,
+ bool osAllocation,
+ bool useUSM = false,
+ bool blockAllocation = true)
+ : device(device)
+ , slotMask(0)
+ , defaultBlockSize(PAGE_SIZE)
+ , estimatedSize(0)
+ , growSize(PAGE_SIZE)
+ , maxGrowSize(maxAllocationSize)
+ , usedBlocks(nullptr)
+ , freeBlocks(nullptr)
+ , useUSM(useUSM)
+ , blockAllocation(blockAllocation)
+ , use_single_mode(false)
+ , log2_grow_size_scale(0)
+ , bytesUsed(0)
+ , bytesFree(0)
+ , bytesWasted(0)
+ , atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC)
+ , primrefarray(device,0)
{
+ // -- GODOT start --
+ // if (osAllocation && useUSM)
+ // throw std::runtime_error("USM allocation cannot be combined with OS allocation.");
+ if (osAllocation && useUSM) {
+ abort();
+ }
+ // -- GODOT end --
+
for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
{
threadUsedBlocks[i] = nullptr;
threadBlocks[i] = nullptr;
- assert(!slotMutex[i].isLocked());
+ //assert(!slotMutex[i].isLocked());
}
}
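
The constructor gains two knobs: useUSM routes block memory through the device's unified shared memory, and blockAllocation = false forbids growing beyond the initially committed blocks. USM and OS allocation are mutually exclusive, hence the new guard, which Godot turns from a throw into an abort(). A hedged sketch of intended call sites (variable names are illustrative):

    // Host-side build, blocks via OS allocation:
    FastAllocator hostAlloc(device, /*osAllocation=*/true);

    // Device-visible build, USM-backed blocks:
    FastAllocator usmAlloc(device, /*osAllocation=*/false, /*useUSM=*/true);

    // Invalid: USM cannot be combined with OS allocation; in the Godot
    // build this abort()s instead of throwing std::runtime_error.
    // FastAllocator bad(device, true, true);
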
@@ -233,11 +242,7 @@ namespace embree
ThreadLocal2* alloc = thread_local_allocator2;
if (alloc == nullptr) {
thread_local_allocator2 = alloc = new ThreadLocal2;
-#if defined(APPLE) && defined(__aarch64__)
- std::scoped_lock lock(s_thread_local_allocators_lock);
-#else
- Lock<SpinLock> lock(s_thread_local_allocators_lock);
-#endif
+ Lock<MutexSys> lock(s_thread_local_allocators_lock);
s_thread_local_allocators.push_back(make_unique(alloc));
}
return alloc;
@@ -247,11 +252,7 @@ namespace embree
__forceinline void join(ThreadLocal2* alloc)
{
-#if defined(APPLE) && defined(__aarch64__)
- std::scoped_lock lock(s_thread_local_allocators_lock);
-#else
- Lock<SpinLock> lock(thread_local_allocators_lock);
-#endif
+ Lock<MutexSys> lock(s_thread_local_allocators_lock);
thread_local_allocators.push_back(alloc);
}
@@ -412,7 +413,7 @@ namespace embree
slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
if (bytesReserve == 0) bytesReserve = bytesAllocate;
- freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
+ freeBlocks = Block::create(device,useUSM,bytesAllocate,bytesReserve,nullptr,atype);
estimatedSize = bytesEstimate;
initGrowSizeAndNumSlots(bytesEstimate,true);
}
@@ -478,8 +479,8 @@ namespace embree
bytesUsed.store(0);
bytesFree.store(0);
bytesWasted.store(0);
- if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr;
- if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr;
+ if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device,useUSM); usedBlocks = nullptr;
+ if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device,useUSM); freeBlocks = nullptr;
for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
threadUsedBlocks[i] = nullptr;
threadBlocks[i] = nullptr;
@@ -503,9 +504,16 @@ namespace embree
/* allocate using current block */
size_t threadID = TaskScheduler::threadID();
size_t slot = threadID & slotMask;
- Block* myUsedBlocks = threadUsedBlocks[slot];
+ Block* myUsedBlocks = threadUsedBlocks[slot];
if (myUsedBlocks) {
void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
+ // -- GODOT start --
+ // if (ptr == nullptr && !blockAllocation)
+ // throw std::bad_alloc();
+ if (ptr == nullptr && !blockAllocation) {
+ abort();
+ }
+ // -- GODOT end --
if (ptr) return ptr;
}
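
For context on the hunk above: threadID & slotMask spreads threads across MAX_THREAD_USED_BLOCK_SLOTS block slots, so the common allocation path never touches the global mutex, and the new Godot guard makes a failed in-block allocation fatal when block growth is disabled. Since the slot count is a power of two, the mask is a cheap modulo; a sketch:

    // slotMask == MAX_THREAD_USED_BLOCK_SLOTS - 1 (power of two minus one),
    // so masking is equivalent to threadID % MAX_THREAD_USED_BLOCK_SLOTS.
    size_t slotFor(size_t threadID, size_t slotMask) {
      return threadID & slotMask;
    }
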
@@ -516,16 +524,12 @@ namespace embree
/* parallel block creation in case of no freeBlocks, avoids single global mutex */
if (likely(freeBlocks.load() == nullptr))
{
-#if defined(APPLE) && defined(__aarch64__)
- std::scoped_lock lock(slotMutex[slot]);
-#else
- Lock<SpinLock> lock(slotMutex[slot]);
-#endif
+ Lock<MutexSys> lock(slotMutex[slot]);
if (myUsedBlocks == threadUsedBlocks[slot]) {
const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
assert(allocSize >= bytes);
- threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
+ threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
// FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
}
continue;
@@ -533,24 +537,20 @@ namespace embree
/* if this fails allocate new block */
{
-#if defined(APPLE) && defined(__aarch64__)
- std::scoped_lock lock(mutex);
-#else
- Lock<SpinLock> lock(mutex);
-#endif
- if (myUsedBlocks == threadUsedBlocks[slot])
- {
+ Lock<MutexSys> lock(mutex);
+ if (myUsedBlocks == threadUsedBlocks[slot])
+ {
if (freeBlocks.load() != nullptr) {
- Block* nextFreeBlock = freeBlocks.load()->next;
- freeBlocks.load()->next = usedBlocks;
- __memory_barrier();
- usedBlocks = freeBlocks.load();
+ Block* nextFreeBlock = freeBlocks.load()->next;
+ freeBlocks.load()->next = usedBlocks;
+ __memory_barrier();
+ usedBlocks = freeBlocks.load();
threadUsedBlocks[slot] = freeBlocks.load();
- freeBlocks = nextFreeBlock;
- } else {
+ freeBlocks = nextFreeBlock;
+ } else {
const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
- usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
- }
+ usedBlocks = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
+ }
}
}
}
@@ -559,11 +559,7 @@ namespace embree
/*! add new block */
void addBlock(void* ptr, ssize_t bytes)
{
-#if defined(APPLE) && defined(__aarch64__)
- std::scoped_lock lock(mutex);
-#else
- Lock<SpinLock> lock(mutex);
-#endif
+ Lock<MutexSys> lock(mutex);
const size_t sizeof_Header = offsetof(Block,data[0]);
void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
size_t ofs = (size_t) aptr - (size_t) ptr;
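
addBlock rounds the caller-supplied pointer up to maxAlignment and books the skipped ofs bytes as waste. The same arithmetic as a standalone helper (illustrative, not part of the patch):

    // Round ptr up to the next multiple of alignment (a power of two).
    void* alignUp(void* ptr, size_t alignment) {
      const size_t p = (size_t)ptr;
      return (void*)((p + alignment - 1) & ~(alignment - 1));
    }
    // In addBlock: aptr = alignUp(ptr, maxAlignment);
    //              ofs  = (size_t)aptr - (size_t)ptr; // unusable padding
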
@@ -723,7 +719,12 @@ namespace embree
void print_blocks()
{
- std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;
+ std::cout << " estimatedSize = " << estimatedSize
+ << ", slotMask = " << slotMask
+ << ", use_single_mode = " << use_single_mode
+ << ", maxGrowSize = " << maxGrowSize
+ << ", defaultBlockSize = " << defaultBlockSize
+ << std::endl;
std::cout << " used blocks = ";
if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
@@ -738,7 +739,19 @@ namespace embree
struct Block
{
- static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
+ __forceinline static void* blockAlignedMalloc(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesAlignment)
+ {
+ if (useUSM) return device->malloc(bytesAllocate, bytesAlignment);
+ else return alignedMalloc (bytesAllocate, bytesAlignment);
+ }
+
+ __forceinline static void blockAlignedFree(Device* device, bool useUSM, void* ptr)
+ {
+ if (useUSM) return device->free(ptr);
+ else return alignedFree(ptr);
+ }
+
+ static Block* create(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
{
/* We avoid using os_malloc for small blocks as this could
* cause a risk of fragmenting the virtual address space and
@@ -766,7 +779,7 @@ namespace embree
{
const size_t alignment = maxAlignment;
if (device) device->memoryMonitor(bytesAllocate+alignment,false);
- ptr = alignedMalloc(bytesAllocate,alignment);
+ ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);
/* give hint to transparently convert these pages to 2MB pages */
const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
@@ -780,7 +793,7 @@ namespace embree
{
const size_t alignment = maxAlignment;
if (device) device->memoryMonitor(bytesAllocate+alignment,false);
- ptr = alignedMalloc(bytesAllocate,alignment);
+ ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);
return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
}
}
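
The blockAlignedMalloc/blockAlignedFree pair introduced above is what threads useUSM through the block machinery: USM blocks come from Device::malloc/Device::free, host blocks from alignedMalloc/alignedFree, and Block::create now calls the helper instead of alignedMalloc directly. The invariant worth keeping in mind is symmetry between the two paths; a hedged usage sketch (the size is illustrative):

    void* p = Block::blockAlignedMalloc(device, useUSM,
                                        /*bytesAllocate=*/4096,
                                        /*bytesAlignment=*/maxAlignment);
    // ... placement-new a Block header into p, hand out data bytes ...
    Block::blockAlignedFree(device, useUSM, p); // must match the malloc path
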
@@ -812,23 +825,23 @@ namespace embree
return head;
}
- void clear_list(MemoryMonitorInterface* device)
+ void clear_list(Device* device, bool useUSM)
{
Block* block = this;
while (block) {
Block* next = block->next;
- block->clear_block(device);
+ block->clear_block(device, useUSM);
block = next;
}
}
- void clear_block (MemoryMonitorInterface* device)
+ void clear_block (Device* device, bool useUSM)
{
const size_t sizeof_Header = offsetof(Block,data[0]);
const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();
if (atype == ALIGNED_MALLOC) {
- alignedFree(this);
+ blockAlignedFree(device, useUSM, this);
if (device) device->memoryMonitor(-sizeof_Alloced,true);
}
@@ -847,16 +860,16 @@ namespace embree
size_t bytes = bytes_in;
assert(align <= maxAlignment);
bytes = (bytes+(align-1)) & ~(align-1);
- if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
- const size_t i = cur.fetch_add(bytes);
+ if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
+ const size_t i = cur.fetch_add(bytes);
if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
if (unlikely(i > reserveEnd)) return nullptr;
bytes_in = bytes = min(bytes,reserveEnd-i);
-
- if (i+bytes > allocEnd) {
+
+ if (i+bytes > allocEnd) {
if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
}
- return &data[i];
+ return &data[i];
}
void* ptr() {
@@ -874,7 +887,7 @@ namespace embree
}
size_t getBlockFreeBytes() const {
- return getBlockAllocatedBytes() - getBlockUsedBytes();
+ return getBlockAllocatedBytes() - getBlockUsedBytes();
}
size_t getBlockAllocatedBytes() const {
@@ -963,40 +976,40 @@ namespace embree
char data[1]; //!< here starts memory to use for allocations
};
+ public:
+ static const size_t blockHeaderSize = offsetof(Block,data[0]);
+
private:
Device* device;
- SpinLock mutex;
size_t slotMask;
+ size_t defaultBlockSize;
+ size_t estimatedSize;
+ size_t growSize;
+ size_t maxGrowSize;
+
+ MutexSys mutex;
+ MutexSys slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+ std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
std::atomic<Block*> usedBlocks;
std::atomic<Block*> freeBlocks;
- std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
-#if defined(APPLE) && defined(__aarch64__)
- std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
-#else
- PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
-#endif
-
+ bool useUSM;
+ bool blockAllocation = true;
bool use_single_mode;
- size_t defaultBlockSize;
- size_t estimatedSize;
- size_t growSize;
- size_t maxGrowSize;
+
std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
std::atomic<size_t> bytesUsed;
std::atomic<size_t> bytesFree;
std::atomic<size_t> bytesWasted;
+
static __thread ThreadLocal2* thread_local_allocator2;
- static SpinLock s_thread_local_allocators_lock;
+ static MutexSys s_thread_local_allocators_lock;
static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
-#if defined(APPLE) && defined(__aarch64__)
- std::mutex thread_local_allocators_lock;
-#else
- SpinLock thread_local_allocators_lock;
-#endif
+
std::vector<ThreadLocal2*> thread_local_allocators;
AllocationType atype;
+
mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
};
}
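
Taken together, the recurring lock edits in this diff replace every #if defined(APPLE) && defined(__aarch64__) split between std::mutex and SpinLock/PaddedSpinLock with a single portable MutexSys. The resulting pattern, in isolation (assuming the usual embree sys/mutex.h interface, where the Lock guard acquires in its constructor and releases in its destructor):

    MutexSys mutex; // one lock type on every platform; blocks rather
                    // than busy-waits under contention
    {
      Lock<MutexSys> lock(mutex); // RAII: acquired here
      // ... critical section, e.g. splicing a Block into usedBlocks ...
    } // released here, even on early return
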