diff options
Diffstat (limited to 'thirdparty/embree/common/sys')
-rw-r--r-- | thirdparty/embree/common/sys/alloc.cpp | 212 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/alloc.h | 93 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/atomic.h | 4 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/barrier.h | 2 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/estring.cpp (renamed from thirdparty/embree/common/sys/string.cpp) | 2 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/estring.h (renamed from thirdparty/embree/common/sys/string.h) | 36 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/intrinsics.h | 113 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/mutex.h | 4 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/platform.h | 245 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/sycl.h | 307 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/sysinfo.cpp | 11 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/thread.cpp | 2 | ||||
-rw-r--r-- | thirdparty/embree/common/sys/vector.h | 22 |
13 files changed, 956 insertions, 97 deletions
diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp index abdd269069..71616a3982 100644 --- a/thirdparty/embree/common/sys/alloc.cpp +++ b/thirdparty/embree/common/sys/alloc.cpp @@ -12,33 +12,177 @@ namespace embree { - void* alignedMalloc(size_t size, size_t align) + size_t total_allocations = 0; + +#if defined(EMBREE_SYCL_SUPPORT) + + __thread sycl::context* tls_context_tutorial = nullptr; + __thread sycl::device* tls_device_tutorial = nullptr; + + __thread sycl::context* tls_context_embree = nullptr; + __thread sycl::device* tls_device_embree = nullptr; + + void enableUSMAllocEmbree(sycl::context* context, sycl::device* device) + { + // -- GODOT start -- + // if (tls_context_embree != nullptr) throw std::runtime_error("USM allocation already enabled"); + // if (tls_device_embree != nullptr) throw std::runtime_error("USM allocation already enabled"); + if (tls_context_embree != nullptr) { + abort(); + } + if (tls_device_embree != nullptr) { + abort(); + } + // -- GODOT end -- + tls_context_embree = context; + tls_device_embree = device; + } + + void disableUSMAllocEmbree() + { + // -- GODOT start -- + // if (tls_context_embree == nullptr) throw std::runtime_error("USM allocation not enabled"); + // if (tls_device_embree == nullptr) throw std::runtime_error("USM allocation not enabled"); + if (tls_context_embree == nullptr) { + abort(); + } + if (tls_device_embree == nullptr) { + abort(); + } + // -- GODOT end -- + tls_context_embree = nullptr; + tls_device_embree = nullptr; + } + + void enableUSMAllocTutorial(sycl::context* context, sycl::device* device) + { + //if (tls_context_tutorial != nullptr) throw std::runtime_error("USM allocation already enabled"); + //if (tls_device_tutorial != nullptr) throw std::runtime_error("USM allocation already enabled"); + tls_context_tutorial = context; + tls_device_tutorial = device; + } + + void disableUSMAllocTutorial() + { + // -- GODOT start -- + // if (tls_context_tutorial == nullptr) throw std::runtime_error("USM allocation not enabled"); + // if (tls_device_tutorial == nullptr) throw std::runtime_error("USM allocation not enabled"); + if (tls_context_tutorial == nullptr) { + abort(); + } + if (tls_device_tutorial == nullptr) { + abort(); + } + // -- GODOT end -- + + tls_context_tutorial = nullptr; + tls_device_tutorial = nullptr; + } + +#endif + + void* alignedMalloc(size_t size, size_t align) { if (size == 0) return nullptr; - + assert((align & (align-1)) == 0); void* ptr = _mm_malloc(size,align); - - if (size != 0 && ptr == nullptr) - // -- GODOT start -- - // throw std::bad_alloc(); + // -- GODOT start -- + // if (size != 0 && ptr == nullptr) + // throw std::bad_alloc(); + if (size != 0 && ptr == nullptr) { abort(); - // -- GODOT end -- - + } + // -- GODOT end -- return ptr; } - + void alignedFree(void* ptr) { if (ptr) _mm_free(ptr); } +#if defined(EMBREE_SYCL_SUPPORT) + + void* alignedSYCLMalloc(sycl::context* context, sycl::device* device, size_t size, size_t align, EmbreeUSMMode mode) + { + assert(context); + assert(device); + + if (size == 0) + return nullptr; + + assert((align & (align-1)) == 0); + total_allocations++; + + void* ptr = nullptr; + if (mode == EMBREE_USM_SHARED_DEVICE_READ_ONLY) + ptr = sycl::aligned_alloc_shared(align,size,*device,*context,sycl::ext::oneapi::property::usm::device_read_only()); + else + ptr = sycl::aligned_alloc_shared(align,size,*device,*context); + + // -- GODOT start -- + // if (size != 0 && ptr == nullptr) + // throw std::bad_alloc(); + if (size != 0 && ptr == nullptr) { + abort(); + } + // -- GODOT end -- + + return ptr; + } + + static MutexSys g_alloc_mutex; + + void* alignedSYCLMalloc(size_t size, size_t align, EmbreeUSMMode mode) + { + if (tls_context_tutorial) return alignedSYCLMalloc(tls_context_tutorial, tls_device_tutorial, size, align, mode); + if (tls_context_embree ) return alignedSYCLMalloc(tls_context_embree, tls_device_embree, size, align, mode); + return nullptr; + } + + void alignedSYCLFree(sycl::context* context, void* ptr) + { + assert(context); + if (ptr) { + sycl::free(ptr,*context); + } + } + + void alignedSYCLFree(void* ptr) + { + if (tls_context_tutorial) return alignedSYCLFree(tls_context_tutorial, ptr); + if (tls_context_embree ) return alignedSYCLFree(tls_context_embree, ptr); + } + +#endif + + void* alignedUSMMalloc(size_t size, size_t align, EmbreeUSMMode mode) + { +#if defined(EMBREE_SYCL_SUPPORT) + if (tls_context_embree || tls_context_tutorial) + return alignedSYCLMalloc(size,align,mode); + else +#endif + return alignedMalloc(size,align); + } + + void alignedUSMFree(void* ptr) + { +#if defined(EMBREE_SYCL_SUPPORT) + if (tls_context_embree || tls_context_tutorial) + return alignedSYCLFree(ptr); + else +#endif + return alignedFree(ptr); + } + static bool huge_pages_enabled = false; static MutexSys os_init_mutex; - __forceinline bool isHugePageCandidate(const size_t bytes) + __forceinline bool isHugePageCandidate(const size_t bytes) { if (!huge_pages_enabled) return false; @@ -133,7 +277,9 @@ namespace embree char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); // -- GODOT start -- // if (ptr == nullptr) throw std::bad_alloc(); - if (ptr == nullptr) abort(); + if (ptr == nullptr) { + abort(); + } // -- GODOT end -- hugepages = false; return ptr; @@ -150,11 +296,13 @@ namespace embree if (bytesNew >= bytesOld) return bytesOld; - if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) - // -- GODOT start -- - // throw std::bad_alloc(); + // -- GODOT start -- + // if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) + // throw std::bad_alloc(); + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) { abort(); - // -- GODOT end -- + } + // -- GODOT end -- return bytesNew; } @@ -164,11 +312,13 @@ namespace embree if (bytes == 0) return; - if (!VirtualFree(ptr,0,MEM_RELEASE)) - // -- GODOT start -- - // throw std::bad_alloc(); + // -- GODOT start -- + // if (!VirtualFree(ptr,0,MEM_RELEASE)) + // throw std::bad_alloc(); + if (!VirtualFree(ptr,0,MEM_RELEASE)) { abort(); - // -- GODOT end -- + } + // -- GODOT end -- } void os_advise(void *ptr, size_t bytes) @@ -274,7 +424,9 @@ namespace embree void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); // -- GODOT start -- // if (ptr == MAP_FAILED) throw std::bad_alloc(); - if (ptr == MAP_FAILED) abort(); + if (ptr == MAP_FAILED) { + abort(); + } // -- GODOT end -- hugepages = false; @@ -291,11 +443,13 @@ namespace embree if (bytesNew >= bytesOld) return bytesOld; - if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) - // -- GODOT start -- - // throw std::bad_alloc(); + // -- GODOT start -- + // if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) + // throw std::bad_alloc(); + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) { abort(); - // -- GODOT end -- + } + // -- GODOT end -- return bytesNew; } @@ -308,11 +462,13 @@ namespace embree /* for hugepages we need to also align the size */ const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; bytes = (bytes+pageSize-1) & ~(pageSize-1); - if (munmap(ptr,bytes) == -1) - // -- GODOT start -- - // throw std::bad_alloc(); + // -- GODOT start -- + // if (munmap(ptr,bytes) == -1) + // throw std::bad_alloc(); + if (munmap(ptr,bytes) == -1) { abort(); - // -- GODOT end -- + } + // -- GODOT end -- } /* hint for transparent huge pages (THP) */ diff --git a/thirdparty/embree/common/sys/alloc.h b/thirdparty/embree/common/sys/alloc.h index 4fa474ec1d..28b17f988d 100644 --- a/thirdparty/embree/common/sys/alloc.h +++ b/thirdparty/embree/common/sys/alloc.h @@ -9,20 +9,72 @@ namespace embree { -#define ALIGNED_STRUCT_(align) \ - void* operator new(size_t size) { return alignedMalloc(size,align); } \ - void operator delete(void* ptr) { alignedFree(ptr); } \ +#if defined(EMBREE_SYCL_SUPPORT) + + /* enables SYCL USM allocation */ + void enableUSMAllocEmbree(sycl::context* context, sycl::device* device); + void enableUSMAllocTutorial(sycl::context* context, sycl::device* device); + + /* disables SYCL USM allocation */ + void disableUSMAllocEmbree(); + void disableUSMAllocTutorial(); + +#endif + +#define ALIGNED_STRUCT_(align) \ + void* operator new(size_t size) { return alignedMalloc(size,align); } \ + void operator delete(void* ptr) { alignedFree(ptr); } \ void* operator new[](size_t size) { return alignedMalloc(size,align); } \ void operator delete[](void* ptr) { alignedFree(ptr); } + +#define ALIGNED_STRUCT_USM_(align) \ + void* operator new(size_t size) { return alignedUSMMalloc(size,align); } \ + void operator delete(void* ptr) { alignedUSMFree(ptr); } \ + void* operator new[](size_t size) { return alignedUSMMalloc(size,align); } \ + void operator delete[](void* ptr) { alignedUSMFree(ptr); } + +#define ALIGNED_CLASS_(align) \ + public: \ + ALIGNED_STRUCT_(align) \ + private: -#define ALIGNED_CLASS_(align) \ +#define ALIGNED_CLASS_USM_(align) \ public: \ - ALIGNED_STRUCT_(align) \ + ALIGNED_STRUCT_USM_(align) \ private: + + enum EmbreeUSMMode { + EMBREE_USM_SHARED = 0, + EMBREE_USM_SHARED_DEVICE_READ_WRITE = 0, + EMBREE_USM_SHARED_DEVICE_READ_ONLY = 1 + }; /*! aligned allocation */ void* alignedMalloc(size_t size, size_t align); void alignedFree(void* ptr); + + /*! aligned allocation using SYCL USM */ + void* alignedUSMMalloc(size_t size, size_t align = 16, EmbreeUSMMode mode = EMBREE_USM_SHARED_DEVICE_READ_ONLY); + void alignedUSMFree(void* ptr); + +#if defined(EMBREE_SYCL_SUPPORT) + + /*! aligned allocation using SYCL USM */ + void* alignedSYCLMalloc(sycl::context* context, sycl::device* device, size_t size, size_t align, EmbreeUSMMode mode); + void alignedSYCLFree(sycl::context* context, void* ptr); + + // deleter functor to use as deleter in std unique or shared pointers that + // capture raw pointers created by sycl::malloc and it's variants + template<typename T> + struct sycl_deleter + { + void operator()(T const* ptr) + { + alignedUSMFree((void*)ptr); + } + }; + +#endif /*! allocator that performs aligned allocations */ template<typename T, size_t alignment> @@ -95,6 +147,37 @@ namespace embree bool hugepages; }; + /*! allocator that newer performs allocations */ + template<typename T> + struct no_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline pointer allocate( size_type n ) { + // -- GODOT start -- + // throw std::runtime_error("no allocation supported"); + abort(); + // -- GODOT end -- + } + + __forceinline void deallocate( pointer p, size_type n ) { + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + }; + /*! allocator for IDs */ template<typename T, size_t max_id> struct IDPool diff --git a/thirdparty/embree/common/sys/atomic.h b/thirdparty/embree/common/sys/atomic.h index 67af254f36..cf9909aad9 100644 --- a/thirdparty/embree/common/sys/atomic.h +++ b/thirdparty/embree/common/sys/atomic.h @@ -36,7 +36,7 @@ namespace embree }; template<typename T> - __forceinline void atomic_min(std::atomic<T>& aref, const T& bref) + __forceinline void _atomic_min(std::atomic<T>& aref, const T& bref) { const T b = bref.load(); while (true) { @@ -47,7 +47,7 @@ namespace embree } template<typename T> - __forceinline void atomic_max(std::atomic<T>& aref, const T& bref) + __forceinline void _atomic_max(std::atomic<T>& aref, const T& bref) { const T b = bref.load(); while (true) { diff --git a/thirdparty/embree/common/sys/barrier.h b/thirdparty/embree/common/sys/barrier.h index c56513a2ed..e1580f41a9 100644 --- a/thirdparty/embree/common/sys/barrier.h +++ b/thirdparty/embree/common/sys/barrier.h @@ -34,7 +34,7 @@ namespace embree void* opaque; }; - /*! fast active barrier using atomitc counter */ + /*! fast active barrier using atomic counter */ struct BarrierActive { public: diff --git a/thirdparty/embree/common/sys/string.cpp b/thirdparty/embree/common/sys/estring.cpp index f42fdc8536..c66c5c5b84 100644 --- a/thirdparty/embree/common/sys/string.cpp +++ b/thirdparty/embree/common/sys/estring.cpp @@ -1,7 +1,7 @@ // Copyright 2009-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "string.h" +#include "estring.h" #include <algorithm> #include <ctype.h> diff --git a/thirdparty/embree/common/sys/string.h b/thirdparty/embree/common/sys/estring.h index 820076b21c..63051ad3c3 100644 --- a/thirdparty/embree/common/sys/string.h +++ b/thirdparty/embree/common/sys/estring.h @@ -28,6 +28,42 @@ namespace embree std::streamsize precision; }; + struct IndentOStream : public std::streambuf + { + explicit IndentOStream(std::ostream &ostream, int indent = 2) + : streambuf(ostream.rdbuf()) + , start_of_line(true) + , ident_str(indent, ' ') + , stream(&ostream) + { + // set streambuf of ostream to this and save original streambuf + stream->rdbuf(this); + } + + virtual ~IndentOStream() + { + if (stream != NULL) { + // restore old streambuf + stream->rdbuf(streambuf); + } + } + + protected: + virtual int overflow(int ch) { + if (start_of_line && ch != '\n') { + streambuf->sputn(ident_str.data(), ident_str.size()); + } + start_of_line = ch == '\n'; + return streambuf->sputc(ch); + } + + private: + std::streambuf *streambuf; + bool start_of_line; + std::string ident_str; + std::ostream *stream; + }; + std::string toLowerCase(const std::string& s); std::string toUpperCase(const std::string& s); diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h index 2c2f6eccda..f5074bb29d 100644 --- a/thirdparty/embree/common/sys/intrinsics.h +++ b/thirdparty/embree/common/sys/intrinsics.h @@ -64,7 +64,7 @@ namespace embree /// Windows Platform //////////////////////////////////////////////////////////////////////////////// -#if defined(__WIN32__) +#if defined(__WIN32__) && !defined(__INTEL_LLVM_COMPILER) __forceinline size_t read_tsc() { @@ -89,7 +89,7 @@ namespace embree #endif } -#if defined(__X86_64__) +#if defined(__X86_64__) || defined (__aarch64__) __forceinline size_t bsf(size_t v) { #if defined(__AVX2__) return _tzcnt_u64(v); @@ -113,7 +113,7 @@ namespace embree return i; } -#if defined(__X86_64__) +#if defined(__X86_64__) || defined (__aarch64__) __forceinline size_t bscf(size_t& v) { size_t i = bsf(v); @@ -138,7 +138,7 @@ namespace embree #endif } -#if defined(__X86_64__) +#if defined(__X86_64__) || defined (__aarch64__) __forceinline size_t bsr(size_t v) { #if defined(__AVX2__) return 63 -_lzcnt_u64(v); @@ -196,49 +196,6 @@ namespace embree #else -#if defined(__i386__) && defined(__PIC__) - - __forceinline void __cpuid(int out[4], int op) - { - asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" - "cpuid\n\t" - "xchg{l}\t{%%}ebx, %1\n\t" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "0"(op)); - } - - __forceinline void __cpuid_count(int out[4], int op1, int op2) - { - asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" - "cpuid\n\t" - "xchg{l}\t{%%}ebx, %1\n\t" - : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) - : "0" (op1), "2" (op2)); - } - -#elif defined(__X86_ASM__) - - __forceinline void __cpuid(int out[4], int op) { -#if defined(__ARM_NEON) - if (op == 0) { // Get CPU name - out[0] = 0x41524d20; - out[1] = 0x41524d20; - out[2] = 0x41524d20; - out[3] = 0x41524d20; - } -#else - asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); -#endif - } - -#if !defined(__ARM_NEON) - __forceinline void __cpuid_count(int out[4], int op1, int op2) { - asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); - } -#endif - -#endif - __forceinline uint64_t read_tsc() { #if defined(__X86_ASM__) uint32_t high,low; @@ -263,6 +220,13 @@ namespace embree #endif #endif } + +#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__) + __forceinline unsigned int bsf(unsigned v) { + return sycl::ctz(v); + } + +#else #if defined(__64BIT__) __forceinline unsigned bsf(unsigned v) @@ -280,6 +244,13 @@ namespace embree #endif } #endif +#endif + +#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__) + __forceinline size_t bsf(size_t v) { + return sycl::ctz(v); + } +#else __forceinline size_t bsf(size_t v) { #if defined(__AVX2__) && !defined(__aarch64__) @@ -294,6 +265,7 @@ namespace embree return __builtin_ctzl(v); #endif } +#endif __forceinline int bscf(int& v) { @@ -434,6 +406,41 @@ namespace embree #endif +#if !defined(__WIN32__) + +#if defined(__i386__) && defined(__PIC__) + + __forceinline void __cpuid(int out[4], int op) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "0"(op)); + } + + __forceinline void __cpuid_count(int out[4], int op1, int op2) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) + : "0" (op1), "2" (op2)); + } + +#elif defined(__X86_ASM__) + + __forceinline void __cpuid(int out[4], int op) { + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); + } + + __forceinline void __cpuid_count(int out[4], int op1, int op2) { + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); + } + +#endif +#endif + //////////////////////////////////////////////////////////////////////////////// /// All Platforms //////////////////////////////////////////////////////////////////////////////// @@ -459,8 +466,16 @@ namespace embree #endif #endif -#if defined(__SSE4_2__) || defined(__ARM_NEON) +#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__) + __forceinline unsigned int popcnt(unsigned int in) { + return sycl::popcount(in); + } + +#else + +#if defined(__SSE4_2__) || defined(__ARM_NEON) + __forceinline int popcnt(int in) { return _mm_popcnt_u32(in); } @@ -476,6 +491,8 @@ namespace embree #endif #endif + +#endif #if defined(__X86_ASM__) __forceinline uint64_t rdtsc() diff --git a/thirdparty/embree/common/sys/mutex.h b/thirdparty/embree/common/sys/mutex.h index 26af6c582c..0f7345cf45 100644 --- a/thirdparty/embree/common/sys/mutex.h +++ b/thirdparty/embree/common/sys/mutex.h @@ -86,8 +86,8 @@ namespace embree class PaddedSpinLock : public SpinLock { - private: - char padding[CPU_CACHELINE_SIZE - sizeof(SpinLock)]; + private: + MAYBE_UNUSED char padding[CPU_CACHELINE_SIZE - sizeof(SpinLock)]; }; /*! safe mutex lock and unlock helper */ template<typename Mutex> class Lock { diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h index 728bf6ed7d..d4a9b9e119 100644 --- a/thirdparty/embree/common/sys/platform.h +++ b/thirdparty/embree/common/sys/platform.h @@ -3,7 +3,9 @@ #pragma once +#if !defined(_CRT_SECURE_NO_WARNINGS) #define _CRT_SECURE_NO_WARNINGS +#endif #include <cstddef> #include <cassert> @@ -18,6 +20,30 @@ #include <cstring> #include <stdint.h> #include <functional> +#include <mutex> + +#if defined(EMBREE_SYCL_SUPPORT) + +#define __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__ + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#pragma clang diagnostic ignored "-W#pragma-messages" + +#include <sycl/sycl.hpp> + +#pragma clang diagnostic pop + +#include "sycl.h" + +#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__) +#define CONSTANT __attribute__((opencl_constant)) +#else +#define CONSTANT +#endif + +#endif + //////////////////////////////////////////////////////////////////////////////// /// detect platform @@ -115,7 +141,7 @@ #else #define __restrict__ //__restrict // causes issues with MSVC #endif -#if !defined(__thread) +#if !defined(__thread) && !defined(__INTEL_LLVM_COMPILER) #define __thread __declspec(thread) #endif #if !defined(__aligned) @@ -148,6 +174,10 @@ #define MAYBE_UNUSED #endif +#if !defined(_unused) +#define _unused(x) ((void)(x)) +#endif + #if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly #define DELETED #else @@ -155,7 +185,7 @@ #endif #if !defined(likely) -#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) || defined(__SYCL_DEVICE_ONLY__) #define likely(expr) (expr) #define unlikely(expr) (expr) #else @@ -171,22 +201,27 @@ /* debug printing macros */ #define STRING(x) #x #define TOSTRING(x) STRING(x) -#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl +#define PING embree_cout_uniform << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl #define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl #define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl #define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl +#define UPRINT(x) embree_cout_uniform << STRING(x) << " = " << (x) << embree_endl +#define UPRINT2(x,y) embree_cout_uniform << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl +#define UPRINT3(x,y,z) embree_cout_uniform << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl +#define UPRINT4(x,y,z,w) embree_cout_uniform << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + #if defined(DEBUG) // only report file and line in debug mode // -- GODOT start -- - // #define THROW_RUNTIME_ERROR(str) + // #define THROW_RUNTIME_ERROR(str) \ // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); #define THROW_RUNTIME_ERROR(str) \ printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort(); // -- GODOT end -- #else // -- GODOT start -- - // #define THROW_RUNTIME_ERROR(str) + // #define THROW_RUNTIME_ERROR(str) \ // throw std::runtime_error(str); #define THROW_RUNTIME_ERROR(str) \ abort(); @@ -323,13 +358,209 @@ __forceinline std::string toString(long long value) { #define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated #define ENABLE_DEPRECATED_WARNING __pragma(warning (enable : 4996)) // warning: function was declared deprecated #endif + +//////////////////////////////////////////////////////////////////////////////// +/// SYCL specific +//////////////////////////////////////////////////////////////////////////////// + + +#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__) + +#define sycl_printf0(format, ...) { \ + static const CONSTANT char fmt[] = format; \ + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) \ + sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \ + } + +#define sycl_printf0_(format) { \ + static const CONSTANT char fmt[] = format; \ + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) \ + sycl::ext::oneapi::experimental::printf(fmt); \ + } + +#else + +#define sycl_printf0(format, ...) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \ + } + +#define sycl_printf0_(format) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt); \ + } + +#endif + +#define sycl_printf(format, ...) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \ + } + +#define sycl_printf_(format) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt); \ + } + +#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__) + +namespace embree +{ + struct sycl_ostream_ { + sycl_ostream_ (bool uniform) : uniform(uniform) {} + bool uniform = false; + }; + struct sycl_endl_ {}; + +#define embree_ostream embree::sycl_ostream_ +#define embree_cout embree::sycl_ostream_(false) +#define embree_cout_uniform embree::sycl_ostream_(true) +#define embree_endl embree::sycl_endl_() + + inline sycl_ostream_ operator <<(sycl_ostream_ cout, int i) + { + if (cout.uniform) { + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) + sycl_printf("%i",i); + } + else + sycl_printf("%i ",i); + + return cout; + } + + inline sycl_ostream_ operator <<(sycl_ostream_ cout, unsigned int i) + { + if (cout.uniform) { + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) + sycl_printf("%u",i); + } else + sycl_printf("%u ",i); + + return cout; + } + + inline sycl_ostream_ operator <<(sycl_ostream_ cout, float f) + { + if (cout.uniform) { + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) + sycl_printf("%f",f); + } else + sycl_printf("%f ",f); + + return cout; + } + + inline sycl_ostream_ operator <<(sycl_ostream_ cout, double d) + { + if (cout.uniform) { + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) + sycl_printf("%f",d); + } else + sycl_printf("%f ",d); + + return cout; + } + + inline sycl_ostream_ operator <<(sycl_ostream_ cout, uint64_t l) + { + if (cout.uniform) { + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) + sycl_printf("%lu",l); + } else + sycl_printf("%lu ",l); + + return cout; + } + + inline sycl_ostream_ operator <<(sycl_ostream_ cout, long l) + { + if (cout.uniform) { + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) + sycl_printf("%l",l); + } else + sycl_printf("%l ",l); + + return cout; + } + + + inline sycl_ostream_ operator <<(sycl_ostream_ cout, void* p) + { + if (cout.uniform) { + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) + sycl_printf("%p",p); + } else + sycl_printf("%p ",p); + + return cout; + } + + inline sycl_ostream_ operator <<(sycl_ostream_ cout, const char* c) + { + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) + sycl_printf("%s",c); + return cout; + } + + inline sycl_ostream_ operator <<(sycl_ostream_ cout, sycl_endl_) + { + if (get_sub_group_local_id() == sycl::ctz(intel_sub_group_ballot(true))) + sycl_printf_("\n"); + return cout; + } +} + +#else -/* embree output stream */ #define embree_ostream std::ostream& #define embree_cout std::cout #define embree_cout_uniform std::cout #define embree_endl std::endl - + +#endif + +#if defined(EMBREE_SYCL_SUPPORT) + + /* printing out sycle vector types */ + __forceinline embree_ostream operator<<(embree_ostream out, const sycl::float4& v) { + return out << "(" << v.x() << "," << v.y() << "," << v.z() << "," << v.w() << ")"; + } + __forceinline embree_ostream operator<<(embree_ostream out, const sycl::float3& v) { + return out << "(" << v.x() << "," << v.y() << "," << v.z() << ")"; + } + __forceinline embree_ostream operator<<(embree_ostream out, const sycl::float2& v) { + return out << "(" << v.x() << "," << v.y() << ")"; + } + __forceinline embree_ostream operator<<(embree_ostream out, const sycl::int4& v) { + return out << "(" << v.x() << "," << v.y() << "," << v.z() << "," << v.w() << ")"; + } + __forceinline embree_ostream operator<<(embree_ostream out, const sycl::int3& v) { + return out << "(" << v.x() << "," << v.y() << "," << v.z() << ")"; + } + __forceinline embree_ostream operator<<(embree_ostream out, const sycl::int2& v) { + return out << "(" << v.x() << "," << v.y() << ")"; + } + __forceinline embree_ostream operator<<(embree_ostream out, const sycl::uint4& v) { + return out << "(" << v.x() << "," << v.y() << "," << v.z() << "," << v.w() << ")"; + } + __forceinline embree_ostream operator<<(embree_ostream out, const sycl::uint3& v) { + return out << "(" << v.x() << "," << v.y() << "," << v.z() << ")"; + } + __forceinline embree_ostream operator<<(embree_ostream out, const sycl::uint2& v) { + return out << "(" << v.x() << "," << v.y() << ")"; + } + +#endif + +inline void tab(std::ostream& cout, int n) { + for (int i=0; i<n; i++) cout << " "; +} + +inline std::string tab(int depth) { + return std::string(2*depth,' '); +} + //////////////////////////////////////////////////////////////////////////////// /// Some macros for static profiling //////////////////////////////////////////////////////////////////////////////// diff --git a/thirdparty/embree/common/sys/sycl.h b/thirdparty/embree/common/sys/sycl.h new file mode 100644 index 0000000000..2558eb052f --- /dev/null +++ b/thirdparty/embree/common/sys/sycl.h @@ -0,0 +1,307 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +using sycl::float16; +using sycl::float8; +using sycl::float4; +using sycl::float3; +using sycl::float2; +using sycl::int16; +using sycl::int8; +using sycl::int4; +using sycl::int3; +using sycl::int2; +using sycl::uint16; +using sycl::uint8; +using sycl::uint4; +using sycl::uint3; +using sycl::uint2; +using sycl::uchar16; +using sycl::uchar8; +using sycl::uchar4; +using sycl::uchar3; +using sycl::uchar2; +using sycl::ushort16; +using sycl::ushort8; +using sycl::ushort4; +using sycl::ushort3; +using sycl::ushort2; + +#ifdef __SYCL_DEVICE_ONLY__ +#define GLOBAL __attribute__((opencl_global)) +#define LOCAL __attribute__((opencl_local)) + +SYCL_EXTERNAL extern int work_group_reduce_add(int x); +SYCL_EXTERNAL extern float work_group_reduce_min(float x); +SYCL_EXTERNAL extern float work_group_reduce_max(float x); + +SYCL_EXTERNAL extern float atomic_min(volatile GLOBAL float *p, float val); +SYCL_EXTERNAL extern float atomic_min(volatile LOCAL float *p, float val); +SYCL_EXTERNAL extern float atomic_max(volatile GLOBAL float *p, float val); +SYCL_EXTERNAL extern float atomic_max(volatile LOCAL float *p, float val); + +SYCL_EXTERNAL extern "C" unsigned int intel_sub_group_ballot(bool valid); + +SYCL_EXTERNAL extern "C" void __builtin_IB_assume_uniform(void *p); + +// Load message caching control + + enum LSC_LDCC { + LSC_LDCC_DEFAULT, + LSC_LDCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached + LSC_LDCC_L1UC_L3C, // Override to L1 uncached and L3 cached + LSC_LDCC_L1C_L3UC, // Override to L1 cached and L3 uncached + LSC_LDCC_L1C_L3C, // Override to L1 cached and L3 cached + LSC_LDCC_L1S_L3UC, // Override to L1 streaming load and L3 uncached + LSC_LDCC_L1S_L3C, // Override to L1 streaming load and L3 cached + LSC_LDCC_L1IAR_L3C, // Override to L1 invalidate-after-read, and L3 cached + }; + + + +// Store message caching control (also used for atomics) + + enum LSC_STCC { + LSC_STCC_DEFAULT, + LSC_STCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached + LSC_STCC_L1UC_L3WB, // Override to L1 uncached and L3 written back + LSC_STCC_L1WT_L3UC, // Override to L1 written through and L3 uncached + LSC_STCC_L1WT_L3WB, // Override to L1 written through and L3 written back + LSC_STCC_L1S_L3UC, // Override to L1 streaming and L3 uncached + LSC_STCC_L1S_L3WB, // Override to L1 streaming and L3 written back + LSC_STCC_L1WB_L3WB, // Override to L1 written through and L3 written back + }; + + + +/////////////////////////////////////////////////////////////////////// + +// LSC Loads + +/////////////////////////////////////////////////////////////////////// + +SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_uchar_to_uint (const GLOBAL uint8_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D8U32 +SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_ushort_to_uint(const GLOBAL uint16_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D16U32 +SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_uint (const GLOBAL uint32_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V1 +SYCL_EXTERNAL /* extern "C" */ sycl::uint2 __builtin_IB_lsc_load_global_uint2 (const GLOBAL sycl::uint2 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V2 +SYCL_EXTERNAL /* extern "C" */ sycl::uint3 __builtin_IB_lsc_load_global_uint3 (const GLOBAL sycl::uint3 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V3 +SYCL_EXTERNAL /* extern "C" */ sycl::uint4 __builtin_IB_lsc_load_global_uint4 (const GLOBAL sycl::uint4 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V4 +SYCL_EXTERNAL /* extern "C" */ sycl::uint8 __builtin_IB_lsc_load_global_uint8 (const GLOBAL sycl::uint8 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V8 +SYCL_EXTERNAL /* extern "C" */ uint64_t __builtin_IB_lsc_load_global_ulong (const GLOBAL uint64_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V1 +SYCL_EXTERNAL /* extern "C" */ sycl::ulong2 __builtin_IB_lsc_load_global_ulong2 (const GLOBAL sycl::ulong2 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V2 +SYCL_EXTERNAL /* extern "C" */ sycl::ulong3 __builtin_IB_lsc_load_global_ulong3 (const GLOBAL sycl::ulong3 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V3 +SYCL_EXTERNAL /* extern "C" */ sycl::ulong4 __builtin_IB_lsc_load_global_ulong4 (const GLOBAL sycl::ulong4 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V4 +SYCL_EXTERNAL /* extern "C" */ sycl::ulong8 __builtin_IB_lsc_load_global_ulong8 (const GLOBAL sycl::ulong8 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V8 + +// global address space +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uchar_from_uint (GLOBAL uint8_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D8U32 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ushort_from_uint(GLOBAL uint16_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D16U32 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint (GLOBAL uint32_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D32V1 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint2 (GLOBAL sycl::uint2 *base, int immElemOff, sycl::uint2 val, enum LSC_STCC cacheOpt); //D32V2 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint3 (GLOBAL sycl::uint3 *base, int immElemOff, sycl::uint3 val, enum LSC_STCC cacheOpt); //D32V3 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint4 (GLOBAL sycl::uint4 *base, int immElemOff, sycl::uint4 val, enum LSC_STCC cacheOpt); //D32V4 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint8 (GLOBAL sycl::uint8 *base, int immElemOff, sycl::uint8 val, enum LSC_STCC cacheOpt); //D32V8 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong (GLOBAL uint64_t *base, int immElemOff, uint64_t val, enum LSC_STCC cacheOpt); //D64V1 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong2 (GLOBAL sycl::ulong2 *base, int immElemOff, sycl::ulong2 val, enum LSC_STCC cacheOpt); //D64V2 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong3 (GLOBAL sycl::ulong3 *base, int immElemOff, sycl::ulong3 val, enum LSC_STCC cacheOpt); //D64V3 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong4 (GLOBAL sycl::ulong4 *base, int immElemOff, sycl::ulong4 val, enum LSC_STCC cacheOpt); //D64V4 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong8 (GLOBAL sycl::ulong8 *base, int immElemOff, sycl::ulong8 val, enum LSC_STCC cacheOpt); //D64V8 + +/////////////////////////////////////////////////////////////////////// +// prefetching +/////////////////////////////////////////////////////////////////////// +// +// LSC Pre-Fetch Load functions with CacheControls +// global address space +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uchar (const GLOBAL uint8_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ushort(const GLOBAL uint16_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint (const GLOBAL uint32_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint2 (const GLOBAL sycl::uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint3 (const GLOBAL sycl::uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint4 (const GLOBAL sycl::uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint8 (const GLOBAL sycl::uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong (const GLOBAL uint64_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong2(const GLOBAL sycl::ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong3(const GLOBAL sycl::ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong4(const GLOBAL sycl::ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4 +SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong8(const GLOBAL sycl::ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8 + +#else + +#define GLOBAL +#define LOCAL + +/* dummy functions for host */ +inline int work_group_reduce_add(int x) { return x; } +inline float work_group_reduce_min(float x) { return x; } +inline float work_group_reduce_max(float x) { return x; } + +inline float atomic_min(volatile float *p, float val) { return val; }; +inline float atomic_max(volatile float *p, float val) { return val; }; + +inline uint32_t intel_sub_group_ballot(bool valid) { return 0; } + +#endif + +/* creates a temporary that is enforced to be uniform */ +#define SYCL_UNIFORM_VAR(Ty,tmp,k) \ + Ty tmp##_data; \ + Ty* p##tmp##_data = (Ty*) sub_group_broadcast((uint64_t)&tmp##_data,k); \ + Ty& tmp = *p##tmp##_data; + +#if !defined(__forceinline) +#define __forceinline inline __attribute__((always_inline)) +#endif + +#if __SYCL_COMPILER_VERSION < 20210801 +#define all_of_group all_of +#define any_of_group any_of +#define none_of_group none_of +#define group_broadcast broadcast +#define reduce_over_group reduce +#define exclusive_scan_over_group exclusive_scan +#define inclusive_scan_over_group inclusive_scan +#endif + +namespace embree +{ + template<typename T> + __forceinline T cselect(const bool mask, const T &a, const T &b) + { + return sycl::select(b,a,(int)mask); + } + + template<typename T, typename M> + __forceinline T cselect(const M &mask, const T &a, const T &b) + { + return sycl::select(b,a,mask); + } + + __forceinline const sycl::sub_group this_sub_group() { + return sycl::ext::oneapi::experimental::this_sub_group(); + } + + __forceinline const uint32_t get_sub_group_local_id() { + return this_sub_group().get_local_id()[0]; + } + + __forceinline const uint32_t get_sub_group_size() { + return this_sub_group().get_max_local_range().size(); + } + + __forceinline const uint32_t get_sub_group_id() { + return this_sub_group().get_group_id()[0]; + } + + __forceinline const uint32_t get_num_sub_groups() { + return this_sub_group().get_group_range().size(); + } + + __forceinline uint32_t sub_group_ballot(bool pred) { + return intel_sub_group_ballot(pred); + } + + __forceinline bool sub_group_all_of(bool pred) { + return sycl::all_of_group(this_sub_group(),pred); + } + + __forceinline bool sub_group_any_of(bool pred) { + return sycl::any_of_group(this_sub_group(),pred); + } + + __forceinline bool sub_group_none_of(bool pred) { + return sycl::none_of_group(this_sub_group(),pred); + } + + template <typename T> __forceinline T sub_group_broadcast(T x, sycl::id<1> local_id) { + return sycl::group_broadcast<sycl::sub_group>(this_sub_group(),x,local_id); + } + + template <typename T> __forceinline T sub_group_make_uniform(T x) { + return sub_group_broadcast(x,sycl::ctz(intel_sub_group_ballot(true))); + } + + __forceinline void assume_uniform_array(void* ptr) { +#ifdef __SYCL_DEVICE_ONLY__ + __builtin_IB_assume_uniform(ptr); +#endif + } + + template <typename T, class BinaryOperation> __forceinline T sub_group_reduce(T x, BinaryOperation binary_op) { + return sycl::reduce_over_group<sycl::sub_group>(this_sub_group(),x,binary_op); + } + + template <typename T, class BinaryOperation> __forceinline T sub_group_reduce(T x, T init, BinaryOperation binary_op) { + return sycl::reduce_over_group<sycl::sub_group>(this_sub_group(),x,init,binary_op); + } + + template <typename T> __forceinline T sub_group_reduce_min(T x, T init) { + return sub_group_reduce(x, init, sycl::ext::oneapi::minimum<T>()); + } + + template <typename T> __forceinline T sub_group_reduce_min(T x) { + return sub_group_reduce(x, sycl::ext::oneapi::minimum<T>()); + } + + template <typename T> __forceinline T sub_group_reduce_max(T x) { + return sub_group_reduce(x, sycl::ext::oneapi::maximum<T>()); + } + + template <typename T> __forceinline T sub_group_reduce_add(T x) { + return sub_group_reduce(x, sycl::ext::oneapi::plus<T>()); + } + + template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan(T x, BinaryOperation binary_op) { + return sycl::exclusive_scan_over_group(this_sub_group(),x,binary_op); + } + + template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan_min(T x) { + return sub_group_exclusive_scan(x,sycl::ext::oneapi::minimum<T>()); + } + + template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan(T x, T init, BinaryOperation binary_op) { + return sycl::exclusive_scan_over_group(this_sub_group(),x,init,binary_op); + } + + template <typename T, class BinaryOperation> __forceinline T sub_group_inclusive_scan(T x, BinaryOperation binary_op) { + return sycl::inclusive_scan_over_group(this_sub_group(),x,binary_op); + } + + template <typename T, class BinaryOperation> __forceinline T sub_group_inclusive_scan(T x, BinaryOperation binary_op, T init) { + return sycl::inclusive_scan_over_group(this_sub_group(),x,binary_op,init); + } + + template <typename T> __forceinline T sub_group_shuffle(T x, sycl::id<1> local_id) { + return this_sub_group().shuffle(x, local_id); + } + + template <typename T> __forceinline T sub_group_shuffle_down(T x, uint32_t delta) { + return this_sub_group().shuffle_down(x, delta); + } + + template <typename T> __forceinline T sub_group_shuffle_up(T x, uint32_t delta) { + return this_sub_group().shuffle_up(x, delta); + } + + template <typename T> __forceinline T sub_group_load(const void* src) { + return this_sub_group().load(sycl::multi_ptr<T,sycl::access::address_space::global_space>((T*)src)); + } + + template <typename T> __forceinline void sub_group_store(void* dst, const T& x) { + this_sub_group().store(sycl::multi_ptr<T,sycl::access::address_space::global_space>((T*)dst),x); + } +} + +#if __SYCL_COMPILER_VERSION < 20210801 +#undef all_of_group +#undef any_of_group +#undef none_of_group +#undef group_broadcast +#undef reduce_over_group +#undef exclusive_scan_over_group +#undef inclusive_scan_over_group +#endif diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp index 7f7a009a1e..d01eab3c9d 100644 --- a/thirdparty/embree/common/sys/sysinfo.cpp +++ b/thirdparty/embree/common/sys/sysinfo.cpp @@ -1,9 +1,15 @@ // Copyright 2009-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#if defined(__INTEL_LLVM_COMPILER) +// prevents "'__thiscall' calling convention is not supported for this target" warning from TBB +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wignored-attributes" +#endif + #include "sysinfo.h" #include "intrinsics.h" -#include "string.h" +#include "estring.h" #include "ref.h" #if defined(__FREEBSD__) #include <sys/cpuset.h> @@ -690,3 +696,6 @@ namespace embree } #endif +#if defined(__INTEL_LLVM_COMPILER) +#pragma clang diagnostic pop +#endif diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp index 530c3c7810..8b072067e6 100644 --- a/thirdparty/embree/common/sys/thread.cpp +++ b/thirdparty/embree/common/sys/thread.cpp @@ -3,7 +3,7 @@ #include "thread.h" #include "sysinfo.h" -#include "string.h" +#include "estring.h" #include <iostream> #if defined(__ARM_NEON) diff --git a/thirdparty/embree/common/sys/vector.h b/thirdparty/embree/common/sys/vector.h index d05e1deb18..226cd34c21 100644 --- a/thirdparty/embree/common/sys/vector.h +++ b/thirdparty/embree/common/sys/vector.h @@ -8,6 +8,8 @@ namespace embree { + class Device; + template<typename T, typename allocator> class vector_t { @@ -25,6 +27,12 @@ namespace embree template<typename M> __forceinline explicit vector_t (M alloc, size_t sz) : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + __forceinline vector_t (Device* alloc) + : vector_t(alloc,0) {} + + __forceinline vector_t(void* data, size_t bytes) + : size_active(0), size_alloced(bytes/sizeof(T)), items((T*)data) {} __forceinline ~vector_t() { clear(); @@ -65,6 +73,10 @@ namespace embree return *this; } + __forceinline allocator& getAlloc() { + return alloc; + } + /********************** Iterators ****************************/ __forceinline iterator begin() { return items; }; @@ -215,6 +227,10 @@ namespace embree if (new_alloced <= size_alloced) return size_alloced; + /* if current size is 0 allocate exact requested size */ + if (size_alloced == 0) + return new_alloced; + /* resize to next power of 2 otherwise */ size_t new_size_alloced = size_alloced; while (new_size_alloced < new_alloced) { @@ -237,8 +253,12 @@ namespace embree /*! vector class that performs aligned allocations */ template<typename T> using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >; - + /*! vector class that performs OS allocations */ template<typename T> using ovector = vector_t<T,os_allocator<T> >; + + /*! vector class with externally managed data buffer */ + template<typename T> + using evector = vector_t<T,no_allocator<T>>; } |