diff options
Diffstat (limited to 'thirdparty/embree-aarch64/common/algorithms')
21 files changed, 0 insertions, 2292 deletions
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h deleted file mode 100644 index 01f1f80f6c..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include <functional> -#include "parallel_reduce.h" - -namespace embree -{ - - template<typename Index, class UnaryPredicate> - __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred) - { - bool ret = false; - -#if defined(TASKING_TBB) -#if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) { - if (context.is_group_execution_cancelled()) return; - for (size_t i = r.begin(); i != r.end(); ++i) { - if (pred(i)) { - ret = true; - context.cancel_group_execution(); - } - } - }); -#else - tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) { - if (tbb::task::self().is_cancelled()) return; - for (size_t i = r.begin(); i != r.end(); ++i) { - if (pred(i)) { - ret = true; - tbb::task::self().cancel_group_execution(); - } - } - }); -#endif -#else - ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool { - bool localret = false; - for (auto i=r.begin(); i<r.end(); ++i) { - localret |= pred(i); - } - return localret; - }, - std::bit_or<bool>() - ); -#endif - - return ret; - } - -} // end namespace diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp deleted file mode 100644 index acddc0ff81..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_filter.h" -#include "../sys/regression.h" -#include <map> - -namespace embree -{ - struct parallel_filter_regression_test : public RegressionTest - { - parallel_filter_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - auto pred = [&]( uint32_t v ) { return (v & 0x3) == 0; }; - - for (size_t N=10; N<1000000; N=size_t(2.1*N)) - { - size_t N0 = rand() % N; - - /* initialize array with random numbers */ - std::vector<uint32_t> src(N); - std::map<uint32_t,int> m; - for (size_t i=0; i<N; i++) src[i] = rand(); - - /* count elements up */ - for (size_t i=N0; i<N; i++) - if (pred(src[i])) - m[src[i]] = 0; - for (size_t i=N0; i<N; i++) - if (pred(src[i])) - m[src[i]]++; - - /* filter array */ - //size_t M = sequential_filter(src.data(),N0,N,pred); - size_t M = parallel_filter(src.data(),N0,N,size_t(1024),pred); - - /* check if filtered data is correct */ - for (size_t i=N0; i<M; i++) { - passed &= pred(src[i]); - m[src[i]]--; - } - for (size_t i=N0; i<M; i++) - passed &= (m[src[i]] == 0); - } - - return passed; - } - }; - - parallel_filter_regression_test parallel_filter_regression("parallel_filter_regression"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h deleted file mode 100644 index 5823fc631f..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" - -namespace embree -{ - template<typename Ty, typename Index, typename Predicate> - inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate) - { - Index j = first; - for (Index i=first; i<last; i++) - if (predicate(data[i])) - data[j++] = data[i]; - - return j; - } - - template<typename Ty, typename Index, typename Predicate> - inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate) - { - /* sequential fallback */ - if (end-begin <= minStepSize) - return sequential_filter(data,begin,end,predicate); - - /* calculate number of tasks to use */ - enum { MAX_TASKS = 64 }; - const Index numThreads = TaskScheduler::threadCount(); - const Index numBlocks = (end-begin+minStepSize-1)/minStepSize; - const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS); - - /* filter blocks */ - Index nused[MAX_TASKS]; - Index nfree[MAX_TASKS]; - parallel_for(taskCount, [&](const Index taskIndex) - { - const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount; - const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount; - const Index i2 = sequential_filter(data,i0,i1,predicate); - nused[taskIndex] = i2-i0; - nfree[taskIndex] = i1-i2; - }); - - /* calculate offsets */ - Index sused=0; - Index sfree=0; - Index pfree[MAX_TASKS]; - for (Index i=0; i<taskCount; i++) - { - sused+=nused[i]; - Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree; - } - - /* return if we did not filter out any element */ - assert(sfree <= end-begin); - assert(sused <= end-begin); - if (sused == end-begin) - return end; - - /* otherwise we have to copy misplaced elements around */ - parallel_for(taskCount, [&](const Index taskIndex) - { - /* destination to write elements to */ - Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex]; - Index dst_end = min(dst+nfree[taskIndex],begin+sused); - if (dst_end <= dst) return; - - /* range of misplaced elements to copy to destination */ - Index r0 = pfree[taskIndex]; - Index r1 = r0+dst_end-dst; - - /* find range in misplaced elements in back to front order */ - Index k0=0; - for (Index i=taskCount-1; i>0; i--) - { - if (k0 > r1) break; - Index k1 = k0+nused[i]; - Index src = begin+(i+0)*(end-begin)/taskCount+nused[i]; - for (Index i=max(r0,k0); i<min(r1,k1); i++) { - Index isrc = src-i+k0-1; - assert(dst >= begin && dst < end); - assert(isrc >= begin && isrc < end); - data[dst++] = data[isrc]; - } - k0 = k1; - } - }); - - return begin+sused; - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp deleted file mode 100644 index ef070ebc4d..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_for.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_for_regression_test : public RegressionTest - { - parallel_for_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - const size_t M = 10; - for (size_t N=10; N<10000000; N=size_t(2.1*N)) - { - /* sequentially calculate sum of squares */ - size_t sum0 = 0; - for (size_t i=0; i<N; i++) { - sum0 += i*i; - } - - /* parallel calculation of sum of squares */ - for (size_t m=0; m<M; m++) - { - std::atomic<size_t> sum1(0); - parallel_for( size_t(0), size_t(N), size_t(1024), [&](const range<size_t>& r) - { - size_t s = 0; - for (size_t i=r.begin(); i<r.end(); i++) - s += i*i; - sum1 += s; - }); - passed = sum0 == sum1; - } - } - - return passed; - } - }; - - parallel_for_regression_test parallel_for_regression("parallel_for_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h deleted file mode 100644 index 51d296fb16..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h +++ /dev/null @@ -1,229 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../tasking/taskscheduler.h" -#include "../sys/array.h" -#include "../math/math.h" -#include "../math/range.h" - -#if defined(TASKING_GCD) && defined(BUILD_IOS) -#include <dispatch/dispatch.h> -#include <algorithm> -#include <type_traits> -#endif - -namespace embree -{ - /* parallel_for without range */ - template<typename Index, typename Func> - __forceinline void parallel_for( const Index N, const Func& func) - { -#if defined(TASKING_INTERNAL) - if (N) { - TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) { - assert(r.size() == 1); - func(r.begin()); - }); - if (!TaskScheduler::wait()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - } -#elif defined(TASKING_GCD) && defined(BUILD_IOS) - - const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1; - const size_t length = N; - const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks; - const size_t numBlocks = (length + blockSize-1) / blockSize; - - dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { - - const size_t start = (currentBlock * blockSize); - const size_t blockLength = std::min(length - start, blockSize); - const size_t end = start + blockLength; - - for(size_t i=start; i < end; i++) - { - func(i); - } - }); - -#elif defined(TASKING_TBB) - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - }); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - -#elif defined(TASKING_PPL) - concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - }); -#else -# error "no tasking system enabled" -#endif - } - - /* parallel for with range and granulatity */ - template<typename Index, typename Func> - __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func) - { - assert(first <= last); -#if defined(TASKING_INTERNAL) - TaskScheduler::spawn(first,last,minStepSize,func); - if (!TaskScheduler::wait()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - -#elif defined(TASKING_GCD) && defined(BUILD_IOS) - - const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1; - const size_t length = last - first; - const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks; - size_t blockSize = std::max<size_t>(minStepSize,blockSizeByThreads); - blockSize += blockSize % 4; - - const size_t numBlocks = (length + blockSize-1) / blockSize; - - dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { - - const size_t start = first + (currentBlock * blockSize); - const size_t end = std::min<size_t>(last, start + blockSize); - - func( embree::range<Index>(start,end) ); - }); - - -#elif defined(TASKING_TBB) - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) { - func(range<Index>(r.begin(),r.end())); - },context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) { - func(range<Index>(r.begin(),r.end())); - }); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - -#elif defined(TASKING_PPL) - concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { - func(range<Index>(i,i+1)); - }); - -#else -# error "no tasking system enabled" -#endif - } - - /* parallel for with range */ - template<typename Index, typename Func> - __forceinline void parallel_for( const Index first, const Index last, const Func& func) - { - assert(first <= last); - parallel_for(first,last,(Index)1,func); - } - -#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001) - - template<typename Index, typename Func> - __forceinline void parallel_for_static( const Index N, const Func& func) - { - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },tbb::simple_partitioner(),context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },tbb::simple_partitioner()); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - } - - typedef tbb::affinity_partitioner affinity_partitioner; - - template<typename Index, typename Func> - __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap) - { - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },ap,context); - if (context.is_group_execution_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #else - tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { - func(i); - },ap); - if (tbb::task::self().is_cancelled()) - // -- GODOT start -- - // throw std::runtime_error("task cancelled"); - abort(); - // -- GODOT end -- - #endif - } - -#else - - template<typename Index, typename Func> - __forceinline void parallel_for_static( const Index N, const Func& func) - { - parallel_for(N,func); - } - - struct affinity_partitioner { - }; - - template<typename Index, typename Func> - __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) - { - parallel_for(N,func); - } - -#endif -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp deleted file mode 100644 index 0337611b35..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_for_for.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_for_for_regression_test : public RegressionTest - { - parallel_for_for_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create vector with random numbers */ - size_t sum0 = 0; - size_t K = 0; - const size_t M = 1000; - std::vector<std::vector<size_t>* > array2(M); - for (size_t i=0; i<M; i++) { - const size_t N = rand() % 1024; - K+=N; - array2[i] = new std::vector<size_t>(N); - for (size_t j=0; j<N; j++) - sum0 += (*array2[i])[j] = rand(); - } - - /* array to test global index */ - std::vector<atomic<size_t>> verify_k(K); - for (size_t i=0; i<K; i++) verify_k[i].store(0); - - /* add all numbers using parallel_for_for */ - std::atomic<size_t> sum1(0); - parallel_for_for( array2, size_t(1), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i<r.end(); i++) { - s += (*v)[i]; - verify_k[k++]++; - } - sum1 += s; - return sum1; - }); - passed &= (sum0 == sum1); - - /* check global index */ - for (size_t i=0; i<K; i++) - passed &= (verify_k[i] == 1); - - /* delete vectors again */ - for (size_t i=0; i<array2.size(); i++) - delete array2[i]; - - return passed; - } - }; - - parallel_for_for_regression_test parallel_for_for_regression("parallel_for_for_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h deleted file mode 100644 index 852b8a0900..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" - -namespace embree -{ - template<typename ArrayArray, typename Func> - __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) - { - size_t k=0; - for (size_t i=0; i!=array2.size(); ++i) { - const size_t N = array2[i]->size(); - if (N) func(array2[i],range<size_t>(0,N),k); - k+=N; - } - } - - class ParallelForForState - { - public: - - enum { MAX_TASKS = 64 }; - - __forceinline ParallelForForState () - : taskCount(0) {} - - template<typename ArrayArray> - __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { - init(array2,minStepSize); - } - - template<typename ArrayArray> - __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) - { - /* first calculate total number of elements */ - size_t N = 0; - for (size_t i=0; i<array2.size(); i++) { - N += array2[i] ? array2[i]->size() : 0; - } - this->N = N; - - /* calculate number of tasks to use */ - const size_t numThreads = TaskScheduler::threadCount(); - const size_t numBlocks = (N+minStepSize-1)/minStepSize; - taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS))); - - /* calculate start (i,j) for each task */ - size_t taskIndex = 0; - i0[taskIndex] = 0; - j0[taskIndex] = 0; - size_t k0 = (++taskIndex)*N/taskCount; - for (size_t i=0, k=0; taskIndex < taskCount; i++) - { - assert(i<array2.size()); - size_t j=0, M = array2[i] ? array2[i]->size() : 0; - while (j<M && k+M-j >= k0 && taskIndex < taskCount) { - assert(taskIndex<taskCount); - i0[taskIndex] = i; - j0[taskIndex] = j += k0-k; - k=k0; - k0 = (++taskIndex)*N/taskCount; - } - k+=M-j; - } - } - - __forceinline size_t size() const { - return N; - } - - public: - size_t i0[MAX_TASKS]; - size_t j0[MAX_TASKS]; - size_t taskCount; - size_t N; - }; - - template<typename ArrayArray, typename Func> - __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) - { - ParallelForForState state(array2,minStepSize); - - parallel_for(state.taskCount, [&](const size_t taskIndex) - { - /* calculate range */ - const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; - const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - for (size_t i=i0; k<k1; i++) { - const size_t N = array2[i] ? array2[i]->size() : 0; - const size_t r0 = j0, r1 = min(N,r0+k1-k); - if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k); - k+=r1-r0; j0 = 0; - } - }); - } - - template<typename ArrayArray, typename Func> - __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) - { - parallel_for_for(array2,1,func); - } - - template<typename ArrayArray, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { - ParallelForForState state(array2,minStepSize); - Value temp[ParallelForForState::MAX_TASKS]; - - for (size_t i=0; i<state.taskCount; i++) - temp[i] = identity; - - parallel_for(state.taskCount, [&](const size_t taskIndex) - { - /* calculate range */ - const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; - const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - for (size_t i=i0; k<k1; i++) { - const size_t N = array2[i] ? array2[i]->size() : 0; - const size_t r0 = j0, r1 = min(N,r0+k1-k); - if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k)); - k+=r1-r0; j0 = 0; - } - }); - - Value ret = identity; - for (size_t i=0; i<state.taskCount; i++) - ret = reduction(ret,temp[i]); - return ret; - } - - template<typename ArrayArray, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) - { - return parallel_for_for_reduce(array2,1,identity,func,reduction); - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp deleted file mode 100644 index 0169d8e481..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_for_for_prefix_sum.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_for_for_prefix_sum_regression_test : public RegressionTest - { - parallel_for_for_prefix_sum_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create vector with random numbers */ - const size_t M = 10; - std::vector<atomic<size_t>> flattened; - typedef std::vector<std::vector<size_t>* > ArrayArray; - ArrayArray array2(M); - size_t K = 0; - for (size_t i=0; i<M; i++) { - const size_t N = rand() % 10; - K += N; - array2[i] = new std::vector<size_t>(N); - for (size_t j=0; j<N; j++) - (*array2[i])[j] = rand() % 10; - } - - /* array to test global index */ - std::vector<atomic<size_t>> verify_k(K); - for (size_t i=0; i<K; i++) verify_k[i].store(0); - - ParallelForForPrefixSumState<size_t> state(array2,size_t(1)); - - /* dry run only counts */ - size_t S = parallel_for_for_prefix_sum0( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i<r.end(); i++) { - s += (*v)[i]; - verify_k[k++]++; - } - return s; - }, [](size_t v0, size_t v1) { return v0+v1; }); - - /* create properly sized output array */ - flattened.resize(S); - for (auto& a : flattened) a.store(0); - - /* now we actually fill the flattened array */ - parallel_for_for_prefix_sum1( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i, const size_t base) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i<r.end(); i++) { - for (size_t j=0; j<(*v)[i]; j++) { - flattened[base+s+j]++; - } - s += (*v)[i]; - verify_k[k++]++; - } - return s; - }, [](size_t v0, size_t v1) { return v0+v1; }); - - /* check global index */ - for (size_t i=0; i<K; i++) - passed &= (verify_k[i] == 2); - - /* check if each element was assigned exactly once */ - for (size_t i=0; i<flattened.size(); i++) - passed &= (flattened[i] == 1); - - /* delete arrays again */ - for (size_t i=0; i<array2.size(); i++) - delete array2[i]; - - return passed; - } - }; - - parallel_for_for_prefix_sum_regression_test parallel_for_for_prefix_sum_regression("parallel_for_for_prefix_sum_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h deleted file mode 100644 index d2671d8a6a..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for_for.h" -#include "parallel_prefix_sum.h" - -namespace embree -{ - template<typename Value> - struct ParallelForForPrefixSumState : public ParallelForForState - { - __forceinline ParallelForForPrefixSumState () {} - - template<typename ArrayArray> - __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize) - : ParallelForForState(array2,minStepSize) {} - - ParallelPrefixSumState<Value> prefix_state; - }; - - template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, - const Value& identity, const Func& func, const Reduction& reduction) - { - /* calculate number of tasks to use */ - const size_t taskCount = state.taskCount; - /* perform parallel prefix sum */ - parallel_for(taskCount, [&](const size_t taskIndex) - { - const size_t k0 = (taskIndex+0)*state.size()/taskCount; - const size_t k1 = (taskIndex+1)*state.size()/taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - Value N=identity; - for (size_t i=i0; k<k1; i++) { - const size_t size = array2[i] ? array2[i]->size() : 0; - const size_t r0 = j0, r1 = min(size,r0+k1-k); - if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i)); - k+=r1-r0; j0 = 0; - } - state.prefix_state.counts[taskIndex] = N; - }); - - /* calculate prefix sum */ - Value sum=identity; - for (size_t i=0; i<taskCount; i++) - { - const Value c = state.prefix_state.counts[i]; - state.prefix_state.sums[i] = sum; - sum=reduction(sum,c); - } - - return sum; - } - - template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, - const Value& identity, const Func& func, const Reduction& reduction) - { - /* calculate number of tasks to use */ - const size_t taskCount = state.taskCount; - /* perform parallel prefix sum */ - parallel_for(taskCount, [&](const size_t taskIndex) - { - const size_t k0 = (taskIndex+0)*state.size()/taskCount; - const size_t k1 = (taskIndex+1)*state.size()/taskCount; - size_t i0 = state.i0[taskIndex]; - size_t j0 = state.j0[taskIndex]; - - /* iterate over arrays */ - size_t k=k0; - Value N=identity; - for (size_t i=i0; k<k1; i++) { - const size_t size = array2[i] ? array2[i]->size() : 0; - const size_t r0 = j0, r1 = min(size,r0+k1-k); - if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N))); - k+=r1-r0; j0 = 0; - } - state.prefix_state.counts[taskIndex] = N; - }); - - /* calculate prefix sum */ - Value sum=identity; - for (size_t i=0; i<taskCount; i++) - { - const Value c = state.prefix_state.counts[i]; - state.prefix_state.sums[i] = sum; - sum=reduction(sum,c); - } - - return sum; - } - - template<typename ArrayArray, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, - const Value& identity, const Func& func, const Reduction& reduction) - { - return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction); - } - - template<typename ArrayArray, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, - const Value& identity, const Func& func, const Reduction& reduction) - { - return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction); - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp deleted file mode 100644 index 09dc303f81..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_map.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_map_regression_test : public RegressionTest - { - parallel_map_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create key/value vectors with random numbers */ - const size_t N = 10000; - std::vector<uint32_t> keys(N); - std::vector<uint32_t> vals(N); - for (size_t i=0; i<N; i++) keys[i] = 2*unsigned(i)*647382649; - for (size_t i=0; i<N; i++) std::swap(keys[i],keys[rand()%N]); - for (size_t i=0; i<N; i++) vals[i] = 2*rand(); - - /* create map */ - parallel_map<uint32_t,uint32_t> map; - map.init(keys,vals); - - /* check that all keys are properly mapped */ - for (size_t i=0; i<N; i++) { - const uint32_t* val = map.lookup(keys[i]); - passed &= val && (*val == vals[i]); - } - - /* check that these keys are not in the map */ - for (size_t i=0; i<N; i++) { - passed &= !map.lookup(keys[i]+1); - } - - return passed; - } - }; - - parallel_map_regression_test parallel_map_regression("parallel_map_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.h b/thirdparty/embree-aarch64/common/algorithms/parallel_map.h deleted file mode 100644 index 02e1a8f8d0..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_map.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_sort.h" - -namespace embree -{ - /*! implementation of a key/value map with parallel construction */ - template<typename Key, typename Val> - class parallel_map - { - /* key/value pair to build the map */ - struct KeyValue - { - __forceinline KeyValue () {} - - __forceinline KeyValue (const Key key, const Val val) - : key(key), val(val) {} - - __forceinline operator Key() const { - return key; - } - - public: - Key key; - Val val; - }; - - public: - - /*! parallel map constructors */ - parallel_map () {} - - /*! construction from pair of vectors */ - template<typename KeyVector, typename ValVector> - parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); } - - /*! initialized the parallel map from a vector with keys and values */ - template<typename KeyVector, typename ValVector> - void init(const KeyVector& keys, const ValVector& values) - { - /* reserve sufficient space for all data */ - assert(keys.size() == values.size()); - vec.resize(keys.size()); - - /* generate key/value pairs */ - parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) { - for (size_t i=r.begin(); i<r.end(); i++) - vec[i] = KeyValue((Key)keys[i],values[i]); - }); - - /* perform parallel radix sort of the key/value pairs */ - std::vector<KeyValue> temp(keys.size()); - radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size()); - } - - /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */ - __forceinline const Val* lookup(const Key& key) const - { - typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); - if (i == vec.end()) return nullptr; - if (i->key != key) return nullptr; - return &i->val; - } - - /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */ - __forceinline Val lookup(const Key& key, const Val& def) const - { - typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); - if (i == vec.end()) return def; - if (i->key != key) return def; - return i->val; - } - - /*! clears all state */ - void clear() { - vec.clear(); - } - - private: - std::vector<KeyValue> vec; //!< vector containing sorted elements - }; -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp deleted file mode 100644 index eb20c4465d..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_partition.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_partition_regression_test : public RegressionTest - { - parallel_partition_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - for (size_t i=0; i<100; i++) - { - /* create random permutation */ - size_t N = std::rand() % 1000000; - std::vector<unsigned> array(N); - for (unsigned i=0; i<N; i++) array[i] = i; - for (auto& v : array) std::swap(v,array[std::rand()%array.size()]); - size_t split = std::rand() % (N+1); - - /* perform parallel partitioning */ - size_t left_sum = 0, right_sum = 0; - size_t mid = parallel_partitioning(array.data(),0,array.size(),0,left_sum,right_sum, - [&] ( size_t i ) { return i < split; }, - [] ( size_t& sum, unsigned v) { sum += v; }, - [] ( size_t& sum, size_t v) { sum += v; }, - 128); - - /*serial_partitioning(array.data(),0,array.size(),left_sum,right_sum, - [&] ( size_t i ) { return i < split; }, - [] ( size_t& left_sum, int v) { left_sum += v; });*/ - - /* verify result */ - passed &= mid == split; - passed &= left_sum == split*(split-1)/2; - passed &= right_sum == N*(N-1)/2-left_sum; - for (size_t i=0; i<split; i++) passed &= array[i] < split; - for (size_t i=split; i<N; i++) passed &= array[i] >= split; - } - - return passed; - } - }; - - parallel_partition_regression_test parallel_partition_regression("parallel_partition_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h deleted file mode 100644 index 3b3ad7c854..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h +++ /dev/null @@ -1,283 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" -#include "../math/range.h" - -namespace embree -{ - /* serial partitioning */ - template<typename T, typename V, typename IsLeft, typename Reduction_T> - __forceinline size_t serial_partitioning(T* array, - const size_t begin, - const size_t end, - V& leftReduction, - V& rightReduction, - const IsLeft& is_left, - const Reduction_T& reduction_t) - { - T* l = array + begin; - T* r = array + end - 1; - - while(1) - { - /* *l < pivot */ - while (likely(l <= r && is_left(*l) )) - { - //prefetchw(l+4); // FIXME: enable? - reduction_t(leftReduction,*l); - ++l; - } - /* *r >= pivot) */ - while (likely(l <= r && !is_left(*r))) - { - //prefetchw(r-4); FIXME: enable? - reduction_t(rightReduction,*r); - --r; - } - if (r<l) break; - - reduction_t(leftReduction ,*r); - reduction_t(rightReduction,*l); - xchg(*l,*r); - l++; r--; - } - - return l - array; - } - - template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> - class __aligned(64) parallel_partition_task - { - ALIGNED_CLASS_(64); - private: - - static const size_t MAX_TASKS = 64; - - T* array; - size_t N; - const IsLeft& is_left; - const Reduction_T& reduction_t; - const Reduction_V& reduction_v; - const Vi& identity; - - size_t numTasks; - __aligned(64) size_t counter_start[MAX_TASKS+1]; - __aligned(64) size_t counter_left[MAX_TASKS+1]; - __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS]; - __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; - __aligned(64) V leftReductions[MAX_TASKS]; - __aligned(64) V rightReductions[MAX_TASKS]; - - public: - - __forceinline parallel_partition_task(T* array, - const size_t N, - const Vi& identity, - const IsLeft& is_left, - const Reduction_T& reduction_t, - const Reduction_V& reduction_v, - const size_t BLOCK_SIZE) - - : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), - numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {} - - __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges) - { - size_t i = 0; - while(index >= (size_t)r[i].size()) - { - assert(i < numRanges); - index -= (size_t)r[i].size(); - i++; - } - return &r[i]; - } - - __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, - const size_t numRightMisplacedRanges, - const size_t startID, - const size_t endID) - { - size_t leftLocalIndex = startID; - size_t rightLocalIndex = startID; - const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges); - const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges); - - size_t l_left = l_range->size() - leftLocalIndex; - size_t r_left = r_range->size() - rightLocalIndex; - T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; - T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; - size_t size = endID - startID; - size_t items = min(size,min(l_left,r_left)); - - while (size) - { - if (unlikely(l_left == 0)) - { - l_range++; - l_left = l_range->size(); - l = &array[l_range->begin()]; - items = min(size,min(l_left,r_left)); - } - - if (unlikely(r_left == 0)) - { - r_range++; - r_left = r_range->size(); - r = &array[r_range->begin()]; - items = min(size,min(l_left,r_left)); - } - - size -= items; - l_left -= items; - r_left -= items; - - while(items) { - items--; - xchg(*l++,*r++); - } - } - } - - __forceinline size_t partition(V& leftReduction, V& rightReduction) - { - /* partition the individual ranges for each task */ - parallel_for(numTasks,[&] (const size_t taskID) { - const size_t startID = (taskID+0)*N/numTasks; - const size_t endID = (taskID+1)*N/numTasks; - V local_left(identity); - V local_right(identity); - const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); - counter_start[taskID] = startID; - counter_left [taskID] = mid-startID; - leftReductions[taskID] = local_left; - rightReductions[taskID] = local_right; - }); - counter_start[numTasks] = N; - counter_left[numTasks] = 0; - - /* finalize the reductions */ - for (size_t i=0; i<numTasks; i++) { - reduction_v(leftReduction,leftReductions[i]); - reduction_v(rightReduction,rightReductions[i]); - } - - /* calculate mid point for partitioning */ - size_t mid = counter_left[0]; - for (size_t i=1; i<numTasks; i++) - mid += counter_left[i]; - const range<ssize_t> globalLeft (0,mid); - const range<ssize_t> globalRight(mid,N); - - /* calculate all left and right ranges that are on the wrong global side */ - size_t numMisplacedRangesLeft = 0; - size_t numMisplacedRangesRight = 0; - size_t numMisplacedItemsLeft = 0; - size_t numMisplacedItemsRight = 0; - - for (size_t i=0; i<numTasks; i++) - { - const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]); - const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]); - const range<ssize_t> left_misplaced = globalLeft. intersect(right_range); - const range<ssize_t> right_misplaced = globalRight.intersect(left_range); - - if (!left_misplaced.empty()) - { - numMisplacedItemsLeft += left_misplaced.size(); - leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; - } - - if (!right_misplaced.empty()) - { - numMisplacedItemsRight += right_misplaced.size(); - rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; - } - } - assert( numMisplacedItemsLeft == numMisplacedItemsRight ); - - /* if no items are misplaced we are done */ - if (numMisplacedItemsLeft == 0) - return mid; - - /* otherwise we copy the items to the right place in parallel */ - parallel_for(numTasks,[&] (const size_t taskID) { - const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; - const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks; - swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID); - }); - - return mid; - } - }; - - template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> - __noinline size_t parallel_partitioning(T* array, - const size_t begin, - const size_t end, - const Vi &identity, - V &leftReduction, - V &rightReduction, - const IsLeft& is_left, - const Reduction_T& reduction_t, - const Reduction_V& reduction_v, - size_t BLOCK_SIZE = 128) - { - /* fall back to single threaded partitioning for small N */ - if (unlikely(end-begin < BLOCK_SIZE)) - return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); - - /* otherwise use parallel code */ - else { - typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; - std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); - return begin+p->partition(leftReduction,rightReduction); - } - } - - template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> - __noinline size_t parallel_partitioning(T* array, - const size_t begin, - const size_t end, - const Vi &identity, - V &leftReduction, - V &rightReduction, - const IsLeft& is_left, - const Reduction_T& reduction_t, - const Reduction_V& reduction_v, - size_t BLOCK_SIZE, - size_t PARALLEL_THRESHOLD) - { - /* fall back to single threaded partitioning for small N */ - if (unlikely(end-begin < PARALLEL_THRESHOLD)) - return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); - - /* otherwise use parallel code */ - else { - typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; - std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); - return begin+p->partition(leftReduction,rightReduction); - } - } - - - template<typename T, typename IsLeft> - inline size_t parallel_partitioning(T* array, - const size_t begin, - const size_t end, - const IsLeft& is_left, - size_t BLOCK_SIZE = 128) - { - size_t leftReduction = 0; - size_t rightReduction = 0; - return parallel_partitioning( - array,begin,end,0,leftReduction,rightReduction,is_left, - [] (size_t& t,const T& ref) { }, - [] (size_t& t0,size_t& t1) { }, - BLOCK_SIZE); - } - -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp deleted file mode 100644 index 685952c3dc..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_prefix_sum.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_prefix_sum_regression_test : public RegressionTest - { - parallel_prefix_sum_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - const size_t M = 10; - - for (size_t N=10; N<10000000; N=size_t(2.1*N)) - { - /* initialize array with random numbers */ - uint32_t sum0 = 0; - std::vector<uint32_t> src(N); - for (size_t i=0; i<N; i++) { - sum0 += src[i] = rand(); - } - - /* calculate parallel prefix sum */ - std::vector<uint32_t> dst(N); - for (auto& v : dst) v = 0; - - for (size_t i=0; i<M; i++) { - uint32_t sum1 = parallel_prefix_sum(src,dst,N,0,std::plus<uint32_t>()); - passed &= (sum0 == sum1); - } - - /* check if prefix sum is correct */ - for (size_t i=0, sum=0; i<N; sum+=src[i++]) - passed &= ((uint32_t)sum == dst[i]); - } - - return passed; - } - }; - - parallel_prefix_sum_regression_test parallel_prefix_sum_regression("parallel_prefix_sum_regression"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h deleted file mode 100644 index 117c7a79b0..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" - -namespace embree -{ - template<typename Value> - struct ParallelPrefixSumState - { - enum { MAX_TASKS = 64 }; - Value counts[MAX_TASKS]; - Value sums [MAX_TASKS]; - }; - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction) - { - /* calculate number of tasks to use */ - const size_t numThreads = TaskScheduler::threadCount(); - const size_t numBlocks = (last-first+minStepSize-1)/minStepSize; - const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS)); - - /* perform parallel prefix sum */ - parallel_for(taskCount, [&](const size_t taskIndex) - { - const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; - const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; - state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]); - }); - - /* calculate prefix sum */ - Value sum=identity; - for (size_t i=0; i<taskCount; i++) - { - const Value c = state.counts[i]; - state.sums[i] = sum; - sum=reduction(sum,c); - } - - return sum; - } - - /*! parallel calculation of prefix sums */ - template<typename SrcArray, typename DstArray, typename Value, typename Add> - __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) - { - /* perform single threaded prefix operation for small N */ - if (N < SINGLE_THREAD_THRESHOLD) - { - Value sum=identity; - for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum; - return sum; - } - - /* perform parallel prefix operation for large N */ - else - { - ParallelPrefixSumState<Value> state; - - /* initial run just sets up start values for subtasks */ - parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { - - Value s = identity; - for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]); - return s; - - }, add); - - /* final run calculates prefix sum */ - return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { - - Value s = identity; - for (size_t i=r.begin(); i<r.end(); i++) { - dst[i] = add(sum,s); - s = add(s,src[i]); - } - return s; - - }, add); - } - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp deleted file mode 100644 index 331fe4288e..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_reduce.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_reduce_regression_test : public RegressionTest - { - parallel_reduce_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - const size_t M = 10; - for (size_t N=10; N<10000000; N=size_t(2.1*N)) - { - /* sequentially calculate sum of squares */ - size_t sum0 = 0; - for (size_t i=0; i<N; i++) { - sum0 += i*i; - } - - /* parallel calculation of sum of squares */ - for (size_t m=0; m<M; m++) - { - size_t sum1 = parallel_reduce( size_t(0), size_t(N), size_t(1024), size_t(0), [&](const range<size_t>& r) -> size_t - { - size_t s = 0; - for (size_t i=r.begin(); i<r.end(); i++) - s += i*i; - return s; - }, - [](const size_t v0, const size_t v1) { - return v0+v1; - }); - passed = sum0 == sum1; - } - } - return passed; - } - }; - - parallel_reduce_regression_test parallel_reduce_regression("parallel_reduce_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h deleted file mode 100644 index 0daf94e50e..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_for.h" - -namespace embree -{ - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) - { - return func(range<Index>(first,last)); - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { - return func(range<Index>(first,last)); - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { - const Index maxTasks = 512; - const Index threadCount = (Index) TaskScheduler::threadCount(); - taskCount = min(taskCount,threadCount,maxTasks); - - /* parallel invokation of all tasks */ - dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack - parallel_for(taskCount, [&](const Index taskIndex) { - const Index k0 = first+(taskIndex+0)*(last-first)/taskCount; - const Index k1 = first+(taskIndex+1)*(last-first)/taskCount; - values[taskIndex] = func(range<Index>(k0,k1)); - }); - - /* perform reduction over all tasks */ - Value v = identity; - for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]); - return v; - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) - { -#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS)) - - /* fast path for small number of iterations */ - Index taskCount = (last-first+minStepSize-1)/minStepSize; - if (likely(taskCount == 1)) { - return func(range<Index>(first,last)); - } - return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction); - -#elif defined(TASKING_TBB) - #if TBB_INTERFACE_VERSION >= 12002 - tbb::task_group_context context; - const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, - [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, - reduction,context); - // -- GODOT start -- - // if (context.is_group_execution_cancelled()) - // throw std::runtime_error("task cancelled"); - // -- GODOT end -- - return v; - #else - const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, - [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, - reduction); - // -- GODOT start -- - // if (tbb::task::self().is_cancelled()) - // throw std::runtime_error("task cancelled"); - // -- GODOT end -- - return v; - #endif -#else // TASKING_PPL - struct AlignedValue - { - char storage[__alignof(Value)+sizeof(Value)]; - static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; - Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); } - const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); } - AlignedValue(const Value& v) { new(getValuePtr()) Value(v); } - AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); } - AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); }; - AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; - AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; - operator Value() const { return *getValuePtr(); } - }; - - struct Iterator_Index - { - Index v; - typedef std::forward_iterator_tag iterator_category; - typedef AlignedValue value_type; - typedef Index difference_type; - typedef Index distance_type; - typedef AlignedValue* pointer; - typedef AlignedValue& reference; - __forceinline Iterator_Index() {} - __forceinline Iterator_Index(Index v) : v(v) {} - __forceinline bool operator== (Iterator_Index other) { return v == other.v; } - __forceinline bool operator!= (Iterator_Index other) { return v != other.v; } - __forceinline Iterator_Index operator++() { return Iterator_Index(++v); } - __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); } - }; - - auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) { - assert(begin.v < end.v); - return reduction(start, func(range<Index>(begin.v, end.v))); - }; - const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction); - return v; -#endif - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) - { - if (likely(last-first < parallel_threshold)) { - return func(range<Index>(first,last)); - } else { - return parallel_reduce(first,last,minStepSize,identity,func,reduction); - } - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) - { - return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction); - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) - { - auto funcr = [&] ( const range<Index> r ) { - Value v = identity; - for (Index i=r.begin(); i<r.end(); i++) - v = reduction(v,func(i)); - return v; - }; - return parallel_reduce(first,last,Index(1),identity,funcr,reduction); - } - - template<typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction ) - { - return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction); - } -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp deleted file mode 100644 index 20b639c1c9..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_set.h" -#include "../sys/regression.h" - -namespace embree -{ - struct parallel_set_regression_test : public RegressionTest - { - parallel_set_regression_test(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - - /* create vector with random numbers */ - const size_t N = 10000; - std::vector<uint32_t> unsorted(N); - for (size_t i=0; i<N; i++) unsorted[i] = 2*rand(); - - /* created set from numbers */ - parallel_set<uint32_t> sorted; - sorted.init(unsorted); - - /* check that all elements are in the set */ - for (size_t i=0; i<N; i++) { - passed &= sorted.lookup(unsorted[i]); - } - - /* check that these elements are not in the set */ - for (size_t i=0; i<N; i++) { - passed &= !sorted.lookup(unsorted[i]+1); - } - - return passed; - } - }; - - parallel_set_regression_test parallel_set_regression("parallel_set_regression_test"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.h b/thirdparty/embree-aarch64/common/algorithms/parallel_set.h deleted file mode 100644 index 640beba7ec..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_set.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "parallel_sort.h" - -namespace embree -{ - /* implementation of a set of values with parallel construction */ - template<typename T> - class parallel_set - { - public: - - /*! default constructor for the parallel set */ - parallel_set () {} - - /*! construction from vector */ - template<typename Vector> - parallel_set (const Vector& in) { init(in); } - - /*! initialized the parallel set from a vector */ - template<typename Vector> - void init(const Vector& in) - { - /* copy data to internal vector */ - vec.resize(in.size()); - parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) { - for (size_t i=r.begin(); i<r.end(); i++) - vec[i] = in[i]; - }); - - /* sort the data */ - std::vector<T> temp(in.size()); - radix_sort<T>(vec.data(),temp.data(),vec.size()); - } - - /*! tests if some element is in the set */ - __forceinline bool lookup(const T& elt) const { - return std::binary_search(vec.begin(), vec.end(), elt); - } - - /*! clears all state */ - void clear() { - vec.clear(); - } - - private: - std::vector<T> vec; //!< vector containing sorted elements - }; -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp deleted file mode 100644 index 5e7ec79ac1..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "parallel_sort.h" -#include "../sys/regression.h" - -namespace embree -{ - template<typename Key> - struct RadixSortRegressionTest : public RegressionTest - { - RadixSortRegressionTest(const char* name) : RegressionTest(name) { - registerRegressionTest(this); - } - - bool run () - { - bool passed = true; - const size_t M = 10; - - for (size_t N=10; N<1000000; N=size_t(2.1*N)) - { - std::vector<Key> src(N); memset(src.data(),0,N*sizeof(Key)); - std::vector<Key> tmp(N); memset(tmp.data(),0,N*sizeof(Key)); - for (size_t i=0; i<N; i++) src[i] = uint64_t(rand())*uint64_t(rand()); - - /* calculate checksum */ - Key sum0 = 0; for (size_t i=0; i<N; i++) sum0 += src[i]; - - /* sort numbers */ - for (size_t i=0; i<M; i++) { - radix_sort<Key>(src.data(),tmp.data(),N); - } - - /* calculate checksum */ - Key sum1 = 0; for (size_t i=0; i<N; i++) sum1 += src[i]; - if (sum0 != sum1) passed = false; - - /* check if numbers are sorted */ - for (size_t i=1; i<N; i++) - passed &= src[i-1] <= src[i]; - } - - return passed; - } - }; - - RadixSortRegressionTest<uint32_t> test_u32("RadixSortRegressionTestU32"); - RadixSortRegressionTest<uint64_t> test_u64("RadixSortRegressionTestU64"); -} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h deleted file mode 100644 index a758227c1b..0000000000 --- a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h +++ /dev/null @@ -1,457 +0,0 @@ -// Copyright 2009-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "../simd/simd.h" -#include "parallel_for.h" -#if defined(TASKING_GCD) && defined(BUILD_IOS) -#include "../sys/alloc.h" -#endif -#include <algorithm> - -namespace embree -{ - template<class T> - __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) - { - for(size_t i = 1;i<length;++i) - { - T v = array[i]; - size_t j = i; - while(j > 0 && v < array[j-1]) - { - array[j] = array[j-1]; - --j; - } - array[j] = v; - } - } - - template<class T> - __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length) - { - for(size_t i = 1;i<length;++i) - { - T v = array[i]; - size_t j = i; - while(j > 0 && v > array[j-1]) - { - array[j] = array[j-1]; - --j; - } - array[j] = v; - } - } - - template<class T> - void quicksort_ascending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] > pivotvalue); - while (t[++left] < pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const int pivot = right; - quicksort_ascending(t, begin, pivot); - quicksort_ascending(t, pivot + 1, end); - } - } - - template<class T> - void quicksort_decending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] < pivotvalue); - while (t[++left] > pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const int pivot = right; - quicksort_decending(t, begin, pivot); - quicksort_decending(t, pivot + 1, end); - } - } - - - template<class T, ssize_t THRESHOLD> - void quicksort_insertionsort_ascending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const ssize_t size = end-begin+1; - if (likely(size <= THRESHOLD)) - { - insertionsort_ascending<T>(&t[begin],size); - } - else - { - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] > pivotvalue); - while (t[++left] < pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const ssize_t pivot = right; - quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot); - quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end); - } - } - } - - - template<class T, ssize_t THRESHOLD> - void quicksort_insertionsort_decending(T *__restrict__ t, - const ssize_t begin, - const ssize_t end) - { - if (likely(begin < end)) - { - const ssize_t size = end-begin+1; - if (likely(size <= THRESHOLD)) - { - insertionsort_decending<T>(&t[begin],size); - } - else - { - - const T pivotvalue = t[begin]; - ssize_t left = begin - 1; - ssize_t right = end + 1; - - while(1) - { - while (t[--right] < pivotvalue); - while (t[++left] > pivotvalue); - - if (left >= right) break; - - const T temp = t[right]; - t[right] = t[left]; - t[left] = temp; - } - - const ssize_t pivot = right; - quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot); - quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end); - } - } - } - - template<typename T> - static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) - { - static const unsigned int BITS = 8; - static const unsigned int BUCKETS = (1 << BITS); - static const unsigned int CMP_SORT_THRESHOLD = 16; - - __aligned(64) unsigned int count[BUCKETS]; - - /* clear buckets */ - for (size_t i=0;i<BUCKETS;i++) count[i] = 0; - - /* count buckets */ -#if defined(__INTEL_COMPILER) -#pragma nounroll -#endif - for (size_t i=0;i<num;i++) - count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++; - - /* prefix sums */ - __aligned(64) unsigned int head[BUCKETS]; - __aligned(64) unsigned int tail[BUCKETS]; - - head[0] = 0; - for (size_t i=1; i<BUCKETS; i++) - head[i] = head[i-1] + count[i-1]; - - for (size_t i=0; i<BUCKETS-1; i++) - tail[i] = head[i+1]; - - tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1]; - - assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]); - assert(tail[BUCKETS-1] == num); - - /* in-place swap */ - for (size_t i=0;i<BUCKETS;i++) - { - /* process bucket */ - while(head[i] < tail[i]) - { - T v = morton[head[i]]; - while(1) - { - const size_t b = (unsigned(v) >> shift) & (BUCKETS-1); - if (b == i) break; - std::swap(v,morton[head[b]++]); - } - assert((unsigned(v) >> shift & (BUCKETS-1)) == i); - morton[head[i]++] = v; - } - } - if (shift == 0) return; - - size_t offset = 0; - for (size_t i=0;i<BUCKETS;i++) - if (count[i]) - { - - for (size_t j=offset;j<offset+count[i]-1;j++) - assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i); - - if (unlikely(count[i] < CMP_SORT_THRESHOLD)) - insertionsort_ascending(morton + offset, count[i]); - else - radixsort32(morton + offset, count[i], shift-BITS); - - for (size_t j=offset;j<offset+count[i]-1;j++) - assert(morton[j] <= morton[j+1]); - - offset += count[i]; - } - } - - template<typename Ty, typename Key> - class ParallelRadixSort - { - static const size_t MAX_TASKS = 64; - static const size_t BITS = 8; - static const size_t BUCKETS = (1 << BITS); - typedef unsigned int TyRadixCount[BUCKETS]; - - template<typename T> - static bool compare(const T& v0, const T& v1) { - return (Key)v0 < (Key)v1; - } - - private: - ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement - ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement - - - public: - ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N) - : radixCount(nullptr), src(src), tmp(tmp), N(N) {} - - void sort(const size_t blockSize) - { - assert(blockSize > 0); - - /* perform single threaded sort for small N */ - if (N<=blockSize) // handles also special case of 0! - { - /* do inplace sort inside destination array */ - std::sort(src,src+N,compare<Ty>); - } - - /* perform parallel sort for large N */ - else - { - const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS)); - tbbRadixSort(numThreads); - } - } - - ~ParallelRadixSort() - { - alignedFree(radixCount); - radixCount = nullptr; - } - - private: - - void tbbRadixIteration0(const Key shift, - const Ty* __restrict const src, - Ty* __restrict const dst, - const size_t threadIndex, const size_t threadCount) - { - const size_t startID = (threadIndex+0)*N/threadCount; - const size_t endID = (threadIndex+1)*N/threadCount; - - /* mask to extract some number of bits */ - const Key mask = BUCKETS-1; - - /* count how many items go into the buckets */ - for (size_t i=0; i<BUCKETS; i++) - radixCount[threadIndex][i] = 0; - - /* iterate over src array and count buckets */ - unsigned int * __restrict const count = radixCount[threadIndex]; -#if defined(__INTEL_COMPILER) -#pragma nounroll -#endif - for (size_t i=startID; i<endID; i++) { -#if defined(__X86_64__) || defined(__aarch64__) - const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; -#else - const Key index = ((Key)src[i] >> shift) & mask; -#endif - count[index]++; - } - } - - void tbbRadixIteration1(const Key shift, - const Ty* __restrict const src, - Ty* __restrict const dst, - const size_t threadIndex, const size_t threadCount) - { - const size_t startID = (threadIndex+0)*N/threadCount; - const size_t endID = (threadIndex+1)*N/threadCount; - - /* mask to extract some number of bits */ - const Key mask = BUCKETS-1; - - /* calculate total number of items for each bucket */ - __aligned(64) unsigned int total[BUCKETS]; - /* - for (size_t i=0; i<BUCKETS; i++) - total[i] = 0; - */ - for (size_t i=0; i<BUCKETS; i+=VSIZEX) - vintx::store(&total[i], zero); - - for (size_t i=0; i<threadCount; i++) - { - /* - for (size_t j=0; j<BUCKETS; j++) - total[j] += radixCount[i][j]; - */ - for (size_t j=0; j<BUCKETS; j+=VSIZEX) - vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j])); - } - - /* calculate start offset of each bucket */ - __aligned(64) unsigned int offset[BUCKETS]; - offset[0] = 0; - for (size_t i=1; i<BUCKETS; i++) - offset[i] = offset[i-1] + total[i-1]; - - /* calculate start offset of each bucket for this thread */ - for (size_t i=0; i<threadIndex; i++) - { - /* - for (size_t j=0; j<BUCKETS; j++) - offset[j] += radixCount[i][j]; - */ - for (size_t j=0; j<BUCKETS; j+=VSIZEX) - vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j])); - } - - /* copy items into their buckets */ -#if defined(__INTEL_COMPILER) -#pragma nounroll -#endif - for (size_t i=startID; i<endID; i++) { - const Ty elt = src[i]; -#if defined(__X86_64__) || defined(__aarch64__) - const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; -#else - const size_t index = ((Key)src[i] >> shift) & mask; -#endif - dst[offset[index]++] = elt; - } - } - - void tbbRadixIteration(const Key shift, const bool last, - const Ty* __restrict src, Ty* __restrict dst, - const size_t numTasks) - { - affinity_partitioner ap; - parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap); - parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap); - } - - void tbbRadixSort(const size_t numTasks) - { - radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64); - - if (sizeof(Key) == sizeof(uint32_t)) { - tbbRadixIteration(0*BITS,0,src,tmp,numTasks); - tbbRadixIteration(1*BITS,0,tmp,src,numTasks); - tbbRadixIteration(2*BITS,0,src,tmp,numTasks); - tbbRadixIteration(3*BITS,1,tmp,src,numTasks); - } - else if (sizeof(Key) == sizeof(uint64_t)) - { - tbbRadixIteration(0*BITS,0,src,tmp,numTasks); - tbbRadixIteration(1*BITS,0,tmp,src,numTasks); - tbbRadixIteration(2*BITS,0,src,tmp,numTasks); - tbbRadixIteration(3*BITS,0,tmp,src,numTasks); - tbbRadixIteration(4*BITS,0,src,tmp,numTasks); - tbbRadixIteration(5*BITS,0,tmp,src,numTasks); - tbbRadixIteration(6*BITS,0,src,tmp,numTasks); - tbbRadixIteration(7*BITS,1,tmp,src,numTasks); - } - } - - private: - TyRadixCount* radixCount; - Ty* const src; - Ty* const tmp; - const size_t N; - }; - - template<typename Ty> - void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) - { - ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize); - } - - template<typename Ty, typename Key> - void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) - { - ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize); - } - - template<typename Ty> - void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { - radix_sort<Ty,uint32_t>(src,tmp,N,blockSize); - } - - template<typename Ty> - void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { - radix_sort<Ty,uint64_t>(src,tmp,N,blockSize); - } -} |