summaryrefslogtreecommitdiffstats
path: root/thirdparty/embree-aarch64/common/algorithms
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/embree-aarch64/common/algorithms')
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h55
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp56
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_filter.h93
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp48
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_for.h229
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp63
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h149
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp85
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h112
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp47
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_map.h85
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp53
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_partition.h283
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp48
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h85
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp49
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h150
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp43
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_set.h52
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp50
-rw-r--r--thirdparty/embree-aarch64/common/algorithms/parallel_sort.h457
21 files changed, 0 insertions, 2292 deletions
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h
deleted file mode 100644
index 01f1f80f6c..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <functional>
-#include "parallel_reduce.h"
-
-namespace embree
-{
-
- template<typename Index, class UnaryPredicate>
- __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
- {
- bool ret = false;
-
-#if defined(TASKING_TBB)
-#if TBB_INTERFACE_VERSION >= 12002
- tbb::task_group_context context;
- tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
- if (context.is_group_execution_cancelled()) return;
- for (size_t i = r.begin(); i != r.end(); ++i) {
- if (pred(i)) {
- ret = true;
- context.cancel_group_execution();
- }
- }
- });
-#else
- tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
- if (tbb::task::self().is_cancelled()) return;
- for (size_t i = r.begin(); i != r.end(); ++i) {
- if (pred(i)) {
- ret = true;
- tbb::task::self().cancel_group_execution();
- }
- }
- });
-#endif
-#else
- ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
- bool localret = false;
- for (auto i=r.begin(); i<r.end(); ++i) {
- localret |= pred(i);
- }
- return localret;
- },
- std::bit_or<bool>()
- );
-#endif
-
- return ret;
- }
-
-} // end namespace
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp
deleted file mode 100644
index acddc0ff81..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_filter.h"
-#include "../sys/regression.h"
-#include <map>
-
-namespace embree
-{
- struct parallel_filter_regression_test : public RegressionTest
- {
- parallel_filter_regression_test(const char* name) : RegressionTest(name) {
- registerRegressionTest(this);
- }
-
- bool run ()
- {
- bool passed = true;
- auto pred = [&]( uint32_t v ) { return (v & 0x3) == 0; };
-
- for (size_t N=10; N<1000000; N=size_t(2.1*N))
- {
- size_t N0 = rand() % N;
-
- /* initialize array with random numbers */
- std::vector<uint32_t> src(N);
- std::map<uint32_t,int> m;
- for (size_t i=0; i<N; i++) src[i] = rand();
-
- /* count elements up */
- for (size_t i=N0; i<N; i++)
- if (pred(src[i]))
- m[src[i]] = 0;
- for (size_t i=N0; i<N; i++)
- if (pred(src[i]))
- m[src[i]]++;
-
- /* filter array */
- //size_t M = sequential_filter(src.data(),N0,N,pred);
- size_t M = parallel_filter(src.data(),N0,N,size_t(1024),pred);
-
- /* check if filtered data is correct */
- for (size_t i=N0; i<M; i++) {
- passed &= pred(src[i]);
- m[src[i]]--;
- }
- for (size_t i=N0; i<M; i++)
- passed &= (m[src[i]] == 0);
- }
-
- return passed;
- }
- };
-
- parallel_filter_regression_test parallel_filter_regression("parallel_filter_regression");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h
deleted file mode 100644
index 5823fc631f..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for.h"
-
-namespace embree
-{
- template<typename Ty, typename Index, typename Predicate>
- inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
- {
- Index j = first;
- for (Index i=first; i<last; i++)
- if (predicate(data[i]))
- data[j++] = data[i];
-
- return j;
- }
-
- template<typename Ty, typename Index, typename Predicate>
- inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
- {
- /* sequential fallback */
- if (end-begin <= minStepSize)
- return sequential_filter(data,begin,end,predicate);
-
- /* calculate number of tasks to use */
- enum { MAX_TASKS = 64 };
- const Index numThreads = TaskScheduler::threadCount();
- const Index numBlocks = (end-begin+minStepSize-1)/minStepSize;
- const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS);
-
- /* filter blocks */
- Index nused[MAX_TASKS];
- Index nfree[MAX_TASKS];
- parallel_for(taskCount, [&](const Index taskIndex)
- {
- const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
- const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
- const Index i2 = sequential_filter(data,i0,i1,predicate);
- nused[taskIndex] = i2-i0;
- nfree[taskIndex] = i1-i2;
- });
-
- /* calculate offsets */
- Index sused=0;
- Index sfree=0;
- Index pfree[MAX_TASKS];
- for (Index i=0; i<taskCount; i++)
- {
- sused+=nused[i];
- Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
- }
-
- /* return if we did not filter out any element */
- assert(sfree <= end-begin);
- assert(sused <= end-begin);
- if (sused == end-begin)
- return end;
-
- /* otherwise we have to copy misplaced elements around */
- parallel_for(taskCount, [&](const Index taskIndex)
- {
- /* destination to write elements to */
- Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
- Index dst_end = min(dst+nfree[taskIndex],begin+sused);
- if (dst_end <= dst) return;
-
- /* range of misplaced elements to copy to destination */
- Index r0 = pfree[taskIndex];
- Index r1 = r0+dst_end-dst;
-
- /* find range in misplaced elements in back to front order */
- Index k0=0;
- for (Index i=taskCount-1; i>0; i--)
- {
- if (k0 > r1) break;
- Index k1 = k0+nused[i];
- Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
- for (Index i=max(r0,k0); i<min(r1,k1); i++) {
- Index isrc = src-i+k0-1;
- assert(dst >= begin && dst < end);
- assert(isrc >= begin && isrc < end);
- data[dst++] = data[isrc];
- }
- k0 = k1;
- }
- });
-
- return begin+sused;
- }
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp
deleted file mode 100644
index ef070ebc4d..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_for.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
- struct parallel_for_regression_test : public RegressionTest
- {
- parallel_for_regression_test(const char* name) : RegressionTest(name) {
- registerRegressionTest(this);
- }
-
- bool run ()
- {
- bool passed = true;
-
- const size_t M = 10;
- for (size_t N=10; N<10000000; N=size_t(2.1*N))
- {
- /* sequentially calculate sum of squares */
- size_t sum0 = 0;
- for (size_t i=0; i<N; i++) {
- sum0 += i*i;
- }
-
- /* parallel calculation of sum of squares */
- for (size_t m=0; m<M; m++)
- {
- std::atomic<size_t> sum1(0);
- parallel_for( size_t(0), size_t(N), size_t(1024), [&](const range<size_t>& r)
- {
- size_t s = 0;
- for (size_t i=r.begin(); i<r.end(); i++)
- s += i*i;
- sum1 += s;
- });
- passed = sum0 == sum1;
- }
- }
-
- return passed;
- }
- };
-
- parallel_for_regression_test parallel_for_regression("parallel_for_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
deleted file mode 100644
index 51d296fb16..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
+++ /dev/null
@@ -1,229 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../tasking/taskscheduler.h"
-#include "../sys/array.h"
-#include "../math/math.h"
-#include "../math/range.h"
-
-#if defined(TASKING_GCD) && defined(BUILD_IOS)
-#include <dispatch/dispatch.h>
-#include <algorithm>
-#include <type_traits>
-#endif
-
-namespace embree
-{
- /* parallel_for without range */
- template<typename Index, typename Func>
- __forceinline void parallel_for( const Index N, const Func& func)
- {
-#if defined(TASKING_INTERNAL)
- if (N) {
- TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
- assert(r.size() == 1);
- func(r.begin());
- });
- if (!TaskScheduler::wait())
- // -- GODOT start --
- // throw std::runtime_error("task cancelled");
- abort();
- // -- GODOT end --
- }
-#elif defined(TASKING_GCD) && defined(BUILD_IOS)
-
- const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1;
- const size_t length = N;
- const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks;
- const size_t numBlocks = (length + blockSize-1) / blockSize;
-
- dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
-
- const size_t start = (currentBlock * blockSize);
- const size_t blockLength = std::min(length - start, blockSize);
- const size_t end = start + blockLength;
-
- for(size_t i=start; i < end; i++)
- {
- func(i);
- }
- });
-
-#elif defined(TASKING_TBB)
- #if TBB_INTERFACE_VERSION >= 12002
- tbb::task_group_context context;
- tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
- func(i);
- },context);
- if (context.is_group_execution_cancelled())
- // -- GODOT start --
- // throw std::runtime_error("task cancelled");
- abort();
- // -- GODOT end --
- #else
- tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
- func(i);
- });
- if (tbb::task::self().is_cancelled())
- // -- GODOT start --
- // throw std::runtime_error("task cancelled");
- abort();
- // -- GODOT end --
- #endif
-
-#elif defined(TASKING_PPL)
- concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) {
- func(i);
- });
-#else
-# error "no tasking system enabled"
-#endif
- }
-
- /* parallel for with range and granulatity */
- template<typename Index, typename Func>
- __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
- {
- assert(first <= last);
-#if defined(TASKING_INTERNAL)
- TaskScheduler::spawn(first,last,minStepSize,func);
- if (!TaskScheduler::wait())
- // -- GODOT start --
- // throw std::runtime_error("task cancelled");
- abort();
- // -- GODOT end --
-
-#elif defined(TASKING_GCD) && defined(BUILD_IOS)
-
- const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1;
- const size_t length = last - first;
- const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks;
- size_t blockSize = std::max<size_t>(minStepSize,blockSizeByThreads);
- blockSize += blockSize % 4;
-
- const size_t numBlocks = (length + blockSize-1) / blockSize;
-
- dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
-
- const size_t start = first + (currentBlock * blockSize);
- const size_t end = std::min<size_t>(last, start + blockSize);
-
- func( embree::range<Index>(start,end) );
- });
-
-
-#elif defined(TASKING_TBB)
- #if TBB_INTERFACE_VERSION >= 12002
- tbb::task_group_context context;
- tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
- func(range<Index>(r.begin(),r.end()));
- },context);
- if (context.is_group_execution_cancelled())
- // -- GODOT start --
- // throw std::runtime_error("task cancelled");
- abort();
- // -- GODOT end --
- #else
- tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
- func(range<Index>(r.begin(),r.end()));
- });
- if (tbb::task::self().is_cancelled())
- // -- GODOT start --
- // throw std::runtime_error("task cancelled");
- abort();
- // -- GODOT end --
- #endif
-
-#elif defined(TASKING_PPL)
- concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) {
- func(range<Index>(i,i+1));
- });
-
-#else
-# error "no tasking system enabled"
-#endif
- }
-
- /* parallel for with range */
- template<typename Index, typename Func>
- __forceinline void parallel_for( const Index first, const Index last, const Func& func)
- {
- assert(first <= last);
- parallel_for(first,last,(Index)1,func);
- }
-
-#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
-
- template<typename Index, typename Func>
- __forceinline void parallel_for_static( const Index N, const Func& func)
- {
- #if TBB_INTERFACE_VERSION >= 12002
- tbb::task_group_context context;
- tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
- func(i);
- },tbb::simple_partitioner(),context);
- if (context.is_group_execution_cancelled())
- // -- GODOT start --
- // throw std::runtime_error("task cancelled");
- abort();
- // -- GODOT end --
- #else
- tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
- func(i);
- },tbb::simple_partitioner());
- if (tbb::task::self().is_cancelled())
- // -- GODOT start --
- // throw std::runtime_error("task cancelled");
- abort();
- // -- GODOT end --
- #endif
- }
-
- typedef tbb::affinity_partitioner affinity_partitioner;
-
- template<typename Index, typename Func>
- __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
- {
- #if TBB_INTERFACE_VERSION >= 12002
- tbb::task_group_context context;
- tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
- func(i);
- },ap,context);
- if (context.is_group_execution_cancelled())
- // -- GODOT start --
- // throw std::runtime_error("task cancelled");
- abort();
- // -- GODOT end --
- #else
- tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
- func(i);
- },ap);
- if (tbb::task::self().is_cancelled())
- // -- GODOT start --
- // throw std::runtime_error("task cancelled");
- abort();
- // -- GODOT end --
- #endif
- }
-
-#else
-
- template<typename Index, typename Func>
- __forceinline void parallel_for_static( const Index N, const Func& func)
- {
- parallel_for(N,func);
- }
-
- struct affinity_partitioner {
- };
-
- template<typename Index, typename Func>
- __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap)
- {
- parallel_for(N,func);
- }
-
-#endif
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp
deleted file mode 100644
index 0337611b35..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_for_for.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
- struct parallel_for_for_regression_test : public RegressionTest
- {
- parallel_for_for_regression_test(const char* name) : RegressionTest(name) {
- registerRegressionTest(this);
- }
-
- bool run ()
- {
- bool passed = true;
-
- /* create vector with random numbers */
- size_t sum0 = 0;
- size_t K = 0;
- const size_t M = 1000;
- std::vector<std::vector<size_t>* > array2(M);
- for (size_t i=0; i<M; i++) {
- const size_t N = rand() % 1024;
- K+=N;
- array2[i] = new std::vector<size_t>(N);
- for (size_t j=0; j<N; j++)
- sum0 += (*array2[i])[j] = rand();
- }
-
- /* array to test global index */
- std::vector<atomic<size_t>> verify_k(K);
- for (size_t i=0; i<K; i++) verify_k[i].store(0);
-
- /* add all numbers using parallel_for_for */
- std::atomic<size_t> sum1(0);
- parallel_for_for( array2, size_t(1), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k) -> size_t
- {
- size_t s = 0;
- for (size_t i=r.begin(); i<r.end(); i++) {
- s += (*v)[i];
- verify_k[k++]++;
- }
- sum1 += s;
- return sum1;
- });
- passed &= (sum0 == sum1);
-
- /* check global index */
- for (size_t i=0; i<K; i++)
- passed &= (verify_k[i] == 1);
-
- /* delete vectors again */
- for (size_t i=0; i<array2.size(); i++)
- delete array2[i];
-
- return passed;
- }
- };
-
- parallel_for_for_regression_test parallel_for_for_regression("parallel_for_for_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h
deleted file mode 100644
index 852b8a0900..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for.h"
-
-namespace embree
-{
- template<typename ArrayArray, typename Func>
- __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
- {
- size_t k=0;
- for (size_t i=0; i!=array2.size(); ++i) {
- const size_t N = array2[i]->size();
- if (N) func(array2[i],range<size_t>(0,N),k);
- k+=N;
- }
- }
-
- class ParallelForForState
- {
- public:
-
- enum { MAX_TASKS = 64 };
-
- __forceinline ParallelForForState ()
- : taskCount(0) {}
-
- template<typename ArrayArray>
- __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
- init(array2,minStepSize);
- }
-
- template<typename ArrayArray>
- __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
- {
- /* first calculate total number of elements */
- size_t N = 0;
- for (size_t i=0; i<array2.size(); i++) {
- N += array2[i] ? array2[i]->size() : 0;
- }
- this->N = N;
-
- /* calculate number of tasks to use */
- const size_t numThreads = TaskScheduler::threadCount();
- const size_t numBlocks = (N+minStepSize-1)/minStepSize;
- taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
-
- /* calculate start (i,j) for each task */
- size_t taskIndex = 0;
- i0[taskIndex] = 0;
- j0[taskIndex] = 0;
- size_t k0 = (++taskIndex)*N/taskCount;
- for (size_t i=0, k=0; taskIndex < taskCount; i++)
- {
- assert(i<array2.size());
- size_t j=0, M = array2[i] ? array2[i]->size() : 0;
- while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
- assert(taskIndex<taskCount);
- i0[taskIndex] = i;
- j0[taskIndex] = j += k0-k;
- k=k0;
- k0 = (++taskIndex)*N/taskCount;
- }
- k+=M-j;
- }
- }
-
- __forceinline size_t size() const {
- return N;
- }
-
- public:
- size_t i0[MAX_TASKS];
- size_t j0[MAX_TASKS];
- size_t taskCount;
- size_t N;
- };
-
- template<typename ArrayArray, typename Func>
- __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
- {
- ParallelForForState state(array2,minStepSize);
-
- parallel_for(state.taskCount, [&](const size_t taskIndex)
- {
- /* calculate range */
- const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
- const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
- size_t i0 = state.i0[taskIndex];
- size_t j0 = state.j0[taskIndex];
-
- /* iterate over arrays */
- size_t k=k0;
- for (size_t i=i0; k<k1; i++) {
- const size_t N = array2[i] ? array2[i]->size() : 0;
- const size_t r0 = j0, r1 = min(N,r0+k1-k);
- if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k);
- k+=r1-r0; j0 = 0;
- }
- });
- }
-
- template<typename ArrayArray, typename Func>
- __forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
- {
- parallel_for_for(array2,1,func);
- }
-
- template<typename ArrayArray, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
- {
- ParallelForForState state(array2,minStepSize);
- Value temp[ParallelForForState::MAX_TASKS];
-
- for (size_t i=0; i<state.taskCount; i++)
- temp[i] = identity;
-
- parallel_for(state.taskCount, [&](const size_t taskIndex)
- {
- /* calculate range */
- const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
- const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
- size_t i0 = state.i0[taskIndex];
- size_t j0 = state.j0[taskIndex];
-
- /* iterate over arrays */
- size_t k=k0;
- for (size_t i=i0; k<k1; i++) {
- const size_t N = array2[i] ? array2[i]->size() : 0;
- const size_t r0 = j0, r1 = min(N,r0+k1-k);
- if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k));
- k+=r1-r0; j0 = 0;
- }
- });
-
- Value ret = identity;
- for (size_t i=0; i<state.taskCount; i++)
- ret = reduction(ret,temp[i]);
- return ret;
- }
-
- template<typename ArrayArray, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
- {
- return parallel_for_for_reduce(array2,1,identity,func,reduction);
- }
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp
deleted file mode 100644
index 0169d8e481..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_for_for_prefix_sum.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
- struct parallel_for_for_prefix_sum_regression_test : public RegressionTest
- {
- parallel_for_for_prefix_sum_regression_test(const char* name) : RegressionTest(name) {
- registerRegressionTest(this);
- }
-
- bool run ()
- {
- bool passed = true;
-
- /* create vector with random numbers */
- const size_t M = 10;
- std::vector<atomic<size_t>> flattened;
- typedef std::vector<std::vector<size_t>* > ArrayArray;
- ArrayArray array2(M);
- size_t K = 0;
- for (size_t i=0; i<M; i++) {
- const size_t N = rand() % 10;
- K += N;
- array2[i] = new std::vector<size_t>(N);
- for (size_t j=0; j<N; j++)
- (*array2[i])[j] = rand() % 10;
- }
-
- /* array to test global index */
- std::vector<atomic<size_t>> verify_k(K);
- for (size_t i=0; i<K; i++) verify_k[i].store(0);
-
- ParallelForForPrefixSumState<size_t> state(array2,size_t(1));
-
- /* dry run only counts */
- size_t S = parallel_for_for_prefix_sum0( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i) -> size_t
- {
- size_t s = 0;
- for (size_t i=r.begin(); i<r.end(); i++) {
- s += (*v)[i];
- verify_k[k++]++;
- }
- return s;
- }, [](size_t v0, size_t v1) { return v0+v1; });
-
- /* create properly sized output array */
- flattened.resize(S);
- for (auto& a : flattened) a.store(0);
-
- /* now we actually fill the flattened array */
- parallel_for_for_prefix_sum1( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i, const size_t base) -> size_t
- {
- size_t s = 0;
- for (size_t i=r.begin(); i<r.end(); i++) {
- for (size_t j=0; j<(*v)[i]; j++) {
- flattened[base+s+j]++;
- }
- s += (*v)[i];
- verify_k[k++]++;
- }
- return s;
- }, [](size_t v0, size_t v1) { return v0+v1; });
-
- /* check global index */
- for (size_t i=0; i<K; i++)
- passed &= (verify_k[i] == 2);
-
- /* check if each element was assigned exactly once */
- for (size_t i=0; i<flattened.size(); i++)
- passed &= (flattened[i] == 1);
-
- /* delete arrays again */
- for (size_t i=0; i<array2.size(); i++)
- delete array2[i];
-
- return passed;
- }
- };
-
- parallel_for_for_prefix_sum_regression_test parallel_for_for_prefix_sum_regression("parallel_for_for_prefix_sum_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h
deleted file mode 100644
index d2671d8a6a..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for_for.h"
-#include "parallel_prefix_sum.h"
-
-namespace embree
-{
- template<typename Value>
- struct ParallelForForPrefixSumState : public ParallelForForState
- {
- __forceinline ParallelForForPrefixSumState () {}
-
- template<typename ArrayArray>
- __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
- : ParallelForForState(array2,minStepSize) {}
-
- ParallelPrefixSumState<Value> prefix_state;
- };
-
- template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
- const Value& identity, const Func& func, const Reduction& reduction)
- {
- /* calculate number of tasks to use */
- const size_t taskCount = state.taskCount;
- /* perform parallel prefix sum */
- parallel_for(taskCount, [&](const size_t taskIndex)
- {
- const size_t k0 = (taskIndex+0)*state.size()/taskCount;
- const size_t k1 = (taskIndex+1)*state.size()/taskCount;
- size_t i0 = state.i0[taskIndex];
- size_t j0 = state.j0[taskIndex];
-
- /* iterate over arrays */
- size_t k=k0;
- Value N=identity;
- for (size_t i=i0; k<k1; i++) {
- const size_t size = array2[i] ? array2[i]->size() : 0;
- const size_t r0 = j0, r1 = min(size,r0+k1-k);
- if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i));
- k+=r1-r0; j0 = 0;
- }
- state.prefix_state.counts[taskIndex] = N;
- });
-
- /* calculate prefix sum */
- Value sum=identity;
- for (size_t i=0; i<taskCount; i++)
- {
- const Value c = state.prefix_state.counts[i];
- state.prefix_state.sums[i] = sum;
- sum=reduction(sum,c);
- }
-
- return sum;
- }
-
- template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
- const Value& identity, const Func& func, const Reduction& reduction)
- {
- /* calculate number of tasks to use */
- const size_t taskCount = state.taskCount;
- /* perform parallel prefix sum */
- parallel_for(taskCount, [&](const size_t taskIndex)
- {
- const size_t k0 = (taskIndex+0)*state.size()/taskCount;
- const size_t k1 = (taskIndex+1)*state.size()/taskCount;
- size_t i0 = state.i0[taskIndex];
- size_t j0 = state.j0[taskIndex];
-
- /* iterate over arrays */
- size_t k=k0;
- Value N=identity;
- for (size_t i=i0; k<k1; i++) {
- const size_t size = array2[i] ? array2[i]->size() : 0;
- const size_t r0 = j0, r1 = min(size,r0+k1-k);
- if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N)));
- k+=r1-r0; j0 = 0;
- }
- state.prefix_state.counts[taskIndex] = N;
- });
-
- /* calculate prefix sum */
- Value sum=identity;
- for (size_t i=0; i<taskCount; i++)
- {
- const Value c = state.prefix_state.counts[i];
- state.prefix_state.sums[i] = sum;
- sum=reduction(sum,c);
- }
-
- return sum;
- }
-
- template<typename ArrayArray, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
- const Value& identity, const Func& func, const Reduction& reduction)
- {
- return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction);
- }
-
- template<typename ArrayArray, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
- const Value& identity, const Func& func, const Reduction& reduction)
- {
- return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction);
- }
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp
deleted file mode 100644
index 09dc303f81..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_map.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
- struct parallel_map_regression_test : public RegressionTest
- {
- parallel_map_regression_test(const char* name) : RegressionTest(name) {
- registerRegressionTest(this);
- }
-
- bool run ()
- {
- bool passed = true;
-
- /* create key/value vectors with random numbers */
- const size_t N = 10000;
- std::vector<uint32_t> keys(N);
- std::vector<uint32_t> vals(N);
- for (size_t i=0; i<N; i++) keys[i] = 2*unsigned(i)*647382649;
- for (size_t i=0; i<N; i++) std::swap(keys[i],keys[rand()%N]);
- for (size_t i=0; i<N; i++) vals[i] = 2*rand();
-
- /* create map */
- parallel_map<uint32_t,uint32_t> map;
- map.init(keys,vals);
-
- /* check that all keys are properly mapped */
- for (size_t i=0; i<N; i++) {
- const uint32_t* val = map.lookup(keys[i]);
- passed &= val && (*val == vals[i]);
- }
-
- /* check that these keys are not in the map */
- for (size_t i=0; i<N; i++) {
- passed &= !map.lookup(keys[i]+1);
- }
-
- return passed;
- }
- };
-
- parallel_map_regression_test parallel_map_regression("parallel_map_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.h b/thirdparty/embree-aarch64/common/algorithms/parallel_map.h
deleted file mode 100644
index 02e1a8f8d0..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_map.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_sort.h"
-
-namespace embree
-{
- /*! implementation of a key/value map with parallel construction */
- template<typename Key, typename Val>
- class parallel_map
- {
- /* key/value pair to build the map */
- struct KeyValue
- {
- __forceinline KeyValue () {}
-
- __forceinline KeyValue (const Key key, const Val val)
- : key(key), val(val) {}
-
- __forceinline operator Key() const {
- return key;
- }
-
- public:
- Key key;
- Val val;
- };
-
- public:
-
- /*! parallel map constructors */
- parallel_map () {}
-
- /*! construction from pair of vectors */
- template<typename KeyVector, typename ValVector>
- parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }
-
- /*! initialized the parallel map from a vector with keys and values */
- template<typename KeyVector, typename ValVector>
- void init(const KeyVector& keys, const ValVector& values)
- {
- /* reserve sufficient space for all data */
- assert(keys.size() == values.size());
- vec.resize(keys.size());
-
- /* generate key/value pairs */
- parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) {
- for (size_t i=r.begin(); i<r.end(); i++)
- vec[i] = KeyValue((Key)keys[i],values[i]);
- });
-
- /* perform parallel radix sort of the key/value pairs */
- std::vector<KeyValue> temp(keys.size());
- radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size());
- }
-
- /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */
- __forceinline const Val* lookup(const Key& key) const
- {
- typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
- if (i == vec.end()) return nullptr;
- if (i->key != key) return nullptr;
- return &i->val;
- }
-
- /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
- __forceinline Val lookup(const Key& key, const Val& def) const
- {
- typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
- if (i == vec.end()) return def;
- if (i->key != key) return def;
- return i->val;
- }
-
- /*! clears all state */
- void clear() {
- vec.clear();
- }
-
- private:
- std::vector<KeyValue> vec; //!< vector containing sorted elements
- };
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp
deleted file mode 100644
index eb20c4465d..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_partition.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
- struct parallel_partition_regression_test : public RegressionTest
- {
- parallel_partition_regression_test(const char* name) : RegressionTest(name) {
- registerRegressionTest(this);
- }
-
- bool run ()
- {
- bool passed = true;
-
- for (size_t i=0; i<100; i++)
- {
- /* create random permutation */
- size_t N = std::rand() % 1000000;
- std::vector<unsigned> array(N);
- for (unsigned i=0; i<N; i++) array[i] = i;
- for (auto& v : array) std::swap(v,array[std::rand()%array.size()]);
- size_t split = std::rand() % (N+1);
-
- /* perform parallel partitioning */
- size_t left_sum = 0, right_sum = 0;
- size_t mid = parallel_partitioning(array.data(),0,array.size(),0,left_sum,right_sum,
- [&] ( size_t i ) { return i < split; },
- [] ( size_t& sum, unsigned v) { sum += v; },
- [] ( size_t& sum, size_t v) { sum += v; },
- 128);
-
- /*serial_partitioning(array.data(),0,array.size(),left_sum,right_sum,
- [&] ( size_t i ) { return i < split; },
- [] ( size_t& left_sum, int v) { left_sum += v; });*/
-
- /* verify result */
- passed &= mid == split;
- passed &= left_sum == split*(split-1)/2;
- passed &= right_sum == N*(N-1)/2-left_sum;
- for (size_t i=0; i<split; i++) passed &= array[i] < split;
- for (size_t i=split; i<N; i++) passed &= array[i] >= split;
- }
-
- return passed;
- }
- };
-
- parallel_partition_regression_test parallel_partition_regression("parallel_partition_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h
deleted file mode 100644
index 3b3ad7c854..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h
+++ /dev/null
@@ -1,283 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for.h"
-#include "../math/range.h"
-
-namespace embree
-{
- /* serial partitioning */
- template<typename T, typename V, typename IsLeft, typename Reduction_T>
- __forceinline size_t serial_partitioning(T* array,
- const size_t begin,
- const size_t end,
- V& leftReduction,
- V& rightReduction,
- const IsLeft& is_left,
- const Reduction_T& reduction_t)
- {
- T* l = array + begin;
- T* r = array + end - 1;
-
- while(1)
- {
- /* *l < pivot */
- while (likely(l <= r && is_left(*l) ))
- {
- //prefetchw(l+4); // FIXME: enable?
- reduction_t(leftReduction,*l);
- ++l;
- }
- /* *r >= pivot) */
- while (likely(l <= r && !is_left(*r)))
- {
- //prefetchw(r-4); FIXME: enable?
- reduction_t(rightReduction,*r);
- --r;
- }
- if (r<l) break;
-
- reduction_t(leftReduction ,*r);
- reduction_t(rightReduction,*l);
- xchg(*l,*r);
- l++; r--;
- }
-
- return l - array;
- }
-
- template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
- class __aligned(64) parallel_partition_task
- {
- ALIGNED_CLASS_(64);
- private:
-
- static const size_t MAX_TASKS = 64;
-
- T* array;
- size_t N;
- const IsLeft& is_left;
- const Reduction_T& reduction_t;
- const Reduction_V& reduction_v;
- const Vi& identity;
-
- size_t numTasks;
- __aligned(64) size_t counter_start[MAX_TASKS+1];
- __aligned(64) size_t counter_left[MAX_TASKS+1];
- __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];
- __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];
- __aligned(64) V leftReductions[MAX_TASKS];
- __aligned(64) V rightReductions[MAX_TASKS];
-
- public:
-
- __forceinline parallel_partition_task(T* array,
- const size_t N,
- const Vi& identity,
- const IsLeft& is_left,
- const Reduction_T& reduction_t,
- const Reduction_V& reduction_v,
- const size_t BLOCK_SIZE)
-
- : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
- numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
-
- __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
- {
- size_t i = 0;
- while(index >= (size_t)r[i].size())
- {
- assert(i < numRanges);
- index -= (size_t)r[i].size();
- i++;
- }
- return &r[i];
- }
-
- __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
- const size_t numRightMisplacedRanges,
- const size_t startID,
- const size_t endID)
- {
- size_t leftLocalIndex = startID;
- size_t rightLocalIndex = startID;
- const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
- const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
-
- size_t l_left = l_range->size() - leftLocalIndex;
- size_t r_left = r_range->size() - rightLocalIndex;
- T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
- T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
- size_t size = endID - startID;
- size_t items = min(size,min(l_left,r_left));
-
- while (size)
- {
- if (unlikely(l_left == 0))
- {
- l_range++;
- l_left = l_range->size();
- l = &array[l_range->begin()];
- items = min(size,min(l_left,r_left));
- }
-
- if (unlikely(r_left == 0))
- {
- r_range++;
- r_left = r_range->size();
- r = &array[r_range->begin()];
- items = min(size,min(l_left,r_left));
- }
-
- size -= items;
- l_left -= items;
- r_left -= items;
-
- while(items) {
- items--;
- xchg(*l++,*r++);
- }
- }
- }
-
- __forceinline size_t partition(V& leftReduction, V& rightReduction)
- {
- /* partition the individual ranges for each task */
- parallel_for(numTasks,[&] (const size_t taskID) {
- const size_t startID = (taskID+0)*N/numTasks;
- const size_t endID = (taskID+1)*N/numTasks;
- V local_left(identity);
- V local_right(identity);
- const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
- counter_start[taskID] = startID;
- counter_left [taskID] = mid-startID;
- leftReductions[taskID] = local_left;
- rightReductions[taskID] = local_right;
- });
- counter_start[numTasks] = N;
- counter_left[numTasks] = 0;
-
- /* finalize the reductions */
- for (size_t i=0; i<numTasks; i++) {
- reduction_v(leftReduction,leftReductions[i]);
- reduction_v(rightReduction,rightReductions[i]);
- }
-
- /* calculate mid point for partitioning */
- size_t mid = counter_left[0];
- for (size_t i=1; i<numTasks; i++)
- mid += counter_left[i];
- const range<ssize_t> globalLeft (0,mid);
- const range<ssize_t> globalRight(mid,N);
-
- /* calculate all left and right ranges that are on the wrong global side */
- size_t numMisplacedRangesLeft = 0;
- size_t numMisplacedRangesRight = 0;
- size_t numMisplacedItemsLeft = 0;
- size_t numMisplacedItemsRight = 0;
-
- for (size_t i=0; i<numTasks; i++)
- {
- const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
- const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
- const range<ssize_t> left_misplaced = globalLeft. intersect(right_range);
- const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
-
- if (!left_misplaced.empty())
- {
- numMisplacedItemsLeft += left_misplaced.size();
- leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
- }
-
- if (!right_misplaced.empty())
- {
- numMisplacedItemsRight += right_misplaced.size();
- rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
- }
- }
- assert( numMisplacedItemsLeft == numMisplacedItemsRight );
-
- /* if no items are misplaced we are done */
- if (numMisplacedItemsLeft == 0)
- return mid;
-
- /* otherwise we copy the items to the right place in parallel */
- parallel_for(numTasks,[&] (const size_t taskID) {
- const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
- const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks;
- swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
- });
-
- return mid;
- }
- };
-
- template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
- __noinline size_t parallel_partitioning(T* array,
- const size_t begin,
- const size_t end,
- const Vi &identity,
- V &leftReduction,
- V &rightReduction,
- const IsLeft& is_left,
- const Reduction_T& reduction_t,
- const Reduction_V& reduction_v,
- size_t BLOCK_SIZE = 128)
- {
- /* fall back to single threaded partitioning for small N */
- if (unlikely(end-begin < BLOCK_SIZE))
- return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
-
- /* otherwise use parallel code */
- else {
- typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
- std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
- return begin+p->partition(leftReduction,rightReduction);
- }
- }
-
- template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
- __noinline size_t parallel_partitioning(T* array,
- const size_t begin,
- const size_t end,
- const Vi &identity,
- V &leftReduction,
- V &rightReduction,
- const IsLeft& is_left,
- const Reduction_T& reduction_t,
- const Reduction_V& reduction_v,
- size_t BLOCK_SIZE,
- size_t PARALLEL_THRESHOLD)
- {
- /* fall back to single threaded partitioning for small N */
- if (unlikely(end-begin < PARALLEL_THRESHOLD))
- return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
-
- /* otherwise use parallel code */
- else {
- typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
- std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
- return begin+p->partition(leftReduction,rightReduction);
- }
- }
-
-
- template<typename T, typename IsLeft>
- inline size_t parallel_partitioning(T* array,
- const size_t begin,
- const size_t end,
- const IsLeft& is_left,
- size_t BLOCK_SIZE = 128)
- {
- size_t leftReduction = 0;
- size_t rightReduction = 0;
- return parallel_partitioning(
- array,begin,end,0,leftReduction,rightReduction,is_left,
- [] (size_t& t,const T& ref) { },
- [] (size_t& t0,size_t& t1) { },
- BLOCK_SIZE);
- }
-
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp
deleted file mode 100644
index 685952c3dc..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_prefix_sum.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
- struct parallel_prefix_sum_regression_test : public RegressionTest
- {
- parallel_prefix_sum_regression_test(const char* name) : RegressionTest(name) {
- registerRegressionTest(this);
- }
-
- bool run ()
- {
- bool passed = true;
- const size_t M = 10;
-
- for (size_t N=10; N<10000000; N=size_t(2.1*N))
- {
- /* initialize array with random numbers */
- uint32_t sum0 = 0;
- std::vector<uint32_t> src(N);
- for (size_t i=0; i<N; i++) {
- sum0 += src[i] = rand();
- }
-
- /* calculate parallel prefix sum */
- std::vector<uint32_t> dst(N);
- for (auto& v : dst) v = 0;
-
- for (size_t i=0; i<M; i++) {
- uint32_t sum1 = parallel_prefix_sum(src,dst,N,0,std::plus<uint32_t>());
- passed &= (sum0 == sum1);
- }
-
- /* check if prefix sum is correct */
- for (size_t i=0, sum=0; i<N; sum+=src[i++])
- passed &= ((uint32_t)sum == dst[i]);
- }
-
- return passed;
- }
- };
-
- parallel_prefix_sum_regression_test parallel_prefix_sum_regression("parallel_prefix_sum_regression");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h
deleted file mode 100644
index 117c7a79b0..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for.h"
-
-namespace embree
-{
- template<typename Value>
- struct ParallelPrefixSumState
- {
- enum { MAX_TASKS = 64 };
- Value counts[MAX_TASKS];
- Value sums [MAX_TASKS];
- };
-
- template<typename Index, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
- {
- /* calculate number of tasks to use */
- const size_t numThreads = TaskScheduler::threadCount();
- const size_t numBlocks = (last-first+minStepSize-1)/minStepSize;
- const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
-
- /* perform parallel prefix sum */
- parallel_for(taskCount, [&](const size_t taskIndex)
- {
- const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
- const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
- state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
- });
-
- /* calculate prefix sum */
- Value sum=identity;
- for (size_t i=0; i<taskCount; i++)
- {
- const Value c = state.counts[i];
- state.sums[i] = sum;
- sum=reduction(sum,c);
- }
-
- return sum;
- }
-
- /*! parallel calculation of prefix sums */
- template<typename SrcArray, typename DstArray, typename Value, typename Add>
- __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
- {
- /* perform single threaded prefix operation for small N */
- if (N < SINGLE_THREAD_THRESHOLD)
- {
- Value sum=identity;
- for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
- return sum;
- }
-
- /* perform parallel prefix operation for large N */
- else
- {
- ParallelPrefixSumState<Value> state;
-
- /* initial run just sets up start values for subtasks */
- parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
-
- Value s = identity;
- for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
- return s;
-
- }, add);
-
- /* final run calculates prefix sum */
- return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
-
- Value s = identity;
- for (size_t i=r.begin(); i<r.end(); i++) {
- dst[i] = add(sum,s);
- s = add(s,src[i]);
- }
- return s;
-
- }, add);
- }
- }
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp
deleted file mode 100644
index 331fe4288e..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_reduce.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
- struct parallel_reduce_regression_test : public RegressionTest
- {
- parallel_reduce_regression_test(const char* name) : RegressionTest(name) {
- registerRegressionTest(this);
- }
-
- bool run ()
- {
- bool passed = true;
-
- const size_t M = 10;
- for (size_t N=10; N<10000000; N=size_t(2.1*N))
- {
- /* sequentially calculate sum of squares */
- size_t sum0 = 0;
- for (size_t i=0; i<N; i++) {
- sum0 += i*i;
- }
-
- /* parallel calculation of sum of squares */
- for (size_t m=0; m<M; m++)
- {
- size_t sum1 = parallel_reduce( size_t(0), size_t(N), size_t(1024), size_t(0), [&](const range<size_t>& r) -> size_t
- {
- size_t s = 0;
- for (size_t i=r.begin(); i<r.end(); i++)
- s += i*i;
- return s;
- },
- [](const size_t v0, const size_t v1) {
- return v0+v1;
- });
- passed = sum0 == sum1;
- }
- }
- return passed;
- }
- };
-
- parallel_reduce_regression_test parallel_reduce_regression("parallel_reduce_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
deleted file mode 100644
index 0daf94e50e..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for.h"
-
-namespace embree
-{
- template<typename Index, typename Value, typename Func, typename Reduction>
- __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
- {
- return func(range<Index>(first,last));
- }
-
- template<typename Index, typename Value, typename Func, typename Reduction>
- __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
- {
- return func(range<Index>(first,last));
- }
-
- template<typename Index, typename Value, typename Func, typename Reduction>
- __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
- {
- const Index maxTasks = 512;
- const Index threadCount = (Index) TaskScheduler::threadCount();
- taskCount = min(taskCount,threadCount,maxTasks);
-
- /* parallel invokation of all tasks */
- dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
- parallel_for(taskCount, [&](const Index taskIndex) {
- const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
- const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
- values[taskIndex] = func(range<Index>(k0,k1));
- });
-
- /* perform reduction over all tasks */
- Value v = identity;
- for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
- return v;
- }
-
- template<typename Index, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
- {
-#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS))
-
- /* fast path for small number of iterations */
- Index taskCount = (last-first+minStepSize-1)/minStepSize;
- if (likely(taskCount == 1)) {
- return func(range<Index>(first,last));
- }
- return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);
-
-#elif defined(TASKING_TBB)
- #if TBB_INTERFACE_VERSION >= 12002
- tbb::task_group_context context;
- const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
- [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
- reduction,context);
- // -- GODOT start --
- // if (context.is_group_execution_cancelled())
- // throw std::runtime_error("task cancelled");
- // -- GODOT end --
- return v;
- #else
- const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
- [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
- reduction);
- // -- GODOT start --
- // if (tbb::task::self().is_cancelled())
- // throw std::runtime_error("task cancelled");
- // -- GODOT end --
- return v;
- #endif
-#else // TASKING_PPL
- struct AlignedValue
- {
- char storage[__alignof(Value)+sizeof(Value)];
- static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); };
- Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
- const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
- AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
- AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
- AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
- AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
- AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
- operator Value() const { return *getValuePtr(); }
- };
-
- struct Iterator_Index
- {
- Index v;
- typedef std::forward_iterator_tag iterator_category;
- typedef AlignedValue value_type;
- typedef Index difference_type;
- typedef Index distance_type;
- typedef AlignedValue* pointer;
- typedef AlignedValue& reference;
- __forceinline Iterator_Index() {}
- __forceinline Iterator_Index(Index v) : v(v) {}
- __forceinline bool operator== (Iterator_Index other) { return v == other.v; }
- __forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
- __forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
- __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
- };
-
- auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
- assert(begin.v < end.v);
- return reduction(start, func(range<Index>(begin.v, end.v)));
- };
- const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
- return v;
-#endif
- }
-
- template<typename Index, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
- {
- if (likely(last-first < parallel_threshold)) {
- return func(range<Index>(first,last));
- } else {
- return parallel_reduce(first,last,minStepSize,identity,func,reduction);
- }
- }
-
- template<typename Index, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
- {
- return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
- }
-
- template<typename Index, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
- {
- auto funcr = [&] ( const range<Index> r ) {
- Value v = identity;
- for (Index i=r.begin(); i<r.end(); i++)
- v = reduction(v,func(i));
- return v;
- };
- return parallel_reduce(first,last,Index(1),identity,funcr,reduction);
- }
-
- template<typename Index, typename Value, typename Func, typename Reduction>
- __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
- {
- return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
- }
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp
deleted file mode 100644
index 20b639c1c9..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_set.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
- struct parallel_set_regression_test : public RegressionTest
- {
- parallel_set_regression_test(const char* name) : RegressionTest(name) {
- registerRegressionTest(this);
- }
-
- bool run ()
- {
- bool passed = true;
-
- /* create vector with random numbers */
- const size_t N = 10000;
- std::vector<uint32_t> unsorted(N);
- for (size_t i=0; i<N; i++) unsorted[i] = 2*rand();
-
- /* created set from numbers */
- parallel_set<uint32_t> sorted;
- sorted.init(unsorted);
-
- /* check that all elements are in the set */
- for (size_t i=0; i<N; i++) {
- passed &= sorted.lookup(unsorted[i]);
- }
-
- /* check that these elements are not in the set */
- for (size_t i=0; i<N; i++) {
- passed &= !sorted.lookup(unsorted[i]+1);
- }
-
- return passed;
- }
- };
-
- parallel_set_regression_test parallel_set_regression("parallel_set_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.h b/thirdparty/embree-aarch64/common/algorithms/parallel_set.h
deleted file mode 100644
index 640beba7ec..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_set.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_sort.h"
-
-namespace embree
-{
- /* implementation of a set of values with parallel construction */
- template<typename T>
- class parallel_set
- {
- public:
-
- /*! default constructor for the parallel set */
- parallel_set () {}
-
- /*! construction from vector */
- template<typename Vector>
- parallel_set (const Vector& in) { init(in); }
-
- /*! initialized the parallel set from a vector */
- template<typename Vector>
- void init(const Vector& in)
- {
- /* copy data to internal vector */
- vec.resize(in.size());
- parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) {
- for (size_t i=r.begin(); i<r.end(); i++)
- vec[i] = in[i];
- });
-
- /* sort the data */
- std::vector<T> temp(in.size());
- radix_sort<T>(vec.data(),temp.data(),vec.size());
- }
-
- /*! tests if some element is in the set */
- __forceinline bool lookup(const T& elt) const {
- return std::binary_search(vec.begin(), vec.end(), elt);
- }
-
- /*! clears all state */
- void clear() {
- vec.clear();
- }
-
- private:
- std::vector<T> vec; //!< vector containing sorted elements
- };
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp
deleted file mode 100644
index 5e7ec79ac1..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_sort.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
- template<typename Key>
- struct RadixSortRegressionTest : public RegressionTest
- {
- RadixSortRegressionTest(const char* name) : RegressionTest(name) {
- registerRegressionTest(this);
- }
-
- bool run ()
- {
- bool passed = true;
- const size_t M = 10;
-
- for (size_t N=10; N<1000000; N=size_t(2.1*N))
- {
- std::vector<Key> src(N); memset(src.data(),0,N*sizeof(Key));
- std::vector<Key> tmp(N); memset(tmp.data(),0,N*sizeof(Key));
- for (size_t i=0; i<N; i++) src[i] = uint64_t(rand())*uint64_t(rand());
-
- /* calculate checksum */
- Key sum0 = 0; for (size_t i=0; i<N; i++) sum0 += src[i];
-
- /* sort numbers */
- for (size_t i=0; i<M; i++) {
- radix_sort<Key>(src.data(),tmp.data(),N);
- }
-
- /* calculate checksum */
- Key sum1 = 0; for (size_t i=0; i<N; i++) sum1 += src[i];
- if (sum0 != sum1) passed = false;
-
- /* check if numbers are sorted */
- for (size_t i=1; i<N; i++)
- passed &= src[i-1] <= src[i];
- }
-
- return passed;
- }
- };
-
- RadixSortRegressionTest<uint32_t> test_u32("RadixSortRegressionTestU32");
- RadixSortRegressionTest<uint64_t> test_u64("RadixSortRegressionTestU64");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h
deleted file mode 100644
index a758227c1b..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h
+++ /dev/null
@@ -1,457 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../simd/simd.h"
-#include "parallel_for.h"
-#if defined(TASKING_GCD) && defined(BUILD_IOS)
-#include "../sys/alloc.h"
-#endif
-#include <algorithm>
-
-namespace embree
-{
- template<class T>
- __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
- {
- for(size_t i = 1;i<length;++i)
- {
- T v = array[i];
- size_t j = i;
- while(j > 0 && v < array[j-1])
- {
- array[j] = array[j-1];
- --j;
- }
- array[j] = v;
- }
- }
-
- template<class T>
- __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
- {
- for(size_t i = 1;i<length;++i)
- {
- T v = array[i];
- size_t j = i;
- while(j > 0 && v > array[j-1])
- {
- array[j] = array[j-1];
- --j;
- }
- array[j] = v;
- }
- }
-
- template<class T>
- void quicksort_ascending(T *__restrict__ t,
- const ssize_t begin,
- const ssize_t end)
- {
- if (likely(begin < end))
- {
- const T pivotvalue = t[begin];
- ssize_t left = begin - 1;
- ssize_t right = end + 1;
-
- while(1)
- {
- while (t[--right] > pivotvalue);
- while (t[++left] < pivotvalue);
-
- if (left >= right) break;
-
- const T temp = t[right];
- t[right] = t[left];
- t[left] = temp;
- }
-
- const int pivot = right;
- quicksort_ascending(t, begin, pivot);
- quicksort_ascending(t, pivot + 1, end);
- }
- }
-
- template<class T>
- void quicksort_decending(T *__restrict__ t,
- const ssize_t begin,
- const ssize_t end)
- {
- if (likely(begin < end))
- {
- const T pivotvalue = t[begin];
- ssize_t left = begin - 1;
- ssize_t right = end + 1;
-
- while(1)
- {
- while (t[--right] < pivotvalue);
- while (t[++left] > pivotvalue);
-
- if (left >= right) break;
-
- const T temp = t[right];
- t[right] = t[left];
- t[left] = temp;
- }
-
- const int pivot = right;
- quicksort_decending(t, begin, pivot);
- quicksort_decending(t, pivot + 1, end);
- }
- }
-
-
- template<class T, ssize_t THRESHOLD>
- void quicksort_insertionsort_ascending(T *__restrict__ t,
- const ssize_t begin,
- const ssize_t end)
- {
- if (likely(begin < end))
- {
- const ssize_t size = end-begin+1;
- if (likely(size <= THRESHOLD))
- {
- insertionsort_ascending<T>(&t[begin],size);
- }
- else
- {
- const T pivotvalue = t[begin];
- ssize_t left = begin - 1;
- ssize_t right = end + 1;
-
- while(1)
- {
- while (t[--right] > pivotvalue);
- while (t[++left] < pivotvalue);
-
- if (left >= right) break;
-
- const T temp = t[right];
- t[right] = t[left];
- t[left] = temp;
- }
-
- const ssize_t pivot = right;
- quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
- quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
- }
- }
- }
-
-
- template<class T, ssize_t THRESHOLD>
- void quicksort_insertionsort_decending(T *__restrict__ t,
- const ssize_t begin,
- const ssize_t end)
- {
- if (likely(begin < end))
- {
- const ssize_t size = end-begin+1;
- if (likely(size <= THRESHOLD))
- {
- insertionsort_decending<T>(&t[begin],size);
- }
- else
- {
-
- const T pivotvalue = t[begin];
- ssize_t left = begin - 1;
- ssize_t right = end + 1;
-
- while(1)
- {
- while (t[--right] < pivotvalue);
- while (t[++left] > pivotvalue);
-
- if (left >= right) break;
-
- const T temp = t[right];
- t[right] = t[left];
- t[left] = temp;
- }
-
- const ssize_t pivot = right;
- quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
- quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
- }
- }
- }
-
- template<typename T>
- static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
- {
- static const unsigned int BITS = 8;
- static const unsigned int BUCKETS = (1 << BITS);
- static const unsigned int CMP_SORT_THRESHOLD = 16;
-
- __aligned(64) unsigned int count[BUCKETS];
-
- /* clear buckets */
- for (size_t i=0;i<BUCKETS;i++) count[i] = 0;
-
- /* count buckets */
-#if defined(__INTEL_COMPILER)
-#pragma nounroll
-#endif
- for (size_t i=0;i<num;i++)
- count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;
-
- /* prefix sums */
- __aligned(64) unsigned int head[BUCKETS];
- __aligned(64) unsigned int tail[BUCKETS];
-
- head[0] = 0;
- for (size_t i=1; i<BUCKETS; i++)
- head[i] = head[i-1] + count[i-1];
-
- for (size_t i=0; i<BUCKETS-1; i++)
- tail[i] = head[i+1];
-
- tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];
-
- assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
- assert(tail[BUCKETS-1] == num);
-
- /* in-place swap */
- for (size_t i=0;i<BUCKETS;i++)
- {
- /* process bucket */
- while(head[i] < tail[i])
- {
- T v = morton[head[i]];
- while(1)
- {
- const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
- if (b == i) break;
- std::swap(v,morton[head[b]++]);
- }
- assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
- morton[head[i]++] = v;
- }
- }
- if (shift == 0) return;
-
- size_t offset = 0;
- for (size_t i=0;i<BUCKETS;i++)
- if (count[i])
- {
-
- for (size_t j=offset;j<offset+count[i]-1;j++)
- assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);
-
- if (unlikely(count[i] < CMP_SORT_THRESHOLD))
- insertionsort_ascending(morton + offset, count[i]);
- else
- radixsort32(morton + offset, count[i], shift-BITS);
-
- for (size_t j=offset;j<offset+count[i]-1;j++)
- assert(morton[j] <= morton[j+1]);
-
- offset += count[i];
- }
- }
-
- template<typename Ty, typename Key>
- class ParallelRadixSort
- {
- static const size_t MAX_TASKS = 64;
- static const size_t BITS = 8;
- static const size_t BUCKETS = (1 << BITS);
- typedef unsigned int TyRadixCount[BUCKETS];
-
- template<typename T>
- static bool compare(const T& v0, const T& v1) {
- return (Key)v0 < (Key)v1;
- }
-
- private:
- ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
- ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement
-
-
- public:
- ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
- : radixCount(nullptr), src(src), tmp(tmp), N(N) {}
-
- void sort(const size_t blockSize)
- {
- assert(blockSize > 0);
-
- /* perform single threaded sort for small N */
- if (N<=blockSize) // handles also special case of 0!
- {
- /* do inplace sort inside destination array */
- std::sort(src,src+N,compare<Ty>);
- }
-
- /* perform parallel sort for large N */
- else
- {
- const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
- tbbRadixSort(numThreads);
- }
- }
-
- ~ParallelRadixSort()
- {
- alignedFree(radixCount);
- radixCount = nullptr;
- }
-
- private:
-
- void tbbRadixIteration0(const Key shift,
- const Ty* __restrict const src,
- Ty* __restrict const dst,
- const size_t threadIndex, const size_t threadCount)
- {
- const size_t startID = (threadIndex+0)*N/threadCount;
- const size_t endID = (threadIndex+1)*N/threadCount;
-
- /* mask to extract some number of bits */
- const Key mask = BUCKETS-1;
-
- /* count how many items go into the buckets */
- for (size_t i=0; i<BUCKETS; i++)
- radixCount[threadIndex][i] = 0;
-
- /* iterate over src array and count buckets */
- unsigned int * __restrict const count = radixCount[threadIndex];
-#if defined(__INTEL_COMPILER)
-#pragma nounroll
-#endif
- for (size_t i=startID; i<endID; i++) {
-#if defined(__X86_64__) || defined(__aarch64__)
- const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
-#else
- const Key index = ((Key)src[i] >> shift) & mask;
-#endif
- count[index]++;
- }
- }
-
- void tbbRadixIteration1(const Key shift,
- const Ty* __restrict const src,
- Ty* __restrict const dst,
- const size_t threadIndex, const size_t threadCount)
- {
- const size_t startID = (threadIndex+0)*N/threadCount;
- const size_t endID = (threadIndex+1)*N/threadCount;
-
- /* mask to extract some number of bits */
- const Key mask = BUCKETS-1;
-
- /* calculate total number of items for each bucket */
- __aligned(64) unsigned int total[BUCKETS];
- /*
- for (size_t i=0; i<BUCKETS; i++)
- total[i] = 0;
- */
- for (size_t i=0; i<BUCKETS; i+=VSIZEX)
- vintx::store(&total[i], zero);
-
- for (size_t i=0; i<threadCount; i++)
- {
- /*
- for (size_t j=0; j<BUCKETS; j++)
- total[j] += radixCount[i][j];
- */
- for (size_t j=0; j<BUCKETS; j+=VSIZEX)
- vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
- }
-
- /* calculate start offset of each bucket */
- __aligned(64) unsigned int offset[BUCKETS];
- offset[0] = 0;
- for (size_t i=1; i<BUCKETS; i++)
- offset[i] = offset[i-1] + total[i-1];
-
- /* calculate start offset of each bucket for this thread */
- for (size_t i=0; i<threadIndex; i++)
- {
- /*
- for (size_t j=0; j<BUCKETS; j++)
- offset[j] += radixCount[i][j];
- */
- for (size_t j=0; j<BUCKETS; j+=VSIZEX)
- vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
- }
-
- /* copy items into their buckets */
-#if defined(__INTEL_COMPILER)
-#pragma nounroll
-#endif
- for (size_t i=startID; i<endID; i++) {
- const Ty elt = src[i];
-#if defined(__X86_64__) || defined(__aarch64__)
- const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
-#else
- const size_t index = ((Key)src[i] >> shift) & mask;
-#endif
- dst[offset[index]++] = elt;
- }
- }
-
- void tbbRadixIteration(const Key shift, const bool last,
- const Ty* __restrict src, Ty* __restrict dst,
- const size_t numTasks)
- {
- affinity_partitioner ap;
- parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
- parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
- }
-
- void tbbRadixSort(const size_t numTasks)
- {
- radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);
-
- if (sizeof(Key) == sizeof(uint32_t)) {
- tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
- tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
- tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
- tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
- }
- else if (sizeof(Key) == sizeof(uint64_t))
- {
- tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
- tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
- tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
- tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
- tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
- tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
- tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
- tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
- }
- }
-
- private:
- TyRadixCount* radixCount;
- Ty* const src;
- Ty* const tmp;
- const size_t N;
- };
-
- template<typename Ty>
- void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
- {
- ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
- }
-
- template<typename Ty, typename Key>
- void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
- {
- ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
- }
-
- template<typename Ty>
- void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
- radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
- }
-
- template<typename Ty>
- void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
- radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
- }
-}