21 files changed, 0 insertions, 2292 deletions
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h
deleted file mode 100644
index 01f1f80f6c..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include <functional>
-#include "parallel_reduce.h"
-
-namespace embree
-{
-  
-  template<typename Index, class UnaryPredicate>
-    __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
-  {
-    bool ret = false;
-    
-#if defined(TASKING_TBB)
-#if TBB_INTERFACE_VERSION >= 12002
-    tbb::task_group_context context;
-    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
-        if (context.is_group_execution_cancelled()) return;
-        for (size_t i = r.begin(); i != r.end(); ++i) {
-          if (pred(i)) {
-            ret = true;
-            context.cancel_group_execution();
-          }
-        }
-      });
-#else
-    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
-        if (tbb::task::self().is_cancelled()) return;
-        for (size_t i = r.begin(); i != r.end(); ++i) {
-          if (pred(i)) {
-            ret = true;
-            tbb::task::self().cancel_group_execution();
-          }
-        }
-      });
-#endif
-#else
-    ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
-        bool localret = false;
-        for (auto i=r.begin(); i<r.end(); ++i) {
-          localret |= pred(i);
-        }
-        return localret;
-      },
-      std::bit_or<bool>()
-      );
-#endif
-    
-    return ret;
-  }
-  
-} // end namespace
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp
deleted file mode 100644
index acddc0ff81..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_filter.h"
-#include "../sys/regression.h"
-#include <map>
-
-namespace embree
-{
-  struct parallel_filter_regression_test : public RegressionTest
-  {
-    parallel_filter_regression_test(const char* name) : RegressionTest(name) {
-      registerRegressionTest(this);
-    }
-    
-    bool run ()
-    {
-      bool passed = true;
-      auto pred = [&]( uint32_t v ) { return (v & 0x3) == 0; };
-      
-      for (size_t N=10; N<1000000; N=size_t(2.1*N))
-      {
-        size_t N0 = rand() % N;
-        
-	/* initialize array with random numbers */
-	std::vector<uint32_t> src(N);
-        std::map<uint32_t,int> m;
-	for (size_t i=0; i<N; i++) src[i] = rand();
-
-        /* count elements up */
-	for (size_t i=N0; i<N; i++)
-          if (pred(src[i]))
-            m[src[i]] = 0;
-        for (size_t i=N0; i<N; i++)
-          if (pred(src[i]))
-            m[src[i]]++;
-
-        /* filter array */
-        //size_t M = sequential_filter(src.data(),N0,N,pred);
-        size_t M = parallel_filter(src.data(),N0,N,size_t(1024),pred);
-        
-	/* check if filtered data is correct */
-	for (size_t i=N0; i<M; i++) {
-          passed &= pred(src[i]);
-          m[src[i]]--;
-        }
-	for (size_t i=N0; i<M; i++)
-          passed &= (m[src[i]] == 0);
-      }
-
-      return passed;
-    }
-  };
-
-  parallel_filter_regression_test parallel_filter_regression("parallel_filter_regression");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h
deleted file mode 100644
index 5823fc631f..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for.h"
-
-namespace embree
-{
-  template<typename Ty, typename Index, typename Predicate>
-    inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
-  {
-    Index j = first;
-    for (Index i=first; i<last; i++)
-      if (predicate(data[i]))
-        data[j++] = data[i];
-
-    return j;
-  }
-
-  template<typename Ty, typename Index, typename Predicate>
-    inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
-  {
-    /* sequential fallback */
-    if (end-begin <= minStepSize)
-      return sequential_filter(data,begin,end,predicate);
-
-    /* calculate number of tasks to use */
-    enum { MAX_TASKS = 64 };
-    const Index numThreads = TaskScheduler::threadCount();
-    const Index numBlocks  = (end-begin+minStepSize-1)/minStepSize;
-    const Index taskCount  = min(numThreads,numBlocks,(Index)MAX_TASKS);
-
-    /* filter blocks */
-    Index nused[MAX_TASKS];
-    Index nfree[MAX_TASKS];
-    parallel_for(taskCount, [&](const Index taskIndex)
-    {
-      const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
-      const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
-      const Index i2 = sequential_filter(data,i0,i1,predicate);
-      nused[taskIndex] = i2-i0;
-      nfree[taskIndex] = i1-i2;
-    });
-
-    /* calculate offsets */
-    Index sused=0;
-    Index sfree=0;
-    Index pfree[MAX_TASKS];
-    for (Index i=0; i<taskCount; i++) 
-    {
-      sused+=nused[i];
-      Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
-    }
-
-    /* return if we did not filter out any element */
-    assert(sfree <= end-begin);
-    assert(sused <= end-begin);
-    if (sused == end-begin)
-      return end;
-
-    /* otherwise we have to copy misplaced elements around */
-    parallel_for(taskCount, [&](const Index taskIndex)
-    {
-      /* destination to write elements to */
-      Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
-      Index dst_end = min(dst+nfree[taskIndex],begin+sused);
-      if (dst_end <= dst) return;
-
-      /* range of misplaced elements to copy to destination */
-      Index r0 = pfree[taskIndex];
-      Index r1 = r0+dst_end-dst;
-
-      /* find range in misplaced elements in back to front order */
-      Index k0=0;
-      for (Index i=taskCount-1; i>0; i--)
-      {
-        if (k0 > r1) break;
-        Index k1 = k0+nused[i];
-        Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
-        for (Index i=max(r0,k0); i<min(r1,k1); i++) {
-          Index isrc = src-i+k0-1;
-          assert(dst >= begin && dst < end);
-          assert(isrc >= begin && isrc < end);
-          data[dst++] = data[isrc];
-        }
-        k0 = k1;
-      }
-    });
-
-    return begin+sused;
-  }
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp
deleted file mode 100644
index ef070ebc4d..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_for.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
-  struct parallel_for_regression_test : public RegressionTest
-  {
-    parallel_for_regression_test(const char* name) : RegressionTest(name) {
-      registerRegressionTest(this);
-    }
-    
-    bool run ()
-    {
-      bool passed = true;
-
-      const size_t M = 10;
-      for (size_t N=10; N<10000000; N=size_t(2.1*N))
-      {
-        /* sequentially calculate sum of squares */
-        size_t sum0 = 0;
-        for (size_t i=0; i<N; i++) {
-          sum0 += i*i;
-        }
-
-        /* parallel calculation of sum of squares */
-        for (size_t m=0; m<M; m++)
-        {
-          std::atomic<size_t> sum1(0);
-          parallel_for( size_t(0), size_t(N), size_t(1024), [&](const range<size_t>& r) 
-          {
-            size_t s = 0;
-            for (size_t i=r.begin(); i<r.end(); i++) 
-              s += i*i;
-            sum1 += s;
-          });
-          passed = sum0 == sum1;
-        }
-      }
-      
-      return passed;
-    }
-  };
-
-  parallel_for_regression_test parallel_for_regression("parallel_for_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
deleted file mode 100644
index 51d296fb16..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
+++ /dev/null
@@ -1,229 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../tasking/taskscheduler.h"
-#include "../sys/array.h"
-#include "../math/math.h"
-#include "../math/range.h"
-
-#if defined(TASKING_GCD) && defined(BUILD_IOS)
-#include <dispatch/dispatch.h>
-#include <algorithm>
-#include <type_traits>
-#endif
-
-namespace embree
-{
-  /* parallel_for without range */
-  template<typename Index, typename Func>
-    __forceinline void parallel_for( const Index N, const Func& func)
-  {
-#if defined(TASKING_INTERNAL)
-    if (N) {
-      TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
-          assert(r.size() == 1);
-          func(r.begin());
-        });
-      if (!TaskScheduler::wait())
-        // -- GODOT start --
-        // throw std::runtime_error("task cancelled");
-        abort(); 
-        // -- GODOT end --
-    }
-#elif defined(TASKING_GCD) && defined(BUILD_IOS)
-      
-    const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1;
-    const size_t length = N;
-    const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks;
-    const size_t numBlocks = (length + blockSize-1) / blockSize;
-      
-    dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
-          
-        const size_t start = (currentBlock * blockSize);
-        const size_t blockLength = std::min(length - start, blockSize);
-        const size_t end = start + blockLength;
-          
-        for(size_t i=start; i < end; i++)
-        {
-            func(i);
-        }
-    });
-      
-#elif defined(TASKING_TBB)
-  #if TBB_INTERFACE_VERSION >= 12002
-    tbb::task_group_context context;
-    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
-        func(i);
-      },context);
-    if (context.is_group_execution_cancelled())
-      // -- GODOT start --
-      // throw std::runtime_error("task cancelled");
-      abort(); 
-      // -- GODOT end --
-  #else
-    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
-        func(i);
-      });
-    if (tbb::task::self().is_cancelled())
-      // -- GODOT start --
-      // throw std::runtime_error("task cancelled");
-      abort(); 
-      // -- GODOT end --
-  #endif
-
-#elif defined(TASKING_PPL)
-    concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { 
-        func(i);
-      });
-#else
-#  error "no tasking system enabled"
-#endif
-  }
-  
-  /* parallel for with range and granulatity */
-  template<typename Index, typename Func>
-    __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
-  {
-    assert(first <= last);
-#if defined(TASKING_INTERNAL)
-    TaskScheduler::spawn(first,last,minStepSize,func);
-    if (!TaskScheduler::wait())
-      // -- GODOT start --
-      // throw std::runtime_error("task cancelled");
-      abort(); 
-      // -- GODOT end --
-
-#elif defined(TASKING_GCD) && defined(BUILD_IOS)
-      
-    const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1;
-    const size_t length = last - first;
-    const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks;
-    size_t blockSize = std::max<size_t>(minStepSize,blockSizeByThreads);
-    blockSize += blockSize % 4;
-      
-    const size_t numBlocks = (length + blockSize-1) / blockSize;
-      
-    dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
-          
-        const size_t start = first + (currentBlock * blockSize);
-        const size_t end = std::min<size_t>(last, start + blockSize);
-          
-        func( embree::range<Index>(start,end) );
-    });
-      
-
-#elif defined(TASKING_TBB)
-  #if TBB_INTERFACE_VERSION >= 12002
-    tbb::task_group_context context;
-    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
-        func(range<Index>(r.begin(),r.end()));
-      },context);
-    if (context.is_group_execution_cancelled())
-      // -- GODOT start --
-      // throw std::runtime_error("task cancelled");
-      abort(); 
-      // -- GODOT end --
-  #else
-    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
-        func(range<Index>(r.begin(),r.end()));
-      });
-    if (tbb::task::self().is_cancelled())
-      // -- GODOT start --
-      // throw std::runtime_error("task cancelled");
-      abort(); 
-      // -- GODOT end --
-  #endif
-
-#elif defined(TASKING_PPL)
-    concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { 
-        func(range<Index>(i,i+1)); 
-      });
-
-#else
-#  error "no tasking system enabled"
-#endif
-  }
-  
-  /* parallel for with range */
-  template<typename Index, typename Func>
-    __forceinline void parallel_for( const Index first, const Index last, const Func& func)
-  {
-    assert(first <= last);
-    parallel_for(first,last,(Index)1,func);
-  }
-
-#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
-
-  template<typename Index, typename Func>
-    __forceinline void parallel_for_static( const Index N, const Func& func)
-  {
-    #if TBB_INTERFACE_VERSION >= 12002
-      tbb::task_group_context context;
-      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
-          func(i);
-        },tbb::simple_partitioner(),context);
-      if (context.is_group_execution_cancelled())
-        // -- GODOT start --
-        // throw std::runtime_error("task cancelled");
-        abort(); 
-        // -- GODOT end --
-    #else
-      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
-          func(i);
-        },tbb::simple_partitioner());
-      if (tbb::task::self().is_cancelled())
-        // -- GODOT start --
-        // throw std::runtime_error("task cancelled");
-        abort(); 
-        // -- GODOT end --
-    #endif
-  }
-
-  typedef tbb::affinity_partitioner affinity_partitioner;
-
-  template<typename Index, typename Func>
-    __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
-  {
-    #if TBB_INTERFACE_VERSION >= 12002
-      tbb::task_group_context context;
-      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
-          func(i);
-        },ap,context);
-      if (context.is_group_execution_cancelled())
-       // -- GODOT start --
-       // throw std::runtime_error("task cancelled");
-       abort(); 
-       // -- GODOT end --
-    #else
-      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
-          func(i);
-        },ap);
-      if (tbb::task::self().is_cancelled())
-        // -- GODOT start --
-        // throw std::runtime_error("task cancelled");
-        abort(); 
-        // -- GODOT end --
-    #endif
-  }
-
-#else
-
-  template<typename Index, typename Func>
-    __forceinline void parallel_for_static( const Index N, const Func& func) 
-  {
-    parallel_for(N,func);
-  }
-
-  struct affinity_partitioner {
-  };
-
-  template<typename Index, typename Func>
-    __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) 
-  {
-    parallel_for(N,func);
-  }
-
-#endif
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp
deleted file mode 100644
index 0337611b35..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_for_for.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
-  struct parallel_for_for_regression_test : public RegressionTest
-  {
-    parallel_for_for_regression_test(const char* name) : RegressionTest(name) {
-      registerRegressionTest(this);
-    }
-    
-    bool run ()
-    {
-      bool passed = true;
-
-      /* create vector with random numbers */
-      size_t sum0 = 0;
-      size_t K = 0;
-      const size_t M = 1000;
-      std::vector<std::vector<size_t>* > array2(M);
-      for (size_t i=0; i<M; i++) {
-        const size_t N = rand() % 1024;
-        K+=N;
-        array2[i] = new std::vector<size_t>(N);
-        for (size_t j=0; j<N; j++) 
-          sum0 += (*array2[i])[j] = rand();
-      }
-
-      /* array to test global index */
-      std::vector<atomic<size_t>> verify_k(K);
-      for (size_t i=0; i<K; i++) verify_k[i].store(0);
-
-      /* add all numbers using parallel_for_for */
-      std::atomic<size_t> sum1(0);
-      parallel_for_for( array2, size_t(1), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k) -> size_t
-      {
-        size_t s = 0;
-	for (size_t i=r.begin(); i<r.end(); i++) {
-	  s += (*v)[i];
-          verify_k[k++]++;
-        }
-        sum1 += s;
-	return sum1;
-      });
-      passed &= (sum0 == sum1);
-
-      /* check global index */
-      for (size_t i=0; i<K; i++) 
-        passed &= (verify_k[i] == 1);
-
-      /* delete vectors again */
-      for (size_t i=0; i<array2.size(); i++)
-	delete array2[i];
-      
-      return passed;
-    }
-  };
-
-  parallel_for_for_regression_test parallel_for_for_regression("parallel_for_for_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h
deleted file mode 100644
index 852b8a0900..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for.h"
-
-namespace embree
-{
-  template<typename ArrayArray, typename Func>
-    __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) 
-  {
-    size_t k=0;
-    for (size_t i=0; i!=array2.size(); ++i) {
-      const size_t N = array2[i]->size();
-      if (N) func(array2[i],range<size_t>(0,N),k);
-      k+=N;
-    }
-  }
-
-  class ParallelForForState
-  {
-  public:
-
-    enum { MAX_TASKS = 64 };
-
-    __forceinline ParallelForForState () 
-      : taskCount(0) {}
-
-    template<typename ArrayArray>
-      __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
-      init(array2,minStepSize);
-    } 
-
-    template<typename ArrayArray>
-      __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
-    {
-      /* first calculate total number of elements */
-      size_t N = 0;
-      for (size_t i=0; i<array2.size(); i++) {
-	N += array2[i] ? array2[i]->size() : 0;
-      }
-      this->N = N;
-
-      /* calculate number of tasks to use */
-      const size_t numThreads = TaskScheduler::threadCount();
-      const size_t numBlocks  = (N+minStepSize-1)/minStepSize;
-      taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
-      
-      /* calculate start (i,j) for each task */
-      size_t taskIndex = 0;
-      i0[taskIndex] = 0;
-      j0[taskIndex] = 0;
-      size_t k0 = (++taskIndex)*N/taskCount;
-      for (size_t i=0, k=0; taskIndex < taskCount; i++) 
-      {
-	assert(i<array2.size());
-	size_t j=0, M = array2[i] ? array2[i]->size() : 0;
-	while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
-	  assert(taskIndex<taskCount);
-	  i0[taskIndex] = i;
-	  j0[taskIndex] = j += k0-k;
-	  k=k0;
-	  k0 = (++taskIndex)*N/taskCount;
-	}
-	k+=M-j;
-      }
-    }
-
-    __forceinline size_t size() const {
-      return N;
-    }
-    
-  public:
-    size_t i0[MAX_TASKS];
-    size_t j0[MAX_TASKS];
-    size_t taskCount;
-    size_t N;
-  };
-
-  template<typename ArrayArray, typename Func>
-    __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
-  {
-    ParallelForForState state(array2,minStepSize);
-    
-    parallel_for(state.taskCount, [&](const size_t taskIndex) 
-    {
-      /* calculate range */
-      const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
-      const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
-      size_t i0 = state.i0[taskIndex];
-      size_t j0 = state.j0[taskIndex];
-
-      /* iterate over arrays */
-      size_t k=k0;
-      for (size_t i=i0; k<k1; i++) {
-        const size_t N =  array2[i] ? array2[i]->size() : 0;
-        const size_t r0 = j0, r1 = min(N,r0+k1-k);
-        if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k);
-        k+=r1-r0; j0 = 0;
-      }
-    });
-  }
-
-  template<typename ArrayArray, typename Func>
-    __forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
-  {
-    parallel_for_for(array2,1,func);
-  }
-
-  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
-  {
-    ParallelForForState state(array2,minStepSize);
-    Value temp[ParallelForForState::MAX_TASKS];
-
-    for (size_t i=0; i<state.taskCount; i++)
-      temp[i] = identity;
-    
-    parallel_for(state.taskCount, [&](const size_t taskIndex) 
-    {
-      /* calculate range */
-      const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
-      const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
-      size_t i0 = state.i0[taskIndex];
-      size_t j0 = state.j0[taskIndex];
-
-      /* iterate over arrays */
-      size_t k=k0;
-      for (size_t i=i0; k<k1; i++) {
-        const size_t N =  array2[i] ? array2[i]->size() : 0;
-        const size_t r0 = j0, r1 = min(N,r0+k1-k);
-        if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k));
-        k+=r1-r0; j0 = 0;
-      }
-    });
-
-    Value ret = identity;
-    for (size_t i=0; i<state.taskCount; i++)
-      ret = reduction(ret,temp[i]);
-    return ret;
-  }
-
-  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
-  {
-    return parallel_for_for_reduce(array2,1,identity,func,reduction);
-  }
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp
deleted file mode 100644
index 0169d8e481..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_for_for_prefix_sum.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
-  struct parallel_for_for_prefix_sum_regression_test : public RegressionTest
-  {
-    parallel_for_for_prefix_sum_regression_test(const char* name) : RegressionTest(name) {
-      registerRegressionTest(this);
-    }
-    
-    bool run ()
-    {
-      bool passed = true;
-
-      /* create vector with random numbers */
-      const size_t M = 10;
-      std::vector<atomic<size_t>> flattened;
-      typedef std::vector<std::vector<size_t>* > ArrayArray;
-      ArrayArray array2(M);
-      size_t K = 0;
-      for (size_t i=0; i<M; i++) {
-        const size_t N = rand() % 10;
-        K += N;
-        array2[i] = new std::vector<size_t>(N);
-        for (size_t j=0; j<N; j++) 
-          (*array2[i])[j] = rand() % 10;
-      }
-  
-      /* array to test global index */
-      std::vector<atomic<size_t>> verify_k(K);
-      for (size_t i=0; i<K; i++) verify_k[i].store(0);
-
-      ParallelForForPrefixSumState<size_t> state(array2,size_t(1));
-  
-      /* dry run only counts */
-      size_t S = parallel_for_for_prefix_sum0( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i) -> size_t
-      {
-        size_t s = 0;
-	for (size_t i=r.begin(); i<r.end(); i++) {
-          s += (*v)[i];
-          verify_k[k++]++;
-        }
-        return s;
-      }, [](size_t v0, size_t v1) { return v0+v1; });
-      
-      /* create properly sized output array */
-      flattened.resize(S);
-      for (auto& a : flattened) a.store(0);
-
-      /* now we actually fill the flattened array */
-      parallel_for_for_prefix_sum1( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t i, const size_t base) -> size_t
-      {
-        size_t s = 0;
-	for (size_t i=r.begin(); i<r.end(); i++) {
-          for (size_t j=0; j<(*v)[i]; j++) {
-            flattened[base+s+j]++;
-          }
-          s += (*v)[i];
-          verify_k[k++]++;
-        }
-        return s;
-      }, [](size_t v0, size_t v1) { return v0+v1; });
-
-      /* check global index */
-      for (size_t i=0; i<K; i++) 
-        passed &= (verify_k[i] == 2);
-
-      /* check if each element was assigned exactly once */
-      for (size_t i=0; i<flattened.size(); i++)
-        passed &= (flattened[i] == 1);
-      
-      /* delete arrays again */
-      for (size_t i=0; i<array2.size(); i++)
-	delete array2[i];
-
-      return passed;
-    }
-  };
-
-  parallel_for_for_prefix_sum_regression_test parallel_for_for_prefix_sum_regression("parallel_for_for_prefix_sum_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h
deleted file mode 100644
index d2671d8a6a..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for_for.h"
-#include "parallel_prefix_sum.h"
-
-namespace embree
-{
-  template<typename Value>
-    struct ParallelForForPrefixSumState : public ParallelForForState
-  {
-    __forceinline ParallelForForPrefixSumState () {}
-
-    template<typename ArrayArray>
-      __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
-      : ParallelForForState(array2,minStepSize) {}
-
-    ParallelPrefixSumState<Value> prefix_state;
-  };
-  
-  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, 
-                                                      const Value& identity, const Func& func, const Reduction& reduction)
-  {
-    /* calculate number of tasks to use */
-    const size_t taskCount = state.taskCount;
-    /* perform parallel prefix sum */
-    parallel_for(taskCount, [&](const size_t taskIndex)
-    {
-      const size_t k0 = (taskIndex+0)*state.size()/taskCount;
-      const size_t k1 = (taskIndex+1)*state.size()/taskCount;
-      size_t i0 = state.i0[taskIndex];
-      size_t j0 = state.j0[taskIndex];
-
-      /* iterate over arrays */
-      size_t k=k0;
-      Value N=identity;
-      for (size_t i=i0; k<k1; i++) {
-	const size_t size = array2[i] ? array2[i]->size() : 0;
-        const size_t r0 = j0, r1 = min(size,r0+k1-k);
-        if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i));
-        k+=r1-r0; j0 = 0;
-      }
-      state.prefix_state.counts[taskIndex] = N;
-    });
-
-    /* calculate prefix sum */
-    Value sum=identity;
-    for (size_t i=0; i<taskCount; i++)
-    {
-      const Value c = state.prefix_state.counts[i];
-      state.prefix_state.sums[i] = sum;
-      sum=reduction(sum,c);
-    }
-
-    return sum;
-  }
-
-  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, 
-                                                      const Value& identity, const Func& func, const Reduction& reduction)
-  {
-    /* calculate number of tasks to use */
-    const size_t taskCount = state.taskCount;
-    /* perform parallel prefix sum */
-    parallel_for(taskCount, [&](const size_t taskIndex)
-    {
-      const size_t k0 = (taskIndex+0)*state.size()/taskCount;
-      const size_t k1 = (taskIndex+1)*state.size()/taskCount;
-      size_t i0 = state.i0[taskIndex];
-      size_t j0 = state.j0[taskIndex];
-
-      /* iterate over arrays */
-      size_t k=k0;
-      Value N=identity;
-      for (size_t i=i0; k<k1; i++) {
-	const size_t size = array2[i] ? array2[i]->size() : 0;
-        const size_t r0 = j0, r1 = min(size,r0+k1-k);
-        if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N)));
-        k+=r1-r0; j0 = 0;
-      }
-      state.prefix_state.counts[taskIndex] = N;
-    });
-
-    /* calculate prefix sum */
-    Value sum=identity;
-    for (size_t i=0; i<taskCount; i++)
-    {
-      const Value c = state.prefix_state.counts[i];
-      state.prefix_state.sums[i] = sum;
-      sum=reduction(sum,c);
-    }
-
-    return sum;
-  }
-
-  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, 
-						     const Value& identity, const Func& func, const Reduction& reduction)
-  {
-    return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction);
-  }
-
-  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, 
-						     const Value& identity, const Func& func, const Reduction& reduction)
-  {
-    return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction);
-  }
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp
deleted file mode 100644
index 09dc303f81..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_map.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
-  struct parallel_map_regression_test : public RegressionTest
-  {
-    parallel_map_regression_test(const char* name) : RegressionTest(name) {
-      registerRegressionTest(this);
-    }
-    
-    bool run ()
-    {
-      bool passed = true;
-
-      /* create key/value vectors with random numbers */
-      const size_t N = 10000;
-      std::vector<uint32_t> keys(N);
-      std::vector<uint32_t> vals(N);
-      for (size_t i=0; i<N; i++) keys[i] = 2*unsigned(i)*647382649;
-      for (size_t i=0; i<N; i++) std::swap(keys[i],keys[rand()%N]);
-      for (size_t i=0; i<N; i++) vals[i] = 2*rand();
-      
-      /* create map */
-      parallel_map<uint32_t,uint32_t> map;
-      map.init(keys,vals);
-
-      /* check that all keys are properly mapped */
-      for (size_t i=0; i<N; i++) {
-        const uint32_t* val = map.lookup(keys[i]);
-        passed &= val && (*val == vals[i]);
-      }
-
-      /* check that these keys are not in the map */
-      for (size_t i=0; i<N; i++) {
-        passed &= !map.lookup(keys[i]+1);
-      }
-
-      return passed;
-    }
-  };
-
-  parallel_map_regression_test parallel_map_regression("parallel_map_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.h b/thirdparty/embree-aarch64/common/algorithms/parallel_map.h
deleted file mode 100644
index 02e1a8f8d0..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_map.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_sort.h"
-
-namespace embree
-{
-  /*! implementation of a key/value map with parallel construction */
-  template<typename Key, typename Val>
-  class parallel_map
-  {
-    /* key/value pair to build the map */
-    struct KeyValue
-    {
-      __forceinline KeyValue () {}
-
-      __forceinline KeyValue (const Key key, const Val val)
-	: key(key), val(val) {}
-
-      __forceinline operator Key() const {
-	return key;
-      }
-
-    public:
-      Key key;
-      Val val;
-    };
-
-  public:
-    
-    /*! parallel map constructors */
-    parallel_map () {}
-
-    /*! construction from pair of vectors */
-    template<typename KeyVector, typename ValVector>
-      parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }
-
-    /*! initialized the parallel map from a vector with keys and values */
-    template<typename KeyVector, typename ValVector>
-      void init(const KeyVector& keys, const ValVector& values) 
-    {
-      /* reserve sufficient space for all data */
-      assert(keys.size() == values.size());
-      vec.resize(keys.size());
-      
-      /* generate key/value pairs */
-      parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) {
-	for (size_t i=r.begin(); i<r.end(); i++)
-	  vec[i] = KeyValue((Key)keys[i],values[i]);
-      });
-
-      /* perform parallel radix sort of the key/value pairs */
-      std::vector<KeyValue> temp(keys.size());
-      radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size());
-    }
-
-    /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */
-    __forceinline const Val* lookup(const Key& key) const 
-    {
-      typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
-      if (i == vec.end()) return nullptr;
-      if (i->key != key) return nullptr;
-      return &i->val;
-    }
-
-    /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
-    __forceinline Val lookup(const Key& key, const Val& def) const 
-    {
-      typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
-      if (i == vec.end()) return def;
-      if (i->key != key) return def;
-      return i->val;
-    }
-
-    /*! clears all state */
-    void clear() {
-      vec.clear();
-    }
-
-  private:
-    std::vector<KeyValue> vec;    //!< vector containing sorted elements
-  };
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp
deleted file mode 100644
index eb20c4465d..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_partition.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
-  struct parallel_partition_regression_test : public RegressionTest
-  {
-    parallel_partition_regression_test(const char* name) : RegressionTest(name) {
-      registerRegressionTest(this);
-    }
-    
-    bool run ()
-    {
-      bool passed = true;
-
-      for (size_t i=0; i<100; i++)
-      {
-        /* create random permutation */
-        size_t N = std::rand() % 1000000;
-        std::vector<unsigned> array(N);
-        for (unsigned i=0; i<N; i++) array[i] = i;
-        for (auto& v : array) std::swap(v,array[std::rand()%array.size()]);
-        size_t split = std::rand() % (N+1);
-
-        /* perform parallel partitioning */
-        size_t left_sum = 0, right_sum = 0;
-        size_t mid = parallel_partitioning(array.data(),0,array.size(),0,left_sum,right_sum,
-                                           [&] ( size_t i ) { return i < split; },
-                                           []  ( size_t& sum, unsigned v) { sum += v; },
-                                           []  ( size_t& sum, size_t v) { sum += v; },
-                                           128);
-        
-        /*serial_partitioning(array.data(),0,array.size(),left_sum,right_sum,
-                            [&] ( size_t i ) { return i < split; },
-                            []  ( size_t& left_sum, int v) { left_sum += v; });*/
-
-        /* verify result */
-        passed &= mid == split;
-        passed &= left_sum == split*(split-1)/2;
-        passed &= right_sum == N*(N-1)/2-left_sum;
-        for (size_t i=0; i<split; i++) passed &= array[i] < split;
-        for (size_t i=split; i<N; i++) passed &= array[i] >= split;
-      }
-      
-      return passed;
-    }
-  };
-
-  parallel_partition_regression_test parallel_partition_regression("parallel_partition_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h
deleted file mode 100644
index 3b3ad7c854..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h
+++ /dev/null
@@ -1,283 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for.h"
-#include "../math/range.h"
-
-namespace embree
-{
-  /* serial partitioning */
-  template<typename T, typename V, typename IsLeft, typename Reduction_T>
-    __forceinline size_t serial_partitioning(T* array, 
-                                             const size_t begin,
-                                             const size_t end, 
-                                             V& leftReduction,
-                                             V& rightReduction,
-                                             const IsLeft& is_left, 
-                                             const Reduction_T& reduction_t)
-  {
-    T* l = array + begin;
-    T* r = array + end - 1;
-    
-    while(1)
-    {
-      /* *l < pivot */
-      while (likely(l <= r && is_left(*l) )) 
-      {
-        //prefetchw(l+4); // FIXME: enable?
-        reduction_t(leftReduction,*l);
-        ++l;
-      }
-      /* *r >= pivot) */
-      while (likely(l <= r && !is_left(*r)))
-      {
-        //prefetchw(r-4); FIXME: enable?
-        reduction_t(rightReduction,*r);
-        --r;
-      }
-      if (r<l) break;
-      
-      reduction_t(leftReduction ,*r);
-      reduction_t(rightReduction,*l);
-      xchg(*l,*r);
-      l++; r--;
-    }
-    
-    return l - array;        
-  }
-
-  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
-    class __aligned(64) parallel_partition_task
-  {
-    ALIGNED_CLASS_(64);
-  private:
-
-    static const size_t MAX_TASKS = 64;
-
-    T* array;
-    size_t N;
-    const IsLeft& is_left;
-    const Reduction_T& reduction_t;
-    const Reduction_V& reduction_v;
-    const Vi& identity;
-
-    size_t numTasks; 
-    __aligned(64) size_t counter_start[MAX_TASKS+1]; 
-    __aligned(64) size_t counter_left[MAX_TASKS+1];  
-    __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];  
-    __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; 
-    __aligned(64) V leftReductions[MAX_TASKS];           
-    __aligned(64) V rightReductions[MAX_TASKS];    
-
-  public:
-     
-    __forceinline parallel_partition_task(T* array, 
-                                          const size_t N, 
-                                          const Vi& identity, 
-                                          const IsLeft& is_left, 
-                                          const Reduction_T& reduction_t, 
-                                          const Reduction_V& reduction_v,
-                                          const size_t BLOCK_SIZE) 
-
-      : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
-      numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
-
-    __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
-    {
-      size_t i = 0;
-      while(index >= (size_t)r[i].size())
-      {
-        assert(i < numRanges);
-        index -= (size_t)r[i].size();
-        i++;
-      }	    
-      return &r[i];
-    }
-
-    __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
-                                                  const size_t numRightMisplacedRanges,
-                                                  const size_t startID,
-                                                  const size_t endID)
-    {
-      size_t leftLocalIndex  = startID;
-      size_t rightLocalIndex = startID;
-      const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
-      const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
-      
-      size_t l_left = l_range->size() - leftLocalIndex;
-      size_t r_left = r_range->size() - rightLocalIndex;
-      T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
-      T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
-      size_t size = endID - startID;
-      size_t items = min(size,min(l_left,r_left)); 
-     
-      while (size)
-      {
-        if (unlikely(l_left == 0))
-        {
-          l_range++;
-          l_left = l_range->size();
-          l = &array[l_range->begin()];
-          items = min(size,min(l_left,r_left));
-        }
-
-        if (unlikely(r_left == 0))
-        {		
-          r_range++;
-          r_left = r_range->size();
-          r = &array[r_range->begin()];          
-          items = min(size,min(l_left,r_left));
-        }
-
-        size   -= items;
-        l_left -= items;
-        r_left -= items;
-
-        while(items) {
-          items--;
-          xchg(*l++,*r++);
-        }
-      }
-    }
-
-    __forceinline size_t partition(V& leftReduction, V& rightReduction)
-    {
-      /* partition the individual ranges for each task */
-      parallel_for(numTasks,[&] (const size_t taskID) {
-          const size_t startID = (taskID+0)*N/numTasks;
-          const size_t endID   = (taskID+1)*N/numTasks;
-          V local_left(identity);
-          V local_right(identity);
-          const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
-          counter_start[taskID] = startID;
-          counter_left [taskID] = mid-startID;
-          leftReductions[taskID]  = local_left;
-          rightReductions[taskID] = local_right;
-        });
-      counter_start[numTasks] = N;
-      counter_left[numTasks]  = 0;
-      
-      /* finalize the reductions */
-      for (size_t i=0; i<numTasks; i++) {
-        reduction_v(leftReduction,leftReductions[i]);
-        reduction_v(rightReduction,rightReductions[i]);
-      }
-
-      /* calculate mid point for partitioning */
-      size_t mid = counter_left[0];
-      for (size_t i=1; i<numTasks; i++)
-        mid += counter_left[i];
-      const range<ssize_t> globalLeft (0,mid);
-      const range<ssize_t> globalRight(mid,N);
-
-      /* calculate all left and right ranges that are on the wrong global side */
-      size_t numMisplacedRangesLeft  = 0;
-      size_t numMisplacedRangesRight = 0;
-      size_t numMisplacedItemsLeft   = 0;
-      size_t numMisplacedItemsRight  = 0;
-
-      for (size_t i=0; i<numTasks; i++)
-      {	    
-        const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
-        const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
-        const range<ssize_t> left_misplaced  = globalLeft. intersect(right_range);
-        const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
-
-        if (!left_misplaced.empty())  
-        {
-          numMisplacedItemsLeft += left_misplaced.size();
-          leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
-        }
-
-        if (!right_misplaced.empty()) 
-        {
-          numMisplacedItemsRight += right_misplaced.size();
-          rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
-        }
-      }
-      assert( numMisplacedItemsLeft == numMisplacedItemsRight );
-
-      /* if no items are misplaced we are done */
-      if (numMisplacedItemsLeft == 0)
-        return mid;
-
-      /* otherwise we copy the items to the right place in parallel */
-      parallel_for(numTasks,[&] (const size_t taskID) {
-          const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
-          const size_t endID   = (taskID+1)*numMisplacedItemsLeft/numTasks;
-          swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);	                             
-        });
-
-      return mid;
-    }
-  };
-
-  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
-    __noinline size_t parallel_partitioning(T* array, 
-                                            const size_t begin,
-                                            const size_t end, 
-                                            const Vi &identity,
-                                            V &leftReduction,
-                                            V &rightReduction,
-                                            const IsLeft& is_left, 
-                                            const Reduction_T& reduction_t,
-                                            const Reduction_V& reduction_v,
-                                            size_t BLOCK_SIZE = 128)
-  {
-    /* fall back to single threaded partitioning for small N */
-    if (unlikely(end-begin < BLOCK_SIZE))
-      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
-
-    /* otherwise use parallel code */
-    else {
-      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
-      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
-      return begin+p->partition(leftReduction,rightReduction);    
-    }
-  }
-
-  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
-    __noinline size_t parallel_partitioning(T* array, 
-                                            const size_t begin,
-                                            const size_t end, 
-                                            const Vi &identity,
-                                            V &leftReduction,
-                                            V &rightReduction,
-                                            const IsLeft& is_left, 
-                                            const Reduction_T& reduction_t,
-                                            const Reduction_V& reduction_v,
-                                            size_t BLOCK_SIZE,
-                                            size_t PARALLEL_THRESHOLD)
-  {
-    /* fall back to single threaded partitioning for small N */
-    if (unlikely(end-begin < PARALLEL_THRESHOLD))
-      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
-
-    /* otherwise use parallel code */
-    else {
-      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
-      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
-      return begin+p->partition(leftReduction,rightReduction);    
-    }
-  }
-
-
-  template<typename T, typename IsLeft>
-    inline size_t parallel_partitioning(T* array, 
-                                        const size_t begin,
-                                        const size_t end, 
-                                        const IsLeft& is_left, 
-                                        size_t BLOCK_SIZE = 128)
-  {
-    size_t leftReduction = 0;
-    size_t rightReduction = 0;
-    return parallel_partitioning(
-      array,begin,end,0,leftReduction,rightReduction,is_left,
-      [] (size_t& t,const T& ref) {  },
-      [] (size_t& t0,size_t& t1) { },
-      BLOCK_SIZE);
-  }
-
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp
deleted file mode 100644
index 685952c3dc..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_prefix_sum.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
-  struct parallel_prefix_sum_regression_test : public RegressionTest
-  {
-    parallel_prefix_sum_regression_test(const char* name) : RegressionTest(name) {
-      registerRegressionTest(this);
-    }
-    
-    bool run ()
-    {
-      bool passed = true;
-      const size_t M = 10;
-      
-      for (size_t N=10; N<10000000; N=size_t(2.1*N))
-      {
-	/* initialize array with random numbers */
-        uint32_t sum0 = 0;
-	std::vector<uint32_t> src(N);
-	for (size_t i=0; i<N; i++) {
-	  sum0 += src[i] = rand();
-        }
-        
-	/* calculate parallel prefix sum */
-	std::vector<uint32_t> dst(N);
-	for (auto& v : dst) v = 0;
-	
-	for (size_t i=0; i<M; i++) {
-	  uint32_t sum1 = parallel_prefix_sum(src,dst,N,0,std::plus<uint32_t>());
-          passed &= (sum0 == sum1);
-        }
-        
-	/* check if prefix sum is correct */
-	for (size_t i=0, sum=0; i<N; sum+=src[i++])
-	  passed &= ((uint32_t)sum == dst[i]);
-      }
-      
-      return passed;
-    }
-  };
-
-  parallel_prefix_sum_regression_test parallel_prefix_sum_regression("parallel_prefix_sum_regression");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h
deleted file mode 100644
index 117c7a79b0..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for.h"
-
-namespace embree
-{
-  template<typename Value>
-    struct ParallelPrefixSumState 
-  {
-    enum { MAX_TASKS = 64 };
-    Value counts[MAX_TASKS];
-    Value sums  [MAX_TASKS];
-  };
-
-  template<typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
-  {
-    /* calculate number of tasks to use */
-    const size_t numThreads = TaskScheduler::threadCount();
-    const size_t numBlocks  = (last-first+minStepSize-1)/minStepSize;
-    const size_t taskCount  = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
-
-    /* perform parallel prefix sum */
-    parallel_for(taskCount, [&](const size_t taskIndex)
-    {
-      const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
-      const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
-      state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
-    });
-
-    /* calculate prefix sum */
-    Value sum=identity;
-    for (size_t i=0; i<taskCount; i++) 
-    {
-      const Value c = state.counts[i];
-      state.sums[i] = sum;
-      sum=reduction(sum,c);
-    }
-
-    return sum;
-  }
-
-  /*! parallel calculation of prefix sums */
-  template<typename SrcArray, typename DstArray, typename Value, typename Add>
-    __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) 
-  {
-    /* perform single threaded prefix operation for small N */
-    if (N < SINGLE_THREAD_THRESHOLD) 
-    {
-      Value sum=identity;
-      for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
-      return sum;
-    }
-    
-    /* perform parallel prefix operation for large N */
-    else 
-    {
-      ParallelPrefixSumState<Value> state;
-      
-      /* initial run just sets up start values for subtasks */
-      parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
-          
-          Value s = identity;
-          for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
-          return s;
-          
-        }, add);
-      
-      /* final run calculates prefix sum */
-      return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
-          
-          Value s = identity;
-          for (size_t i=r.begin(); i<r.end(); i++) {
-            dst[i] = add(sum,s);
-            s = add(s,src[i]);
-          }
-          return s;
-          
-        }, add);
-    }
-  }
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp
deleted file mode 100644
index 331fe4288e..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_reduce.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
-  struct parallel_reduce_regression_test : public RegressionTest
-  {
-    parallel_reduce_regression_test(const char* name) : RegressionTest(name) {
-      registerRegressionTest(this);
-    }
-    
-    bool run ()
-    {
-      bool passed = true;
-
-      const size_t M = 10;
-      for (size_t N=10; N<10000000; N=size_t(2.1*N))
-      {
-        /* sequentially calculate sum of squares */
-        size_t sum0 = 0;
-        for (size_t i=0; i<N; i++) {
-          sum0 += i*i;
-        }
-
-        /* parallel calculation of sum of squares */
-        for (size_t m=0; m<M; m++)
-        {
-          size_t sum1 = parallel_reduce( size_t(0), size_t(N), size_t(1024), size_t(0), [&](const range<size_t>& r) -> size_t
-          {
-            size_t s = 0;
-            for (size_t i=r.begin(); i<r.end(); i++) 
-              s += i*i;
-            return s;
-          }, 
-          [](const size_t v0, const size_t v1) {
-            return v0+v1;
-          });
-          passed = sum0 == sum1;
-        }
-      }
-      return passed;
-    }
-  };
-
-  parallel_reduce_regression_test parallel_reduce_regression("parallel_reduce_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
deleted file mode 100644
index 0daf94e50e..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_for.h"
-
-namespace embree
-{
-  template<typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) 
-  {
-    return func(range<Index>(first,last));
-  }
-
-  template<typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
-  {
-    return func(range<Index>(first,last));
-  }
-
-  template<typename Index, typename Value, typename Func, typename Reduction>
-    __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
-  {
-    const Index maxTasks = 512;
-    const Index threadCount = (Index) TaskScheduler::threadCount();
-    taskCount = min(taskCount,threadCount,maxTasks);
-
-    /* parallel invokation of all tasks */
-    dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
-    parallel_for(taskCount, [&](const Index taskIndex) {
-        const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
-        const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
-        values[taskIndex] = func(range<Index>(k0,k1));
-      });
-
-    /* perform reduction over all tasks */
-    Value v = identity;
-    for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
-    return v;
-  }
-
-  template<typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
-  {
-#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS))
-
-    /* fast path for small number of iterations */
-    Index taskCount = (last-first+minStepSize-1)/minStepSize;
-    if (likely(taskCount == 1)) {
-      return func(range<Index>(first,last));
-    }
-    return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);
-
-#elif defined(TASKING_TBB)
-  #if TBB_INTERFACE_VERSION >= 12002
-    tbb::task_group_context context;
-    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
-      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
-      reduction,context);
-    // -- GODOT start --
-    // if (context.is_group_execution_cancelled())
-    //   throw std::runtime_error("task cancelled");
-    // -- GODOT end --
-    return v;
-  #else
-    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
-      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
-      reduction);
-    // -- GODOT start --
-    // if (tbb::task::self().is_cancelled())
-    //   throw std::runtime_error("task cancelled");
-    // -- GODOT end --
-    return v;
-  #endif
-#else // TASKING_PPL
-    struct AlignedValue
-    {
-      char storage[__alignof(Value)+sizeof(Value)];
-      static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); };
-      Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
-      const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
-      AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
-      AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
-      AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
-      AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
-      AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
-      operator Value() const { return *getValuePtr(); }
-    };
-    
-    struct Iterator_Index
-    {
-      Index v;
-      typedef std::forward_iterator_tag iterator_category;
-      typedef AlignedValue value_type;
-      typedef Index difference_type;
-      typedef Index distance_type;
-      typedef AlignedValue* pointer;
-      typedef AlignedValue& reference;
-      __forceinline Iterator_Index() {}
-      __forceinline Iterator_Index(Index v) : v(v) {}
-      __forceinline bool operator== (Iterator_Index other) { return v == other.v; }
-      __forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
-      __forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
-      __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
-    };
-    
-    auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
-      assert(begin.v < end.v);
-      return reduction(start, func(range<Index>(begin.v, end.v)));
-    };
-    const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
-    return v;
-#endif
-  }
-
-  template<typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
-  {
-    if (likely(last-first < parallel_threshold)) {
-      return func(range<Index>(first,last)); 
-    } else {
-      return parallel_reduce(first,last,minStepSize,identity,func,reduction);
-    }
-  }
-
-  template<typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) 
-  {
-    return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
-  }
-
-  template<typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
-  {
-    auto funcr = [&] ( const range<Index> r ) {
-      Value v = identity;
-      for (Index i=r.begin(); i<r.end(); i++)
-        v = reduction(v,func(i));
-      return v;
-    };
-    return parallel_reduce(first,last,Index(1),identity,funcr,reduction);
-  }
-
-  template<typename Index, typename Value, typename Func, typename Reduction>
-    __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
-  {
-    return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
-  }
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp
deleted file mode 100644
index 20b639c1c9..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_set.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
-  struct parallel_set_regression_test : public RegressionTest
-  {
-    parallel_set_regression_test(const char* name) : RegressionTest(name) {
-      registerRegressionTest(this);
-    }
-    
-    bool run ()
-    {
-      bool passed = true;
-
-      /* create vector with random numbers */
-      const size_t N = 10000;
-      std::vector<uint32_t> unsorted(N);
-      for (size_t i=0; i<N; i++) unsorted[i] = 2*rand();
-      
-      /* created set from numbers */
-      parallel_set<uint32_t> sorted;
-      sorted.init(unsorted);
-
-      /* check that all elements are in the set */
-      for (size_t i=0; i<N; i++) {
-	passed &= sorted.lookup(unsorted[i]);
-      }
-
-      /* check that these elements are not in the set */
-      for (size_t i=0; i<N; i++) {
-	passed &= !sorted.lookup(unsorted[i]+1);
-      }
-
-      return passed;
-    }
-  };
-
-  parallel_set_regression_test parallel_set_regression("parallel_set_regression_test");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.h b/thirdparty/embree-aarch64/common/algorithms/parallel_set.h
deleted file mode 100644
index 640beba7ec..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_set.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "parallel_sort.h"
-
-namespace embree
-{
-  /* implementation of a set of values with parallel construction */
-  template<typename T>
-  class parallel_set
-  {
-  public:
-
-    /*! default constructor for the parallel set */
-    parallel_set () {}
-
-    /*! construction from vector */
-    template<typename Vector>
-      parallel_set (const Vector& in) { init(in); }
-
-    /*! initialized the parallel set from a vector */
-    template<typename Vector>
-      void init(const Vector& in) 
-    {
-      /* copy data to internal vector */
-      vec.resize(in.size());
-      parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) {
-	for (size_t i=r.begin(); i<r.end(); i++) 
-	  vec[i] = in[i];
-      });
-
-      /* sort the data */
-      std::vector<T> temp(in.size());
-      radix_sort<T>(vec.data(),temp.data(),vec.size());
-    }
-
-    /*! tests if some element is in the set */
-    __forceinline bool lookup(const T& elt) const {
-      return std::binary_search(vec.begin(), vec.end(), elt);
-    }
-
-    /*! clears all state */
-    void clear() {
-      vec.clear();
-    }
-
-  private:
-    std::vector<T> vec;   //!< vector containing sorted elements
-  };
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp
deleted file mode 100644
index 5e7ec79ac1..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "parallel_sort.h"
-#include "../sys/regression.h"
-
-namespace embree
-{
-  template<typename Key>
-  struct RadixSortRegressionTest : public RegressionTest
-  {
-    RadixSortRegressionTest(const char* name) : RegressionTest(name) {
-      registerRegressionTest(this);
-    }
-    
-    bool run ()
-    {
-      bool passed = true;
-      const size_t M = 10;
-
-      for (size_t N=10; N<1000000; N=size_t(2.1*N))
-      {
-	std::vector<Key> src(N); memset(src.data(),0,N*sizeof(Key));
-	std::vector<Key> tmp(N); memset(tmp.data(),0,N*sizeof(Key));
-	for (size_t i=0; i<N; i++) src[i] = uint64_t(rand())*uint64_t(rand());
-	
-	/* calculate checksum */
-	Key sum0 = 0; for (size_t i=0; i<N; i++) sum0 += src[i];
-        
-	/* sort numbers */
-	for (size_t i=0; i<M; i++) {
-          radix_sort<Key>(src.data(),tmp.data(),N);
-        }
-	
-	/* calculate checksum */
-	Key sum1 = 0; for (size_t i=0; i<N; i++) sum1 += src[i];
-	if (sum0 != sum1) passed = false;
-        
-	/* check if numbers are sorted */
-	for (size_t i=1; i<N; i++)
-	  passed &= src[i-1] <= src[i];
-      }
-      
-      return passed;
-    }
-  };
-
-  RadixSortRegressionTest<uint32_t> test_u32("RadixSortRegressionTestU32");
-  RadixSortRegressionTest<uint64_t> test_u64("RadixSortRegressionTestU64");
-}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h
deleted file mode 100644
index a758227c1b..0000000000
--- a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h
+++ /dev/null
@@ -1,457 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../simd/simd.h"
-#include "parallel_for.h"
-#if defined(TASKING_GCD) && defined(BUILD_IOS)
-#include "../sys/alloc.h"
-#endif
-#include <algorithm>
-
-namespace embree
-{
-  template<class T>
-    __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
-  {
-    for(size_t i = 1;i<length;++i)
-    {
-      T v = array[i];
-      size_t j = i;
-      while(j > 0 && v < array[j-1])
-      {
-        array[j] = array[j-1];
-        --j;
-      }
-      array[j] = v;
-    }
-  }
-  
-  template<class T>
-    __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
-  {
-    for(size_t i = 1;i<length;++i)
-    {
-      T v = array[i];
-      size_t j = i;
-      while(j > 0 && v > array[j-1])
-      {
-        array[j] = array[j-1];
-        --j;
-      }
-      array[j] = v;
-    }
-  }
-  
-  template<class T> 
-    void quicksort_ascending(T *__restrict__ t, 
-			     const ssize_t begin, 
-			     const ssize_t end)
-  {
-    if (likely(begin < end)) 
-    {      
-      const T pivotvalue = t[begin];
-      ssize_t left  = begin - 1;
-      ssize_t right = end   + 1;
-      
-      while(1) 
-      {
-        while (t[--right] > pivotvalue);
-        while (t[++left] < pivotvalue);
-        
-        if (left >= right) break;
-        
-        const T temp = t[right];
-        t[right] = t[left];
-        t[left] = temp;
-      }
-      
-      const int pivot = right;
-      quicksort_ascending(t, begin, pivot);
-      quicksort_ascending(t, pivot + 1, end);
-    }
-  }
-  
-  template<class T> 
-    void quicksort_decending(T *__restrict__ t, 
-			     const ssize_t begin, 
-			     const ssize_t end)
-  {
-    if (likely(begin < end)) 
-    {
-      const T pivotvalue = t[begin];
-      ssize_t left  = begin - 1;
-      ssize_t right = end   + 1;
-      
-      while(1) 
-      {
-        while (t[--right] < pivotvalue);
-        while (t[++left] > pivotvalue);
-        
-        if (left >= right) break;
-        
-        const T temp = t[right];
-        t[right] = t[left];
-        t[left] = temp;
-      }
-      
-      const int pivot = right;
-      quicksort_decending(t, begin, pivot);
-      quicksort_decending(t, pivot + 1, end);
-    }
-  }
-  
-  
-  template<class T, ssize_t THRESHOLD> 
-    void quicksort_insertionsort_ascending(T *__restrict__ t, 
-					   const ssize_t begin, 
-					   const ssize_t end)
-  {
-    if (likely(begin < end)) 
-    {      
-      const ssize_t size = end-begin+1;
-      if (likely(size <= THRESHOLD))
-      {
-        insertionsort_ascending<T>(&t[begin],size);
-      }
-      else
-      {
-        const T pivotvalue = t[begin];
-        ssize_t left  = begin - 1;
-        ssize_t right = end   + 1;
-        
-        while(1) 
-        {
-          while (t[--right] > pivotvalue);
-          while (t[++left] < pivotvalue);
-          
-          if (left >= right) break;
-          
-          const T temp = t[right];
-          t[right] = t[left];
-          t[left] = temp;
-        }
-        
-        const ssize_t pivot = right;
-        quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
-        quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
-      }
-    }
-  }
-  
-  
-  template<class T, ssize_t THRESHOLD> 
-    void quicksort_insertionsort_decending(T *__restrict__ t, 
-					   const ssize_t begin, 
-					   const ssize_t end)
-  {
-    if (likely(begin < end)) 
-    {
-      const ssize_t size = end-begin+1;
-      if (likely(size <= THRESHOLD))
-      {
-        insertionsort_decending<T>(&t[begin],size);
-      }
-      else
-      {
-        
-        const T pivotvalue = t[begin];
-        ssize_t left  = begin - 1;
-        ssize_t right = end   + 1;
-        
-        while(1) 
-        {
-          while (t[--right] < pivotvalue);
-          while (t[++left] > pivotvalue);
-          
-          if (left >= right) break;
-          
-          const T temp = t[right];
-          t[right] = t[left];
-          t[left] = temp;
-        }
-        
-        const ssize_t pivot = right;
-        quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
-        quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
-      }
-    }
-  }
-  
-  template<typename T>
-    static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
-  {
-    static const unsigned int BITS = 8;
-    static const unsigned int BUCKETS = (1 << BITS);
-    static const unsigned int CMP_SORT_THRESHOLD = 16;
-    
-    __aligned(64) unsigned int count[BUCKETS];
-    
-    /* clear buckets */
-    for (size_t i=0;i<BUCKETS;i++) count[i] = 0;
-    
-    /* count buckets */
-#if defined(__INTEL_COMPILER)
-#pragma nounroll
-#endif
-    for (size_t i=0;i<num;i++)
-      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;
-    
-    /* prefix sums */
-    __aligned(64) unsigned int head[BUCKETS];
-    __aligned(64) unsigned int tail[BUCKETS];
-    
-    head[0] = 0;
-    for (size_t i=1; i<BUCKETS; i++)    
-      head[i] = head[i-1] + count[i-1];
-    
-    for (size_t i=0; i<BUCKETS-1; i++)    
-      tail[i] = head[i+1];
-    
-    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];
-    
-    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);      
-    assert(tail[BUCKETS-1] == num);      
-    
-    /* in-place swap */      
-    for (size_t i=0;i<BUCKETS;i++)
-    {
-      /* process bucket */
-      while(head[i] < tail[i])
-      {
-        T v = morton[head[i]];
-        while(1)
-        {
-          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
-          if (b == i) break;
-          std::swap(v,morton[head[b]++]);
-        }
-        assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
-        morton[head[i]++] = v;
-      }
-    }
-    if (shift == 0) return;
-    
-    size_t offset = 0;
-    for (size_t i=0;i<BUCKETS;i++)
-      if (count[i])
-      {
-        
-        for (size_t j=offset;j<offset+count[i]-1;j++)
-          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);
-        
-        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
-          insertionsort_ascending(morton + offset, count[i]);
-        else
-          radixsort32(morton + offset, count[i], shift-BITS);
-        
-        for (size_t j=offset;j<offset+count[i]-1;j++)
-          assert(morton[j] <= morton[j+1]);
-        
-        offset += count[i];
-      }      
-  }    
-
-  template<typename Ty, typename Key>
-    class ParallelRadixSort
-  {
-    static const size_t MAX_TASKS = 64;
-    static const size_t BITS = 8;
-    static const size_t BUCKETS = (1 << BITS);
-    typedef unsigned int TyRadixCount[BUCKETS];
-    
-    template<typename T>
-      static bool compare(const T& v0, const T& v1) {
-      return (Key)v0 < (Key)v1;
-    }
-
-  private:
-    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
-    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement
-
-    
-  public:
-    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
-      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}
-
-    void sort(const size_t blockSize)
-    {
-      assert(blockSize > 0);
-      
-      /* perform single threaded sort for small N */
-      if (N<=blockSize) // handles also special case of 0!
-      {	  
-        /* do inplace sort inside destination array */
-        std::sort(src,src+N,compare<Ty>);
-      }
-      
-      /* perform parallel sort for large N */
-      else 
-      {
-        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
-        tbbRadixSort(numThreads);
-      }
-    }
-
-    ~ParallelRadixSort()
-    {
-      alignedFree(radixCount); 
-      radixCount = nullptr;
-    }
-    
-  private:
-    
-    void tbbRadixIteration0(const Key shift, 
-                            const Ty* __restrict const src, 
-                            Ty* __restrict const dst, 
-                            const size_t threadIndex, const size_t threadCount)
-    {
-      const size_t startID = (threadIndex+0)*N/threadCount;
-      const size_t endID   = (threadIndex+1)*N/threadCount;
-      
-      /* mask to extract some number of bits */
-      const Key mask = BUCKETS-1;
-      
-      /* count how many items go into the buckets */
-      for (size_t i=0; i<BUCKETS; i++)
-        radixCount[threadIndex][i] = 0;
-
-      /* iterate over src array and count buckets */
-      unsigned int * __restrict const count = radixCount[threadIndex];
-#if defined(__INTEL_COMPILER)
-#pragma nounroll      
-#endif
-      for (size_t i=startID; i<endID; i++) {
-#if defined(__X86_64__) || defined(__aarch64__)
-        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
-#else
-        const Key index = ((Key)src[i] >> shift) & mask;
-#endif
-        count[index]++;
-      }
-    }
-    
-    void tbbRadixIteration1(const Key shift, 
-                            const Ty* __restrict const src, 
-                            Ty* __restrict const dst, 
-                            const size_t threadIndex, const size_t threadCount)
-    {
-      const size_t startID = (threadIndex+0)*N/threadCount;
-      const size_t endID   = (threadIndex+1)*N/threadCount;
-      
-      /* mask to extract some number of bits */
-      const Key mask = BUCKETS-1;
-      
-      /* calculate total number of items for each bucket */
-      __aligned(64) unsigned int total[BUCKETS];
-      /*
-      for (size_t i=0; i<BUCKETS; i++)
-        total[i] = 0;
-      */
-      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
-        vintx::store(&total[i], zero);
-      
-      for (size_t i=0; i<threadCount; i++)
-      {
-        /*
-        for (size_t j=0; j<BUCKETS; j++)
-          total[j] += radixCount[i][j];
-        */
-        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
-          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
-      }
-      
-      /* calculate start offset of each bucket */
-      __aligned(64) unsigned int offset[BUCKETS];
-      offset[0] = 0;
-      for (size_t i=1; i<BUCKETS; i++)    
-        offset[i] = offset[i-1] + total[i-1];
-      
-      /* calculate start offset of each bucket for this thread */
-      for (size_t i=0; i<threadIndex; i++)
-      {
-        /*
-        for (size_t j=0; j<BUCKETS; j++)
-          offset[j] += radixCount[i][j];
-        */
-        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
-          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
-      }
-      
-      /* copy items into their buckets */
-#if defined(__INTEL_COMPILER)
-#pragma nounroll
-#endif
-      for (size_t i=startID; i<endID; i++) {
-        const Ty elt = src[i];
-#if defined(__X86_64__) || defined(__aarch64__)
-        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
-#else
-        const size_t index = ((Key)src[i] >> shift) & mask;
-#endif
-        dst[offset[index]++] = elt;
-      }
-    }
-    
-    void tbbRadixIteration(const Key shift, const bool last,
-                           const Ty* __restrict src, Ty* __restrict dst,
-                           const size_t numTasks)
-    {
-      affinity_partitioner ap;
-      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
-      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
-    }
-    
-    void tbbRadixSort(const size_t numTasks)
-    {
-      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);
-      
-      if (sizeof(Key) == sizeof(uint32_t)) {
-        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
-        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
-        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
-        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
-      }
-      else if (sizeof(Key) == sizeof(uint64_t))
-      {
-        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
-        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
-        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
-        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
-        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
-        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
-        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
-        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
-      }
-    }
-    
-  private:
-    TyRadixCount* radixCount;
-    Ty* const src;
-    Ty* const tmp;
-    const size_t N;
-  };
-
-  template<typename Ty>
-    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
-  {
-    ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
-  }
-  
-  template<typename Ty, typename Key>
-    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
-  {
-    ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
-  }
-  
-  template<typename Ty>
-    void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
-    radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
-  }
-  
-  template<typename Ty>
-    void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
-    radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
-  }
-}