| 1 | // Copyright 2009-2021 Intel Corporation | 
| 2 | // SPDX-License-Identifier: Apache-2.0 | 
| 3 |  | 
| 4 | #pragma once | 
| 5 |  | 
| 6 | #include "parallel_for.h" | 
| 7 |  | 
| 8 | namespace embree | 
| 9 | { | 
| 10 |   template<typename ArrayArray, typename Func> | 
| 11 |     __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )  | 
| 12 |   { | 
| 13 |     size_t k=0; | 
| 14 |     for (size_t i=0; i!=array2.size(); ++i) { | 
| 15 |       const size_t N = array2[i]->size(); | 
| 16 |       if (N) func(array2[i],range<size_t>(0,N),k); | 
| 17 |       k+=N; | 
| 18 |     } | 
| 19 |   } | 
| 20 |  | 
| 21 |   class ParallelForForState | 
| 22 |   { | 
| 23 |   public: | 
| 24 |  | 
| 25 |     enum { MAX_TASKS = 64 }; | 
| 26 |  | 
| 27 |     __forceinline ParallelForForState ()  | 
| 28 |       : taskCount(0) {} | 
| 29 |  | 
| 30 |     template<typename ArrayArray> | 
| 31 |       __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { | 
| 32 |       init(array2,minStepSize); | 
| 33 |     }  | 
| 34 |  | 
| 35 |     template<typename ArrayArray> | 
| 36 |       __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) | 
| 37 |     { | 
| 38 |       /* first calculate total number of elements */ | 
| 39 |       size_t N = 0; | 
| 40 |       for (size_t i=0; i<array2.size(); i++) { | 
| 41 | 	N += array2[i] ? array2[i]->size() : 0; | 
| 42 |       } | 
| 43 |       this->N = N; | 
| 44 |  | 
| 45 |       /* calculate number of tasks to use */ | 
| 46 |       const size_t numThreads = TaskScheduler::threadCount(); | 
| 47 |       const size_t numBlocks  = (N+minStepSize-1)/minStepSize; | 
| 48 |       taskCount = max(a: size_t(1),b: min(a: numThreads,b: numBlocks,c: size_t(ParallelForForState::MAX_TASKS))); | 
| 49 |        | 
| 50 |       /* calculate start (i,j) for each task */ | 
| 51 |       size_t taskIndex = 0; | 
| 52 |       i0[taskIndex] = 0; | 
| 53 |       j0[taskIndex] = 0; | 
| 54 |       size_t k0 = (++taskIndex)*N/taskCount; | 
| 55 |       for (size_t i=0, k=0; taskIndex < taskCount; i++)  | 
| 56 |       { | 
| 57 | 	assert(i<array2.size()); | 
| 58 | 	size_t j=0, M = array2[i] ? array2[i]->size() : 0; | 
| 59 | 	while (j<M && k+M-j >= k0 && taskIndex < taskCount) { | 
| 60 | 	  assert(taskIndex<taskCount); | 
| 61 | 	  i0[taskIndex] = i; | 
| 62 | 	  j0[taskIndex] = j += k0-k; | 
| 63 | 	  k=k0; | 
| 64 | 	  k0 = (++taskIndex)*N/taskCount; | 
| 65 | 	} | 
| 66 | 	k+=M-j; | 
| 67 |       } | 
| 68 |     } | 
| 69 |  | 
| 70 |     __forceinline size_t size() const { | 
| 71 |       return N; | 
| 72 |     } | 
| 73 |      | 
| 74 |   public: | 
| 75 |     size_t i0[MAX_TASKS]; | 
| 76 |     size_t j0[MAX_TASKS]; | 
| 77 |     size_t taskCount; | 
| 78 |     size_t N; | 
| 79 |   }; | 
| 80 |  | 
| 81 |   template<typename ArrayArray, typename Func> | 
| 82 |     __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) | 
| 83 |   { | 
| 84 |     ParallelForForState state(array2,minStepSize); | 
| 85 |      | 
| 86 |     parallel_for(state.taskCount, [&](const size_t taskIndex)  | 
| 87 |     { | 
| 88 |       /* calculate range */ | 
| 89 |       const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; | 
| 90 |       const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; | 
| 91 |       size_t i0 = state.i0[taskIndex]; | 
| 92 |       size_t j0 = state.j0[taskIndex]; | 
| 93 |  | 
| 94 |       /* iterate over arrays */ | 
| 95 |       size_t k=k0; | 
| 96 |       for (size_t i=i0; k<k1; i++) { | 
| 97 |         const size_t N =  array2[i] ? array2[i]->size() : 0; | 
| 98 |         const size_t r0 = j0, r1 = min(a: N,b: r0+k1-k); | 
| 99 |         if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k); | 
| 100 |         k+=r1-r0; j0 = 0; | 
| 101 |       } | 
| 102 |     }); | 
| 103 |   } | 
| 104 |  | 
| 105 |   template<typename ArrayArray, typename Func> | 
| 106 |     __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) | 
| 107 |   { | 
| 108 |     parallel_for_for(array2,1,func); | 
| 109 |   } | 
| 110 |  | 
| 111 |   template<typename ArrayArray, typename Value, typename Func, typename Reduction> | 
| 112 |     __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) | 
| 113 |   { | 
| 114 |     ParallelForForState state(array2,minStepSize); | 
| 115 |     Value temp[ParallelForForState::MAX_TASKS]; | 
| 116 |  | 
| 117 |     for (size_t i=0; i<state.taskCount; i++) | 
| 118 |       temp[i] = identity; | 
| 119 |      | 
| 120 |     parallel_for(state.taskCount, [&](const size_t taskIndex)  | 
| 121 |     { | 
| 122 |       /* calculate range */ | 
| 123 |       const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; | 
| 124 |       const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; | 
| 125 |       size_t i0 = state.i0[taskIndex]; | 
| 126 |       size_t j0 = state.j0[taskIndex]; | 
| 127 |  | 
| 128 |       /* iterate over arrays */ | 
| 129 |       size_t k=k0; | 
| 130 |       for (size_t i=i0; k<k1; i++) { | 
| 131 |         const size_t N =  array2[i] ? array2[i]->size() : 0; | 
| 132 |         const size_t r0 = j0, r1 = min(a: N,b: r0+k1-k); | 
| 133 |         if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k)); | 
| 134 |         k+=r1-r0; j0 = 0; | 
| 135 |       } | 
| 136 |     }); | 
| 137 |  | 
| 138 |     Value ret = identity; | 
| 139 |     for (size_t i=0; i<state.taskCount; i++) | 
| 140 |       ret = reduction(ret,temp[i]); | 
| 141 |     return ret; | 
| 142 |   } | 
| 143 |  | 
| 144 |   template<typename ArrayArray, typename Value, typename Func, typename Reduction> | 
| 145 |     __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) | 
| 146 |   { | 
| 147 |     return parallel_for_for_reduce(array2,1,identity,func,reduction); | 
| 148 |   } | 
| 149 | } | 
| 150 |  |