1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "parallel_for.h" |
7 | |
8 | namespace embree |
9 | { |
10 | template<typename ArrayArray, typename Func> |
11 | __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) |
12 | { |
13 | size_t k=0; |
14 | for (size_t i=0; i!=array2.size(); ++i) { |
15 | const size_t N = array2[i]->size(); |
16 | if (N) func(array2[i],range<size_t>(0,N),k); |
17 | k+=N; |
18 | } |
19 | } |
20 | |
21 | class ParallelForForState |
22 | { |
23 | public: |
24 | |
25 | enum { MAX_TASKS = 64 }; |
26 | |
27 | __forceinline ParallelForForState () |
28 | : taskCount(0) {} |
29 | |
30 | template<typename ArrayArray> |
31 | __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { |
32 | init(array2,minStepSize); |
33 | } |
34 | |
35 | template<typename ArrayArray> |
36 | __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) |
37 | { |
38 | /* first calculate total number of elements */ |
39 | size_t N = 0; |
40 | for (size_t i=0; i<array2.size(); i++) { |
41 | N += array2[i] ? array2[i]->size() : 0; |
42 | } |
43 | this->N = N; |
44 | |
45 | /* calculate number of tasks to use */ |
46 | const size_t numThreads = TaskScheduler::threadCount(); |
47 | const size_t numBlocks = (N+minStepSize-1)/minStepSize; |
48 | taskCount = max(a: size_t(1),b: min(a: numThreads,b: numBlocks,c: size_t(ParallelForForState::MAX_TASKS))); |
49 | |
50 | /* calculate start (i,j) for each task */ |
51 | size_t taskIndex = 0; |
52 | i0[taskIndex] = 0; |
53 | j0[taskIndex] = 0; |
54 | size_t k0 = (++taskIndex)*N/taskCount; |
55 | for (size_t i=0, k=0; taskIndex < taskCount; i++) |
56 | { |
57 | assert(i<array2.size()); |
58 | size_t j=0, M = array2[i] ? array2[i]->size() : 0; |
59 | while (j<M && k+M-j >= k0 && taskIndex < taskCount) { |
60 | assert(taskIndex<taskCount); |
61 | i0[taskIndex] = i; |
62 | j0[taskIndex] = j += k0-k; |
63 | k=k0; |
64 | k0 = (++taskIndex)*N/taskCount; |
65 | } |
66 | k+=M-j; |
67 | } |
68 | } |
69 | |
70 | __forceinline size_t size() const { |
71 | return N; |
72 | } |
73 | |
74 | public: |
75 | size_t i0[MAX_TASKS]; |
76 | size_t j0[MAX_TASKS]; |
77 | size_t taskCount; |
78 | size_t N; |
79 | }; |
80 | |
81 | template<typename ArrayArray, typename Func> |
82 | __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) |
83 | { |
84 | ParallelForForState state(array2,minStepSize); |
85 | |
86 | parallel_for(state.taskCount, [&](const size_t taskIndex) |
87 | { |
88 | /* calculate range */ |
89 | const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; |
90 | const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; |
91 | size_t i0 = state.i0[taskIndex]; |
92 | size_t j0 = state.j0[taskIndex]; |
93 | |
94 | /* iterate over arrays */ |
95 | size_t k=k0; |
96 | for (size_t i=i0; k<k1; i++) { |
97 | const size_t N = array2[i] ? array2[i]->size() : 0; |
98 | const size_t r0 = j0, r1 = min(a: N,b: r0+k1-k); |
99 | if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k); |
100 | k+=r1-r0; j0 = 0; |
101 | } |
102 | }); |
103 | } |
104 | |
105 | template<typename ArrayArray, typename Func> |
106 | __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) |
107 | { |
108 | parallel_for_for(array2,1,func); |
109 | } |
110 | |
111 | template<typename ArrayArray, typename Value, typename Func, typename Reduction> |
112 | __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) |
113 | { |
114 | ParallelForForState state(array2,minStepSize); |
115 | Value temp[ParallelForForState::MAX_TASKS]; |
116 | |
117 | for (size_t i=0; i<state.taskCount; i++) |
118 | temp[i] = identity; |
119 | |
120 | parallel_for(state.taskCount, [&](const size_t taskIndex) |
121 | { |
122 | /* calculate range */ |
123 | const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; |
124 | const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; |
125 | size_t i0 = state.i0[taskIndex]; |
126 | size_t j0 = state.j0[taskIndex]; |
127 | |
128 | /* iterate over arrays */ |
129 | size_t k=k0; |
130 | for (size_t i=i0; k<k1; i++) { |
131 | const size_t N = array2[i] ? array2[i]->size() : 0; |
132 | const size_t r0 = j0, r1 = min(a: N,b: r0+k1-k); |
133 | if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k)); |
134 | k+=r1-r0; j0 = 0; |
135 | } |
136 | }); |
137 | |
138 | Value ret = identity; |
139 | for (size_t i=0; i<state.taskCount; i++) |
140 | ret = reduction(ret,temp[i]); |
141 | return ret; |
142 | } |
143 | |
144 | template<typename ArrayArray, typename Value, typename Func, typename Reduction> |
145 | __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) |
146 | { |
147 | return parallel_for_for_reduce(array2,1,identity,func,reduction); |
148 | } |
149 | } |
150 | |