1 | //===-- runtime/reduction-templates.h ---------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | // Generic function templates used by various reduction transformation |
10 | // intrinsic functions (SUM, PRODUCT, &c.) |
11 | // |
12 | // * Partial reductions (i.e., those with DIM= arguments that are not |
13 | // required to be 1 by the rank of the argument) return arrays that |
14 | // are dynamically allocated in a caller-supplied descriptor. |
15 | // * Total reductions (i.e., no DIM= argument) with FINDLOC, MAXLOC, & MINLOC |
16 | // return integer vectors of some kind, not scalars; a caller-supplied |
17 | // descriptor is used |
18 | // * Character-valued reductions (MAXVAL & MINVAL) return arbitrary |
19 | // length results, dynamically allocated in a caller-supplied descriptor |
20 | |
21 | #ifndef FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_ |
22 | #define FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_ |
23 | |
24 | #include "numeric-templates.h" |
25 | #include "terminator.h" |
26 | #include "tools.h" |
27 | #include "flang/Runtime/cpp-type.h" |
28 | #include "flang/Runtime/descriptor.h" |
29 | #include <algorithm> |
30 | |
31 | namespace Fortran::runtime { |
32 | |
33 | // Reductions are implemented with *accumulators*, which are instances of |
34 | // classes that incrementally build up the result (or an element thereof) during |
35 | // a traversal of the unmasked elements of an array. Each accumulator class |
36 | // supports a constructor (which captures a reference to the array), an |
37 | // AccumulateAt() member function that applies supplied subscripts to the |
38 | // array and does something with a scalar element, and a GetResult() |
39 | // member function that copies a final result into its destination. |
40 | |
41 | // Total reduction of the array argument to a scalar (or to a vector in the |
42 | // cases of FINDLOC, MAXLOC, & MINLOC). These are the cases without DIM= or |
43 | // cases where the argument has rank 1 and DIM=, if present, must be 1. |
44 | template <typename TYPE, typename ACCUMULATOR> |
45 | inline RT_API_ATTRS void DoTotalReduction(const Descriptor &x, int dim, |
46 | const Descriptor *mask, ACCUMULATOR &accumulator, const char *intrinsic, |
47 | Terminator &terminator) { |
48 | if (dim < 0 || dim > 1) { |
49 | terminator.Crash("%s: bad DIM=%d for ARRAY argument with rank %d" , |
50 | intrinsic, dim, x.rank()); |
51 | } |
52 | SubscriptValue xAt[maxRank]; |
53 | x.GetLowerBounds(xAt); |
54 | if (mask) { |
55 | CheckConformability(to: x, x: *mask, terminator, funcName: intrinsic, toName: "ARRAY" , fromName: "MASK" ); |
56 | if (mask->rank() > 0) { |
57 | SubscriptValue maskAt[maxRank]; |
58 | mask->GetLowerBounds(maskAt); |
59 | for (auto elements{x.Elements()}; elements--; |
60 | x.IncrementSubscripts(xAt), mask->IncrementSubscripts(maskAt)) { |
61 | if (IsLogicalElementTrue(*mask, maskAt)) { |
62 | if (!accumulator.template AccumulateAt<TYPE>(xAt)) { |
63 | break; |
64 | } |
65 | } |
66 | } |
67 | return; |
68 | } else if (!IsLogicalScalarTrue(*mask)) { |
69 | // scalar MASK=.FALSE.: return identity value |
70 | return; |
71 | } |
72 | } |
73 | // No MASK=, or scalar MASK=.TRUE. |
74 | for (auto elements{x.Elements()}; elements--; x.IncrementSubscripts(xAt)) { |
75 | if (!accumulator.template AccumulateAt<TYPE>(xAt)) { |
76 | break; // cut short, result is known |
77 | } |
78 | } |
79 | } |
80 | |
81 | template <TypeCategory CAT, int KIND, typename ACCUMULATOR> |
82 | inline RT_API_ATTRS CppTypeFor<CAT, KIND> GetTotalReduction(const Descriptor &x, |
83 | const char *source, int line, int dim, const Descriptor *mask, |
84 | ACCUMULATOR &&accumulator, const char *intrinsic) { |
85 | Terminator terminator{source, line}; |
86 | RUNTIME_CHECK(terminator, TypeCode(CAT, KIND) == x.type()); |
87 | using CppType = CppTypeFor<CAT, KIND>; |
88 | DoTotalReduction<CppType>(x, dim, mask, accumulator, intrinsic, terminator); |
89 | if constexpr (std::is_void_v<CppType>) { |
90 | // Result is returned from accumulator, as in REDUCE() for derived type |
91 | #ifdef _MSC_VER // work around MSVC spurious error |
92 | accumulator.GetResult(); |
93 | #else |
94 | accumulator.template GetResult(); |
95 | #endif |
96 | } else { |
97 | CppType result; |
98 | #ifdef _MSC_VER // work around MSVC spurious error |
99 | accumulator.GetResult(&result); |
100 | #else |
101 | accumulator.template GetResult(&result); |
102 | #endif |
103 | return result; |
104 | } |
105 | } |
106 | |
107 | // For reductions on a dimension, e.g. SUM(array,DIM=2) where the shape |
108 | // of the array is [2,3,5], the shape of the result is [2,5] and |
109 | // result(j,k) = SUM(array(j,:,k)), possibly modified if the array has |
110 | // lower bounds other than one. This utility subroutine creates an |
111 | // array of subscripts [j,_,k] for result subscripts [j,k] so that the |
112 | // elements of array(j,:,k) can be reduced. |
113 | inline RT_API_ATTRS void GetExpandedSubscripts(SubscriptValue at[], |
114 | const Descriptor &descriptor, int zeroBasedDim, |
115 | const SubscriptValue from[]) { |
116 | descriptor.GetLowerBounds(at); |
117 | int rank{descriptor.rank()}; |
118 | int j{0}; |
119 | for (; j < zeroBasedDim; ++j) { |
120 | at[j] += from[j] - 1 /*lower bound*/; |
121 | } |
122 | for (++j; j < rank; ++j) { |
123 | at[j] += from[j - 1] - 1; |
124 | } |
125 | } |
126 | |
127 | template <typename TYPE, typename ACCUMULATOR> |
128 | inline RT_API_ATTRS void ReduceDimToScalar(const Descriptor &x, |
129 | int zeroBasedDim, SubscriptValue subscripts[], TYPE *result, |
130 | ACCUMULATOR &accumulator) { |
131 | SubscriptValue xAt[maxRank]; |
132 | GetExpandedSubscripts(xAt, x, zeroBasedDim, subscripts); |
133 | const auto &dim{x.GetDimension(zeroBasedDim)}; |
134 | SubscriptValue at{dim.LowerBound()}; |
135 | for (auto n{dim.Extent()}; n-- > 0; ++at) { |
136 | xAt[zeroBasedDim] = at; |
137 | if (!accumulator.template AccumulateAt<TYPE>(xAt)) { |
138 | break; |
139 | } |
140 | } |
141 | #ifdef _MSC_VER // work around MSVC spurious error |
142 | accumulator.GetResult(result, zeroBasedDim); |
143 | #else |
144 | accumulator.template GetResult(result, zeroBasedDim); |
145 | #endif |
146 | } |
147 | |
148 | template <typename TYPE, typename ACCUMULATOR> |
149 | inline RT_API_ATTRS void ReduceDimMaskToScalar(const Descriptor &x, |
150 | int zeroBasedDim, SubscriptValue subscripts[], const Descriptor &mask, |
151 | TYPE *result, ACCUMULATOR &accumulator) { |
152 | SubscriptValue xAt[maxRank], maskAt[maxRank]; |
153 | GetExpandedSubscripts(xAt, x, zeroBasedDim, subscripts); |
154 | GetExpandedSubscripts(maskAt, mask, zeroBasedDim, subscripts); |
155 | const auto &xDim{x.GetDimension(zeroBasedDim)}; |
156 | SubscriptValue xPos{xDim.LowerBound()}; |
157 | const auto &maskDim{mask.GetDimension(zeroBasedDim)}; |
158 | SubscriptValue maskPos{maskDim.LowerBound()}; |
159 | for (auto n{x.GetDimension(zeroBasedDim).Extent()}; n-- > 0; |
160 | ++xPos, ++maskPos) { |
161 | maskAt[zeroBasedDim] = maskPos; |
162 | if (IsLogicalElementTrue(mask, maskAt)) { |
163 | xAt[zeroBasedDim] = xPos; |
164 | if (!accumulator.template AccumulateAt<TYPE>(xAt)) { |
165 | break; |
166 | } |
167 | } |
168 | } |
169 | #ifdef _MSC_VER // work around MSVC spurious error |
170 | accumulator.GetResult(result, zeroBasedDim); |
171 | #else |
172 | accumulator.template GetResult(result, zeroBasedDim); |
173 | #endif |
174 | } |
175 | |
176 | // Partial reductions with DIM= |
177 | |
178 | template <typename ACCUMULATOR, TypeCategory CAT, int KIND> |
179 | inline RT_API_ATTRS void PartialReduction(Descriptor &result, |
180 | const Descriptor &x, std::size_t resultElementSize, int dim, |
181 | const Descriptor *mask, Terminator &terminator, const char *intrinsic, |
182 | ACCUMULATOR &accumulator) { |
183 | CreatePartialReductionResult(result, x, resultElementSize, dim, terminator, |
184 | intrinsic, TypeCode{CAT, KIND}); |
185 | SubscriptValue at[maxRank]; |
186 | result.GetLowerBounds(at); |
187 | INTERNAL_CHECK(result.rank() == 0 || at[0] == 1); |
188 | using CppType = CppTypeFor<CAT, KIND>; |
189 | if (mask) { |
190 | CheckConformability(to: x, x: *mask, terminator, funcName: intrinsic, toName: "ARRAY" , fromName: "MASK" ); |
191 | if (mask->rank() > 0) { |
192 | for (auto n{result.Elements()}; n-- > 0; result.IncrementSubscripts(at)) { |
193 | accumulator.Reinitialize(); |
194 | ReduceDimMaskToScalar<CppType, ACCUMULATOR>( |
195 | x, dim - 1, at, *mask, result.Element<CppType>(at), accumulator); |
196 | } |
197 | return; |
198 | } else if (!IsLogicalScalarTrue(*mask)) { |
199 | // scalar MASK=.FALSE. |
200 | accumulator.Reinitialize(); |
201 | for (auto n{result.Elements()}; n-- > 0; result.IncrementSubscripts(at)) { |
202 | accumulator.GetResult(result.Element<CppType>(at)); |
203 | } |
204 | return; |
205 | } |
206 | } |
207 | // No MASK= or scalar MASK=.TRUE. |
208 | for (auto n{result.Elements()}; n-- > 0; result.IncrementSubscripts(at)) { |
209 | accumulator.Reinitialize(); |
210 | ReduceDimToScalar<CppType, ACCUMULATOR>( |
211 | x, dim - 1, at, result.Element<CppType>(at), accumulator); |
212 | } |
213 | } |
214 | |
215 | template <template <typename> class ACCUM> |
216 | struct PartialIntegerReductionHelper { |
217 | template <int KIND> struct Functor { |
218 | static constexpr int Intermediate{ |
219 | std::max(a: KIND, b: 4)}; // use at least "int" for intermediate results |
220 | RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, |
221 | int dim, const Descriptor *mask, Terminator &terminator, |
222 | const char *intrinsic) const { |
223 | using Accumulator = |
224 | ACCUM<CppTypeFor<TypeCategory::Integer, Intermediate>>; |
225 | Accumulator accumulator{x}; |
226 | // Element size of the destination descriptor is the same |
227 | // as the element size of the source. |
228 | PartialReduction<Accumulator, TypeCategory::Integer, KIND>(result, x, |
229 | x.ElementBytes(), dim, mask, terminator, intrinsic, accumulator); |
230 | } |
231 | }; |
232 | }; |
233 | |
234 | template <template <typename> class INTEGER_ACCUM> |
235 | inline RT_API_ATTRS void PartialIntegerReduction(Descriptor &result, |
236 | const Descriptor &x, int dim, int kind, const Descriptor *mask, |
237 | const char *intrinsic, Terminator &terminator) { |
238 | ApplyIntegerKind< |
239 | PartialIntegerReductionHelper<INTEGER_ACCUM>::template Functor, void>( |
240 | kind, terminator, result, x, dim, mask, terminator, intrinsic); |
241 | } |
242 | |
243 | template <TypeCategory CAT, template <typename> class ACCUM> |
244 | struct PartialFloatingReductionHelper { |
245 | template <int KIND> struct Functor { |
246 | static constexpr int Intermediate{ |
247 | std::max(a: KIND, b: 8)}; // use at least "double" for intermediate results |
248 | RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, |
249 | int dim, const Descriptor *mask, Terminator &terminator, |
250 | const char *intrinsic) const { |
251 | using Accumulator = ACCUM<CppTypeFor<TypeCategory::Real, Intermediate>>; |
252 | Accumulator accumulator{x}; |
253 | // Element size of the destination descriptor is the same |
254 | // as the element size of the source. |
255 | PartialReduction<Accumulator, CAT, KIND>(result, x, x.ElementBytes(), dim, |
256 | mask, terminator, intrinsic, accumulator); |
257 | } |
258 | }; |
259 | }; |
260 | |
261 | template <template <typename> class INTEGER_ACCUM, |
262 | template <typename> class REAL_ACCUM, |
263 | template <typename> class COMPLEX_ACCUM> |
264 | inline RT_API_ATTRS void TypedPartialNumericReduction(Descriptor &result, |
265 | const Descriptor &x, int dim, const char *source, int line, |
266 | const Descriptor *mask, const char *intrinsic) { |
267 | Terminator terminator{source, line}; |
268 | auto catKind{x.type().GetCategoryAndKind()}; |
269 | RUNTIME_CHECK(terminator, catKind.has_value()); |
270 | switch (catKind->first) { |
271 | case TypeCategory::Integer: |
272 | PartialIntegerReduction<INTEGER_ACCUM>( |
273 | result, x, dim, catKind->second, mask, intrinsic, terminator); |
274 | break; |
275 | case TypeCategory::Real: |
276 | ApplyFloatingPointKind<PartialFloatingReductionHelper<TypeCategory::Real, |
277 | REAL_ACCUM>::template Functor, |
278 | void>(catKind->second, terminator, result, x, dim, mask, terminator, |
279 | intrinsic); |
280 | break; |
281 | case TypeCategory::Complex: |
282 | ApplyFloatingPointKind<PartialFloatingReductionHelper<TypeCategory::Complex, |
283 | COMPLEX_ACCUM>::template Functor, |
284 | void>(catKind->second, terminator, result, x, dim, mask, terminator, |
285 | intrinsic); |
286 | break; |
287 | default: |
288 | terminator.Crash("%s: bad type code %d" , intrinsic, x.type().raw()); |
289 | } |
290 | } |
291 | |
292 | template <typename ACCUMULATOR> struct LocationResultHelper { |
293 | template <int KIND> struct Functor { |
294 | RT_API_ATTRS void operator()( |
295 | ACCUMULATOR &accumulator, const Descriptor &result) const { |
296 | accumulator.GetResult( |
297 | result.OffsetElement<CppTypeFor<TypeCategory::Integer, KIND>>()); |
298 | } |
299 | }; |
300 | }; |
301 | |
302 | template <typename ACCUMULATOR> struct PartialLocationHelper { |
303 | template <int KIND> struct Functor { |
304 | RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, |
305 | int dim, const Descriptor *mask, Terminator &terminator, |
306 | const char *intrinsic, ACCUMULATOR &accumulator) const { |
307 | // Element size of the destination descriptor is the size |
308 | // of {TypeCategory::Integer, KIND}. |
309 | PartialReduction<ACCUMULATOR, TypeCategory::Integer, KIND>(result, x, |
310 | Descriptor::BytesFor(TypeCategory::Integer, KIND), dim, mask, |
311 | terminator, intrinsic, accumulator); |
312 | } |
313 | }; |
314 | }; |
315 | |
316 | // NORM2 templates |
317 | |
318 | RT_VAR_GROUP_BEGIN |
319 | |
320 | // Use at least double precision for accumulators. |
321 | // Don't use __float128, it doesn't work with abs() or sqrt() yet. |
322 | static constexpr RT_CONST_VAR_ATTRS int Norm2LargestLDKind { |
323 | #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 |
324 | 16 |
325 | #elif LDBL_MANT_DIG == 64 |
326 | 10 |
327 | #else |
328 | 8 |
329 | #endif |
330 | }; |
331 | |
332 | RT_VAR_GROUP_END |
333 | |
334 | template <TypeCategory CAT, int KIND, typename ACCUMULATOR> |
335 | inline RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x, |
336 | int dim, const Descriptor *mask, const char *intrinsic, |
337 | Terminator &terminator) { |
338 | using Type = CppTypeFor<CAT, KIND>; |
339 | ACCUMULATOR accumulator{x}; |
340 | if (dim == 0 || x.rank() == 1) { |
341 | // Total reduction |
342 | |
343 | // Element size of the destination descriptor is the same |
344 | // as the element size of the source. |
345 | result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr, |
346 | CFI_attribute_allocatable); |
347 | if (int stat{result.Allocate()}) { |
348 | terminator.Crash( |
349 | "%s: could not allocate memory for result; STAT=%d" , intrinsic, stat); |
350 | } |
351 | DoTotalReduction<Type>(x, dim, mask, accumulator, intrinsic, terminator); |
352 | accumulator.GetResult(result.OffsetElement<Type>()); |
353 | } else { |
354 | // Partial reduction |
355 | |
356 | // Element size of the destination descriptor is the same |
357 | // as the element size of the source. |
358 | PartialReduction<ACCUMULATOR, CAT, KIND>(result, x, x.ElementBytes(), dim, |
359 | mask, terminator, intrinsic, accumulator); |
360 | } |
361 | } |
362 | |
363 | // The data type used by Norm2Accumulator. |
364 | template <int KIND> |
365 | using Norm2AccumType = |
366 | CppTypeFor<TypeCategory::Real, std::clamp(KIND, 8, Norm2LargestLDKind)>; |
367 | |
368 | template <int KIND> class Norm2Accumulator { |
369 | public: |
370 | using Type = CppTypeFor<TypeCategory::Real, KIND>; |
371 | using AccumType = Norm2AccumType<KIND>; |
372 | explicit RT_API_ATTRS Norm2Accumulator(const Descriptor &array) |
373 | : array_{array} {} |
374 | RT_API_ATTRS void Reinitialize() { max_ = sum_ = 0; } |
375 | template <typename A> |
376 | RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const { |
377 | // m * sqrt(1 + sum((others(:)/m)**2)) |
378 | *p = static_cast<Type>(max_ * SQRTTy<AccumType>::compute(1 + sum_)); |
379 | } |
380 | RT_API_ATTRS bool Accumulate(Type x) { |
381 | auto absX{ABSTy<AccumType>::compute(static_cast<AccumType>(x))}; |
382 | if (!max_) { |
383 | max_ = absX; |
384 | } else if (absX > max_) { |
385 | auto t{max_ / absX}; // < 1.0 |
386 | auto tsq{t * t}; |
387 | sum_ *= tsq; // scale sum to reflect change to the max |
388 | sum_ += tsq; // include a term for the previous max |
389 | max_ = absX; |
390 | } else { // absX <= max_ |
391 | auto t{absX / max_}; |
392 | sum_ += t * t; |
393 | } |
394 | return true; |
395 | } |
396 | template <typename A> |
397 | RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) { |
398 | return Accumulate(*array_.Element<A>(at)); |
399 | } |
400 | |
401 | private: |
402 | const Descriptor &array_; |
403 | AccumType max_{0}; // value (m) with largest magnitude |
404 | AccumType sum_{0}; // sum((others(:)/m)**2) |
405 | }; |
406 | |
407 | template <int KIND> struct Norm2Helper { |
408 | RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, int dim, |
409 | const Descriptor *mask, Terminator &terminator) const { |
410 | DoMaxMinNorm2<TypeCategory::Real, KIND, Norm2Accumulator<KIND>>( |
411 | result, x, dim, mask, "NORM2" , terminator); |
412 | } |
413 | }; |
414 | |
415 | } // namespace Fortran::runtime |
416 | #endif // FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_ |
417 | |