1//===-- runtime/matmul-transpose.cpp --------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9// Implements a fused matmul-transpose operation
10//
11// There are two main entry points; one establishes a descriptor for the
12// result and allocates it, and the other expects a result descriptor that
13// points to existing storage.
14//
15// This implementation must handle all combinations of numeric types and
16// kinds (100 - 165 cases depending on the target), plus all combinations
17// of logical kinds (16). A single template undergoes many instantiations
18// to cover all of the valid possibilities.
19//
20// The usefulness of this optimization should be reviewed once Matmul is swapped
21// to use the faster BLAS routines.
22
23#include "flang/Runtime/matmul-transpose.h"
24#include "terminator.h"
25#include "tools.h"
26#include "flang/Runtime/c-or-cpp.h"
27#include "flang/Runtime/cpp-type.h"
28#include "flang/Runtime/descriptor.h"
29#include <cstring>
30
31namespace {
32using namespace Fortran::runtime;
33
34// Suppress the warnings about calling __host__-only std::complex operators,
35// defined in C++ STD header files, from __device__ code.
36RT_DIAG_PUSH
37RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
38
39// Contiguous numeric TRANSPOSE(matrix)*matrix multiplication
40// TRANSPOSE(matrix(n, rows)) * matrix(n,cols) ->
41// matrix(rows, n) * matrix(n,cols) -> matrix(rows,cols)
42// The transpose is implemented by swapping the indices of accesses into the LHS
43//
44// Straightforward algorithm:
45// DO 1 I = 1, NROWS
46// DO 1 J = 1, NCOLS
47// RES(I,J) = 0
48// DO 1 K = 1, N
49// 1 RES(I,J) = RES(I,J) + X(K,I)*Y(K,J)
50//
51// With loop distribution and transposition to avoid the inner sum
52// reduction and to avoid non-unit strides:
53// DO 1 I = 1, NROWS
54// DO 1 J = 1, NCOLS
55// 1 RES(I,J) = 0
56// DO 2 J = 1, NCOLS
57// DO 2 I = 1, NROWS
58// DO 2 K = 1, N
59// 2 RES(I,J) = RES(I,J) + X(K,I)*Y(K,J) ! loop-invariant last term
60template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
61 bool X_HAS_STRIDED_COLUMNS, bool Y_HAS_STRIDED_COLUMNS>
62inline static RT_API_ATTRS void MatrixTransposedTimesMatrix(
63 CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
64 SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
65 SubscriptValue n, std::size_t xColumnByteStride = 0,
66 std::size_t yColumnByteStride = 0) {
67 using ResultType = CppTypeFor<RCAT, RKIND>;
68
69 std::memset(product, 0, rows * cols * sizeof *product);
70 for (SubscriptValue j{0}; j < cols; ++j) {
71 for (SubscriptValue i{0}; i < rows; ++i) {
72 for (SubscriptValue k{0}; k < n; ++k) {
73 ResultType x_ki;
74 if constexpr (!X_HAS_STRIDED_COLUMNS) {
75 x_ki = static_cast<ResultType>(x[i * n + k]);
76 } else {
77 x_ki = static_cast<ResultType>(reinterpret_cast<const XT *>(
78 reinterpret_cast<const char *>(x) + i * xColumnByteStride)[k]);
79 }
80 ResultType y_kj;
81 if constexpr (!Y_HAS_STRIDED_COLUMNS) {
82 y_kj = static_cast<ResultType>(y[j * n + k]);
83 } else {
84 y_kj = static_cast<ResultType>(reinterpret_cast<const YT *>(
85 reinterpret_cast<const char *>(y) + j * yColumnByteStride)[k]);
86 }
87 product[j * rows + i] += x_ki * y_kj;
88 }
89 }
90 }
91}
92
93RT_DIAG_POP
94
95template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
96inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper(
97 CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
98 SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
99 SubscriptValue n, std::optional<std::size_t> xColumnByteStride,
100 std::optional<std::size_t> yColumnByteStride) {
101 if (!xColumnByteStride) {
102 if (!yColumnByteStride) {
103 MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, false, false>(
104 product, rows, cols, x, y, n);
105 } else {
106 MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, false, true>(
107 product, rows, cols, x, y, n, 0, *yColumnByteStride);
108 }
109 } else {
110 if (!yColumnByteStride) {
111 MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, true, false>(
112 product, rows, cols, x, y, n, *xColumnByteStride);
113 } else {
114 MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, true, true>(
115 product, rows, cols, x, y, n, *xColumnByteStride, *yColumnByteStride);
116 }
117 }
118}
119
120RT_DIAG_PUSH
121RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
122
123// Contiguous numeric matrix*vector multiplication
124// matrix(rows,n) * column vector(n) -> column vector(rows)
125// Straightforward algorithm:
126// DO 1 I = 1, NROWS
127// RES(I) = 0
128// DO 1 K = 1, N
129// 1 RES(I) = RES(I) + X(K,I)*Y(K)
130// With loop distribution and transposition to avoid the inner
131// sum reduction and to avoid non-unit strides:
132// DO 1 I = 1, NROWS
133// 1 RES(I) = 0
134// DO 2 I = 1, NROWS
135// DO 2 K = 1, N
136// 2 RES(I) = RES(I) + X(K,I)*Y(K)
137template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
138 bool X_HAS_STRIDED_COLUMNS>
139inline static RT_API_ATTRS void MatrixTransposedTimesVector(
140 CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
141 SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
142 std::size_t xColumnByteStride = 0) {
143 using ResultType = CppTypeFor<RCAT, RKIND>;
144 std::memset(product, 0, rows * sizeof *product);
145 for (SubscriptValue i{0}; i < rows; ++i) {
146 for (SubscriptValue k{0}; k < n; ++k) {
147 ResultType x_ki;
148 if constexpr (!X_HAS_STRIDED_COLUMNS) {
149 x_ki = static_cast<ResultType>(x[i * n + k]);
150 } else {
151 x_ki = static_cast<ResultType>(reinterpret_cast<const XT *>(
152 reinterpret_cast<const char *>(x) + i * xColumnByteStride)[k]);
153 }
154 ResultType y_k = static_cast<ResultType>(y[k]);
155 product[i] += x_ki * y_k;
156 }
157 }
158}
159
160RT_DIAG_POP
161
162template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
163inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper(
164 CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
165 SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
166 std::optional<std::size_t> xColumnByteStride) {
167 if (!xColumnByteStride) {
168 MatrixTransposedTimesVector<RCAT, RKIND, XT, YT, false>(
169 product, rows, n, x, y);
170 } else {
171 MatrixTransposedTimesVector<RCAT, RKIND, XT, YT, true>(
172 product, rows, n, x, y, *xColumnByteStride);
173 }
174}
175
176RT_DIAG_PUSH
177RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
178
179// Implements an instance of MATMUL for given argument types.
180template <bool IS_ALLOCATING, TypeCategory RCAT, int RKIND, typename XT,
181 typename YT>
182inline static RT_API_ATTRS void DoMatmulTranspose(
183 std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor> &result,
184 const Descriptor &x, const Descriptor &y, Terminator &terminator) {
185 int xRank{x.rank()};
186 int yRank{y.rank()};
187 int resRank{xRank + yRank - 2};
188 if (xRank * yRank != 2 * resRank) {
189 terminator.Crash(
190 "MATMUL-TRANSPOSE: bad argument ranks (%d * %d)", xRank, yRank);
191 }
192 SubscriptValue extent[2]{x.GetDimension(1).Extent(),
193 resRank == 2 ? y.GetDimension(1).Extent() : 0};
194 if constexpr (IS_ALLOCATING) {
195 result.Establish(
196 RCAT, RKIND, nullptr, resRank, extent, CFI_attribute_allocatable);
197 for (int j{0}; j < resRank; ++j) {
198 result.GetDimension(j).SetBounds(1, extent[j]);
199 }
200 if (int stat{result.Allocate()}) {
201 terminator.Crash(
202 "MATMUL-TRANSPOSE: could not allocate memory for result; STAT=%d",
203 stat);
204 }
205 } else {
206 RUNTIME_CHECK(terminator, resRank == result.rank());
207 RUNTIME_CHECK(
208 terminator, result.ElementBytes() == static_cast<std::size_t>(RKIND));
209 RUNTIME_CHECK(terminator, result.GetDimension(0).Extent() == extent[0]);
210 RUNTIME_CHECK(terminator,
211 resRank == 1 || result.GetDimension(1).Extent() == extent[1]);
212 }
213 SubscriptValue n{x.GetDimension(0).Extent()};
214 if (n != y.GetDimension(0).Extent()) {
215 terminator.Crash(
216 "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
217 static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
218 static_cast<std::intmax_t>(x.GetDimension(1).Extent()),
219 static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
220 static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
221 }
222 using WriteResult =
223 CppTypeFor<RCAT == TypeCategory::Logical ? TypeCategory::Integer : RCAT,
224 RKIND>;
225 const SubscriptValue rows{extent[0]};
226 const SubscriptValue cols{extent[1]};
227 if constexpr (RCAT != TypeCategory::Logical) {
228 if (x.IsContiguous(1) && y.IsContiguous(1) &&
229 (IS_ALLOCATING || result.IsContiguous())) {
230 // Contiguous numeric matrices (maybe with columns
231 // separated by a stride).
232 std::optional<std::size_t> xColumnByteStride;
233 if (!x.IsContiguous()) {
234 // X's columns are strided.
235 SubscriptValue xAt[2]{};
236 x.GetLowerBounds(xAt);
237 xAt[1]++;
238 xColumnByteStride = x.SubscriptsToByteOffset(xAt);
239 }
240 std::optional<std::size_t> yColumnByteStride;
241 if (!y.IsContiguous()) {
242 // Y's columns are strided.
243 SubscriptValue yAt[2]{};
244 y.GetLowerBounds(yAt);
245 yAt[1]++;
246 yColumnByteStride = y.SubscriptsToByteOffset(yAt);
247 }
248 if (resRank == 2) { // M*M -> M
249 // TODO: use BLAS-3 GEMM for supported types.
250 MatrixTransposedTimesMatrixHelper<RCAT, RKIND, XT, YT>(
251 result.template OffsetElement<WriteResult>(), rows, cols,
252 x.OffsetElement<XT>(), y.OffsetElement<YT>(), n, xColumnByteStride,
253 yColumnByteStride);
254 return;
255 }
256 if (xRank == 2) { // M*V -> V
257 // TODO: use BLAS-2 GEMM for supported types.
258 MatrixTransposedTimesVectorHelper<RCAT, RKIND, XT, YT>(
259 result.template OffsetElement<WriteResult>(), rows, n,
260 x.OffsetElement<XT>(), y.OffsetElement<YT>(), xColumnByteStride);
261 return;
262 }
263 // else V*M -> V (not allowed because TRANSPOSE() is only defined for rank
264 // 1 matrices
265 terminator.Crash(
266 "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
267 static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
268 static_cast<std::intmax_t>(n),
269 static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
270 static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
271 return;
272 }
273 }
274 // General algorithms for LOGICAL and noncontiguity
275 SubscriptValue xLB[2], yLB[2], resLB[2];
276 x.GetLowerBounds(xLB);
277 y.GetLowerBounds(yLB);
278 result.GetLowerBounds(resLB);
279 using ResultType = CppTypeFor<RCAT, RKIND>;
280 if (resRank == 2) { // M*M -> M
281 for (SubscriptValue i{0}; i < rows; ++i) {
282 for (SubscriptValue j{0}; j < cols; ++j) {
283 ResultType res_ij;
284 if constexpr (RCAT == TypeCategory::Logical) {
285 res_ij = false;
286 } else {
287 res_ij = 0;
288 }
289
290 for (SubscriptValue k{0}; k < n; ++k) {
291 SubscriptValue xAt[2]{k + xLB[0], i + xLB[1]};
292 SubscriptValue yAt[2]{k + yLB[0], j + yLB[1]};
293 if constexpr (RCAT == TypeCategory::Logical) {
294 ResultType x_ki = IsLogicalElementTrue(x, xAt);
295 ResultType y_kj = IsLogicalElementTrue(y, yAt);
296 res_ij = res_ij || (x_ki && y_kj);
297 } else {
298 ResultType x_ki = static_cast<ResultType>(*x.Element<XT>(xAt));
299 ResultType y_kj = static_cast<ResultType>(*y.Element<YT>(yAt));
300 res_ij += x_ki * y_kj;
301 }
302 }
303 SubscriptValue resAt[2]{i + resLB[0], j + resLB[1]};
304 *result.template Element<WriteResult>(resAt) = res_ij;
305 }
306 }
307 } else if (xRank == 2) { // M*V -> V
308 for (SubscriptValue i{0}; i < rows; ++i) {
309 ResultType res_i;
310 if constexpr (RCAT == TypeCategory::Logical) {
311 res_i = false;
312 } else {
313 res_i = 0;
314 }
315
316 for (SubscriptValue k{0}; k < n; ++k) {
317 SubscriptValue xAt[2]{k + xLB[0], i + xLB[1]};
318 SubscriptValue yAt[1]{k + yLB[0]};
319 if constexpr (RCAT == TypeCategory::Logical) {
320 ResultType x_ki = IsLogicalElementTrue(x, xAt);
321 ResultType y_k = IsLogicalElementTrue(y, yAt);
322 res_i = res_i || (x_ki && y_k);
323 } else {
324 ResultType x_ki = static_cast<ResultType>(*x.Element<XT>(xAt));
325 ResultType y_k = static_cast<ResultType>(*y.Element<YT>(yAt));
326 res_i += x_ki * y_k;
327 }
328 }
329 SubscriptValue resAt[1]{i + resLB[0]};
330 *result.template Element<WriteResult>(resAt) = res_i;
331 }
332 } else { // V*M -> V
333 // TRANSPOSE(V) not allowed by fortran standard
334 terminator.Crash(
335 "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
336 static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
337 static_cast<std::intmax_t>(n),
338 static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
339 static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
340 }
341}
342
343RT_DIAG_POP
344
345// Maps the dynamic type information from the arguments' descriptors
346// to the right instantiation of DoMatmul() for valid combinations of
347// types.
348template <bool IS_ALLOCATING> struct MatmulTranspose {
349 using ResultDescriptor =
350 std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor>;
351 template <TypeCategory XCAT, int XKIND> struct MM1 {
352 template <TypeCategory YCAT, int YKIND> struct MM2 {
353 RT_API_ATTRS void operator()(ResultDescriptor &result,
354 const Descriptor &x, const Descriptor &y,
355 Terminator &terminator) const {
356 if constexpr (constexpr auto resultType{
357 GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
358 if constexpr (Fortran::common::IsNumericTypeCategory(
359 resultType->first) ||
360 resultType->first == TypeCategory::Logical) {
361 return DoMatmulTranspose<IS_ALLOCATING, resultType->first,
362 resultType->second, CppTypeFor<XCAT, XKIND>,
363 CppTypeFor<YCAT, YKIND>>(result, x, y, terminator);
364 }
365 }
366 terminator.Crash("MATMUL-TRANSPOSE: bad operand types (%d(%d), %d(%d))",
367 static_cast<int>(XCAT), XKIND, static_cast<int>(YCAT), YKIND);
368 }
369 };
370 RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,
371 const Descriptor &y, Terminator &terminator, TypeCategory yCat,
372 int yKind) const {
373 ApplyType<MM2, void>(yCat, yKind, terminator, result, x, y, terminator);
374 }
375 };
376 RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,
377 const Descriptor &y, const char *sourceFile, int line) const {
378 Terminator terminator{sourceFile, line};
379 auto xCatKind{x.type().GetCategoryAndKind()};
380 auto yCatKind{y.type().GetCategoryAndKind()};
381 RUNTIME_CHECK(terminator, xCatKind.has_value() && yCatKind.has_value());
382 ApplyType<MM1, void>(xCatKind->first, xCatKind->second, terminator, result,
383 x, y, terminator, yCatKind->first, yCatKind->second);
384 }
385};
386} // namespace
387
388namespace Fortran::runtime {
389extern "C" {
390RT_EXT_API_GROUP_BEGIN
391
392void RTDEF(MatmulTranspose)(Descriptor &result, const Descriptor &x,
393 const Descriptor &y, const char *sourceFile, int line) {
394 MatmulTranspose<true>{}(result, x, y, sourceFile, line);
395}
396void RTDEF(MatmulTransposeDirect)(const Descriptor &result, const Descriptor &x,
397 const Descriptor &y, const char *sourceFile, int line) {
398 MatmulTranspose<false>{}(result, x, y, sourceFile, line);
399}
400
401RT_EXT_API_GROUP_END
402} // extern "C"
403} // namespace Fortran::runtime
404

source code of flang/runtime/matmul-transpose.cpp