bug49334.cpp source code [offload/test/offloading/bug49334.cpp]

1	// RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
2	// RUN: %libomptarget-compilexx-generic -O3 -ffast-math && \
3	// RUN: %libomptarget-run-generic
4	// RUN: %libomptarget-compileoptxx-generic -O3 && %libomptarget-run-generic
5	// RUN: %libomptarget-compileoptxx-generic -O3 -ffast-math && \
6	// RUN: %libomptarget-run-generic
7
8	// REQUIRES: gpu
9	// UNSUPPORTED: nvidiagpu
10	// UNSUPPORTED: amdgpu
11
12	#include <cassert>
13	#include <cmath>
14	#include <iostream>
15	#include <limits>
16	#include <memory>
17	#include <vector>
18
19	class BlockMatrix {
20	private:
21	const int rowsPerBlock;
22	const int colsPerBlock;
23	const long nRows;
24	const long nCols;
25	const int nBlocksPerRow;
26	const int nBlocksPerCol;
27	std::vector<std::vector<std::unique_ptr<float[]>>> Blocks;
28
29	public:
30	BlockMatrix(const int _rowsPerBlock, const int _colsPerBlock,
31	const long _nRows, const long _nCols)
32	: rowsPerBlock(_rowsPerBlock), colsPerBlock(_colsPerBlock), nRows(_nRows),
33	nCols(_nCols), nBlocksPerRow(_nRows / _rowsPerBlock),
34	nBlocksPerCol(_nCols / _colsPerBlock), Blocks (nBlocksPerCol) {
35	for (int i = `0`; i < nBlocksPerCol; i++) {
36	for (int j = `0`; j < nBlocksPerRow; j++) {
37	Blocks [i].emplace_back(args: new float[_rowsPerBlock * _colsPerBlock]);
38	}
39	}
40	};
41
42	// Initialize the BlockMatrix from 2D arrays
43	void Initialize(const std::vector<float> &matrix) {
44	for (int i = `0`; i < nBlocksPerCol; i++)
45	for (int j = `0`; j < nBlocksPerRow; j++) {
46	float *CurrBlock = GetBlock(i, j);
47	for (int ii = `0`; ii < colsPerBlock; ++ii)
48	for (int jj = `0`; jj < rowsPerBlock; ++jj) {
49	int curri = i * colsPerBlock + ii;
50	int currj = j * rowsPerBlock + jj;
51	CurrBlock[ii + jj * colsPerBlock] = matrix [curri + currj * nCols];
52	}
53	}
54	}
55
56	void Compare(const std::vector<float> &matrix) const {
57	for (int i = `0`; i < nBlocksPerCol; i++)
58	for (int j = `0`; j < nBlocksPerRow; j++) {
59	float *CurrBlock = GetBlock(i, j);
60	for (int ii = `0`; ii < colsPerBlock; ++ii)
61	for (int jj = `0`; jj < rowsPerBlock; ++jj) {
62	int curri = i * colsPerBlock + ii;
63	int currj = j * rowsPerBlock + jj;
64	float m_value = matrix [curri + currj * nCols];
65	float bm_value = CurrBlock[ii + jj * colsPerBlock];
66	assert(std::fabs(bm_value - m_value) <
67	std::numeric_limits<float>::epsilon());
68	}
69	}
70	}
71
72	float GetBlock(int* i, int j) const {
73	assert(i < nBlocksPerCol && j < nBlocksPerRow && "Accessing outside block");
74	return Blocks [i][j].get();
75	}
76	};
77
78	constexpr const int BS = `16`;
79	constexpr const int N = `256`;
80
81	int BlockMatMul_TargetNowait(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
82	#pragma omp parallel
83	#pragma omp master
84	for (int i = `0`; i < N / BS; ++i)
85	for (int j = `0`; j < N / BS; ++j) {
86	float *BlockC = C.GetBlock(i, j);
87	for (int k = `0`; k < N / BS; ++k) {
88	float *BlockA = A.GetBlock(i, j: k);
89	float *BlockB = B.GetBlock(i: k, j);
90	// clang-format off
91	#pragma omp target depend(in: BlockA[0], BlockB[0]) depend(inout: BlockC[0]) \
92	map(to: BlockA[:BS * BS], BlockB[:BS * BS]) \
93	map(tofrom: BlockC[:BS * BS]) nowait
94	// clang-format on
95	#pragma omp parallel for
96	for (int ii = `0`; ii < BS; ii++)
97	for (int jj = `0`; jj < BS; jj++) {
98	for (int kk = `0`; kk < BS; ++kk)
99	BlockC[ii + jj * BS] +=
100	BlockA[ii + kk * BS] * BlockB[kk + jj * BS];
101	}
102	}
103	}
104	return `0`;
105	}
106
107	void Matmul(const std::vector<float> &a, const std::vector<float> &b,
108	std::vector<float> &c) {
109	for (int i = `0`; i < N; ++i) {
110	for (int j = `0`; j < N; ++j) {
111	float sum = `0.0`;
112	for (int k = `0`; k < N; ++k) {
113	sum = sum + a [i * N + k] * b [k * N + j];
114	}
115	c [i * N + j] = sum;
116	}
117	}
118	}
119
120	int main(int argc, char *argv[]) {
121	std::vector<float> a(N * N);
122	std::vector<float> b(N * N);
123	std::vector<float> c(N * N, `0.0`);
124
125	for (int i = `0`; i < N; ++i) {
126	for (int j = `0`; j < N; ++j) {
127	a [i * N + j] = b [i * N + j] = i + j % `100`;
128	}
129	}
130
131	auto BlockedA = BlockMatrix (BS, BS, N, N);
132	auto BlockedB = BlockMatrix (BS, BS, N, N);
133	auto BlockedC = BlockMatrix (BS, BS, N, N);
134	BlockedA.Initialize(matrix: a);
135	BlockedB.Initialize(matrix: b);
136	BlockedC.Initialize(matrix: c);
137	BlockedA.Compare(matrix: a);
138	BlockedB.Compare(matrix: b);
139	BlockedC.Compare(matrix: c);
140
141	Matmul(a, b, c);
142	BlockMatMul_TargetNowait(A&: BlockedA, B&: BlockedB, C&: BlockedC);
143
144	BlockedC.Compare(matrix: c);
145
146	std::cout << "PASS\n";
147
148	return `0`;
149	}
150
151	// CHECK: PASS
152

Provided by KDAB

Improve your Profiling and Debugging skills

Find out more

Definitions

source code of offload/test/offloading/bug49334.cpp