// matrix_transpose.cpp source code [boost/libs/compute/example/matrix_transpose.cpp]

1	//---------------------------------------------------------------------------//
2	// Copyright (c) 2014 Benoit Dequidt <benoit.dequidt@gmail.com>
3	//
4	// Distributed under the Boost Software License, Version 1.0
5	// See accompanying file LICENSE_1_0.txt or copy at
6	// http://www.boost.org/LICENSE_1_0.txt
7	//
8	// See http://boostorg.github.com/compute for more information.
9	//---------------------------------------------------------------------------//
10
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <vector>

#include <boost/program_options.hpp>

#include <boost/compute/core.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/source.hpp>
21
22	namespace compute = boost::compute;
23	namespace po = boost::program_options;
24
25	using compute::uint_;
26
27	const uint_ TILE_DIM = `32`;
28	const uint_ BLOCK_ROWS = `8`;
29
30	// generate a copy kernel program
31	compute::kernel make_copy_kernel(const compute::context& context)
32	{
33	// source for the copy_kernel program
34	const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
35	__kernel void copy_kernel(__global const float src, __global float* *dst)
36	{
37	uint x = get_group_id(`0`) * TILE_DIM + get_local_id(`0`);
38	uint y = get_group_id(`1`) * TILE_DIM + get_local_id(`1`);
39
40	uint width = get_num_groups(`0`) * TILE_DIM;
41
42	for(uint i = `0` ; i < TILE_DIM ; i+= BLOCK_ROWS){
43	dst[(y+i)width +x] = src[(y+i)width + x];
44	}
45	}
46	);
47
48	// setup compilation flags for the copy program
49	std::stringstream options;
50	options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
51
52	// create and build the copy program
53	compute::program program =
54	compute::program::build_with_source(source, context, options.str());
55
56	// create and return the copy kernel
57	return program.create_kernel(name: "copy_kernel");
58	}
59
60	// generate a naive transpose kernel
61	compute::kernel make_naive_transpose_kernel(const compute::context& context)
62	{
63	// source for the naive_transpose kernel
64	const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
65	__kernel void naive_transpose(__global const float src, __global float* *dst)
66	{
67	uint x = get_group_id(`0`) * TILE_DIM + get_local_id(`0`);
68	uint y = get_group_id(`1`) * TILE_DIM + get_local_id(`1`);
69
70	uint width = get_num_groups(`0`) * TILE_DIM;
71
72	for(uint i = `0` ; i < TILE_DIM; i+= BLOCK_ROWS){
73	dst[xwidth + y+i] = src[(y+i)width + x];
74	}
75	}
76	);
77
78	// setup compilation flags for the naive_transpose program
79	std::stringstream options;
80	options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
81
82	// create and build the naive_transpose program
83	compute::program program =
84	compute::program::build_with_source(source, context, options.str());
85
86	// create and return the naive_transpose kernel
87	return program.create_kernel(name: "naive_transpose");
88	}
89
90	// generates a coalesced transpose kernel
91	compute::kernel make_coalesced_transpose_kernel(const compute::context& context)
92	{
93	// source for the coalesced_transpose kernel
94	const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
95	__kernel void coalesced_transpose(__global const float src, __global float* *dst)
96	{
97	__local float tile[TILE_DIM][TILE_DIM];
98
99	// compute indexes
100	uint x = get_group_id(`0`) * TILE_DIM + get_local_id(`0`);
101	uint y = get_group_id(`1`) * TILE_DIM + get_local_id(`1`);
102
103	uint width = get_num_groups(`0`) * TILE_DIM;
104
105	// load inside local memory
106	for(uint i = `0` ; i < TILE_DIM; i+= BLOCK_ROWS){
107	tile[get_local_id(`1`)+i][get_local_id(`0`)] = src[(y+i)*width + x];
108	}
109
110	barrier(CLK_LOCAL_MEM_FENCE);
111
112	// transpose indexes
113	x = get_group_id(`1`) * TILE_DIM + get_local_id(`0`);
114	y = get_group_id(`0`) * TILE_DIM + get_local_id(`1`);
115
116	// write output from local memory
117	for(uint i = `0` ; i < TILE_DIM ; i+=BLOCK_ROWS){
118	dst[(y+i)*width + x] = tile[get_local_id(`0`)][get_local_id(`1`)+i];
119	}
120	}
121	);
122
123	// setup compilation flags for the coalesced_transpose program
124	std::stringstream options;
125	options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
126
127	// create and build the coalesced_transpose program
128	compute::program program =
129	compute::program::build_with_source(source, context, options.str());
130
131	// create and return coalesced_transpose kernel
132	return program.create_kernel(name: "coalesced_transpose");
133	}
134
135	// generate a coalesced withtout bank conflicts kernel
136	compute::kernel make_coalesced_no_bank_conflicts_kernel(const compute::context& context)
137	{
138	const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
139	__kernel void coalesced_no_bank_conflicts(__global const float src, __global float* *dst)
140	{
141	// TILE_DIM+1 is here to avoid bank conflicts in local memory
142	__local float tile[TILE_DIM][TILE_DIM+`1`];
143
144	// compute indexes
145	uint x = get_group_id(`0`) * TILE_DIM + get_local_id(`0`);
146	uint y = get_group_id(`1`) * TILE_DIM + get_local_id(`1`);
147
148	uint width = get_num_groups(`0`) * TILE_DIM;
149
150	// load inside local memory
151	for(uint i = `0` ; i < TILE_DIM; i+= BLOCK_ROWS){
152	tile[get_local_id(`1`)+i][get_local_id(`0`)] = src[(y+i)*width + x];
153	}
154
155	barrier(CLK_LOCAL_MEM_FENCE);
156
157	// transpose indexes
158	x = get_group_id(`1`) * TILE_DIM + get_local_id(`0`);
159	y = get_group_id(`0`) * TILE_DIM + get_local_id(`1`);
160
161	// write output from local memory
162	for(uint i = `0` ; i < TILE_DIM ; i+=BLOCK_ROWS){
163	dst[(y+i)*width + x] = tile[get_local_id(`0`)][get_local_id(`1`)+i];
164	}
165	}
166	);
167
168	// setup compilation flags for the coalesced_no_bank_conflicts program
169	std::stringstream options;
170	options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
171
172	// create and build the coalesced_no_bank_conflicts program
173	compute::program program =
174	compute::program::build_with_source(source, context, options.str());
175
176	// create and return the coalesced_no_bank_conflicts kernel
177	return program.create_kernel(name: "coalesced_no_bank_conflicts");
178	}
179
180	// compare 'expectedResult' to 'transposedMatrix'. prints an error message if not equal.
181	bool check_transposition(const std::vector<float>& expectedResult,
182	uint_ size,
183	const std::vector<float>& transposedMatrix)
184	{
185	for(uint_ i = `0` ; i < size ; ++i){
186	if(expectedResult [i] != transposedMatrix [i]){
187	std::cout << "idx = " << i << " , expected " << expectedResult [i]
188	<< " , got " << transposedMatrix [i] << std::endl;
189	std::cout << "FAILED" << std::endl;
190	return false;
191	}
192	}
193	return true;
194	}
195
196	// generate a matrix inside 'in' and do the tranposition inside 'out'
197	void generate_matrix(std::vector<float>& in, std::vector<float>& out, uint_ rows, uint_ cols)
198	{
199	// generate a matrix
200	for(uint_ i = `0` ; i < rows ; ++i){
201	for(uint_ j = `0` ; j < cols ; ++j){
202	in [icols + j] = icols + j;
203	}
204	}
205
206	// store transposed result
207	for(uint_ j = `0`; j < cols ; ++j){
208	for(uint_ i = `0` ; i < rows ; ++i){
209	out [jrows + i] = in [icols + j];
210	}
211	}
212	}
213
// necessary for 64-bit integers on win32: maps the name uint64_t used in the
// code below onto the compiler's unsigned __int64 type
#ifdef _WIN32
#define uint64_t unsigned __int64
#endif
218
219	int main(int argc, char *argv[])
220	{
221	// setup command line arguments
222	po::options_description options("options");
223	options.add_options()
224	("help", "show usage instructions")
225	("rows", po::value<uint_>()->default_value(v: `4096`), "number of matrix rows")
226	("cols", po::value<uint_>()->default_value(v: `4096`), "number of matrix columns")
227	;
228
229	// parse command line
230	po::variables_map vm;
231	po::store(options: po::parse_command_line(argc, argv, desc: options), m&: vm);
232	po::notify(m&: vm);
233
234	// check command line arguments
235	if(vm.count(x: "help")){
236	std::cout << options << std::endl;
237	return `0`;
238	}
239
240	// get number rows and columns for the matrix
241	const uint_ rows = vm ["rows"].as<uint_>();
242	const uint_ cols = vm ["cols"].as<uint_>();
243
244	// get the default device
245	compute::device device = compute::system::default_device();
246
247	// print out device name and matrix information
248	std::cout << "Device: " << device.name() << std::endl;
249	std::cout << "Matrix Size: " << rows << "x" << cols << std::endl;
250	std::cout << "Grid Size: " << rows/TILE_DIM << "x" << cols/TILE_DIM << " blocks" << std::endl;
251	std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
252	std::cout << std::endl;
253
254	// On OSX this example does not work on CPU devices
255	#if defined(__APPLE__)
256	if(device.type() & compute::device::cpu) {
257	std::cout << "On OSX this example does not work on CPU devices" << std::endl;
258	return `0`;
259	}
260	#endif
261
262	const size_t global_work_size[`2`] = {rows, cols*BLOCK_ROWS/TILE_DIM};
263	const size_t local_work_size[`2`] = {TILE_DIM, BLOCK_ROWS};
264
265	// setup input data on the host
266	const uint_ size = rows * cols;
267	std::vector<float> h_input(size);
268	std::vector<float> h_output(size);
269	std::vector<float> expectedResult(size);
270	generate_matrix(in&: h_input, out&: expectedResult, rows, cols);
271
272	// create a context for the device
273	compute::context context(device);
274
275	// device vectors
276	compute::vector<float> d_input(size, context);
277	compute::vector<float> d_output(size, context);
278
279	// command_queue with profiling
280	compute::command_queue queue(context, device, compute::command_queue::enable_profiling);
281
282	// copy input data
283	compute::copy(h_input.begin(), h_input.end(), d_input.begin(), queue);
284
285	// simple copy kernel
286	std::cout << "Testing copy_kernel:" << std::endl;
287	compute::kernel kernel = make_copy_kernel(context);
288	kernel.set_arg(`0`, d_input);
289	kernel.set_arg(index: `1`, value: d_output);
290
291	compute::event start;
292	start = queue.enqueue_nd_range_kernel(kernel, work_dim: `2`, global_work_offset: `0`, global_work_size, local_work_size);
293	queue.finish();
294	uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();
295
296	std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
297	std::cout << " BandWidth: " << `2`rowscols*sizeof(float) / elapsed << " GB/s" << std::endl;
298	compute::copy(first: d_output.begin(), last: d_output.end(), result: h_output.begin(), queue);
299
300	check_transposition(expectedResult: h_input, size: rows*cols, transposedMatrix: h_output);
301	std::cout << std::endl;
302
303	// naive_transpose kernel
304	std::cout << "Testing naive_transpose:" << std::endl;
305	kernel = make_naive_transpose_kernel(context);
306	kernel.set_arg(`0`, d_input);
307	kernel.set_arg(index: `1`, value: d_output);
308
309	start = queue.enqueue_nd_range_kernel(kernel, work_dim: `2`, global_work_offset: `0`, global_work_size, local_work_size);
310	queue.finish();
311	elapsed = start.duration<boost::chrono::nanoseconds>().count();
312	std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
313	std::cout << " BandWidth: " << `2`rowscols*sizeof(float) / elapsed << " GB/s" << std::endl;
314	compute::copy(first: d_output.begin(), last: d_output.end(), result: h_output.begin(), queue);
315
316	check_transposition(expectedResult, size: rows*cols, transposedMatrix: h_output);
317	std::cout << std::endl;
318
319	// coalesced_transpose kernel
320	std::cout << "Testing coalesced_transpose:" << std::endl;
321	kernel = make_coalesced_transpose_kernel(context);
322	kernel.set_arg(`0`, d_input);
323	kernel.set_arg(index: `1`, value: d_output);
324
325	start = queue.enqueue_nd_range_kernel(kernel, work_dim: `2`, global_work_offset: `0`, global_work_size, local_work_size);
326	queue.finish();
327	elapsed = start.duration<boost::chrono::nanoseconds>().count();
328	std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
329	std::cout << " BandWidth: " << `2`rowscols*sizeof(float) / elapsed << " GB/s" << std::endl;
330
331	compute::copy(first: d_output.begin(), last: d_output.end(), result: h_output.begin(), queue);
332
333	check_transposition(expectedResult, size: rows*cols, transposedMatrix: h_output);
334	std::cout << std::endl;
335
336	// coalesced_no_bank_conflicts kernel
337	std::cout << "Testing coalesced_no_bank_conflicts:" << std::endl;
338
339	kernel = make_coalesced_no_bank_conflicts_kernel(context);
340	kernel.set_arg(`0`, d_input);
341	kernel.set_arg(index: `1`, value: d_output);
342
343	start = queue.enqueue_nd_range_kernel(kernel, work_dim: `2`, global_work_offset: `0`, global_work_size, local_work_size);
344	queue.finish();
345	elapsed = start.duration<boost::chrono::nanoseconds>().count();
346	std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
347	std::cout << " BandWidth: " << `2`rowscols*sizeof(float) / elapsed << " GB/s" << std::endl;
348
349	compute::copy(first: d_output.begin(), last: d_output.end(), result: h_output.begin(), queue);
350
351	check_transposition(expectedResult, size: rows*cols, transposedMatrix: h_output);
352	std::cout << std::endl;
353
354	return `0`;
355	}
356

// source code of boost/libs/compute/example/matrix_transpose.cpp