1/* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
3
4 Copyright (C) 2005-2023 Free Software Foundation, Inc.
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "target.h"
27#include "tree.h"
28#include "gimple.h"
29#include "tree-pass.h"
30#include "ssa.h"
31#include "cgraph.h"
32#include "pretty-print.h"
33#include "diagnostic-core.h"
34#include "fold-const.h"
35#include "internal-fn.h"
36#include "langhooks.h"
37#include "gimplify.h"
38#include "gimple-iterator.h"
39#include "gimplify-me.h"
40#include "gimple-walk.h"
41#include "tree-cfg.h"
42#include "tree-into-ssa.h"
43#include "tree-nested.h"
44#include "stor-layout.h"
45#include "common/common-target.h"
46#include "omp-general.h"
47#include "omp-offload.h"
48#include "lto-section-names.h"
49#include "gomp-constants.h"
50#include "gimple-pretty-print.h"
51#include "intl.h"
52#include "stringpool.h"
53#include "attribs.h"
54#include "cfgloop.h"
55#include "context.h"
56#include "convert.h"
57#include "opts.h"
58
/* Describe the OpenACC looping structure of a function.  The entire
   function is held in a 'NULL' loop.  Loops form a tree via the
   parent/child/sibling links; partitioning decisions are recorded in
   the mask fields.  */

struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  gcall *heads[GOMP_DIM_MAX]; /* Head marker functions.  */
  gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions.  */

  tree routine; /* Pseudo-loop enclosing a routine.  */

  unsigned mask; /* Partitioning mask.  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner; /* Partitioning of inner loops.  */
  unsigned flags; /* Partitioning flags.  */
  vec<gcall *> ifns; /* Contained loop abstraction functions.  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
87
/* Holds offload tables with decls: offloaded function decls,
   "omp declare target" variable decls and indirectly callable function
   decls.  Consumed by omp_finish_file when emitting the offload
   tables.  */
vec<tree, va_gc> *offload_funcs, *offload_vars, *offload_ind_funcs;
90
91/* Return level at which oacc routine may spawn a partitioned loop, or
92 -1 if it is not a routine (i.e. is an offload fn). */
93
94int
95oacc_fn_attrib_level (tree attr)
96{
97 tree pos = TREE_VALUE (attr);
98
99 if (!TREE_PURPOSE (pos))
100 return -1;
101
102 int ix = 0;
103 for (ix = 0; ix != GOMP_DIM_MAX;
104 ix++, pos = TREE_CHAIN (pos))
105 if (!integer_zerop (TREE_PURPOSE (pos)))
106 break;
107
108 return ix;
109}
110
/* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
   adds their addresses and sizes to constructor-vector V_CTOR.  Each
   variable contributes an (address, size) pair; each function
   contributes only its address.  */

static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v: v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      /* On the accelerator side a link var must additionally have been
	 given a value expr (the pointer replacing it).  */
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute (attr_name: "omp declare target link", DECL_ATTRIBUTES (it));

      /* Skip decls that were optimized away entirely.
	 See also omp_finish_file and output_offload_tables in lto-cgraph.cc.  */
      if (!in_lto_p && !symtab_node::get (decl: it))
	continue;

      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  (is_link_var implies is_var,
	     so SIZE was set above.)  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, cst: isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
167
168/* Return true if DECL is a function for which its references should be
169 analyzed. */
170
171static bool
172omp_declare_target_fn_p (tree decl)
173{
174 return (TREE_CODE (decl) == FUNCTION_DECL
175 && lookup_attribute (attr_name: "omp declare target", DECL_ATTRIBUTES (decl))
176 && !lookup_attribute (attr_name: "omp declare target host",
177 DECL_ATTRIBUTES (decl))
178 && (!flag_openacc
179 || oacc_get_fn_attrib (fn: decl) == NULL_TREE));
180}
181
182/* Return true if DECL Is a variable for which its initializer references
183 should be analyzed. */
184
185static bool
186omp_declare_target_var_p (tree decl)
187{
188 return (VAR_P (decl)
189 && lookup_attribute (attr_name: "omp declare target", DECL_ATTRIBUTES (decl))
190 && !lookup_attribute (attr_name: "omp declare target link",
191 DECL_ATTRIBUTES (decl)));
192}
193
/* Helper function for omp_discover_implicit_declare_target, called through
   walk_tree.  Mark referenced FUNCTION_DECLs implicitly as
   declare target to.  DATA is a vec<tree> worklist; function decls with
   a body that get newly marked are pushed onto it so that their bodies
   are scanned in turn.  */

static tree
omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
{
  if (TREE_CODE (*tp) == CALL_EXPR
      && CALL_EXPR_FN (*tp)
      && TREE_CODE (CALL_EXPR_FN (*tp)) == ADDR_EXPR
      && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp), 0)) == FUNCTION_DECL
      && lookup_attribute (attr_name: "omp declare variant base",
			   DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp),
							  0))))
    {
      /* A call to a function with "omp declare variant base" also marks
	 every variant function recorded in those attributes.  */
      tree fn = TREE_OPERAND (CALL_EXPR_FN (*tp), 0);
      for (tree attr = DECL_ATTRIBUTES (fn); attr; attr = TREE_CHAIN (attr))
	{
	  attr = lookup_attribute (attr_name: "omp declare variant base", list: attr);
	  if (attr == NULL_TREE)
	    break;
	  tree purpose = TREE_PURPOSE (TREE_VALUE (attr));
	  if (TREE_CODE (purpose) == FUNCTION_DECL)
	    omp_discover_declare_target_tgt_fn_r (tp: &purpose, walk_subtrees, data);
	}
    }
  else if (TREE_CODE (*tp) == FUNCTION_DECL)
    {
      tree decl = *tp;
      tree id = get_identifier ("omp declare target");
      symtab_node *node = symtab_node::get (decl: *tp);
      if (node != NULL)
	{
	  /* Walk down any chain of function aliases, marking each alias
	     on the way as declare target unless it already is (or is
	     explicitly declare target host).  */
	  while (node->alias_target
		 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
	    {
	      if (!omp_declare_target_fn_p (decl: node->decl)
		  && !lookup_attribute (attr_name: "omp declare target host",
					DECL_ATTRIBUTES (node->decl)))
		{
		  node->offloadable = 1;
		  DECL_ATTRIBUTES (node->decl)
		    = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
		}
	      node = symtab_node::get (decl: node->alias_target);
	    }
	  /* Likewise for symtab-level aliases up to the ultimate target;
	     DECL becomes the ultimate target's decl.  */
	  symtab_node *new_node = node->ultimate_alias_target ();
	  decl = new_node->decl;
	  while (node != new_node)
	    {
	      if (!omp_declare_target_fn_p (decl: node->decl)
		  && !lookup_attribute (attr_name: "omp declare target host",
					DECL_ATTRIBUTES (node->decl)))
		{
		  node->offloadable = 1;
		  DECL_ATTRIBUTES (node->decl)
		    = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
		}
	      gcc_assert (node->alias && node->analyzed);
	      node = node->get_alias_target ();
	    }
	  node->offloadable = 1;
	  if (ENABLE_OFFLOADING)
	    g->have_offload = true;
	}
      /* Nothing more to do if the ultimate target is already declare
	 target (or declare target host).  */
      if (omp_declare_target_fn_p (decl)
	  || lookup_attribute (attr_name: "omp declare target host",
			       DECL_ATTRIBUTES (decl)))
	return NULL_TREE;

      /* Queue defined functions so their bodies get scanned too.  */
      if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
	((vec<tree> *) data)->safe_push (obj: decl);
      DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
					  DECL_ATTRIBUTES (decl));
    }
  else if (TYPE_P (*tp))
    *walk_subtrees = 0;
  else if (TREE_CODE (*tp) == OMP_TARGET)
    {
      /* Skip target regions with a 'device (ancestor: ...)' clause.  */
      tree c = omp_find_clause (OMP_CLAUSES (*tp), kind: OMP_CLAUSE_DEVICE);
      if (c && OMP_CLAUSE_DEVICE_ANCESTOR (c))
	*walk_subtrees = 0;
    }
  return NULL_TREE;
}
279
280/* Similarly, but ignore references outside of OMP_TARGET regions. */
281
282static tree
283omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
284{
285 if (TREE_CODE (*tp) == OMP_TARGET)
286 {
287 tree c = omp_find_clause (OMP_CLAUSES (*tp), kind: OMP_CLAUSE_DEVICE);
288 if (!c || !OMP_CLAUSE_DEVICE_ANCESTOR (c))
289 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
290 omp_discover_declare_target_tgt_fn_r,
291 data);
292 *walk_subtrees = 0;
293 }
294 else if (TYPE_P (*tp))
295 *walk_subtrees = 0;
296 return NULL_TREE;
297}
298
/* Helper function for omp_discover_implicit_declare_target, called through
   walk_tree.  Mark referenced FUNCTION_DECLs and global VAR_DECLs
   implicitly as declare target to.  DATA is the vec<tree> worklist;
   newly marked variables with an initializer are pushed so the
   initializer gets scanned too.  */

static tree
omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
{
  if (TREE_CODE (*tp) == FUNCTION_DECL)
    return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
  else if (VAR_P (*tp)
	   && is_global_var (t: *tp)
	   && !omp_declare_target_var_p (decl: *tp))
    {
      tree id = get_identifier ("omp declare target");
      /* A var cannot be both 'link' and (implicit) 'to'; diagnose and
	 drop the link attribute.  */
      if (lookup_attribute (attr_name: "omp declare target link", DECL_ATTRIBUTES (*tp)))
	{
	  error_at (DECL_SOURCE_LOCATION (*tp),
		    "%qD specified both in declare target %<link%> and "
		    "implicitly in %<to%> clauses", *tp);
	  DECL_ATTRIBUTES (*tp)
	    = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
	}
      if (TREE_STATIC (*tp) && lang_hooks.decls.omp_get_decl_init (*tp))
	((vec<tree> *) data)->safe_push (obj: *tp);
      DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
      /* Mirror the marking into the symbol table and record the var in
	 the offload table.  */
      symtab_node *node = symtab_node::get (decl: *tp);
      if (node != NULL && !node->offloadable)
	{
	  node->offloadable = 1;
	  if (ENABLE_OFFLOADING)
	    {
	      g->have_offload = true;
	      if (is_a <varpool_node *> (p: node))
		vec_safe_push (v&: offload_vars, obj: node->decl);
	    }
	}
    }
  else if (TYPE_P (*tp))
    *walk_subtrees = 0;
  return NULL_TREE;
}
340
/* Perform the OpenMP implicit declare target to discovery.  Seed a
   worklist with all explicitly marked functions/variables (and functions
   containing 'omp target' regions), then transitively scan bodies and
   initializers, implicitly marking everything referenced.  */

void
omp_discover_implicit_declare_target (void)
{
  cgraph_node *node;
  varpool_node *vnode;
  auto_vec<tree> worklist;

  /* Seed the worklist with function decls (including nested functions)
     that are declare target or contain an 'omp target' region.  Also
     record indirectly callable functions.  */
  FOR_EACH_DEFINED_FUNCTION (node)
    if (DECL_SAVED_TREE (node->decl))
      {
	struct cgraph_node *cgn;
	if (lookup_attribute (attr_name: "omp declare target indirect",
			      DECL_ATTRIBUTES (node->decl)))
	  vec_safe_push (v&: offload_ind_funcs, obj: node->decl);
	if (omp_declare_target_fn_p (decl: node->decl))
	  worklist.safe_push (obj: node->decl);
	else if (DECL_STRUCT_FUNCTION (node->decl)
		 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
	  worklist.safe_push (obj: node->decl);
	for (cgn = first_nested_function (node);
	     cgn; cgn = next_nested_function (node: cgn))
	  if (omp_declare_target_fn_p (decl: cgn->decl))
	    worklist.safe_push (obj: cgn->decl);
	  else if (DECL_STRUCT_FUNCTION (cgn->decl)
		   && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
	    worklist.safe_push (obj: cgn->decl);
      }
  /* Seed with declare target variables that have initializers.  */
  FOR_EACH_VARIABLE (vnode)
    if (lang_hooks.decls.omp_get_decl_init (vnode->decl)
	&& omp_declare_target_var_p (decl: vnode->decl))
      worklist.safe_push (obj: vnode->decl);
  /* Drain the worklist; the _r callbacks push newly discovered decls.  */
  while (!worklist.is_empty ())
    {
      tree decl = worklist.pop ();
      if (VAR_P (decl))
	walk_tree_without_duplicates (lang_hooks.decls.omp_get_decl_init (decl),
				      omp_discover_declare_target_var_r,
				      &worklist);
      else if (omp_declare_target_fn_p (decl))
	/* Declare target function: scan the whole body.  */
	walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
				      omp_discover_declare_target_tgt_fn_r,
				      &worklist);
      else
	/* Host function containing 'omp target': scan only the target
	   region bodies.  */
	walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
				      omp_discover_declare_target_fn_r,
				      &worklist);
    }

  lang_hooks.decls.omp_finish_decl_inits ();
}
393
394
395/* Create new symbols containing (address, size) pairs for global variables,
396 marked with "omp declare target" attribute, as well as addresses for the
397 functions, which are outlined offloading regions. */
398void
399omp_finish_file (void)
400{
401 unsigned num_funcs = vec_safe_length (v: offload_funcs);
402 unsigned num_vars = vec_safe_length (v: offload_vars);
403 unsigned num_ind_funcs = vec_safe_length (v: offload_ind_funcs);
404
405 if (num_funcs == 0 && num_vars == 0 && num_ind_funcs == 0)
406 return;
407
408 if (targetm_common.have_named_sections)
409 {
410 vec<constructor_elt, va_gc> *v_f, *v_v, *v_if;
411 vec_alloc (v&: v_f, nelems: num_funcs);
412 vec_alloc (v&: v_v, nelems: num_vars * 2);
413 vec_alloc (v&: v_if, nelems: num_ind_funcs);
414
415 add_decls_addresses_to_decl_constructor (v_decls: offload_funcs, v_ctor: v_f);
416 add_decls_addresses_to_decl_constructor (v_decls: offload_vars, v_ctor: v_v);
417 add_decls_addresses_to_decl_constructor (v_decls: offload_ind_funcs, v_ctor: v_if);
418
419 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
420 vec_safe_length (v: v_v));
421 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
422 num_funcs);
423 tree ind_funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
424 num_ind_funcs);
425
426 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
427 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
428 SET_TYPE_ALIGN (ind_funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
429 tree ctor_v = build_constructor (vars_decl_type, v_v);
430 tree ctor_f = build_constructor (funcs_decl_type, v_f);
431 tree ctor_if = build_constructor (ind_funcs_decl_type, v_if);
432 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = TREE_CONSTANT (ctor_if) = 1;
433 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = TREE_STATIC (ctor_if) = 1;
434 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
435 get_identifier (".offload_func_table"),
436 funcs_decl_type);
437 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
438 get_identifier (".offload_var_table"),
439 vars_decl_type);
440 tree ind_funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
441 get_identifier (".offload_ind_func_table"),
442 ind_funcs_decl_type);
443 TREE_STATIC (funcs_decl) = TREE_STATIC (ind_funcs_decl) = 1;
444 TREE_STATIC (vars_decl) = 1;
445 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
446 otherwise a joint table in a binary will contain padding between
447 tables from multiple object files. */
448 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (ind_funcs_decl) = 1;
449 DECL_USER_ALIGN (vars_decl) = 1;
450 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
451 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
452 SET_DECL_ALIGN (ind_funcs_decl, TYPE_ALIGN (ind_funcs_decl_type));
453 DECL_INITIAL (funcs_decl) = ctor_f;
454 DECL_INITIAL (vars_decl) = ctor_v;
455 DECL_INITIAL (ind_funcs_decl) = ctor_if;
456 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
457 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
458 set_decl_section_name (ind_funcs_decl,
459 OFFLOAD_IND_FUNC_TABLE_SECTION_NAME);
460 varpool_node::finalize_decl (decl: vars_decl);
461 varpool_node::finalize_decl (decl: funcs_decl);
462 varpool_node::finalize_decl (decl: ind_funcs_decl);
463 }
464 else
465 {
466 for (unsigned i = 0; i < num_funcs; i++)
467 {
468 tree it = (*offload_funcs)[i];
469 /* See also add_decls_addresses_to_decl_constructor
470 and output_offload_tables in lto-cgraph.cc. */
471 if (!in_lto_p && !symtab_node::get (decl: it))
472 continue;
473 targetm.record_offload_symbol (it);
474 }
475 for (unsigned i = 0; i < num_vars; i++)
476 {
477 tree it = (*offload_vars)[i];
478 if (!in_lto_p && !symtab_node::get (decl: it))
479 continue;
480#ifdef ACCEL_COMPILER
481 if (DECL_HAS_VALUE_EXPR_P (it)
482 && lookup_attribute ("omp declare target link",
483 DECL_ATTRIBUTES (it)))
484 {
485 tree value_expr = DECL_VALUE_EXPR (it);
486 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
487 targetm.record_offload_symbol (link_ptr_decl);
488 varpool_node::finalize_decl (link_ptr_decl);
489 }
490 else
491#endif
492 targetm.record_offload_symbol (it);
493 }
494 for (unsigned i = 0; i < num_ind_funcs; i++)
495 {
496 tree it = (*offload_ind_funcs)[i];
497 /* See also add_decls_addresses_to_decl_constructor
498 and output_offload_tables in lto-cgraph.cc. */
499 if (!in_lto_p && !symtab_node::get (decl: it))
500 continue;
501 targetm.record_offload_symbol (it);
502 }
503 }
504}
505
506/* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
507 axis DIM. Return a tmp var holding the result. */
508
509static tree
510oacc_dim_call (bool pos, int dim, gimple_seq *seq)
511{
512 tree arg = build_int_cst (unsigned_type_node, dim);
513 tree size = create_tmp_var (integer_type_node);
514 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
515 gimple *call = gimple_build_call_internal (fn, 1, arg);
516
517 gimple_call_set_lhs (gs: call, lhs: size);
518 gimple_seq_add_stmt (seq, call);
519
520 return size;
521}
522
523/* Find the number of threads (POS = false), or thread number (POS =
524 true) for an OpenACC region partitioned as MASK. Setup code
525 required for the calculation is added to SEQ. */
526
527static tree
528oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
529{
530 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
531 unsigned ix;
532
533 /* Start at gang level, and examine relevant dimension indices. */
534 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
535 if (GOMP_DIM_MASK (ix) & mask)
536 {
537 if (res)
538 {
539 /* We had an outer index, so scale that by the size of
540 this dimension. */
541 tree n = oacc_dim_call (pos: false, dim: ix, seq);
542 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
543 }
544 if (pos)
545 {
546 /* Determine index in this dimension. */
547 tree id = oacc_dim_call (pos: true, dim: ix, seq);
548 if (res)
549 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
550 else
551 res = id;
552 }
553 }
554
555 if (res == NULL_TREE)
556 res = integer_zero_node;
557
558 return res;
559}
560
/* Transform IFN_GOACC_LOOP calls to actual code.  See
   expand_oacc_for for where these are generated.  At the vector
   level, we stride loops, such that each member of a warp will
   operate on adjacent iterations.  At the worker and gang level,
   each gang/warp executes a set of contiguous iterations.  Chunking
   can override this such that each iteration engine executes a
   contiguous chunk, and then moves on to stride to the next chunk.

   CALL's first argument selects which quantity (CHUNKS, STEP, OFFSET
   or BOUND) to compute; the call is replaced in place by the computed
   gimple sequence.  */

static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (gs: call, index: 1);	/* +1 or -1 iteration direction.  */
  tree range = gimple_call_arg (gs: call, index: 2);
  tree step = gimple_call_arg (gs: call, index: 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (gs: call);
  tree type = NULL_TREE;
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

  /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
  if (!lhs)
    {
      gsi_replace_with_seq (&gsi, seq, true);
      return;
    }

  type = TREE_TYPE (lhs);

#ifdef ACCEL_COMPILER
  /* Striding/chunking only matter on the device; on the host the
     defaults (striding, no chunking) are kept.  */
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (in_ssa: true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (pos: false, mask, seq: &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (pos: false, mask: volume, seq: &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      /* Enable vectorization on non-SIMT targets.  */
      if (!targetm.simt.vf
	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
	     the loop.  */
	  && (flag_tree_loop_vectorize
	      || !OPTION_SET_P (flag_tree_loop_vectorize)))
	{
	  basic_block bb = gsi_bb (i: gsi);
	  class loop *parent = bb->loop_father;
	  class loop *body = parent->inner;

	  parent->force_vectorize = true;
	  parent->safelen = INT_MAX;

	  /* "Chunking loops" may have inner loops.  */
	  if (parent->inner)
	    {
	      body->force_vectorize = true;
	      body->safelen = INT_MAX;
	    }

	  cfun->has_force_vectorize_loops = true;
	}
      if (striding)
	{
	  /* Striding: offset is simply this thread's number.  */
	  r = oacc_thread_numbers (pos: true, mask, seq: &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  /* Contiguous/chunked: offset = outer_pos * span + inner_pos
	     (+ chunk_no * volume * chunk_size when chunking).  */
	  tree inner_size = oacc_thread_numbers (pos: false, mask: inner_mask, seq: &seq);
	  tree outer_size = oacc_thread_numbers (pos: false, mask: outer_mask, seq: &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* No chunking: one chunk covering ceil(range/(volume*step))
		 iterations per engine.  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (pos: true, mask: outer_mask, seq: &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (pos: true, mask: inner_mask, seq: &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Advance by whole chunks already executed (arg 6 is the
		 chunk number).  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      /* Scale the logical position into an iteration-space offset.  */
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      if (striding)
	r = range;
      else
	{
	  /* Mirror of the OFFSET computation above: determine the span
	     covered by this engine and clamp to RANGE.  */
	  tree inner_size = oacc_thread_numbers (pos: false, mask: inner_mask, seq: &seq);
	  tree outer_size = oacc_thread_numbers (pos: false, mask: outer_mask, seq: &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));

	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  /* Bound = offset + span, clamped to range (MIN for forward
	     loops, MAX for backward).  */
	  tree offset = gimple_call_arg (gs: call, index: 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
778
/* Transform a GOACC_TILE call.  Determines the element loop span for
   the specified loop of the nest.  This is 1 if we're not tiling.

   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);

   The call is replaced in place by an assignment of the computed span
   to its lhs.  */

static void
oacc_xform_tile (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  unsigned collapse = tree_to_uhwi (gimple_call_arg (gs: call, index: 0));
  /* Inner loops have higher loop_nos.  */
  unsigned loop_no = tree_to_uhwi (gimple_call_arg (gs: call, index: 1));
  tree tile_size = gimple_call_arg (gs: call, index: 2);
  unsigned e_mask = tree_to_uhwi (gimple_call_arg (gs: call, index: 4));
  tree lhs = gimple_call_lhs (gs: call);
  tree type = TREE_TYPE (lhs);
  gimple_seq seq = NULL;
  tree span = build_int_cst (type, 1);

  /* Element loops may only be worker and/or vector partitioned.  */
  gcc_assert (!(e_mask
		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
  push_gimplify_context (in_ssa: !seen_error ());

#ifndef ACCEL_COMPILER
  /* Partitioning disabled on host compilers.  */
  e_mask = 0;
#endif
  if (!e_mask)
    /* Not partitioning.  */
    span = integer_one_node;
  else if (!integer_zerop (tile_size))
    /* User explicitly specified size.  */
    span = tile_size;
  else
    {
      /* Pick a size based on the partitioning of the element loop and
	 the number of loop nests.  */
      tree first_size = NULL_TREE;
      tree second_size = NULL_TREE;

      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
	first_size = oacc_dim_call (pos: false, GOMP_DIM_VECTOR, seq: &seq);
      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	second_size = oacc_dim_call (pos: false, GOMP_DIM_WORKER, seq: &seq);

      /* Worker-only partitioning: treat the worker size as primary.  */
      if (!first_size)
	{
	  first_size = second_size;
	  second_size = NULL_TREE;
	}

      if (loop_no + 1 == collapse)
	{
	  /* Innermost loop of the nest.  */
	  span = first_size;
	  if (!loop_no && second_size)
	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
				span, second_size);
	}
      else if (loop_no + 2 == collapse)
	/* Second-innermost loop.  */
	span = second_size;
      else
	span = NULL_TREE;

      if (!span)
	/* There's no obvious element size for this loop.  Options
	   are 1, first_size or some non-unity constant (32 is my
	   favourite).   We should gather some statistics.  */
	span = first_size;
    }

  span = fold_convert (type, span);
  gimplify_assign (lhs, span, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
857
/* Default partitioned and minimum partitioned dimensions.  Filled in by
   oacc_parse_default_dims; -1 in oacc_default_dims means "no explicit
   default, use the target's choice".  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
862
863int
864oacc_get_default_dim (int dim)
865{
866 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
867 return oacc_default_dims[dim];
868}
869
870int
871oacc_get_min_dim (int dim)
872{
873 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
874 return oacc_min_dims[dim];
875}
876
/* Parse the default dimension parameter.  This is a set of
   :-separated optional compute dimensions.  Each specified dimension
   is a positive integer.  When device type support is added, it is
   planned to be a comma separated list of such compute dimensions,
   with all but the first prefixed by the colon-terminated device
   type.  Unspecified dimensions default to -1 (let the target pick);
   minimum dimensions default to 1.  */

static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }

#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  /* Dimensions after the first are preceded by ':'.  */
	  if (ix)
	    {
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An empty slot (next char is ':') leaves the default at -1.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (nptr: pos, CONST_CAST (char **, &eptr), base: 10);
	      /* Reject parse errors, non-positive values and values that
		 do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      /* Trailing junk after the last dimension is also malformed.  */
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "%<-fopenacc-dim%> operand is malformed at %qs", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
}
937
938/* Validate and update the dimensions for offloaded FN. ATTRS is the
939 raw attribute. DIMS is an array of dimensions, which is filled in.
940 LEVEL is the partitioning level of a routine, or -1 for an offload
941 region itself. USED is the mask of partitioned execution in the
942 function. */
943
static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
  /* TREE_PURPOSE of each attribute list node, saved so the attribute
     can be rebuilt with updated dimension values below.  */
  tree purpose[GOMP_DIM_MAX];
  unsigned ix;
  tree pos = TREE_VALUE (attrs);

  /* Make sure the attribute creator attached the dimension
     information.  */
  gcc_assert (pos);

  /* Unpack the attribute's TREE_LIST into DIMS[]; a missing value
     (-1) marks a dimension left unspecified by the user.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    {
      purpose[ix] = TREE_PURPOSE (pos);
      tree val = TREE_VALUE (pos);
      dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
      pos = TREE_CHAIN (pos);
    }

  /* Diagnose only on the host compiler, so each issue is reported
     once rather than once per offload target.  */
  bool check = true;
#ifdef ACCEL_COMPILER
  check = false;
#endif
  if (check
      && warn_openacc_parallelism
      && !lookup_attribute (attr_name: "oacc kernels", DECL_ATTRIBUTES (fn)))
    {
      static char const *const axes[] =
      /* Must be kept in sync with GOMP_DIM enumeration.  */
	{ "gang", "worker", "vector" };
      /* For a routine (LEVEL >= 0), only check the axes the routine
	 itself may partition.  */
      for (ix = level >= 0 ? level : 0; ix != GOMP_DIM_MAX; ix++)
	if (dims[ix] < 0)
	  ; /* Defaulting axis.  */
	else if ((used & GOMP_DIM_MASK (ix)) && dims[ix] == 1)
	  /* There is partitioned execution, but the user requested a
	     dimension size of 1.  They're probably confused.  */
	  warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
		      "region contains %s partitioned code but"
		      " is not %s partitioned", axes[ix], axes[ix]);
	else if (!(used & GOMP_DIM_MASK (ix)) && dims[ix] != 1)
	  /* The dimension is explicitly partitioned to non-unity, but
	     no use is made within the region.  */
	  warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
		      "region is %s partitioned but"
		      " does not contain %s partitioned code",
		      axes[ix], axes[ix]);
    }

  /* Let the backend adjust/complete the dimensions; it reports
     whether anything changed.  */
  bool changed = targetm.goacc.validate_dims (fn, dims, level, used);

  /* Default anything left to 1 or a partitioned default.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    if (dims[ix] < 0)
      {
	/* The OpenACC spec says 'If the [num_gangs] clause is not
	   specified, an implementation-defined default will be used;
	   the default may depend on the code within the construct.'
	   (2.5.6).  Thus an implementation is free to choose
	   non-unity default for a parallel region that doesn't have
	   any gang-partitioned loops.  However, it appears that there
	   is a sufficient body of user code that expects non-gang
	   partitioned regions to not execute in gang-redundant mode.
	   So we (a) don't warn about the non-portability and (b) pick
	   the minimum permissible dimension size when there is no
	   partitioned execution.  Otherwise we pick the global
	   default for the dimension, which the user can control.  The
	   same wording and logic applies to num_workers and
	   vector_length, however the worker- or vector- single
	   execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partioning is not 1,
	   the target is probably too confusing.)  */
	dims[ix] = (used & GOMP_DIM_MASK (ix)
		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
	changed = true;
      }

  if (changed)
    {
      /* Replace the attribute with new values.  Rebuild the chain in
	 reverse so the final list order matches the original.  */
      pos = NULL_TREE;
      for (ix = GOMP_DIM_MAX; ix--;)
	pos = tree_cons (purpose[ix],
			 build_int_cst (integer_type_node, dims[ix]), pos);
      oacc_replace_fn_attrib (fn, dims: pos);
    }
}
1030
1031/* Create an empty OpenACC loop structure at LOC. */
1032
1033static oacc_loop *
1034new_oacc_loop_raw (oacc_loop *parent, location_t loc)
1035{
1036 oacc_loop *loop = XCNEW (oacc_loop);
1037
1038 loop->parent = parent;
1039
1040 if (parent)
1041 {
1042 loop->sibling = parent->child;
1043 parent->child = loop;
1044 }
1045
1046 loop->loc = loc;
1047 return loop;
1048}
1049
1050/* Create an outermost, dummy OpenACC loop for offloaded function
1051 DECL. */
1052
1053static oacc_loop *
1054new_oacc_loop_outer (tree decl)
1055{
1056 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
1057}
1058
1059/* Start a new OpenACC loop structure beginning at head marker HEAD.
1060 Link into PARENT loop. Return the new loop. */
1061
1062static oacc_loop *
1063new_oacc_loop (oacc_loop *parent, gcall *marker)
1064{
1065 oacc_loop *loop = new_oacc_loop_raw (parent, loc: gimple_location (g: marker));
1066
1067 loop->marker = marker;
1068
1069 /* TODO: This is where device_type flattening would occur for the loop
1070 flags. */
1071
1072 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
1073
1074 tree chunk_size = integer_zero_node;
1075 if (loop->flags & OLF_GANG_STATIC)
1076 chunk_size = gimple_call_arg (gs: marker, index: 4);
1077 loop->chunk_size = chunk_size;
1078
1079 return loop;
1080}
1081
1082/* Create a dummy loop encompassing a call to a openACC routine.
1083 Extract the routine's partitioning requirements. */
1084
1085static void
1086new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
1087{
1088 oacc_loop *loop = new_oacc_loop_raw (parent, loc: gimple_location (g: call));
1089 int level = oacc_fn_attrib_level (attr: attrs);
1090
1091 gcc_assert (level >= 0);
1092
1093 loop->marker = call;
1094 loop->routine = decl;
1095 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
1096 ^ (GOMP_DIM_MASK (level) - 1));
1097}
1098
1099/* Finish off the current OpenACC loop ending at tail marker TAIL.
1100 Return the parent loop. */
1101
1102static oacc_loop *
1103finish_oacc_loop (oacc_loop *loop)
1104{
1105 /* If the loop has been collapsed, don't partition it. */
1106 if (loop->ifns.is_empty ())
1107 loop->mask = loop->flags = 0;
1108 return loop->parent;
1109}
1110
1111/* Free all OpenACC loop structures within LOOP (inclusive). */
1112
1113static void
1114free_oacc_loop (oacc_loop *loop)
1115{
1116 if (loop->sibling)
1117 free_oacc_loop (loop: loop->sibling);
1118 if (loop->child)
1119 free_oacc_loop (loop: loop->child);
1120
1121 loop->ifns.release ();
1122 free (ptr: loop);
1123}
1124
1125/* Dump out the OpenACC loop head or tail beginning at FROM. */
1126
static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  /* The kind of the marker we start at; the dump stops at the next
     distinct marker of the same kind.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (stream: file, format: "%*s%s-%d:\n", depth * 2, "", title, level);
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (i: gsi);

      if (gimple_call_internal_p (gs: stmt, fn: IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  /* Reached the marker ending this head/tail sequence.  */
	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2);

      /* Advance; when we fall off the end of a block, continue into
	 its single successor (the sequence may span blocks).  */
      gsi_next (i: &gsi);
      while (gsi_end_p (i: gsi))
	gsi = gsi_start_bb (bb: single_succ (bb: gsi_bb (i: gsi)));
    }
}
1155
1156/* Dump OpenACC loop LOOP, its children, and its siblings. */
1157
1158static void
1159dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
1160{
1161 int ix;
1162
1163 fprintf (stream: file, format: "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
1164 loop->flags, loop->mask,
1165 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
1166
1167 if (loop->marker)
1168 print_gimple_stmt (file, loop->marker, depth * 2);
1169
1170 if (loop->routine)
1171 fprintf (stream: file, format: "%*sRoutine %s:%u:%s\n",
1172 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
1173 DECL_SOURCE_LINE (loop->routine),
1174 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
1175
1176 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
1177 if (loop->heads[ix])
1178 dump_oacc_loop_part (file, from: loop->heads[ix], depth, title: "Head", level: ix);
1179 for (ix = GOMP_DIM_MAX; ix--;)
1180 if (loop->tails[ix])
1181 dump_oacc_loop_part (file, from: loop->tails[ix], depth, title: "Tail", level: ix);
1182
1183 if (loop->child)
1184 dump_oacc_loop (file, loop: loop->child, depth: depth + 1);
1185 if (loop->sibling)
1186 dump_oacc_loop (file, loop: loop->sibling, depth);
1187}
1188
1189void debug_oacc_loop (oacc_loop *);
1190
1191/* Dump loops to stderr. */
1192
DEBUG_FUNCTION void
debug_oacc_loop (oacc_loop *loop)
{
  /* Dump the whole hierarchy rooted at LOOP, starting at depth 0.  */
  dump_oacc_loop (stderr, loop, depth: 0);
}
1198
1199/* Provide diagnostics on OpenACC loop LOOP, its children, and its
1200 siblings. */
1201
1202static void
1203inform_oacc_loop (const oacc_loop *loop)
1204{
1205 const char *gang
1206 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1207 const char *worker
1208 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1209 const char *vector
1210 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1211 const char *seq = loop->mask == 0 ? " seq" : "";
1212 const dump_user_location_t loc
1213 = dump_user_location_t::from_location_t (loc: loop->loc);
1214 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1215 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1216 vector, seq);
1217
1218 if (loop->child)
1219 inform_oacc_loop (loop: loop->child);
1220 if (loop->sibling)
1221 inform_oacc_loop (loop: loop->sibling);
1222}
1223
1224/* DFS walk of basic blocks BB onwards, creating OpenACC loop
1225 structures as we go. By construction these loops are properly
1226 nested. */
1227
static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  /* Number of marker calls seen so far in the current head/tail
     sequence (also the index for the next head slot).  */
  int marker = 0;
  /* Number of marker calls still expected in that sequence.  */
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi);
       gsi_next (i: &gsi))
    {
      gimple *stmt = gsi_stmt (i: gsi);

      if (!is_gimple_call (gs: stmt))
	continue;

      gcall *call = as_a <gcall *> (p: stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (gs: call))
	if (tree attrs = oacc_get_fn_attrib (fn: decl))
	  {
	    /* Routine calls must not appear inside a marker
	       sequence.  */
	    gcc_assert (!marker);
	    new_oacc_loop_routine (parent: loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (gs: call))
	continue;

      switch (gimple_call_internal_fn (gs: call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (obj: call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      /* A two-argument marker terminates the sequence: a tail
		 end closes the loop, a head end records where the
		 head sequence finished.  */
	      if (gimple_call_num_args (gs: call) == 2)
		{
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  /* Argument 2 is the total marker count of the
		     sequence; every marker in it repeats that
		     count.  */
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      /* First marker: a head opens a new loop.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (parent: loop, marker: call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads fill slots outside-in, tails fill them
			 inside-out.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  /* An unfinished marker sequence must continue in the (single)
     successor block; follow it without recursing.  */
  if (remaining || marker)
    {
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, bb: e->dest);
}
1328
1329/* LOOP is the first sibling. Reverse the order in place and return
1330 the new first sibling. Recurse to child loops. */
1331
1332static oacc_loop *
1333oacc_loop_sibling_nreverse (oacc_loop *loop)
1334{
1335 oacc_loop *last = NULL;
1336 do
1337 {
1338 if (loop->child)
1339 loop->child = oacc_loop_sibling_nreverse (loop: loop->child);
1340
1341 oacc_loop *next = loop->sibling;
1342 loop->sibling = last;
1343 last = loop;
1344 loop = next;
1345 }
1346 while (loop);
1347
1348 return last;
1349}
1350
1351/* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1352 the current function. */
1353
1354static oacc_loop *
1355oacc_loop_discovery ()
1356{
1357 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1358 in the following. */
1359 clear_bb_flags ();
1360
1361 oacc_loop *top = new_oacc_loop_outer (decl: current_function_decl);
1362 oacc_loop_discover_walk (loop: top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1363
1364 /* The siblings were constructed in reverse order, reverse them so
1365 that diagnostics come out in an unsurprising order. */
1366 top = oacc_loop_sibling_nreverse (loop: top);
1367
1368 return top;
1369}
1370
1371/* Transform the abstract internal function markers starting at FROM
1372 to be for partitioning level LEVEL. Stop when we meet another HEAD
1373 or TAIL marker. */
1374
static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  /* Kind of the marker we start from; the walk ends at the next
     distinct marker of the same kind.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  /* The concrete partitioning level to patch into the abstract
     calls.  */
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (i: gsi);

      if (gimple_call_internal_p (gs: stmt, fn: IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* FORK/JOIN/PRIVATE carry the level in argument 2.  */
	  if (k == IFN_UNIQUE_OACC_FORK
	      || k == IFN_UNIQUE_OACC_JOIN
	      || k == IFN_UNIQUE_OACC_PRIVATE)
	    *gimple_call_arg_ptr (gs: stmt, index: 2) = replacement;
	  else if (k == kind && stmt != from)
	    break;
	}
      /* GOACC_REDUCTION carries the level in argument 3.  */
      else if (gimple_call_internal_p (gs: stmt, fn: IFN_GOACC_REDUCTION))
	*gimple_call_arg_ptr (gs: stmt, index: 3) = replacement;
      update_stmt (s: stmt);

      /* Advance; the sequence may continue in the single successor
	 block.  */
      gsi_next (i: &gsi);
      while (gsi_end_p (i: gsi))
	gsi = gsi_start_bb (bb: single_succ (bb: gsi_bb (i: gsi)));
    }
}
1408
1409/* Process the discovered OpenACC loops, setting the correct
1410 partitioning level etc. */
1411
static void
oacc_loop_process (oacc_loop *loop, int fn_level)
{
  /* Process children first (post-order), so inner loops are finalized
     before their containers.  */
  if (loop->child)
    oacc_loop_process (loop: loop->child, fn_level);

  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      /* Patch the final masks (and chunk size) into the recorded
	 IFN_GOACC_LOOP/TILE calls.  */
      for (ix = 0; loop->ifns.iterate (ix, ptr: &call); ix++)
	{
	  switch (gimple_call_internal_fn (gs: call))
	    {
	    case IFN_GOACC_LOOP:
	      {
		/* Argument 5 set to -1 flags an element-loop call; it
		   receives the element mask instead.  */
		bool is_e = gimple_call_arg (gs: call, index: 5) == integer_minus_one_node;
		gimple_call_set_arg (gs: call, index: 5, arg: is_e ? e_mask_arg : mask_arg);
		if (!is_e)
		  gimple_call_set_arg (gs: call, index: 4, arg: chunk_arg);
	      }
	      break;

	    case IFN_GOACC_TILE:
	      gimple_call_set_arg (gs: call, index: 3, arg: mask_arg);
	      gimple_call_set_arg (gs: call, index: 4, arg: e_mask_arg);
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  update_stmt (s: call);
	}

      /* Map each densely-packed head/tail slot IX to the actual
	 dimension DIM it partitions, and rewrite its markers.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (from: loop->heads[ix], level: dim);
	  oacc_loop_xform_head_tail (from: loop->tails[ix], level: dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop: loop->sibling, fn_level);


  /* OpenACC 2.6, 2.9.11. "reduction clause" places a restriction such that
     "The 'reduction' clause may not be specified on an orphaned 'loop'
     construct with the 'gang' clause, or on an orphaned 'loop' construct that
     will generate gang parallelism in a procedure that is compiled with the
     'routine gang' clause."  */
  if (fn_level == GOMP_DIM_GANG
      && (loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
      && (loop->flags & OLF_REDUCTION))
    error_at (loop->loc,
	      "gang reduction on an orphan loop");
}
1479
1480/* Walk the OpenACC loop heirarchy checking and assigning the
1481 programmer-specified partitionings. OUTER_MASK is the partitioning
1482 this loop is contained within. Return mask of partitioning
1483 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1484 bit. */
1485
static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* The explicitly requested axes, extracted from the loop
	 flags.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
	 loop, or (no more than) single axis tiled loop.  */
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* Explicit axes, 'auto' and 'seq' are mutually exclusive.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
		      : G_("%<auto%> conflicts with other OpenACC loop "
			   "specifiers"));
	  maybe_auto = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}

      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
	{
	  loop->flags |= OLF_AUTO;
	  /* The GOMP_DIM_MAX bit signals the caller that auto loops
	     were discovered; it is consumed in oacc_loop_partition.  */
	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
	}
    }

  if (this_mask & outer_mask)
    {
      /* This loop (or routine call) reuses parallelism already
	 claimed by a containing construct; find the ancestor to blame
	 in the diagnostic.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if ((outer->mask | outer->e_mask) & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			loop->routine
			? G_("routine call uses same OpenACC parallelism"
			     " as containing loop")
			: G_("inner loop uses same OpenACC parallelism"
			     " as containing loop"));
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    /* No ancestor loop claims the axis, so the conflict comes
	       from the containing routine's own level.  */
	    error_at (loop->loc,
		      loop->routine
		      ? G_("routine call uses OpenACC parallelism disallowed"
			   " by containing routine")
		      : G_("loop uses OpenACC parallelism disallowed"
			   " by containing routine"));

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      /* Drop the conflicting axes and carry on.  */
      this_mask &= ~outer_mask;
    }
  else
    {
      unsigned outermost = least_bit_hwi (x: this_mask);

      /* Partitioning must nest coarse-to-fine: this loop's outermost
	 axis must be strictly finer than anything already in use.  */
      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
	 that we put worker there.  The std doesn't contemplate
	 specifying all three.  We choose to put worker and vector on
	 the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (stream: dump_file, format: "Loop %s:%d user specified %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  if (loop->child)
    {
      /* Children may not reuse any axis claimed here or further
	 out.  */
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop: loop->child, outer_mask: tmp_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop: loop->sibling, outer_mask);

  return mask_all;
}
1629
1630/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
1631 OUTER_MASK is the partitioning this loop is contained within.
1632 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1633 Return the cumulative partitioning used by this loop, siblings and
1634 children. */
1635
static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  /* Only loops marked both 'auto' and 'independent' are assigned
     here.  */
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition.  */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything  */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop. */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  if (loop->child)
    {
      /* Recurse with everything claimed so far masked out.  */
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop: loop->child, outer_mask: tmp_mask,
					       outer_assign: outer_assign | assign);
    }

  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (x: this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one use by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (stream: dump_file, format: "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop: loop->sibling,
					     outer_mask, outer_assign);

  /* Report the cumulative partitioning of this subtree to the
     caller.  */
  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
1750
1751/* Walk the OpenACC loop heirarchy to check and assign partitioning
1752 axes. Return mask of partitioning. */
1753
1754static unsigned
1755oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1756{
1757 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1758
1759 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1760 {
1761 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1762 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, outer_assign: false);
1763 }
1764 return mask_all;
1765}
1766
1767/* Default fork/join early expander. Delete the function calls if
1768 there is no RTL expander. */
1769
1770bool
1771default_goacc_fork_join (gcall *ARG_UNUSED (call),
1772 const int *ARG_UNUSED (dims), bool is_fork)
1773{
1774 if (is_fork)
1775 return targetm.have_oacc_fork ();
1776 else
1777 return targetm.have_oacc_join ();
1778}
1779
1780/* Default goacc.reduction early expander.
1781
1782 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1783 If RES_PTR is not integer-zerop:
1784 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1785 TEARDOWN - emit '*RES_PTR = VAR'
1786 If LHS is not NULL
1787 emit 'LHS = VAR' */
1788
void
default_goacc_reduction (gcall *call)
{
  /* Reduction kind: SETUP, INIT, FINI or TEARDOWN (argument 0).  */
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (gs: call);
  tree var = gimple_call_arg (gs: call, index: 2);
  gimple_seq seq = NULL;

  if (code == IFN_GOACC_REDUCTION_SETUP
      || code == IFN_GOACC_REDUCTION_TEARDOWN)
    {
      /* Setup and Teardown need to copy from/to the receiver object,
	 if there is one.  */
      tree ref_to_res = gimple_call_arg (gs: call, index: 1);

      if (!integer_zerop (ref_to_res))
	{
	  tree dst = build_simple_mem_ref (ref_to_res);
	  tree src = var;

	  /* SETUP reads the receiver into LHS; TEARDOWN writes VAR
	     back into it.  */
	  if (code == IFN_GOACC_REDUCTION_SETUP)
	    {
	      src = dst;
	      dst = lhs;
	      /* LHS consumed; suppress the plain copy below.  */
	      lhs = NULL;
	    }
	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
	}
    }

  /* Copy VAR to LHS, if there is an LHS.  */
  if (lhs)
    gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));

  /* Replace the IFN call with the (possibly empty) copy sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
1826
/* Walk context for oacc_rewrite_var_decl.  */
struct var_decl_rewrite_info
{
  gimple *stmt;		/* Statement currently being rewritten.  */
  hash_map<tree, tree> *adjusted_vars;	/* Maps original VAR_DECL to its
					   adjusted replacement.  */
  bool avoid_pointer_conversion;  /* Rewrite ADDR_EXPRs in place instead of
				     emitting conversion statements.  */
  bool modified;	/* Out: set when any replacement was made.  */
};
1834
1835/* Helper function for execute_oacc_device_lower. Rewrite VAR_DECLs (by
1836 themselves or wrapped in various other nodes) according to ADJUSTED_VARS in
1837 the var_decl_rewrite_info pointed to via DATA. Used as part of coercing
1838 gang-private variables in OpenACC offload regions to reside in GPU shared
1839 memory. */
1840
static tree
oacc_rewrite_var_decl (tree *tp, int *walk_subtrees, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *) data;
  var_decl_rewrite_info *info = (var_decl_rewrite_info *) wi->info;

  if (TREE_CODE (*tp) == ADDR_EXPR)
    {
      tree arg = TREE_OPERAND (*tp, 0);
      tree *new_arg = info->adjusted_vars->get (k: arg);

      if (new_arg)
	{
	  if (info->avoid_pointer_conversion)
	    {
	      /* Substitute the address of the replacement variable
		 directly into the expression.  */
	      *tp = build_fold_addr_expr (*new_arg);
	      info->modified = true;
	      *walk_subtrees = 0;
	    }
	  else
	    {
	      /* Take the replacement's address in an SSA temporary,
		 then convert it to the original pointer type, and use
		 that converted value in place of the ADDR_EXPR.  */
	      gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt);
	      tree repl = build_fold_addr_expr (*new_arg);
	      gimple *stmt1
		= gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl);
	      tree conv = convert_to_pointer (TREE_TYPE (*tp),
					      gimple_assign_lhs (gs: stmt1));
	      gimple *stmt2
		= gimple_build_assign (make_ssa_name (TREE_TYPE (*tp)), conv);
	      gsi_insert_before (&gsi, stmt1, GSI_SAME_STMT);
	      gsi_insert_before (&gsi, stmt2, GSI_SAME_STMT);
	      *tp = gimple_assign_lhs (gs: stmt2);
	      info->modified = true;
	      *walk_subtrees = 0;
	    }
	}
    }
  else if (TREE_CODE (*tp) == COMPONENT_REF || TREE_CODE (*tp) == ARRAY_REF)
    {
      /* Find the innermost base of a (possibly nested) component or
	 array reference.  */
      tree *base = &TREE_OPERAND (*tp, 0);

      while (TREE_CODE (*base) == COMPONENT_REF
	     || TREE_CODE (*base) == ARRAY_REF)
	base = &TREE_OPERAND (*base, 0);

      if (TREE_CODE (*base) != VAR_DECL)
	return NULL;

      tree *new_decl = info->adjusted_vars->get (k: *base);
      if (!new_decl)
	return NULL;

      /* The replacement decl's qualifiers (e.g. an address space)
	 must be propagated onto the field and the reference.  */
      int base_quals = TYPE_QUALS (TREE_TYPE (*new_decl));
      tree field = TREE_OPERAND (*tp, 1);

      /* Adjust the type of the field.  */
      int field_quals = TYPE_QUALS (TREE_TYPE (field));
      if (TREE_CODE (field) == FIELD_DECL && field_quals != base_quals)
	{
	  tree *field_type = &TREE_TYPE (field);
	  /* Qualify the element type of array fields, not the array
	     type itself.  */
	  while (TREE_CODE (*field_type) == ARRAY_TYPE)
	    field_type = &TREE_TYPE (*field_type);
	  field_quals |= base_quals;
	  *field_type = build_qualified_type (*field_type, field_quals);
	}

      /* Adjust the type of the component ref itself.  */
      tree comp_type = TREE_TYPE (*tp);
      int comp_quals = TYPE_QUALS (comp_type);
      if (TREE_CODE (*tp) == COMPONENT_REF && comp_quals != base_quals)
	{
	  comp_quals |= base_quals;
	  TREE_TYPE (*tp)
	    = build_qualified_type (comp_type, comp_quals);
	}

      *base = *new_decl;
      info->modified = true;
    }
  else if (VAR_P (*tp))
    {
      /* Plain variable use: substitute directly if mapped.  */
      tree *new_decl = info->adjusted_vars->get (k: *tp);
      if (new_decl)
	{
	  *tp = *new_decl;
	  info->modified = true;
	}
    }

  return NULL_TREE;
}
1932
1933/* Return TRUE if CALL is a call to a builtin atomic/sync operation. */
1934
static bool
is_sync_builtin_call (gcall *call)
{
  tree callee = gimple_call_fndecl (gs: call);

  if (callee != NULL_TREE
      && gimple_call_builtin_p (call, BUILT_IN_NORMAL))
    switch (DECL_FUNCTION_CODE (decl: callee))
      {
	/* Expand sync-builtins.def into one 'case' label per
	   atomic/sync builtin enumerator, all falling through to the
	   'return true'.  */
#undef DEF_SYNC_BUILTIN
#define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
#include "sync-builtins.def"
#undef DEF_SYNC_BUILTIN
	return true;

      default:
	;
      }

  return false;
}
1956
1957/* Main entry point for oacc transformations which run on the device
1958 compiler after LTO, so we know what the target device is at this
1959 point (including the host fallback). */
1960
1961static unsigned int
1962execute_oacc_loop_designation ()
1963{
1964 tree attrs = oacc_get_fn_attrib (fn: current_function_decl);
1965
1966 if (!attrs)
1967 /* Not an offloaded function. */
1968 return 0;
1969
1970 /* Parse the default dim argument exactly once. */
1971 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1972 {
1973 oacc_parse_default_dims (flag_openacc_dims);
1974 flag_openacc_dims = (char *)&flag_openacc_dims;
1975 }
1976
1977 bool is_oacc_parallel
1978 = (lookup_attribute (attr_name: "oacc parallel",
1979 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1980 bool is_oacc_kernels
1981 = (lookup_attribute (attr_name: "oacc kernels",
1982 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1983 bool is_oacc_serial
1984 = (lookup_attribute (attr_name: "oacc serial",
1985 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1986 bool is_oacc_parallel_kernels_parallelized
1987 = (lookup_attribute (attr_name: "oacc parallel_kernels_parallelized",
1988 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1989 bool is_oacc_parallel_kernels_gang_single
1990 = (lookup_attribute (attr_name: "oacc parallel_kernels_gang_single",
1991 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1992 int fn_level = oacc_fn_attrib_level (attr: attrs);
1993 bool is_oacc_routine = (fn_level >= 0);
1994 gcc_checking_assert (is_oacc_parallel
1995 + is_oacc_kernels
1996 + is_oacc_serial
1997 + is_oacc_parallel_kernels_parallelized
1998 + is_oacc_parallel_kernels_gang_single
1999 + is_oacc_routine
2000 == 1);
2001
2002 bool is_oacc_kernels_parallelized
2003 = (lookup_attribute (attr_name: "oacc kernels parallelized",
2004 DECL_ATTRIBUTES (current_function_decl)) != NULL);
2005 if (is_oacc_kernels_parallelized)
2006 gcc_checking_assert (is_oacc_kernels);
2007
2008 if (dump_file)
2009 {
2010 if (is_oacc_parallel)
2011 fprintf (stream: dump_file, format: "Function is OpenACC parallel offload\n");
2012 else if (is_oacc_kernels)
2013 fprintf (stream: dump_file, format: "Function is %s OpenACC kernels offload\n",
2014 (is_oacc_kernels_parallelized
2015 ? "parallelized" : "unparallelized"));
2016 else if (is_oacc_serial)
2017 fprintf (stream: dump_file, format: "Function is OpenACC serial offload\n");
2018 else if (is_oacc_parallel_kernels_parallelized)
2019 fprintf (stream: dump_file, format: "Function is %s OpenACC kernels offload\n",
2020 "parallel_kernels_parallelized");
2021 else if (is_oacc_parallel_kernels_gang_single)
2022 fprintf (stream: dump_file, format: "Function is %s OpenACC kernels offload\n",
2023 "parallel_kernels_gang_single");
2024 else if (is_oacc_routine)
2025 fprintf (stream: dump_file, format: "Function is OpenACC routine level %d\n",
2026 fn_level);
2027 else
2028 gcc_unreachable ();
2029 }
2030
2031 /* This doesn't belong into 'pass_oacc_loop_designation' conceptually, but
2032 it's a convenient place, so... */
2033 if (is_oacc_routine)
2034 {
2035 tree attr = lookup_attribute (attr_name: "omp declare target",
2036 DECL_ATTRIBUTES (current_function_decl));
2037 gcc_checking_assert (attr);
2038 tree clauses = TREE_VALUE (attr);
2039 gcc_checking_assert (clauses);
2040
2041 /* Should this OpenACC routine be discarded? */
2042 bool discard = false;
2043
2044 tree clause_nohost = omp_find_clause (clauses, kind: OMP_CLAUSE_NOHOST);
2045 if (dump_file)
2046 fprintf (stream: dump_file,
2047 format: "OpenACC routine '%s' %s '%s' clause.\n",
2048 lang_hooks.decl_printable_name (current_function_decl, 2),
2049 clause_nohost ? "has" : "doesn't have",
2050 omp_clause_code_name[OMP_CLAUSE_NOHOST]);
2051 /* Host compiler, 'nohost' clause? */
2052#ifndef ACCEL_COMPILER
2053 if (clause_nohost)
2054 discard = true;
2055#endif
2056
2057 if (dump_file)
2058 fprintf (stream: dump_file,
2059 format: "OpenACC routine '%s' %sdiscarded.\n",
2060 lang_hooks.decl_printable_name (current_function_decl, 2),
2061 discard ? "" : "not ");
2062 if (discard)
2063 {
2064 TREE_ASM_WRITTEN (current_function_decl) = 1;
2065 return TODO_discard_function;
2066 }
2067 }
2068
2069 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
2070 kernels, so remove the parallelism dimensions function attributes
2071 potentially set earlier on. */
2072 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
2073 {
2074 oacc_set_fn_attrib (fn: current_function_decl, NULL, NULL);
2075 attrs = oacc_get_fn_attrib (fn: current_function_decl);
2076 }
2077
2078 /* Discover, partition and process the loops. */
2079 oacc_loop *loops = oacc_loop_discovery ();
2080
2081 unsigned outer_mask = 0;
2082 if (is_oacc_routine)
2083 outer_mask = GOMP_DIM_MASK (fn_level) - 1;
2084 unsigned used_mask = oacc_loop_partition (loop: loops, outer_mask);
2085 /* OpenACC kernels constructs are special: they currently don't use the
2086 generic oacc_loop infrastructure and attribute/dimension processing. */
2087 if (is_oacc_kernels && is_oacc_kernels_parallelized)
2088 {
2089 /* Parallelized OpenACC kernels constructs use gang parallelism. See
2090 also tree-parloops.cc:create_parallel_loop. */
2091 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
2092 }
2093
2094 int dims[GOMP_DIM_MAX];
2095 oacc_validate_dims (fn: current_function_decl, attrs, dims, level: fn_level, used: used_mask);
2096
2097 if (dump_file)
2098 {
2099 const char *comma = "Compute dimensions [";
2100 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
2101 fprintf (stream: dump_file, format: "%s%d", comma, dims[ix]);
2102 fprintf (stream: dump_file, format: "]\n");
2103 }
2104
2105 /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
2106 a single gang only. */
2107 if (is_oacc_parallel_kernels_gang_single)
2108 gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);
2109
2110 oacc_loop_process (loop: loops, fn_level);
2111 if (dump_file)
2112 {
2113 fprintf (stream: dump_file, format: "OpenACC loops\n");
2114 dump_oacc_loop (file: dump_file, loop: loops, depth: 0);
2115 fprintf (stream: dump_file, format: "\n");
2116 }
2117 if (dump_enabled_p ())
2118 {
2119 oacc_loop *l = loops;
2120 /* OpenACC kernels constructs are special: they currently don't use the
2121 generic oacc_loop infrastructure. */
2122 if (is_oacc_kernels)
2123 {
2124 /* Create a fake oacc_loop for diagnostic purposes. */
2125 l = new_oacc_loop_raw (NULL,
2126 DECL_SOURCE_LOCATION (current_function_decl));
2127 l->mask = used_mask;
2128 }
2129 else
2130 {
2131 /* Skip the outermost, dummy OpenACC loop */
2132 l = l->child;
2133 }
2134 if (l)
2135 inform_oacc_loop (loop: l);
2136 if (is_oacc_kernels)
2137 free_oacc_loop (loop: l);
2138 }
2139
2140 free_oacc_loop (loop: loops);
2141
2142 return 0;
2143}
2144
/* Main entry point for lowering OpenACC internal functions to
   target-specific code.  Runs in the device (or host-fallback) compiler,
   once the launch dimensions recorded by 'pass_oacc_loop_designation' are
   final.  Lowers IFN_GOACC_TILE / IFN_GOACC_LOOP / IFN_GOACC_REDUCTION and
   the IFN_UNIQUE markers, and applies target-specific adjustments to
   gang-private variables.  Returns 0 (no extra TODOs).  */

static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Read back the per-axis launch dimensions that were validated and
     attached to the function earlier.  */
  int dims[GOMP_DIM_MAX];
  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    dims[i] = oacc_get_fn_dim_size (current_function_decl, i);

  /* Maps each gang-private decl to the replacement decl created by
     'targetm.goacc.adjust_private_decl' (filled while handling
     IFN_UNIQUE_OACC_PRIVATE below, consumed by the rewrite loop at the
     end).  */
  hash_map<tree, tree> adjusted_vars;

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
        gimple *stmt = gsi_stmt (gsi);
        if (!is_gimple_call (stmt))
          {
            gsi_next (&gsi);
            continue;
          }

        gcall *call = as_a <gcall *> (stmt);
        if (!gimple_call_internal_p (call))
          {
            gsi_next (&gsi);
            continue;
          }

        /* Rewind to allow rescan.  */
        gsi_prev (&gsi);
        bool rescan = false, remove = false;
        enum internal_fn ifn_code = gimple_call_internal_fn (call);

        switch (ifn_code)
          {
          default: break;

          case IFN_GOACC_TILE:
            oacc_xform_tile (call);
            rescan = true;
            break;

          case IFN_GOACC_LOOP:
            oacc_xform_loop (call);
            rescan = true;
            break;

          case IFN_GOACC_REDUCTION:
            /* Mark the function for SSA renaming.  */
            mark_virtual_operands_for_renaming (cfun);

            /* If the level is -1, this ended up being an unused
               axis.  Handle as a default.  */
            if (integer_minus_onep (gimple_call_arg (call, 3)))
              default_goacc_reduction (call);
            else
              targetm.goacc.reduction (call);
            rescan = true;
            break;

          case IFN_UNIQUE:
            {
              enum ifn_unique_kind kind
                = ((enum ifn_unique_kind)
                   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

              switch (kind)
                {
                default:
                  break;

                case IFN_UNIQUE_OACC_FORK:
                case IFN_UNIQUE_OACC_JOIN:
                  /* Drop fork/join markers on an unused (-1) axis, and
                     those the target declines to expand itself.  */
                  if (integer_minus_onep (gimple_call_arg (call, 2)))
                    remove = true;
                  else if (!targetm.goacc.fork_join
                           (call, dims, kind == IFN_UNIQUE_OACC_FORK))
                    remove = true;
                  break;

                case IFN_UNIQUE_OACC_HEAD_MARK:
                case IFN_UNIQUE_OACC_TAIL_MARK:
                  remove = true;
                  break;

                case IFN_UNIQUE_OACC_PRIVATE:
                  {
                    dump_flags_t l_dump_flags
                      = get_openacc_privatization_dump_flags ();

                    location_t loc = gimple_location (stmt);
                    if (LOCATION_LOCUS (loc) == UNKNOWN_LOCATION)
                      loc = DECL_SOURCE_LOCATION (current_function_decl);
                    const dump_user_location_t d_u_loc
                      = dump_user_location_t::from_location_t (loc);

                    /* Argument 2 is the parallelism level the privatized
                       variables live at; -1 means "unknown/unused".  */
                    HOST_WIDE_INT level
                      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
                    gcc_checking_assert (level == -1
                                         || (level >= 0
                                             && level < GOMP_DIM_MAX));
                    /* Arguments 3.. are the addresses of the privatized
                       variables.  */
                    for (unsigned i = 3;
                         i < gimple_call_num_args (call);
                         i++)
                      {
                        static char const *const axes[] =
                        /* Must be kept in sync with GOMP_DIM enumeration.  */
                          { "gang", "worker", "vector" };

                        tree arg = gimple_call_arg (call, i);
                        gcc_checking_assert (TREE_CODE (arg) == ADDR_EXPR);
                        tree decl = TREE_OPERAND (arg, 0);
                        if (dump_enabled_p ())
/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
                          dump_printf_loc (l_dump_flags, d_u_loc,
                                           "variable %<%T%> ought to be"
                                           " adjusted for OpenACC"
                                           " privatization level: %qs\n",
                                           decl,
                                           (level == -1
                                            ? "UNKNOWN" : axes[level]));
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
                        bool adjusted;
                        if (level == -1)
                          adjusted = false;
                        else if (!targetm.goacc.adjust_private_decl)
                          adjusted = false;
                        else if (level == GOMP_DIM_VECTOR)
                          {
                            /* That's the default behavior.  */
                            adjusted = true;
                          }
                        else
                          {
                            tree oldtype = TREE_TYPE (decl);
                            tree newdecl
                              = targetm.goacc.adjust_private_decl (loc, decl,
                                                                   level);
                            /* Record the replacement so references can be
                               rewritten in the second walk below.  */
                            adjusted = (TREE_TYPE (newdecl) != oldtype
                                        || newdecl != decl);
                            if (adjusted)
                              adjusted_vars.put (decl, newdecl);
                          }
                        if (adjusted
                            && dump_enabled_p ())
/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
                          dump_printf_loc (l_dump_flags, d_u_loc,
                                           "variable %<%T%> adjusted for"
                                           " OpenACC privatization level:"
                                           " %qs\n",
                                           decl, axes[level]);
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
                      }
                    remove = true;
                  }
                  break;
                }
              break;
            }
          }

        if (gsi_end_p (gsi))
          /* We rewound past the beginning of the BB.  */
          gsi = gsi_start_bb (bb);
        else
          /* Undo the rewind.  */
          gsi_next (&gsi);

        if (remove)
          {
            if (gimple_vdef (call))
              replace_uses_by (gimple_vdef (call), gimple_vuse (call));
            if (gimple_call_lhs (call))
              {
                /* Propagate the data dependency var.  */
                gimple *ass = gimple_build_assign (gimple_call_lhs (call),
                                                   gimple_call_arg (call, 1));
                gsi_replace (&gsi, ass, false);
              }
            else
              gsi_remove (&gsi, true);
          }
        else if (!rescan)
          /* If not rescanning, advance over the call.  */
          gsi_next (&gsi);
      }

  /* Regarding the OpenACC privatization level, we're currently only looking at
     making the gang-private level work.  Regarding that, we have the following
     configurations:

       - GCN offloading: 'targetm.goacc.adjust_private_decl' does the work (in
         particular, change 'TREE_TYPE', etc.) and there is no
         'targetm.goacc.expand_var_decl'.

       - nvptx offloading: 'targetm.goacc.adjust_private_decl' only sets a
         marker and then 'targetm.goacc.expand_var_decl' does the work.

     Eventually (in particular, for worker-private level?), both
     'targetm.goacc.adjust_private_decl' and 'targetm.goacc.expand_var_decl'
     may need to do things, but that's currently not meant to be addressed, and
     thus not fully worked out and implemented, and thus untested.  Hence,
     'assert' what currently is implemented/tested, only.  */

  if (targetm.goacc.expand_var_decl)
    gcc_assert (adjusted_vars.is_empty ());

  /* Make adjustments to gang-private local variables if required by the
     target, e.g. forcing them into a particular address space.  Afterwards,
     ADDR_EXPR nodes which have adjusted variables as their argument need to
     be modified in one of two ways:

       1. They can be recreated, making a pointer to the variable in the new
          address space, or

       2. The address of the variable in the new address space can be taken,
          converted to the default (original) address space, and the result of
          that conversion subsituted in place of the original ADDR_EXPR node.

     Which of these is done depends on the gimple statement being processed.
     At present atomic operations and inline asms use (1), and everything else
     uses (2).  At least on AMD GCN, there are atomic operations that work
     directly in the LDS address space.

     COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use
     the new decl, adjusting types of appropriate tree nodes as necessary.  */

  if (targetm.goacc.adjust_private_decl
      && !adjusted_vars.is_empty ())
    {
      FOR_ALL_BB_FN (bb, cfun)
        for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
             !gsi_end_p (gsi);
             gsi_next (&gsi))
          {
            gimple *stmt = gsi_stmt (gsi);
            walk_stmt_info wi;
            var_decl_rewrite_info info;

            info.avoid_pointer_conversion
              = (is_gimple_call (stmt)
                 && is_sync_builtin_call (as_a <gcall *> (stmt)))
                || gimple_code (stmt) == GIMPLE_ASM;
            info.stmt = stmt;
            info.modified = false;
            info.adjusted_vars = &adjusted_vars;

            memset (&wi, 0, sizeof (wi));
            wi.info = &info;

            walk_gimple_op (stmt, oacc_rewrite_var_decl, &wi);

            if (info.modified)
              update_stmt (stmt);
          }
    }

  return 0;
}
2423
2424/* Default launch dimension validator. Force everything to 1. A
2425 backend that wants to provide larger dimensions must override this
2426 hook. */
2427
2428bool
2429default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
2430 int ARG_UNUSED (fn_level),
2431 unsigned ARG_UNUSED (used))
2432{
2433 bool changed = false;
2434
2435 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
2436 {
2437 if (dims[ix] != 1)
2438 {
2439 dims[ix] = 1;
2440 changed = true;
2441 }
2442 }
2443
2444 return changed;
2445}
2446
2447/* Default dimension bound is unknown on accelerator and 1 on host. */
2448
2449int
2450default_goacc_dim_limit (int ARG_UNUSED (axis))
2451{
2452#ifdef ACCEL_COMPILER
2453 return 0;
2454#else
2455 return 1;
2456#endif
2457}
2458
2459namespace {
2460
/* Pass descriptor for the OpenACC loop-designation pass defined below.  */

const pass_data pass_data_oacc_loop_designation =
{
  GIMPLE_PASS, /* type */
  "oaccloops", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};
2473
/* GIMPLE pass wrapping 'execute_oacc_loop_designation'; only gated in
   for '-fopenacc' compilations.  */

class pass_oacc_loop_designation : public gimple_opt_pass
{
public:
  pass_oacc_loop_designation (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_loop_designation, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override { return flag_openacc; };

  unsigned int execute (function *) final override
  {
    return execute_oacc_loop_designation ();
  }

}; // class pass_oacc_loop_designation
2490
/* Pass descriptor for the OpenACC device-lowering pass defined below.  */

const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};
2503
/* GIMPLE pass wrapping 'execute_oacc_device_lower'; only gated in for
   '-fopenacc' compilations.  */

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override { return flag_openacc; };

  unsigned int execute (function *) final override
  {
    return execute_oacc_device_lower ();
  }

}; // class pass_oacc_device_lower
2520
2521} // anon namespace
2522
/* Instantiate the OpenACC loop-designation pass.  */

gimple_opt_pass *
make_pass_oacc_loop_designation (gcc::context *ctxt)
{
  return new pass_oacc_loop_designation (ctxt);
}
2528
/* Instantiate the OpenACC device-lowering pass.  */

gimple_opt_pass *
make_pass_oacc_device_lower (gcc::context *ctxt)
{
  return new pass_oacc_device_lower (ctxt);
}
2534
2535
/* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
   GOMP_SIMT_ENTER call identifying the privatized variables, which are
   turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
   Set *REGIMPLIFY to true, except if no privatized variables were seen.  */

static void
ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
{
  gimple *alloc_stmt = gsi_stmt (*gsi);
  tree simtrec = gimple_call_lhs (alloc_stmt);
  tree simduid = gimple_call_arg (alloc_stmt, 0);
  gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
  gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
  /* Build an artificial RECORD_TYPE that will gather all the privatized
     variables as fields; SIMTREC becomes a pointer to it.  */
  tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
  TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
  TREE_ADDRESSABLE (rectype) = 1;
  TREE_TYPE (simtrec) = build_pointer_type (rectype);
  /* Arguments 1.. of the GOMP_SIMT_ENTER call are the addresses of the
     privatized variables (or null_pointer_node placeholders).  */
  for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
    {
      tree *argp = gimple_call_arg_ptr (enter_stmt, i);
      if (*argp == null_pointer_node)
	continue;
      gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
		  && VAR_P (TREE_OPERAND (*argp, 0)));
      tree var = TREE_OPERAND (*argp, 0);

      /* Mirror VAR as a field of the record, preserving alignment and
	 volatility.  */
      tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
			       DECL_NAME (var), TREE_TYPE (var));
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);

      insert_field_into_struct (rectype, field);

      /* Redirect all further accesses to VAR through the record, via a
	 DECL_VALUE_EXPR of the form (*simtrec).field.  */
      tree t = build_simple_mem_ref (simtrec);
      t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
      TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
      SET_DECL_VALUE_EXPR (var, t);
      DECL_HAS_VALUE_EXPR_P (var) = 1;
      *regimplify = true;
    }
  layout_type (rectype);
  tree size = TYPE_SIZE_UNIT (rectype);
  tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));

  /* Replace the original alloc call with one that carries the record's
     size and alignment instead of the simduid.  */
  alloc_stmt
    = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
  gimple_call_set_lhs (alloc_stmt, simtrec);
  gsi_replace (gsi, alloc_stmt, false);
  /* The GOMP_SIMT_ENTER call itself degenerates to a plain copy of the
     incoming simduid.  */
  gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
  enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
  gsi_replace (&enter_gsi, enter_stmt, false);

  /* Clobber the record at the matching GOMP_SIMT_EXIT, marking the end of
     its lifetime.  */
  use_operand_p use;
  gimple *exit_stmt;
  if (single_imm_use (simtrec, &use, &exit_stmt))
    {
      gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
      gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
      tree clobber = build_clobber (rectype);
      exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
      gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
    }
  else
    gcc_checking_assert (has_zero_uses (simtrec));
}
2602
2603/* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
2604
2605static tree
2606find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2607{
2608 tree t = *tp;
2609
2610 if (VAR_P (t)
2611 && DECL_HAS_VALUE_EXPR_P (t)
2612 && lookup_attribute (attr_name: "omp simt private", DECL_ATTRIBUTES (t)))
2613 {
2614 *walk_subtrees = 0;
2615 return t;
2616 }
2617 return NULL_TREE;
2618}
2619
/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
   LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
   internal functions on non-SIMT targets, and likewise some SIMD internal
   functions on SIMT targets.  */

static unsigned int
execute_omp_device_lower ()
{
  /* VF == 1 means "not a SIMT target"; all SIMT placeholders fold to
     trivial values in that case.  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  bool calls_declare_variant_alt
    = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
#ifdef ACCEL_COMPILER
  bool omp_redirect_indirect_calls = vec_safe_length (offload_ind_funcs) > 0;
  tree map_ptr_fn
    = builtin_decl_explicit (BUILT_IN_GOMP_TARGET_MAP_INDIRECT_PTR);
#endif
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  continue;
	if (!gimple_call_internal_p (stmt))
	  {
	    /* Non-internal call: resolve 'declare variant' substitutions
	       now that the target is known.  */
	    if (calls_declare_variant_alt)
	      if (tree fndecl = gimple_call_fndecl (stmt))
		{
		  tree new_fndecl = omp_resolve_declare_variant (fndecl);
		  if (new_fndecl != fndecl)
		    {
		      gimple_call_set_fndecl (stmt, new_fndecl);
		      update_stmt (stmt);
		    }
		}
#ifdef ACCEL_COMPILER
	    /* On the accelerator, indirect calls go through a run-time
	       host-to-device address translation.  */
	    if (omp_redirect_indirect_calls
		&& gimple_call_fndecl (stmt) == NULL_TREE)
	      {
		gcall *orig_call = dyn_cast <gcall *> (stmt);
		tree call_fn = gimple_call_fn (stmt);
		tree fn_ty = TREE_TYPE (call_fn);

		if (TREE_CODE (call_fn) == OBJ_TYPE_REF)
		  {
		    tree obj_ref = create_tmp_reg (TREE_TYPE (call_fn),
						   ".ind_fn_objref");
		    gimple *gassign = gimple_build_assign (obj_ref, call_fn);
		    gsi_insert_before (&gsi, gassign, GSI_SAME_STMT);
		    call_fn = obj_ref;
		  }
		tree mapped_fn = create_tmp_reg (fn_ty, ".ind_fn");
		gimple *gcall =
		  gimple_build_call (map_ptr_fn, 1, call_fn);
		gimple_set_location (gcall, gimple_location (stmt));
		gimple_call_set_lhs (gcall, mapped_fn);
		gsi_insert_before (&gsi, gcall, GSI_SAME_STMT);

		gimple_call_set_fn (orig_call, mapped_fn);
		update_stmt (orig_call);
	      }
#endif
	    continue;
	  }
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_TARGET_REV:
	    {
#ifndef ACCEL_COMPILER
	      /* Host side: the next statement is the matching GOMP_target
		 call; swap in the host-callable decl and fix up the
		 offload_funcs table to match.  */
	      gimple_stmt_iterator gsi2 = gsi;
	      gsi_next (&gsi2);
	      gcc_assert (!gsi_end_p (gsi2));
	      gcc_assert (gimple_call_builtin_p (gsi_stmt (gsi2),
						 BUILT_IN_GOMP_TARGET));
	      tree old_decl
		= TREE_OPERAND (gimple_call_arg (gsi_stmt (gsi2), 1), 0);
	      tree new_decl = gimple_call_arg (gsi_stmt (gsi), 0);
	      gimple_call_set_arg (gsi_stmt (gsi2), 1, new_decl);
	      update_stmt (gsi_stmt (gsi2));
	      new_decl = TREE_OPERAND (new_decl, 0);
	      unsigned i;
	      unsigned num_funcs = vec_safe_length (offload_funcs);
	      for (i = 0; i < num_funcs; i++)
		{
		  if ((*offload_funcs)[i] == old_decl)
		    {
		      (*offload_funcs)[i] = new_decl;
		      break;
		    }
		  else if ((*offload_funcs)[i] == new_decl)
		    break; /* This can happen due to inlining.  */
		}
	      gcc_assert (i < num_funcs);
#else
	      tree old_decl = TREE_OPERAND (gimple_call_arg (gsi_stmt (gsi), 0),
					    0);
#endif
	      /* FIXME: Find a way to actually prevent outputting the empty-body
		 old_decl as debug symbol + function in the assembly file.  */
	      cgraph_node *node = cgraph_node::get (old_decl);
	      node->address_taken = false;
	      node->need_lto_streaming = false;
	      node->offloadable = false;

	      unlink_stmt_vdef (stmt);
	    }
	    break;
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    if (vf != 1)
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    /* On SIMT targets these stay for RTL expansion; on others they
	       collapse to plain data flow.  */
	    if (vf != 1)
	      continue;
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    /* SIMD placeholders fold the other way round: they become
	       trivial on SIMT targets.  */
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	/* RHS == NULL_TREE means "keep the call as-is".  */
	if (lhs && !rhs)
	  continue;
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  /* Re-gimplify statements mentioning variables that just gained a
     DECL_VALUE_EXPR in ompdevlow_adjust_simt_enter.  */
  if (regimplify)
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
2795
2796namespace {
2797
/* Pass descriptor for the OpenMP device-lowering pass defined below.  */

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
2810
/* GIMPLE pass wrapping 'execute_omp_device_lower'.  Gated in whenever the
   function has not been lowered for the device yet, or when '-fopenmp'
   'declare variant' / indirect-call redirection still needs doing.  */

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *fun) final override
  {
#ifdef ACCEL_COMPILER
    bool offload_ind_funcs_p = vec_safe_length (offload_ind_funcs) > 0;
#else
    bool offload_ind_funcs_p = false;
#endif
    return (!(fun->curr_properties & PROP_gimple_lomp_dev)
	    || (flag_openmp
		&& (cgraph_node::get (fun->decl)->calls_declare_variant_alt
		    || offload_ind_funcs_p)));
  }
  unsigned int execute (function *) final override
  {
    return execute_omp_device_lower ();
  }

}; // class pass_omp_device_lower
2837
2838} // anon namespace
2839
/* Instantiate the OpenMP device-lowering pass.  */

gimple_opt_pass *
make_pass_omp_device_lower (gcc::context *ctxt)
{
  return new pass_omp_device_lower (ctxt);
}
2845
2846/* "omp declare target link" handling pass. */
2847
2848namespace {
2849
/* Pass descriptor for the "omp declare target link" pass defined below.  */

const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS, /* type */
  "omptargetlink", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
2862
/* GIMPLE pass handling "omp declare target link" variables; only active
   in the offload (accelerator) compiler, for offloaded functions.  */

class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *fun) final override
  {
#ifdef ACCEL_COMPILER
    return offloading_function_p (fun->decl);
#else
    (void) fun;
    return false;
#endif
  }

  unsigned execute (function *) final override;
};
2883
2884/* Callback for walk_gimple_stmt used to scan for link var operands. */
2885
2886static tree
2887find_link_var_op (tree *tp, int *walk_subtrees, void *)
2888{
2889 tree t = *tp;
2890
2891 if (VAR_P (t)
2892 && DECL_HAS_VALUE_EXPR_P (t)
2893 && is_global_var (t)
2894 && lookup_attribute (attr_name: "omp declare target link", DECL_ATTRIBUTES (t)))
2895 {
2896 *walk_subtrees = 0;
2897 return t;
2898 }
2899
2900 return NULL_TREE;
2901}
2902
/* Nullify the host-function argument of GOMP_target_ext calls (not
   meaningful in device code, except for reverse offload), and
   re-gimplify statements referencing "omp declare target link"
   variables so their DECL_VALUE_EXPRs take effect.  */

unsigned
pass_omp_target_link::execute (function *fun)
{
  basic_block bb;
  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  if (gimple_call_builtin_p (gsi_stmt (gsi), BUILT_IN_GOMP_TARGET))
	    {
	      tree dev = gimple_call_arg (gsi_stmt (gsi), 0);
	      tree fn = gimple_call_arg (gsi_stmt (gsi), 1);
	      if (POINTER_TYPE_P (TREE_TYPE (fn)))
		fn = TREE_OPERAND (fn, 0);
	      /* Keep the function argument for reverse-offload
		 ('ancestor:1') target regions.  */
	      if (TREE_CODE (dev) == INTEGER_CST
		  && wi::to_wide (dev) == GOMP_DEVICE_HOST_FALLBACK
		  && lookup_attribute ("omp target device_ancestor_nohost",
				       DECL_ATTRIBUTES (fn)) != NULL_TREE)
		continue; /* ancestor:1 */
	      /* Nullify the second argument of __builtin_GOMP_target_ext.  */
	      gimple_call_set_arg (gsi_stmt (gsi), 1, null_pointer_node);
	      update_stmt (gsi_stmt (gsi));
	    }
	  if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
	    gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	}
    }

  return 0;
}
2934
2935} // anon namespace
2936
/* Instantiate the "omp declare target link" pass.  */

gimple_opt_pass *
make_pass_omp_target_link (gcc::context *ctxt)
{
  return new pass_omp_target_link (ctxt);
}
2942

source code of gcc/omp-offload.cc