/* Bits of OpenMP and OpenACC handling that are specific to device offloading
   and a lowering pass for OpenACC device directives.
3
4 Copyright (C) 2005-2026 Free Software Foundation, Inc.
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "target.h"
27#include "tree.h"
28#include "gimple.h"
29#include "tree-pass.h"
30#include "ssa.h"
31#include "cgraph.h"
32#include "pretty-print.h"
33#include "diagnostic-core.h"
34#include "fold-const.h"
35#include "internal-fn.h"
36#include "langhooks.h"
37#include "gimplify.h"
38#include "gimple-iterator.h"
39#include "gimplify-me.h"
40#include "gimple-walk.h"
41#include "tree-cfg.h"
42#include "tree-into-ssa.h"
43#include "tree-nested.h"
44#include "stor-layout.h"
45#include "common/common-target.h"
46#include "omp-general.h"
47#include "omp-offload.h"
48#include "lto-section-names.h"
49#include "gomp-constants.h"
50#include "gimple-pretty-print.h"
51#include "intl.h"
52#include "stringpool.h"
53#include "attribs.h"
54#include "cfgloop.h"
55#include "context.h"
56#include "convert.h"
57#include "opts.h"
58
/* Describe the OpenACC looping structure of a function.  The entire
   function is held in a 'NULL' loop.  Loops are linked to one another
   through the PARENT, CHILD and SIBLING pointers.  */

struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  gcall *heads[GOMP_DIM_MAX]; /* Head marker functions, GOMP_DIM_MAX slots.  */
  gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions, GOMP_DIM_MAX slots.  */

  tree routine; /* Pseudo-loop enclosing a routine.  */

  unsigned mask; /* Partitioning mask.  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner; /* Partitioning of inner loops.  */
  unsigned flags; /* Partitioning flags.  */
  vec<gcall *> ifns; /* Contained loop abstraction functions.  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
87
88/* Holds offload tables with decls. */
89vec<tree, va_gc> *offload_funcs, *offload_vars, *offload_ind_funcs;
90
91/* Return level at which oacc routine may spawn a partitioned loop, or
92 -1 if it is not a routine (i.e. is an offload fn). */
93
94int
95oacc_fn_attrib_level (tree attr)
96{
97 tree pos = TREE_VALUE (attr);
98
99 if (!TREE_PURPOSE (pos))
100 return -1;
101
102 int ix = 0;
103 for (ix = 0; ix != GOMP_DIM_MAX;
104 ix++, pos = TREE_CHAIN (pos))
105 if (!integer_zerop (TREE_PURPOSE (pos)))
106 break;
107
108 return ix;
109}
110
111/* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
112 adds their addresses and sizes to constructor-vector V_CTOR. */
113
114static void
115add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
116 vec<constructor_elt, va_gc> *v_ctor)
117{
118 unsigned len = vec_safe_length (v: v_decls);
119 for (unsigned i = 0; i < len; i++)
120 {
121 tree it = (*v_decls)[i];
122 bool is_var = VAR_P (it);
123 bool is_link_var
124 = is_var
125#ifdef ACCEL_COMPILER
126 && DECL_HAS_VALUE_EXPR_P (it)
127#endif
128 && lookup_attribute (attr_name: "omp declare target link", DECL_ATTRIBUTES (it));
129
130 /* See also omp_finish_file and output_offload_tables in lto-cgraph.cc. */
131 if (!in_lto_p && !symtab_node::get (decl: it))
132 continue;
133
134 tree size = NULL_TREE;
135 if (is_var)
136 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
137
138 tree addr;
139 if (!is_link_var)
140 addr = build_fold_addr_expr (it);
141 else
142 {
143#ifdef ACCEL_COMPILER
144 /* For "omp declare target link" vars add address of the pointer to
145 the target table, instead of address of the var. */
146 tree value_expr = DECL_VALUE_EXPR (it);
147 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
148 varpool_node::finalize_decl (link_ptr_decl);
149 addr = build_fold_addr_expr (link_ptr_decl);
150#else
151 addr = build_fold_addr_expr (it);
152#endif
153
154 /* Most significant bit of the size marks "omp declare target link"
155 vars in host and target tables. */
156 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
157 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
158 * BITS_PER_UNIT - 1);
159 size = wide_int_to_tree (const_ptr_type_node, cst: isize);
160 }
161
162 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
163 if (is_var)
164 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
165 }
166}
167
168/* Return true if DECL is a function for which its references should be
169 analyzed. */
170
171static bool
172omp_declare_target_fn_p (tree decl)
173{
174 return (TREE_CODE (decl) == FUNCTION_DECL
175 && lookup_attribute (attr_name: "omp declare target", DECL_ATTRIBUTES (decl))
176 && !lookup_attribute (attr_name: "omp declare target host",
177 DECL_ATTRIBUTES (decl))
178 && (!flag_openacc
179 || oacc_get_fn_attrib (fn: decl) == NULL_TREE));
180}
181
182/* Return true if DECL Is a variable for which its initializer references
183 should be analyzed. */
184
185static bool
186omp_declare_target_var_p (tree decl)
187{
188 return (VAR_P (decl)
189 && lookup_attribute (attr_name: "omp declare target", DECL_ATTRIBUTES (decl))
190 && !lookup_attribute (attr_name: "omp declare target link",
191 DECL_ATTRIBUTES (decl)));
192}
193
194/* Helper function for omp_discover_implicit_declare_target, called through
195 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
196 declare target to. */
197
198static tree
199omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
200{
201 if (TREE_CODE (*tp) == CALL_EXPR
202 && CALL_EXPR_FN (*tp)
203 && TREE_CODE (CALL_EXPR_FN (*tp)) == ADDR_EXPR
204 && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp), 0)) == FUNCTION_DECL
205 && lookup_attribute (attr_name: "omp declare variant base",
206 DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp),
207 0))))
208 {
209 tree fn = TREE_OPERAND (CALL_EXPR_FN (*tp), 0);
210 for (tree attr = DECL_ATTRIBUTES (fn); attr; attr = TREE_CHAIN (attr))
211 {
212 attr = lookup_attribute (attr_name: "omp declare variant base", list: attr);
213 if (attr == NULL_TREE)
214 break;
215 tree purpose = TREE_PURPOSE (TREE_VALUE (attr));
216 if (TREE_CODE (purpose) == FUNCTION_DECL)
217 omp_discover_declare_target_tgt_fn_r (tp: &purpose, walk_subtrees, data);
218 }
219 }
220 else if (TREE_CODE (*tp) == FUNCTION_DECL)
221 {
222 tree decl = *tp;
223 tree id = get_identifier ("omp declare target");
224 symtab_node *node = symtab_node::get (decl: *tp);
225 if (node != NULL)
226 {
227 while (node->alias_target
228 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
229 {
230 if (!omp_declare_target_fn_p (decl: node->decl)
231 && !lookup_attribute (attr_name: "omp declare target host",
232 DECL_ATTRIBUTES (node->decl)))
233 {
234 node->offloadable = 1;
235 DECL_ATTRIBUTES (node->decl)
236 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
237 }
238 node = symtab_node::get (decl: node->alias_target);
239 }
240 symtab_node *new_node = node->ultimate_alias_target ();
241 decl = new_node->decl;
242 while (node != new_node)
243 {
244 if (!omp_declare_target_fn_p (decl: node->decl)
245 && !lookup_attribute (attr_name: "omp declare target host",
246 DECL_ATTRIBUTES (node->decl)))
247 {
248 node->offloadable = 1;
249 DECL_ATTRIBUTES (node->decl)
250 = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
251 }
252 gcc_assert (node->alias && node->analyzed);
253 node = node->get_alias_target ();
254 }
255 node->offloadable = 1;
256 if (ENABLE_OFFLOADING)
257 g->have_offload = true;
258 }
259 if (omp_declare_target_fn_p (decl)
260 || lookup_attribute (attr_name: "omp declare target host",
261 DECL_ATTRIBUTES (decl)))
262 return NULL_TREE;
263
264 if (DECL_SAVED_TREE (decl)
265 && (!DECL_EXTERNAL (decl) || DECL_DECLARED_INLINE_P (decl)))
266 ((vec<tree> *) data)->safe_push (obj: decl);
267 DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
268 DECL_ATTRIBUTES (decl));
269 }
270 else if (TYPE_P (*tp))
271 *walk_subtrees = 0;
272 else if (TREE_CODE (*tp) == OMP_TARGET)
273 {
274 tree c = omp_find_clause (OMP_CLAUSES (*tp), kind: OMP_CLAUSE_DEVICE);
275 if (c && OMP_CLAUSE_DEVICE_ANCESTOR (c))
276 *walk_subtrees = 0;
277 }
278 return NULL_TREE;
279}
280
281/* Similarly, but ignore references outside of OMP_TARGET regions. */
282
283static tree
284omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
285{
286 if (TREE_CODE (*tp) == OMP_TARGET)
287 {
288 tree c = omp_find_clause (OMP_CLAUSES (*tp), kind: OMP_CLAUSE_DEVICE);
289 if (!c || !OMP_CLAUSE_DEVICE_ANCESTOR (c))
290 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
291 omp_discover_declare_target_tgt_fn_r,
292 data);
293 *walk_subtrees = 0;
294 }
295 else if (TYPE_P (*tp))
296 *walk_subtrees = 0;
297 return NULL_TREE;
298}
299
300/* Helper function for omp_discover_implicit_declare_target, called through
301 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
302 declare target to. */
303
304static tree
305omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
306{
307 if (TREE_CODE (*tp) == FUNCTION_DECL)
308 return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
309 else if (VAR_P (*tp)
310 && is_global_var (t: *tp)
311 && !omp_declare_target_var_p (decl: *tp))
312 {
313 tree id = get_identifier ("omp declare target");
314 if (lookup_attribute (attr_name: "omp declare target link", DECL_ATTRIBUTES (*tp)))
315 {
316 error_at (DECL_SOURCE_LOCATION (*tp),
317 "%qD specified both in declare target %<link%> and "
318 "implicitly in %<to%> clauses", *tp);
319 DECL_ATTRIBUTES (*tp)
320 = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
321 }
322 if (TREE_STATIC (*tp) && lang_hooks.decls.omp_get_decl_init (*tp))
323 ((vec<tree> *) data)->safe_push (obj: *tp);
324 DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
325 symtab_node *node = symtab_node::get (decl: *tp);
326 if (node != NULL && !node->offloadable)
327 {
328 node->offloadable = 1;
329 if (ENABLE_OFFLOADING)
330 {
331 g->have_offload = true;
332 if (is_a <varpool_node *> (p: node))
333 vec_safe_push (v&: offload_vars, obj: node->decl);
334 }
335 }
336 }
337 else if (TYPE_P (*tp))
338 *walk_subtrees = 0;
339 return NULL_TREE;
340}
341
342/* Perform the OpenMP implicit declare target to discovery. */
343
344void
345omp_discover_implicit_declare_target (void)
346{
347 cgraph_node *node;
348 varpool_node *vnode;
349 auto_vec<tree> worklist;
350
351 FOR_EACH_DEFINED_FUNCTION (node)
352 if (DECL_SAVED_TREE (node->decl))
353 {
354 struct cgraph_node *cgn;
355 if (lookup_attribute (attr_name: "omp declare target indirect",
356 DECL_ATTRIBUTES (node->decl)))
357 vec_safe_push (v&: offload_ind_funcs, obj: node->decl);
358 if (omp_declare_target_fn_p (decl: node->decl))
359 worklist.safe_push (obj: node->decl);
360 else if (DECL_STRUCT_FUNCTION (node->decl)
361 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
362 worklist.safe_push (obj: node->decl);
363 for (cgn = first_nested_function (node);
364 cgn; cgn = next_nested_function (node: cgn))
365 if (omp_declare_target_fn_p (decl: cgn->decl))
366 worklist.safe_push (obj: cgn->decl);
367 else if (DECL_STRUCT_FUNCTION (cgn->decl)
368 && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
369 worklist.safe_push (obj: cgn->decl);
370 }
371 FOR_EACH_VARIABLE (vnode)
372 if (lang_hooks.decls.omp_get_decl_init (vnode->decl)
373 && omp_declare_target_var_p (decl: vnode->decl))
374 worklist.safe_push (obj: vnode->decl);
375 while (!worklist.is_empty ())
376 {
377 tree decl = worklist.pop ();
378 if (VAR_P (decl))
379 walk_tree_without_duplicates (lang_hooks.decls.omp_get_decl_init (decl),
380 omp_discover_declare_target_var_r,
381 &worklist);
382 else if (omp_declare_target_fn_p (decl))
383 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
384 omp_discover_declare_target_tgt_fn_r,
385 &worklist);
386 else
387 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
388 omp_discover_declare_target_fn_r,
389 &worklist);
390 }
391
392 lang_hooks.decls.omp_finish_decl_inits ();
393}
394
395
396/* Create new symbols containing (address, size) pairs for global variables,
397 marked with "omp declare target" attribute, as well as addresses for the
398 functions, which are outlined offloading regions. */
399void
400omp_finish_file (void)
401{
402 unsigned num_funcs = vec_safe_length (v: offload_funcs);
403 unsigned num_vars = vec_safe_length (v: offload_vars);
404 unsigned num_ind_funcs = vec_safe_length (v: offload_ind_funcs);
405
406 if (num_funcs == 0 && num_vars == 0 && num_ind_funcs == 0)
407 return;
408
409 if (targetm_common.have_named_sections)
410 {
411 vec<constructor_elt, va_gc> *v_f, *v_v, *v_if;
412 vec_alloc (v&: v_f, nelems: num_funcs);
413 vec_alloc (v&: v_v, nelems: num_vars * 2);
414 vec_alloc (v&: v_if, nelems: num_ind_funcs);
415
416 add_decls_addresses_to_decl_constructor (v_decls: offload_funcs, v_ctor: v_f);
417 add_decls_addresses_to_decl_constructor (v_decls: offload_vars, v_ctor: v_v);
418 add_decls_addresses_to_decl_constructor (v_decls: offload_ind_funcs, v_ctor: v_if);
419
420 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
421 vec_safe_length (v: v_v));
422 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
423 num_funcs);
424 tree ind_funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
425 num_ind_funcs);
426
427 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
428 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
429 SET_TYPE_ALIGN (ind_funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
430 tree ctor_v = build_constructor (vars_decl_type, v_v);
431 tree ctor_f = build_constructor (funcs_decl_type, v_f);
432 tree ctor_if = build_constructor (ind_funcs_decl_type, v_if);
433 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = TREE_CONSTANT (ctor_if) = 1;
434 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = TREE_STATIC (ctor_if) = 1;
435 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
436 get_identifier (".offload_func_table"),
437 funcs_decl_type);
438 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
439 get_identifier (".offload_var_table"),
440 vars_decl_type);
441 tree ind_funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
442 get_identifier (".offload_ind_func_table"),
443 ind_funcs_decl_type);
444 TREE_STATIC (funcs_decl) = TREE_STATIC (ind_funcs_decl) = 1;
445 TREE_STATIC (vars_decl) = 1;
446 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
447 otherwise a joint table in a binary will contain padding between
448 tables from multiple object files. */
449 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (ind_funcs_decl) = 1;
450 DECL_USER_ALIGN (vars_decl) = 1;
451 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
452 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
453 SET_DECL_ALIGN (ind_funcs_decl, TYPE_ALIGN (ind_funcs_decl_type));
454 DECL_INITIAL (funcs_decl) = ctor_f;
455 DECL_INITIAL (vars_decl) = ctor_v;
456 DECL_INITIAL (ind_funcs_decl) = ctor_if;
457 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
458 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
459 set_decl_section_name (ind_funcs_decl,
460 OFFLOAD_IND_FUNC_TABLE_SECTION_NAME);
461 varpool_node::finalize_decl (decl: vars_decl);
462 varpool_node::finalize_decl (decl: funcs_decl);
463 varpool_node::finalize_decl (decl: ind_funcs_decl);
464 }
465 else
466 {
467 for (unsigned i = 0; i < num_funcs; i++)
468 {
469 tree it = (*offload_funcs)[i];
470 /* See also add_decls_addresses_to_decl_constructor
471 and output_offload_tables in lto-cgraph.cc. */
472 if (!in_lto_p && !symtab_node::get (decl: it))
473 continue;
474 targetm.record_offload_symbol (it);
475 }
476 for (unsigned i = 0; i < num_vars; i++)
477 {
478 tree it = (*offload_vars)[i];
479 if (!in_lto_p && !symtab_node::get (decl: it))
480 continue;
481#ifdef ACCEL_COMPILER
482 if (DECL_HAS_VALUE_EXPR_P (it)
483 && lookup_attribute ("omp declare target link",
484 DECL_ATTRIBUTES (it)))
485 {
486 tree value_expr = DECL_VALUE_EXPR (it);
487 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
488 targetm.record_offload_symbol (link_ptr_decl);
489 varpool_node::finalize_decl (link_ptr_decl);
490 }
491 else
492#endif
493 targetm.record_offload_symbol (it);
494 }
495 for (unsigned i = 0; i < num_ind_funcs; i++)
496 {
497 tree it = (*offload_ind_funcs)[i];
498 /* See also add_decls_addresses_to_decl_constructor
499 and output_offload_tables in lto-cgraph.cc. */
500 if (!in_lto_p && !symtab_node::get (decl: it))
501 continue;
502 targetm.record_offload_symbol (it);
503 }
504 }
505}
506
507/* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
508 axis DIM. Return a tmp var holding the result. */
509
510static tree
511oacc_dim_call (bool pos, int dim, gimple_seq *seq)
512{
513 tree arg = build_int_cst (unsigned_type_node, dim);
514 tree size = create_tmp_var (integer_type_node);
515 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
516 gimple *call = gimple_build_call_internal (fn, 1, arg);
517
518 gimple_call_set_lhs (gs: call, lhs: size);
519 gimple_seq_add_stmt (seq, call);
520
521 return size;
522}
523
524/* Find the number of threads (POS = false), or thread number (POS =
525 true) for an OpenACC region partitioned as MASK. Setup code
526 required for the calculation is added to SEQ. */
527
528static tree
529oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
530{
531 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
532 unsigned ix;
533
534 /* Start at gang level, and examine relevant dimension indices. */
535 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
536 if (GOMP_DIM_MASK (ix) & mask)
537 {
538 if (res)
539 {
540 /* We had an outer index, so scale that by the size of
541 this dimension. */
542 tree n = oacc_dim_call (pos: false, dim: ix, seq);
543 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
544 }
545 if (pos)
546 {
547 /* Determine index in this dimension. */
548 tree id = oacc_dim_call (pos: true, dim: ix, seq);
549 if (res)
550 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
551 else
552 res = id;
553 }
554 }
555
556 if (res == NULL_TREE)
557 res = integer_zero_node;
558
559 return res;
560}
561
562/* Transform IFN_GOACC_LOOP calls to actual code. See
563 expand_oacc_for for where these are generated. At the vector
564 level, we stride loops, such that each member of a warp will
565 operate on adjacent iterations. At the worker and gang level,
566 each gang/warp executes a set of contiguous iterations. Chunking
567 can override this such that each iteration engine executes a
568 contiguous chunk, and then moves on to stride to the next chunk. */
569
570static void
571oacc_xform_loop (gcall *call)
572{
573 gimple_stmt_iterator gsi = gsi_for_stmt (call);
574 enum ifn_goacc_loop_kind code
575 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
576 tree dir = gimple_call_arg (gs: call, index: 1);
577 tree range = gimple_call_arg (gs: call, index: 2);
578 tree step = gimple_call_arg (gs: call, index: 3);
579 tree chunk_size = NULL_TREE;
580 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
581 tree lhs = gimple_call_lhs (gs: call);
582 tree type = NULL_TREE;
583 tree diff_type = TREE_TYPE (range);
584 tree r = NULL_TREE;
585 gimple_seq seq = NULL;
586 bool chunking = false, striding = true;
587 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
588 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
589
590 /* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
591 if (!lhs)
592 {
593 gsi_replace_with_seq (&gsi, seq, true);
594 return;
595 }
596
597 type = TREE_TYPE (lhs);
598
599#ifdef ACCEL_COMPILER
600 chunk_size = gimple_call_arg (call, 4);
601 if (integer_minus_onep (chunk_size) /* Force static allocation. */
602 || integer_zerop (chunk_size)) /* Default (also static). */
603 {
604 /* If we're at the gang level, we want each to execute a
605 contiguous run of iterations. Otherwise we want each element
606 to stride. */
607 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
608 chunking = false;
609 }
610 else
611 {
612 /* Chunk of size 1 is striding. */
613 striding = integer_onep (chunk_size);
614 chunking = !striding;
615 }
616#endif
617
618 /* striding=true, chunking=true
619 -> invalid.
620 striding=true, chunking=false
621 -> chunks=1
622 striding=false,chunking=true
623 -> chunks=ceil (range/(chunksize*threads*step))
624 striding=false,chunking=false
625 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
626 push_gimplify_context (in_ssa: true);
627
628 switch (code)
629 {
630 default: gcc_unreachable ();
631
632 case IFN_GOACC_LOOP_CHUNKS:
633 if (!chunking)
634 r = build_int_cst (type, 1);
635 else
636 {
637 /* chunk_max
638 = (range - dir) / (chunks * step * num_threads) + dir */
639 tree per = oacc_thread_numbers (pos: false, mask, seq: &seq);
640 per = fold_convert (type, per);
641 chunk_size = fold_convert (type, chunk_size);
642 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
643 per = fold_build2 (MULT_EXPR, type, per, step);
644 r = build2 (MINUS_EXPR, type, range, dir);
645 r = build2 (PLUS_EXPR, type, r, per);
646 r = build2 (TRUNC_DIV_EXPR, type, r, per);
647 }
648 break;
649
650 case IFN_GOACC_LOOP_STEP:
651 {
652 /* If striding, step by the entire compute volume, otherwise
653 step by the inner volume. */
654 unsigned volume = striding ? mask : inner_mask;
655
656 r = oacc_thread_numbers (pos: false, mask: volume, seq: &seq);
657 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
658 }
659 break;
660
661 case IFN_GOACC_LOOP_OFFSET:
662 /* Enable vectorization on non-SIMT targets. */
663 if (!targetm.simt.vf
664 && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
665 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
666 the loop. */
667 && (flag_tree_loop_vectorize
668 || !OPTION_SET_P (flag_tree_loop_vectorize)))
669 {
670 basic_block bb = gsi_bb (i: gsi);
671 class loop *parent = bb->loop_father;
672 class loop *body = parent->inner;
673
674 parent->force_vectorize = true;
675 parent->safelen = INT_MAX;
676
677 /* "Chunking loops" may have inner loops. */
678 if (parent->inner)
679 {
680 body->force_vectorize = true;
681 body->safelen = INT_MAX;
682 }
683
684 cfun->has_force_vectorize_loops = true;
685 }
686 if (striding)
687 {
688 r = oacc_thread_numbers (pos: true, mask, seq: &seq);
689 r = fold_convert (diff_type, r);
690 }
691 else
692 {
693 tree inner_size = oacc_thread_numbers (pos: false, mask: inner_mask, seq: &seq);
694 tree outer_size = oacc_thread_numbers (pos: false, mask: outer_mask, seq: &seq);
695 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
696 inner_size, outer_size);
697
698 volume = fold_convert (diff_type, volume);
699 if (chunking)
700 chunk_size = fold_convert (diff_type, chunk_size);
701 else
702 {
703 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
704
705 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
706 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
707 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
708 }
709
710 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
711 fold_convert (diff_type, inner_size));
712 r = oacc_thread_numbers (pos: true, mask: outer_mask, seq: &seq);
713 r = fold_convert (diff_type, r);
714 r = build2 (MULT_EXPR, diff_type, r, span);
715
716 tree inner = oacc_thread_numbers (pos: true, mask: inner_mask, seq: &seq);
717 inner = fold_convert (diff_type, inner);
718 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
719
720 if (chunking)
721 {
722 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
723 tree per
724 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
725 per = build2 (MULT_EXPR, diff_type, per, chunk);
726
727 r = build2 (PLUS_EXPR, diff_type, r, per);
728 }
729 }
730 r = fold_build2 (MULT_EXPR, diff_type, r, step);
731 if (type != diff_type)
732 r = fold_convert (type, r);
733 break;
734
735 case IFN_GOACC_LOOP_BOUND:
736 if (striding)
737 r = range;
738 else
739 {
740 tree inner_size = oacc_thread_numbers (pos: false, mask: inner_mask, seq: &seq);
741 tree outer_size = oacc_thread_numbers (pos: false, mask: outer_mask, seq: &seq);
742 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
743 inner_size, outer_size);
744
745 volume = fold_convert (diff_type, volume);
746 if (chunking)
747 chunk_size = fold_convert (diff_type, chunk_size);
748 else
749 {
750 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
751
752 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
753 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
754 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
755 }
756
757 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
758 fold_convert (diff_type, inner_size));
759
760 r = fold_build2 (MULT_EXPR, diff_type, span, step);
761
762 tree offset = gimple_call_arg (gs: call, index: 6);
763 r = build2 (PLUS_EXPR, diff_type, r,
764 fold_convert (diff_type, offset));
765 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
766 diff_type, r, range);
767 }
768 if (diff_type != type)
769 r = fold_convert (type, r);
770 break;
771 }
772
773 gimplify_assign (lhs, r, &seq);
774
775 pop_gimplify_context (NULL);
776
777 gsi_replace_with_seq (&gsi, seq, true);
778}
779
780/* Transform a GOACC_TILE call. Determines the element loop span for
781 the specified loop of the nest. This is 1 if we're not tiling.
782
783 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
784
785static void
786oacc_xform_tile (gcall *call)
787{
788 gimple_stmt_iterator gsi = gsi_for_stmt (call);
789 unsigned collapse = tree_to_uhwi (gimple_call_arg (gs: call, index: 0));
790 /* Inner loops have higher loop_nos. */
791 unsigned loop_no = tree_to_uhwi (gimple_call_arg (gs: call, index: 1));
792 tree tile_size = gimple_call_arg (gs: call, index: 2);
793 unsigned e_mask = tree_to_uhwi (gimple_call_arg (gs: call, index: 4));
794 tree lhs = gimple_call_lhs (gs: call);
795 tree type = TREE_TYPE (lhs);
796 gimple_seq seq = NULL;
797 tree span = build_int_cst (type, 1);
798
799 gcc_assert (!(e_mask
800 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
801 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
802 push_gimplify_context (in_ssa: !seen_error ());
803
804#ifndef ACCEL_COMPILER
805 /* Partitioning disabled on host compilers. */
806 e_mask = 0;
807#endif
808 if (!e_mask)
809 /* Not paritioning. */
810 span = integer_one_node;
811 else if (!integer_zerop (tile_size))
812 /* User explicitly specified size. */
813 span = tile_size;
814 else
815 {
816 /* Pick a size based on the paritioning of the element loop and
817 the number of loop nests. */
818 tree first_size = NULL_TREE;
819 tree second_size = NULL_TREE;
820
821 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
822 first_size = oacc_dim_call (pos: false, GOMP_DIM_VECTOR, seq: &seq);
823 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
824 second_size = oacc_dim_call (pos: false, GOMP_DIM_WORKER, seq: &seq);
825
826 if (!first_size)
827 {
828 first_size = second_size;
829 second_size = NULL_TREE;
830 }
831
832 if (loop_no + 1 == collapse)
833 {
834 span = first_size;
835 if (!loop_no && second_size)
836 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
837 span, second_size);
838 }
839 else if (loop_no + 2 == collapse)
840 span = second_size;
841 else
842 span = NULL_TREE;
843
844 if (!span)
845 /* There's no obvious element size for this loop. Options
846 are 1, first_size or some non-unity constant (32 is my
847 favourite). We should gather some statistics. */
848 span = first_size;
849 }
850
851 span = fold_convert (type, span);
852 gimplify_assign (lhs, span, &seq);
853
854 pop_gimplify_context (NULL);
855
856 gsi_replace_with_seq (&gsi, seq, true);
857}
858
/* Default partitioned and minimum partitioned dimensions, one entry per
   axis below GOMP_DIM_MAX.  oacc_parse_default_dims resets every default
   to -1 and every minimum to 1 before any further adjustment; the values
   are read back through oacc_get_default_dim and oacc_get_min_dim.  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
863
864int
865oacc_get_default_dim (int dim)
866{
867 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
868 return oacc_default_dims[dim];
869}
870
871int
872oacc_get_min_dim (int dim)
873{
874 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
875 return oacc_min_dims[dim];
876}
877
878/* Parse the default dimension parameter. This is a set of
879 :-separated optional compute dimensions. Each specified dimension
880 is a positive integer. When device type support is added, it is
881 planned to be a comma separated list of such compute dimensions,
882 with all but the first prefixed by the colon-terminated device
883 type. */
884
885static void
886oacc_parse_default_dims (const char *dims)
887{
888 int ix;
889
890 for (ix = GOMP_DIM_MAX; ix--;)
891 {
892 oacc_default_dims[ix] = -1;
893 oacc_min_dims[ix] = 1;
894 }
895
896#ifndef ACCEL_COMPILER
897 /* Cannot be overridden on the host. */
898 dims = NULL;
899#endif
900 if (dims)
901 {
902 const char *pos = dims;
903
904 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
905 {
906 if (ix)
907 {
908 if (*pos != ':')
909 goto malformed;
910 pos++;
911 }
912
913 if (*pos != ':')
914 {
915 long val;
916 const char *eptr;
917
918 errno = 0;
919 val = strtol (nptr: pos, endptr: const_cast<char **> (&eptr), base: 10);
920 if (errno || val <= 0 || (int) val != val)
921 goto malformed;
922 pos = eptr;
923 oacc_default_dims[ix] = (int) val;
924 }
925 }
926 if (*pos)
927 {
928 malformed:
929 error_at (UNKNOWN_LOCATION,
930 "%<-fopenacc-dim%> operand is malformed at %qs", pos);
931 }
932 }
933
934 /* Allow the backend to validate the dimensions. */
935 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
936 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
937}
938
939/* Validate and update the dimensions for offloaded FN. ATTRS is the
940 raw attribute. DIMS is an array of dimensions, which is filled in.
941 LEVEL is the partitioning level of a routine, or -1 for an offload
942 region itself. USED is the mask of partitioned execution in the
943 function. */
944
945static void
946oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
947{
948 tree purpose[GOMP_DIM_MAX];
949 unsigned ix;
950 tree pos = TREE_VALUE (attrs);
951
952 /* Make sure the attribute creator attached the dimension
953 information. */
954 gcc_assert (pos);
955
956 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
957 {
958 purpose[ix] = TREE_PURPOSE (pos);
959 tree val = TREE_VALUE (pos);
960 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
961 pos = TREE_CHAIN (pos);
962 }
963
964 bool check = true;
965#ifdef ACCEL_COMPILER
966 check = false;
967#endif
968 if (check
969 && warn_openacc_parallelism
970 && !lookup_attribute (attr_name: "oacc kernels", DECL_ATTRIBUTES (fn)))
971 {
972 static char const *const axes[] =
973 /* Must be kept in sync with GOMP_DIM enumeration. */
974 { "gang", "worker", "vector" };
975 for (ix = level >= 0 ? level : 0; ix != GOMP_DIM_MAX; ix++)
976 if (dims[ix] < 0)
977 ; /* Defaulting axis. */
978 else if ((used & GOMP_DIM_MASK (ix)) && dims[ix] == 1)
979 /* There is partitioned execution, but the user requested a
980 dimension size of 1. They're probably confused. */
981 warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
982 "region contains %s partitioned code but"
983 " is not %s partitioned", axes[ix], axes[ix]);
984 else if (!(used & GOMP_DIM_MASK (ix)) && dims[ix] != 1)
985 /* The dimension is explicitly partitioned to non-unity, but
986 no use is made within the region. */
987 warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
988 "region is %s partitioned but"
989 " does not contain %s partitioned code",
990 axes[ix], axes[ix]);
991 }
992
993 bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
994
995 /* Default anything left to 1 or a partitioned default. */
996 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
997 if (dims[ix] < 0)
998 {
999 /* The OpenACC spec says 'If the [num_gangs] clause is not
1000 specified, an implementation-defined default will be used;
1001 the default may depend on the code within the construct.'
1002 (2.5.6). Thus an implementation is free to choose
1003 non-unity default for a parallel region that doesn't have
1004 any gang-partitioned loops. However, it appears that there
1005 is a sufficient body of user code that expects non-gang
1006 partitioned regions to not execute in gang-redundant mode.
1007 So we (a) don't warn about the non-portability and (b) pick
1008 the minimum permissible dimension size when there is no
1009 partitioned execution. Otherwise we pick the global
1010 default for the dimension, which the user can control. The
1011 same wording and logic applies to num_workers and
1012 vector_length, however the worker- or vector- single
1013 execution doesn't have the same impact as gang-redundant
1014 execution. (If the minimum gang-level partioning is not 1,
1015 the target is probably too confusing.) */
1016 dims[ix] = (used & GOMP_DIM_MASK (ix)
1017 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
1018 changed = true;
1019 }
1020
1021 if (changed)
1022 {
1023 /* Replace the attribute with new values. */
1024 pos = NULL_TREE;
1025 for (ix = GOMP_DIM_MAX; ix--;)
1026 pos = tree_cons (purpose[ix],
1027 build_int_cst (integer_type_node, dims[ix]), pos);
1028 oacc_replace_fn_attrib (fn, dims: pos);
1029 }
1030}
1031
1032/* Create an empty OpenACC loop structure at LOC. */
1033
1034static oacc_loop *
1035new_oacc_loop_raw (oacc_loop *parent, location_t loc)
1036{
1037 oacc_loop *loop = XCNEW (oacc_loop);
1038
1039 loop->parent = parent;
1040
1041 if (parent)
1042 {
1043 loop->sibling = parent->child;
1044 parent->child = loop;
1045 }
1046
1047 loop->loc = loc;
1048 return loop;
1049}
1050
1051/* Create an outermost, dummy OpenACC loop for offloaded function
1052 DECL. */
1053
1054static oacc_loop *
1055new_oacc_loop_outer (tree decl)
1056{
1057 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
1058}
1059
1060/* Start a new OpenACC loop structure beginning at head marker HEAD.
1061 Link into PARENT loop. Return the new loop. */
1062
1063static oacc_loop *
1064new_oacc_loop (oacc_loop *parent, gcall *marker)
1065{
1066 oacc_loop *loop = new_oacc_loop_raw (parent, loc: gimple_location (g: marker));
1067
1068 loop->marker = marker;
1069
1070 /* TODO: This is where device_type flattening would occur for the loop
1071 flags. */
1072
1073 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
1074
1075 tree chunk_size = integer_zero_node;
1076 if (loop->flags & OLF_GANG_STATIC)
1077 chunk_size = gimple_call_arg (gs: marker, index: 4);
1078 loop->chunk_size = chunk_size;
1079
1080 return loop;
1081}
1082
1083/* Create a dummy loop encompassing a call to a openACC routine.
1084 Extract the routine's partitioning requirements. */
1085
1086static void
1087new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
1088{
1089 oacc_loop *loop = new_oacc_loop_raw (parent, loc: gimple_location (g: call));
1090 int level = oacc_fn_attrib_level (attr: attrs);
1091
1092 gcc_assert (level >= 0);
1093
1094 loop->marker = call;
1095 loop->routine = decl;
1096 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
1097 ^ (GOMP_DIM_MASK (level) - 1));
1098}
1099
1100/* Finish off the current OpenACC loop ending at tail marker TAIL.
1101 Return the parent loop. */
1102
1103static oacc_loop *
1104finish_oacc_loop (oacc_loop *loop)
1105{
1106 /* If the loop has been collapsed, don't partition it. */
1107 if (loop->ifns.is_empty ())
1108 loop->mask = loop->flags = 0;
1109 return loop->parent;
1110}
1111
1112/* Free all OpenACC loop structures within LOOP (inclusive). */
1113
1114static void
1115free_oacc_loop (oacc_loop *loop)
1116{
1117 if (loop->sibling)
1118 free_oacc_loop (loop: loop->sibling);
1119 if (loop->child)
1120 free_oacc_loop (loop: loop->child);
1121
1122 loop->ifns.release ();
1123 free (ptr: loop);
1124}
1125
1126/* Dump out the OpenACC loop head or tail beginning at FROM. */
1127
1128static void
1129dump_oacc_loop_part (FILE *file, gcall *from, int depth,
1130 const char *title, int level)
1131{
1132 enum ifn_unique_kind kind
1133 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1134
1135 fprintf (stream: file, format: "%*s%s-%d:\n", depth * 2, "", title, level);
1136 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1137 {
1138 gimple *stmt = gsi_stmt (i: gsi);
1139
1140 if (gimple_call_internal_p (gs: stmt, fn: IFN_UNIQUE))
1141 {
1142 enum ifn_unique_kind k
1143 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
1144 (gimple_call_arg (stmt, 0)));
1145
1146 if (k == kind && stmt != from)
1147 break;
1148 }
1149 print_gimple_stmt (file, stmt, depth * 2 + 2);
1150
1151 gsi_next (i: &gsi);
1152 while (gsi_end_p (i: gsi))
1153 gsi = gsi_start_bb (bb: single_succ (bb: gsi_bb (i: gsi)));
1154 }
1155}
1156
1157/* Dump OpenACC loop LOOP, its children, and its siblings. */
1158
1159static void
1160dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
1161{
1162 int ix;
1163
1164 fprintf (stream: file, format: "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
1165 loop->flags, loop->mask,
1166 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
1167
1168 if (loop->marker)
1169 print_gimple_stmt (file, loop->marker, depth * 2);
1170
1171 if (loop->routine)
1172 fprintf (stream: file, format: "%*sRoutine %s:%u:%s\n",
1173 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
1174 DECL_SOURCE_LINE (loop->routine),
1175 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
1176
1177 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
1178 if (loop->heads[ix])
1179 dump_oacc_loop_part (file, from: loop->heads[ix], depth, title: "Head", level: ix);
1180 for (ix = GOMP_DIM_MAX; ix--;)
1181 if (loop->tails[ix])
1182 dump_oacc_loop_part (file, from: loop->tails[ix], depth, title: "Tail", level: ix);
1183
1184 if (loop->child)
1185 dump_oacc_loop (file, loop: loop->child, depth: depth + 1);
1186 if (loop->sibling)
1187 dump_oacc_loop (file, loop: loop->sibling, depth);
1188}
1189
1190void debug_oacc_loop (oacc_loop *);
1191
1192/* Dump loops to stderr. */
1193
1194DEBUG_FUNCTION void
1195debug_oacc_loop (oacc_loop *loop)
1196{
1197 dump_oacc_loop (stderr, loop, depth: 0);
1198}
1199
1200/* Provide diagnostics on OpenACC loop LOOP, its children, and its
1201 siblings. */
1202
1203static void
1204inform_oacc_loop (const oacc_loop *loop)
1205{
1206 const char *gang
1207 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1208 const char *worker
1209 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1210 const char *vector
1211 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1212 const char *seq = loop->mask == 0 ? " seq" : "";
1213 const dump_user_location_t loc
1214 = dump_user_location_t::from_location_t (loc: loop->loc);
1215 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1216 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1217 vector, seq);
1218
1219 if (loop->child)
1220 inform_oacc_loop (loop: loop->child);
1221 if (loop->sibling)
1222 inform_oacc_loop (loop: loop->sibling);
1223}
1224
/* DFS walk of basic blocks BB onwards, creating OpenACC loop
   structures as we go.  By construction these loops are properly
   nested.

   LOOP is the loop structure that is current on entry to BB.  Blocks
   already flagged BB_VISITED are skipped.  Within a block, calls to
   OpenACC routines get a dummy child loop, IFN_GOACC_LOOP and
   IFN_GOACC_TILE calls are recorded on the current loop, and
   IFN_UNIQUE head/tail marker calls open and close loop
   structures.  */

static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  /* Number of marker calls seen so far in the head or tail sequence
     currently being scanned; zero when not inside such a sequence.  */
  int marker = 0;
  /* Number of marker calls of the current sequence still expected,
     as announced by argument 2 of the marker calls.  */
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi);
       gsi_next (i: &gsi))
    {
      gimple *stmt = gsi_stmt (i: gsi);

      if (!is_gimple_call (gs: stmt))
        continue;

      gcall *call = as_a <gcall *> (p: stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (gs: call))
        if (tree attrs = oacc_get_fn_attrib (fn: decl))
          {
            /* Routine calls are not expected inside a marker
               sequence.  */
            gcc_assert (!marker);
            new_oacc_loop_routine (parent: loop, call, decl, attrs);
          }

      if (!gimple_call_internal_p (gs: call))
        continue;

      switch (gimple_call_internal_fn (gs: call))
        {
        default:
          break;

        case IFN_GOACC_LOOP:
        case IFN_GOACC_TILE:
          /* Record the abstraction function, so we can manipulate it
             later.  */
          loop->ifns.safe_push (obj: call);
          break;

        case IFN_UNIQUE:
          enum ifn_unique_kind kind
            = (enum ifn_unique_kind) (TREE_INT_CST_LOW
                                      (gimple_call_arg (call, 0)));
          if (kind == IFN_UNIQUE_OACC_HEAD_MARK
              || kind == IFN_UNIQUE_OACC_TAIL_MARK)
            {
              if (gimple_call_num_args (gs: call) == 2)
                {
                  /* A two-argument marker closes the sequence: every
                     announced marker must have been seen by now.  */
                  gcc_assert (marker && !remaining);
                  marker = 0;
                  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
                    /* End of the tail sequence: pop back to the
                       parent loop.  */
                    loop = finish_oacc_loop (loop);
                  else
                    /* End of the head sequence: remember where the
                       head finishes.  */
                    loop->head_end = call;
                }
              else
                {
                  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

                  if (!marker)
                    {
                      /* First marker of a sequence.  A head sequence
                         opens a new loop nested in the current
                         one.  */
                      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
                        loop = new_oacc_loop (parent: loop, marker: call);
                      remaining = count;
                    }
                  /* Each marker must announce exactly the number
                     still outstanding.  */
                  gcc_assert (count == remaining);
                  if (remaining)
                    {
                      remaining--;
                      /* Heads are stored in order of appearance, tails
                         by the number still to come, i.e. in the
                         opposite order.  */
                      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
                        loop->heads[marker] = call;
                      else
                        loop->tails[remaining] = call;
                    }
                  marker++;
                }
            }
        }
    }
  if (remaining || marker)
    {
      /* The block ended in the middle of a marker sequence.  Continue
         scanning in the single successor, which must be reachable
         only from here and not have been visited yet, keeping the
         sequence state.  */
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, bb: e->dest);
}
1329
1330/* LOOP is the first sibling. Reverse the order in place and return
1331 the new first sibling. Recurse to child loops. */
1332
1333static oacc_loop *
1334oacc_loop_sibling_nreverse (oacc_loop *loop)
1335{
1336 oacc_loop *last = NULL;
1337 do
1338 {
1339 if (loop->child)
1340 loop->child = oacc_loop_sibling_nreverse (loop: loop->child);
1341
1342 oacc_loop *next = loop->sibling;
1343 loop->sibling = last;
1344 last = loop;
1345 loop = next;
1346 }
1347 while (loop);
1348
1349 return last;
1350}
1351
1352/* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1353 the current function. */
1354
1355static oacc_loop *
1356oacc_loop_discovery ()
1357{
1358 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1359 in the following. */
1360 clear_bb_flags ();
1361
1362 oacc_loop *top = new_oacc_loop_outer (decl: current_function_decl);
1363 oacc_loop_discover_walk (loop: top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1364
1365 /* The siblings were constructed in reverse order, reverse them so
1366 that diagnostics come out in an unsurprising order. */
1367 top = oacc_loop_sibling_nreverse (loop: top);
1368
1369 return top;
1370}
1371
1372/* Transform the abstract internal function markers starting at FROM
1373 to be for partitioning level LEVEL. Stop when we meet another HEAD
1374 or TAIL marker. */
1375
1376static void
1377oacc_loop_xform_head_tail (gcall *from, int level)
1378{
1379 enum ifn_unique_kind kind
1380 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1381 tree replacement = build_int_cst (unsigned_type_node, level);
1382
1383 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1384 {
1385 gimple *stmt = gsi_stmt (i: gsi);
1386
1387 if (gimple_call_internal_p (gs: stmt, fn: IFN_UNIQUE))
1388 {
1389 enum ifn_unique_kind k
1390 = ((enum ifn_unique_kind)
1391 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1392
1393 if (k == IFN_UNIQUE_OACC_FORK
1394 || k == IFN_UNIQUE_OACC_JOIN
1395 || k == IFN_UNIQUE_OACC_PRIVATE)
1396 *gimple_call_arg_ptr (gs: stmt, index: 2) = replacement;
1397 else if (k == kind && stmt != from)
1398 break;
1399 }
1400 else if (gimple_call_internal_p (gs: stmt, fn: IFN_GOACC_REDUCTION))
1401 *gimple_call_arg_ptr (gs: stmt, index: 3) = replacement;
1402 update_stmt (s: stmt);
1403
1404 gsi_next (i: &gsi);
1405 while (gsi_end_p (i: gsi))
1406 gsi = gsi_start_bb (bb: single_succ (bb: gsi_bb (i: gsi)));
1407 }
1408}
1409
1410/* Process the discovered OpenACC loops, setting the correct
1411 partitioning level etc. */
1412
1413static void
1414oacc_loop_process (oacc_loop *loop, int fn_level)
1415{
1416 if (loop->child)
1417 oacc_loop_process (loop: loop->child, fn_level);
1418
1419 if (loop->mask && !loop->routine)
1420 {
1421 int ix;
1422 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1423 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1424 tree chunk_arg = loop->chunk_size;
1425 gcall *call;
1426
1427 for (ix = 0; loop->ifns.iterate (ix, ptr: &call); ix++)
1428 {
1429 switch (gimple_call_internal_fn (gs: call))
1430 {
1431 case IFN_GOACC_LOOP:
1432 {
1433 bool is_e = gimple_call_arg (gs: call, index: 5) == integer_minus_one_node;
1434 gimple_call_set_arg (gs: call, index: 5, arg: is_e ? e_mask_arg : mask_arg);
1435 if (!is_e)
1436 gimple_call_set_arg (gs: call, index: 4, arg: chunk_arg);
1437 }
1438 break;
1439
1440 case IFN_GOACC_TILE:
1441 gimple_call_set_arg (gs: call, index: 3, arg: mask_arg);
1442 gimple_call_set_arg (gs: call, index: 4, arg: e_mask_arg);
1443 break;
1444
1445 default:
1446 gcc_unreachable ();
1447 }
1448 update_stmt (s: call);
1449 }
1450
1451 unsigned dim = GOMP_DIM_GANG;
1452 unsigned mask = loop->mask | loop->e_mask;
1453 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1454 {
1455 while (!(GOMP_DIM_MASK (dim) & mask))
1456 dim++;
1457
1458 oacc_loop_xform_head_tail (from: loop->heads[ix], level: dim);
1459 oacc_loop_xform_head_tail (from: loop->tails[ix], level: dim);
1460
1461 mask ^= GOMP_DIM_MASK (dim);
1462 }
1463 }
1464
1465 if (loop->sibling)
1466 oacc_loop_process (loop: loop->sibling, fn_level);
1467
1468
1469 /* OpenACC 2.6, 2.9.11. "reduction clause" places a restriction such that
1470 "The 'reduction' clause may not be specified on an orphaned 'loop'
1471 construct with the 'gang' clause, or on an orphaned 'loop' construct that
1472 will generate gang parallelism in a procedure that is compiled with the
1473 'routine gang' clause." */
1474 if (fn_level == GOMP_DIM_GANG
1475 && (loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1476 && (loop->flags & OLF_REDUCTION))
1477 error_at (loop->loc,
1478 "gang reduction on an orphan loop");
1479}
1480
/* Walk the OpenACC loop hierarchy checking and assigning the
   programmer-specified partitionings.  OUTER_MASK is the partitioning
   this loop is contained within.  Return mask of partitioning
   encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
   bit.

   LOOP, its children and its following siblings are all processed.
   Conflicting requests are diagnosed (unless this is the accelerator
   compiler) and the offending axes are dropped from the loop's
   mask.  */

static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  For now
     the accelerator compiler stays silent.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* The explicitly requested axes are encoded in the loop flags,
         starting at bit OLF_DIM_BASE.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
                   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
         loop, or (no more than) single axis tiled loop.
         'this_mask & -this_mask' isolates the lowest set bit, so the
         comparison holds for a tiled loop exactly when at most one
         axis was requested.  */
      bool maybe_auto
        = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* At most one of: explicit axes, 'auto', 'seq'.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
        {
          if (noisy)
            error_at (loop->loc,
                      seq_par
                      ? G_("%<seq%> overrides other OpenACC loop specifiers")
                      : G_("%<auto%> conflicts with other OpenACC loop "
                           "specifiers"));
          maybe_auto = false;
          loop->flags &= ~OLF_AUTO;
          if (seq_par)
            {
              /* 'seq' wins: forget any explicitly requested axes.  */
              loop->flags
                &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
              this_mask = 0;
            }
        }

      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
        {
          /* Mark the loop for auto partitioning and tell the caller,
             via the GOMP_DIM_MAX bit, that such a loop exists.  */
          loop->flags |= OLF_AUTO;
          mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
        }
    }

  if (this_mask & outer_mask)
    {
      /* This loop asks for an axis that is already taken.  Look for
         the enclosing loop that uses it; if there is none, OUTER is
         NULL and the restriction is reported as coming from the
         containing routine.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
        if ((outer->mask | outer->e_mask) & this_mask)
          break;

      if (noisy)
        {
          if (outer)
            {
              error_at (loop->loc,
                        loop->routine
                        ? G_("routine call uses same OpenACC parallelism"
                             " as containing loop")
                        : G_("inner loop uses same OpenACC parallelism"
                             " as containing loop"));
              inform (outer->loc, "containing loop here");
            }
          else
            error_at (loop->loc,
                      loop->routine
                      ? G_("routine call uses OpenACC parallelism disallowed"
                           " by containing routine")
                      : G_("loop uses OpenACC parallelism disallowed"
                           " by containing routine"));

          if (loop->routine)
            inform (DECL_SOURCE_LOCATION (loop->routine),
                    "routine %qD declared here", loop->routine);
        }
      /* Drop the axes that are not available.  */
      this_mask &= ~outer_mask;
    }
  else
    {
      /* No axis is shared with the containing partitioning, but the
         outermost axis requested here must still be strictly inside
         every axis used outside.  */
      unsigned outermost = least_bit_hwi (x: this_mask);

      if (outermost && outermost <= outer_mask)
        {
          if (noisy)
            {
              error_at (loop->loc,
                        "incorrectly nested OpenACC loop parallelism");

              /* Walk outwards past enclosing loops whose flags are
                 nonzero but below OUTERMOST, and point at the loop
                 where the walk stops.  */
              const oacc_loop *outer;
              for (outer = loop->parent;
                   outer->flags && outer->flags < outermost;
                   outer = outer->parent)
                continue;
              inform (outer->loc, "containing loop here");
            }

          this_mask &= ~outermost;
        }
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
         that we put worker there.  The std doesn't contemplate
         specifying all three.  We choose to put worker and vector on
         the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
        this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      /* Whatever went to the element loop is removed from the tile
         loop's own mask.  */
      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (stream: dump_file, format: "Loop %s:%d user specified %d & %d\n",
             LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
             loop->mask, loop->e_mask);

  if (loop->child)
    {
      /* Children see everything used outside plus what this loop
         itself now uses.  The partitioning found inside is recorded
         in LOOP->inner.  */
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop: loop->child, outer_mask: tmp_mask);
      mask_all |= loop->inner;
    }

  /* Siblings share this loop's containing partitioning.  */
  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop: loop->sibling, outer_mask);

  return mask_all;
}
1630
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.

   Only loops flagged both OLF_AUTO and OLF_INDEPENDENT are assigned
   axes here.  Assignment happens in up to two steps: an outer axis is
   chosen before the children are visited, and an inner axis is chosen
   afterwards, once the partitioning used inside is known.  */

static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
                           bool outer_assign)
{
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  For now
     the accelerator compiler stays silent.  */
  noisy = false;
#endif

  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
         non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition.  */
      while (this_mask <= outer_mask)
        this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything  */
      if (tiling && !(loop->mask | loop->e_mask))
        this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop.  */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
        {
          /* If we got two axes, allocate the inner one to the element
             loop.  'this_mask & (this_mask << 1)' is nonzero only when
             two adjacent bits are set, and then selects the upper
             (inner) one.  */
          loop->e_mask = this_mask & (this_mask << 1);
          this_mask ^= loop->e_mask;
        }

      loop->mask |= this_mask;
    }

  if (loop->child)
    {
      /* Visit the children with everything assigned so far marked as
         taken, and record what they end up using.  */
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop: loop->child, outer_mask: tmp_mask,
                                               outer_assign: outer_assign | assign);
    }

  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
         that we do this even if we already assigned this loop the
         outermost available level above.  That way we'll partition
         this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.
         The GOMP_DIM_MAX bit acts as a sentinel for the case where
         nothing inside is partitioned.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (x: this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one used by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
         too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
        {
          this_mask &= ~(loop->e_mask | loop->mask);
          unsigned tile_mask = ((this_mask >> 1)
                                & ~(outer_mask | loop->e_mask | loop->mask));

          if (tile_mask || loop->mask)
            {
              /* The axis found goes to the element loop; the tile
                 loop takes the next one out, if there is one.  */
              loop->e_mask |= this_mask;
              this_mask = tile_mask;
            }
          if (!loop->e_mask && noisy)
            warning_at (loop->loc, 0,
                        "insufficient partitioning available"
                        " to parallelize element loop");
        }

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
        warning_at (loop->loc, 0,
                    tiling
                    ? G_("insufficient partitioning available"
                         " to parallelize tile loop")
                    : G_("insufficient partitioning available"
                         " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (stream: dump_file, format: "Auto loop %s:%d assigned %d & %d\n",
             LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
             loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  /* Siblings share this loop's containing partitioning.  */
  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop: loop->sibling,
                                             outer_mask, outer_assign);

  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
1751
1752/* Walk the OpenACC loop heirarchy to check and assign partitioning
1753 axes. Return mask of partitioning. */
1754
1755static unsigned
1756oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1757{
1758 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1759
1760 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1761 {
1762 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1763 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, outer_assign: false);
1764 }
1765 return mask_all;
1766}
1767
1768/* Default fork/join early expander. Delete the function calls if
1769 there is no RTL expander. */
1770
1771bool
1772default_goacc_fork_join (gcall *ARG_UNUSED (call),
1773 const int *ARG_UNUSED (dims), bool is_fork)
1774{
1775 if (is_fork)
1776 return targetm.have_oacc_fork ();
1777 else
1778 return targetm.have_oacc_join ();
1779}
1780
1781/* Default goacc.reduction early expander.
1782
1783 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1784 If RES_PTR is not integer-zerop:
1785 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1786 TEARDOWN - emit '*RES_PTR = VAR'
1787 If LHS is not NULL
1788 emit 'LHS = VAR' */
1789
1790void
1791default_goacc_reduction (gcall *call)
1792{
1793 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1794 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1795 tree lhs = gimple_call_lhs (gs: call);
1796 tree var = gimple_call_arg (gs: call, index: 2);
1797 gimple_seq seq = NULL;
1798
1799 if (code == IFN_GOACC_REDUCTION_SETUP
1800 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1801 {
1802 /* Setup and Teardown need to copy from/to the receiver object,
1803 if there is one. */
1804 tree ref_to_res = gimple_call_arg (gs: call, index: 1);
1805
1806 if (!integer_zerop (ref_to_res))
1807 {
1808 tree dst = build_simple_mem_ref (ref_to_res);
1809 tree src = var;
1810
1811 if (code == IFN_GOACC_REDUCTION_SETUP)
1812 {
1813 src = dst;
1814 dst = lhs;
1815 lhs = NULL;
1816 }
1817 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1818 }
1819 }
1820
1821 /* Copy VAR to LHS, if there is an LHS. */
1822 if (lhs)
1823 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1824
1825 gsi_replace_with_seq (&gsi, seq, true);
1826}
1827
/* State shared with oacc_rewrite_var_decl, which reaches it through
   the 'info' field of the walk_stmt_info it is handed.  */

struct var_decl_rewrite_info
{
  /* The statement being rewritten; address-conversion statements are
     inserted in front of it.  */
  gimple *stmt;

  /* Maps each variable that is to be replaced to its replacement.  */
  hash_map<tree, tree> *adjusted_vars;

  /* If true, the address of a replaced variable is substituted in
     place; otherwise separate statements are emitted that take the
     replacement's address and convert it to the original pointer
     type.  */
  bool avoid_pointer_conversion;

  /* Set to true by oacc_rewrite_var_decl whenever it rewrites
     something.  */
  bool modified;
};
1835
/* Helper function for execute_oacc_device_lower.  Rewrite VAR_DECLs (by
   themselves or wrapped in various other nodes) according to ADJUSTED_VARS in
   the var_decl_rewrite_info pointed to via DATA.  Used as part of coercing
   gang-private variables in OpenACC offload regions to reside in GPU shared
   memory.

   TP points at the operand under inspection and is updated in place.
   *WALK_SUBTREES is cleared when an ADDR_EXPR has been replaced.
   DATA is a walk_stmt_info whose 'info' field is the
   var_decl_rewrite_info.  The 'modified' flag of the latter is set
   whenever something is rewritten.  The return value is always
   NULL.  */

static tree
oacc_rewrite_var_decl (tree *tp, int *walk_subtrees, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *) data;
  var_decl_rewrite_info *info = (var_decl_rewrite_info *) wi->info;

  if (TREE_CODE (*tp) == ADDR_EXPR)
    {
      /* Address of a variable: see whether that variable has a
         replacement.  */
      tree arg = TREE_OPERAND (*tp, 0);
      tree *new_arg = info->adjusted_vars->get (k: arg);

      if (new_arg)
        {
          if (info->avoid_pointer_conversion)
            {
              /* Simply take the address of the replacement.  */
              *tp = build_fold_addr_expr (*new_arg);
              info->modified = true;
              *walk_subtrees = 0;
            }
          else
            {
              /* Emit, in front of the statement being rewritten, one
                 statement that takes the replacement's address and a
                 second one that converts it to the type of the
                 original address expression.  The operand then
                 becomes the result of the second statement.  */
              gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt);
              tree repl = build_fold_addr_expr (*new_arg);
              gimple *stmt1
                = gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl);
              tree conv = convert_to_pointer (TREE_TYPE (*tp),
                                              gimple_assign_lhs (gs: stmt1));
              gimple *stmt2
                = gimple_build_assign (make_ssa_name (TREE_TYPE (*tp)), conv);
              gsi_insert_before (&gsi, stmt1, GSI_SAME_STMT);
              gsi_insert_before (&gsi, stmt2, GSI_SAME_STMT);
              *tp = gimple_assign_lhs (gs: stmt2);
              info->modified = true;
              *walk_subtrees = 0;
            }
        }
    }
  else if (TREE_CODE (*tp) == COMPONENT_REF || TREE_CODE (*tp) == ARRAY_REF)
    {
      /* Strip nested component and array references to find the
         object that is ultimately accessed.  */
      tree *base = &TREE_OPERAND (*tp, 0);

      while (TREE_CODE (*base) == COMPONENT_REF
             || TREE_CODE (*base) == ARRAY_REF)
        base = &TREE_OPERAND (*base, 0);

      if (TREE_CODE (*base) != VAR_DECL)
        return NULL;

      tree *new_decl = info->adjusted_vars->get (k: *base);
      if (!new_decl)
        return NULL;

      /* The qualifiers of the replacement's type are merged into the
         types adjusted below.  */
      int base_quals = TYPE_QUALS (TREE_TYPE (*new_decl));
      tree field = TREE_OPERAND (*tp, 1);

      /* Adjust the type of the field.  For a field of array type it
         is the innermost element type that gets requalified.  Note
         that this modifies the FIELD_DECL itself.  */
      int field_quals = TYPE_QUALS (TREE_TYPE (field));
      if (TREE_CODE (field) == FIELD_DECL && field_quals != base_quals)
        {
          tree *field_type = &TREE_TYPE (field);
          while (TREE_CODE (*field_type) == ARRAY_TYPE)
            field_type = &TREE_TYPE (*field_type);
          field_quals |= base_quals;
          *field_type = build_qualified_type (*field_type, field_quals);
        }

      /* Adjust the type of the component ref itself.  */
      tree comp_type = TREE_TYPE (*tp);
      int comp_quals = TYPE_QUALS (comp_type);
      if (TREE_CODE (*tp) == COMPONENT_REF && comp_quals != base_quals)
        {
          comp_quals |= base_quals;
          TREE_TYPE (*tp)
            = build_qualified_type (comp_type, comp_quals);
        }

      /* Finally switch the reference over to the replacement.  */
      *base = *new_decl;
      info->modified = true;
    }
  else if (VAR_P (*tp))
    {
      /* A bare variable: substitute the replacement, if any.  */
      tree *new_decl = info->adjusted_vars->get (k: *tp);
      if (new_decl)
        {
          *tp = *new_decl;
          info->modified = true;
        }
    }

  return NULL_TREE;
}
1933
/* Return TRUE if CALL is a call to a builtin atomic/sync operation,
   that is, a BUILT_IN_NORMAL builtin whose function code is one of
   those listed in sync-builtins.def.  */

static bool
is_sync_builtin_call (gcall *call)
{
  tree callee = gimple_call_fndecl (gs: call);

  if (callee != NULL_TREE
      && gimple_call_builtin_p (call, BUILT_IN_NORMAL))
    switch (DECL_FUNCTION_CODE (decl: callee))
      {
	/* Expand every DEF_SYNC_BUILTIN entry of sync-builtins.def
	   into a case label; all of them share the 'return true'
	   below.  */
#undef DEF_SYNC_BUILTIN
#define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
#include "sync-builtins.def"
#undef DEF_SYNC_BUILTIN
	return true;

      default:
	;
      }

  return false;
}
1957
1958/* Main entry point for oacc transformations which run on the device
1959 compiler after LTO, so we know what the target device is at this
1960 point (including the host fallback). */
1961
1962static unsigned int
1963execute_oacc_loop_designation ()
1964{
1965 tree attrs = oacc_get_fn_attrib (fn: current_function_decl);
1966
1967 if (!attrs)
1968 /* Not an offloaded function. */
1969 return 0;
1970
1971 /* Parse the default dim argument exactly once. */
1972 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1973 {
1974 oacc_parse_default_dims (flag_openacc_dims);
1975 flag_openacc_dims = (char *)&flag_openacc_dims;
1976 }
1977
1978 bool is_oacc_parallel
1979 = (lookup_attribute (attr_name: "oacc parallel",
1980 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1981 bool is_oacc_kernels
1982 = (lookup_attribute (attr_name: "oacc kernels",
1983 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1984 bool is_oacc_serial
1985 = (lookup_attribute (attr_name: "oacc serial",
1986 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1987 bool is_oacc_parallel_kernels_parallelized
1988 = (lookup_attribute (attr_name: "oacc parallel_kernels_parallelized",
1989 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1990 bool is_oacc_parallel_kernels_gang_single
1991 = (lookup_attribute (attr_name: "oacc parallel_kernels_gang_single",
1992 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1993 int fn_level = oacc_fn_attrib_level (attr: attrs);
1994 bool is_oacc_routine = (fn_level >= 0);
1995 gcc_checking_assert (is_oacc_parallel
1996 + is_oacc_kernels
1997 + is_oacc_serial
1998 + is_oacc_parallel_kernels_parallelized
1999 + is_oacc_parallel_kernels_gang_single
2000 + is_oacc_routine
2001 == 1);
2002
2003 bool is_oacc_kernels_parallelized
2004 = (lookup_attribute (attr_name: "oacc kernels parallelized",
2005 DECL_ATTRIBUTES (current_function_decl)) != NULL);
2006 if (is_oacc_kernels_parallelized)
2007 gcc_checking_assert (is_oacc_kernels);
2008
2009 if (dump_file)
2010 {
2011 if (is_oacc_parallel)
2012 fprintf (stream: dump_file, format: "Function is OpenACC parallel offload\n");
2013 else if (is_oacc_kernels)
2014 fprintf (stream: dump_file, format: "Function is %s OpenACC kernels offload\n",
2015 (is_oacc_kernels_parallelized
2016 ? "parallelized" : "unparallelized"));
2017 else if (is_oacc_serial)
2018 fprintf (stream: dump_file, format: "Function is OpenACC serial offload\n");
2019 else if (is_oacc_parallel_kernels_parallelized)
2020 fprintf (stream: dump_file, format: "Function is %s OpenACC kernels offload\n",
2021 "parallel_kernels_parallelized");
2022 else if (is_oacc_parallel_kernels_gang_single)
2023 fprintf (stream: dump_file, format: "Function is %s OpenACC kernels offload\n",
2024 "parallel_kernels_gang_single");
2025 else if (is_oacc_routine)
2026 fprintf (stream: dump_file, format: "Function is OpenACC routine level %d\n",
2027 fn_level);
2028 else
2029 gcc_unreachable ();
2030 }
2031
2032 /* This doesn't belong into 'pass_oacc_loop_designation' conceptually, but
2033 it's a convenient place, so... */
2034 if (is_oacc_routine)
2035 {
2036 tree attr = lookup_attribute (attr_name: "omp declare target",
2037 DECL_ATTRIBUTES (current_function_decl));
2038 gcc_checking_assert (attr);
2039 tree clauses = TREE_VALUE (attr);
2040 gcc_checking_assert (clauses);
2041
2042 /* Should this OpenACC routine be discarded? */
2043 bool discard = false;
2044
2045 tree clause_nohost = omp_find_clause (clauses, kind: OMP_CLAUSE_NOHOST);
2046 if (dump_file)
2047 fprintf (stream: dump_file,
2048 format: "OpenACC routine '%s' %s '%s' clause.\n",
2049 lang_hooks.decl_printable_name (current_function_decl, 2),
2050 clause_nohost ? "has" : "doesn't have",
2051 omp_clause_code_name[OMP_CLAUSE_NOHOST]);
2052 /* Host compiler, 'nohost' clause? */
2053#ifndef ACCEL_COMPILER
2054 if (clause_nohost)
2055 discard = true;
2056#endif
2057
2058 if (dump_file)
2059 fprintf (stream: dump_file,
2060 format: "OpenACC routine '%s' %sdiscarded.\n",
2061 lang_hooks.decl_printable_name (current_function_decl, 2),
2062 discard ? "" : "not ");
2063 if (discard)
2064 {
2065 TREE_ASM_WRITTEN (current_function_decl) = 1;
2066 return TODO_discard_function;
2067 }
2068 }
2069
2070 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
2071 kernels, so remove the parallelism dimensions function attributes
2072 potentially set earlier on. */
2073 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
2074 {
2075 oacc_set_fn_attrib (fn: current_function_decl, NULL, NULL);
2076 attrs = oacc_get_fn_attrib (fn: current_function_decl);
2077 }
2078
2079 /* Discover, partition and process the loops. */
2080 oacc_loop *loops = oacc_loop_discovery ();
2081
2082 unsigned outer_mask = 0;
2083 if (is_oacc_routine)
2084 outer_mask = GOMP_DIM_MASK (fn_level) - 1;
2085 unsigned used_mask = oacc_loop_partition (loop: loops, outer_mask);
2086 /* OpenACC kernels constructs are special: they currently don't use the
2087 generic oacc_loop infrastructure and attribute/dimension processing. */
2088 if (is_oacc_kernels && is_oacc_kernels_parallelized)
2089 {
2090 /* Parallelized OpenACC kernels constructs use gang parallelism. See
2091 also tree-parloops.cc:create_parallel_loop. */
2092 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
2093 }
2094
2095 int dims[GOMP_DIM_MAX];
2096 oacc_validate_dims (fn: current_function_decl, attrs, dims, level: fn_level, used: used_mask);
2097
2098 if (dump_file)
2099 {
2100 const char *comma = "Compute dimensions [";
2101 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
2102 fprintf (stream: dump_file, format: "%s%d", comma, dims[ix]);
2103 fprintf (stream: dump_file, format: "]\n");
2104 }
2105
2106 /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
2107 a single gang only. */
2108 if (is_oacc_parallel_kernels_gang_single)
2109 gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);
2110
2111 oacc_loop_process (loop: loops, fn_level);
2112 if (dump_file)
2113 {
2114 fprintf (stream: dump_file, format: "OpenACC loops\n");
2115 dump_oacc_loop (file: dump_file, loop: loops, depth: 0);
2116 fprintf (stream: dump_file, format: "\n");
2117 }
2118 if (dump_enabled_p ())
2119 {
2120 oacc_loop *l = loops;
2121 /* OpenACC kernels constructs are special: they currently don't use the
2122 generic oacc_loop infrastructure. */
2123 if (is_oacc_kernels)
2124 {
2125 /* Create a fake oacc_loop for diagnostic purposes. */
2126 l = new_oacc_loop_raw (NULL,
2127 DECL_SOURCE_LOCATION (current_function_decl));
2128 l->mask = used_mask;
2129 }
2130 else
2131 {
2132 /* Skip the outermost, dummy OpenACC loop */
2133 l = l->child;
2134 }
2135 if (l)
2136 inform_oacc_loop (loop: l);
2137 if (is_oacc_kernels)
2138 free_oacc_loop (loop: l);
2139 }
2140
2141 free_oacc_loop (loop: loops);
2142
2143 return 0;
2144}
2145
2146static unsigned int
2147execute_oacc_device_lower ()
2148{
2149 tree attrs = oacc_get_fn_attrib (fn: current_function_decl);
2150
2151 if (!attrs)
2152 /* Not an offloaded function. */
2153 return 0;
2154
2155 int dims[GOMP_DIM_MAX];
2156 for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
2157 dims[i] = oacc_get_fn_dim_size (fn: current_function_decl, axis: i);
2158
2159 hash_map<tree, tree> adjusted_vars;
2160
2161 /* Now lower internal loop functions to target-specific code
2162 sequences. */
2163 basic_block bb;
2164 FOR_ALL_BB_FN (bb, cfun)
2165 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi);)
2166 {
2167 gimple *stmt = gsi_stmt (i: gsi);
2168 if (!is_gimple_call (gs: stmt))
2169 {
2170 gsi_next (i: &gsi);
2171 continue;
2172 }
2173
2174 gcall *call = as_a <gcall *> (p: stmt);
2175 if (!gimple_call_internal_p (gs: call))
2176 {
2177 gsi_next (i: &gsi);
2178 continue;
2179 }
2180
2181 /* Rewind to allow rescan. */
2182 gsi_prev (i: &gsi);
2183 bool rescan = false, remove = false;
2184 enum internal_fn ifn_code = gimple_call_internal_fn (gs: call);
2185
2186 switch (ifn_code)
2187 {
2188 default: break;
2189
2190 case IFN_GOACC_TILE:
2191 oacc_xform_tile (call);
2192 rescan = true;
2193 break;
2194
2195 case IFN_GOACC_LOOP:
2196 oacc_xform_loop (call);
2197 rescan = true;
2198 break;
2199
2200 case IFN_GOACC_REDUCTION:
2201 /* Mark the function for SSA renaming. */
2202 mark_virtual_operands_for_renaming (cfun);
2203
2204 /* If the level is -1, this ended up being an unused
2205 axis. Handle as a default. */
2206 if (integer_minus_onep (gimple_call_arg (gs: call, index: 3)))
2207 default_goacc_reduction (call);
2208 else
2209 targetm.goacc.reduction (call);
2210 rescan = true;
2211 break;
2212
2213 case IFN_UNIQUE:
2214 {
2215 enum ifn_unique_kind kind
2216 = ((enum ifn_unique_kind)
2217 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
2218
2219 switch (kind)
2220 {
2221 default:
2222 break;
2223
2224 case IFN_UNIQUE_OACC_FORK:
2225 case IFN_UNIQUE_OACC_JOIN:
2226 if (integer_minus_onep (gimple_call_arg (gs: call, index: 2)))
2227 remove = true;
2228 else if (!targetm.goacc.fork_join
2229 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
2230 remove = true;
2231 break;
2232
2233 case IFN_UNIQUE_OACC_HEAD_MARK:
2234 case IFN_UNIQUE_OACC_TAIL_MARK:
2235 remove = true;
2236 break;
2237
2238 case IFN_UNIQUE_OACC_PRIVATE:
2239 {
2240 dump_flags_t l_dump_flags
2241 = get_openacc_privatization_dump_flags ();
2242
2243 location_t loc = gimple_location (g: stmt);
2244 if (LOCATION_LOCUS (loc) == UNKNOWN_LOCATION)
2245 loc = DECL_SOURCE_LOCATION (current_function_decl);
2246 const dump_user_location_t d_u_loc
2247 = dump_user_location_t::from_location_t (loc);
2248
2249 HOST_WIDE_INT level
2250 = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
2251 gcc_checking_assert (level == -1
2252 || (level >= 0
2253 && level < GOMP_DIM_MAX));
2254 for (unsigned i = 3;
2255 i < gimple_call_num_args (gs: call);
2256 i++)
2257 {
2258 static char const *const axes[] =
2259 /* Must be kept in sync with GOMP_DIM enumeration. */
2260 { "gang", "worker", "vector" };
2261
2262 tree arg = gimple_call_arg (gs: call, index: i);
2263 gcc_checking_assert (TREE_CODE (arg) == ADDR_EXPR);
2264 tree decl = TREE_OPERAND (arg, 0);
2265 if (dump_enabled_p ())
2266/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
2267#if __GNUC__ >= 10
2268# pragma GCC diagnostic push
2269# pragma GCC diagnostic ignored "-Wformat"
2270#endif
2271 dump_printf_loc (l_dump_flags, d_u_loc,
2272 "variable %<%T%> ought to be"
2273 " adjusted for OpenACC"
2274 " privatization level: %qs\n",
2275 decl,
2276 (level == -1
2277 ? "UNKNOWN" : axes[level]));
2278#if __GNUC__ >= 10
2279# pragma GCC diagnostic pop
2280#endif
2281 bool adjusted;
2282 if (level == -1)
2283 adjusted = false;
2284 else if (!targetm.goacc.adjust_private_decl)
2285 adjusted = false;
2286 else if (level == GOMP_DIM_VECTOR)
2287 {
2288 /* That's the default behavior. */
2289 adjusted = true;
2290 }
2291 else
2292 {
2293 tree oldtype = TREE_TYPE (decl);
2294 tree newdecl
2295 = targetm.goacc.adjust_private_decl (loc, decl,
2296 level);
2297 adjusted = (TREE_TYPE (newdecl) != oldtype
2298 || newdecl != decl);
2299 if (adjusted)
2300 adjusted_vars.put (k: decl, v: newdecl);
2301 }
2302 if (adjusted
2303 && dump_enabled_p ())
2304/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
2305#if __GNUC__ >= 10
2306# pragma GCC diagnostic push
2307# pragma GCC diagnostic ignored "-Wformat"
2308#endif
2309 dump_printf_loc (l_dump_flags, d_u_loc,
2310 "variable %<%T%> adjusted for"
2311 " OpenACC privatization level:"
2312 " %qs\n",
2313 decl, axes[level]);
2314#if __GNUC__ >= 10
2315# pragma GCC diagnostic pop
2316#endif
2317 }
2318 remove = true;
2319 }
2320 break;
2321 }
2322 break;
2323 }
2324 }
2325
2326 if (gsi_end_p (i: gsi))
2327 /* We rewound past the beginning of the BB. */
2328 gsi = gsi_start_bb (bb);
2329 else
2330 /* Undo the rewind. */
2331 gsi_next (i: &gsi);
2332
2333 if (remove)
2334 {
2335 if (gimple_vdef (g: call))
2336 replace_uses_by (gimple_vdef (g: call), gimple_vuse (g: call));
2337 if (gimple_call_lhs (gs: call))
2338 {
2339 /* Propagate the data dependency var. */
2340 gimple *ass = gimple_build_assign (gimple_call_lhs (gs: call),
2341 gimple_call_arg (gs: call, index: 1));
2342 gsi_replace (&gsi, ass, false);
2343 }
2344 else
2345 gsi_remove (&gsi, true);
2346 }
2347 else if (!rescan)
2348 /* If not rescanning, advance over the call. */
2349 gsi_next (i: &gsi);
2350 }
2351
2352 /* Regarding the OpenACC privatization level, we're currently only looking at
2353 making the gang-private level work. Regarding that, we have the following
2354 configurations:
2355
2356 - GCN offloading: 'targetm.goacc.adjust_private_decl' does the work (in
2357 particular, change 'TREE_TYPE', etc.) and there is no
2358 'targetm.goacc.expand_var_decl'.
2359
2360 - nvptx offloading: 'targetm.goacc.adjust_private_decl' only sets a
2361 marker and then 'targetm.goacc.expand_var_decl' does the work.
2362
2363 Eventually (in particular, for worker-private level?), both
2364 'targetm.goacc.adjust_private_decl' and 'targetm.goacc.expand_var_decl'
2365 may need to do things, but that's currently not meant to be addressed, and
2366 thus not fully worked out and implemented, and thus untested. Hence,
2367 'assert' what currently is implemented/tested, only. */
2368
2369 if (targetm.goacc.expand_var_decl)
2370 gcc_assert (adjusted_vars.is_empty ());
2371
2372 /* Make adjustments to gang-private local variables if required by the
2373 target, e.g. forcing them into a particular address space. Afterwards,
2374 ADDR_EXPR nodes which have adjusted variables as their argument need to
2375 be modified in one of two ways:
2376
2377 1. They can be recreated, making a pointer to the variable in the new
2378 address space, or
2379
2380 2. The address of the variable in the new address space can be taken,
2381 converted to the default (original) address space, and the result of
2382 that conversion subsituted in place of the original ADDR_EXPR node.
2383
2384 Which of these is done depends on the gimple statement being processed.
2385 At present atomic operations and inline asms use (1), and everything else
2386 uses (2). At least on AMD GCN, there are atomic operations that work
2387 directly in the LDS address space.
2388
2389 COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use
2390 the new decl, adjusting types of appropriate tree nodes as necessary. */
2391
2392 if (targetm.goacc.adjust_private_decl
2393 && !adjusted_vars.is_empty ())
2394 {
2395 FOR_ALL_BB_FN (bb, cfun)
2396 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
2397 !gsi_end_p (i: gsi);
2398 gsi_next (i: &gsi))
2399 {
2400 gimple *stmt = gsi_stmt (i: gsi);
2401 walk_stmt_info wi;
2402 var_decl_rewrite_info info;
2403
2404 info.avoid_pointer_conversion
2405 = (is_gimple_call (gs: stmt)
2406 && is_sync_builtin_call (call: as_a <gcall *> (p: stmt)))
2407 || gimple_code (g: stmt) == GIMPLE_ASM;
2408 info.stmt = stmt;
2409 info.modified = false;
2410 info.adjusted_vars = &adjusted_vars;
2411
2412 memset (s: &wi, c: 0, n: sizeof (wi));
2413 wi.info = &info;
2414
2415 walk_gimple_op (stmt, oacc_rewrite_var_decl, &wi);
2416
2417 if (info.modified)
2418 update_stmt (s: stmt);
2419 }
2420 }
2421
2422 return 0;
2423}
2424
2425/* Default launch dimension validator. Force everything to 1. A
2426 backend that wants to provide larger dimensions must override this
2427 hook. */
2428
2429bool
2430default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
2431 int ARG_UNUSED (fn_level),
2432 unsigned ARG_UNUSED (used))
2433{
2434 bool changed = false;
2435
2436 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
2437 {
2438 if (dims[ix] != 1)
2439 {
2440 dims[ix] = 1;
2441 changed = true;
2442 }
2443 }
2444
2445 return changed;
2446}
2447
2448/* Default dimension bound is unknown on accelerator and 1 on host. */
2449
2450int
2451default_goacc_dim_limit (int ARG_UNUSED (axis))
2452{
2453#ifdef ACCEL_COMPILER
2454 return 0;
2455#else
2456 return 1;
2457#endif
2458}
2459
2460namespace {
2461
2462const pass_data pass_data_oacc_loop_designation =
2463{
2464 .type: GIMPLE_PASS, /* type */
2465 .name: "oaccloops", /* name */
2466 .optinfo_flags: OPTGROUP_OMP, /* optinfo_flags */
2467 .tv_id: TV_NONE, /* tv_id */
2468 PROP_cfg, /* properties_required */
2469 .properties_provided: 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
2470 .properties_destroyed: 0, /* properties_destroyed */
2471 .todo_flags_start: 0, /* todo_flags_start */
2472 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
2473};
2474
2475class pass_oacc_loop_designation : public gimple_opt_pass
2476{
2477public:
2478 pass_oacc_loop_designation (gcc::context *ctxt)
2479 : gimple_opt_pass (pass_data_oacc_loop_designation, ctxt)
2480 {}
2481
2482 /* opt_pass methods: */
2483 bool gate (function *) final override { return flag_openacc; };
2484
2485 unsigned int execute (function *) final override
2486 {
2487 return execute_oacc_loop_designation ();
2488 }
2489
2490}; // class pass_oacc_loop_designation
2491
2492const pass_data pass_data_oacc_device_lower =
2493{
2494 .type: GIMPLE_PASS, /* type */
2495 .name: "oaccdevlow", /* name */
2496 .optinfo_flags: OPTGROUP_OMP, /* optinfo_flags */
2497 .tv_id: TV_NONE, /* tv_id */
2498 PROP_cfg, /* properties_required */
2499 .properties_provided: 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
2500 .properties_destroyed: 0, /* properties_destroyed */
2501 .todo_flags_start: 0, /* todo_flags_start */
2502 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
2503};
2504
2505class pass_oacc_device_lower : public gimple_opt_pass
2506{
2507public:
2508 pass_oacc_device_lower (gcc::context *ctxt)
2509 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
2510 {}
2511
2512 /* opt_pass methods: */
2513 bool gate (function *) final override { return flag_openacc; };
2514
2515 unsigned int execute (function *) final override
2516 {
2517 return execute_oacc_device_lower ();
2518 }
2519
2520}; // class pass_oacc_device_lower
2521
2522} // anon namespace
2523
2524gimple_opt_pass *
2525make_pass_oacc_loop_designation (gcc::context *ctxt)
2526{
2527 return new pass_oacc_loop_designation (ctxt);
2528}
2529
2530gimple_opt_pass *
2531make_pass_oacc_device_lower (gcc::context *ctxt)
2532{
2533 return new pass_oacc_device_lower (ctxt);
2534}
2535
2536
2537/* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
2538 GOMP_SIMT_ENTER call identifying the privatized variables, which are
2539 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
2540 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
2541
2542static void
2543ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
2544{
2545 gimple *alloc_stmt = gsi_stmt (i: *gsi);
2546 tree simtrec = gimple_call_lhs (gs: alloc_stmt);
2547 tree simduid = gimple_call_arg (gs: alloc_stmt, index: 0);
2548 gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
2549 gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
2550 tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
2551 TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
2552 TREE_ADDRESSABLE (rectype) = 1;
2553 TREE_TYPE (simtrec) = build_pointer_type (rectype);
2554 for (unsigned i = 1; i < gimple_call_num_args (gs: enter_stmt); i++)
2555 {
2556 tree *argp = gimple_call_arg_ptr (gs: enter_stmt, index: i);
2557 if (*argp == null_pointer_node)
2558 continue;
2559 gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
2560 && VAR_P (TREE_OPERAND (*argp, 0)));
2561 tree var = TREE_OPERAND (*argp, 0);
2562
2563 tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
2564 DECL_NAME (var), TREE_TYPE (var));
2565 SET_DECL_ALIGN (field, DECL_ALIGN (var));
2566 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
2567 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
2568
2569 insert_field_into_struct (rectype, field);
2570
2571 tree t = build_simple_mem_ref (simtrec);
2572 t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
2573 TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
2574 SET_DECL_VALUE_EXPR (var, t);
2575 DECL_HAS_VALUE_EXPR_P (var) = 1;
2576 *regimplify = true;
2577 }
2578 layout_type (rectype);
2579 tree size = TYPE_SIZE_UNIT (rectype);
2580 tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
2581
2582 alloc_stmt
2583 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
2584 gimple_call_set_lhs (gs: alloc_stmt, lhs: simtrec);
2585 gsi_replace (gsi, alloc_stmt, false);
2586 gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
2587 enter_stmt = gimple_build_assign (simduid, gimple_call_arg (gs: enter_stmt, index: 0));
2588 gsi_replace (&enter_gsi, enter_stmt, false);
2589
2590 use_operand_p use;
2591 gimple *exit_stmt;
2592 if (single_imm_use (var: simtrec, use_p: &use, stmt: &exit_stmt))
2593 {
2594 gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
2595 gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
2596 tree clobber = build_clobber (rectype);
2597 exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
2598 gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
2599 }
2600 else
2601 gcc_checking_assert (has_zero_uses (simtrec));
2602}
2603
2604/* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
2605
2606static tree
2607find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2608{
2609 tree t = *tp;
2610
2611 if (VAR_P (t)
2612 && DECL_HAS_VALUE_EXPR_P (t)
2613 && lookup_attribute (attr_name: "omp simt private", DECL_ATTRIBUTES (t)))
2614 {
2615 *walk_subtrees = 0;
2616 return t;
2617 }
2618 return NULL_TREE;
2619}
2620
2621/* Helper function for execute_omp_device_lower, invoked via walk_gimple_op.
2622 Resolve any OMP_TARGET_DEVICE_MATCHES and OMP_NEXT_VARIANT exprs to
2623 constants. */
2624static tree
2625resolve_omp_variant_cookies (tree *tp, int *walk_subtrees,
2626 void *data ATTRIBUTE_UNUSED)
2627{
2628 if (TREE_CODE (*tp) == OMP_TARGET_DEVICE_MATCHES)
2629 {
2630 *tp = resolve_omp_target_device_matches (node: *tp);
2631 *walk_subtrees = 0;
2632 return NULL_TREE;
2633 }
2634
2635 if (TREE_CODE (*tp) != OMP_NEXT_VARIANT)
2636 return NULL_TREE;
2637 tree index = OMP_NEXT_VARIANT_INDEX (*tp);
2638 tree state = OMP_NEXT_VARIANT_STATE (*tp);
2639
2640 /* State is a triplet of (result-vector, construct_context, selector_vec).
2641 If result-vector has already been computed, just use it. Otherwise we
2642 must resolve the variant and fill in that part of the state object.
2643 All OMP_NEXT_VARIANT exprs for the same variant construct are supposed
2644 to share the same state object, but if something bad happens and we end
2645 up with copies, that is OK, it will just cause the result-vector to be
2646 computed multiple times. */
2647 tree result_vector = TREE_PURPOSE (state);
2648 if (!result_vector)
2649 {
2650 tree construct_context = TREE_VALUE (state);
2651 tree selectors = TREE_CHAIN (state);
2652
2653 vec<struct omp_variant> candidates
2654 = omp_resolve_variant_construct (construct_context, selectors);
2655 int n = TREE_VEC_LENGTH (selectors);
2656 TREE_PURPOSE (state) = result_vector = make_tree_vec (n + 1);
2657 /* The result vector maps the index of each element of the original
2658 selectors vector onto the index of the next element of the filtered/
2659 sorted candidates vector. Since some of the original variants may
2660 have been discarded as non-matching in candidates, initialize the
2661 whole array to zero so that we have a placeholder "next" value for
2662 those elements. Hopefully dead code elimination will take care of
2663 subsequently discarding the unreachable cases in the already-generated
2664 switch statement. */
2665 for (int i = 1; i <= n; i++)
2666 TREE_VEC_ELT (result_vector, i) = integer_zero_node;
2667 /* Element 0 is the case label of the first variant in the sorted
2668 list. */
2669 if (dump_file)
2670 fprintf (stream: dump_file, format: "Computing case map for variant directive\n");
2671 int j = 0;
2672 for (unsigned int i = 0; i < candidates.length(); i++)
2673 {
2674 if (dump_file)
2675 fprintf (stream: dump_file, format: " %d -> case %d\n",
2676 j, (int) tree_to_shwi (candidates[i].alternative));
2677 TREE_VEC_ELT (result_vector, j) = candidates[i].alternative;
2678 j = (int) tree_to_shwi (candidates[i].alternative);
2679 }
2680 }
2681
2682 /* Now just grab the value out of the precomputed array. */
2683 gcc_assert (TREE_CODE (index) == INTEGER_CST);
2684 int indexval = (int) tree_to_shwi (index);
2685 *tp = TREE_VEC_ELT (result_vector, indexval);
2686 *walk_subtrees = 0;
2687 return NULL_TREE;
2688}
2689
2690
2691/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
2692 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
2693 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
2694 internal functions on non-SIMT targets, and likewise some SIMD internal
2695 functions on SIMT targets. */
2696
2697static unsigned int
2698execute_omp_device_lower ()
2699{
2700 int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
2701 bool regimplify = false;
2702 basic_block bb;
2703 gimple_stmt_iterator gsi;
2704#ifdef ACCEL_COMPILER
2705 bool omp_redirect_indirect_calls = vec_safe_length (offload_ind_funcs) > 0;
2706 tree map_ptr_fn
2707 = builtin_decl_explicit (BUILT_IN_GOMP_TARGET_MAP_INDIRECT_PTR);
2708#endif
2709
2710 /* Handle expansion of magic cookies for variant constructs first. */
2711 if (cgraph_node::get (cfun->decl)->has_omp_variant_constructs)
2712 FOR_EACH_BB_FN (bb, cfun)
2713 {
2714 for (gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
2715 walk_gimple_op (gsi_stmt (i: gsi), resolve_omp_variant_cookies, NULL);
2716 for (gsi = gsi_start_phis (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
2717 walk_gimple_op (gsi_stmt (i: gsi), resolve_omp_variant_cookies, NULL);
2718 }
2719
2720 FOR_EACH_BB_FN (bb, cfun)
2721 for (gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
2722 {
2723 gimple *stmt = gsi_stmt (i: gsi);
2724 if (!is_gimple_call (gs: stmt))
2725 continue;
2726 if (!gimple_call_internal_p (gs: stmt))
2727 {
2728#ifdef ACCEL_COMPILER
2729 if (omp_redirect_indirect_calls
2730 && gimple_call_fndecl (stmt) == NULL_TREE)
2731 {
2732 gcall *orig_call = dyn_cast <gcall *> (stmt);
2733 tree call_fn = gimple_call_fn (stmt);
2734 tree fn_ty = TREE_TYPE (call_fn);
2735
2736 if (TREE_CODE (call_fn) == OBJ_TYPE_REF)
2737 {
2738 tree obj_ref = create_tmp_reg (TREE_TYPE (call_fn),
2739 ".ind_fn_objref");
2740 gimple *gassign = gimple_build_assign (obj_ref, call_fn);
2741 gsi_insert_before (&gsi, gassign, GSI_SAME_STMT);
2742 call_fn = obj_ref;
2743 }
2744 tree mapped_fn = create_tmp_reg (fn_ty, ".ind_fn");
2745 gimple *gcall =
2746 gimple_build_call (map_ptr_fn, 1, call_fn);
2747 gimple_set_location (gcall, gimple_location (stmt));
2748 gimple_call_set_lhs (gcall, mapped_fn);
2749 gsi_insert_before (&gsi, gcall, GSI_SAME_STMT);
2750
2751 gimple_call_set_fn (orig_call, mapped_fn);
2752 update_stmt (orig_call);
2753 }
2754#endif
2755 continue;
2756 }
2757 tree lhs = gimple_call_lhs (gs: stmt), rhs = NULL_TREE;
2758 tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
2759 switch (gimple_call_internal_fn (gs: stmt))
2760 {
2761 case IFN_GOMP_TARGET_REV:
2762 {
2763#ifndef ACCEL_COMPILER
2764 gimple_stmt_iterator gsi2 = gsi;
2765 gsi_next (i: &gsi2);
2766 gcc_assert (!gsi_end_p (gsi2));
2767 gcc_assert (gimple_call_builtin_p (gsi_stmt (gsi2),
2768 BUILT_IN_GOMP_TARGET));
2769 tree old_decl
2770 = TREE_OPERAND (gimple_call_arg (gsi_stmt (gsi2), 1), 0);
2771 tree new_decl = gimple_call_arg (gs: gsi_stmt (i: gsi), index: 0);
2772 gimple_call_set_arg (gs: gsi_stmt (i: gsi2), index: 1, arg: new_decl);
2773 update_stmt (s: gsi_stmt (i: gsi2));
2774 new_decl = TREE_OPERAND (new_decl, 0);
2775 unsigned i;
2776 unsigned num_funcs = vec_safe_length (v: offload_funcs);
2777 for (i = 0; i < num_funcs; i++)
2778 {
2779 if ((*offload_funcs)[i] == old_decl)
2780 {
2781 (*offload_funcs)[i] = new_decl;
2782 break;
2783 }
2784 else if ((*offload_funcs)[i] == new_decl)
2785 break; /* This can happen due to inlining. */
2786 }
2787 gcc_assert (i < num_funcs);
2788#else
2789 tree old_decl = TREE_OPERAND (gimple_call_arg (gsi_stmt (gsi), 0),
2790 0);
2791#endif
2792 /* FIXME: Find a way to actually prevent outputting the empty-body
2793 old_decl as debug symbol + function in the assembly file. */
2794 cgraph_node *node = cgraph_node::get (decl: old_decl);
2795 node->address_taken = false;
2796 node->need_lto_streaming = false;
2797 node->offloadable = false;
2798
2799 unlink_stmt_vdef (stmt);
2800 }
2801 break;
2802 case IFN_GOMP_USE_SIMT:
2803 rhs = vf == 1 ? integer_zero_node : integer_one_node;
2804 break;
2805 case IFN_GOMP_SIMT_ENTER:
2806 rhs = vf == 1 ? gimple_call_arg (gs: stmt, index: 0) : NULL_TREE;
2807 goto simtreg_enter_exit;
2808 case IFN_GOMP_SIMT_ENTER_ALLOC:
2809 if (vf != 1)
2810 ompdevlow_adjust_simt_enter (gsi: &gsi, regimplify: &regimplify);
2811 rhs = vf == 1 ? null_pointer_node : NULL_TREE;
2812 goto simtreg_enter_exit;
2813 case IFN_GOMP_SIMT_EXIT:
2814 simtreg_enter_exit:
2815 if (vf != 1)
2816 continue;
2817 unlink_stmt_vdef (stmt);
2818 break;
2819 case IFN_GOMP_SIMT_LANE:
2820 case IFN_GOMP_SIMT_LAST_LANE:
2821 rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
2822 break;
2823 case IFN_GOMP_SIMT_VF:
2824 rhs = build_int_cst (type, vf);
2825 break;
2826 case IFN_GOMP_MAX_VF:
2827 rhs = build_int_cst (type, omp_max_vf (false));
2828 break;
2829 case IFN_GOMP_SIMT_ORDERED_PRED:
2830 rhs = vf == 1 ? integer_zero_node : NULL_TREE;
2831 if (rhs || !lhs)
2832 unlink_stmt_vdef (stmt);
2833 break;
2834 case IFN_GOMP_SIMT_VOTE_ANY:
2835 case IFN_GOMP_SIMT_XCHG_BFLY:
2836 case IFN_GOMP_SIMT_XCHG_IDX:
2837 rhs = vf == 1 ? gimple_call_arg (gs: stmt, index: 0) : NULL_TREE;
2838 break;
2839 case IFN_GOMP_SIMD_LANE:
2840 case IFN_GOMP_SIMD_LAST_LANE:
2841 rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
2842 break;
2843 case IFN_GOMP_SIMD_VF:
2844 rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
2845 break;
2846 default:
2847 continue;
2848 }
2849 if (lhs && !rhs)
2850 continue;
2851 stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
2852 gsi_replace (&gsi, stmt, false);
2853 }
2854 if (regimplify)
2855 FOR_EACH_BB_REVERSE_FN (bb, cfun)
2856 for (gsi = gsi_last_bb (bb); !gsi_end_p (i: gsi); gsi_prev (i: &gsi))
2857 if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
2858 {
2859 if (gimple_clobber_p (s: gsi_stmt (i: gsi)))
2860 gsi_remove (&gsi, true);
2861 else
2862 gimple_regimplify_operands (gsi_stmt (i: gsi), &gsi);
2863 }
2864 if (vf != 1)
2865 cfun->has_force_vectorize_loops = false;
2866 return 0;
2867}
2868
2869namespace {
2870
2871const pass_data pass_data_omp_device_lower =
2872{
2873 .type: GIMPLE_PASS, /* type */
2874 .name: "ompdevlow", /* name */
2875 .optinfo_flags: OPTGROUP_OMP, /* optinfo_flags */
2876 .tv_id: TV_NONE, /* tv_id */
2877 PROP_cfg, /* properties_required */
2878 PROP_gimple_lomp_dev, /* properties_provided */
2879 .properties_destroyed: 0, /* properties_destroyed */
2880 .todo_flags_start: 0, /* todo_flags_start */
2881 TODO_update_ssa, /* todo_flags_finish */
2882};
2883
2884class pass_omp_device_lower : public gimple_opt_pass
2885{
2886public:
2887 pass_omp_device_lower (gcc::context *ctxt)
2888 : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
2889 {}
2890
2891 /* opt_pass methods: */
2892 bool gate (function *fun) final override
2893 {
2894 cgraph_node *node = cgraph_node::get (decl: fun->decl);
2895#ifdef ACCEL_COMPILER
2896 bool offload_ind_funcs_p = vec_safe_length (offload_ind_funcs) > 0;
2897#else
2898 bool offload_ind_funcs_p = false;
2899#endif
2900 return (!(fun->curr_properties & PROP_gimple_lomp_dev)
2901 || (flag_openmp
2902 && (node->has_omp_variant_constructs || offload_ind_funcs_p)));
2903 }
2904 unsigned int execute (function *) final override
2905 {
2906 return execute_omp_device_lower ();
2907 }
2908
2909}; // class pass_expand_omp_ssa
2910
2911} // anon namespace
2912
2913gimple_opt_pass *
2914make_pass_omp_device_lower (gcc::context *ctxt)
2915{
2916 return new pass_omp_device_lower (ctxt);
2917}
2918
2919/* "omp declare target link" handling pass. */
2920
2921namespace {
2922
2923const pass_data pass_data_omp_target_link =
2924{
2925 .type: GIMPLE_PASS, /* type */
2926 .name: "omptargetlink", /* name */
2927 .optinfo_flags: OPTGROUP_OMP, /* optinfo_flags */
2928 .tv_id: TV_NONE, /* tv_id */
2929 PROP_ssa, /* properties_required */
2930 .properties_provided: 0, /* properties_provided */
2931 .properties_destroyed: 0, /* properties_destroyed */
2932 .todo_flags_start: 0, /* todo_flags_start */
2933 TODO_update_ssa, /* todo_flags_finish */
2934};
2935
2936class pass_omp_target_link : public gimple_opt_pass
2937{
2938public:
2939 pass_omp_target_link (gcc::context *ctxt)
2940 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
2941 {}
2942
2943 /* opt_pass methods: */
2944 bool gate (function *fun) final override
2945 {
2946#ifdef ACCEL_COMPILER
2947 return offloading_function_p (fun->decl);
2948#else
2949 (void) fun;
2950 return false;
2951#endif
2952 }
2953
2954 unsigned execute (function *) final override;
2955};
2956
2957/* Callback for walk_gimple_stmt used to scan for link var operands. */
2958
2959static tree
2960process_link_var_op (tree *tp, int *walk_subtrees, void *data)
2961{
2962 struct walk_stmt_info *wi = (struct walk_stmt_info *) data;
2963 tree t = *tp;
2964
2965 if (VAR_P (t)
2966 && DECL_HAS_VALUE_EXPR_P (t)
2967 && is_global_var (t)
2968 && lookup_attribute (attr_name: "omp declare target link", DECL_ATTRIBUTES (t)))
2969 {
2970 wi->info = *tp = unshare_expr (DECL_VALUE_EXPR (t));
2971 *walk_subtrees = 0;
2972 return NULL_TREE;
2973 }
2974
2975 return NULL_TREE;
2976}
2977
2978unsigned
2979pass_omp_target_link::execute (function *fun)
2980{
2981 basic_block bb;
2982 FOR_EACH_BB_FN (bb, fun)
2983 {
2984 gimple_stmt_iterator gsi;
2985 for (gsi = gsi_start_bb (bb); !gsi_end_p (i: gsi); gsi_next (i: &gsi))
2986 {
2987 if (gimple_call_builtin_p (gsi_stmt (i: gsi), BUILT_IN_GOMP_TARGET))
2988 {
2989 tree dev = gimple_call_arg (gs: gsi_stmt (i: gsi), index: 0);
2990 tree fn = gimple_call_arg (gs: gsi_stmt (i: gsi), index: 1);
2991 if (POINTER_TYPE_P (TREE_TYPE (fn)))
2992 fn = TREE_OPERAND (fn, 0);
2993 if (TREE_CODE (dev) == INTEGER_CST
2994 && wi::to_wide (t: dev) == GOMP_DEVICE_HOST_FALLBACK
2995 && lookup_attribute (attr_name: "omp target device_ancestor_nohost",
2996 DECL_ATTRIBUTES (fn)) != NULL_TREE)
2997 continue; /* ancestor:1 */
2998 /* Nullify the second argument of __builtin_GOMP_target_ext. */
2999 gimple_call_set_arg (gs: gsi_stmt (i: gsi), index: 1, null_pointer_node);
3000 update_stmt (s: gsi_stmt (i: gsi));
3001 }
3002 struct walk_stmt_info wi;
3003 memset (s: &wi, c: 0, n: sizeof (wi));
3004 walk_gimple_stmt (&gsi, NULL, process_link_var_op, &wi);
3005 if (wi.info)
3006 gimple_regimplify_operands (gsi_stmt (i: gsi), &gsi);
3007 }
3008 }
3009
3010 return 0;
3011}
3012
3013} // anon namespace
3014
3015gimple_opt_pass *
3016make_pass_omp_target_link (gcc::context *ctxt)
3017{
3018 return new pass_omp_target_link (ctxt);
3019}
3020

source code of gcc/omp-offload.cc