1 | /* OpenACC worker partitioning via middle end neutering/broadcasting scheme |
2 | |
3 | Copyright (C) 2015-2023 Free Software Foundation, Inc. |
4 | |
5 | This file is part of GCC. |
6 | |
7 | GCC is free software; you can redistribute it and/or modify it |
8 | under the terms of the GNU General Public License as published |
9 | by the Free Software Foundation; either version 3, or (at your |
10 | option) any later version. |
11 | |
12 | GCC is distributed in the hope that it will be useful, but WITHOUT |
13 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
14 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
15 | License for more details. |
16 | |
17 | You should have received a copy of the GNU General Public License |
18 | along with GCC; see the file COPYING3. If not see |
19 | <http://www.gnu.org/licenses/>. */ |
20 | |
21 | #include "config.h" |
22 | #include "system.h" |
23 | #include "coretypes.h" |
24 | #include "backend.h" |
25 | #include "rtl.h" |
26 | #include "tree.h" |
27 | #include "gimple.h" |
28 | #include "tree-pass.h" |
29 | #include "ssa.h" |
30 | #include "cgraph.h" |
31 | #include "pretty-print.h" |
32 | #include "fold-const.h" |
33 | #include "gimplify.h" |
34 | #include "gimple-iterator.h" |
35 | #include "gimple-walk.h" |
36 | #include "tree-inline.h" |
37 | #include "langhooks.h" |
38 | #include "omp-general.h" |
39 | #include "omp-low.h" |
40 | #include "gimple-pretty-print.h" |
41 | #include "cfghooks.h" |
42 | #include "insn-config.h" |
43 | #include "recog.h" |
44 | #include "internal-fn.h" |
45 | #include "bitmap.h" |
46 | #include "tree-nested.h" |
47 | #include "stor-layout.h" |
48 | #include "tree-ssa-threadupdate.h" |
49 | #include "tree-into-ssa.h" |
50 | #include "splay-tree.h" |
51 | #include "target.h" |
52 | #include "cfgloop.h" |
53 | #include "tree-cfg.h" |
54 | #include "omp-offload.h" |
55 | #include "attribs.h" |
56 | #include "targhooks.h" |
57 | #include "diagnostic-core.h" |
58 | |
59 | /* Loop structure of the function. The entire function is described as |
60 | a NULL loop. */ |
61 | /* Adapted from 'gcc/config/nvptx/nvptx.cc:struct parallel'. */ |
62 | |
struct parallel_g
{
  /* Parent parallel.  */
  parallel_g *parent;

  /* Next sibling parallel.  */
  parallel_g *next;

  /* First child parallel.  */
  parallel_g *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels.  */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after of
     the partition.  */
  basic_block forked_block;
  basic_block join_block;

  /* The statements marking the start and end of the partitioned
     region (see omp_sese_find_par, which fills these in).  */
  gimple *forked_stmt;
  gimple *join_stmt;

  /* The corresponding fork/joining markers just outside the region;
     may stay NULL for the synthetic "maximum partitioning" pars.  */
  gimple *fork_stmt;
  gimple *joining_stmt;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

  /* Record type and sender/receiver decls, presumably for broadcasting
     values between workers; all NULL_TREE until set up later (the
     constructor only clears them).  */
  tree record_type;
  tree sender_decl;
  tree receiver_decl;

public:
  parallel_g (parallel_g *parent, unsigned mode);
  ~parallel_g ();
};
105 | |
106 | /* Constructor links the new parallel into it's parent's chain of |
107 | children. */ |
108 | |
109 | parallel_g::parallel_g (parallel_g *parent_, unsigned mask_) |
110 | :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0) |
111 | { |
112 | forked_block = join_block = 0; |
113 | forked_stmt = join_stmt = NULL; |
114 | fork_stmt = joining_stmt = NULL; |
115 | |
116 | record_type = NULL_TREE; |
117 | sender_decl = NULL_TREE; |
118 | receiver_decl = NULL_TREE; |
119 | |
120 | if (parent) |
121 | { |
122 | next = parent->inner; |
123 | parent->inner = this; |
124 | } |
125 | } |
126 | |
127 | parallel_g::~parallel_g () |
128 | { |
129 | delete inner; |
130 | delete next; |
131 | } |
132 | |
133 | static bool |
134 | local_var_based_p (tree decl) |
135 | { |
136 | switch (TREE_CODE (decl)) |
137 | { |
138 | case VAR_DECL: |
139 | return !is_global_var (t: decl); |
140 | |
141 | case COMPONENT_REF: |
142 | case BIT_FIELD_REF: |
143 | case ARRAY_REF: |
144 | return local_var_based_p (TREE_OPERAND (decl, 0)); |
145 | |
146 | default: |
147 | return false; |
148 | } |
149 | } |
150 | |
151 | /* Map of basic blocks to gimple stmts. */ |
152 | typedef hash_map<basic_block, gimple *> bb_stmt_map_t; |
153 | |
154 | /* Calls to OpenACC routines are made by all workers/wavefronts/warps, since |
155 | the routine likely contains partitioned loops (else will do its own |
156 | neutering and variable propagation). Return TRUE if a function call CALL |
157 | should be made in (worker) single mode instead, rather than redundant |
158 | mode. */ |
159 | |
160 | static bool |
161 | omp_sese_active_worker_call (gcall *call) |
162 | { |
163 | #define GOMP_DIM_SEQ GOMP_DIM_MAX |
164 | tree fndecl = gimple_call_fndecl (gs: call); |
165 | |
166 | if (!fndecl) |
167 | return true; |
168 | |
169 | tree attrs = oacc_get_fn_attrib (fn: fndecl); |
170 | |
171 | if (!attrs) |
172 | return true; |
173 | |
174 | int level = oacc_fn_attrib_level (attr: attrs); |
175 | |
176 | /* Neither regular functions nor "seq" routines should be run by all threads |
177 | in worker-single mode. */ |
178 | return level == -1 || level == GOMP_DIM_SEQ; |
179 | #undef GOMP_DIM_SEQ |
180 | } |
181 | |
182 | /* Split basic blocks such that each forked and join unspecs are at |
183 | the start of their basic blocks. Thus afterwards each block will |
184 | have a single partitioning mode. We also do the same for return |
185 | insns, as they are executed by every thread. Return the |
186 | partitioning mode of the function as a whole. Populate MAP with |
187 | head and tail blocks. We also clear the BB visited flag, which is |
188 | used when finding partitions. */ |
189 | /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_split_blocks'. */ |
190 | |
static void
omp_sese_split_blocks (bb_stmt_map_t *map)
{
  auto_vec<gimple *> worklist;
  basic_block block;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      /* Clear visited flag, for use by parallel locator.  */
      block->flags &= ~BB_VISITED;

      for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	   !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	    {
	      /* Argument 0 of IFN_UNIQUE encodes the marker kind.  */
	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	      if (k == IFN_UNIQUE_OACC_JOIN)
		worklist.safe_push (stmt);
	      else if (k == IFN_UNIQUE_OACC_FORK)
		{
		  /* The fork marker is expected to end its block; the
		     forked region starts in the single successor.  */
		  gcc_assert (gsi_one_before_end_p (gsi));
		  basic_block forked_block = single_succ (block);
		  gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);

		  /* We push a NOP as a placeholder for the "forked" stmt.
		     This is then recognized in omp_sese_find_par.  */
		  gimple *nop = gimple_build_nop ();
		  gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);

		  worklist.safe_push (nop);
		}
	    }
	  else if (gimple_code (stmt) == GIMPLE_RETURN
		   || gimple_code (stmt) == GIMPLE_COND
		   || gimple_code (stmt) == GIMPLE_SWITCH
		   || (gimple_code (stmt) == GIMPLE_CALL
		       && !gimple_call_internal_p (stmt)
		       && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
	    /* Control-flow statements and single-mode calls get their own
	       blocks so they acquire a well-defined partitioning mode.  */
	    worklist.safe_push (stmt);
	  else if (is_gimple_assign (stmt))
	    {
	      tree lhs = gimple_assign_lhs (stmt);

	      /* Force assignments to components/fields/elements of local
		 aggregates into fully-partitioned (redundant) mode.  This
		 avoids having to broadcast the whole aggregate.  The RHS of
		 the assignment will be propagated using the normal
		 mechanism.  */

	      switch (TREE_CODE (lhs))
		{
		case COMPONENT_REF:
		case BIT_FIELD_REF:
		case ARRAY_REF:
		  {
		    tree aggr = TREE_OPERAND (lhs, 0);

		    if (local_var_based_p (aggr))
		      worklist.safe_push (stmt);
		  }
		  break;

		default:
		  ;
		}
	    }
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  gimple *stmt;

  for (ix = 0; worklist.iterate (ix, &stmt); ix++)
    {
      basic_block block = gimple_bb (stmt);

      if (gimple_code (stmt) == GIMPLE_COND)
	{
	  /* Compute the predicate in a separate assignment, replace the
	     condition by a test of that predicate, then split between the
	     assignment and the new condition: the condition starts a fresh
	     block whose head stmt is recorded in MAP.  */
	  gcond *orig_cond = as_a <gcond *> (stmt);
	  tree_code code = gimple_expr_code (orig_cond);
	  tree pred = make_ssa_name (boolean_type_node);
	  gimple *asgn = gimple_build_assign (pred, code,
					      gimple_cond_lhs (orig_cond),
					      gimple_cond_rhs (orig_cond));
	  gcond *new_cond
	    = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
				 gimple_cond_true_label (orig_cond),
				 gimple_cond_false_label (orig_cond));

	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
	  gsi_replace (&gsi, new_cond, true);

	  edge e = split_block (block, asgn);
	  block = e->dest;
	  map->get_or_insert (block) = new_cond;
	}
      else if ((gimple_code (stmt) == GIMPLE_CALL
		&& !gimple_call_internal_p (stmt))
	       || is_gimple_assign (stmt))
	{
	  /* Isolate the call/assignment into a block of its own: split
	     before and after the statement, recording the block that now
	     contains exactly this statement.  */
	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_prev (&gsi);

	  edge call = split_block (block, gsi_stmt (gsi));

	  gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));

	  edge call_to_ret = split_block (call->dest, call_stmt);

	  map->get_or_insert (call_to_ret->src) = call_stmt;
	}
      else
	{
	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_prev (&gsi);

	  if (gsi_end_p (gsi))
	    /* Stmt is already first in its block; just record it.  */
	    map->get_or_insert (block) = stmt;
	  else
	    {
	      /* Split block before insn.  The insn is in the new block.  */
	      edge e = split_block (block, gsi_stmt (gsi));

	      block = e->dest;
	      map->get_or_insert (block) = stmt;
	    }
	}
    }
}
329 | |
/* Return a human-readable name for partitioning MASK, a bitmask over
   the gang (bit 0), worker (bit 1) and vector (bit 2) axes.  */

static const char *
mask_name (unsigned mask)
{
  static const char *const names[] = {
    "gang redundant",
    "gang partitioned",
    "worker partitioned",
    "gang+worker partitioned",
    "vector partitioned",
    "gang+vector partitioned",
    "worker+vector partitioned",
    "fully partitioned"
  };

  return mask < 8 ? names[mask] : "<illegal>";
}
346 | |
347 | /* Dump this parallel and all its inner parallels. */ |
348 | /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_dump_pars'. */ |
349 | |
350 | static void |
351 | omp_sese_dump_pars (parallel_g *par, unsigned depth) |
352 | { |
353 | fprintf (stream: dump_file, format: "%u: mask %d (%s) head=%d, tail=%d\n" , |
354 | depth, par->mask, mask_name (mask: par->mask), |
355 | par->forked_block ? par->forked_block->index : -1, |
356 | par->join_block ? par->join_block->index : -1); |
357 | |
358 | fprintf (stream: dump_file, format: " blocks:" ); |
359 | |
360 | basic_block block; |
361 | for (unsigned ix = 0; par->blocks.iterate (ix, ptr: &block); ix++) |
362 | fprintf (stream: dump_file, format: " %d" , block->index); |
363 | fprintf (stream: dump_file, format: "\n" ); |
364 | if (par->inner) |
365 | omp_sese_dump_pars (par: par->inner, depth: depth + 1); |
366 | |
367 | if (par->next) |
368 | omp_sese_dump_pars (par: par->next, depth); |
369 | } |
370 | |
371 | /* If BLOCK contains a fork/join marker, process it to create or |
372 | terminate a loop structure. Add this block to the current loop, |
373 | and then walk successor blocks. */ |
374 | /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_find_par'. */ |
375 | |
static parallel_g *
omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
{
  /* Each block is processed once; the visited flag was cleared by
     omp_sese_split_blocks.  */
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  if (gimple **stmtp = map->get (block))
    {
      gimple *stmt = *stmtp;

      if (gimple_code (stmt) == GIMPLE_COND
	  || gimple_code (stmt) == GIMPLE_SWITCH
	  || gimple_code (stmt) == GIMPLE_RETURN
	  || (gimple_code (stmt) == GIMPLE_CALL
	      && !gimple_call_internal_p (stmt))
	  || is_gimple_assign (stmt))
	{
	  /* A single block that is forced to be at the maximum partition
	     level.  Make a singleton par for it.  */
	  par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG)
				     | GOMP_DIM_MASK (GOMP_DIM_WORKER)
				     | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
	  par->forked_block = block;
	  par->forked_stmt = stmt;
	  par->blocks.safe_push (block);
	  /* The singleton is complete; continue in the enclosing par,
	     skipping the normal "add block to current par" step.  */
	  par = par->parent;
	  goto walk_successors;
	}
      else if (gimple_nop_p (stmt))
	{
	  /* A NOP placeholder inserted by omp_sese_split_blocks: the
	     single predecessor must end in an IFN_UNIQUE fork marker.
	     Open a new (child) parallel for the forked region.  */
	  basic_block pred = single_pred (block);
	  gcc_assert (pred);
	  gimple_stmt_iterator gsi = gsi_last_bb (pred);
	  gimple *final_stmt = gsi_stmt (gsi);

	  if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
	    {
	      gcall *call = as_a <gcall *> (final_stmt);
	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
		TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      if (k == IFN_UNIQUE_OACC_FORK)
		{
		  /* Argument 2 holds the partitioned dimension, or -1.  */
		  HOST_WIDE_INT dim
		    = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		  unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;

		  par = new parallel_g (par, mask);
		  par->forked_block = block;
		  par->forked_stmt = final_stmt;
		  par->fork_stmt = stmt;
		}
	      else
		gcc_unreachable ();
	    }
	  else
	    gcc_unreachable ();
	}
      else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  /* A join marker terminates the current parallel; its dimension
	     must match the mask the parallel was opened with.  */
	  gcall *call = as_a <gcall *> (stmt);
	  enum ifn_unique_kind k = ((enum ifn_unique_kind)
	    TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
	  if (k == IFN_UNIQUE_OACC_JOIN)
	    {
	      HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
	      unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;

	      gcc_assert (par->mask == mask);
	      par->join_block = block;
	      par->join_stmt = stmt;
	      par = par->parent;
	    }
	  else
	    gcc_unreachable ();
	}
      else
	gcc_unreachable ();
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel_g (0, 0);

 walk_successors:
  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    omp_sese_find_par (map, par, e->dest);

  return par;
}
474 | |
475 | /* DFS walk the CFG looking for fork & join markers. Construct |
476 | loop structures as we go. MAP is a mapping of basic blocks |
477 | to head & tail markers, discovered when splitting blocks. This |
478 | speeds up the discovery. We rely on the BB visited flag having |
479 | been cleared when splitting blocks. */ |
480 | /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_discover_pars'. */ |
481 | |
482 | static parallel_g * |
483 | omp_sese_discover_pars (bb_stmt_map_t *map) |
484 | { |
485 | basic_block block; |
486 | |
487 | /* Mark exit blocks as visited. */ |
488 | block = EXIT_BLOCK_PTR_FOR_FN (cfun); |
489 | block->flags |= BB_VISITED; |
490 | |
491 | /* And entry block as not. */ |
492 | block = ENTRY_BLOCK_PTR_FOR_FN (cfun); |
493 | block->flags &= ~BB_VISITED; |
494 | |
495 | parallel_g *par = omp_sese_find_par (map, par: 0, block); |
496 | |
497 | if (dump_file) |
498 | { |
499 | fprintf (stream: dump_file, format: "\nLoops\n" ); |
500 | omp_sese_dump_pars (par, depth: 0); |
501 | fprintf (stream: dump_file, format: "\n" ); |
502 | } |
503 | |
504 | return par; |
505 | } |
506 | |
507 | static void |
508 | populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single, |
509 | bitmap vector_single, unsigned outer_mask, |
510 | int depth) |
511 | { |
512 | unsigned mask = outer_mask | par->mask; |
513 | |
514 | basic_block block; |
515 | |
516 | for (unsigned i = 0; par->blocks.iterate (ix: i, ptr: &block); i++) |
517 | { |
518 | if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) |
519 | bitmap_set_bit (worker_single, block->index); |
520 | |
521 | if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0) |
522 | bitmap_set_bit (vector_single, block->index); |
523 | } |
524 | |
525 | if (par->inner) |
526 | populate_single_mode_bitmaps (par: par->inner, worker_single, vector_single, |
527 | outer_mask: mask, depth: depth + 1); |
528 | if (par->next) |
529 | populate_single_mode_bitmaps (par: par->next, worker_single, vector_single, |
530 | outer_mask, depth); |
531 | } |
532 | |
533 | /* A map from SSA names or var decls to record fields. */ |
534 | |
535 | typedef hash_map<tree, tree> field_map_t; |
536 | |
537 | /* For each propagation record type, this is a map from SSA names or var decls |
538 | to propagate, to the field in the record type that should be used for |
539 | transmission and reception. */ |
540 | |
541 | typedef hash_map<tree, field_map_t> record_field_map_t; |
542 | |
543 | static void |
544 | install_var_field (tree var, tree record_type, field_map_t *fields) |
545 | { |
546 | tree name; |
547 | char tmp[20]; |
548 | |
549 | if (TREE_CODE (var) == SSA_NAME) |
550 | { |
551 | name = SSA_NAME_IDENTIFIER (var); |
552 | if (!name) |
553 | { |
554 | sprintf (s: tmp, format: "_%u" , (unsigned) SSA_NAME_VERSION (var)); |
555 | name = get_identifier (tmp); |
556 | } |
557 | } |
558 | else if (VAR_P (var)) |
559 | { |
560 | name = DECL_NAME (var); |
561 | if (!name) |
562 | { |
563 | sprintf (s: tmp, format: "D_%u" , (unsigned) DECL_UID (var)); |
564 | name = get_identifier (tmp); |
565 | } |
566 | } |
567 | else |
568 | gcc_unreachable (); |
569 | |
570 | gcc_assert (!fields->get (var)); |
571 | |
572 | tree type = TREE_TYPE (var); |
573 | |
574 | if (POINTER_TYPE_P (type) |
575 | && TYPE_RESTRICT (type)) |
576 | type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT); |
577 | |
578 | tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type); |
579 | |
580 | if (VAR_P (var) && type == TREE_TYPE (var)) |
581 | { |
582 | SET_DECL_ALIGN (field, DECL_ALIGN (var)); |
583 | DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var); |
584 | TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var); |
585 | } |
586 | else |
587 | SET_DECL_ALIGN (field, TYPE_ALIGN (type)); |
588 | |
589 | fields->put (k: var, v: field); |
590 | |
591 | insert_field_into_struct (record_type, field); |
592 | } |
593 | |
594 | /* Sets of SSA_NAMES or VAR_DECLs to propagate. */ |
595 | typedef hash_set<tree> propagation_set; |
596 | |
597 | static void |
598 | find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask, |
599 | bitmap worker_single, bitmap vector_single, |
600 | vec<propagation_set *> *prop_set) |
601 | { |
602 | unsigned mask = outer_mask | par->mask; |
603 | |
604 | if (par->inner) |
605 | find_ssa_names_to_propagate (par: par->inner, outer_mask: mask, worker_single, |
606 | vector_single, prop_set); |
607 | if (par->next) |
608 | find_ssa_names_to_propagate (par: par->next, outer_mask, worker_single, |
609 | vector_single, prop_set); |
610 | |
611 | if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) |
612 | { |
613 | basic_block block; |
614 | int ix; |
615 | |
616 | for (ix = 0; par->blocks.iterate (ix, ptr: &block); ix++) |
617 | { |
618 | for (gphi_iterator psi = gsi_start_phis (block); |
619 | !gsi_end_p (i: psi); gsi_next (i: &psi)) |
620 | { |
621 | gphi *phi = psi.phi (); |
622 | use_operand_p use; |
623 | ssa_op_iter iter; |
624 | |
625 | FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE) |
626 | { |
627 | tree var = USE_FROM_PTR (use); |
628 | |
629 | if (TREE_CODE (var) != SSA_NAME) |
630 | continue; |
631 | |
632 | gimple *def_stmt = SSA_NAME_DEF_STMT (var); |
633 | |
634 | if (gimple_nop_p (g: def_stmt)) |
635 | continue; |
636 | |
637 | basic_block def_bb = gimple_bb (g: def_stmt); |
638 | |
639 | if (bitmap_bit_p (worker_single, def_bb->index)) |
640 | { |
641 | if (!(*prop_set)[def_bb->index]) |
642 | (*prop_set)[def_bb->index] = new propagation_set; |
643 | |
644 | propagation_set *ws_prop = (*prop_set)[def_bb->index]; |
645 | |
646 | ws_prop->add (k: var); |
647 | } |
648 | } |
649 | } |
650 | |
651 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb: block); |
652 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
653 | { |
654 | use_operand_p use; |
655 | ssa_op_iter iter; |
656 | gimple *stmt = gsi_stmt (i: gsi); |
657 | |
658 | FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE) |
659 | { |
660 | tree var = USE_FROM_PTR (use); |
661 | |
662 | gimple *def_stmt = SSA_NAME_DEF_STMT (var); |
663 | |
664 | if (gimple_nop_p (g: def_stmt)) |
665 | continue; |
666 | |
667 | basic_block def_bb = gimple_bb (g: def_stmt); |
668 | |
669 | if (bitmap_bit_p (worker_single, def_bb->index)) |
670 | { |
671 | if (!(*prop_set)[def_bb->index]) |
672 | (*prop_set)[def_bb->index] = new propagation_set; |
673 | |
674 | propagation_set *ws_prop = (*prop_set)[def_bb->index]; |
675 | |
676 | ws_prop->add (k: var); |
677 | } |
678 | } |
679 | } |
680 | } |
681 | } |
682 | } |
683 | |
684 | /* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a |
685 | statement. */ |
686 | |
687 | static tree |
688 | find_partitioned_var_uses_1 (tree *node, int *, void *data) |
689 | { |
690 | walk_stmt_info *wi = (walk_stmt_info *) data; |
691 | hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info; |
692 | |
693 | if (!wi->is_lhs && VAR_P (*node)) |
694 | partitioned_var_uses->add (k: *node); |
695 | |
696 | return NULL_TREE; |
697 | } |
698 | |
699 | static void |
700 | find_partitioned_var_uses (parallel_g *par, unsigned outer_mask, |
701 | hash_set<tree> *partitioned_var_uses) |
702 | { |
703 | unsigned mask = outer_mask | par->mask; |
704 | |
705 | if (par->inner) |
706 | find_partitioned_var_uses (par: par->inner, outer_mask: mask, partitioned_var_uses); |
707 | if (par->next) |
708 | find_partitioned_var_uses (par: par->next, outer_mask, partitioned_var_uses); |
709 | |
710 | if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) |
711 | { |
712 | basic_block block; |
713 | int ix; |
714 | |
715 | for (ix = 0; par->blocks.iterate (ix, ptr: &block); ix++) |
716 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb: block); |
717 | !gsi_end_p (i: gsi); gsi_next (i: &gsi)) |
718 | { |
719 | walk_stmt_info wi; |
720 | memset (s: &wi, c: 0, n: sizeof (wi)); |
721 | wi.info = (void *) partitioned_var_uses; |
722 | walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi); |
723 | } |
724 | } |
725 | } |
726 | |
727 | /* Gang-private variables (typically placed in a GPU's shared memory) do not |
728 | need to be processed by the worker-propagation mechanism. Populate the |
729 | GANG_PRIVATE_VARS set with any such variables found in the current |
730 | function. */ |
731 | |
732 | static void |
733 | find_gang_private_vars (hash_set<tree> *gang_private_vars) |
734 | { |
735 | basic_block block; |
736 | |
737 | FOR_EACH_BB_FN (block, cfun) |
738 | { |
739 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb: block); |
740 | !gsi_end_p (i: gsi); |
741 | gsi_next (i: &gsi)) |
742 | { |
743 | gimple *stmt = gsi_stmt (i: gsi); |
744 | |
745 | if (gimple_call_internal_p (gs: stmt, fn: IFN_UNIQUE)) |
746 | { |
747 | enum ifn_unique_kind k = ((enum ifn_unique_kind) |
748 | TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); |
749 | if (k == IFN_UNIQUE_OACC_PRIVATE) |
750 | { |
751 | HOST_WIDE_INT level |
752 | = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2)); |
753 | if (level != GOMP_DIM_GANG) |
754 | continue; |
755 | for (unsigned i = 3; i < gimple_call_num_args (gs: stmt); i++) |
756 | { |
757 | tree arg = gimple_call_arg (gs: stmt, index: i); |
758 | gcc_assert (TREE_CODE (arg) == ADDR_EXPR); |
759 | tree decl = TREE_OPERAND (arg, 0); |
760 | gang_private_vars->add (k: decl); |
761 | } |
762 | } |
763 | } |
764 | } |
765 | } |
766 | } |
767 | |
/* Walk the parallel tree PAR (enclosing partitioning OUTER_MASK) and, in
   each worker-single block, find local scalar variables that a statement
   may clobber while also being used in worker-partitioned mode
   (PARTITIONED_VAR_USES).  Such variables are added to the block's set in
   PROP_SET so their values can be broadcast.  Writes to gang-private
   variables (GANG_PRIVATE_VARS) instead set the block's bit in
   WRITES_GANG_PRIVATE.  */

static void
find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
			      hash_set<tree> *partitioned_var_uses,
			      hash_set<tree> *gang_private_vars,
			      bitmap writes_gang_private,
			      vec<propagation_set *> *prop_set)
{
  unsigned mask = outer_mask | par->mask;

  if (par->inner)
    find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
				  gang_private_vars, writes_gang_private,
				  prop_set);
  if (par->next)
    find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
				  gang_private_vars, writes_gang_private,
				  prop_set);

  /* Only worker-single blocks can write values that other workers miss.  */
  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
    {
      basic_block block;
      int ix;

      for (ix = 0; par->blocks.iterate (ix, &block); ix++)
	{
	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	       !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      gimple *stmt = gsi_stmt (gsi);
	      tree var;
	      unsigned i;

	      FOR_EACH_LOCAL_DECL (cfun, i, var)
		{
		  /* Skip globals, aggregates (handled by forcing their
		     writes into redundant mode elsewhere), and variables
		     never used in partitioned mode.  */
		  if (!VAR_P (var)
		      || is_global_var (var)
		      || AGGREGATE_TYPE_P (TREE_TYPE (var))
		      || !partitioned_var_uses->contains (var))
		    continue;

		  /* Conservative alias-oracle query: STMT may write VAR.  */
		  if (stmt_may_clobber_ref_p (stmt, var))
		    {
		      if (dump_file)
			{
			  fprintf (dump_file, "bb %u: local variable may be "
				   "clobbered in %s mode: ", block->index,
				   mask_name (mask));
			  print_generic_expr (dump_file, var, TDF_SLIM);
			  fprintf (dump_file, "\n");
			}

		      if (gang_private_vars->contains (var))
			{
			  /* If we write a gang-private variable, we want a
			     barrier at the end of the block.  */
			  bitmap_set_bit (writes_gang_private, block->index);
			  continue;
			}

		      if (!(*prop_set)[block->index])
			(*prop_set)[block->index] = new propagation_set;

		      propagation_set *ws_prop
			= (*prop_set)[block->index];

		      ws_prop->add (var);
		    }
		}
	    }
	}
    }
}
840 | |
841 | /* Transform basic blocks FROM, TO (which may be the same block) into: |
842 | if (GOACC_single_start ()) |
843 | BLOCK; |
844 | GOACC_barrier (); |
845 | \ | / |
846 | +----+ |
847 | | | (new) predicate block |
848 | +----+-- |
849 | \ | / \ | / |t \ |
850 | +----+ +----+ +----+ | |
851 | | | | | ===> | | | f (old) from block |
852 | +----+ +----+ +----+ | |
853 | | t/ \f | / |
854 | +----+/ |
855 | (split (split before | | skip block |
856 | at end) condition) +----+ |
857 | t/ \f |
858 | */ |
859 | |
static void
worker_single_simple (basic_block from, basic_block to,
		      hash_set<tree> *def_escapes_block)
{
  gimple *call, *cond;
  tree lhs, decl;
  basic_block skip_block;

  /* Split TO so that everything after it becomes the join point; if TO
     ends in a control-flow statement, split just before it so that
     statement stays with its successor edges.  */
  gimple_stmt_iterator gsi = gsi_last_bb (to);
  if (EDGE_COUNT (to->succs) > 1)
    {
      gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
      gsi_prev (&gsi);
    }
  edge e = split_block (to, gsi_stmt (gsi));
  skip_block = e->dest;

  gimple_stmt_iterator start = gsi_after_labels (from);

  /* lhs = GOACC_single_start () -- true for the single active worker.  */
  decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
  lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
  call = gimple_build_call (decl, 0);
  gimple_call_set_lhs (call, lhs);
  gsi_insert_before (&start, call, GSI_NEW_STMT);
  update_stmt (call);

  /* if (lhs == true) fall into the guarded region, else skip it.  */
  cond = gimple_build_cond (EQ_EXPR, lhs,
			    fold_convert_loc (UNKNOWN_LOCATION,
					      TREE_TYPE (lhs),
					      boolean_true_node),
			    NULL_TREE, NULL_TREE);
  gsi_insert_after (&start, cond, GSI_NEW_STMT);
  update_stmt (cond);

  edge et = split_block (from, cond);
  et->flags &= ~EDGE_FALLTHRU;
  et->flags |= EDGE_TRUE_VALUE;
  /* Make the active worker the more probable path so we prefer fallthrough
     (letting the idle workers jump around more).  */
  et->probability = profile_probability::likely ();

  edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
  ef->probability = et->probability.invert ();

  /* Split the false edge so we have a block to park placeholder
     assignments for the neutered workers.  */
  basic_block neutered = split_edge (ef);
  gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);

  /* For each SSA definition inside the guarded region that escapes it
     (per DEF_ESCAPES_BLOCK), build a PHI in SKIP_BLOCK merging the real
     value from the active-worker edge with a dummy zero from the
     neutered edge, and redirect later uses to the PHI result.  */
  for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      ssa_op_iter iter;
      tree var;

      FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
	{
	  if (def_escapes_block->contains (var))
	    {
	      gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
	      create_new_def_for (var, join_phi,
				  gimple_phi_result_ptr (join_phi));
	      add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);

	      tree neutered_def = copy_ssa_name (var, NULL);
	      /* We really want "don't care" or some value representing
		 undefined here, but optimizers will probably get rid of the
		 zero-assignments anyway.  */
	      gassign *zero = gimple_build_assign (neutered_def,
				build_zero_cst (TREE_TYPE (neutered_def)));

	      gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
	      update_stmt (zero);

	      add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered),
			   UNKNOWN_LOCATION);
	      update_stmt (join_phi);
	    }
	}
    }
}
939 | |
940 | static tree |
941 | build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields) |
942 | { |
943 | tree x = build_simple_mem_ref (receiver_decl); |
944 | tree field = *fields->get (k: var); |
945 | TREE_THIS_NOTRAP (x) = 1; |
946 | x = omp_build_component_ref (obj: x, field); |
947 | return x; |
948 | } |
949 | |
950 | static tree |
951 | build_sender_ref (tree var, tree sender_decl, field_map_t *fields) |
952 | { |
953 | if (POINTER_TYPE_P (TREE_TYPE (sender_decl))) |
954 | sender_decl = build_simple_mem_ref (sender_decl); |
955 | tree field = *fields->get (k: var); |
956 | return omp_build_component_ref (obj: sender_decl, field); |
957 | } |
958 | |
959 | static int |
960 | sort_by_ssa_version_or_uid (const void *p1, const void *p2) |
961 | { |
962 | const tree t1 = *(const tree *)p1; |
963 | const tree t2 = *(const tree *)p2; |
964 | |
965 | if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME) |
966 | return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2); |
967 | else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME) |
968 | return -1; |
969 | else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME) |
970 | return 1; |
971 | else |
972 | return DECL_UID (t1) - DECL_UID (t2); |
973 | } |
974 | |
975 | static int |
976 | sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2) |
977 | { |
978 | const tree t1 = *(const tree *)p1; |
979 | const tree t2 = *(const tree *)p2; |
980 | unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1))); |
981 | unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2))); |
982 | if (s1 != s2) |
983 | return s2 - s1; |
984 | else |
985 | return sort_by_ssa_version_or_uid (p1, p2); |
986 | } |
987 | |
/* Predicate the block FROM..TO (callers pass the same block for both) so
   that only one "active" worker executes it, broadcasting any definitions
   the other (neutered) workers need through a shared-memory record.
   DEF_ESCAPES_BLOCK holds the defs live outside the block;
   WORKER_PARTITIONED_USES the subset also used in worker-partitioned code.
   RECORD_TYPE (fields per variable recorded in RECORD_FIELD_MAP) describes
   the broadcast buffer, placed at shared-memory offset PLACEMENT.
   ISOLATE_BROADCASTS is set when no exclusive shared-memory range could be
   allocated for this block, so extra barriers must isolate the buffer's
   use from other blocks re-using the same memory.  HAS_GANG_PRIVATE_WRITE
   notes that the block writes gang-private storage, which also requires a
   barrier before releasing the neutered workers.  */

static void
worker_single_copy (basic_block from, basic_block to,
		    hash_set<tree> *def_escapes_block,
		    hash_set<tree> *worker_partitioned_uses,
		    tree record_type, record_field_map_t *record_field_map,
		    unsigned HOST_WIDE_INT placement,
		    bool isolate_broadcasts, bool has_gang_private_write)
{
  /* If we only have virtual defs, we'll have no record type, but we still want
     to emit single_copy_start and (particularly) single_copy_end to act as
     a vdef source on the neutered edge representing memory writes on the
     non-neutered edge.  */
  if (!record_type)
    record_type = char_type_node;

  /* Target-specific send ("_o") and receive ("_i") views of the shared
     broadcast buffer at offset PLACEMENT.  */
  tree sender_decl
    = targetm.goacc.create_worker_broadcast_record (record_type, true,
						    ".oacc_worker_o",
						    placement);
  tree receiver_decl
    = targetm.goacc.create_worker_broadcast_record (record_type, false,
						    ".oacc_worker_i",
						    placement);

  /* Split off the end of TO; the new block (BARRIER_BLOCK) is where all
     workers meet again.  If TO ends in a control-flow statement, split
     before it so it stays at the end of its block.  */
  gimple_stmt_iterator gsi = gsi_last_bb (bb: to);
  if (EDGE_COUNT (to->succs) > 1)
    gsi_prev (i: &gsi);
  edge e = split_block (to, gsi_stmt (i: gsi));
  basic_block barrier_block = e->dest;

  gimple_stmt_iterator start = gsi_after_labels (bb: from);

  /* LHS = GOACC_single_copy_start (&sender).  Judging from the EQ_EXPR
     test below, this apparently yields a NULL pointer in the single
     active worker and a non-NULL (buffer) pointer in the others.  */
  tree decl = builtin_decl_explicit (fncode: BUILT_IN_GOACC_SINGLE_COPY_START);

  tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));

  gimple *call
    = gimple_build_call (decl, 1,
			 POINTER_TYPE_P (TREE_TYPE (sender_decl))
			 ? sender_decl : build_fold_addr_expr (sender_decl));
  gimple_call_set_lhs (gs: call, lhs);
  gsi_insert_before (&start, call, GSI_NEW_STMT);
  update_stmt (s: call);

  /* The shared-memory range for this block overflowed.  Add a barrier before
     the GOACC_single_copy_start call.  */
  if (isolate_broadcasts)
    {
      decl = builtin_decl_explicit (fncode: BUILT_IN_GOACC_BARRIER);
      gimple *acc_bar = gimple_build_call (decl, 0);
      gsi_insert_before (&start, acc_bar, GSI_SAME_STMT);
    }

  /* Store the call's result into RECEIVER_DECL, converting to the
     receiver's type as needed.  */
  tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));

  gimple *conv = gimple_build_assign (conv_tmp,
				      fold_convert (TREE_TYPE (receiver_decl),
						    lhs));
  update_stmt (s: conv);
  gsi_insert_after (&start, conv, GSI_NEW_STMT);
  gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
  gsi_insert_after (&start, asgn, GSI_NEW_STMT);
  update_stmt (s: asgn);

  tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);

  tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
  asgn = gimple_build_assign (recv_tmp, receiver_decl);
  gsi_insert_after (&start, asgn, GSI_NEW_STMT);
  update_stmt (s: asgn);

  /* Branch on the pointer: the NULL (true-edge) path is the active
     worker, which executes the original block body.  */
  gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
				    NULL_TREE);
  update_stmt (s: cond);

  gsi_insert_after (&start, cond, GSI_NEW_STMT);

  edge et = split_block (from, cond);
  et->flags &= ~EDGE_FALLTHRU;
  et->flags |= EDGE_TRUE_VALUE;
  /* Make the active worker the more probable path so we prefer fallthrough
     (letting the idle workers jump around more).  */
  et->probability = profile_probability::likely ();

  basic_block body = et->dest;

  /* Neutered workers skip the body and branch straight to the barrier
     block.  */
  edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
  ef->probability = et->probability.invert ();

  gimple_stmt_iterator bar_gsi = gsi_start_bb (bb: barrier_block);
  cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);

  /* If anything is actually broadcast (or gang-private memory written),
     all workers must wait until the active worker has filled the send
     buffer before reading it.  */
  if (record_type != char_type_node || has_gang_private_write)
    {
      decl = builtin_decl_explicit (fncode: BUILT_IN_GOACC_BARRIER);
      gimple *acc_bar = gimple_build_call (decl, 0);

      gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
      gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
    }
  else
    gsi_insert_before (&bar_gsi, cond, GSI_NEW_STMT);

  /* After the barrier, neutered workers (non-NULL pointer, the unlikely
     true edge) go through the copy-out block; the active worker goes
     directly to the exit block.  */
  edge et2 = split_block (barrier_block, cond);
  et2->flags &= ~EDGE_FALLTHRU;
  et2->flags |= EDGE_TRUE_VALUE;
  et2->probability = profile_probability::unlikely ();

  basic_block exit_block = et2->dest;

  basic_block copyout_block = split_edge (et2);
  edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
  ef2->probability = et2->probability.invert ();

  gimple_stmt_iterator copyout_gsi = gsi_start_bb (bb: copyout_block);

  edge copyout_to_exit = single_succ_edge (bb: copyout_block);

  /* Statements filling the send buffer are accumulated here and emitted
     at the end of the active worker's body, see below.  */
  gimple_seq sender_seq = NULL;

  /* Make sure we iterate over definitions in a stable order.  */
  auto_vec<tree> escape_vec (def_escapes_block->elements ());
  for (hash_set<tree>::iterator it = def_escapes_block->begin ();
       it != def_escapes_block->end (); ++it)
    escape_vec.quick_push (obj: *it);
  escape_vec.qsort (sort_by_ssa_version_or_uid);

  for (unsigned i = 0; i < escape_vec.length (); i++)
    {
      tree var = escape_vec[i];

      /* Virtual operands are represented by the single_copy_start/end
	 pair itself; nothing to copy explicitly.  */
      if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
	continue;

      tree barrier_def = 0;

      if (TREE_CODE (var) == SSA_NAME)
	{
	  gimple *def_stmt = SSA_NAME_DEF_STMT (var);

	  if (gimple_nop_p (g: def_stmt))
	    continue;

	  /* The barrier phi takes one result from the actual work of the
	     block we're neutering, and the other result is constant zero of
	     the same type.  */

	  gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
	  barrier_def = create_new_def_for (var, barrier_phi,
					    gimple_phi_result_ptr (gs: barrier_phi));

	  add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
	  add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
		       UNKNOWN_LOCATION);

	  update_stmt (s: barrier_phi);
	}
      else
	gcc_assert (VAR_P (var));

      /* If we had no record type, we will have no fields map.  */
      field_map_t *fields = record_field_map->get (k: record_type);

      if (worker_partitioned_uses->contains (k: var)
	  && fields
	  && fields->get (k: var))
	{
	  tree neutered_def = make_ssa_name (TREE_TYPE (var));

	  /* Receive definition from shared memory block.  */

	  tree receiver_ref = build_receiver_ref (var, receiver_decl, fields);
	  gassign *recv = gimple_build_assign (neutered_def,
					       receiver_ref);
	  gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
	  update_stmt (s: recv);

	  if (VAR_P (var))
	    {
	      /* If it's a VAR_DECL, we only copied to an SSA temporary.  Copy
		 to the final location now.  */
	      gassign *asgn = gimple_build_assign (var, neutered_def);
	      gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
	      update_stmt (s: asgn);
	    }
	  else
	    {
	      /* If it's an SSA name, create a new phi at the join node to
		 represent either the output from the active worker (the
		 barrier) or the inactive workers (the copyout block).  */
	      gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
	      create_new_def_for (barrier_def, join_phi,
				  gimple_phi_result_ptr (gs: join_phi));
	      add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
	      add_phi_arg (join_phi, neutered_def, copyout_to_exit,
			   UNKNOWN_LOCATION);
	      update_stmt (s: join_phi);
	    }

	  /* Send definition to shared memory block.  */

	  tree sender_ref = build_sender_ref (var, sender_decl, fields);

	  if (TREE_CODE (var) == SSA_NAME)
	    {
	      gassign *send = gimple_build_assign (sender_ref, var);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (s: send);
	    }
	  else if (VAR_P (var))
	    {
	      /* Copy through an SSA temporary so the store into the send
		 buffer is a valid GIMPLE assignment.  */
	      tree tmp = make_ssa_name (TREE_TYPE (var));
	      gassign *send = gimple_build_assign (tmp, var);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (s: send);
	      send = gimple_build_assign (sender_ref, tmp);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (s: send);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  /* The shared-memory range for this block overflowed.  Add a barrier at the
     end.  */
  if (isolate_broadcasts)
    {
      gsi = gsi_start_bb (bb: exit_block);
      decl = builtin_decl_explicit (fncode: BUILT_IN_GOACC_BARRIER);
      gimple *acc_bar = gimple_build_call (decl, 0);
      gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
    }

  /* It's possible for the ET->DEST block (the work done by the active thread)
     to finish with a control-flow insn, e.g. a UNIQUE function call.  Split
     the block and add SENDER_SEQ in the latter part to avoid having control
     flow in the middle of a BB.  */

  decl = builtin_decl_explicit (fncode: BUILT_IN_GOACC_SINGLE_COPY_END);
  call = gimple_build_call (decl, 1,
			    POINTER_TYPE_P (TREE_TYPE (sender_decl))
			    ? sender_decl
			    : build_fold_addr_expr (sender_decl));
  gimple_seq_add_stmt (&sender_seq, call);

  gsi = gsi_last_bb (bb: body);
  gimple *last = gsi_stmt (i: gsi);
  basic_block sender_block = split_block (body, last)->dest;
  gsi = gsi_last_bb (bb: sender_block);
  gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
}
1240 | |
/* Maps a basic block to the shared-memory offset chosen for its broadcast
   buffer, paired with a flag saying whether an exclusive address range was
   successfully allocated (false means the block must isolate its
   broadcasts with extra barriers; see worker_single_copy).  */
typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> >
  blk_offset_map_t;
1243 | |
/* Neuter (predicate for a single worker) every basic block of the parallel
   region PAR that executes in worker-single mode, then recurse into inner
   and sibling regions.  OUTER_MASK is the accumulated partitioning mask of
   enclosing regions; WORKER_SINGLE/VECTOR_SINGLE are bitmaps of block
   indices executing in worker-/vector-single mode.  PROP_SET gives, per
   block index, the set of local variables to propagate; entries are
   consumed (deleted and zeroed) here.  PARTITIONED_VAR_USES lists
   variables used in partitioned code; RECORD_FIELD_MAP and BLK_OFFSET_MAP
   describe each block's broadcast record and its shared-memory placement.
   WRITES_GANG_PRIVATE marks blocks that write gang-private storage.  */

static void
neuter_worker_single (parallel_g *par, unsigned outer_mask,
		      bitmap worker_single, bitmap vector_single,
		      vec<propagation_set *> *prop_set,
		      hash_set<tree> *partitioned_var_uses,
		      record_field_map_t *record_field_map,
		      blk_offset_map_t *blk_offset_map,
		      bitmap writes_gang_private)
{
  unsigned mask = outer_mask | par->mask;

  if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
    {
      /* This region is worker-single: predicate each of its blocks.  */
      basic_block block;

      for (unsigned i = 0; par->blocks.iterate (ix: i, ptr: &block); i++)
	{
	  bool has_defs = false;
	  hash_set<tree> def_escapes_block;
	  hash_set<tree> worker_partitioned_uses;
	  unsigned j;
	  tree var;

	  /* Collect SSA names defined in BLOCK that are used outside it
	     and/or used in worker-partitioned blocks.  */
	  FOR_EACH_SSA_NAME (j, var, cfun)
	    {
	      if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
		{
		  /* Note: any virtual operand in the function (not just in
		     this block) forces the copy scheme, so memory effects
		     are represented on the neutered path.  */
		  has_defs = true;
		  continue;
		}

	      gimple *def_stmt = SSA_NAME_DEF_STMT (var);

	      if (gimple_nop_p (g: def_stmt))
		continue;

	      if (gimple_bb (g: def_stmt)->index != block->index)
		continue;

	      gimple *use_stmt;
	      imm_use_iterator use_iter;
	      bool uses_outside_block = false;
	      bool worker_partitioned_use = false;

	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
		{
		  int blocknum = gimple_bb (g: use_stmt)->index;

		  /* Don't propagate SSA names that are only used in the
		     current block, unless the usage is in a phi node: that
		     means the name left the block, then came back in at the
		     top.  */
		  if (blocknum != block->index
		      || gimple_code (g: use_stmt) == GIMPLE_PHI)
		    uses_outside_block = true;
		  if (!bitmap_bit_p (worker_single, blocknum))
		    worker_partitioned_use = true;
		}

	      if (uses_outside_block)
		def_escapes_block.add (k: var);

	      if (worker_partitioned_use)
		{
		  worker_partitioned_uses.add (k: var);
		  has_defs = true;
		}
	    }

	  /* Add the local VAR_DECLs recorded for this block; the PROP_SET
	     entry is consumed and freed here.  */
	  propagation_set *ws_prop = (*prop_set)[block->index];

	  if (ws_prop)
	    {
	      for (propagation_set::iterator it = ws_prop->begin ();
		   it != ws_prop->end ();
		   ++it)
		{
		  tree var = *it;
		  if (TREE_CODE (var) == VAR_DECL)
		    {
		      def_escapes_block.add (k: var);
		      if (partitioned_var_uses->contains (k: var))
			{
			  worker_partitioned_uses.add (k: var);
			  has_defs = true;
			}
		    }
		}

	      delete ws_prop;
	      (*prop_set)[block->index] = 0;
	    }

	  /* Scan the block to see whether it consists solely of marker
	     calls that a later pass removes, and whether it contains a
	     JOIN marker (which must never be predicated).  */
	  bool only_marker_fns = true;
	  bool join_block = false;

	  for (gimple_stmt_iterator gsi = gsi_start_bb (bb: block);
	       !gsi_end_p (i: gsi);
	       gsi_next (i: &gsi))
	    {
	      gimple *stmt = gsi_stmt (i: gsi);
	      if (gimple_code (g: stmt) == GIMPLE_CALL
		  && gimple_call_internal_p (gs: stmt, fn: IFN_UNIQUE))
		{
		  enum ifn_unique_kind k = ((enum ifn_unique_kind)
		    TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
		  if (k != IFN_UNIQUE_OACC_PRIVATE
		      && k != IFN_UNIQUE_OACC_JOIN
		      && k != IFN_UNIQUE_OACC_FORK
		      && k != IFN_UNIQUE_OACC_HEAD_MARK
		      && k != IFN_UNIQUE_OACC_TAIL_MARK)
		    only_marker_fns = false;
		  else if (k == IFN_UNIQUE_OACC_JOIN)
		    /* The JOIN marker is special in that it *cannot* be
		       predicated for worker zero, because it may be lowered
		       to a barrier instruction and all workers must typically
		       execute that barrier.  We shouldn't be doing any
		       broadcasts from the join block anyway.  */
		    join_block = true;
		}
	      else if (gimple_code (g: stmt) == GIMPLE_CALL
		       && gimple_call_internal_p (gs: stmt, fn: IFN_GOACC_LOOP))
		/* Empty.  */;
	      else if (gimple_nop_p (g: stmt))
		/* Empty.  */;
	      else
		only_marker_fns = false;
	    }

	  /* We can skip predicating this block for worker zero if the only
	     thing it contains is marker functions that will be removed in the
	     oaccdevlow pass anyway.
	     Don't do this if the block has (any) phi nodes, because those
	     might define SSA names that need broadcasting.
	     TODO: We might be able to skip transforming blocks that only
	     contain some other trivial statements too.  */
	  if (only_marker_fns && !phi_nodes (bb: block))
	    continue;

	  gcc_assert (!join_block);

	  if (has_defs)
	    {
	      /* Some defs need broadcasting: use the shared-memory copy
		 scheme, at the offset allocated for this block (if any).  */
	      tree record_type = (tree) block->aux;
	      std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc
		= blk_offset_map->get (k: block);
	      gcc_assert (!record_type || off_rngalloc);
	      unsigned HOST_WIDE_INT offset
		= off_rngalloc ? off_rngalloc->first : 0;
	      bool range_allocated
		= off_rngalloc ? off_rngalloc->second : true;
	      bool has_gang_private_write
		= bitmap_bit_p (writes_gang_private, block->index);
	      worker_single_copy (from: block, to: block, def_escapes_block: &def_escapes_block,
				  worker_partitioned_uses: &worker_partitioned_uses, record_type,
				  record_field_map,
				  placement: offset, isolate_broadcasts: !range_allocated,
				  has_gang_private_write);
	    }
	  else
	    /* Nothing to broadcast: simple predication suffices.  */
	    worker_single_simple (from: block, to: block, def_escapes_block: &def_escapes_block);
	}
    }

  if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
    {
      basic_block block;

      for (unsigned i = 0; par->blocks.iterate (ix: i, ptr: &block); i++)
	for (gimple_stmt_iterator gsi = gsi_start_bb (bb: block);
	     !gsi_end_p (i: gsi);
	     gsi_next (i: &gsi))
	  {
	    gimple *stmt = gsi_stmt (i: gsi);

	    if (gimple_code (g: stmt) == GIMPLE_CALL
		&& !gimple_call_internal_p (gs: stmt)
		&& !omp_sese_active_worker_call (call: as_a <gcall *> (p: stmt)))
	      {
		/* If we have an OpenACC routine call in worker-single mode,
		   place barriers before and afterwards to prevent
		   clobbering re-used shared memory regions (as are used
		   for AMDGCN at present, for example).  */
		tree decl = builtin_decl_explicit (fncode: BUILT_IN_GOACC_BARRIER);
		gsi_insert_before (&gsi, gimple_build_call (decl, 0),
				   GSI_SAME_STMT);
		gsi_insert_after (&gsi, gimple_build_call (decl, 0),
				  GSI_NEW_STMT);
	      }
	  }
    }

  /* Recurse into nested and sibling parallel regions.  */
  if (par->inner)
    neuter_worker_single (par: par->inner, outer_mask: mask, worker_single, vector_single,
			  prop_set, partitioned_var_uses, record_field_map,
			  blk_offset_map, writes_gang_private);
  if (par->next)
    neuter_worker_single (par: par->next, outer_mask, worker_single, vector_single,
			  prop_set, partitioned_var_uses, record_field_map,
			  blk_offset_map, writes_gang_private);
}
1445 | |
1446 | static void |
1447 | dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable) |
1448 | { |
1449 | if (bb->flags & BB_VISITED) |
1450 | return; |
1451 | |
1452 | bb->flags |= BB_VISITED; |
1453 | |
1454 | if (bb->succs) |
1455 | { |
1456 | edge e; |
1457 | edge_iterator ei; |
1458 | FOR_EACH_EDGE (e, ei, bb->succs) |
1459 | { |
1460 | basic_block dest = e->dest; |
1461 | if (dest->aux) |
1462 | bitmap_set_bit (map: reachable, bitno: dest->index); |
1463 | else |
1464 | dfs_broadcast_reachable_1 (bb: dest, reachable); |
1465 | } |
1466 | } |
1467 | } |
1468 | |
/* A (basic block index, broadcast record type) pair, used to order blocks
   for shared-memory range allocation.  */
typedef std::pair<int, tree> idx_decl_pair_t;

/* Per-block-index vector of splay trees recording the shared-memory
   address ranges already in use.  */
typedef auto_vec<splay_tree> used_range_vec_t;
1472 | |
1473 | static int |
1474 | sort_size_descending (const void *a, const void *b) |
1475 | { |
1476 | const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a; |
1477 | const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b; |
1478 | unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second)); |
1479 | unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second)); |
1480 | return bsize - asize; |
1481 | } |
1482 | |
1483 | class addr_range |
1484 | { |
1485 | public: |
1486 | addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi) |
1487 | : lo (addr_lo), hi (addr_hi) |
1488 | { } |
1489 | addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi) |
1490 | { } |
1491 | addr_range () : lo (0), hi (0) |
1492 | { } |
1493 | |
1494 | bool invalid () { return lo == 0 && hi == 0; } |
1495 | |
1496 | unsigned HOST_WIDE_INT lo; |
1497 | unsigned HOST_WIDE_INT hi; |
1498 | }; |
1499 | |
1500 | static int |
1501 | splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b) |
1502 | { |
1503 | addr_range *ar = (addr_range *) a; |
1504 | addr_range *br = (addr_range *) b; |
1505 | if (ar->lo == br->lo && ar->hi == br->hi) |
1506 | return 0; |
1507 | if (ar->hi <= br->lo) |
1508 | return -1; |
1509 | else if (ar->lo >= br->hi) |
1510 | return 1; |
1511 | return 0; |
1512 | } |
1513 | |
1514 | static void |
1515 | splay_tree_free_key (splay_tree_key k) |
1516 | { |
1517 | addr_range *ar = (addr_range *) k; |
1518 | delete ar; |
1519 | } |
1520 | |
/* Find a free address range of SIZE bytes aligned to ALIGN (a power of
   two) among the occupied ranges recorded in splay tree S, constrained to
   lie within *BOUNDS.  First-fit: scan the gaps between successive
   occupied ranges in address order, then the space after the last range.
   Returns the invalid (all-zero) range if nothing fits.  */

static addr_range
first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size,
		 unsigned HOST_WIDE_INT align, addr_range *bounds)
{
  splay_tree_node min = splay_tree_min (s);
  if (min)
    {
      splay_tree_node next;
      /* Try the gap between each pair of adjacent occupied ranges.
	 NOTE(review): the gap between BOUNDS->lo and the first occupied
	 range is not considered -- presumably allocations always start at
	 the lower bound; confirm.  */
      while ((next = splay_tree_successor (s, min->key)))
	{
	  unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi;
	  unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo;
	  /* Round the gap's start up to the required alignment.  */
	  unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1);
	  if (base + size <= hi)
	    return addr_range (base, base + size);
	  min = next;
	}

      /* No inter-range gap was big enough: try the space after the last
	 occupied range, up to the upper bound.  */
      unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi;
      base = (base + align - 1) & ~(align - 1);
      if (base + size <= bounds->hi)
	return addr_range (base, base + size);
      else
	return addr_range ();
    }
  else
    {
      /* Nothing allocated yet: place the range at the (aligned) lower
	 bound if it fits.  */
      unsigned HOST_WIDE_INT lo = bounds->lo;
      lo = (lo + align - 1) & ~(align - 1);
      if (lo + size <= bounds->hi)
	return addr_range (lo, lo + size);
      else
	return addr_range ();
    }
}
1556 | |
/* Splay-tree iteration callback: merge the address range at node N into
   the accumulator tree PTR (a splay_tree).  The accumulator's comparator
   treats overlapping ranges as equal, so a lookup finds an existing range
   intersecting this one; if found, replace it by the union of the two.  */

static int
merge_ranges_1 (splay_tree_node n, void *ptr)
{
  splay_tree accum = (splay_tree) ptr;
  addr_range ar = *(addr_range *) n->key;

  splay_tree_node old = splay_tree_lookup (accum, n->key);

  /* We might have an overlap.  Create a new range covering the
     overlapping parts.  */
  if (old)
    {
      addr_range *old_ar = (addr_range *) old->key;
      ar.lo = MIN (old_ar->lo, ar.lo);
      ar.hi = MAX (old_ar->hi, ar.hi);
      splay_tree_remove (accum, old->key);
    }

  /* The new key is owned by ACCUM; its delete-key callback
     (splay_tree_free_key) frees it.
     NOTE(review): if the widened range now overlaps a second existing
     range, that one is not merged here -- confirm inputs cannot produce
     such chains.  */
  addr_range *new_ar = new addr_range (ar);

  splay_tree_insert (accum, (splay_tree_key) new_ar, n->value);

  return 0;
}
1581 | |
/* Merge every address range recorded in splay tree SP into ACCUM,
   unioning overlapping ranges (see merge_ranges_1).  */

static void
merge_ranges (splay_tree accum, splay_tree sp)
{
  splay_tree_foreach (sp, merge_ranges_1, (void *) accum);
}
1587 | |
/* Perform the worker-neutering/broadcasting transformation on the current
   (offloaded) function.  [BOUNDS_LO, BOUNDS_HI) is the window of target
   shared memory available for broadcast buffers.  */

static void
oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
		   unsigned HOST_WIDE_INT bounds_hi)
{
  bb_stmt_map_t bb_stmt_map;
  auto_bitmap worker_single, vector_single;

  /* Split basic blocks as required by the SESE-region analysis (see
     omp_sese_split_blocks).  */
  omp_sese_split_blocks (map: &bb_stmt_map);

  if (dump_file)
    {
      fprintf (stream: dump_file, format: "\n\nAfter splitting:\n\n");
      dump_function_to_file (current_function_decl, dump_file, dump_flags);
    }

  unsigned mask = 0;

  /* If this is a routine, calculate MASK as if the outer levels are already
     partitioned.  */
  {
    tree attr = oacc_get_fn_attrib (fn: current_function_decl);
    tree dims = TREE_VALUE (attr);
    unsigned ix;
    for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
      {
	tree allowed = TREE_PURPOSE (dims);
	if (allowed && integer_zerop (allowed))
	  mask |= GOMP_DIM_MASK (ix);
      }
  }

  /* Discover the tree of parallel regions and which blocks execute in
     worker-/vector-single mode.  */
  parallel_g *par = omp_sese_discover_pars (map: &bb_stmt_map);
  populate_single_mode_bitmaps (par, worker_single, vector_single, outer_mask: mask, depth: 0);

  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    bb->aux = NULL;

  /* Per-block sets of variables that must be propagated to neutered
     workers, filled by the find_* helpers below.  */
  vec<propagation_set *> prop_set (vNULL);
  prop_set.safe_grow_cleared (last_basic_block_for_fn (cfun), exact: true);

  find_ssa_names_to_propagate (par, outer_mask: mask, worker_single, vector_single,
			       prop_set: &prop_set);

  hash_set<tree> partitioned_var_uses;
  hash_set<tree> gang_private_vars;
  auto_bitmap writes_gang_private;

  find_gang_private_vars (gang_private_vars: &gang_private_vars);
  find_partitioned_var_uses (par, outer_mask: mask, partitioned_var_uses: &partitioned_var_uses);
  find_local_vars_to_propagate (par, outer_mask: mask, partitioned_var_uses: &partitioned_var_uses,
				gang_private_vars: &gang_private_vars, writes_gang_private,
				prop_set: &prop_set);

  record_field_map_t record_field_map;

  /* For each block with variables to propagate, build a RECORD_TYPE
     gathering them (largest field first, in a stable order) and hang it
     off the block's ->aux.  */
  FOR_ALL_BB_FN (bb, cfun)
    {
      propagation_set *ws_prop = prop_set[bb->index];
      if (ws_prop)
	{
	  tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
	  tree name = create_tmp_var_name (".oacc_ws_data_s");
	  name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
	  DECL_ARTIFICIAL (name) = 1;
	  DECL_NAMELESS (name) = 1;
	  TYPE_NAME (record_type) = name;
	  TYPE_ARTIFICIAL (record_type) = 1;

	  /* Sort into a stable order so the record layout is
	     reproducible.  */
	  auto_vec<tree> field_vec (ws_prop->elements ());
	  for (hash_set<tree>::iterator it = ws_prop->begin ();
	       it != ws_prop->end (); ++it)
	    field_vec.quick_push (obj: *it);

	  field_vec.qsort (sort_by_size_then_ssa_version_or_uid);

	  bool existed;
	  field_map_t *fields
	    = &record_field_map.get_or_insert (k: record_type, existed: &existed);
	  gcc_checking_assert (!existed);

	  /* Insert var fields in reverse order, so the last inserted element
	     is the first in the structure.  */
	  for (int i = field_vec.length () - 1; i >= 0; i--)
	    install_var_field (var: field_vec[i], record_type, fields);

	  layout_type (record_type);

	  bb->aux = (tree) record_type;
	}
    }

  /* REACHABLE[i]: the set of broadcasting blocks reachable from block I
     without passing through another broadcasting block -- these are the
     blocks I's buffer may conflict with.  */
  sbitmap *reachable
    = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
			    last_basic_block_for_fn (cfun));

  bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun));

  auto_vec<std::pair<int, tree> > priority;

  FOR_ALL_BB_FN (bb, cfun)
    {
      if (bb->aux)
	{
	  tree record_type = (tree) bb->aux;

	  /* Each DFS needs fresh visited flags.  */
	  basic_block bb2;
	  FOR_ALL_BB_FN (bb2, cfun)
	    bb2->flags &= ~BB_VISITED;

	  priority.safe_push (obj: std::make_pair (x&: bb->index, y&: record_type));
	  dfs_broadcast_reachable_1 (bb, reachable: reachable[bb->index]);
	}
    }

  /* Symmetrize the conflict relation: if J is reachable from I, then I
     must also be treated as conflicting with J.  */
  sbitmap *inverted
    = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
			    last_basic_block_for_fn (cfun));

  bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun));

  for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
    {
      sbitmap_iterator bi;
      unsigned int j;
      EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi)
	bitmap_set_bit (map: inverted[j], bitno: i);
    }

  for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
    bitmap_ior (reachable[i], reachable[i], inverted[i]);

  sbitmap_vector_free (vec: inverted);

  used_range_vec_t used_ranges;

  used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun));

  blk_offset_map_t blk_offset_map;

  addr_range worker_shm_bounds (bounds_lo, bounds_hi);

  /* Allocate a shared-memory range for each block's broadcast buffer,
     biggest record first, avoiding ranges used by any conflicting
     block.  */
  priority.qsort (sort_size_descending);
  for (unsigned int i = 0; i < priority.length (); i++)
    {
      idx_decl_pair_t p = priority[i];
      int blkno = p.first;
      tree record_type = p.second;
      HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type);

      /* Gather the ranges already claimed by this block and by every
	 block it conflicts with.  */
      splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range,
					     splay_tree_free_key, NULL);

      if (!used_ranges[blkno])
	used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range,
					     splay_tree_free_key, NULL);
      else
	merge_ranges (accum: conflicts, sp: used_ranges[blkno]);

      sbitmap_iterator bi;
      unsigned int j;
      EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi)
	if (used_ranges[j])
	  merge_ranges (accum: conflicts, sp: used_ranges[j]);

      addr_range ar
	= first_fit_range (s: conflicts, size, align, bounds: &worker_shm_bounds);

      splay_tree_delete (conflicts);

      if (ar.invalid ())
	{
	  /* No exclusive range fits: fall back to the (aligned) lower
	     bound and flag the placement as unallocated, which makes the
	     neutering code isolate the broadcast with barriers.  */
	  unsigned HOST_WIDE_INT base
	    = (bounds_lo + align - 1) & ~(align - 1);
	  if (base + size > bounds_hi)
	    error_at (UNKNOWN_LOCATION, "shared-memory region overflow");
	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
	    = std::make_pair (x&: base, y: false);
	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), v: base_inrng);
	}
      else
	{
	  /* Sanity check: the chosen range must not already be claimed by
	     this block.  */
	  splay_tree_node old = splay_tree_lookup (used_ranges[blkno],
						   (splay_tree_key) &ar);
	  if (old)
	    {
	      fprintf (stderr, format: "trying to map [%d..%d] but [%d..%d] is "
		       "already mapped in block %d\n", (int) ar.lo,
		       (int) ar.hi, (int) ((addr_range *) old->key)->lo,
		       (int) ((addr_range *) old->key)->hi, blkno);
	      abort ();
	    }

	  addr_range *arp = new addr_range (ar);
	  splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp,
			     (splay_tree_value) blkno);
	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
	    = std::make_pair (x&: ar.lo, y: true);
	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), v: base_inrng);
	}
    }

  sbitmap_vector_free (vec: reachable);

  /* Perform the actual neutering/broadcasting transformation.  */
  neuter_worker_single (par, outer_mask: mask, worker_single, vector_single, prop_set: &prop_set,
			partitioned_var_uses: &partitioned_var_uses, record_field_map: &record_field_map,
			blk_offset_map: &blk_offset_map, writes_gang_private);

  record_field_map.empty ();

  /* These are supposed to have been 'delete'd by 'neuter_worker_single'.  */
  for (auto it : prop_set)
    gcc_checking_assert (!it);
  prop_set.release ();

  delete par;

  /* This doesn't seem to make a difference.  */
  loops_state_clear (flags: LOOP_CLOSED_SSA);

  /* Neutering worker-single neutered blocks will invalidate dominance info.
     It may be possible to incrementally update just the affected blocks, but
     obliterate everything for now.  */
  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);

  if (dump_file)
    {
      fprintf (stream: dump_file, format: "\n\nAfter neutering:\n\n");
      dump_function_to_file (current_function_decl, dump_file, dump_flags);
    }
}
1821 | |
/* Main pass body: scan the current offloaded function for users of
   per-launch shared memory (OpenACC reduction temporaries and gang-private
   variables), accumulate the number of bytes each parallelism dimension
   needs, ask the target for the usable shared-memory window, and then run
   the worker-neutering/broadcasting transformation — unless the function
   is known to run with a single worker, in which case no neutering is
   needed.  Always returns 0 (no extra TODO flags).  */
static int
execute_omp_oacc_neuter_broadcast ()
{
  /* Bytes of shared memory required, per parallelism dimension
     (gang/worker/vector), for reductions and for private variables.  */
  unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX];
  unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX];

  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    {
      reduction_size[i] = 0;
      private_size[i] = 0;
    }

  /* Calculate shared memory size required for reduction variables and
     gang-private memory for this offloaded function.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    {
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	   !gsi_end_p (i: gsi);
	   gsi_next (i: &gsi))
	{
	  /* Only internal-function calls are of interest here.  */
	  gimple *stmt = gsi_stmt (i: gsi);
	  if (!is_gimple_call (gs: stmt))
	    continue;
	  gcall *call = as_a <gcall *> (p: stmt);
	  if (!gimple_call_internal_p (gs: call))
	    continue;
	  enum internal_fn ifn_code = gimple_call_internal_fn (gs: call);
	  switch (ifn_code)
	    {
	    default: break;
	    case IFN_GOACC_REDUCTION:
	      /* Argument 3 is the partitioning level; -1 means the
		 reduction is not associated with a level, so skip it.
		 NB: this 'continue' advances the statement iterator of
		 the enclosing loop, it does not merely leave the
		 switch.  */
	      if (integer_minus_onep (gimple_call_arg (gs: call, index: 3)))
		continue;
	      else
		{
		  unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
		  /* Only count reduction variables once: the choice to pick
		     the setup call is fairly arbitrary.  */
		  if (code == IFN_GOACC_REDUCTION_SETUP)
		    {
		      int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
		      tree var = gimple_call_arg (gs: call, index: 2);
		      tree offset = gimple_call_arg (gs: call, index: 5);
		      tree var_type = TREE_TYPE (var);
		      /* The variable occupies [OFFSET, OFFSET + size) in
			 the buffer; grow this dimension's requirement to
			 cover its end.  */
		      unsigned HOST_WIDE_INT limit
			= (tree_to_uhwi (offset)
			   + tree_to_uhwi (TYPE_SIZE_UNIT (var_type)));
		      reduction_size[level]
			= MAX (reduction_size[level], limit);
		    }
		}
	      break;
	    case IFN_UNIQUE:
	      {
		enum ifn_unique_kind kind
		  = ((enum ifn_unique_kind)
		     TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

		if (kind == IFN_UNIQUE_OACC_PRIVATE)
		  {
		    /* Argument 2 is the partitioning level; -1 means
		       there is nothing to account for.  */
		    HOST_WIDE_INT level
		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		    if (level == -1)
		      break;
		    /* Trailing arguments (from index 3 on) are '&DECL'
		       expressions, one per private variable.  */
		    for (unsigned i = 3;
			 i < gimple_call_num_args (gs: call);
			 i++)
		      {
			tree arg = gimple_call_arg (gs: call, index: i);
			gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
			tree decl = TREE_OPERAND (arg, 0);
			/* Round the running size up to DECL's alignment
			   before adding DECL's own size.  */
			unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl);
			private_size[level] = ((private_size[level] + align - 1)
					       & ~(align - 1));
			unsigned HOST_WIDE_INT decl_size
			  = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl)));
			private_size[level] += decl_size;
		      }
		  }
	      }
	      break;
	    }
	}
    }

  /* Launch dimensions (gangs/workers/vector length) for this function.  */
  int dims[GOMP_DIM_MAX];
  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    dims[i] = oacc_get_fn_dim_size (fn: current_function_decl, axis: i);

  /* Find bounds of shared-memory buffer space we can use.  */
  unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0;
  if (targetm.goacc.shared_mem_layout)
    targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims,
				     private_size, reduction_size);

  /* Perform worker partitioning unless we know 'num_workers(1)'.  */
  if (dims[GOMP_DIM_WORKER] != 1)
    oacc_do_neutering (bounds_lo, bounds_hi);

  return 0;
}
1924 | |
1925 | namespace { |
1926 | |
/* Static metadata for the pass: a GIMPLE pass in the OMP optimization
   group, with no timing variable of its own; it requires a CFG and
   schedules an incremental SSA update plus a CFG cleanup after it has
   run (neutering introduces new control flow).  */
const pass_data pass_data_omp_oacc_neuter_broadcast =
{
  .type: GIMPLE_PASS, /* type */
  .name: "omp_oacc_neuter_broadcast" , /* name */
  .optinfo_flags: OPTGROUP_OMP, /* optinfo_flags */
  .tv_id: TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  .properties_provided: 0, /* properties_provided */
  .properties_destroyed: 0, /* properties_destroyed */
  .todo_flags_start: 0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};
1939 | |
1940 | class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass |
1941 | { |
1942 | public: |
1943 | pass_omp_oacc_neuter_broadcast (gcc::context *ctxt) |
1944 | : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt) |
1945 | {} |
1946 | |
1947 | /* opt_pass methods: */ |
1948 | bool gate (function *fun) final override |
1949 | { |
1950 | if (!flag_openacc) |
1951 | return false; |
1952 | |
1953 | if (!targetm.goacc.create_worker_broadcast_record) |
1954 | return false; |
1955 | |
1956 | /* Only relevant for OpenACC offloaded functions. */ |
1957 | tree attr = oacc_get_fn_attrib (fn: fun->decl); |
1958 | if (!attr) |
1959 | return false; |
1960 | |
1961 | return true; |
1962 | } |
1963 | |
1964 | unsigned int execute (function *) final override |
1965 | { |
1966 | return execute_omp_oacc_neuter_broadcast (); |
1967 | } |
1968 | |
1969 | }; // class pass_omp_oacc_neuter_broadcast |
1970 | |
1971 | } // anon namespace |
1972 | |
1973 | gimple_opt_pass * |
1974 | make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt) |
1975 | { |
1976 | return new pass_omp_oacc_neuter_broadcast (ctxt); |
1977 | } |
1978 | |