1 | /* Loop distribution. |
2 | Copyright (C) 2006-2023 Free Software Foundation, Inc. |
3 | Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr> |
4 | and Sebastian Pop <sebastian.pop@amd.com>. |
5 | |
6 | This file is part of GCC. |
7 | |
8 | GCC is free software; you can redistribute it and/or modify it |
9 | under the terms of the GNU General Public License as published by the |
10 | Free Software Foundation; either version 3, or (at your option) any |
11 | later version. |
12 | |
13 | GCC is distributed in the hope that it will be useful, but WITHOUT |
14 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
15 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
16 | for more details. |
17 | |
18 | You should have received a copy of the GNU General Public License |
19 | along with GCC; see the file COPYING3. If not see |
20 | <http://www.gnu.org/licenses/>. */ |
21 | |
22 | /* This pass performs loop distribution: for example, the loop |
23 | |
24 | |DO I = 2, N |
25 | | A(I) = B(I) + C |
26 | | D(I) = A(I-1)*E |
27 | |ENDDO |
28 | |
29 | is transformed to |
30 | |
31 | |DOALL I = 2, N |
32 | | A(I) = B(I) + C |
33 | |ENDDO |
34 | | |
35 | |DOALL I = 2, N |
36 | | D(I) = A(I-1)*E |
37 | |ENDDO |
38 | |
39 | Loop distribution is the dual of loop fusion. It separates statements |
40 | of a loop (or loop nest) into multiple loops (or loop nests) with the |
same loop header. The major goal is to separate statements that can
be vectorized from those that can't. This pass implements distribution
43 | in the following steps: |
44 | |
1) Seed partitions with statements of specific types. For now we
support two types of seed statements: statements defining a variable
used outside of the loop, and statements storing to memory.
2) Build a reduced dependence graph (RDG) for the loop to be
distributed. The vertices (RDG:V) model all statements in the loop
and the edges (RDG:E) model flow and control dependencies between
statements.
3) Apart from the RDG, compute data dependencies between memory
references.
4) Starting from a seed statement, build up a partition by adding
dependent statements according to the RDG's dependence information.
A partition is classified as parallel if it can be executed in
parallel, or as sequential if it can't. A parallel partition is
further classified into different builtin kinds if it can be
implemented as builtin function calls.
5) Build a partition dependence graph (PG) based on data dependencies.
The vertices (PG:V) model all partitions and the edges (PG:E) model
the data dependencies between every pair of partitions. In general,
a data dependence is either known or unknown at compilation time. In
C family languages, there are quite a few compile-time-unknown
dependencies because of possible aliasing between data references.
We categorize PG's edges into two types: "true" edges representing
compile-time-known data dependencies, and "alias" edges for all other
data dependencies.
6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
Merge the partitions in each strongly connected component (SCC)
correspondingly. Build a new PG for the merged partitions.
7) Traverse PG again, this time with both "true" and "alias" edges
included. We try to break SCCs by removing some edges. Because SCCs
formed by "true" edges were all fused in step 6), we can break the
remaining SCCs by removing some "alias" edges. It's NP-hard to choose
an optimal edge set; fortunately, a simple approximation is good
enough for us given the small problem scale.
8) Collect all data dependencies of the removed "alias" edges. Create
runtime alias checks for the collected data dependencies.
9) Version the loop under the condition of the runtime alias checks.
Since loop distribution generally introduces additional overhead, it
is only useful if vectorization is achieved in a distributed loop. We
version the loop with the internal function call IFN_LOOP_DIST_ALIAS.
If no distributed loop can be vectorized, we simply remove the
distributed loops and recover the original one.
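
As an illustrative sketch (the exact guard is whatever the loop
versioning code emits), after step 9) the code is structured roughly as

|IF (IFN_LOOP_DIST_ALIAS (<runtime alias checks>, ...)) THEN
|  distributed loop 1
|  distributed loop 2
|  ...
|ELSE
|  original loop
|ENDIF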
84 | |
85 | TODO: |
1) We only distribute innermost two-level loop nests now. We should
extend the support to arbitrary loop nests in the future.
2) We only fuse partitions in SCCs now. A better fusion algorithm is
desired to minimize loop overhead, maximize parallelism and maximize
data reuse. */
91 | |
92 | #include "config.h" |
93 | #include "system.h" |
94 | #include "coretypes.h" |
95 | #include "backend.h" |
96 | #include "tree.h" |
97 | #include "gimple.h" |
98 | #include "cfghooks.h" |
99 | #include "tree-pass.h" |
100 | #include "ssa.h" |
101 | #include "gimple-pretty-print.h" |
102 | #include "fold-const.h" |
103 | #include "cfganal.h" |
104 | #include "gimple-iterator.h" |
105 | #include "gimplify-me.h" |
106 | #include "stor-layout.h" |
107 | #include "tree-cfg.h" |
108 | #include "tree-ssa-loop-manip.h" |
109 | #include "tree-ssa-loop-ivopts.h" |
110 | #include "tree-ssa-loop.h" |
111 | #include "tree-into-ssa.h" |
112 | #include "tree-ssa.h" |
113 | #include "cfgloop.h" |
114 | #include "tree-scalar-evolution.h" |
115 | #include "tree-vectorizer.h" |
116 | #include "tree-eh.h" |
117 | #include "gimple-fold.h" |
118 | #include "tree-affine.h" |
119 | #include "intl.h" |
120 | #include "rtl.h" |
121 | #include "memmodel.h" |
122 | #include "optabs.h" |
123 | #include "tree-ssa-loop-niter.h" |
124 | |
125 | |
126 | #define MAX_DATAREFS_NUM \ |
127 | ((unsigned) param_loop_max_datarefs_for_datadeps) |
128 | |
/* Threshold controlling the number of distributed partitions. Since it
may become unnecessary if a memory stream cost model is invented in the
future, we define it as a temporary macro rather than a parameter. */
132 | #define NUM_PARTITION_THRESHOLD (4) |
133 | |
134 | /* Hashtable helpers. */ |
135 | |
136 | struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation> |
137 | { |
138 | static inline hashval_t hash (const data_dependence_relation *); |
139 | static inline bool equal (const data_dependence_relation *, |
140 | const data_dependence_relation *); |
141 | }; |
142 | |
143 | /* Hash function for data dependence. */ |
144 | |
145 | inline hashval_t |
146 | ddr_hasher::hash (const data_dependence_relation *ddr) |
147 | { |
148 | inchash::hash h; |
149 | h.add_ptr (DDR_A (ddr)); |
150 | h.add_ptr (DDR_B (ddr)); |
151 | return h.end (); |
152 | } |
153 | |
154 | /* Hash table equality function for data dependence. */ |
155 | |
156 | inline bool |
157 | ddr_hasher::equal (const data_dependence_relation *ddr1, |
158 | const data_dependence_relation *ddr2) |
159 | { |
160 | return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2)); |
161 | } |
162 | |
163 | |
164 | |
165 | #define DR_INDEX(dr) ((uintptr_t) (dr)->aux) |
166 | |
167 | /* A Reduced Dependence Graph (RDG) vertex representing a statement. */ |
168 | struct rdg_vertex |
169 | { |
170 | /* The statement represented by this vertex. */ |
171 | gimple *stmt; |
172 | |
173 | /* Vector of data-references in this statement. */ |
174 | vec<data_reference_p> datarefs; |
175 | |
176 | /* True when the statement contains a write to memory. */ |
177 | bool has_mem_write; |
178 | |
179 | /* True when the statement contains a read from memory. */ |
180 | bool has_mem_reads; |
181 | }; |
182 | |
183 | #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt |
184 | #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs |
185 | #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write |
186 | #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads |
187 | #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I])) |
188 | #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I])) |
189 | #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I])) |
190 | #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I])) |
191 | |
192 | /* Data dependence type. */ |
193 | |
194 | enum rdg_dep_type |
195 | { |
196 | /* Read After Write (RAW). */ |
197 | flow_dd = 'f', |
198 | |
199 | /* Control dependence (execute conditional on). */ |
200 | control_dd = 'c' |
201 | }; |
202 | |
203 | /* Dependence information attached to an edge of the RDG. */ |
204 | |
205 | struct rdg_edge |
206 | { |
207 | /* Type of the dependence. */ |
208 | enum rdg_dep_type type; |
209 | }; |
210 | |
211 | #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type |
212 | |
213 | /* Kind of distributed loop. */ |
214 | enum partition_kind { |
215 | PKIND_NORMAL, |
/* Partial memset stands for a partition that can be distributed into
a loop of memset calls, rather than a single memset call. It's handled
just like a normal partition, i.e., distributed as a separate loop; no
memset call is generated.

Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
loop nest as deeply as possible. As a result, parloop achieves better
parallelization by parallelizing a deeper loop nest. This hack should
be unnecessary and removed once distributed memset can be understood
and analyzed in data reference analysis. See PR82604 for more. */
226 | PKIND_PARTIAL_MEMSET, |
227 | PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE |
228 | }; |
229 | |
230 | /* Type of distributed loop. */ |
231 | enum partition_type { |
/* The distributed loop can be executed in parallel. */
233 | PTYPE_PARALLEL = 0, |
234 | /* The distributed loop has to be executed sequentially. */ |
235 | PTYPE_SEQUENTIAL |
236 | }; |
237 | |
238 | /* Builtin info for loop distribution. */ |
239 | struct builtin_info |
240 | { |
241 | /* data-references a kind != PKIND_NORMAL partition is about. */ |
242 | data_reference_p dst_dr; |
243 | data_reference_p src_dr; |
244 | /* Base address and size of memory objects operated by the builtin. Note |
245 | both dest and source memory objects must have the same size. */ |
246 | tree dst_base; |
247 | tree src_base; |
248 | tree size; |
249 | /* Base and offset part of dst_base after stripping constant offset. This |
250 | is only used in memset builtin distribution for now. */ |
251 | tree dst_base_base; |
252 | unsigned HOST_WIDE_INT dst_base_offset; |
253 | }; |
254 | |
255 | /* Partition for loop distribution. */ |
256 | struct partition |
257 | { |
258 | /* Statements of the partition. */ |
259 | bitmap stmts; |
260 | /* True if the partition defines variable which is used outside of loop. */ |
261 | bool reduction_p; |
262 | location_t loc; |
263 | enum partition_kind kind; |
264 | enum partition_type type; |
265 | /* Data references in the partition. */ |
266 | bitmap datarefs; |
/* Information of builtin partition. */
268 | struct builtin_info *builtin; |
269 | }; |
270 | |
271 | /* Partitions are fused because of different reasons. */ |
272 | enum fuse_type |
273 | { |
274 | FUSE_NON_BUILTIN = 0, |
275 | FUSE_REDUCTION = 1, |
276 | FUSE_SHARE_REF = 2, |
277 | FUSE_SAME_SCC = 3, |
278 | FUSE_FINALIZE = 4 |
279 | }; |
280 | |
/* Descriptions of the different fusing reasons. */
static const char *fuse_message[] = {
"they are non-builtins",
"they have reductions",
"they have shared memory refs",
"they are in the same dependence scc",
"there is no point to distribute loop"};
288 | |
289 | |
290 | /* Dump vertex I in RDG to FILE. */ |
291 | |
292 | static void |
293 | dump_rdg_vertex (FILE *file, struct graph *rdg, int i) |
294 | { |
295 | struct vertex *v = &(rdg->vertices[i]); |
296 | struct graph_edge *e; |
297 | |
298 | fprintf (stream: file, format: "(vertex %d: (%s%s) (in:" , i, |
299 | RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "" , |
300 | RDG_MEM_READS_STMT (rdg, i) ? "r" : "" ); |
301 | |
302 | if (v->pred) |
303 | for (e = v->pred; e; e = e->pred_next) |
304 | fprintf (stream: file, format: " %d" , e->src); |
305 | |
306 | fprintf (stream: file, format: ") (out:" ); |
307 | |
308 | if (v->succ) |
309 | for (e = v->succ; e; e = e->succ_next) |
310 | fprintf (stream: file, format: " %d" , e->dest); |
311 | |
312 | fprintf (stream: file, format: ")\n" ); |
313 | print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS); |
314 | fprintf (stream: file, format: ")\n" ); |
315 | } |
316 | |
317 | /* Call dump_rdg_vertex on stderr. */ |
318 | |
319 | DEBUG_FUNCTION void |
320 | debug_rdg_vertex (struct graph *rdg, int i) |
321 | { |
322 | dump_rdg_vertex (stderr, rdg, i); |
323 | } |
324 | |
325 | /* Dump the reduced dependence graph RDG to FILE. */ |
326 | |
327 | static void |
328 | dump_rdg (FILE *file, struct graph *rdg) |
329 | { |
330 | fprintf (stream: file, format: "(rdg\n" ); |
331 | for (int i = 0; i < rdg->n_vertices; i++) |
332 | dump_rdg_vertex (file, rdg, i); |
333 | fprintf (stream: file, format: ")\n" ); |
334 | } |
335 | |
336 | /* Call dump_rdg on stderr. */ |
337 | |
338 | DEBUG_FUNCTION void |
339 | debug_rdg (struct graph *rdg) |
340 | { |
341 | dump_rdg (stderr, rdg); |
342 | } |
343 | |
344 | static void |
345 | dot_rdg_1 (FILE *file, struct graph *rdg) |
346 | { |
347 | int i; |
348 | pretty_printer buffer; |
349 | pp_needs_newline (&buffer) = false; |
350 | buffer.buffer->stream = file; |
351 | |
352 | fprintf (stream: file, format: "digraph RDG {\n" ); |
353 | |
354 | for (i = 0; i < rdg->n_vertices; i++) |
355 | { |
356 | struct vertex *v = &(rdg->vertices[i]); |
357 | struct graph_edge *e; |
358 | |
359 | fprintf (stream: file, format: "%d [label=\"[%d] " , i, i); |
360 | pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM); |
361 | pp_flush (&buffer); |
362 | fprintf (stream: file, format: "\"]\n" ); |
363 | |
364 | /* Highlight reads from memory. */ |
365 | if (RDG_MEM_READS_STMT (rdg, i)) |
366 | fprintf (stream: file, format: "%d [style=filled, fillcolor=green]\n" , i); |
367 | |
368 | /* Highlight stores to memory. */ |
369 | if (RDG_MEM_WRITE_STMT (rdg, i)) |
370 | fprintf (stream: file, format: "%d [style=filled, fillcolor=red]\n" , i); |
371 | |
372 | if (v->succ) |
373 | for (e = v->succ; e; e = e->succ_next) |
374 | switch (RDGE_TYPE (e)) |
375 | { |
376 | case flow_dd: |
377 | /* These are the most common dependences: don't print these. */ |
378 | fprintf (stream: file, format: "%d -> %d \n" , i, e->dest); |
379 | break; |
380 | |
381 | case control_dd: |
382 | fprintf (stream: file, format: "%d -> %d [label=control] \n" , i, e->dest); |
383 | break; |
384 | |
385 | default: |
386 | gcc_unreachable (); |
387 | } |
388 | } |
389 | |
390 | fprintf (stream: file, format: "}\n\n" ); |
391 | } |
392 | |
393 | /* Display the Reduced Dependence Graph using dotty. */ |
394 | |
395 | DEBUG_FUNCTION void |
396 | dot_rdg (struct graph *rdg) |
397 | { |
398 | /* When debugging, you may want to enable the following code. */ |
399 | #ifdef HAVE_POPEN |
FILE *file = popen ("dot -Tx11", "w");
if (!file)
return;
dot_rdg_1 (file, rdg);
fflush (file);
close (fileno (file));
pclose (file);
407 | #else |
408 | dot_rdg_1 (stderr, rdg); |
409 | #endif |
410 | } |
411 | |
412 | /* Returns the index of STMT in RDG. */ |
413 | |
414 | static int |
415 | rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt) |
416 | { |
int index = gimple_uid (stmt);
418 | gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt); |
419 | return index; |
420 | } |
421 | |
422 | /* Creates dependence edges in RDG for all the uses of DEF. IDEF is |
423 | the index of DEF in RDG. */ |
424 | |
425 | static void |
426 | create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef) |
427 | { |
428 | use_operand_p imm_use_p; |
429 | imm_use_iterator iterator; |
430 | |
431 | FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def) |
432 | { |
433 | struct graph_edge *e; |
434 | int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p)); |
435 | |
436 | if (use < 0) |
437 | continue; |
438 | |
439 | e = add_edge (rdg, idef, use); |
440 | e->data = XNEW (struct rdg_edge); |
441 | RDGE_TYPE (e) = flow_dd; |
442 | } |
443 | } |
444 | |
445 | /* Creates an edge for the control dependences of BB to the vertex V. */ |
446 | |
447 | static void |
448 | create_edge_for_control_dependence (struct graph *rdg, basic_block bb, |
449 | int v, control_dependences *cd) |
450 | { |
451 | bitmap_iterator bi; |
452 | unsigned edge_n; |
453 | EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index), |
454 | 0, edge_n, bi) |
455 | { |
456 | basic_block cond_bb = cd->get_edge_src (edge_n); |
gimple *stmt = *gsi_last_bb (cond_bb);
458 | if (stmt && is_ctrl_stmt (stmt)) |
459 | { |
460 | struct graph_edge *e; |
461 | int c = rdg_vertex_for_stmt (rdg, stmt); |
462 | if (c < 0) |
463 | continue; |
464 | |
465 | e = add_edge (rdg, c, v); |
466 | e->data = XNEW (struct rdg_edge); |
467 | RDGE_TYPE (e) = control_dd; |
468 | } |
469 | } |
470 | } |
471 | |
472 | /* Creates the edges of the reduced dependence graph RDG. */ |
473 | |
474 | static void |
475 | create_rdg_flow_edges (struct graph *rdg) |
476 | { |
477 | int i; |
478 | def_operand_p def_p; |
479 | ssa_op_iter iter; |
480 | |
481 | for (i = 0; i < rdg->n_vertices; i++) |
482 | FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i), |
483 | iter, SSA_OP_DEF) |
create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
485 | } |
486 | |
487 | /* Creates the edges of the reduced dependence graph RDG. */ |
488 | |
489 | static void |
490 | create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop) |
491 | { |
492 | int i; |
493 | |
494 | for (i = 0; i < rdg->n_vertices; i++) |
495 | { |
496 | gimple *stmt = RDG_STMT (rdg, i); |
if (gimple_code (stmt) == GIMPLE_PHI)
{
edge_iterator ei;
edge e;
FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
if (flow_bb_inside_loop_p (loop, e->src))
create_edge_for_control_dependence (rdg, e->src, i, cd);
}
else
create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
507 | } |
508 | } |
509 | |
510 | |
511 | class loop_distribution |
512 | { |
513 | private: |
514 | /* The loop (nest) to be distributed. */ |
515 | vec<loop_p> loop_nest; |
516 | |
517 | /* Vector of data references in the loop to be distributed. */ |
518 | vec<data_reference_p> datarefs_vec; |
519 | |
/* True if there is a nonaddressable data reference in the above vector. */
521 | bool has_nonaddressable_dataref_p; |
522 | |
523 | /* Store index of data reference in aux field. */ |
524 | |
525 | /* Hash table for data dependence relation in the loop to be distributed. */ |
526 | hash_table<ddr_hasher> *ddrs_table; |
527 | |
528 | /* Array mapping basic block's index to its topological order. */ |
529 | int *bb_top_order_index; |
530 | /* And size of the array. */ |
531 | int bb_top_order_index_size; |
532 | |
533 | /* Build the vertices of the reduced dependence graph RDG. Return false |
534 | if that failed. */ |
535 | bool create_rdg_vertices (struct graph *rdg, const vec<gimple *> &stmts, |
536 | loop_p loop); |
537 | |
538 | /* Initialize STMTS with all the statements of LOOP. We use topological |
539 | order to discover all statements. The order is important because |
540 | generate_loops_for_partition is using the same traversal for identifying |
541 | statements in loop copies. */ |
542 | void stmts_from_loop (class loop *loop, vec<gimple *> *stmts); |
543 | |
544 | |
545 | /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of |
546 | LOOP, and one edge per flow dependence or control dependence from control |
547 | dependence CD. During visiting each statement, data references are also |
548 | collected and recorded in global data DATAREFS_VEC. */ |
549 | struct graph * build_rdg (class loop *loop, control_dependences *cd); |
550 | |
/* Merge PARTITION into the partition DEST. RDG is the reduced
dependence graph; if it is non-NULL we also update the type of the
result partition. */
553 | void partition_merge_into (struct graph *rdg, |
554 | partition *dest, partition *partition, |
555 | enum fuse_type ft); |
556 | |
557 | |
/* Return data dependence relation for data references A and B. The two
data references must be in lexicographic order w.r.t. the reduced
dependence graph RDG. We first try to find the ddr in the global ddr
hash table. If it doesn't exist, compute the ddr and cache it. */
562 | data_dependence_relation * get_data_dependence (struct graph *rdg, |
563 | data_reference_p a, |
564 | data_reference_p b); |
565 | |
566 | |
567 | /* In reduced dependence graph RDG for loop distribution, return true if |
568 | dependence between references DR1 and DR2 leads to a dependence cycle |
569 | and such dependence cycle can't be resolved by runtime alias check. */ |
570 | bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1, |
571 | data_reference_p dr2); |
572 | |
573 | |
574 | /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update |
575 | PARTITION1's type after merging PARTITION2 into PARTITION1. */ |
576 | void update_type_for_merge (struct graph *rdg, |
577 | partition *partition1, partition *partition2); |
578 | |
579 | |
580 | /* Returns a partition with all the statements needed for computing |
581 | the vertex V of the RDG, also including the loop exit conditions. */ |
582 | partition *build_rdg_partition_for_vertex (struct graph *rdg, int v); |
583 | |
/* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
classify whether they form a builtin memcpy or memmove call. */
586 | void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition, |
587 | data_reference_p dst_dr, data_reference_p src_dr); |
588 | |
589 | /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP. |
590 | For the moment we detect memset, memcpy and memmove patterns. Bitmap |
591 | STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. |
592 | Returns true if there is a reduction in all partitions and we |
593 | possibly did not mark PARTITION as having one for this reason. */ |
594 | |
595 | bool |
596 | classify_partition (loop_p loop, |
597 | struct graph *rdg, partition *partition, |
598 | bitmap stmt_in_all_partitions); |
599 | |
600 | |
601 | /* Returns true when PARTITION1 and PARTITION2 access the same memory |
602 | object in RDG. */ |
603 | bool share_memory_accesses (struct graph *rdg, |
604 | partition *partition1, partition *partition2); |
605 | |
/* For each seed statement in STARTING_STMTS, this function builds a
partition for it by adding dependent statements according to RDG.
608 | All partitions are recorded in PARTITIONS. */ |
609 | void rdg_build_partitions (struct graph *rdg, |
610 | vec<gimple *> starting_stmts, |
611 | vec<partition *> *partitions); |
612 | |
/* Compute partition dependence created by the data references in DRS1
and DRS2, modify and return DIR according to that. If ALIAS_DDRS is
not NULL, we record dependence introduced by possible alias between
two data references in ALIAS_DDRS; otherwise, we simply ignore such
dependence as if it didn't exist at all. */
618 | int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1, |
619 | bitmap drs2, vec<ddr_p> *alias_ddrs); |
620 | |
621 | |
/* Build and return partition dependence graph for PARTITIONS. RDG is
the reduced dependence graph for the loop to be distributed. If
IGNORE_ALIAS_P is true, data dependence caused by possible alias
between references is ignored, as if it didn't exist at all; otherwise
all dependences are considered. */
627 | struct graph *build_partition_graph (struct graph *rdg, |
628 | vec<struct partition *> *partitions, |
629 | bool ignore_alias_p); |
630 | |
/* Given reduced dependence graph RDG, merge strongly connected
components of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence
caused by possible alias between references is ignored, as if it
didn't exist at all; otherwise all dependences are considered. */
635 | void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *> |
636 | *partitions, bool ignore_alias_p); |
637 | |
/* This is the main function breaking strongly connected components in
PARTITIONS given reduced dependence graph RDG. Store data dependence
relations for runtime alias check in ALIAS_DDRS. */
641 | void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *> |
642 | *partitions, vec<ddr_p> *alias_ddrs); |
643 | |
644 | |
645 | /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. |
646 | ALIAS_DDRS contains ddrs which need runtime alias check. */ |
647 | void finalize_partitions (class loop *loop, vec<struct partition *> |
648 | *partitions, vec<ddr_p> *alias_ddrs); |
649 | |
650 | /* Distributes the code from LOOP in such a way that producer statements |
651 | are placed before consumer statements. Tries to separate only the |
652 | statements from STMTS into separate loops. Returns the number of |
653 | distributed loops. Set NB_CALLS to number of generated builtin calls. |
654 | Set *DESTROY_P to whether LOOP needs to be destroyed. */ |
655 | int distribute_loop (class loop *loop, const vec<gimple *> &stmts, |
656 | control_dependences *cd, int *nb_calls, bool *destroy_p, |
657 | bool only_patterns_p); |
658 | |
659 | /* Transform loops which mimic the effects of builtins rawmemchr or strlen and |
660 | replace them accordingly. */ |
661 | bool transform_reduction_loop (loop_p loop); |
662 | |
663 | /* Compute topological order for basic blocks. Topological order is |
664 | needed because data dependence is computed for data references in |
665 | lexicographical order. */ |
666 | void bb_top_order_init (void); |
667 | |
668 | void bb_top_order_destroy (void); |
669 | |
670 | public: |
671 | |
672 | /* Getter for bb_top_order. */ |
673 | |
674 | inline int get_bb_top_order_index_size (void) |
675 | { |
676 | return bb_top_order_index_size; |
677 | } |
678 | |
679 | inline int get_bb_top_order_index (int i) |
680 | { |
681 | return bb_top_order_index[i]; |
682 | } |
683 | |
684 | unsigned int execute (function *fun); |
685 | }; |
686 | |
687 | |
/* If X has a smaller topological sort number than Y, returns a negative
value; if greater, returns a positive value. */
690 | static int |
691 | bb_top_order_cmp_r (const void *x, const void *y, void *loop) |
692 | { |
693 | loop_distribution *_loop = |
694 | (loop_distribution *) loop; |
695 | |
696 | basic_block bb1 = *(const basic_block *) x; |
697 | basic_block bb2 = *(const basic_block *) y; |
698 | |
699 | int bb_top_order_index_size = _loop->get_bb_top_order_index_size (); |
700 | |
701 | gcc_assert (bb1->index < bb_top_order_index_size |
702 | && bb2->index < bb_top_order_index_size); |
703 | gcc_assert (bb1 == bb2 |
704 | || _loop->get_bb_top_order_index(bb1->index) |
705 | != _loop->get_bb_top_order_index(bb2->index)); |
706 | |
return (_loop->get_bb_top_order_index(bb1->index) -
_loop->get_bb_top_order_index(bb2->index));
709 | } |
710 | |
711 | bool |
712 | loop_distribution::create_rdg_vertices (struct graph *rdg, |
713 | const vec<gimple *> &stmts, |
714 | loop_p loop) |
715 | { |
716 | int i; |
717 | gimple *stmt; |
718 | |
719 | FOR_EACH_VEC_ELT (stmts, i, stmt) |
720 | { |
721 | struct vertex *v = &(rdg->vertices[i]); |
722 | |
723 | /* Record statement to vertex mapping. */ |
gimple_set_uid (stmt, i);

v->data = XNEW (struct rdg_vertex);
RDGV_STMT (v) = stmt;
RDGV_DATAREFS (v).create (0);
RDGV_HAS_MEM_WRITE (v) = false;
RDGV_HAS_MEM_READS (v) = false;
if (gimple_code (stmt) == GIMPLE_PHI)
732 | continue; |
733 | |
734 | unsigned drp = datarefs_vec.length (); |
735 | if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec)) |
736 | return false; |
737 | for (unsigned j = drp; j < datarefs_vec.length (); ++j) |
738 | { |
739 | data_reference_p dr = datarefs_vec[j]; |
740 | if (DR_IS_READ (dr)) |
741 | RDGV_HAS_MEM_READS (v) = true; |
742 | else |
743 | RDGV_HAS_MEM_WRITE (v) = true; |
RDGV_DATAREFS (v).safe_push (dr);
has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
746 | } |
747 | } |
748 | return true; |
749 | } |
750 | |
751 | void |
752 | loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts) |
753 | { |
754 | unsigned int i; |
755 | basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r); |
756 | |
757 | for (i = 0; i < loop->num_nodes; i++) |
758 | { |
759 | basic_block bb = bbs[i]; |
760 | |
for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
gsi_next (&bsi))
if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
stmts->safe_push (bsi.phi ());

for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
gsi_next (&bsi))
{
gimple *stmt = gsi_stmt (bsi);
if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
stmts->safe_push (stmt);
772 | } |
773 | } |
774 | |
free (bbs);
776 | } |
777 | |
778 | /* Free the reduced dependence graph RDG. */ |
779 | |
780 | static void |
781 | free_rdg (struct graph *rdg) |
782 | { |
783 | int i; |
784 | |
785 | for (i = 0; i < rdg->n_vertices; i++) |
786 | { |
787 | struct vertex *v = &(rdg->vertices[i]); |
788 | struct graph_edge *e; |
789 | |
790 | for (e = v->succ; e; e = e->succ_next) |
free (e->data);

if (v->data)
{
gimple_set_uid (RDGV_STMT (v), -1);
(RDGV_DATAREFS (v)).release ();
free (v->data);
}
}

free_graph (rdg);
802 | } |
803 | |
804 | struct graph * |
805 | loop_distribution::build_rdg (class loop *loop, control_dependences *cd) |
806 | { |
807 | struct graph *rdg; |
808 | |
809 | /* Create the RDG vertices from the stmts of the loop nest. */ |
810 | auto_vec<gimple *, 10> stmts; |
stmts_from_loop (loop, &stmts);
812 | rdg = new_graph (stmts.length ()); |
813 | if (!create_rdg_vertices (rdg, stmts, loop)) |
814 | { |
815 | free_rdg (rdg); |
816 | return NULL; |
817 | } |
818 | stmts.release (); |
819 | |
820 | create_rdg_flow_edges (rdg); |
821 | if (cd) |
822 | create_rdg_cd_edges (rdg, cd, loop); |
823 | |
824 | return rdg; |
825 | } |
826 | |
827 | |
/* Allocate and initialize a partition. */
829 | |
830 | static partition * |
831 | partition_alloc (void) |
832 | { |
833 | partition *partition = XCNEW (struct partition); |
834 | partition->stmts = BITMAP_ALLOC (NULL); |
835 | partition->reduction_p = false; |
836 | partition->loc = UNKNOWN_LOCATION; |
837 | partition->kind = PKIND_NORMAL; |
838 | partition->type = PTYPE_PARALLEL; |
839 | partition->datarefs = BITMAP_ALLOC (NULL); |
840 | return partition; |
841 | } |
842 | |
843 | /* Free PARTITION. */ |
844 | |
845 | static void |
846 | partition_free (partition *partition) |
847 | { |
848 | BITMAP_FREE (partition->stmts); |
849 | BITMAP_FREE (partition->datarefs); |
850 | if (partition->builtin) |
free (partition->builtin);

free (partition);
854 | } |
855 | |
856 | /* Returns true if the partition can be generated as a builtin. */ |
857 | |
858 | static bool |
859 | partition_builtin_p (partition *partition) |
860 | { |
861 | return partition->kind > PKIND_PARTIAL_MEMSET; |
862 | } |
863 | |
864 | /* Returns true if the partition contains a reduction. */ |
865 | |
866 | static bool |
867 | partition_reduction_p (partition *partition) |
868 | { |
869 | return partition->reduction_p; |
870 | } |
871 | |
872 | void |
873 | loop_distribution::partition_merge_into (struct graph *rdg, |
874 | partition *dest, partition *partition, enum fuse_type ft) |
875 | { |
876 | if (dump_file && (dump_flags & TDF_DETAILS)) |
877 | { |
878 | fprintf (stream: dump_file, format: "Fuse partitions because %s:\n" , fuse_message[ft]); |
879 | fprintf (stream: dump_file, format: " Part 1: " ); |
880 | dump_bitmap (file: dump_file, map: dest->stmts); |
881 | fprintf (stream: dump_file, format: " Part 2: " ); |
882 | dump_bitmap (file: dump_file, map: partition->stmts); |
883 | } |
884 | |
885 | dest->kind = PKIND_NORMAL; |
886 | if (dest->type == PTYPE_PARALLEL) |
887 | dest->type = partition->type; |
888 | |
889 | bitmap_ior_into (dest->stmts, partition->stmts); |
890 | if (partition_reduction_p (partition)) |
891 | dest->reduction_p = true; |
892 | |
/* Further check if any data dependence prevents us from executing the
new partition in parallel. */
if (dest->type == PTYPE_PARALLEL && rdg != NULL)
update_type_for_merge (rdg, dest, partition);
897 | |
898 | bitmap_ior_into (dest->datarefs, partition->datarefs); |
899 | } |
900 | |
901 | |
902 | /* Returns true when DEF is an SSA_NAME defined in LOOP and used after |
903 | the LOOP. */ |
904 | |
905 | static bool |
906 | ssa_name_has_uses_outside_loop_p (tree def, loop_p loop) |
907 | { |
908 | imm_use_iterator imm_iter; |
909 | use_operand_p use_p; |
910 | |
911 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def) |
912 | { |
913 | if (is_gimple_debug (USE_STMT (use_p))) |
914 | continue; |
915 | |
916 | basic_block use_bb = gimple_bb (USE_STMT (use_p)); |
917 | if (!flow_bb_inside_loop_p (loop, use_bb)) |
918 | return true; |
919 | } |
920 | |
921 | return false; |
922 | } |
923 | |
924 | /* Returns true when STMT defines a scalar variable used after the |
925 | loop LOOP. */ |
926 | |
927 | static bool |
928 | stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt) |
929 | { |
930 | def_operand_p def_p; |
931 | ssa_op_iter op_iter; |
932 | |
if (gimple_code (stmt) == GIMPLE_PHI)
return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
935 | |
936 | FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF) |
937 | if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop)) |
938 | return true; |
939 | |
940 | return false; |
941 | } |
942 | |
943 | /* Return a copy of LOOP placed before LOOP. */ |
944 | |
945 | static class loop * |
946 | copy_loop_before (class loop *loop, bool redirect_lc_phi_defs) |
947 | { |
948 | class loop *res; |
edge preheader = loop_preheader_edge (loop);
950 | |
951 | initialize_original_copy_tables (); |
952 | res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, single_exit (loop), NULL, |
953 | NULL, preheader, NULL, false); |
954 | gcc_assert (res != NULL); |
955 | |
/* When a partition other than the last one is supposed to keep the LC
PHIs computed, adjust their definitions. */
958 | if (redirect_lc_phi_defs) |
959 | { |
960 | edge exit = single_exit (loop); |
for (gphi_iterator si = gsi_start_phis (exit->dest); !gsi_end_p (si);
gsi_next (&si))
{
gphi *phi = si.phi ();
if (virtual_operand_p (gimple_phi_result (phi)))
966 | continue; |
967 | use_operand_p use_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, exit); |
968 | tree new_def = get_current_def (USE_FROM_PTR (use_p)); |
969 | SET_USE (use_p, new_def); |
970 | } |
971 | } |
972 | |
973 | free_original_copy_tables (); |
974 | delete_update_ssa (); |
975 | |
976 | return res; |
977 | } |
978 | |
979 | /* Creates an empty basic block after LOOP. */ |
980 | |
981 | static void |
982 | create_bb_after_loop (class loop *loop) |
983 | { |
984 | edge exit = single_exit (loop); |
985 | |
986 | if (!exit) |
987 | return; |
988 | |
989 | split_edge (exit); |
990 | } |
991 | |
992 | /* Generate code for PARTITION from the code in LOOP. The loop is |
993 | copied when COPY_P is true. All the statements not flagged in the |
994 | PARTITION bitmap are removed from the loop or from its copy. The |
995 | statements are indexed in sequence inside a basic block, and the |
996 | basic blocks of a loop are taken in dom order. */ |
997 | |
998 | static void |
999 | generate_loops_for_partition (class loop *loop, partition *partition, |
1000 | bool copy_p, bool keep_lc_phis_p) |
1001 | { |
1002 | unsigned i; |
1003 | basic_block *bbs; |
1004 | |
1005 | if (copy_p) |
1006 | { |
1007 | int orig_loop_num = loop->orig_loop_num; |
loop = copy_loop_before (loop, keep_lc_phis_p);
1009 | gcc_assert (loop != NULL); |
1010 | loop->orig_loop_num = orig_loop_num; |
1011 | create_preheader (loop, CP_SIMPLE_PREHEADERS); |
1012 | create_bb_after_loop (loop); |
1013 | } |
1014 | else |
1015 | { |
1016 | /* Origin number is set to the new versioned loop's num. */ |
1017 | gcc_assert (loop->orig_loop_num != loop->num); |
1018 | } |
1019 | |
1020 | /* Remove stmts not in the PARTITION bitmap. */ |
1021 | bbs = get_loop_body_in_dom_order (loop); |
1022 | |
1023 | if (MAY_HAVE_DEBUG_BIND_STMTS) |
1024 | for (i = 0; i < loop->num_nodes; i++) |
1025 | { |
1026 | basic_block bb = bbs[i]; |
1027 | |
for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
gsi_next (&bsi))
{
gphi *phi = bsi.phi ();
if (!virtual_operand_p (gimple_phi_result (phi))
&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
reset_debug_uses (phi);
}

for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
gsi_next (&bsi))
{
gimple *stmt = gsi_stmt (bsi);
if (gimple_code (stmt) != GIMPLE_LABEL
&& !is_gimple_debug (stmt)
&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
reset_debug_uses (stmt);
1044 | } |
1045 | } |
1046 | |
1047 | for (i = 0; i < loop->num_nodes; i++) |
1048 | { |
1049 | basic_block bb = bbs[i]; |
1050 | edge inner_exit = NULL; |
1051 | |
1052 | if (loop != bb->loop_father) |
1053 | inner_exit = single_exit (bb->loop_father); |
1054 | |
for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
{
gphi *phi = bsi.phi ();
if (!virtual_operand_p (gimple_phi_result (phi))
&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
remove_phi_node (&bsi, true);
else
gsi_next (&bsi);
1063 | } |
1064 | |
for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
{
gimple *stmt = gsi_stmt (bsi);
if (gimple_code (stmt) != GIMPLE_LABEL
&& !is_gimple_debug (stmt)
&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1071 | { |
1072 | /* In distribution of loop nest, if bb is inner loop's exit_bb, |
1073 | we choose its exit edge/path in order to avoid generating |
1074 | infinite loop. For all other cases, we choose an arbitrary |
1075 | path through the empty CFG part that this unnecessary |
1076 | control stmt controls. */ |
if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
{
if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
gimple_cond_make_true (cond_stmt);
else
gimple_cond_make_false (cond_stmt);
update_stmt (stmt);
}
else if (gimple_code (stmt) == GIMPLE_SWITCH)
{
gswitch *switch_stmt = as_a <gswitch *> (stmt);
gimple_switch_set_index
(switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
update_stmt (stmt);
1091 | } |
1092 | else |
1093 | { |
1094 | unlink_stmt_vdef (stmt); |
1095 | gsi_remove (&bsi, true); |
1096 | release_defs (stmt); |
1097 | continue; |
1098 | } |
1099 | } |
gsi_next (&bsi);
1101 | } |
1102 | } |
1103 | |
free (bbs);
1105 | } |
1106 | |
1107 | /* If VAL memory representation contains the same value in all bytes, |
1108 | return that value, otherwise return -1. |
1109 | E.g. for 0x24242424 return 0x24, for IEEE double |
1110 | 747708026454360457216.0 return 0x44, etc. */ |
1111 | |
1112 | static int |
1113 | const_with_all_bytes_same (tree val) |
1114 | { |
1115 | unsigned char buf[64]; |
1116 | int i, len; |
1117 | |
1118 | if (integer_zerop (val) |
1119 | || (TREE_CODE (val) == CONSTRUCTOR |
1120 | && !TREE_CLOBBER_P (val) |
1121 | && CONSTRUCTOR_NELTS (val) == 0)) |
1122 | return 0; |
1123 | |
1124 | if (real_zerop (val)) |
1125 | { |
1126 | /* Only return 0 for +0.0, not for -0.0, which doesn't have |
1127 | an all bytes same memory representation. Don't transform |
1128 | -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */ |
1129 | switch (TREE_CODE (val)) |
1130 | { |
1131 | case REAL_CST: |
1132 | if (!real_isneg (TREE_REAL_CST_PTR (val))) |
1133 | return 0; |
1134 | break; |
1135 | case COMPLEX_CST: |
1136 | if (!const_with_all_bytes_same (TREE_REALPART (val)) |
1137 | && !const_with_all_bytes_same (TREE_IMAGPART (val))) |
1138 | return 0; |
1139 | break; |
1140 | case VECTOR_CST: |
1141 | { |
unsigned int count = vector_cst_encoded_nelts (val);
1143 | unsigned int j; |
1144 | for (j = 0; j < count; ++j) |
1145 | if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j))) |
1146 | break; |
1147 | if (j == count) |
1148 | return 0; |
1149 | break; |
1150 | } |
1151 | default: |
1152 | break; |
1153 | } |
1154 | } |
1155 | |
1156 | if (CHAR_BIT != 8 || BITS_PER_UNIT != 8) |
1157 | return -1; |
1158 | |
1159 | len = native_encode_expr (val, buf, sizeof (buf)); |
1160 | if (len == 0) |
1161 | return -1; |
1162 | for (i = 1; i < len; i++) |
1163 | if (buf[i] != buf[0]) |
1164 | return -1; |
1165 | return buf[0]; |
1166 | } |
1167 | |
1168 | /* Generate a call to memset for PARTITION in LOOP. */ |
1169 | |
1170 | static void |
1171 | generate_memset_builtin (class loop *loop, partition *partition) |
1172 | { |
1173 | gimple_stmt_iterator gsi; |
1174 | tree mem, fn, nb_bytes; |
1175 | tree val; |
1176 | struct builtin_info *builtin = partition->builtin; |
1177 | gimple *fn_call; |
1178 | |
1179 | /* The new statements will be placed before LOOP. */ |
gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1181 | |
1182 | nb_bytes = rewrite_to_non_trapping_overflow (builtin->size); |
1183 | nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE, |
1184 | false, GSI_CONTINUE_LINKING); |
1185 | mem = rewrite_to_non_trapping_overflow (builtin->dst_base); |
1186 | mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE, |
1187 | false, GSI_CONTINUE_LINKING); |
1188 | |
1189 | /* This exactly matches the pattern recognition in classify_partition. */ |
1190 | val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr)); |
1191 | /* Handle constants like 0x15151515 and similarly |
1192 | floating point constants etc. where all bytes are the same. */ |
1193 | int bytev = const_with_all_bytes_same (val); |
1194 | if (bytev != -1) |
1195 | val = build_int_cst (integer_type_node, bytev); |
1196 | else if (TREE_CODE (val) == INTEGER_CST) |
1197 | val = fold_convert (integer_type_node, val); |
1198 | else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val))) |
1199 | { |
1200 | tree tem = make_ssa_name (integer_type_node); |
1201 | gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val); |
1202 | gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING); |
1203 | val = tem; |
1204 | } |
1205 | |
1206 | fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET)); |
1207 | fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes); |
gimple_set_location (fn_call, partition->loc);
1209 | gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); |
1210 | fold_stmt (&gsi); |
1211 | |
1212 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1213 | { |
1214 | fprintf (stream: dump_file, format: "generated memset" ); |
1215 | if (bytev == 0) |
1216 | fprintf (stream: dump_file, format: " zero\n" ); |
1217 | else |
1218 | fprintf (stream: dump_file, format: "\n" ); |
1219 | } |
1220 | } |
1221 | |
1222 | /* Generate a call to memcpy for PARTITION in LOOP. */ |
1223 | |
1224 | static void |
1225 | generate_memcpy_builtin (class loop *loop, partition *partition) |
1226 | { |
1227 | gimple_stmt_iterator gsi; |
1228 | gimple *fn_call; |
1229 | tree dest, src, fn, nb_bytes; |
1230 | enum built_in_function kind; |
1231 | struct builtin_info *builtin = partition->builtin; |
1232 | |
1233 | /* The new statements will be placed before LOOP. */ |
gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1235 | |
1236 | nb_bytes = rewrite_to_non_trapping_overflow (builtin->size); |
1237 | nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE, |
1238 | false, GSI_CONTINUE_LINKING); |
1239 | dest = rewrite_to_non_trapping_overflow (builtin->dst_base); |
1240 | src = rewrite_to_non_trapping_overflow (builtin->src_base); |
1241 | if (partition->kind == PKIND_MEMCPY |
1242 | || ! ptr_derefs_may_alias_p (dest, src)) |
1243 | kind = BUILT_IN_MEMCPY; |
1244 | else |
1245 | kind = BUILT_IN_MEMMOVE; |
1246 | /* Try harder if we're copying a constant size. */ |
if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes))
1248 | { |
1249 | aff_tree asrc, adest; |
1250 | tree_to_aff_combination (src, ptr_type_node, &asrc); |
1251 | tree_to_aff_combination (dest, ptr_type_node, &adest); |
1252 | aff_combination_scale (&adest, -1); |
1253 | aff_combination_add (&asrc, &adest); |
if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes),
wi::to_poly_widest (nb_bytes)))
1256 | kind = BUILT_IN_MEMCPY; |
1257 | } |
1258 | |
1259 | dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE, |
1260 | false, GSI_CONTINUE_LINKING); |
1261 | src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE, |
1262 | false, GSI_CONTINUE_LINKING); |
1263 | fn = build_fold_addr_expr (builtin_decl_implicit (kind)); |
1264 | fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes); |
gimple_set_location (fn_call, partition->loc);
1266 | gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); |
1267 | fold_stmt (&gsi); |
1268 | |
1269 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1270 | { |
if (kind == BUILT_IN_MEMCPY)
fprintf (dump_file, "generated memcpy\n");
else
fprintf (dump_file, "generated memmove\n");
1275 | } |
1276 | } |
1277 | |
1278 | /* Remove and destroy the loop LOOP. */ |
1279 | |
1280 | static void |
1281 | destroy_loop (class loop *loop) |
1282 | { |
1283 | unsigned nbbs = loop->num_nodes; |
1284 | edge exit = single_exit (loop); |
1285 | basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest; |
1286 | basic_block *bbs; |
1287 | unsigned i; |
1288 | |
1289 | bbs = get_loop_body_in_dom_order (loop); |
1290 | |
gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
bool safe_p = single_pred_p (exit->dest);
1293 | for (unsigned i = 0; i < nbbs; ++i) |
1294 | { |
1295 | /* We have made sure to not leave any dangling uses of SSA |
1296 | names defined in the loop. With the exception of virtuals. |
1297 | Make sure we replace all uses of virtual defs that will remain |
1298 | outside of the loop with the bare symbol as delete_basic_block |
1299 | will release them. */ |
for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
gsi_next (&gsi))
{
gphi *phi = gsi.phi ();
if (virtual_operand_p (gimple_phi_result (phi)))
mark_virtual_phi_result_for_renaming (phi);
}
for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
{
gimple *stmt = gsi_stmt (gsi);
tree vdef = gimple_vdef (stmt);
1311 | if (vdef && TREE_CODE (vdef) == SSA_NAME) |
1312 | mark_virtual_operand_for_renaming (vdef); |
1313 | /* Also move and eventually reset debug stmts. We can leave |
1314 | constant values in place in case the stmt dominates the exit. |
1315 | ??? Non-constant values from the last iteration can be |
1316 | replaced with final values if we can compute them. */ |
if (gimple_debug_bind_p (stmt))
{
tree val = gimple_debug_bind_get_value (stmt);
gsi_move_before (&gsi, &dst_gsi);
if (val
&& (!safe_p
|| !is_gimple_min_invariant (val)
|| !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
{
gimple_debug_bind_reset_value (stmt);
update_stmt (stmt);
}
}
else
gsi_next (&gsi);
1332 | } |
1333 | } |
1334 | |
1335 | redirect_edge_pred (exit, src); |
1336 | exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE); |
1337 | exit->flags |= EDGE_FALLTHRU; |
1338 | cancel_loop_tree (loop); |
1339 | rescan_loop_exit (exit, false, true); |
1340 | |
1341 | i = nbbs; |
1342 | do |
1343 | { |
1344 | --i; |
1345 | delete_basic_block (bbs[i]); |
1346 | } |
1347 | while (i != 0); |
1348 | |
free (bbs);
1350 | |
1351 | set_immediate_dominator (CDI_DOMINATORS, dest, |
1352 | recompute_dominator (CDI_DOMINATORS, dest)); |
1353 | } |
1354 | |
1355 | /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */ |
1356 | |
1357 | static bool |
1358 | generate_code_for_partition (class loop *loop, |
1359 | partition *partition, bool copy_p, |
1360 | bool keep_lc_phis_p) |
1361 | { |
1362 | switch (partition->kind) |
1363 | { |
1364 | case PKIND_NORMAL: |
1365 | case PKIND_PARTIAL_MEMSET: |
1366 | /* Reductions all have to be in the last partition. */ |
1367 | gcc_assert (!partition_reduction_p (partition) |
1368 | || !copy_p); |
1369 | generate_loops_for_partition (loop, partition, copy_p, |
1370 | keep_lc_phis_p); |
1371 | return false; |
1372 | |
1373 | case PKIND_MEMSET: |
1374 | generate_memset_builtin (loop, partition); |
1375 | break; |
1376 | |
1377 | case PKIND_MEMCPY: |
1378 | case PKIND_MEMMOVE: |
1379 | generate_memcpy_builtin (loop, partition); |
1380 | break; |
1381 | |
1382 | default: |
1383 | gcc_unreachable (); |
1384 | } |
1385 | |
1386 | /* Common tail for partitions we turn into a call. If this was the last |
1387 | partition for which we generate code, we have to destroy the loop. */ |
1388 | if (!copy_p) |
1389 | return true; |
1390 | return false; |
1391 | } |
1392 | |
1393 | data_dependence_relation * |
1394 | loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a, |
1395 | data_reference_p b) |
1396 | { |
1397 | struct data_dependence_relation ent, **slot; |
1398 | struct data_dependence_relation *ddr; |
1399 | |
1400 | gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b)); |
1401 | gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a)) |
1402 | <= rdg_vertex_for_stmt (rdg, DR_STMT (b))); |
1403 | ent.a = a; |
1404 | ent.b = b; |
slot = ddrs_table->find_slot (&ent, INSERT);
1406 | if (*slot == NULL) |
1407 | { |
1408 | ddr = initialize_data_dependence_relation (a, b, loop_nest); |
1409 | compute_affine_dependence (ddr, loop_nest[0]); |
1410 | *slot = ddr; |
1411 | } |
1412 | |
1413 | return *slot; |
1414 | } |
1415 | |
1416 | bool |
1417 | loop_distribution::data_dep_in_cycle_p (struct graph *rdg, |
1418 | data_reference_p dr1, |
1419 | data_reference_p dr2) |
1420 | { |
1421 | struct data_dependence_relation *ddr; |
1422 | |
1423 | /* Re-shuffle data-refs to be in topological order. */ |
1424 | if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1)) |
1425 | > rdg_vertex_for_stmt (rdg, DR_STMT (dr2))) |
std::swap (dr1, dr2);

ddr = get_data_dependence (rdg, dr1, dr2);
1429 | |
1430 | /* In case of no data dependence. */ |
1431 | if (DDR_ARE_DEPENDENT (ddr) == chrec_known) |
1432 | return false; |
/* For unknown data dependence or known data dependence which can't be
expressed in a classic distance vector, we check if it can be resolved
by a runtime alias check. If so, we consider the dependence as one
that won't introduce a data dependence cycle. */
1437 | else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know |
1438 | || DDR_NUM_DIST_VECTS (ddr) == 0) |
1439 | return !runtime_alias_check_p (ddr, NULL, true); |
1440 | else if (DDR_NUM_DIST_VECTS (ddr) > 1) |
1441 | return true; |
1442 | else if (DDR_REVERSED_P (ddr) |
|| lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1444 | return false; |
1445 | |
1446 | return true; |
1447 | } |
1448 | |
1449 | void |
1450 | loop_distribution::update_type_for_merge (struct graph *rdg, |
1451 | partition *partition1, |
1452 | partition *partition2) |
1453 | { |
1454 | unsigned i, j; |
1455 | bitmap_iterator bi, bj; |
1456 | data_reference_p dr1, dr2; |
1457 | |
1458 | EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi) |
1459 | { |
1460 | unsigned start = (partition1 == partition2) ? i + 1 : 0; |
1461 | |
1462 | dr1 = datarefs_vec[i]; |
1463 | EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj) |
1464 | { |
1465 | dr2 = datarefs_vec[j]; |
1466 | if (DR_IS_READ (dr1) && DR_IS_READ (dr2)) |
1467 | continue; |
1468 | |
1469 | /* Partition can only be executed sequentially if there is any |
1470 | data dependence cycle. */ |
1471 | if (data_dep_in_cycle_p (rdg, dr1, dr2)) |
1472 | { |
1473 | partition1->type = PTYPE_SEQUENTIAL; |
1474 | return; |
1475 | } |
1476 | } |
1477 | } |
1478 | } |
1479 | |
1480 | partition * |
1481 | loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v) |
1482 | { |
1483 | partition *partition = partition_alloc (); |
1484 | auto_vec<int, 3> nodes; |
1485 | unsigned i, j; |
1486 | int x; |
1487 | data_reference_p dr; |
1488 | |
1489 | graphds_dfs (rdg, &v, 1, &nodes, false, NULL); |
1490 | |
1491 | FOR_EACH_VEC_ELT (nodes, i, x) |
1492 | { |
1493 | bitmap_set_bit (partition->stmts, x); |
1494 | |
for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1496 | { |
1497 | unsigned idx = (unsigned) DR_INDEX (dr); |
1498 | gcc_assert (idx < datarefs_vec.length ()); |
1499 | |
1500 | /* Partition can only be executed sequentially if there is any |
1501 | unknown data reference. */ |
1502 | if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) |
1503 | || !DR_INIT (dr) || !DR_STEP (dr)) |
1504 | partition->type = PTYPE_SEQUENTIAL; |
1505 | |
1506 | bitmap_set_bit (partition->datarefs, idx); |
1507 | } |
1508 | } |
1509 | |
1510 | if (partition->type == PTYPE_SEQUENTIAL) |
1511 | return partition; |
1512 | |
/* Further check if any data dependence prevents us from executing the
partition in parallel. */
update_type_for_merge (rdg, partition, partition);
1516 | |
1517 | return partition; |
1518 | } |
1519 | |
/* Given PARTITION of LOOP and RDG, record single load/store data
references for builtin partition in SRC_DR/DST_DR, return false if
there are no such data references. */
1523 | |
1524 | static bool |
1525 | find_single_drs (class loop *loop, struct graph *rdg, const bitmap &partition_stmts, |
1526 | data_reference_p *dst_dr, data_reference_p *src_dr) |
1527 | { |
1528 | unsigned i; |
1529 | data_reference_p single_ld = NULL, single_st = NULL; |
1530 | bitmap_iterator bi; |
1531 | |
1532 | EXECUTE_IF_SET_IN_BITMAP (partition_stmts, 0, i, bi) |
1533 | { |
1534 | gimple *stmt = RDG_STMT (rdg, i); |
1535 | data_reference_p dr; |
1536 | |
if (gimple_code (stmt) == GIMPLE_PHI)
continue;

/* Any scalar stmts are ok. */
if (!gimple_vuse (stmt))
continue;

/* Otherwise just regular loads/stores. */
if (!gimple_assign_single_p (stmt))
return false;

/* But exactly one store and/or load. */
for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1550 | { |
1551 | tree type = TREE_TYPE (DR_REF (dr)); |
1552 | |
1553 | /* The memset, memcpy and memmove library calls are only |
1554 | able to deal with generic address space. */ |
1555 | if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type))) |
1556 | return false; |
1557 | |
1558 | if (DR_IS_READ (dr)) |
1559 | { |
1560 | if (single_ld != NULL) |
1561 | return false; |
1562 | single_ld = dr; |
1563 | } |
1564 | else |
1565 | { |
1566 | if (single_st != NULL) |
1567 | return false; |
1568 | single_st = dr; |
1569 | } |
1570 | } |
1571 | } |
1572 | |
1573 | if (!single_ld && !single_st) |
1574 | return false; |
1575 | |
1576 | basic_block bb_ld = NULL; |
1577 | basic_block bb_st = NULL; |
1578 | edge exit = single_exit (loop); |
1579 | |
1580 | if (single_ld) |
1581 | { |
1582 | /* Bail out if this is a bitfield memory reference. */ |
1583 | if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF |
1584 | && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1))) |
1585 | return false; |
1586 | |
1587 | /* Data reference must be executed exactly once per iteration of each |
1588 | loop in the loop nest. We only need to check dominance information |
1589 | against the outermost one in a perfect loop nest because a bb can't |
1590 | dominate outermost loop's latch without dominating inner loop's. */ |
1591 | bb_ld = gimple_bb (DR_STMT (single_ld)); |
1592 | if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld)) |
1593 | return false; |
1594 | |
1595 | /* The data reference must also be executed before possibly exiting |
1596 | the loop as otherwise we'd for example unconditionally execute |
1597 | memset (ptr, 0, n) which even with n == 0 implies ptr is non-NULL. */ |
1598 | if (bb_ld != loop->header |
1599 | && (!exit |
1600 | || !dominated_by_p (CDI_DOMINATORS, exit->src, bb_ld))) |
1601 | return false; |
1602 | } |
1603 | |
1604 | if (single_st) |
1605 | { |
1606 | /* Bail out if this is a bitfield memory reference. */ |
1607 | if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF |
1608 | && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1))) |
1609 | return false; |
1610 | |
1611 | /* Data reference must be executed exactly once per iteration. |
1612 | Same as single_ld, we only need to check against the outermost |
1613 | loop. */ |
1614 | bb_st = gimple_bb (DR_STMT (single_st)); |
1615 | if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st)) |
1616 | return false; |
1617 | |
1618 | /* And before exiting the loop. */ |
1619 | if (bb_st != loop->header |
1620 | && (!exit |
1621 | || !dominated_by_p (CDI_DOMINATORS, exit->src, bb_st))) |
1622 | return false; |
1623 | } |
1624 | |
1625 | if (single_ld && single_st) |
1626 | { |
1627 | /* Load and store must be in the same loop nest. */ |
1628 | if (bb_st->loop_father != bb_ld->loop_father) |
1629 | return false; |
1630 | |
1631 | edge e = single_exit (bb_st->loop_father); |
1632 | bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld); |
1633 | bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st); |
1634 | if (dom_ld != dom_st) |
1635 | return false; |
1636 | } |
1637 | |
1638 | *src_dr = single_ld; |
1639 | *dst_dr = single_st; |
1640 | return true; |
1641 | } |
1642 | |
/* Given data reference DR in LOOP_NEST, this function checks the enclosing
   loops from inner to outer to see whether the loop's step equals the access
   size at each loop level.  Return 2 if we can prove this at all loop
   levels; record access base and size in BASE and SIZE; save the loop's step
   at each loop level in STEPS if it is not null.  For example:
1648 | |
1649 | int arr[100][100][100]; |
1650 | for (i = 0; i < 100; i++) ;steps[2] = 40000 |
1651 | for (j = 100; j > 0; j--) ;steps[1] = -400 |
1652 | for (k = 0; k < 100; k++) ;steps[0] = 4 |
1653 | arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000 |
1654 | |
   Return 1 if we can prove the equality at the innermost loop, but not at
   all loop levels.  In this case, no information is recorded.

   Return 0 if the equality cannot be proven at any loop level.  */
1659 | |
1660 | static int |
1661 | compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, |
1662 | tree *size, vec<tree> *steps = NULL) |
1663 | { |
1664 | location_t loc = gimple_location (DR_STMT (dr)); |
1665 | basic_block bb = gimple_bb (DR_STMT (dr)); |
1666 | class loop *loop = bb->loop_father; |
1667 | tree ref = DR_REF (dr); |
1668 | tree access_base = build_fold_addr_expr (ref); |
1669 | tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref)); |
1670 | int res = 0; |
1671 | |
1672 | do { |
1673 | tree scev_fn = analyze_scalar_evolution (loop, access_base); |
1674 | if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC) |
1675 | return res; |
1676 | |
1677 | access_base = CHREC_LEFT (scev_fn); |
1678 | if (tree_contains_chrecs (access_base, NULL)) |
1679 | return res; |
1680 | |
1681 | tree scev_step = CHREC_RIGHT (scev_fn); |
1682 | /* Only support constant steps. */ |
1683 | if (TREE_CODE (scev_step) != INTEGER_CST) |
1684 | return res; |
1685 | |
1686 | enum ev_direction access_dir = scev_direction (scev_fn); |
1687 | if (access_dir == EV_DIR_UNKNOWN) |
1688 | return res; |
1689 | |
1690 | if (steps != NULL) |
      steps->safe_push (scev_step);
1692 | |
1693 | scev_step = fold_convert_loc (loc, sizetype, scev_step); |
1694 | /* Compute absolute value of scev step. */ |
1695 | if (access_dir == EV_DIR_DECREASES) |
1696 | scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step); |
1697 | |
    /* At each level of loop, the scev step must equal the access size.
       In other words, DR must access consecutive memory between loop
       iterations.  */
    if (!operand_equal_p (scev_step, access_size, 0))
1701 | return res; |
1702 | |
1703 | /* Access stride can be computed for data reference at least for the |
1704 | innermost loop. */ |
1705 | res = 1; |
1706 | |
1707 | /* Compute DR's execution times in loop. */ |
1708 | tree niters = number_of_latch_executions (loop); |
1709 | niters = fold_convert_loc (loc, sizetype, niters); |
1710 | if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb)) |
1711 | niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node); |
1712 | |
1713 | /* Compute DR's overall access size in loop. */ |
1714 | access_size = fold_build2_loc (loc, MULT_EXPR, sizetype, |
1715 | niters, scev_step); |
1716 | /* Adjust base address in case of negative step. */ |
1717 | if (access_dir == EV_DIR_DECREASES) |
1718 | { |
1719 | tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype, |
1720 | scev_step, access_size); |
	access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1722 | } |
1723 | } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL); |
1724 | |
1725 | *base = access_base; |
1726 | *size = access_size; |
  /* Access stride can be computed for the data reference at each loop
     level.  */
1728 | return 2; |
1729 | } |
1730 | |
1731 | /* Allocate and return builtin struct. Record information like DST_DR, |
1732 | SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */ |
1733 | |
1734 | static struct builtin_info * |
1735 | alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr, |
1736 | tree dst_base, tree src_base, tree size) |
1737 | { |
1738 | struct builtin_info *builtin = XNEW (struct builtin_info); |
1739 | builtin->dst_dr = dst_dr; |
1740 | builtin->src_dr = src_dr; |
1741 | builtin->dst_base = dst_base; |
1742 | builtin->src_base = src_base; |
1743 | builtin->size = size; |
1744 | return builtin; |
1745 | } |
1746 | |
/* Given data reference DR in loop nest LOOP, classify whether it forms a
   builtin memset call.  */
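
/* For instance (an illustrative sketch), with 1-byte elements the stores in

     |for (i = 0; i < n; i++)
     |  a[i] = 0;

   write the same byte value to consecutive memory, so the partition can be
   classified as PKIND_MEMSET.  */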
1749 | |
1750 | static void |
1751 | classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr) |
1752 | { |
1753 | gimple *stmt = DR_STMT (dr); |
  tree base, size, rhs = gimple_assign_rhs1 (stmt);
1755 | |
  if (const_with_all_bytes_same (rhs) == -1
1757 | && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs)) |
1758 | || (TYPE_MODE (TREE_TYPE (rhs)) |
1759 | != TYPE_MODE (unsigned_char_type_node)))) |
1760 | return; |
1761 | |
1762 | if (TREE_CODE (rhs) == SSA_NAME |
1763 | && !SSA_NAME_IS_DEFAULT_DEF (rhs) |
1764 | && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs)))) |
1765 | return; |
1766 | |
  int res = compute_access_range (loop, dr, &base, &size);
1768 | if (res == 0) |
1769 | return; |
1770 | if (res == 1) |
1771 | { |
1772 | partition->kind = PKIND_PARTIAL_MEMSET; |
1773 | return; |
1774 | } |
1775 | |
1776 | tree base_offset; |
1777 | tree base_base; |
1778 | split_constant_offset (base, &base_base, &base_offset); |
1779 | if (!cst_and_fits_in_hwi (base_offset)) |
1780 | return; |
1781 | unsigned HOST_WIDE_INT const_base_offset = int_cst_value (base_offset); |
1782 | |
1783 | struct builtin_info *builtin; |
  builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1785 | builtin->dst_base_base = base_base; |
1786 | builtin->dst_base_offset = const_base_offset; |
1787 | partition->builtin = builtin; |
1788 | partition->kind = PKIND_MEMSET; |
1789 | } |
1790 | |
/* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
   whether they form a builtin memcpy or memmove call.  */
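
/* For instance (an illustrative sketch),

     |for (i = 0; i < n; i++)
     |  a[i] = a[i + 1];

   reads each element before it is overwritten and can be classified as
   PKIND_MEMMOVE, while

     |for (i = 0; i < n; i++)
     |  a[i + 1] = a[i];

   has a load that depends on the store of a previous iteration, so it is
   not classified as a builtin at all.  */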
1793 | |
1794 | void |
1795 | loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg, |
1796 | partition *partition, |
1797 | data_reference_p dst_dr, |
1798 | data_reference_p src_dr) |
1799 | { |
1800 | tree base, size, src_base, src_size; |
1801 | auto_vec<tree> dst_steps, src_steps; |
1802 | |
1803 | /* Compute access range of both load and store. */ |
  int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
  if (res != 2)
    return;
  res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1808 | if (res != 2) |
1809 | return; |
1810 | |
1811 | /* They must have the same access size. */ |
  if (!operand_equal_p (size, src_size, 0))
1813 | return; |
1814 | |
1815 | /* They must have the same storage order. */ |
1816 | if (reverse_storage_order_for_component_p (DR_REF (dst_dr)) |
1817 | != reverse_storage_order_for_component_p (DR_REF (src_dr))) |
1818 | return; |
1819 | |
  /* Load and store in the loop nest must access memory in the same way,
     i.e., they must have the same step in each loop of the nest.  */
1822 | if (dst_steps.length () != src_steps.length ()) |
1823 | return; |
1824 | for (unsigned i = 0; i < dst_steps.length (); ++i) |
    if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1826 | return; |
1827 | |
  /* Now check whether there is a dependence.  */
  ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1830 | |
1831 | /* Classify as memmove if no dependence between load and store. */ |
1832 | if (DDR_ARE_DEPENDENT (ddr) == chrec_known) |
1833 | { |
      partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1835 | partition->kind = PKIND_MEMMOVE; |
1836 | return; |
1837 | } |
1838 | |
1839 | /* Can't do memmove in case of unknown dependence or dependence without |
1840 | classical distance vector. */ |
1841 | if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know |
1842 | || DDR_NUM_DIST_VECTS (ddr) == 0) |
1843 | return; |
1844 | |
1845 | unsigned i; |
1846 | lambda_vector dist_v; |
1847 | int num_lev = (DDR_LOOP_NEST (ddr)).length (); |
1848 | FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) |
1849 | { |
      unsigned dep_lev = dependence_level (dist_v, num_lev);
1851 | /* Can't do memmove if load depends on store. */ |
1852 | if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr)) |
1853 | return; |
1854 | } |
1855 | |
  partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1857 | partition->kind = PKIND_MEMMOVE; |
1858 | return; |
1859 | } |
1860 | |
/* Classify the builtin kind for PARTITION of RDG in LOOP, using the bitmap
   STMT_IN_ALL_PARTITIONS of statements contained in all partitions.  For
   the moment we detect memset, memcpy and memmove patterns.  Return true
   if PARTITION contains a reduction statement that belongs to all
   partitions.  */

bool
1862 | loop_distribution::classify_partition (loop_p loop, |
1863 | struct graph *rdg, partition *partition, |
1864 | bitmap stmt_in_all_partitions) |
1865 | { |
1866 | bitmap_iterator bi; |
1867 | unsigned i; |
1868 | data_reference_p single_ld = NULL, single_st = NULL; |
1869 | bool volatiles_p = false, has_reduction = false; |
1870 | |
1871 | EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi) |
1872 | { |
1873 | gimple *stmt = RDG_STMT (rdg, i); |
1874 | |
1875 | if (gimple_has_volatile_ops (stmt)) |
1876 | volatiles_p = true; |
1877 | |
      /* If the stmt is not included in all partitions and has uses
	 outside of the loop, then mark the partition as a reduction.  */
1880 | if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) |
1881 | { |
	  /* Due to a limitation in the transform phase we have to fuse all
	     reduction partitions.  As a result, this could cancel valid
	     loop distribution, especially for a loop whose induction
	     variable is used outside of the loop.  To work around this
	     issue, we skip marking the partition as a reduction if the
	     reduction stmt belongs to all partitions.  In that case the
	     reduction will be computed correctly no matter how partitions
	     are fused/distributed.  */
1889 | if (!bitmap_bit_p (stmt_in_all_partitions, i)) |
1890 | partition->reduction_p = true; |
1891 | else |
1892 | has_reduction = true; |
1893 | } |
1894 | } |
1895 | |
  /* Simple workaround to prevent classifying the partition as builtin
     if it contains any use outside of the loop.  For the case where all
     partitions have the reduction, this simple workaround is delayed
     to only affect the last partition.  */
1900 | if (partition->reduction_p) |
1901 | return has_reduction; |
1902 | |
1903 | /* Perform general partition disqualification for builtins. */ |
1904 | if (volatiles_p |
1905 | || !flag_tree_loop_distribute_patterns) |
1906 | return has_reduction; |
1907 | |
1908 | /* Find single load/store data references for builtin partition. */ |
  if (!find_single_drs (loop, rdg, partition->stmts, &single_st, &single_ld)
1910 | || !single_st) |
1911 | return has_reduction; |
1912 | |
1913 | if (single_ld && single_st) |
1914 | { |
1915 | gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld); |
1916 | /* Direct aggregate copy or via an SSA name temporary. */ |
      if (load != store
	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1919 | return has_reduction; |
1920 | } |
1921 | |
1922 | partition->loc = gimple_location (DR_STMT (single_st)); |
1923 | |
1924 | /* Classify the builtin kind. */ |
  if (single_ld == NULL)
    classify_builtin_st (loop, partition, single_st);
  else
    classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1929 | return has_reduction; |
1930 | } |
1931 | |
/* Returns true when PARTITION1 and PARTITION2 have similar memory
   accesses in RDG.  */

bool
1933 | loop_distribution::share_memory_accesses (struct graph *rdg, |
1934 | partition *partition1, partition *partition2) |
1935 | { |
1936 | unsigned i, j; |
1937 | bitmap_iterator bi, bj; |
1938 | data_reference_p dr1, dr2; |
1939 | |
1940 | /* First check whether in the intersection of the two partitions are |
1941 | any loads or stores. Common loads are the situation that happens |
1942 | most often. */ |
1943 | EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi) |
1944 | if (RDG_MEM_WRITE_STMT (rdg, i) |
1945 | || RDG_MEM_READS_STMT (rdg, i)) |
1946 | return true; |
1947 | |
1948 | /* Then check whether the two partitions access the same memory object. */ |
1949 | EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi) |
1950 | { |
1951 | dr1 = datarefs_vec[i]; |
1952 | |
1953 | if (!DR_BASE_ADDRESS (dr1) |
1954 | || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1)) |
1955 | continue; |
1956 | |
1957 | EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj) |
1958 | { |
1959 | dr2 = datarefs_vec[j]; |
1960 | |
1961 | if (!DR_BASE_ADDRESS (dr2) |
1962 | || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2)) |
1963 | continue; |
1964 | |
	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1969 | return true; |
1970 | } |
1971 | } |
1972 | |
1973 | return false; |
1974 | } |
1975 | |
/* For each seed statement in STARTING_STMTS, this function builds a
   partition for it by adding dependent statements according to RDG.
   All partitions are recorded in PARTITIONS.  */
1979 | |
1980 | void |
1981 | loop_distribution::rdg_build_partitions (struct graph *rdg, |
1982 | vec<gimple *> starting_stmts, |
1983 | vec<partition *> *partitions) |
1984 | { |
1985 | auto_bitmap processed; |
1986 | int i; |
1987 | gimple *stmt; |
1988 | |
1989 | FOR_EACH_VEC_ELT (starting_stmts, i, stmt) |
1990 | { |
1991 | int v = rdg_vertex_for_stmt (rdg, stmt); |
1992 | |
1993 | if (dump_file && (dump_flags & TDF_DETAILS)) |
	fprintf (dump_file,
		 "ldist asked to generate code for vertex %d\n", v);
1996 | |
1997 | /* If the vertex is already contained in another partition so |
1998 | is the partition rooted at it. */ |
1999 | if (bitmap_bit_p (processed, v)) |
2000 | continue; |
2001 | |
2002 | partition *partition = build_rdg_partition_for_vertex (rdg, v); |
2003 | bitmap_ior_into (processed, partition->stmts); |
2004 | |
2005 | if (dump_file && (dump_flags & TDF_DETAILS)) |
2006 | { |
	  fprintf (dump_file, "ldist creates useful %s partition:\n",
		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
2010 | } |
2011 | |
      partitions->safe_push (partition);
2013 | } |
2014 | |
2015 | /* All vertices should have been assigned to at least one partition now, |
2016 | other than vertices belonging to dead code. */ |
2017 | } |
2018 | |
2019 | /* Dump to FILE the PARTITIONS. */ |
2020 | |
2021 | static void |
2022 | dump_rdg_partitions (FILE *file, const vec<partition *> &partitions) |
2023 | { |
2024 | int i; |
2025 | partition *partition; |
2026 | |
2027 | FOR_EACH_VEC_ELT (partitions, i, partition) |
2028 | debug_bitmap_file (file, partition->stmts); |
2029 | } |
2030 | |
2031 | /* Debug PARTITIONS. */ |
2032 | extern void debug_rdg_partitions (const vec<partition *> &); |
2033 | |
2034 | DEBUG_FUNCTION void |
2035 | debug_rdg_partitions (const vec<partition *> &partitions) |
2036 | { |
2037 | dump_rdg_partitions (stderr, partitions); |
2038 | } |
2039 | |
2040 | /* Returns the number of read and write operations in the RDG. */ |
2041 | |
2042 | static int |
2043 | number_of_rw_in_rdg (struct graph *rdg) |
2044 | { |
2045 | int i, res = 0; |
2046 | |
2047 | for (i = 0; i < rdg->n_vertices; i++) |
2048 | { |
2049 | if (RDG_MEM_WRITE_STMT (rdg, i)) |
2050 | ++res; |
2051 | |
2052 | if (RDG_MEM_READS_STMT (rdg, i)) |
2053 | ++res; |
2054 | } |
2055 | |
2056 | return res; |
2057 | } |
2058 | |
2059 | /* Returns the number of read and write operations in a PARTITION of |
2060 | the RDG. */ |
2061 | |
2062 | static int |
2063 | number_of_rw_in_partition (struct graph *rdg, partition *partition) |
2064 | { |
2065 | int res = 0; |
2066 | unsigned i; |
2067 | bitmap_iterator ii; |
2068 | |
2069 | EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii) |
2070 | { |
2071 | if (RDG_MEM_WRITE_STMT (rdg, i)) |
2072 | ++res; |
2073 | |
2074 | if (RDG_MEM_READS_STMT (rdg, i)) |
2075 | ++res; |
2076 | } |
2077 | |
2078 | return res; |
2079 | } |
2080 | |
2081 | /* Returns true when one of the PARTITIONS contains all the read or |
2082 | write operations of RDG. */ |
2083 | |
2084 | static bool |
2085 | partition_contains_all_rw (struct graph *rdg, |
2086 | const vec<partition *> &partitions) |
2087 | { |
2088 | int i; |
2089 | partition *partition; |
2090 | int nrw = number_of_rw_in_rdg (rdg); |
2091 | |
2092 | FOR_EACH_VEC_ELT (partitions, i, partition) |
2093 | if (nrw == number_of_rw_in_partition (rdg, partition)) |
2094 | return true; |
2095 | |
2096 | return false; |
2097 | } |
2098 | |
/* Compute partition dependence created by the data references in DRS1
   and DRS2, modify and return DIR according to that.  If an additional
   dependence is introduced by possible alias between data references,
   record such data dependence relations in ALIAS_DDRS.  */

int
2100 | loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir, |
2101 | bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs) |
2102 | { |
2103 | unsigned i, j; |
2104 | bitmap_iterator bi, bj; |
2105 | data_reference_p dr1, dr2, saved_dr1; |
2106 | |
2107 | /* dependence direction - 0 is no dependence, -1 is back, |
2108 | 1 is forth, 2 is both (we can stop then, merging will occur). */ |
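  /* For instance (an illustrative sketch): if DRS1 contains a store to
     a[i] and DRS2 contains a load of a[i - 1], the dependence orders the
     two partitions one way; if DRS2 also stored to memory read by DRS1,
     both directions would be present and the result would be 2.  */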
2109 | EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi) |
2110 | { |
2111 | dr1 = datarefs_vec[i]; |
2112 | |
2113 | EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj) |
2114 | { |
2115 | int res, this_dir = 1; |
2116 | ddr_p ddr; |
2117 | |
2118 | dr2 = datarefs_vec[j]; |
2119 | |
2120 | /* Skip all <read, read> data dependence. */ |
2121 | if (DR_IS_READ (dr1) && DR_IS_READ (dr2)) |
2122 | continue; |
2123 | |
2124 | saved_dr1 = dr1; |
2125 | /* Re-shuffle data-refs to be in topological order. */ |
2126 | if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1)) |
2127 | > rdg_vertex_for_stmt (rdg, DR_STMT (dr2))) |
2128 | { |
	      std::swap (dr1, dr2);
	      this_dir = -this_dir;
	    }
	  ddr = get_data_dependence (rdg, dr1, dr2);
2133 | if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) |
2134 | { |
2135 | this_dir = 0; |
2136 | res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1), |
2137 | DR_BASE_ADDRESS (dr2)); |
	  /* Be conservative.  If the data references are not well analyzed,
	     or the two data references have the same base address and
	     offset, add the dependence and consider the references as
	     aliasing each other.  In other words, the dependence cannot
	     be resolved by runtime alias check.  */
2143 | if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2) |
2144 | || !DR_OFFSET (dr1) || !DR_OFFSET (dr2) |
2145 | || !DR_INIT (dr1) || !DR_INIT (dr2) |
2146 | || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1)) |
2147 | || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2)) |
2148 | || res == 0) |
2149 | this_dir = 2; |
2150 | /* Data dependence could be resolved by runtime alias check, |
2151 | record it in ALIAS_DDRS. */ |
2152 | else if (alias_ddrs != NULL) |
	    alias_ddrs->safe_push (ddr);
2154 | /* Or simply ignore it. */ |
2155 | } |
2156 | else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE) |
2157 | { |
2158 | if (DDR_REVERSED_P (ddr)) |
2159 | this_dir = -this_dir; |
2160 | |
	  /* Known dependences can still be unordered throughout the
	     iteration space, see gcc.dg/tree-ssa/ldist-16.c and
	     gcc.dg/tree-ssa/pr94969.c.  */
2164 | if (DDR_NUM_DIST_VECTS (ddr) != 1) |
2165 | this_dir = 2; |
2166 | /* If the overlap is exact preserve stmt order. */ |
2167 | else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), |
2168 | DDR_NB_LOOPS (ddr))) |
2169 | ; |
2170 | /* Else as the distance vector is lexicographic positive swap |
2171 | the dependence direction. */ |
2172 | else |
2173 | this_dir = -this_dir; |
2174 | } |
2175 | else |
2176 | this_dir = 0; |
2177 | if (this_dir == 2) |
2178 | return 2; |
2179 | else if (dir == 0) |
2180 | dir = this_dir; |
2181 | else if (this_dir != 0 && dir != this_dir) |
2182 | return 2; |
2183 | /* Shuffle "back" dr1. */ |
2184 | dr1 = saved_dr1; |
2185 | } |
2186 | } |
2187 | return dir; |
2188 | } |
2189 | |
2190 | /* Compare postorder number of the partition graph vertices V1 and V2. */ |
2191 | |
2192 | static int |
2193 | pgcmp (const void *v1_, const void *v2_) |
2194 | { |
2195 | const vertex *v1 = (const vertex *)v1_; |
2196 | const vertex *v2 = (const vertex *)v2_; |
2197 | return v2->post - v1->post; |
2198 | } |
2199 | |
2200 | /* Data attached to vertices of partition dependence graph. */ |
2201 | struct pg_vdata |
2202 | { |
2203 | /* ID of the corresponding partition. */ |
2204 | int id; |
2205 | /* The partition. */ |
2206 | struct partition *partition; |
2207 | }; |
2208 | |
2209 | /* Data attached to edges of partition dependence graph. */ |
2210 | struct pg_edata |
2211 | { |
2212 | /* If the dependence edge can be resolved by runtime alias check, |
2213 | this vector contains data dependence relations for runtime alias |
2214 | check. On the other hand, if the dependence edge is introduced |
2215 | because of compilation time known data dependence, this vector |
2216 | contains nothing. */ |
2217 | vec<ddr_p> alias_ddrs; |
2218 | }; |
2219 | |
2220 | /* Callback data for traversing edges in graph. */ |
2221 | struct pg_edge_callback_data |
2222 | { |
  /* Bitmap of strongly connected components that should be merged.  */
  bitmap sccs_to_merge;
  /* Array containing component information for all vertices.  */
  int *vertices_component;
2227 | /* Vector to record all data dependence relations which are needed |
2228 | to break strong connected components by runtime alias checks. */ |
2229 | vec<ddr_p> *alias_ddrs; |
2230 | }; |
2231 | |
/* Initialize vertices' data for partition dependence graph PG with
   PARTITIONS.  */
2234 | |
2235 | static void |
2236 | init_partition_graph_vertices (struct graph *pg, |
2237 | vec<struct partition *> *partitions) |
2238 | { |
2239 | int i; |
2240 | partition *partition; |
2241 | struct pg_vdata *data; |
2242 | |
  for (i = 0; partitions->iterate (i, &partition); ++i)
2244 | { |
2245 | data = new pg_vdata; |
2246 | pg->vertices[i].data = data; |
2247 | data->id = i; |
2248 | data->partition = partition; |
2249 | } |
2250 | } |
2251 | |
2252 | /* Add edge <I, J> to partition dependence graph PG. Attach vector of data |
2253 | dependence relations to the EDGE if DDRS isn't NULL. */ |
2254 | |
2255 | static void |
2256 | add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs) |
2257 | { |
2258 | struct graph_edge *e = add_edge (pg, i, j); |
2259 | |
2260 | /* If the edge is attached with data dependence relations, it means this |
2261 | dependence edge can be resolved by runtime alias checks. */ |
2262 | if (ddrs != NULL) |
2263 | { |
2264 | struct pg_edata *data = new pg_edata; |
2265 | |
2266 | gcc_assert (ddrs->length () > 0); |
2267 | e->data = data; |
2268 | data->alias_ddrs = vNULL; |
      data->alias_ddrs.safe_splice (*ddrs);
2270 | } |
2271 | } |
2272 | |
/* Callback function for the graph traversal algorithm.  It returns true
   if edge E should be skipped when traversing the graph.  */
2275 | |
2276 | static bool |
2277 | pg_skip_alias_edge (struct graph_edge *e) |
2278 | { |
2279 | struct pg_edata *data = (struct pg_edata *)e->data; |
2280 | return (data != NULL && data->alias_ddrs.length () > 0); |
2281 | } |
2282 | |
2283 | /* Callback function freeing data attached to edge E of graph. */ |
2284 | |
2285 | static void |
2286 | free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *) |
2287 | { |
2288 | if (e->data != NULL) |
2289 | { |
2290 | struct pg_edata *data = (struct pg_edata *)e->data; |
2291 | data->alias_ddrs.release (); |
2292 | delete data; |
2293 | } |
2294 | } |
2295 | |
/* Free data attached to vertices of partition dependence graph PG.  */
2297 | |
2298 | static void |
2299 | free_partition_graph_vdata (struct graph *pg) |
2300 | { |
2301 | int i; |
2302 | struct pg_vdata *data; |
2303 | |
2304 | for (i = 0; i < pg->n_vertices; ++i) |
2305 | { |
2306 | data = (struct pg_vdata *)pg->vertices[i].data; |
2307 | delete data; |
2308 | } |
2309 | } |
2310 | |
/* Build and return partition dependence graph for PARTITIONS.  RDG is
   reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
   is true, data dependence caused by possible alias between references
   is ignored, as if it doesn't exist at all; otherwise all dependences
   are considered.  */
2316 | |
2317 | struct graph * |
2318 | loop_distribution::build_partition_graph (struct graph *rdg, |
2319 | vec<struct partition *> *partitions, |
2320 | bool ignore_alias_p) |
2321 | { |
2322 | int i, j; |
2323 | struct partition *partition1, *partition2; |
2324 | graph *pg = new_graph (partitions->length ()); |
2325 | auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p; |
2326 | |
2327 | alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs; |
2328 | |
2329 | init_partition_graph_vertices (pg, partitions); |
2330 | |
  for (i = 0; partitions->iterate (i, &partition1); ++i)
    {
      for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2334 | { |
2335 | /* dependence direction - 0 is no dependence, -1 is back, |
2336 | 1 is forth, 2 is both (we can stop then, merging will occur). */ |
2337 | int dir = 0; |
2338 | |
2339 | /* If the first partition has reduction, add back edge; if the |
2340 | second partition has reduction, add forth edge. This makes |
2341 | sure that reduction partition will be sorted as the last one. */ |
	  if (partition_reduction_p (partition1))
	    dir = -1;
	  else if (partition_reduction_p (partition2))
2345 | dir = 1; |
2346 | |
2347 | /* Cleanup the temporary vector. */ |
	  alias_ddrs.truncate (0);
2349 | |
	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
					 partition2->datarefs, alias_ddrs_p);
2352 | |
	  /* Add an edge to the partition graph if there exists a dependence.
	     There are two types of edges.  One type is caused by a
	     compilation time known dependence and cannot be resolved by
	     runtime alias check.  The other type can be resolved by
	     runtime alias check.  */
2358 | if (dir == 1 || dir == 2 |
2359 | || alias_ddrs.length () > 0) |
2360 | { |
2361 | /* Attach data dependence relations to edge that can be resolved |
2362 | by runtime alias check. */ |
2363 | bool alias_edge_p = (dir != 1 && dir != 2); |
	      add_partition_graph_edge (pg, i, j,
					alias_edge_p ? &alias_ddrs : NULL);
2366 | } |
2367 | if (dir == -1 || dir == 2 |
2368 | || alias_ddrs.length () > 0) |
2369 | { |
2370 | /* Attach data dependence relations to edge that can be resolved |
2371 | by runtime alias check. */ |
2372 | bool alias_edge_p = (dir != -1 && dir != 2); |
	      add_partition_graph_edge (pg, j, i,
					alias_edge_p ? &alias_ddrs : NULL);
2375 | } |
2376 | } |
2377 | } |
2378 | return pg; |
2379 | } |
2380 | |
2381 | /* Sort partitions in PG in descending post order and store them in |
2382 | PARTITIONS. */ |
2383 | |
2384 | static void |
2385 | sort_partitions_by_post_order (struct graph *pg, |
2386 | vec<struct partition *> *partitions) |
2387 | { |
2388 | int i; |
2389 | struct pg_vdata *data; |
2390 | |
2391 | /* Now order the remaining nodes in descending postorder. */ |
2392 | qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp); |
  partitions->truncate (0);
2394 | for (i = 0; i < pg->n_vertices; ++i) |
2395 | { |
2396 | data = (struct pg_vdata *)pg->vertices[i].data; |
2397 | if (data->partition) |
	partitions->safe_push (data->partition);
2399 | } |
2400 | } |
2401 | |
/* Given reduced dependence graph RDG, merge strongly connected components
   of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependences caused by
   possible alias between references are ignored, as if they don't exist
   at all; otherwise all dependences are considered.  */

void
2403 | loop_distribution::merge_dep_scc_partitions (struct graph *rdg, |
2404 | vec<struct partition *> *partitions, |
2405 | bool ignore_alias_p) |
2406 | { |
2407 | struct partition *partition1, *partition2; |
2408 | struct pg_vdata *data; |
2409 | graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p); |
2410 | int i, j, num_sccs = graphds_scc (pg, NULL); |
2411 | |
  /* A strongly connected component means a dependence cycle, so we cannot
     distribute its partitions; fuse them together.  */
2414 | if ((unsigned) num_sccs < partitions->length ()) |
2415 | { |
2416 | for (i = 0; i < num_sccs; ++i) |
2417 | { |
      for (j = 0; partitions->iterate (j, &partition1); ++j)
	if (pg->vertices[j].component == i)
	  break;
      for (j = j + 1; partitions->iterate (j, &partition2); ++j)
	if (pg->vertices[j].component == i)
	  {
	    partition_merge_into (NULL, partition1,
				  partition2, FUSE_SAME_SCC);
	    partition1->type = PTYPE_SEQUENTIAL;
	    (*partitions)[j] = NULL;
	    partition_free (partition2);
	    data = (struct pg_vdata *)pg->vertices[j].data;
	    data->partition = NULL;
2431 | } |
2432 | } |
2433 | } |
2434 | |
2435 | sort_partitions_by_post_order (pg, partitions); |
2436 | gcc_assert (partitions->length () == (unsigned)num_sccs); |
2437 | free_partition_graph_vdata (pg); |
2438 | for_each_edge (pg, free_partition_graph_edata_cb, NULL); |
  free_graph (pg);
2440 | } |
2441 | |
2442 | /* Callback function for traversing edge E in graph G. DATA is private |
2443 | callback data. */ |
2444 | |
2445 | static void |
2446 | pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data) |
2447 | { |
2448 | int i, j, component; |
2449 | struct pg_edge_callback_data *cbdata; |
2450 | struct pg_edata *edata = (struct pg_edata *) e->data; |
2451 | |
  /* If the edge doesn't have attached data dependence, it represents
     compilation time known dependences.  This type of dependence cannot
     be resolved by runtime alias check.  */
2455 | if (edata == NULL || edata->alias_ddrs.length () == 0) |
2456 | return; |
2457 | |
2458 | cbdata = (struct pg_edge_callback_data *) data; |
2459 | i = e->src; |
2460 | j = e->dest; |
2461 | component = cbdata->vertices_component[i]; |
  /* Vertices are topologically sorted according to compilation time
     known dependences, so we can break strongly connected components
     by removing edges of the opposite direction, i.e., edges pointing
     from a vertex with smaller post number to a vertex with bigger post
     number.  */
2467 | if (g->vertices[i].post < g->vertices[j].post |
2468 | /* We only need to remove edges connecting vertices in the same |
2469 | strong connected component to break it. */ |
2470 | && component == cbdata->vertices_component[j] |
2471 | /* Check if we want to break the strong connected component or not. */ |
2472 | && !bitmap_bit_p (cbdata->sccs_to_merge, component)) |
    cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2474 | } |
2475 | |
2476 | /* Callback function for traversing edge E. DATA is private |
2477 | callback data. */ |
2478 | |
2479 | static void |
2480 | pg_unmark_merged_alias_ddrs (struct graph *, struct graph_edge *e, void *data) |
2481 | { |
2482 | int i, j, component; |
2483 | struct pg_edge_callback_data *cbdata; |
2484 | struct pg_edata *edata = (struct pg_edata *) e->data; |
2485 | |
2486 | if (edata == NULL || edata->alias_ddrs.length () == 0) |
2487 | return; |
2488 | |
2489 | cbdata = (struct pg_edge_callback_data *) data; |
2490 | i = e->src; |
2491 | j = e->dest; |
2492 | component = cbdata->vertices_component[i]; |
2493 | /* Make sure to not skip vertices inside SCCs we are going to merge. */ |
2494 | if (component == cbdata->vertices_component[j] |
2495 | && bitmap_bit_p (cbdata->sccs_to_merge, component)) |
2496 | { |
2497 | edata->alias_ddrs.release (); |
2498 | delete edata; |
2499 | e->data = NULL; |
2500 | } |
2501 | } |
2502 | |
/* This is the main function breaking strongly connected components in
   PARTITIONS, given reduced dependence graph RDG.  Store data dependence
   relations for runtime alias check in ALIAS_DDRS.  */
2506 | void |
2507 | loop_distribution::break_alias_scc_partitions (struct graph *rdg, |
2508 | vec<struct partition *> *partitions, |
2509 | vec<ddr_p> *alias_ddrs) |
2510 | { |
2511 | int i, j, k, num_sccs, num_sccs_no_alias = 0; |
2512 | /* Build partition dependence graph. */ |
  graph *pg = build_partition_graph (rdg, partitions, false);
2514 | |
  alias_ddrs->truncate (0);
2516 | /* Find strong connected components in the graph, with all dependence edges |
2517 | considered. */ |
2518 | num_sccs = graphds_scc (pg, NULL); |
2519 | /* All SCCs now can be broken by runtime alias checks because SCCs caused by |
2520 | compilation time known dependences are merged before this function. */ |
2521 | if ((unsigned) num_sccs < partitions->length ()) |
2522 | { |
2523 | struct pg_edge_callback_data cbdata; |
2524 | auto_bitmap sccs_to_merge; |
2525 | auto_vec<enum partition_type> scc_types; |
2526 | struct partition *partition, *first; |
2527 | |
      /* If all partitions in a SCC have the same type, we can simply merge
	 the SCC.  This loop finds out such SCCs and records them in a
	 bitmap.  */
2530 | bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs); |
2531 | for (i = 0; i < num_sccs; ++i) |
2532 | { |
	  for (j = 0; partitions->iterate (j, &first); ++j)
	    if (pg->vertices[j].component == i)
	      break;

	  bool same_type = true, all_builtins = partition_builtin_p (first);
	  for (++j; partitions->iterate (j, &partition); ++j)
2539 | { |
2540 | if (pg->vertices[j].component != i) |
2541 | continue; |
2542 | |
2543 | if (first->type != partition->type) |
2544 | { |
2545 | same_type = false; |
2546 | break; |
2547 | } |
2548 | all_builtins &= partition_builtin_p (partition); |
2549 | } |
	  /* Merge the SCC if all partitions in the SCC have the same type,
	     even though the result partition is sequential, because the
	     vectorizer can do a better runtime alias check.  One exception
	     is when all partitions in the SCC are builtins.  */
2554 | if (!same_type || all_builtins) |
2555 | bitmap_clear_bit (sccs_to_merge, i); |
2556 | } |
2557 | |
2558 | /* Initialize callback data for traversing. */ |
2559 | cbdata.sccs_to_merge = sccs_to_merge; |
2560 | cbdata.alias_ddrs = alias_ddrs; |
2561 | cbdata.vertices_component = XNEWVEC (int, pg->n_vertices); |
      /* Record the component information which will be corrupted by the
	 next graph SCC finding call.  */
2564 | for (i = 0; i < pg->n_vertices; ++i) |
2565 | cbdata.vertices_component[i] = pg->vertices[i].component; |
2566 | |
2567 | /* Collect data dependences for runtime alias checks to break SCCs. */ |
2568 | if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs) |
2569 | { |
2570 | /* For SCCs we want to merge clear all alias_ddrs for edges |
2571 | inside the component. */ |
2572 | for_each_edge (pg, pg_unmark_merged_alias_ddrs, &cbdata); |
2573 | |
2574 | /* Run SCC finding algorithm again, with alias dependence edges |
2575 | skipped. This is to topologically sort partitions according to |
2576 | compilation time known dependence. Note the topological order |
2577 | is stored in the form of pg's post order number. */ |
2578 | num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge); |
2579 | /* We cannot assert partitions->length () == num_sccs_no_alias |
2580 | since we are not ignoring alias edges in cycles we are |
2581 | going to merge. That's required to compute correct postorder. */ |
2582 | /* With topological order, we can construct two subgraphs L and R. |
2583 | L contains edge <x, y> where x < y in terms of post order, while |
2584 | R contains edge <x, y> where x > y. Edges for compilation time |
2585 | known dependence all fall in R, so we break SCCs by removing all |
2586 | (alias) edges of in subgraph L. */ |
2587 | for_each_edge (pg, pg_collect_alias_ddrs, &cbdata); |
2588 | } |
2589 | |
2590 | /* For SCC that doesn't need to be broken, merge it. */ |
2591 | for (i = 0; i < num_sccs; ++i) |
2592 | { |
2593 | if (!bitmap_bit_p (sccs_to_merge, i)) |
2594 | continue; |
2595 | |
	  for (j = 0; partitions->iterate (j, &first); ++j)
	    if (cbdata.vertices_component[j] == i)
	      break;
	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
2600 | { |
2601 | struct pg_vdata *data; |
2602 | |
2603 | if (cbdata.vertices_component[k] != i) |
2604 | continue; |
2605 | |
	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2607 | (*partitions)[k] = NULL; |
2608 | partition_free (partition); |
2609 | data = (struct pg_vdata *)pg->vertices[k].data; |
2610 | gcc_assert (data->id == k); |
2611 | data->partition = NULL; |
2612 | /* The result partition of merged SCC must be sequential. */ |
2613 | first->type = PTYPE_SEQUENTIAL; |
2614 | } |
2615 | } |
      /* If the reduction partition's SCC is broken by runtime alias checks,
	 we force a negative post order on it, making sure it will be
	 scheduled last.  */
2619 | if (num_sccs_no_alias > 0) |
2620 | { |
2621 | j = -1; |
2622 | for (i = 0; i < pg->n_vertices; ++i) |
2623 | { |
2624 | struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data; |
	  if (data->partition && partition_reduction_p (data->partition))
2626 | { |
2627 | gcc_assert (j == -1); |
2628 | j = i; |
2629 | } |
2630 | } |
2631 | if (j >= 0) |
2632 | pg->vertices[j].post = -1; |
2633 | } |
2634 | |
      free (cbdata.vertices_component);
2636 | } |
2637 | |
2638 | sort_partitions_by_post_order (pg, partitions); |
2639 | free_partition_graph_vdata (pg); |
2640 | for_each_edge (pg, free_partition_graph_edata_cb, NULL); |
2641 | free_graph (g: pg); |
2642 | |
2643 | if (dump_file && (dump_flags & TDF_DETAILS)) |
2644 | { |
      fprintf (dump_file, "Possible alias data dependence to break:\n");
2646 | dump_data_dependence_relations (dump_file, *alias_ddrs); |
2647 | } |
2648 | } |
2649 | |
2650 | /* Compute and return an expression whose value is the segment length which |
2651 | will be accessed by DR in NITERS iterations. */ |
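
/* For instance (an illustrative example), a data reference with DR_STEP 4
   accessed in NITERS = 100 iterations covers a segment of
   (100 - 1) * 4 = 396 bytes.  */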
2652 | |
2653 | static tree |
2654 | data_ref_segment_size (struct data_reference *dr, tree niters) |
2655 | { |
2656 | niters = size_binop (MINUS_EXPR, |
2657 | fold_convert (sizetype, niters), |
2658 | size_one_node); |
2659 | return size_binop (MULT_EXPR, |
2660 | fold_convert (sizetype, DR_STEP (dr)), |
2661 | fold_convert (sizetype, niters)); |
2662 | } |
2663 | |
2664 | /* Return true if LOOP's latch is dominated by statement for data reference |
2665 | DR. */ |
2666 | |
2667 | static inline bool |
2668 | latch_dominated_by_data_ref (class loop *loop, data_reference *dr) |
2669 | { |
2670 | return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, |
2671 | gimple_bb (DR_STMT (dr))); |
2672 | } |
2673 | |
2674 | /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's |
2675 | data dependence relations ALIAS_DDRS. */ |
2676 | |
2677 | static void |
2678 | compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs, |
2679 | vec<dr_with_seg_len_pair_t> *comp_alias_pairs) |
2680 | { |
2681 | unsigned int i; |
2682 | unsigned HOST_WIDE_INT factor = 1; |
2683 | tree niters_plus_one, niters = number_of_latch_executions (loop); |
2684 | |
2685 | gcc_assert (niters != NULL_TREE && niters != chrec_dont_know); |
2686 | niters = fold_convert (sizetype, niters); |
2687 | niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node); |
2688 | |
2689 | if (dump_file && (dump_flags & TDF_DETAILS)) |
    fprintf (dump_file, "Creating alias check pairs:\n");
2691 | |
2692 | /* Iterate all data dependence relations and compute alias check pairs. */ |
2693 | for (i = 0; i < alias_ddrs->length (); i++) |
2694 | { |
2695 | ddr_p ddr = (*alias_ddrs)[i]; |
2696 | struct data_reference *dr_a = DDR_A (ddr); |
2697 | struct data_reference *dr_b = DDR_B (ddr); |
2698 | tree seg_length_a, seg_length_b; |
2699 | |
      if (latch_dominated_by_data_ref (loop, dr_a))
	seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
      else
	seg_length_a = data_ref_segment_size (dr_a, niters);

      if (latch_dominated_by_data_ref (loop, dr_b))
	seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
      else
	seg_length_b = data_ref_segment_size (dr_b, niters);
2709 | |
2710 | unsigned HOST_WIDE_INT access_size_a |
2711 | = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a)))); |
2712 | unsigned HOST_WIDE_INT access_size_b |
2713 | = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b)))); |
2714 | unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a))); |
2715 | unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b))); |
2716 | |
2717 | dr_with_seg_len_pair_t dr_with_seg_len_pair |
2718 | (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a), |
2719 | dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b), |
2720 | /* ??? Would WELL_ORDERED be safe? */ |
2721 | dr_with_seg_len_pair_t::REORDERED); |
2722 | |
      comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2724 | } |
2725 | |
2726 | if (tree_fits_uhwi_p (niters)) |
2727 | factor = tree_to_uhwi (niters); |
2728 | |
2729 | /* Prune alias check pairs. */ |
2730 | prune_runtime_alias_test_list (comp_alias_pairs, factor); |
2731 | if (dump_file && (dump_flags & TDF_DETAILS)) |
    fprintf (dump_file,
	     "Improved number of alias checks from %d to %d\n",
	     alias_ddrs->length (), comp_alias_pairs->length ());
2735 | } |
2736 | |
2737 | /* Given data dependence relations in ALIAS_DDRS, generate runtime alias |
2738 | checks and version LOOP under condition of these runtime alias checks. */ |
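
/* When the check is cancelable, the generated code looks like (an
   illustrative sketch):

     |if (IFN_LOOP_DIST_ALIAS (<loop num>, <runtime alias checks>))
     |  <loop to be distributed>
     |else
     |  <copied loop, kept undistributed and marked dont_vectorize>

   The internal function call is folded away later, by the vectorizer
   when possible.  */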
2739 | |
2740 | static void |
2741 | version_loop_by_alias_check (vec<struct partition *> *partitions, |
2742 | class loop *loop, vec<ddr_p> *alias_ddrs) |
2743 | { |
2744 | profile_probability prob; |
2745 | basic_block cond_bb; |
2746 | class loop *nloop; |
2747 | tree lhs, arg0, cond_expr = NULL_TREE; |
2748 | gimple_seq cond_stmts = NULL; |
2749 | gimple *call_stmt = NULL; |
2750 | auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs; |
2751 | |
2752 | /* Generate code for runtime alias checks if necessary. */ |
2753 | gcc_assert (alias_ddrs->length () > 0); |
2754 | |
2755 | if (dump_file && (dump_flags & TDF_DETAILS)) |
    fprintf (dump_file,
	     "Version loop <%d> with runtime alias check\n", loop->num);
2758 | |
  compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2760 | create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr); |
2761 | cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts, |
2762 | is_gimple_val, NULL_TREE); |
2763 | |
2764 | /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */ |
2765 | bool cancelable_p = flag_tree_loop_vectorize; |
2766 | if (cancelable_p) |
2767 | { |
2768 | unsigned i = 0; |
2769 | struct partition *partition; |
      for (; partitions->iterate (i, &partition); ++i)
2771 | if (!partition_builtin_p (partition)) |
2772 | break; |
2773 | |
2774 | /* If all partitions are builtins, distributing it would be profitable and |
2775 | we don't want to cancel the runtime alias checks. */ |
2776 | if (i == partitions->length ()) |
2777 | cancelable_p = false; |
2778 | } |
2779 | |
2780 | /* Generate internal function call for loop distribution alias check if the |
2781 | runtime alias check should be cancelable. */ |
2782 | if (cancelable_p) |
2783 | { |
2784 | call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS, |
2785 | 2, NULL_TREE, cond_expr); |
2786 | lhs = make_ssa_name (boolean_type_node); |
      gimple_call_set_lhs (call_stmt, lhs);
2788 | } |
2789 | else |
2790 | lhs = cond_expr; |
2791 | |
  prob = profile_probability::guessed_always ().apply_scale (9, 10);
2793 | initialize_original_copy_tables (); |
2794 | nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (), |
2795 | prob, prob.invert (), true); |
2796 | free_original_copy_tables (); |
2797 | /* Record the original loop number in newly generated loops. In case of |
2798 | distribution, the original loop will be distributed and the new loop |
2799 | is kept. */ |
2800 | loop->orig_loop_num = nloop->num; |
2801 | nloop->orig_loop_num = nloop->num; |
2802 | nloop->dont_vectorize = true; |
2803 | nloop->force_vectorize = false; |
2804 | |
2805 | if (call_stmt) |
2806 | { |
2807 | /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original |
2808 | loop could be destroyed. */ |
2809 | arg0 = build_int_cst (integer_type_node, loop->orig_loop_num); |
      gimple_call_set_arg (call_stmt, 0, arg0);
2811 | gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt); |
2812 | } |
2813 | |
2814 | if (cond_stmts) |
2815 | { |
      gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2817 | gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT); |
2818 | } |
2819 | update_ssa (TODO_update_ssa_no_phi); |
2820 | } |
2821 | |
/* Return true if loop versioning is needed to distribute PARTITIONS.
   ALIAS_DDRS are data dependence relations for runtime alias check.  */
2824 | |
2825 | static inline bool |
2826 | version_for_distribution_p (vec<struct partition *> *partitions, |
2827 | vec<ddr_p> *alias_ddrs) |
2828 | { |
2829 | /* No need to version loop if we have only one partition. */ |
2830 | if (partitions->length () == 1) |
2831 | return false; |
2832 | |
2833 | /* Need to version loop if runtime alias check is necessary. */ |
2834 | return (alias_ddrs->length () > 0); |
2835 | } |
2836 | |
2837 | /* Compare base offset of builtin mem* partitions P1 and P2. */ |
2838 | |
2839 | static int |
2840 | offset_cmp (const void *vp1, const void *vp2) |
2841 | { |
2842 | struct partition *p1 = *(struct partition *const *) vp1; |
2843 | struct partition *p2 = *(struct partition *const *) vp2; |
2844 | unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset; |
2845 | unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset; |
2846 | return (o2 < o1) - (o1 < o2); |
2847 | } |
2848 | |
2849 | /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special |
2850 | case optimization transforming below code: |
2851 | |
2852 | __builtin_memset (&obj, 0, 100); |
2853 | _1 = &obj + 100; |
2854 | __builtin_memset (_1, 0, 200); |
2855 | _2 = &obj + 300; |
2856 | __builtin_memset (_2, 0, 100); |
2857 | |
2858 | into: |
2859 | |
2860 | __builtin_memset (&obj, 0, 400); |
2861 | |
   Note we don't have dependence information between different partitions
   at this point; as a result, we can't handle non-adjacent memset builtin
   partitions since the dependence might be broken.  */
2865 | |
2866 | static void |
2867 | fuse_memset_builtins (vec<struct partition *> *partitions) |
2868 | { |
2869 | unsigned i, j; |
2870 | struct partition *part1, *part2; |
2871 | tree rhs1, rhs2; |
2872 | |
  for (i = 0; partitions->iterate (i, &part1);)
2874 | { |
2875 | if (part1->kind != PKIND_MEMSET) |
2876 | { |
2877 | i++; |
2878 | continue; |
2879 | } |
2880 | |
2881 | /* Find sub-array of memset builtins of the same base. Index range |
2882 | of the sub-array is [i, j) with "j > i". */ |
      for (j = i + 1; partitions->iterate (j, &part2); ++j)
	{
	  if (part2->kind != PKIND_MEMSET
	      || !operand_equal_p (part1->builtin->dst_base_base,
				   part2->builtin->dst_base_base, 0))
2888 | break; |
2889 | |
2890 | /* Memset calls setting different values can't be merged. */ |
2891 | rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); |
2892 | rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); |
	  if (!operand_equal_p (rhs1, rhs2, 0))
2894 | break; |
2895 | } |
2896 | |
2897 | /* Stable sort is required in order to avoid breaking dependence. */ |
2898 | gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i], |
2899 | offset_cmp); |
2900 | /* Continue with next partition. */ |
2901 | i = j; |
2902 | } |
2903 | |
2904 | /* Merge all consecutive memset builtin partitions. */ |
2905 | for (i = 0; i < partitions->length () - 1;) |
2906 | { |
2907 | part1 = (*partitions)[i]; |
2908 | if (part1->kind != PKIND_MEMSET) |
2909 | { |
2910 | i++; |
2911 | continue; |
2912 | } |
2913 | |
2914 | part2 = (*partitions)[i + 1]; |
2915 | /* Only merge memset partitions of the same base and with constant |
2916 | access sizes. */ |
2917 | if (part2->kind != PKIND_MEMSET |
2918 | || TREE_CODE (part1->builtin->size) != INTEGER_CST |
2919 | || TREE_CODE (part2->builtin->size) != INTEGER_CST |
	|| !operand_equal_p (part1->builtin->dst_base_base,
			     part2->builtin->dst_base_base, 0))
2922 | { |
2923 | i++; |
2924 | continue; |
2925 | } |
2926 | rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); |
2927 | rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); |
      int bytev1 = const_with_all_bytes_same (rhs1);
      int bytev2 = const_with_all_bytes_same (rhs2);
2930 | /* Only merge memset partitions of the same value. */ |
2931 | if (bytev1 != bytev2 || bytev1 == -1) |
2932 | { |
2933 | i++; |
2934 | continue; |
2935 | } |
      wide_int end1 = wi::add (part1->builtin->dst_base_offset,
			       wi::to_wide (part1->builtin->size));
2938 | /* Only merge adjacent memset partitions. */ |
      if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2940 | { |
2941 | i++; |
2942 | continue; |
2943 | } |
2944 | /* Merge partitions[i] and partitions[i+1]. */ |
2945 | part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype, |
2946 | part1->builtin->size, |
2947 | part2->builtin->size); |
      partition_free (part2);
      partitions->ordered_remove (i + 1);
2950 | } |
2951 | } |
2952 | |
/* Fuse PARTITIONS of LOOP if necessary before finalizing the distribution.
   ALIAS_DDRS contains data dependence relations which need runtime alias
   checks.  */

void
2954 | loop_distribution::finalize_partitions (class loop *loop, |
2955 | vec<struct partition *> *partitions, |
2956 | vec<ddr_p> *alias_ddrs) |
2957 | { |
2958 | unsigned i; |
2959 | struct partition *partition, *a; |
2960 | |
2961 | if (partitions->length () == 1 |
2962 | || alias_ddrs->length () > 0) |
2963 | return; |
2964 | |
2965 | unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0; |
2966 | bool same_type_p = true; |
2967 | enum partition_type type = ((*partitions)[0])->type; |
  for (i = 0; partitions->iterate (i, &partition); ++i)
2969 | { |
2970 | same_type_p &= (type == partition->type); |
2971 | if (partition_builtin_p (partition)) |
2972 | { |
2973 | num_builtin++; |
2974 | continue; |
2975 | } |
2976 | num_normal++; |
2977 | if (partition->kind == PKIND_PARTIAL_MEMSET) |
2978 | num_partial_memset++; |
2979 | } |
2980 | |
  /* Don't distribute the current loop into too many loops given we don't
     have a memory stream cost model.  Be even more conservative in case of
     loop nest distribution.  */
2984 | if ((same_type_p && num_builtin == 0 |
2985 | && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1)) |
2986 | || (loop->inner != NULL |
2987 | && i >= NUM_PARTITION_THRESHOLD && num_normal > 1) |
2988 | || (loop->inner == NULL |
2989 | && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) |
2990 | { |
2991 | a = (*partitions)[0]; |
      for (i = 1; partitions->iterate (i, &partition); ++i)
	{
	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
	  partition_free (partition);
2996 | } |
      partitions->truncate (1);
2998 | } |
2999 | |
3000 | /* Fuse memset builtins if possible. */ |
3001 | if (partitions->length () > 1) |
3002 | fuse_memset_builtins (partitions); |
3003 | } |
3004 | |
3005 | /* Distributes the code from LOOP in such a way that producer statements |
3006 | are placed before consumer statements. Tries to separate only the |
3007 | statements from STMTS into separate loops. Returns the number of |
3008 | distributed loops. Set NB_CALLS to number of generated builtin calls. |
3009 | Set *DESTROY_P to whether LOOP needs to be destroyed. */ |
3010 | |
3011 | int |
3012 | loop_distribution::distribute_loop (class loop *loop, |
3013 | const vec<gimple *> &stmts, |
3014 | control_dependences *cd, int *nb_calls, bool *destroy_p, |
3015 | bool only_patterns_p) |
3016 | { |
3017 | ddrs_table = new hash_table<ddr_hasher> (389); |
3018 | struct graph *rdg; |
3019 | partition *partition; |
3020 | int i, nbp; |
3021 | |
3022 | *destroy_p = false; |
3023 | *nb_calls = 0; |
  loop_nest.create (0);
3025 | if (!find_loop_nest (loop, &loop_nest)) |
3026 | { |
3027 | loop_nest.release (); |
3028 | delete ddrs_table; |
3029 | return 0; |
3030 | } |
3031 | |
  datarefs_vec.create (20);
3033 | has_nonaddressable_dataref_p = false; |
3034 | rdg = build_rdg (loop, cd); |
3035 | if (!rdg) |
3036 | { |
3037 | if (dump_file && (dump_flags & TDF_DETAILS)) |
	fprintf (dump_file,
		 "Loop %d not distributed: failed to build the RDG.\n",
3040 | loop->num); |
3041 | |
3042 | loop_nest.release (); |
3043 | free_data_refs (datarefs_vec); |
3044 | delete ddrs_table; |
3045 | return 0; |
3046 | } |
3047 | |
3048 | if (datarefs_vec.length () > MAX_DATAREFS_NUM) |
3049 | { |
3050 | if (dump_file && (dump_flags & TDF_DETAILS)) |
	fprintf (dump_file,
		 "Loop %d not distributed: too many memory references.\n",
3053 | loop->num); |
3054 | |
3055 | free_rdg (rdg); |
3056 | loop_nest.release (); |
3057 | free_data_refs (datarefs_vec); |
3058 | delete ddrs_table; |
3059 | return 0; |
3060 | } |
3061 | |
3062 | data_reference_p dref; |
  for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
3064 | dref->aux = (void *) (uintptr_t) i; |
3065 | |
3066 | if (dump_file && (dump_flags & TDF_DETAILS)) |
    dump_rdg (dump_file, rdg);
3068 | |
3069 | auto_vec<struct partition *, 3> partitions; |
  rdg_build_partitions (rdg, stmts, &partitions);
3071 | |
3072 | auto_vec<ddr_p> alias_ddrs; |
3073 | |
3074 | auto_bitmap stmt_in_all_partitions; |
3075 | bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts); |
  for (i = 1; partitions.iterate (i, &partition); ++i)
3077 | bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts); |
3078 | |
3079 | bool any_builtin = false; |
3080 | bool reduction_in_all = false; |
3081 | int reduction_partition_num = -1; |
3082 | FOR_EACH_VEC_ELT (partitions, i, partition) |
3083 | { |
3084 | reduction_in_all |
3085 | |= classify_partition (loop, rdg, partition, stmt_in_all_partitions); |
3086 | any_builtin |= partition_builtin_p (partition); |
3087 | } |
3088 | |
3089 | /* If we are only distributing patterns but did not detect any, |
3090 | simply bail out. */ |
3091 | if (only_patterns_p |
3092 | && !any_builtin) |
3093 | { |
3094 | nbp = 0; |
3095 | goto ldist_done; |
3096 | } |
3097 | |
3098 | /* If we are only distributing patterns fuse all partitions that |
3099 | were not classified as builtins. This also avoids chopping |
     a loop into pieces, separated by builtin calls.  That is, we
     want either no loop body or a single one remaining.  */
3102 | struct partition *into; |
3103 | if (only_patterns_p) |
3104 | { |
      for (i = 0; partitions.iterate (i, &into); ++i)
	if (!partition_builtin_p (into))
	  break;
      for (++i; partitions.iterate (i, &partition); ++i)
	if (!partition_builtin_p (partition))
	  {
	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
	    partitions.unordered_remove (i);
	    partition_free (partition);
	    i--;
	  }
3116 | } |
3117 | |
3118 | /* Due to limitations in the transform phase we have to fuse all |
3119 | reduction partitions into the last partition so the existing |
3120 | loop will contain all loop-closed PHI nodes. */ |
  for (i = 0; partitions.iterate (i, &into); ++i)
    if (partition_reduction_p (into))
      break;
  for (i = i + 1; partitions.iterate (i, &partition); ++i)
    if (partition_reduction_p (partition))
      {
	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
	partitions.unordered_remove (i);
	partition_free (partition);
	i--;
      }
3132 | |
3133 | /* Apply our simple cost model - fuse partitions with similar |
3134 | memory accesses. */ |
  for (i = 0; partitions.iterate (i, &into); ++i)
    {
      bool changed = false;
      for (int j = i + 1; partitions.iterate (j, &partition); ++j)
	{
	  if (share_memory_accesses (rdg, into, partition))
	    {
	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
	      partitions.unordered_remove (j);
3144 | partition_free (partition); |
3145 | j--; |
3146 | changed = true; |
3147 | } |
3148 | } |
      /* Suppose the scan above fused partitions 0 1 2 into 0,2 and 1
	 because 0 and 2 have similar accesses while 0 and 1 do not.  If 1
	 and 2 also have similar accesses, we would fail to consider
	 merging 1 into 0,2 in the next iteration.  So try again if we did
	 any merging into partition I.  */
3153 | if (changed) |
3154 | i--; |
3155 | } |
3156 | |
3157 | /* Put a non-builtin partition last if we need to preserve a reduction. |
     In most cases this helps to keep a normal partition last, avoiding
     spilling a reduction result across builtin calls.
3160 | ??? The proper way would be to use dependences to see whether we |
3161 | can move builtin partitions earlier during merge_dep_scc_partitions |
3162 | and its sort_partitions_by_post_order. Especially when the |
3163 | dependence graph is composed of multiple independent subgraphs the |
3164 | heuristic does not work reliably. */ |
3165 | if (reduction_in_all |
      && partition_builtin_p (partitions.last ()))
3167 | FOR_EACH_VEC_ELT (partitions, i, partition) |
3168 | if (!partition_builtin_p (partition)) |
3169 | { |
	partitions.unordered_remove (i);
	partitions.quick_push (partition);
3172 | break; |
3173 | } |
3174 | |
  /* Build the partition dependence graph and fuse partitions in
     strongly connected components.  */
3177 | if (partitions.length () > 1) |
3178 | { |
      /* Don't support loop nest distribution under runtime alias checks,
	 since they are not likely to enable many vectorization
	 opportunities.  Also avoid it if the loop has any data reference
	 that may not be addressable, since the runtime alias check needs
	 to take and compare addresses of the objects.  */
3183 | if (loop->inner || has_nonaddressable_dataref_p) |
	merge_dep_scc_partitions (rdg, &partitions, false);
      else
	{
	  merge_dep_scc_partitions (rdg, &partitions, true);
	  if (partitions.length () > 1)
	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
3190 | } |
3191 | } |
3192 | |
  finalize_partitions (loop, &partitions, &alias_ddrs);
3194 | |
3195 | /* If there is a reduction in all partitions make sure the last |
3196 | non-builtin partition provides the LC PHI defs. */ |
3197 | if (reduction_in_all) |
3198 | { |
3199 | FOR_EACH_VEC_ELT (partitions, i, partition) |
3200 | if (!partition_builtin_p (partition)) |
3201 | reduction_partition_num = i; |
3202 | if (reduction_partition_num == -1) |
3203 | { |
	  /* If all partitions are builtin, force the last one to be
	     code generated as a normal partition.  */
3206 | partition = partitions.last (); |
3207 | partition->kind = PKIND_NORMAL; |
3208 | } |
3209 | } |
3210 | |
3211 | nbp = partitions.length (); |
3212 | if (nbp == 0 |
      || (nbp == 1 && !partition_builtin_p (partitions[0]))
3214 | || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) |
3215 | { |
3216 | nbp = 0; |
3217 | goto ldist_done; |
3218 | } |
3219 | |
  if (version_for_distribution_p (&partitions, &alias_ddrs))
    version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
3222 | |
3223 | if (dump_file && (dump_flags & TDF_DETAILS)) |
3224 | { |
      fprintf (dump_file,
	       "distribute loop <%d> into partitions:\n", loop->num);
      dump_rdg_partitions (dump_file, partitions);
3228 | } |
3229 | |
3230 | FOR_EACH_VEC_ELT (partitions, i, partition) |
3231 | { |
3232 | if (partition_builtin_p (partition)) |
3233 | (*nb_calls)++; |
      *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1,
						 i == reduction_partition_num);
3236 | } |
3237 | |
3238 | ldist_done: |
3239 | loop_nest.release (); |
3240 | free_data_refs (datarefs_vec); |
3241 | for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin (); |
3242 | iter != ddrs_table->end (); ++iter) |
3243 | { |
3244 | free_dependence_relation (*iter); |
3245 | *iter = NULL; |
3246 | } |
3247 | delete ddrs_table; |
3248 | |
3249 | FOR_EACH_VEC_ELT (partitions, i, partition) |
3250 | partition_free (partition); |
3251 | |
3252 | free_rdg (rdg); |
3253 | return nbp - *nb_calls; |
3254 | } |
3255 | |
3256 | |
3257 | void loop_distribution::bb_top_order_init (void) |
3258 | { |
3259 | int rpo_num; |
3260 | int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS); |
3261 | edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); |
3262 | bitmap exit_bbs = BITMAP_ALLOC (NULL); |
3263 | |
3264 | bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun)); |
3265 | bb_top_order_index_size = last_basic_block_for_fn (cfun); |
3266 | |
3267 | entry->flags &= ~EDGE_DFS_BACK; |
3268 | bitmap_set_bit (exit_bbs, EXIT_BLOCK); |
3269 | rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true, |
3270 | rpo, NULL); |
3271 | BITMAP_FREE (exit_bbs); |
3272 | |
3273 | for (int i = 0; i < rpo_num; i++) |
3274 | bb_top_order_index[rpo[i]] = i; |
3275 | |
  free (rpo);
3277 | } |
3278 | |
3279 | void loop_distribution::bb_top_order_destroy () |
3280 | { |
  free (bb_top_order_index);
3282 | bb_top_order_index = NULL; |
3283 | bb_top_order_index_size = 0; |
3284 | } |
3285 | |
3286 | |
3287 | /* Given LOOP, this function records seed statements for distribution in |
3288 | WORK_LIST. Return false if there is nothing for distribution. */ |
3289 | |
3290 | static bool |
3291 | find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list) |
3292 | { |
3293 | basic_block *bbs = get_loop_body_in_dom_order (loop); |
3294 | |
3295 | /* Initialize the worklist with stmts we seed the partitions with. */ |
3296 | for (unsigned i = 0; i < loop->num_nodes; ++i) |
3297 | { |
3298 | /* In irreducible sub-regions we don't know how to redirect |
3299 | conditions, so fail. See PR100492. */ |
3300 | if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP) |
3301 | { |
3302 | if (dump_file && (dump_flags & TDF_DETAILS)) |
3303 | fprintf (stream: dump_file, format: "loop %d contains an irreducible region.\n" , |
3304 | loop->num); |
	  work_list->truncate (0);
3306 | break; |
3307 | } |
3308 | for (gphi_iterator gsi = gsi_start_phis (bbs[i]); |
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;
	  /* Distribute stmts which have defs that are used outside of
	     the loop.  */
	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
	    continue;
	  work_list->safe_push (phi);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
3324 | |
3325 | /* Ignore clobbers, they do not have true side effects. */ |
	  if (gimple_clobber_p (stmt))
3327 | continue; |
3328 | |
3329 | /* If there is a stmt with side-effects bail out - we |
3330 | cannot and should not distribute this loop. */ |
3331 | if (gimple_has_side_effects (stmt)) |
3332 | { |
	      free (bbs);
3334 | return false; |
3335 | } |
3336 | |
3337 | /* Distribute stmts which have defs that are used outside of |
3338 | the loop. */ |
3339 | if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) |
3340 | ; |
3341 | /* Otherwise only distribute stores for now. */ |
	  else if (!gimple_vdef (stmt))
3343 | continue; |
3344 | |
	  work_list->safe_push (stmt);
3346 | } |
3347 | } |
3348 | bool res = work_list->length () > 0; |
3349 | if (res && !can_copy_bbs_p (bbs, loop->num_nodes)) |
3350 | { |
3351 | if (dump_file && (dump_flags & TDF_DETAILS)) |
3352 | fprintf (stream: dump_file, format: "cannot copy loop %d.\n" , loop->num); |
3353 | res = false; |
3354 | } |
  free (bbs);
3356 | return res; |
3357 | } |
3358 | |
3359 | /* A helper function for generate_{rawmemchr,strlen}_builtin functions in order |
3360 | to place new statements SEQ before LOOP and replace the old reduction |
3361 | variable with the new one. */ |
3362 | |
3363 | static void |
3364 | generate_reduction_builtin_1 (loop_p loop, gimple_seq &seq, |
3365 | tree reduction_var_old, tree reduction_var_new, |
3366 | const char *info, machine_mode load_mode) |
3367 | { |
3368 | gcc_assert (flag_tree_loop_distribute_patterns); |
3369 | |
3370 | /* Place new statements before LOOP. */ |
  gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
3372 | gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING); |
3373 | |
3374 | /* Replace old reduction variable with new one. */ |
3375 | imm_use_iterator iter; |
3376 | gimple *stmt; |
3377 | use_operand_p use_p; |
3378 | FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var_old) |
3379 | { |
3380 | FOR_EACH_IMM_USE_ON_STMT (use_p, iter) |
3381 | SET_USE (use_p, reduction_var_new); |
3382 | |
      update_stmt (stmt);
3384 | } |
3385 | |
3386 | if (dump_file && (dump_flags & TDF_DETAILS)) |
    fprintf (dump_file, info, GET_MODE_NAME (load_mode));
3388 | } |
3389 | |
3390 | /* Generate a call to rawmemchr and place it before LOOP. REDUCTION_VAR is |
3391 | replaced with a fresh SSA name representing the result of the call. */ |
3392 | |
3393 | static void |
3394 | generate_rawmemchr_builtin (loop_p loop, tree reduction_var, |
3395 | data_reference_p store_dr, tree base, tree pattern, |
3396 | location_t loc) |
3397 | { |
3398 | gimple_seq seq = NULL; |
3399 | |
3400 | tree mem = force_gimple_operand (base, &seq, true, NULL_TREE); |
3401 | gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern); |
  tree reduction_var_new = copy_ssa_name (reduction_var);
  gimple_call_set_lhs (fn_call, reduction_var_new);
  gimple_set_location (fn_call, loc);
3405 | gimple_seq_add_stmt (&seq, fn_call); |
3406 | |
3407 | if (store_dr) |
3408 | { |
3409 | gassign *g = gimple_build_assign (DR_REF (store_dr), reduction_var_new); |
3410 | gimple_seq_add_stmt (&seq, g); |
3411 | } |
3412 | |
  generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new,
				"generated rawmemchr%s\n",
				TYPE_MODE (TREE_TYPE (TREE_TYPE (base))));
3416 | } |
3417 | |
/* Helper function for generate_strlen_builtin(,_using_rawmemchr).  */
3419 | |
3420 | static void |
3421 | generate_strlen_builtin_1 (loop_p loop, gimple_seq &seq, |
3422 | tree reduction_var_old, tree reduction_var_new, |
3423 | machine_mode mode, tree start_len) |
3424 | { |
3425 | /* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be |
3426 | converted if types of old and new reduction variable are not compatible. */ |
  reduction_var_new = gimple_convert (&seq, TREE_TYPE (reduction_var_old),
				      reduction_var_new);
3429 | |
3430 | /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start |
3431 | length. */ |
3432 | if (!integer_zerop (start_len)) |
3433 | { |
3434 | tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new)); |
3435 | gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new, |
3436 | start_len); |
3437 | gimple_seq_add_stmt (&seq, g); |
3438 | reduction_var_new = lhs; |
3439 | } |
3440 | |
  generate_reduction_builtin_1 (loop, seq, reduction_var_old,
				reduction_var_new,
				"generated strlen%s\n", mode);
3443 | } |
3444 | |
3445 | /* Generate a call to strlen and place it before LOOP. REDUCTION_VAR is |
3446 | replaced with a fresh SSA name representing the result of the call. */ |
3447 | |
3448 | static void |
3449 | generate_strlen_builtin (loop_p loop, tree reduction_var, tree base, |
3450 | tree start_len, location_t loc) |
3451 | { |
3452 | gimple_seq seq = NULL; |
3453 | |
3454 | tree reduction_var_new = make_ssa_name (size_type_node); |
3455 | |
3456 | tree mem = force_gimple_operand (base, &seq, true, NULL_TREE); |
3457 | tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN)); |
3458 | gimple *fn_call = gimple_build_call (fn, 1, mem); |
  gimple_call_set_lhs (fn_call, reduction_var_new);
  gimple_set_location (fn_call, loc);
3461 | gimple_seq_add_stmt (&seq, fn_call); |
3462 | |
  generate_strlen_builtin_1 (loop, seq, reduction_var, reduction_var_new,
			     QImode, start_len);
3465 | } |
3466 | |
3467 | /* Generate code in order to mimic the behaviour of strlen but this time over |
   an array of elements with a mode different from QI.  REDUCTION_VAR is
3469 | with a fresh SSA name representing the result, i.e., the length. */ |
3470 | |
3471 | static void |
3472 | generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var, |
3473 | tree base, tree load_type, |
3474 | tree start_len, location_t loc) |
3475 | { |
3476 | gimple_seq seq = NULL; |
3477 | |
3478 | tree start = force_gimple_operand (base, &seq, true, NULL_TREE); |
3479 | tree zero = build_zero_cst (load_type); |
3480 | gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, start, zero); |
3481 | tree end = make_ssa_name (TREE_TYPE (base)); |
  gimple_call_set_lhs (fn_call, end);
  gimple_set_location (fn_call, loc);
3484 | gimple_seq_add_stmt (&seq, fn_call); |
3485 | |
3486 | /* Determine the number of elements between START and END by |
3487 | evaluating (END - START) / sizeof (*START). */ |
3488 | tree diff = make_ssa_name (ptrdiff_type_node); |
3489 | gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, start); |
3490 | gimple_seq_add_stmt (&seq, diff_stmt); |
3491 | /* Let SIZE be the size of each character. */ |
  tree size = gimple_convert (&seq, ptrdiff_type_node,
3493 | TYPE_SIZE_UNIT (load_type)); |
3494 | tree count = make_ssa_name (ptrdiff_type_node); |
3495 | gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, size); |
3496 | gimple_seq_add_stmt (&seq, count_stmt); |
3497 | |
  generate_strlen_builtin_1 (loop, seq, reduction_var, count,
3499 | TYPE_MODE (load_type), |
3500 | start_len); |
3501 | } |
3502 | |
3503 | /* Return true if we can count at least as many characters by taking pointer |
3504 | difference as we can count via reduction_var without an overflow. Thus |
3505 | compute 2^n < (2^(m-1) / s) where n = TYPE_PRECISION (reduction_var_type), |
3506 | m = TYPE_PRECISION (ptrdiff_type_node), and s = size of each character. */ |
3507 | static bool |
3508 | reduction_var_overflows_first (tree reduction_var_type, tree load_type) |
3509 | { |
  widest_int n2 = wi::lshift (1, TYPE_PRECISION (reduction_var_type));
  widest_int m2 = wi::lshift (1, TYPE_PRECISION (ptrdiff_type_node) - 1);
  widest_int s = wi::to_widest (TYPE_SIZE_UNIT (load_type));
  return wi::ltu_p (n2, wi::udiv_trunc (m2, s));
3514 | } |
3515 | |
3516 | static gimple * |
3517 | determine_reduction_stmt_1 (const loop_p loop, const basic_block *bbs) |
3518 | { |
3519 | gimple *reduction_stmt = NULL; |
3520 | |
3521 | for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i) |
3522 | { |
3523 | basic_block bb = bbs[i]; |
3524 | |
      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	   gsi_next_nondebug (&bsi))
3527 | { |
3528 | gphi *phi = bsi.phi (); |
	  if (virtual_operand_p (gimple_phi_result (phi)))
3530 | continue; |
	  if (stmt_has_scalar_dependences_outside_loop (loop, phi))
3532 | { |
3533 | if (reduction_stmt) |
3534 | return NULL; |
3535 | reduction_stmt = phi; |
3536 | } |
3537 | } |
3538 | |
      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	   gsi_next_nondebug (&bsi), ++ninsns)
3541 | { |
3542 | /* Bail out early for loops which are unlikely to match. */ |
3543 | if (ninsns > 16) |
3544 | return NULL; |
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_clobber_p (stmt))
	    continue;
	  if (gimple_code (stmt) == GIMPLE_LABEL)
3549 | continue; |
3550 | if (gimple_has_volatile_ops (stmt)) |
3551 | return NULL; |
3552 | if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) |
3553 | { |
3554 | if (reduction_stmt) |
3555 | return NULL; |
3556 | reduction_stmt = stmt; |
3557 | } |
3558 | } |
3559 | } |
3560 | |
3561 | return reduction_stmt; |
3562 | } |
3563 | |
3564 | /* If LOOP has a single non-volatile reduction statement, then return a pointer |
3565 | to it. Otherwise return NULL. */ |
3566 | static gimple * |
3567 | determine_reduction_stmt (const loop_p loop) |
3568 | { |
3569 | basic_block *bbs = get_loop_body (loop); |
3570 | gimple *reduction_stmt = determine_reduction_stmt_1 (loop, bbs); |
3571 | XDELETEVEC (bbs); |
3572 | return reduction_stmt; |
3573 | } |
3574 | |
3575 | /* Transform loops which mimic the effects of builtins rawmemchr or strlen and |
3576 | replace them accordingly. For example, a loop of the form |
3577 | |
3578 | for (; *p != 42; ++p); |
3579 | |
3580 | is replaced by |
3581 | |
3582 | p = rawmemchr<MODE> (p, 42); |
3583 | |
3584 | under the assumption that rawmemchr is available for a particular MODE. |
3585 | Another example is |
3586 | |
3587 | int i; |
3588 | for (i = 42; s[i]; ++i); |
3589 | |
3590 | which is replaced by |
3591 | |
3592 | i = (int)strlen (&s[42]) + 42; |
3593 | |
3594 | for some character array S. In case array S is not of type character array |
3595 | we end up with |
3596 | |
3597 | i = (int)(rawmemchr<MODE> (&s[42], 0) - &s[42]) + 42; |
3598 | |
3599 | assuming that rawmemchr is available for a particular MODE. */ |
3600 | |
3601 | bool |
3602 | loop_distribution::transform_reduction_loop (loop_p loop) |
3603 | { |
3604 | gimple *reduction_stmt; |
3605 | data_reference_p load_dr = NULL, store_dr = NULL; |
3606 | |
3607 | edge e = single_exit (loop); |
  gcond *cond = safe_dyn_cast <gcond *> (*gsi_last_bb (e->src));
3609 | if (!cond) |
3610 | return false; |
3611 | /* Ensure loop condition is an (in)equality test and loop is exited either if |
3612 | the inequality test fails or the equality test succeeds. */ |
  if (!(e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == NE_EXPR)
      && !(e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == EQ_EXPR))
3615 | return false; |
3616 | /* A limitation of the current implementation is that we only support |
3617 | constant patterns in (in)equality tests. */ |
  tree pattern = gimple_cond_rhs (cond);
3619 | if (TREE_CODE (pattern) != INTEGER_CST) |
3620 | return false; |
3621 | |
3622 | reduction_stmt = determine_reduction_stmt (loop); |
3623 | |
3624 | /* A limitation of the current implementation is that we require a reduction |
3625 | statement. Therefore, loops without a reduction statement as in the |
3626 | following are not recognized: |
3627 | int *p; |
3628 | void foo (void) { for (; *p; ++p); } */ |
3629 | if (reduction_stmt == NULL) |
3630 | return false; |
3631 | |
3632 | /* Reduction variables are guaranteed to be SSA names. */ |
3633 | tree reduction_var; |
  switch (gimple_code (reduction_stmt))
3635 | { |
3636 | case GIMPLE_ASSIGN: |
3637 | case GIMPLE_PHI: |
3638 | reduction_var = gimple_get_lhs (reduction_stmt); |
3639 | break; |
3640 | default: |
3641 | /* Bail out e.g. for GIMPLE_CALL. */ |
3642 | return false; |
3643 | } |
3644 | |
3645 | struct graph *rdg = build_rdg (loop, NULL); |
3646 | if (rdg == NULL) |
3647 | { |
3648 | if (dump_file && (dump_flags & TDF_DETAILS)) |
	fprintf (dump_file,
		 "Loop %d not transformed: failed to build the RDG.\n",
3651 | loop->num); |
3652 | |
3653 | return false; |
3654 | } |
3655 | auto_bitmap partition_stmts; |
3656 | bitmap_set_range (partition_stmts, 0, rdg->n_vertices); |
  find_single_drs (loop, rdg, partition_stmts, &store_dr, &load_dr);
3658 | free_rdg (rdg); |
3659 | |
3660 | /* Bail out if there is no single load. */ |
3661 | if (load_dr == NULL) |
3662 | return false; |
3663 | |
3664 | /* Reaching this point we have a loop with a single reduction variable, |
3665 | a single load, and an optional single store. */ |
3666 | |
3667 | tree load_ref = DR_REF (load_dr); |
3668 | tree load_type = TREE_TYPE (load_ref); |
3669 | tree load_access_base = build_fold_addr_expr (load_ref); |
3670 | tree load_access_size = TYPE_SIZE_UNIT (load_type); |
3671 | affine_iv load_iv, reduction_iv; |
3672 | |
3673 | if (!INTEGRAL_TYPE_P (load_type) |
      || !type_has_mode_precision_p (load_type))
3675 | return false; |
3676 | |
3677 | /* We already ensured that the loop condition tests for (in)equality where the |
3678 | rhs is a constant pattern. Now ensure that the lhs is the result of the |
3679 | load. */ |
  if (gimple_cond_lhs (cond) != gimple_assign_lhs (DR_STMT (load_dr)))
3681 | return false; |
3682 | |
3683 | /* Bail out if no affine induction variable with constant step can be |
3684 | determined. */ |
3685 | if (!simple_iv (loop, loop, load_access_base, &load_iv, false)) |
3686 | return false; |
3687 | |
3688 | /* Bail out if memory accesses are not consecutive or not growing. */ |
  if (!operand_equal_p (load_iv.step, load_access_size, 0))
3690 | return false; |
3691 | |
3692 | if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false)) |
3693 | return false; |
3694 | |
  /* Handle rawmemchr-like loops.  */
3696 | if (operand_equal_p (load_iv.base, reduction_iv.base) |
3697 | && operand_equal_p (load_iv.step, reduction_iv.step)) |
3698 | { |
3699 | if (store_dr) |
3700 | { |
3701 | /* Ensure that we store to X and load from X+I where I>0. */ |
3702 | if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR |
3703 | || !integer_onep (TREE_OPERAND (load_iv.base, 1))) |
3704 | return false; |
3705 | tree ptr_base = TREE_OPERAND (load_iv.base, 0); |
3706 | if (TREE_CODE (ptr_base) != SSA_NAME) |
3707 | return false; |
3708 | gimple *def = SSA_NAME_DEF_STMT (ptr_base); |
	  if (!gimple_assign_single_p (def)
	      || gimple_assign_rhs1 (def) != DR_REF (store_dr))
3711 | return false; |
3712 | /* Ensure that the reduction value is stored. */ |
3713 | if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var) |
3714 | return false; |
3715 | } |
3716 | /* Bail out if target does not provide rawmemchr for a certain mode. */ |
3717 | machine_mode mode = TYPE_MODE (load_type); |
      if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing)
3719 | return false; |
3720 | location_t loc = gimple_location (DR_STMT (load_dr)); |
      generate_rawmemchr_builtin (loop, reduction_var, store_dr, load_iv.base,
				  pattern, loc);
3723 | return true; |
3724 | } |
3725 | |
  /* Handle strlen-like loops.  */
3727 | if (store_dr == NULL |
3728 | && integer_zerop (pattern) |
3729 | && INTEGRAL_TYPE_P (TREE_TYPE (reduction_var)) |
3730 | && TREE_CODE (reduction_iv.base) == INTEGER_CST |
3731 | && TREE_CODE (reduction_iv.step) == INTEGER_CST |
3732 | && integer_onep (reduction_iv.step)) |
3733 | { |
3734 | location_t loc = gimple_location (DR_STMT (load_dr)); |
3735 | tree reduction_var_type = TREE_TYPE (reduction_var); |
3736 | /* While determining the length of a string an overflow might occur. |
3737 | If an overflow only occurs in the loop implementation and not in the |
3738 | strlen implementation, then either the overflow is undefined or the |
3739 | truncated result of strlen equals the one of the loop. Otherwise if |
3740 | an overflow may also occur in the strlen implementation, then |
3741 | replacing a loop by a call to strlen is sound whenever we ensure that |
3742 | if an overflow occurs in the strlen implementation, then also an |
3743 | overflow occurs in the loop implementation which is undefined. It |
3744 | seems reasonable to relax this and assume that the strlen |
3745 | implementation cannot overflow in case sizetype is big enough in the |
3746 | sense that an overflow can only happen for string objects which are |
3747 | bigger than half of the address space; at least for 32-bit targets and |
3748 | up. |
3749 | |
3750 | For strlen which makes use of rawmemchr the maximal length of a string |
3751 | which can be determined without an overflow is PTRDIFF_MAX / S where |
3752 | each character has size S. Since an overflow for ptrdiff type is |
3753 | undefined we have to make sure that if an overflow occurs, then an |
3754 | overflow occurs in the loop implementation, too, and this is |
	 undefined, too.  As before, we relax this and assume that no
3756 | string object is larger than half of the address space; at least for |
3757 | 32-bit targets and up. */ |
3758 | if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node) |
3759 | && TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node) |
3760 | && ((TYPE_PRECISION (sizetype) >= TYPE_PRECISION (ptr_type_node) - 1 |
3761 | && TYPE_PRECISION (ptr_type_node) >= 32) |
3762 | || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type) |
3763 | && TYPE_PRECISION (reduction_var_type) <= TYPE_PRECISION (sizetype))) |
	  && builtin_decl_implicit (BUILT_IN_STRLEN))
	generate_strlen_builtin (loop, reduction_var, load_iv.base,
				 reduction_iv.base, loc);
      else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type))
3768 | != CODE_FOR_nothing |
3769 | && ((TYPE_PRECISION (ptrdiff_type_node) == TYPE_PRECISION (ptr_type_node) |
3770 | && TYPE_PRECISION (ptrdiff_type_node) >= 32) |
3771 | || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type) |
3772 | && reduction_var_overflows_first (reduction_var_type, load_type)))) |
	generate_strlen_builtin_using_rawmemchr (loop, reduction_var,
						 load_iv.base,
						 load_type,
						 reduction_iv.base, loc);
3777 | else |
3778 | return false; |
3779 | return true; |
3780 | } |
3781 | |
3782 | return false; |
3783 | } |
3784 | |
3785 | /* Given innermost LOOP, return the outermost enclosing loop that forms a |
3786 | perfect loop nest. */ |
3787 | |
3788 | static class loop * |
3789 | prepare_perfect_loop_nest (class loop *loop) |
3790 | { |
3791 | class loop *outer = loop_outer (loop); |
3792 | tree niters = number_of_latch_executions (loop); |
3793 | |
3794 | /* TODO: We only support the innermost 3-level loop nest distribution |
3795 | because of compilation time issue for now. This should be relaxed |
3796 | in the future. Note we only allow 3-level loop nest distribution |
3797 | when parallelizing loops. */ |
3798 | while ((loop->inner == NULL |
3799 | || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1)) |
	 && loop_outer (outer)
3801 | && outer->inner == loop && loop->next == NULL |
3802 | && single_exit (outer) |
3803 | && !chrec_contains_symbols_defined_in_loop (niters, outer->num) |
3804 | && (niters = number_of_latch_executions (outer)) != NULL_TREE |
3805 | && niters != chrec_dont_know) |
3806 | { |
3807 | loop = outer; |
3808 | outer = loop_outer (loop); |
3809 | } |
3810 | |
3811 | return loop; |
3812 | } |
3813 | |
3814 | |
3815 | unsigned int |
3816 | loop_distribution::execute (function *fun) |
3817 | { |
3818 | bool changed = false; |
3819 | basic_block bb; |
3820 | control_dependences *cd = NULL; |
3821 | auto_vec<loop_p> loops_to_be_destroyed; |
3822 | |
  if (number_of_loops (fun) <= 1)
3824 | return 0; |
3825 | |
3826 | bb_top_order_init (); |
3827 | |
3828 | FOR_ALL_BB_FN (bb, fun) |
3829 | { |
3830 | gimple_stmt_iterator gsi; |
      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
3835 | } |
3836 | |
3837 | /* We can at the moment only distribute non-nested loops, thus restrict |
3838 | walking to innermost loops. */ |
3839 | for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST)) |
3840 | { |
      /* Don't distribute loops with multiple exit edges, or cold loops
	 when not doing pattern detection.  */
3843 | if (!single_exit (loop) |
3844 | || (!flag_tree_loop_distribute_patterns |
3845 | && !optimize_loop_for_speed_p (loop))) |
3846 | continue; |
3847 | |
3848 | /* If niters is unknown don't distribute loop but rather try to transform |
3849 | it to a call to a builtin. */ |
3850 | tree niters = number_of_latch_executions (loop); |
3851 | if (niters == NULL_TREE || niters == chrec_dont_know) |
3852 | { |
	  datarefs_vec.create (20);
3854 | if (flag_tree_loop_distribute_patterns |
3855 | && transform_reduction_loop (loop)) |
3856 | { |
3857 | changed = true; |
	      loops_to_be_destroyed.safe_push (loop);
3859 | if (dump_enabled_p ()) |
3860 | { |
3861 | dump_user_location_t loc = find_loop_location (loop); |
3862 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, |
3863 | loc, "Loop %d transformed into a builtin.\n" , |
3864 | loop->num); |
3865 | } |
3866 | } |
3867 | free_data_refs (datarefs_vec); |
3868 | continue; |
3869 | } |
3870 | |
3871 | /* Get the perfect loop nest for distribution. */ |
3872 | loop = prepare_perfect_loop_nest (loop); |
3873 | for (; loop; loop = loop->inner) |
3874 | { |
3875 | auto_vec<gimple *> work_list; |
	  if (!find_seed_stmts_for_distribution (loop, &work_list))
3877 | continue; |
3878 | |
	  const char *str = loop->inner ? " nest" : "";
3880 | dump_user_location_t loc = find_loop_location (loop); |
3881 | if (!cd) |
3882 | { |
3883 | calculate_dominance_info (CDI_DOMINATORS); |
3884 | calculate_dominance_info (CDI_POST_DOMINATORS); |
3885 | cd = new control_dependences (); |
3886 | free_dominance_info (CDI_POST_DOMINATORS); |
3887 | } |
3888 | |
3889 | bool destroy_p; |
3890 | int nb_generated_loops, nb_generated_calls; |
3891 | bool only_patterns = !optimize_loop_for_speed_p (loop) |
3892 | || !flag_tree_loop_distribution; |
	  /* Do not try to distribute loops that are not expected to
	     iterate.  */
3894 | if (!only_patterns) |
3895 | { |
3896 | HOST_WIDE_INT iterations = estimated_loop_iterations_int (loop); |
3897 | if (iterations < 0) |
3898 | iterations = likely_max_loop_iterations_int (loop); |
3899 | if (!iterations) |
3900 | only_patterns = true; |
3901 | } |
3902 | nb_generated_loops |
	    = distribute_loop (loop, work_list, cd, &nb_generated_calls,
			       &destroy_p, only_patterns);
3905 | if (destroy_p) |
	    loops_to_be_destroyed.safe_push (loop);
3907 | |
3908 | if (nb_generated_loops + nb_generated_calls > 0) |
3909 | { |
3910 | changed = true; |
3911 | if (dump_enabled_p ()) |
3912 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, |
3913 | loc, "Loop%s %d distributed: split to %d loops " |
3914 | "and %d library calls.\n" , str, loop->num, |
3915 | nb_generated_loops, nb_generated_calls); |
3916 | |
3917 | break; |
3918 | } |
3919 | |
3920 | if (dump_file && (dump_flags & TDF_DETAILS)) |
3921 | fprintf (stream: dump_file, format: "Loop%s %d not distributed.\n" , str, loop->num); |
3922 | } |
3923 | } |
3924 | |
3925 | if (cd) |
3926 | delete cd; |
3927 | |
3928 | if (bb_top_order_index != NULL) |
3929 | bb_top_order_destroy (); |
3930 | |
3931 | if (changed) |
3932 | { |
3933 | /* Destroy loop bodies that could not be reused. Do this late as we |
	 otherwise can end up referring to stale data in control dependences.  */
3935 | unsigned i; |
3936 | class loop *loop; |
3937 | FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop) |
3938 | destroy_loop (loop); |
3939 | |
3940 | /* Cached scalar evolutions now may refer to wrong or non-existing |
3941 | loops. */ |
3942 | scev_reset (); |
3943 | mark_virtual_operands_for_renaming (fun); |
3944 | rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); |
3945 | } |
3946 | |
3947 | checking_verify_loop_structure (); |
3948 | |
3949 | return changed ? TODO_cleanup_cfg : 0; |
3950 | } |
3951 | |
3952 | |
3953 | /* Distribute all loops in the current function. */ |
3954 | |
3955 | namespace { |
3956 | |
3957 | const pass_data pass_data_loop_distribution = |
3958 | { |
  GIMPLE_PASS, /* type */
  "ldist", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
3968 | }; |
3969 | |
3970 | class pass_loop_distribution : public gimple_opt_pass |
3971 | { |
3972 | public: |
3973 | pass_loop_distribution (gcc::context *ctxt) |
3974 | : gimple_opt_pass (pass_data_loop_distribution, ctxt) |
3975 | {} |
3976 | |
3977 | /* opt_pass methods: */ |
3978 | bool gate (function *) final override |
3979 | { |
3980 | return flag_tree_loop_distribution |
3981 | || flag_tree_loop_distribute_patterns; |
3982 | } |
3983 | |
3984 | unsigned int execute (function *) final override; |
3985 | |
3986 | }; // class pass_loop_distribution |
3987 | |
3988 | unsigned int |
3989 | pass_loop_distribution::execute (function *fun) |
3990 | { |
3991 | return loop_distribution ().execute (fun); |
3992 | } |
3993 | |
3994 | } // anon namespace |
3995 | |
3996 | gimple_opt_pass * |
3997 | make_pass_loop_distribution (gcc::context *ctxt) |
3998 | { |
3999 | return new pass_loop_distribution (ctxt); |
4000 | } |
4001 | |