/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2023 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "target.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "insn-opinit.h"
#include "recog.h"

/* Return the maximum number of instructions a cpu can issue.  */

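/* This is the x86 implementation of the TARGET_SCHED_ISSUE_RATE hook
   (registered in i386.cc); the scheduler treats the returned value as the
   number of insns that can start executing in one cycle.  */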
int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
    case PROCESSOR_LUJIAZUI:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_ZNVER4:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_TREMONT:
    case PROCESSOR_SKYLAKE:
    case PROCESSOR_SKYLAKE_AVX512:
    case PROCESSOR_CASCADELAKE:
    case PROCESSOR_CANNONLAKE:
    case PROCESSOR_ALDERLAKE:
    case PROCESSOR_YONGFENG:
    case PROCESSOR_GENERIC:
      return 4;

    case PROCESSOR_ICELAKE_CLIENT:
    case PROCESSOR_ICELAKE_SERVER:
    case PROCESSOR_TIGERLAKE:
    case PROCESSOR_COOPERLAKE:
    case PROCESSOR_ROCKETLAKE:
      return 5;

    case PROCESSOR_SAPPHIRERAPIDS:
      return 6;

    default:
      return 1;
    }
}

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */

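/* For example, on the in-order Pentium the pair
       mov  %esi, %ebx
       mov  (%ebx), %eax
   hits an address generation interlock (AGI): the load's address register
   is written by the immediately preceding insn.  */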
bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
        rtx addr = XEXP (recog_data.operand[i], 0);
        if (modified_in_p (addr, set_insn) != 0)
          {
            /* No AGI stall if SET_INSN is a push or pop and USE_INSN
               has SP based memory (unless index reg is modified in a pop).  */
            rtx set = single_set (set_insn);
            if (set
                && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
                    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
              {
                struct ix86_address parts;
                if (ix86_decompose_address (addr, &parts)
                    && parts.base == stack_pointer_rtx
                    && (parts.index == NULL_RTX
                        || MEM_P (SET_DEST (set))
                        || !modified_in_p (parts.index, set_insn)))
                  return false;
              }
            return true;
          }
        return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing set by DEP_INSN.  */

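/* ix86_adjust_cost uses this on the Pentium/Lakemont path to let a
   flags-consuming insn (setcc, cmov, conditional branch) pair with the
   compare that produced the flags, dropping the dependence cost to zero.  */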
static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
           && XVECLEN (PATTERN (dep_insn), 0) == 2
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if addr is found in insn.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
        return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
        {
        case 'e':
          if (exact_dependency_1 (addr, XEXP (insn, i)))
            return true;
          break;
        case 'E':
          for (j = 0; j < XVECLEN (insn, i); j++)
            if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
              return true;
          break;
        }
    }
  return false;
}

/* Return true if there is an exact dependency between STORE and LOAD,
   i.e. the same memory address is used in both.  */
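/* For instance, a store "movw %ax, 6(%rsp)" followed by a load
   "movw 6(%rsp), %dx" references the identical address RTX, so the pair
   qualifies (purely illustrative example).  */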
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}


/* This function corrects the value of COST (latency) based on the
   relationship between INSN and DEP_INSN through a dependence of type
   DEP_TYPE, and strength DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that the values
   of registers used to compute the address of a memory operand need to be
   ready earlier than the values of registers used in the actual
   operation.  */

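/* This is the x86 implementation of the TARGET_SCHED_ADJUST_COST hook
   (registered in i386.cc).  Returning a smaller cost tells the scheduler
   that INSN may be scheduled closer to DEP_INSN.  */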
int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
                  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
        {
          rtx addr = PATTERN (insn);

          if (GET_CODE (addr) == PARALLEL)
            addr = XVECEXP (addr, 0, 0);

          gcc_assert (GET_CODE (addr) == SET);

          addr = SET_SRC (addr);
          if (modified_in_p (addr, dep_insn))
            cost += 1;
        }
      else if (ix86_agi_dependent (dep_insn, insn))
        cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
        cost = 0;

      /* Floating point stores require the value to be ready one cycle
         earlier.  */
      if (insn_type == TYPE_FMOV
          && get_attr_memory (insn) == MEMORY_STORE
          && !ix86_agi_dependent (dep_insn, insn))
        cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
          && (set = single_set (dep_insn)) != NULL_RTX
          && (set2 = single_set (insn)) != NULL_RTX
          && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
          && MEM_P (SET_DEST (set2)))
        cost += 1;

      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim that moves take one cycle, as the core can issue one
             load at a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 1)
            cost--;
        }
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
         the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim that moves take one cycle, as the core can issue one
             load at a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 2)
            cost -= 2;
          else
            cost = 1;
        }
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost = 3;

          /* Because of the difference between the length of the integer and
             floating point unit pipeline preparation stages, the memory
             operands for floating point are cheaper.

             ??? For Athlon the difference is most probably 2.  */
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 3;
          else
            loadcost = TARGET_CPU_P (ATHLON) ? 2 : 0;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_ZNVER4:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost;

          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 4;
          else
            loadcost = 7;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_YONGFENG:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_LUJIAZUI:
      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          int loadcost = 4;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_TREMONT:
    case PROCESSOR_ALDERLAKE:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          if (cost >= 4)
            cost -= 4;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
        return cost;

      /* Increase the cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
        {
          enum attr_unit unit = get_attr_unit (dep_insn);
          if (unit == UNIT_INTEGER && cost == 1)
            {
              if (memory == MEMORY_LOAD)
                cost = 3;
              else
                {
                  /* Increase the cost of ld/st for short int types only
                     because of a store forwarding issue.  */
                  rtx set = single_set (dep_insn);
                  if (set && (GET_MODE (SET_DEST (set)) == QImode
                              || GET_MODE (SET_DEST (set)) == HImode))
                    {
                      /* Increase the cost of the store/load pair if an
                         exact dependence exists and INSN is a load.  */
                      enum attr_memory insn_memory = get_attr_memory (insn);
                      if (insn_memory == MEMORY_LOAD
                          && exact_store_load_dependency (dep_insn, insn))
                        cost = 3;
                    }
                }
            }
        }
      break;

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

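/* Implements the TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD hook
   (registered in i386.cc).  */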
int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched's max_issue () to look ahead as far as
     the number of instructions that can be executed in one cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for the pre-reload schedule, to save compile
     time.  */
  return 0;
}

/* Return true if the target platform supports macro-fusion.  */

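/* ix86_macro_fusion_p and ix86_macro_fusion_pair_p below implement the
   TARGET_SCHED_MACRO_FUSION_P and TARGET_SCHED_MACRO_FUSION_PAIR_P hooks;
   the scheduler uses them to keep fusible compare-and-branch pairs
   adjacent.  */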
bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to the
   "Intel Architectures Optimization Reference Manual".  */

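/* For example, on Intel cores that support macro-fusion a compare and the
   conditional jump that consumes its flags, such as
       cmp  %eax, %edx
       jne  .L3
   decode into a single fused macro-op, so it pays to keep the pair
   adjacent.  */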
bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;
  enum attr_type condgen_type;

  if (!any_condjump_p (condjmp))
    return false;

  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  condgen_type = get_attr_type (condgen);
  if (condgen_type == TYPE_MULTI
      && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
      && TARGET_FUSE_ALU_AND_BRANCH)
    {
      /* stack_protect_test_<mode> ends with a sub, which subtracts
         a non-rip special memory operand from a GPR.  */
      src = NULL_RTX;
      alu_set = XVECEXP (PATTERN (condgen), 0, 1);
      goto handle_stack_protect_test;
    }
  else if (condgen_type != TYPE_TEST
           && condgen_type != TYPE_ICMP
           && condgen_type != TYPE_INCDEC
           && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
        if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
          {
            rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
            if (GET_CODE (set_src) == COMPARE)
              compare_set = XVECEXP (pat, 0, i);
            else
              alu_set = XVECEXP (pat, 0, i);
          }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
        return false;
    }

 handle_stack_protect_test:
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
    return true;

  /* The following handles macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for alu op with memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (condgen_type == TYPE_INCDEC
      && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
    return false;

  return true;
}