1/* Copyright (C) 1988-2023 Free Software Foundation, Inc.
2
3This file is part of GCC.
4
5GCC is free software; you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation; either version 3, or (at your option)
8any later version.
9
10GCC is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with GCC; see the file COPYING3. If not see
17<http://www.gnu.org/licenses/>. */
18
19#define IN_TARGET_CODE 1
20
21#include "config.h"
22#include "system.h"
23#include "coretypes.h"
24#include "backend.h"
25#include "rtl.h"
26#include "tree.h"
27#include "memmodel.h"
28#include "gimple.h"
29#include "cfghooks.h"
30#include "cfgloop.h"
31#include "df.h"
32#include "tm_p.h"
33#include "stringpool.h"
34#include "expmed.h"
35#include "optabs.h"
36#include "regs.h"
37#include "emit-rtl.h"
38#include "recog.h"
39#include "cgraph.h"
40#include "diagnostic.h"
41#include "cfgbuild.h"
42#include "alias.h"
43#include "fold-const.h"
44#include "attribs.h"
45#include "calls.h"
46#include "stor-layout.h"
47#include "varasm.h"
48#include "output.h"
49#include "insn-attr.h"
50#include "flags.h"
51#include "except.h"
52#include "explow.h"
53#include "expr.h"
54#include "cfgrtl.h"
55#include "common/common-target.h"
56#include "langhooks.h"
57#include "reload.h"
58#include "gimplify.h"
59#include "dwarf2.h"
60#include "tm-constrs.h"
61#include "cselib.h"
62#include "sched-int.h"
63#include "opts.h"
64#include "tree-pass.h"
65#include "context.h"
66#include "pass_manager.h"
67#include "target-globals.h"
68#include "gimple-iterator.h"
69#include "shrink-wrap.h"
70#include "builtins.h"
71#include "rtl-iter.h"
72#include "tree-iterator.h"
73#include "dbgcnt.h"
74#include "case-cfn-macros.h"
75#include "dojump.h"
76#include "fold-const-call.h"
77#include "tree-vrp.h"
78#include "tree-ssanames.h"
79#include "selftest.h"
80#include "selftest-rtl.h"
81#include "print-rtl.h"
82#include "intl.h"
83#include "ifcvt.h"
84#include "symbol-summary.h"
85#include "ipa-prop.h"
86#include "ipa-fnsummary.h"
87#include "wide-int-bitmask.h"
88#include "tree-vector-builder.h"
89#include "debug.h"
90#include "dwarf2out.h"
91#include "i386-options.h"
92#include "i386-builtins.h"
93#include "i386-expand.h"
94#include "asan.h"
95
96/* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
101
102void
103split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105{
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuse to split volatile memory addresses,
136 but we still have to handle it. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (outermode: half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), byte: 0);
157
158 rtx tmp = simplify_gen_subreg (outermode: half_mode, op,
159 GET_MODE (op) == VOIDmode
160 ? mode : GET_MODE (op), byte);
161 /* simplify_gen_subreg will return NULL RTX for the
162 high half of the paradoxical subreg. */
163 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
164 }
165 }
166}
167
168/* Emit the double word assignment DST = { LO, HI }. */
169
170void
171split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
172{
173 rtx dlo, dhi;
174 int deleted_move_count = 0;
175 split_double_mode (mode, operands: &dst, num: 1, lo_half: &dlo, hi_half: &dhi);
176 /* Constraints ensure that if both lo and hi are MEMs, then
177 dst has early-clobber and thus addresses of MEMs don't use
178 dlo/dhi registers. Otherwise if at least one of li and hi are MEMs,
179 dlo/dhi are registers. */
180 if (MEM_P (lo)
181 && rtx_equal_p (dlo, hi)
182 && reg_overlap_mentioned_p (dhi, lo))
183 {
184 /* If dlo is same as hi and lo's address uses dhi register,
185 code below would first emit_move_insn (dhi, hi)
186 and then emit_move_insn (dlo, lo). But the former
187 would invalidate lo's address. Load into dhi first,
188 then swap. */
189 emit_move_insn (dhi, lo);
190 lo = dhi;
191 }
192 else if (MEM_P (hi)
193 && !MEM_P (lo)
194 && !rtx_equal_p (dlo, lo)
195 && reg_overlap_mentioned_p (dlo, hi))
196 {
197 /* In this case, code below would first emit_move_insn (dlo, lo)
198 and then emit_move_insn (dhi, hi). But the former would
199 invalidate hi's address. */
200 if (rtx_equal_p (dhi, lo))
201 {
202 /* We can't load into dhi first, so load into dlo
203 first and we'll swap. */
204 emit_move_insn (dlo, hi);
205 hi = dlo;
206 }
207 else
208 {
209 /* Load into dhi first. */
210 emit_move_insn (dhi, hi);
211 hi = dhi;
212 }
213 }
214 if (!rtx_equal_p (dlo, hi))
215 {
216 if (!rtx_equal_p (dlo, lo))
217 emit_move_insn (dlo, lo);
218 else
219 deleted_move_count++;
220 if (!rtx_equal_p (dhi, hi))
221 emit_move_insn (dhi, hi);
222 else
223 deleted_move_count++;
224 }
225 else if (!rtx_equal_p (lo, dhi))
226 {
227 if (!rtx_equal_p (dhi, hi))
228 emit_move_insn (dhi, hi);
229 else
230 deleted_move_count++;
231 if (!rtx_equal_p (dlo, lo))
232 emit_move_insn (dlo, lo);
233 else
234 deleted_move_count++;
235 }
236 else if (mode == TImode)
237 emit_insn (gen_swapdi (dlo, dhi));
238 else
239 emit_insn (gen_swapsi (dlo, dhi));
240
241 if (deleted_move_count == 2)
242 emit_note (NOTE_INSN_DELETED);
243}
244
245
246/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
247 for the target. */
248
249void
250ix86_expand_clear (rtx dest)
251{
252 rtx tmp;
253
254 /* We play register width games, which are only valid after reload. */
255 gcc_assert (reload_completed);
256
257 /* Avoid HImode and its attendant prefix byte. */
258 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
259 dest = gen_rtx_REG (SImode, REGNO (dest));
260 tmp = gen_rtx_SET (dest, const0_rtx);
261
262 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
263 {
264 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
265 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
266 }
267
268 emit_insn (tmp);
269}
270
271/* Return true if V can be broadcasted from an integer of WIDTH bits
272 which is returned in VAL_BROADCAST. Otherwise, return false. */
273
274static bool
275ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
276 HOST_WIDE_INT &val_broadcast)
277{
278 wide_int val = wi::uhwi (val: v, HOST_BITS_PER_WIDE_INT);
279 val_broadcast = wi::extract_uhwi (x: val, bitpos: 0, width);
280 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
281 {
282 HOST_WIDE_INT each = wi::extract_uhwi (x: val, bitpos: i, width);
283 if (val_broadcast != each)
284 return false;
285 }
286 val_broadcast = sext_hwi (src: val_broadcast, prec: width);
287 return true;
288}
289
290/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
291
292static rtx
293ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
294{
295 /* Don't use integer vector broadcast if we can't move from GPR to SSE
296 register directly. */
297 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
298 return nullptr;
299
300 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
301 broadcast only if vector broadcast is available. */
302 if (!TARGET_AVX
303 || !CONST_WIDE_INT_P (op)
304 || standard_sse_constant_p (op, mode)
305 || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
306 != GET_MODE_BITSIZE (mode)))
307 return nullptr;
308
309 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
310 HOST_WIDE_INT val_broadcast;
311 scalar_int_mode broadcast_mode;
312 if (TARGET_AVX2
313 && ix86_broadcast (v: val, GET_MODE_BITSIZE (QImode),
314 val_broadcast))
315 broadcast_mode = QImode;
316 else if (TARGET_AVX2
317 && ix86_broadcast (v: val, GET_MODE_BITSIZE (HImode),
318 val_broadcast))
319 broadcast_mode = HImode;
320 else if (ix86_broadcast (v: val, GET_MODE_BITSIZE (SImode),
321 val_broadcast))
322 broadcast_mode = SImode;
323 else if (TARGET_64BIT
324 && ix86_broadcast (v: val, GET_MODE_BITSIZE (DImode),
325 val_broadcast))
326 broadcast_mode = DImode;
327 else
328 return nullptr;
329
330 /* Check if OP can be broadcasted from VAL. */
331 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
332 if (val != CONST_WIDE_INT_ELT (op, i))
333 return nullptr;
334
335 unsigned int nunits = (GET_MODE_SIZE (mode)
336 / GET_MODE_SIZE (broadcast_mode));
337 machine_mode vector_mode;
338 if (!mode_for_vector (broadcast_mode, nunits).exists (mode: &vector_mode))
339 gcc_unreachable ();
340 rtx target = gen_reg_rtx (vector_mode);
341 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
342 target,
343 GEN_INT (val_broadcast));
344 gcc_assert (ok);
345 target = lowpart_subreg (outermode: mode, op: target, innermode: vector_mode);
346 return target;
347}
348
349void
350ix86_expand_move (machine_mode mode, rtx operands[])
351{
352 rtx op0, op1;
353 rtx tmp, addend = NULL_RTX;
354 enum tls_model model;
355
356 op0 = operands[0];
357 op1 = operands[1];
358
359 /* Avoid complex sets of likely spilled hard registers before reload. */
360 if (!ix86_hardreg_mov_ok (op0, op1))
361 {
362 tmp = gen_reg_rtx (mode);
363 operands[0] = tmp;
364 ix86_expand_move (mode, operands);
365 operands[0] = op0;
366 operands[1] = tmp;
367 op1 = tmp;
368 }
369
370 switch (GET_CODE (op1))
371 {
372 case CONST:
373 tmp = XEXP (op1, 0);
374
375 if (GET_CODE (tmp) != PLUS
376 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
377 break;
378
379 op1 = XEXP (tmp, 0);
380 addend = XEXP (tmp, 1);
381 /* FALLTHRU */
382
383 case SYMBOL_REF:
384 model = SYMBOL_REF_TLS_MODEL (op1);
385
386 if (model)
387 op1 = legitimize_tls_address (x: op1, model, for_mov: true);
388 else if (ix86_force_load_from_GOT_p (op1))
389 {
390 /* Load the external function address via GOT slot to avoid PLT. */
391 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
392 (TARGET_64BIT
393 ? UNSPEC_GOTPCREL
394 : UNSPEC_GOT));
395 op1 = gen_rtx_CONST (Pmode, op1);
396 op1 = gen_const_mem (Pmode, op1);
397 set_mem_alias_set (op1, ix86_GOT_alias_set ());
398 }
399 else
400 {
401 tmp = legitimize_pe_coff_symbol (addr: op1, inreg: addend != NULL_RTX);
402 if (tmp)
403 {
404 op1 = tmp;
405 if (!addend)
406 break;
407 }
408 else
409 {
410 op1 = operands[1];
411 break;
412 }
413 }
414
415 if (addend)
416 {
417 op1 = force_operand (op1, NULL_RTX);
418 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
419 op0, 1, OPTAB_DIRECT);
420 }
421 else
422 op1 = force_operand (op1, op0);
423
424 if (op1 == op0)
425 return;
426
427 op1 = convert_to_mode (mode, op1, 1);
428
429 default:
430 break;
431
432 case SUBREG:
433 /* Transform TImode paradoxical SUBREG into zero_extendditi2. */
434 if (TARGET_64BIT
435 && mode == TImode
436 && SUBREG_P (op1)
437 && GET_MODE (SUBREG_REG (op1)) == DImode
438 && SUBREG_BYTE (op1) == 0)
439 op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
440 break;
441 }
442
443 if ((flag_pic || MACHOPIC_INDIRECT)
444 && symbolic_operand (op1, mode))
445 {
446 if (TARGET_MACHO && !TARGET_64BIT)
447 {
448#if TARGET_MACHO
449 /* dynamic-no-pic */
450 if (MACHOPIC_INDIRECT)
451 {
452 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
453 ? op0 : gen_reg_rtx (Pmode);
454 op1 = machopic_indirect_data_reference (op1, temp);
455 if (MACHOPIC_PURE)
456 op1 = machopic_legitimize_pic_address (op1, mode,
457 temp == op1 ? 0 : temp);
458 }
459 if (op0 != op1 && GET_CODE (op0) != MEM)
460 {
461 rtx insn = gen_rtx_SET (op0, op1);
462 emit_insn (insn);
463 return;
464 }
465 if (GET_CODE (op0) == MEM)
466 op1 = force_reg (Pmode, op1);
467 else
468 {
469 rtx temp = op0;
470 if (GET_CODE (temp) != REG)
471 temp = gen_reg_rtx (Pmode);
472 temp = legitimize_pic_address (op1, temp);
473 if (temp == op0)
474 return;
475 op1 = temp;
476 }
477 /* dynamic-no-pic */
478#endif
479 }
480 else
481 {
482 if (MEM_P (op0))
483 op1 = force_reg (mode, op1);
484 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
485 {
486 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
487 op1 = legitimize_pic_address (orig: op1, reg);
488 if (op0 == op1)
489 return;
490 op1 = convert_to_mode (mode, op1, 1);
491 }
492 }
493 }
494 else
495 {
496 if (MEM_P (op0)
497 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
498 || !push_operand (op0, mode))
499 && MEM_P (op1))
500 op1 = force_reg (mode, op1);
501
502 if (push_operand (op0, mode)
503 && ! general_no_elim_operand (op1, mode))
504 op1 = copy_to_mode_reg (mode, op1);
505
506 /* Force large constants in 64bit compilation into register
507 to get them CSEed. */
508 if (can_create_pseudo_p ()
509 && (mode == DImode) && TARGET_64BIT
510 && immediate_operand (op1, mode)
511 && !x86_64_zext_immediate_operand (op1, VOIDmode)
512 && !register_operand (op0, mode)
513 && optimize)
514 op1 = copy_to_mode_reg (mode, op1);
515
516 if (can_create_pseudo_p ())
517 {
518 if (CONST_DOUBLE_P (op1))
519 {
520 /* If we are loading a floating point constant to a
521 register, force the value to memory now, since we'll
522 get better code out the back end. */
523
524 op1 = validize_mem (force_const_mem (mode, op1));
525 if (!register_operand (op0, mode))
526 {
527 rtx temp = gen_reg_rtx (mode);
528 emit_insn (gen_rtx_SET (temp, op1));
529 emit_move_insn (op0, temp);
530 return;
531 }
532 }
533 else if (CONST_WIDE_INT_P (op1)
534 && GET_MODE_SIZE (mode) >= 16)
535 {
536 rtx tmp = ix86_convert_const_wide_int_to_broadcast
537 (GET_MODE (op0), op: op1);
538 if (tmp != nullptr)
539 op1 = tmp;
540 }
541 }
542 }
543
544 /* Special case inserting 64-bit values into a TImode register. */
545 if (TARGET_64BIT
546 /* Disable for -O0 (see PR110587) unless naked (PR110533). */
547 && (optimize || ix86_function_naked (fn: current_function_decl))
548 && (mode == DImode || mode == DFmode)
549 && SUBREG_P (op0)
550 && GET_MODE (SUBREG_REG (op0)) == TImode
551 && REG_P (SUBREG_REG (op0))
552 && REG_P (op1))
553 {
554 /* Use *insvti_lowpart_1 to set lowpart. */
555 if (SUBREG_BYTE (op0) == 0)
556 {
557 wide_int mask = wi::mask (width: 64, negate_p: true, precision: 128);
558 rtx tmp = immed_wide_int_const (mask, TImode);
559 op0 = SUBREG_REG (op0);
560 tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
561 if (mode == DFmode)
562 op1 = gen_lowpart (DImode, op1);
563 op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
564 op1 = gen_rtx_IOR (TImode, tmp, op1);
565 }
566 /* Use *insvti_highpart_1 to set highpart. */
567 else if (SUBREG_BYTE (op0) == 8)
568 {
569 wide_int mask = wi::mask (width: 64, negate_p: false, precision: 128);
570 rtx tmp = immed_wide_int_const (mask, TImode);
571 op0 = SUBREG_REG (op0);
572 tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
573 if (mode == DFmode)
574 op1 = gen_lowpart (DImode, op1);
575 op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
576 op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
577 op1 = gen_rtx_IOR (TImode, tmp, op1);
578 }
579 }
580
581 emit_insn (gen_rtx_SET (op0, op1));
582}
583
584/* OP is a memref of CONST_VECTOR, return scalar constant mem
585 if CONST_VECTOR is a vec_duplicate, else return NULL. */
586static rtx
587ix86_broadcast_from_constant (machine_mode mode, rtx op)
588{
589 int nunits = GET_MODE_NUNITS (mode);
590 if (nunits < 2)
591 return nullptr;
592
593 /* Don't use integer vector broadcast if we can't move from GPR to SSE
594 register directly. */
595 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
596 && INTEGRAL_MODE_P (mode))
597 return nullptr;
598
599 /* Convert CONST_VECTOR to a non-standard SSE constant integer
600 broadcast only if vector broadcast is available. */
601 if (!(TARGET_AVX2
602 || (TARGET_AVX
603 && (GET_MODE_INNER (mode) == SImode
604 || GET_MODE_INNER (mode) == DImode))
605 || FLOAT_MODE_P (mode))
606 || standard_sse_constant_p (op, mode))
607 return nullptr;
608
609 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
610 We can still put 64-bit integer constant in memory when
611 avx512 embed broadcast is available. */
612 if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
613 && (!TARGET_AVX512F
614 || (GET_MODE_SIZE (mode) == 64 && !TARGET_EVEX512)
615 || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
616 return nullptr;
617
618 if (GET_MODE_INNER (mode) == TImode)
619 return nullptr;
620
621 rtx constant = get_pool_constant (XEXP (op, 0));
622 if (GET_CODE (constant) != CONST_VECTOR)
623 return nullptr;
624
625 /* There could be some rtx like
626 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
627 but with "*.LC1" refer to V2DI constant vector. */
628 if (GET_MODE (constant) != mode)
629 {
630 constant = simplify_subreg (outermode: mode, op: constant, GET_MODE (constant),
631 byte: 0);
632 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
633 return nullptr;
634 }
635
636 rtx first = XVECEXP (constant, 0, 0);
637
638 for (int i = 1; i < nunits; ++i)
639 {
640 rtx tmp = XVECEXP (constant, 0, i);
641 /* Vector duplicate value. */
642 if (!rtx_equal_p (tmp, first))
643 return nullptr;
644 }
645
646 return first;
647}
648
649void
650ix86_expand_vector_move (machine_mode mode, rtx operands[])
651{
652 rtx op0 = operands[0], op1 = operands[1];
653 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
654 psABI since the biggest alignment is 4 byte for IA MCU psABI. */
655 unsigned int align = (TARGET_IAMCU
656 ? GET_MODE_BITSIZE (mode)
657 : GET_MODE_ALIGNMENT (mode));
658
659 if (push_operand (op0, VOIDmode))
660 op0 = emit_move_resolve_push (mode, op0);
661
662 /* Force constants other than zero into memory. We do not know how
663 the instructions used to build constants modify the upper 64 bits
664 of the register, once we have that information we may be able
665 to handle some of them more efficiently. */
666 if (can_create_pseudo_p ()
667 && (CONSTANT_P (op1)
668 || (SUBREG_P (op1)
669 && CONSTANT_P (SUBREG_REG (op1))))
670 && ((register_operand (op0, mode)
671 && !standard_sse_constant_p (op1, mode))
672 /* ix86_expand_vector_move_misalign() does not like constants. */
673 || (SSE_REG_MODE_P (mode)
674 && MEM_P (op0)
675 && MEM_ALIGN (op0) < align)))
676 {
677 if (SUBREG_P (op1))
678 {
679 machine_mode imode = GET_MODE (SUBREG_REG (op1));
680 rtx r = force_const_mem (imode, SUBREG_REG (op1));
681 if (r)
682 r = validize_mem (r);
683 else
684 r = force_reg (imode, SUBREG_REG (op1));
685 op1 = simplify_gen_subreg (outermode: mode, op: r, innermode: imode, SUBREG_BYTE (op1));
686 }
687 else
688 {
689 machine_mode mode = GET_MODE (op0);
690 rtx tmp = ix86_convert_const_wide_int_to_broadcast
691 (mode, op: op1);
692 if (tmp == nullptr)
693 op1 = validize_mem (force_const_mem (mode, op1));
694 else
695 op1 = tmp;
696 }
697 }
698
699 if (can_create_pseudo_p ()
700 && GET_MODE_SIZE (mode) >= 16
701 && VECTOR_MODE_P (mode)
702 && (MEM_P (op1)
703 && SYMBOL_REF_P (XEXP (op1, 0))
704 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
705 {
706 rtx first = ix86_broadcast_from_constant (mode, op: op1);
707 if (first != nullptr)
708 {
709 /* Broadcast to XMM/YMM/ZMM register from an integer
710 constant or scalar mem. */
711 op1 = gen_reg_rtx (mode);
712 if (FLOAT_MODE_P (mode)
713 || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
714 first = force_const_mem (GET_MODE_INNER (mode), first);
715 bool ok = ix86_expand_vector_init_duplicate (false, mode,
716 op1, first);
717 gcc_assert (ok);
718 emit_move_insn (op0, op1);
719 return;
720 }
721 }
722
723 /* We need to check memory alignment for SSE mode since attribute
724 can make operands unaligned. */
725 if (can_create_pseudo_p ()
726 && SSE_REG_MODE_P (mode)
727 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
728 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
729 {
730 rtx tmp[2];
731
732 /* ix86_expand_vector_move_misalign() does not like both
733 arguments in memory. */
734 if (!register_operand (op0, mode)
735 && !register_operand (op1, mode))
736 {
737 rtx scratch = gen_reg_rtx (mode);
738 emit_move_insn (scratch, op1);
739 op1 = scratch;
740 }
741
742 tmp[0] = op0; tmp[1] = op1;
743 ix86_expand_vector_move_misalign (mode, tmp);
744 return;
745 }
746
747 /* Special case TImode to 128-bit vector conversions via V2DI. */
748 if (VECTOR_MODE_P (mode)
749 && GET_MODE_SIZE (mode) == 16
750 && SUBREG_P (op1)
751 && GET_MODE (SUBREG_REG (op1)) == TImode
752 && TARGET_64BIT && TARGET_SSE
753 && can_create_pseudo_p ())
754 {
755 rtx tmp = gen_reg_rtx (V2DImode);
756 rtx lo = gen_reg_rtx (DImode);
757 rtx hi = gen_reg_rtx (DImode);
758 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
759 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
760 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
761 emit_move_insn (op0, gen_lowpart (mode, tmp));
762 return;
763 }
764
765 /* If operand0 is a hard register, make operand1 a pseudo. */
766 if (can_create_pseudo_p ()
767 && !ix86_hardreg_mov_ok (op0, op1))
768 {
769 rtx tmp = gen_reg_rtx (GET_MODE (op0));
770 emit_move_insn (tmp, op1);
771 emit_move_insn (op0, tmp);
772 return;
773 }
774
775 /* Make operand1 a register if it isn't already. */
776 if (can_create_pseudo_p ()
777 && !register_operand (op0, mode)
778 && !register_operand (op1, mode))
779 {
780 rtx tmp = gen_reg_rtx (GET_MODE (op0));
781 emit_move_insn (tmp, op1);
782 emit_move_insn (op0, tmp);
783 return;
784 }
785
786 emit_insn (gen_rtx_SET (op0, op1));
787}
788
789/* Split 32-byte AVX unaligned load and store if needed. */
790
791static void
792ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
793{
794 rtx m;
795 rtx (*extract) (rtx, rtx, rtx);
796 machine_mode mode;
797
798 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
799 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
800 {
801 emit_insn (gen_rtx_SET (op0, op1));
802 return;
803 }
804
805 rtx orig_op0 = NULL_RTX;
806 mode = GET_MODE (op0);
807 switch (GET_MODE_CLASS (mode))
808 {
809 case MODE_VECTOR_INT:
810 case MODE_INT:
811 if (mode != V32QImode)
812 {
813 if (!MEM_P (op0))
814 {
815 orig_op0 = op0;
816 op0 = gen_reg_rtx (V32QImode);
817 }
818 else
819 op0 = gen_lowpart (V32QImode, op0);
820 op1 = gen_lowpart (V32QImode, op1);
821 mode = V32QImode;
822 }
823 break;
824 case MODE_VECTOR_FLOAT:
825 break;
826 default:
827 gcc_unreachable ();
828 }
829
830 switch (mode)
831 {
832 default:
833 gcc_unreachable ();
834 case E_V32QImode:
835 extract = gen_avx_vextractf128v32qi;
836 mode = V16QImode;
837 break;
838 case E_V16BFmode:
839 extract = gen_avx_vextractf128v16bf;
840 mode = V8BFmode;
841 break;
842 case E_V16HFmode:
843 extract = gen_avx_vextractf128v16hf;
844 mode = V8HFmode;
845 break;
846 case E_V8SFmode:
847 extract = gen_avx_vextractf128v8sf;
848 mode = V4SFmode;
849 break;
850 case E_V4DFmode:
851 extract = gen_avx_vextractf128v4df;
852 mode = V2DFmode;
853 break;
854 }
855
856 if (MEM_P (op1))
857 {
858 rtx r = gen_reg_rtx (mode);
859 m = adjust_address (op1, mode, 0);
860 emit_move_insn (r, m);
861 m = adjust_address (op1, mode, 16);
862 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
863 emit_move_insn (op0, r);
864 }
865 else if (MEM_P (op0))
866 {
867 m = adjust_address (op0, mode, 0);
868 emit_insn (extract (m, op1, const0_rtx));
869 m = adjust_address (op0, mode, 16);
870 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
871 }
872 else
873 gcc_unreachable ();
874
875 if (orig_op0)
876 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
877}
878
879/* Implement the movmisalign patterns for SSE. Non-SSE modes go
880 straight to ix86_expand_vector_move. */
881/* Code generation for scalar reg-reg moves of single and double precision data:
882 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
883 movaps reg, reg
884 else
885 movss reg, reg
886 if (x86_sse_partial_reg_dependency == true)
887 movapd reg, reg
888 else
889 movsd reg, reg
890
891 Code generation for scalar loads of double precision data:
892 if (x86_sse_split_regs == true)
893 movlpd mem, reg (gas syntax)
894 else
895 movsd mem, reg
896
897 Code generation for unaligned packed loads of single precision data
898 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
899 if (x86_sse_unaligned_move_optimal)
900 movups mem, reg
901
902 if (x86_sse_partial_reg_dependency == true)
903 {
904 xorps reg, reg
905 movlps mem, reg
906 movhps mem+8, reg
907 }
908 else
909 {
910 movlps mem, reg
911 movhps mem+8, reg
912 }
913
914 Code generation for unaligned packed loads of double precision data
915 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
916 if (x86_sse_unaligned_move_optimal)
917 movupd mem, reg
918
919 if (x86_sse_split_regs == true)
920 {
921 movlpd mem, reg
922 movhpd mem+8, reg
923 }
924 else
925 {
926 movsd mem, reg
927 movhpd mem+8, reg
928 }
929 */
930
931void
932ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
933{
934 rtx op0, op1, m;
935
936 op0 = operands[0];
937 op1 = operands[1];
938
939 /* Use unaligned load/store for AVX512 or when optimizing for size. */
940 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
941 {
942 emit_insn (gen_rtx_SET (op0, op1));
943 return;
944 }
945
946 if (TARGET_AVX)
947 {
948 if (GET_MODE_SIZE (mode) == 32)
949 ix86_avx256_split_vector_move_misalign (op0, op1);
950 else
951 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
952 emit_insn (gen_rtx_SET (op0, op1));
953 return;
954 }
955
956 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
957 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
958 {
959 emit_insn (gen_rtx_SET (op0, op1));
960 return;
961 }
962
963 /* ??? If we have typed data, then it would appear that using
964 movdqu is the only way to get unaligned data loaded with
965 integer type. */
966 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
967 {
968 emit_insn (gen_rtx_SET (op0, op1));
969 return;
970 }
971
972 if (MEM_P (op1))
973 {
974 if (TARGET_SSE2 && mode == V2DFmode)
975 {
976 rtx zero;
977
978 /* When SSE registers are split into halves, we can avoid
979 writing to the top half twice. */
980 if (TARGET_SSE_SPLIT_REGS)
981 {
982 emit_clobber (op0);
983 zero = op0;
984 }
985 else
986 {
987 /* ??? Not sure about the best option for the Intel chips.
988 The following would seem to satisfy; the register is
989 entirely cleared, breaking the dependency chain. We
990 then store to the upper half, with a dependency depth
991 of one. A rumor has it that Intel recommends two movsd
992 followed by an unpacklpd, but this is unconfirmed. And
993 given that the dependency depth of the unpacklpd would
994 still be one, I'm not sure why this would be better. */
995 zero = CONST0_RTX (V2DFmode);
996 }
997
998 m = adjust_address (op1, DFmode, 0);
999 emit_insn (gen_sse2_loadlpd (op0, zero, m));
1000 m = adjust_address (op1, DFmode, 8);
1001 emit_insn (gen_sse2_loadhpd (op0, op0, m));
1002 }
1003 else
1004 {
1005 rtx t;
1006
1007 if (mode != V4SFmode)
1008 t = gen_reg_rtx (V4SFmode);
1009 else
1010 t = op0;
1011
1012 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
1013 emit_move_insn (t, CONST0_RTX (V4SFmode));
1014 else
1015 emit_clobber (t);
1016
1017 m = adjust_address (op1, V2SFmode, 0);
1018 emit_insn (gen_sse_loadlps (t, t, m));
1019 m = adjust_address (op1, V2SFmode, 8);
1020 emit_insn (gen_sse_loadhps (t, t, m));
1021 if (mode != V4SFmode)
1022 emit_move_insn (op0, gen_lowpart (mode, t));
1023 }
1024 }
1025 else if (MEM_P (op0))
1026 {
1027 if (TARGET_SSE2 && mode == V2DFmode)
1028 {
1029 m = adjust_address (op0, DFmode, 0);
1030 emit_insn (gen_sse2_storelpd (m, op1));
1031 m = adjust_address (op0, DFmode, 8);
1032 emit_insn (gen_sse2_storehpd (m, op1));
1033 }
1034 else
1035 {
1036 if (mode != V4SFmode)
1037 op1 = gen_lowpart (V4SFmode, op1);
1038
1039 m = adjust_address (op0, V2SFmode, 0);
1040 emit_insn (gen_sse_storelps (m, op1));
1041 m = adjust_address (op0, V2SFmode, 8);
1042 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
1043 }
1044 }
1045 else
1046 gcc_unreachable ();
1047}
1048
1049/* Move bits 64:95 to bits 32:63. */
1050
1051void
1052ix86_move_vector_high_sse_to_mmx (rtx op)
1053{
1054 rtx mask = gen_rtx_PARALLEL (VOIDmode,
1055 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
1056 GEN_INT (0), GEN_INT (0)));
1057 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
1058 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1059 rtx insn = gen_rtx_SET (dest, op);
1060 emit_insn (insn);
1061}
1062
1063/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
1064
1065void
1066ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
1067{
1068 rtx op0 = operands[0];
1069 rtx op1 = operands[1];
1070 rtx op2 = operands[2];
1071 rtx src;
1072
1073 machine_mode dmode = GET_MODE (op0);
1074 machine_mode smode = GET_MODE (op1);
1075 machine_mode inner_dmode = GET_MODE_INNER (dmode);
1076 machine_mode inner_smode = GET_MODE_INNER (smode);
1077
1078 /* Get the corresponding SSE mode for destination. */
1079 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
1080 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1081 nunits).require ();
1082 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1083 nunits / 2).require ();
1084
1085 /* Get the corresponding SSE mode for source. */
1086 nunits = 16 / GET_MODE_SIZE (inner_smode);
1087 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
1088 nunits).require ();
1089
1090 /* Generate SSE pack with signed/unsigned saturation. */
1091 rtx dest = lowpart_subreg (outermode: sse_dmode, op: op0, GET_MODE (op0));
1092 op1 = lowpart_subreg (outermode: sse_smode, op: op1, GET_MODE (op1));
1093 op2 = lowpart_subreg (outermode: sse_smode, op: op2, GET_MODE (op2));
1094
1095 /* paskusdw/packuswb does unsigned saturation of a signed source
1096 which is different from generic us_truncate RTX. */
1097 if (code == US_TRUNCATE)
1098 src = gen_rtx_UNSPEC (sse_dmode,
1099 gen_rtvec (2, op1, op2),
1100 UNSPEC_US_TRUNCATE);
1101 else
1102 {
1103 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1104 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1105 src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
1106 }
1107
1108 emit_move_insn (dest, src);
1109
1110 ix86_move_vector_high_sse_to_mmx (op: op0);
1111}
1112
1113/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. This is also used
1114 for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
1115 OPERANDS[0]. */
1116
1117void
1118ix86_split_mmx_punpck (rtx operands[], bool high_p)
1119{
1120 rtx op0 = operands[0];
1121 rtx op1 = operands[1];
1122 rtx op2 = operands[2];
1123 machine_mode mode = GET_MODE (op1);
1124 rtx mask;
1125 /* The corresponding SSE mode. */
1126 machine_mode sse_mode, double_sse_mode;
1127
1128 switch (mode)
1129 {
1130 case E_V8QImode:
1131 case E_V4QImode:
1132 case E_V2QImode:
1133 sse_mode = V16QImode;
1134 double_sse_mode = V32QImode;
1135 mask = gen_rtx_PARALLEL (VOIDmode,
1136 gen_rtvec (16,
1137 GEN_INT (0), GEN_INT (16),
1138 GEN_INT (1), GEN_INT (17),
1139 GEN_INT (2), GEN_INT (18),
1140 GEN_INT (3), GEN_INT (19),
1141 GEN_INT (4), GEN_INT (20),
1142 GEN_INT (5), GEN_INT (21),
1143 GEN_INT (6), GEN_INT (22),
1144 GEN_INT (7), GEN_INT (23)));
1145 break;
1146
1147 case E_V4HImode:
1148 case E_V2HImode:
1149 sse_mode = V8HImode;
1150 double_sse_mode = V16HImode;
1151 mask = gen_rtx_PARALLEL (VOIDmode,
1152 gen_rtvec (8,
1153 GEN_INT (0), GEN_INT (8),
1154 GEN_INT (1), GEN_INT (9),
1155 GEN_INT (2), GEN_INT (10),
1156 GEN_INT (3), GEN_INT (11)));
1157 break;
1158
1159 case E_V2SImode:
1160 sse_mode = V4SImode;
1161 double_sse_mode = V8SImode;
1162 mask = gen_rtx_PARALLEL (VOIDmode,
1163 gen_rtvec (4,
1164 GEN_INT (0), GEN_INT (4),
1165 GEN_INT (1), GEN_INT (5)));
1166 break;
1167
1168 case E_V2SFmode:
1169 sse_mode = V4SFmode;
1170 double_sse_mode = V8SFmode;
1171 mask = gen_rtx_PARALLEL (VOIDmode,
1172 gen_rtvec (4,
1173 GEN_INT (0), GEN_INT (4),
1174 GEN_INT (1), GEN_INT (5)));
1175 break;
1176
1177 default:
1178 gcc_unreachable ();
1179 }
1180
1181 /* Generate SSE punpcklXX. */
1182 rtx dest = lowpart_subreg (outermode: sse_mode, op: op0, GET_MODE (op0));
1183 op1 = lowpart_subreg (outermode: sse_mode, op: op1, GET_MODE (op1));
1184 op2 = lowpart_subreg (outermode: sse_mode, op: op2, GET_MODE (op2));
1185
1186 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1187 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1188 rtx insn = gen_rtx_SET (dest, op2);
1189 emit_insn (insn);
1190
1191 /* Move high bits to low bits. */
1192 if (high_p)
1193 {
1194 if (sse_mode == V4SFmode)
1195 {
1196 mask = gen_rtx_PARALLEL (VOIDmode,
1197 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1198 GEN_INT (4), GEN_INT (5)));
1199 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1200 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1201 }
1202 else
1203 {
1204 int sz = GET_MODE_SIZE (mode);
1205
1206 if (sz == 4)
1207 mask = gen_rtx_PARALLEL (VOIDmode,
1208 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1209 GEN_INT (0), GEN_INT (1)));
1210 else if (sz == 8)
1211 mask = gen_rtx_PARALLEL (VOIDmode,
1212 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1213 GEN_INT (0), GEN_INT (1)));
1214 else
1215 gcc_unreachable ();
1216
1217 dest = lowpart_subreg (V4SImode, op: dest, GET_MODE (dest));
1218 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1219 }
1220
1221 insn = gen_rtx_SET (dest, op1);
1222 emit_insn (insn);
1223 }
1224}
1225
1226/* Helper function of ix86_fixup_binary_operands to canonicalize
1227 operand order. Returns true if the operands should be swapped. */
1228
1229static bool
1230ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1231 rtx operands[])
1232{
1233 rtx dst = operands[0];
1234 rtx src1 = operands[1];
1235 rtx src2 = operands[2];
1236
1237 /* If the operation is not commutative, we can't do anything. */
1238 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1239 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1240 return false;
1241
1242 /* Highest priority is that src1 should match dst. */
1243 if (rtx_equal_p (dst, src1))
1244 return false;
1245 if (rtx_equal_p (dst, src2))
1246 return true;
1247
1248 /* Next highest priority is that immediate constants come second. */
1249 if (immediate_operand (src2, mode))
1250 return false;
1251 if (immediate_operand (src1, mode))
1252 return true;
1253
1254 /* Lowest priority is that memory references should come second. */
1255 if (MEM_P (src2))
1256 return false;
1257 if (MEM_P (src1))
1258 return true;
1259
1260 return false;
1261}
1262
1263
1264/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1265 destination to use for the operation. If different from the true
1266 destination in operands[0], a copy operation will be required. */
1267
1268rtx
1269ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1270 rtx operands[])
1271{
1272 rtx dst = operands[0];
1273 rtx src1 = operands[1];
1274 rtx src2 = operands[2];
1275
1276 /* Canonicalize operand order. */
1277 if (ix86_swap_binary_operands_p (code, mode, operands))
1278 {
1279 /* It is invalid to swap operands of different modes. */
1280 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1281
1282 std::swap (a&: src1, b&: src2);
1283 }
1284
1285 /* Both source operands cannot be in memory. */
1286 if (MEM_P (src1) && MEM_P (src2))
1287 {
1288 /* Optimization: Only read from memory once. */
1289 if (rtx_equal_p (src1, src2))
1290 {
1291 src2 = force_reg (mode, src2);
1292 src1 = src2;
1293 }
1294 else if (rtx_equal_p (dst, src1))
1295 src2 = force_reg (mode, src2);
1296 else
1297 src1 = force_reg (mode, src1);
1298 }
1299
1300 /* If the destination is memory, and we do not have matching source
1301 operands, do things in registers. */
1302 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1303 dst = gen_reg_rtx (mode);
1304
1305 /* Source 1 cannot be a constant. */
1306 if (CONSTANT_P (src1))
1307 src1 = force_reg (mode, src1);
1308
1309 /* Source 1 cannot be a non-matching memory. */
1310 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1311 src1 = force_reg (mode, src1);
1312
1313 /* Improve address combine. */
1314 if (code == PLUS
1315 && GET_MODE_CLASS (mode) == MODE_INT
1316 && MEM_P (src2))
1317 src2 = force_reg (mode, src2);
1318
1319 operands[1] = src1;
1320 operands[2] = src2;
1321 return dst;
1322}
1323
1324/* Similarly, but assume that the destination has already been
1325 set up properly. */
1326
1327void
1328ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1329 machine_mode mode, rtx operands[])
1330{
1331 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1332 gcc_assert (dst == operands[0]);
1333}
1334
1335/* Attempt to expand a binary operator. Make the expansion closer to the
1336 actual machine, then just general_operand, which will allow 3 separate
1337 memory references (one output, two input) in a single insn. */
1338
1339void
1340ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1341 rtx operands[])
1342{
1343 rtx src1, src2, dst, op, clob;
1344
1345 dst = ix86_fixup_binary_operands (code, mode, operands);
1346 src1 = operands[1];
1347 src2 = operands[2];
1348
1349 /* Emit the instruction. */
1350
1351 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1352
1353 if (reload_completed
1354 && code == PLUS
1355 && !rtx_equal_p (dst, src1))
1356 {
1357 /* This is going to be an LEA; avoid splitting it later. */
1358 emit_insn (op);
1359 }
1360 else
1361 {
1362 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1363 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1364 }
1365
1366 /* Fix up the destination if needed. */
1367 if (dst != operands[0])
1368 emit_move_insn (operands[0], dst);
1369}
1370
1371/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1372 the given OPERANDS. */
1373
1374void
1375ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1376 rtx operands[])
1377{
1378 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1379 if (SUBREG_P (operands[1]))
1380 {
1381 op1 = operands[1];
1382 op2 = operands[2];
1383 }
1384 else if (SUBREG_P (operands[2]))
1385 {
1386 op1 = operands[2];
1387 op2 = operands[1];
1388 }
1389 /* Optimize (__m128i) d | (__m128i) e and similar code
1390 when d and e are float vectors into float vector logical
1391 insn. In C/C++ without using intrinsics there is no other way
1392 to express vector logical operation on float vectors than
1393 to cast them temporarily to integer vectors. */
1394 if (op1
1395 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1396 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1397 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1398 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1399 && SUBREG_BYTE (op1) == 0
1400 && (GET_CODE (op2) == CONST_VECTOR
1401 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1402 && SUBREG_BYTE (op2) == 0))
1403 && can_create_pseudo_p ())
1404 {
1405 rtx dst;
1406 switch (GET_MODE (SUBREG_REG (op1)))
1407 {
1408 case E_V4SFmode:
1409 case E_V8SFmode:
1410 case E_V16SFmode:
1411 case E_V2DFmode:
1412 case E_V4DFmode:
1413 case E_V8DFmode:
1414 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1415 if (GET_CODE (op2) == CONST_VECTOR)
1416 {
1417 op2 = gen_lowpart (GET_MODE (dst), op2);
1418 op2 = force_reg (GET_MODE (dst), op2);
1419 }
1420 else
1421 {
1422 op1 = operands[1];
1423 op2 = SUBREG_REG (operands[2]);
1424 if (!vector_operand (op2, GET_MODE (dst)))
1425 op2 = force_reg (GET_MODE (dst), op2);
1426 }
1427 op1 = SUBREG_REG (op1);
1428 if (!vector_operand (op1, GET_MODE (dst)))
1429 op1 = force_reg (GET_MODE (dst), op1);
1430 emit_insn (gen_rtx_SET (dst,
1431 gen_rtx_fmt_ee (code, GET_MODE (dst),
1432 op1, op2)));
1433 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1434 return;
1435 default:
1436 break;
1437 }
1438 }
1439 if (!vector_operand (operands[1], mode))
1440 operands[1] = force_reg (mode, operands[1]);
1441 if (!vector_operand (operands[2], mode))
1442 operands[2] = force_reg (mode, operands[2]);
1443 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1444 emit_insn (gen_rtx_SET (operands[0],
1445 gen_rtx_fmt_ee (code, mode, operands[1],
1446 operands[2])));
1447}
1448
1449/* Return TRUE or FALSE depending on whether the binary operator meets the
1450 appropriate constraints. */
1451
1452bool
1453ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1454 rtx operands[3])
1455{
1456 rtx dst = operands[0];
1457 rtx src1 = operands[1];
1458 rtx src2 = operands[2];
1459
1460 /* Both source operands cannot be in memory. */
1461 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1462 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1463 return false;
1464
1465 /* Canonicalize operand order for commutative operators. */
1466 if (ix86_swap_binary_operands_p (code, mode, operands))
1467 std::swap (a&: src1, b&: src2);
1468
1469 /* If the destination is memory, we must have a matching source operand. */
1470 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1471 return false;
1472
1473 /* Source 1 cannot be a constant. */
1474 if (CONSTANT_P (src1))
1475 return false;
1476
1477 /* Source 1 cannot be a non-matching memory. */
1478 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1479 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1480 return (code == AND
1481 && (mode == HImode
1482 || mode == SImode
1483 || (TARGET_64BIT && mode == DImode))
1484 && satisfies_constraint_L (op: src2));
1485
1486 return true;
1487}
1488
1489/* Attempt to expand a unary operator. Make the expansion closer to the
1490 actual machine, then just general_operand, which will allow 2 separate
1491 memory references (one output, one input) in a single insn. */
1492
1493void
1494ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1495 rtx operands[])
1496{
1497 bool matching_memory = false;
1498 rtx src, dst, op, clob;
1499
1500 dst = operands[0];
1501 src = operands[1];
1502
1503 /* If the destination is memory, and we do not have matching source
1504 operands, do things in registers. */
1505 if (MEM_P (dst))
1506 {
1507 if (rtx_equal_p (dst, src))
1508 matching_memory = true;
1509 else
1510 dst = gen_reg_rtx (mode);
1511 }
1512
1513 /* When source operand is memory, destination must match. */
1514 if (MEM_P (src) && !matching_memory)
1515 src = force_reg (mode, src);
1516
1517 /* Emit the instruction. */
1518
1519 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1520
1521 if (code == NOT)
1522 emit_insn (op);
1523 else
1524 {
1525 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1526 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1527 }
1528
1529 /* Fix up the destination if needed. */
1530 if (dst != operands[0])
1531 emit_move_insn (operands[0], dst);
1532}
1533
1534/* Predict just emitted jump instruction to be taken with probability PROB. */
1535
1536static void
1537predict_jump (int prob)
1538{
1539 rtx_insn *insn = get_last_insn ();
1540 gcc_assert (JUMP_P (insn));
1541 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (v: prob));
1542}
1543
1544/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1545 divisor are within the range [0-255]. */
1546
1547void
1548ix86_split_idivmod (machine_mode mode, rtx operands[],
1549 bool unsigned_p)
1550{
1551 rtx_code_label *end_label, *qimode_label;
1552 rtx div, mod;
1553 rtx_insn *insn;
1554 rtx scratch, tmp0, tmp1, tmp2;
1555 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1556
1557 operands[2] = force_reg (mode, operands[2]);
1558 operands[3] = force_reg (mode, operands[3]);
1559
1560 switch (mode)
1561 {
1562 case E_SImode:
1563 if (GET_MODE (operands[0]) == SImode)
1564 {
1565 if (GET_MODE (operands[1]) == SImode)
1566 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1567 else
1568 gen_divmod4_1
1569 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1570 }
1571 else
1572 gen_divmod4_1
1573 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1574 break;
1575
1576 case E_DImode:
1577 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1578 break;
1579
1580 default:
1581 gcc_unreachable ();
1582 }
1583
1584 end_label = gen_label_rtx ();
1585 qimode_label = gen_label_rtx ();
1586
1587 scratch = gen_reg_rtx (mode);
1588
1589 /* Use 8bit unsigned divimod if dividend and divisor are within
1590 the range [0-255]. */
1591 emit_move_insn (scratch, operands[2]);
1592 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1593 scratch, 1, OPTAB_DIRECT);
1594 emit_insn (gen_test_ccno_1 (arg0: mode, x0: scratch, GEN_INT (-0x100)));
1595 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1596 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1597 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1598 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1599 pc_rtx);
1600 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1601 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1602 JUMP_LABEL (insn) = qimode_label;
1603
1604 /* Generate original signed/unsigned divimod. */
1605 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1606 operands[2], operands[3]));
1607
1608 /* Branch to the end. */
1609 emit_jump_insn (gen_jump (end_label));
1610 emit_barrier ();
1611
1612 /* Generate 8bit unsigned divide. */
1613 emit_label (qimode_label);
1614 /* Don't use operands[0] for result of 8bit divide since not all
1615 registers support QImode ZERO_EXTRACT. */
1616 tmp0 = lowpart_subreg (HImode, op: scratch, innermode: mode);
1617 tmp1 = lowpart_subreg (HImode, op: operands[2], innermode: mode);
1618 tmp2 = lowpart_subreg (QImode, op: operands[3], innermode: mode);
1619 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1620
1621 if (unsigned_p)
1622 {
1623 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1624 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1625 }
1626 else
1627 {
1628 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1629 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1630 }
1631 if (mode == SImode)
1632 {
1633 if (GET_MODE (operands[0]) != SImode)
1634 div = gen_rtx_ZERO_EXTEND (DImode, div);
1635 if (GET_MODE (operands[1]) != SImode)
1636 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1637 }
1638
1639 /* Extract remainder from AH. */
1640 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1641 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1642 GEN_INT (8), GEN_INT (8));
1643 insn = emit_move_insn (operands[1], tmp1);
1644 set_unique_reg_note (insn, REG_EQUAL, mod);
1645
1646 /* Zero extend quotient from AL. */
1647 tmp1 = gen_lowpart (QImode, tmp0);
1648 insn = emit_insn (gen_extend_insn
1649 (operands[0], tmp1,
1650 GET_MODE (operands[0]), QImode, 1));
1651 set_unique_reg_note (insn, REG_EQUAL, div);
1652
1653 emit_label (end_label);
1654}
1655
1656/* Emit x86 binary operand CODE in mode MODE, where the first operand
1657 matches destination. RTX includes clobber of FLAGS_REG. */
1658
1659void
1660ix86_emit_binop (enum rtx_code code, machine_mode mode,
1661 rtx dst, rtx src)
1662{
1663 rtx op, clob;
1664
1665 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1666 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1667
1668 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1669}
1670
1671/* Return true if regno1 def is nearest to the insn. */
1672
1673static bool
1674find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1675{
1676 rtx_insn *prev = insn;
1677 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1678
1679 if (insn == start)
1680 return false;
1681 while (prev && prev != start)
1682 {
1683 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1684 {
1685 prev = PREV_INSN (insn: prev);
1686 continue;
1687 }
1688 if (insn_defines_reg (regno1, INVALID_REGNUM, insn: prev))
1689 return true;
1690 else if (insn_defines_reg (regno1: regno2, INVALID_REGNUM, insn: prev))
1691 return false;
1692 prev = PREV_INSN (insn: prev);
1693 }
1694
1695 /* None of the regs is defined in the bb. */
1696 return false;
1697}
1698
1699/* INSN_UID of the last insn emitted by zero store peephole2s. */
1700int ix86_last_zero_store_uid;
1701
1702/* Split lea instructions into a sequence of instructions
1703 which are executed on ALU to avoid AGU stalls.
1704 It is assumed that it is allowed to clobber flags register
1705 at lea position. */
1706
1707void
1708ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1709{
1710 unsigned int regno0, regno1, regno2;
1711 struct ix86_address parts;
1712 rtx target, tmp;
1713 int ok, adds;
1714
1715 ok = ix86_decompose_address (operands[1], &parts);
1716 gcc_assert (ok);
1717
1718 target = gen_lowpart (mode, operands[0]);
1719
1720 regno0 = true_regnum (target);
1721 regno1 = INVALID_REGNUM;
1722 regno2 = INVALID_REGNUM;
1723
1724 if (parts.base)
1725 {
1726 parts.base = gen_lowpart (mode, parts.base);
1727 regno1 = true_regnum (parts.base);
1728 }
1729
1730 if (parts.index)
1731 {
1732 parts.index = gen_lowpart (mode, parts.index);
1733 regno2 = true_regnum (parts.index);
1734 }
1735
1736 if (parts.disp)
1737 parts.disp = gen_lowpart (mode, parts.disp);
1738
1739 if (parts.scale > 1)
1740 {
1741 /* Case r1 = r1 + ... */
1742 if (regno1 == regno0)
1743 {
1744 /* If we have a case r1 = r1 + C * r2 then we
1745 should use multiplication which is very
1746 expensive. Assume cost model is wrong if we
1747 have such case here. */
1748 gcc_assert (regno2 != regno0);
1749
1750 for (adds = parts.scale; adds > 0; adds--)
1751 ix86_emit_binop (code: PLUS, mode, dst: target, src: parts.index);
1752 }
1753 else
1754 {
1755 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1756 if (regno0 != regno2)
1757 emit_insn (gen_rtx_SET (target, parts.index));
1758
1759 /* Use shift for scaling, but emit it as MULT instead
1760 to avoid it being immediately peephole2 optimized back
1761 into lea. */
1762 ix86_emit_binop (code: MULT, mode, dst: target, GEN_INT (parts.scale));
1763
1764 if (parts.base)
1765 ix86_emit_binop (code: PLUS, mode, dst: target, src: parts.base);
1766
1767 if (parts.disp && parts.disp != const0_rtx)
1768 ix86_emit_binop (code: PLUS, mode, dst: target, src: parts.disp);
1769 }
1770 }
1771 else if (!parts.base && !parts.index)
1772 {
1773 gcc_assert(parts.disp);
1774 emit_insn (gen_rtx_SET (target, parts.disp));
1775 }
1776 else
1777 {
1778 if (!parts.base)
1779 {
1780 if (regno0 != regno2)
1781 emit_insn (gen_rtx_SET (target, parts.index));
1782 }
1783 else if (!parts.index)
1784 {
1785 if (regno0 != regno1)
1786 emit_insn (gen_rtx_SET (target, parts.base));
1787 }
1788 else
1789 {
1790 if (regno0 == regno1)
1791 tmp = parts.index;
1792 else if (regno0 == regno2)
1793 tmp = parts.base;
1794 else
1795 {
1796 rtx tmp1;
1797
1798 /* Find better operand for SET instruction, depending
1799 on which definition is farther from the insn. */
1800 if (find_nearest_reg_def (insn, regno1, regno2))
1801 tmp = parts.index, tmp1 = parts.base;
1802 else
1803 tmp = parts.base, tmp1 = parts.index;
1804
1805 emit_insn (gen_rtx_SET (target, tmp));
1806
1807 if (parts.disp && parts.disp != const0_rtx)
1808 ix86_emit_binop (code: PLUS, mode, dst: target, src: parts.disp);
1809
1810 ix86_emit_binop (code: PLUS, mode, dst: target, src: tmp1);
1811 return;
1812 }
1813
1814 ix86_emit_binop (code: PLUS, mode, dst: target, src: tmp);
1815 }
1816
1817 if (parts.disp && parts.disp != const0_rtx)
1818 ix86_emit_binop (code: PLUS, mode, dst: target, src: parts.disp);
1819 }
1820}
1821
1822/* Post-reload splitter for converting an SF or DFmode value in an
1823 SSE register into an unsigned SImode. */
1824
1825void
1826ix86_split_convert_uns_si_sse (rtx operands[])
1827{
1828 machine_mode vecmode;
1829 rtx value, large, zero_or_two31, input, two31, x;
1830
1831 large = operands[1];
1832 zero_or_two31 = operands[2];
1833 input = operands[3];
1834 two31 = operands[4];
1835 vecmode = GET_MODE (large);
1836 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1837
1838 /* Load up the value into the low element. We must ensure that the other
1839 elements are valid floats -- zero is the easiest such value. */
1840 if (MEM_P (input))
1841 {
1842 if (vecmode == V4SFmode)
1843 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1844 else
1845 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1846 }
1847 else
1848 {
1849 input = gen_rtx_REG (vecmode, REGNO (input));
1850 emit_move_insn (value, CONST0_RTX (vecmode));
1851 if (vecmode == V4SFmode)
1852 emit_insn (gen_sse_movss_v4sf (value, value, input));
1853 else
1854 emit_insn (gen_sse2_movsd_v2df (value, value, input));
1855 }
1856
1857 emit_move_insn (large, two31);
1858 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1859
1860 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1861 emit_insn (gen_rtx_SET (large, x));
1862
1863 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1864 emit_insn (gen_rtx_SET (zero_or_two31, x));
1865
1866 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1867 emit_insn (gen_rtx_SET (value, x));
1868
1869 large = gen_rtx_REG (V4SImode, REGNO (large));
1870 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1871
1872 x = gen_rtx_REG (V4SImode, REGNO (value));
1873 if (vecmode == V4SFmode)
1874 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1875 else
1876 emit_insn (gen_sse2_cvttpd2dq (x, value));
1877 value = x;
1878
1879 emit_insn (gen_xorv4si3 (value, value, large));
1880}
1881
1882static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1883 machine_mode mode, rtx target,
1884 rtx var, int one_var);
1885
1886/* Convert an unsigned DImode value into a DFmode, using only SSE.
1887 Expects the 64-bit DImode to be supplied in a pair of integral
1888 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1889 -mfpmath=sse, !optimize_size only. */
1890
1891void
1892ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1893{
1894 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1895 rtx int_xmm, fp_xmm;
1896 rtx biases, exponents;
1897 rtx x;
1898
1899 int_xmm = gen_reg_rtx (V4SImode);
1900 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1901 emit_insn (gen_movdi_to_sse (int_xmm, input));
1902 else if (TARGET_SSE_SPLIT_REGS)
1903 {
1904 emit_clobber (int_xmm);
1905 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1906 }
1907 else
1908 {
1909 x = gen_reg_rtx (V2DImode);
1910 ix86_expand_vector_init_one_nonzero (mmx_ok: false, V2DImode, target: x, var: input, one_var: 0);
1911 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1912 }
1913
1914 x = gen_rtx_CONST_VECTOR (V4SImode,
1915 gen_rtvec (4, GEN_INT (0x43300000UL),
1916 GEN_INT (0x45300000UL),
1917 const0_rtx, const0_rtx));
1918 exponents = validize_mem (force_const_mem (V4SImode, x));
1919
1920 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1921 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1922
1923 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1924 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1925 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1926 (0x1.0p84 + double(fp_value_hi_xmm)).
1927 Note these exponents differ by 32. */
1928
1929 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1930
1931 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1932 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1933 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1934 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1935 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1936 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1937 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1938 biases = validize_mem (force_const_mem (V2DFmode, biases));
1939 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1940
1941 /* Add the upper and lower DFmode values together. */
1942 if (TARGET_SSE3)
1943 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1944 else
1945 {
1946 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1947 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1948 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1949 }
1950
1951 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1952}
1953
1954/* Not used, but eases macroization of patterns. */
1955void
1956ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1957{
1958 gcc_unreachable ();
1959}
1960
1961static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1962
1963/* Convert an unsigned SImode value into a DFmode. Only currently used
1964 for SSE, but applicable anywhere. */
1965
1966void
1967ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1968{
1969 REAL_VALUE_TYPE TWO31r;
1970 rtx x, fp;
1971
1972 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1973 NULL, 1, OPTAB_DIRECT);
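  /* Adding INT_MIN wraps INPUT to INPUT - 2^31, whose signed value lies
     in [-2^31, 2^31); converting that and adding 0x1p31 back below yields
     (double) INPUT exactly, since DFmode has enough mantissa bits.  */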
1974
1975 fp = gen_reg_rtx (DFmode);
1976 emit_insn (gen_floatsidf2 (fp, x));
1977
1978 real_ldexp (&TWO31r, &dconst1, 31);
1979 x = const_double_from_real_value (TWO31r, DFmode);
1980
1981 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1982
1983 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1984 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);
1986
1987 if (x != target)
1988 emit_move_insn (target, x);
1989}
1990
1991/* Convert a signed DImode value into a DFmode. Only used for SSE in
1992 32-bit mode; otherwise we have a direct convert instruction. */
1993
1994void
1995ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1996{
1997 REAL_VALUE_TYPE TWO32r;
1998 rtx fp_lo, fp_hi, x;
1999
2000 fp_lo = gen_reg_rtx (DFmode);
2001 fp_hi = gen_reg_rtx (DFmode);
2002
2003 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
2004
2005 real_ldexp (&TWO32r, &dconst1, 32);
2006 x = const_double_from_real_value (TWO32r, DFmode);
2007 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
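  /* (double) INPUT == (double) hi * 2^32 + (double) (unsigned) lo, where
     hi is the signed high word; the unsigned low word is handled by
     ix86_expand_convert_uns_sidf_sse below and the two parts are added
     with a single rounding.  */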
2008
  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
2010
2011 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
2012 0, OPTAB_DIRECT);
2013 if (x != target)
2014 emit_move_insn (target, x);
2015}
2016
2017/* Convert an unsigned SImode value into a SFmode, using only SSE.
2018 For x86_32, -mfpmath=sse, !optimize_size only. */
2019void
2020ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
2021{
2022 REAL_VALUE_TYPE ONE16r;
2023 rtx fp_hi, fp_lo, int_hi, int_lo, x;
2024
2025 real_ldexp (&ONE16r, &dconst1, 16);
2026 x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
				NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
				NULL, 0, OPTAB_DIRECT);
2031 fp_hi = gen_reg_rtx (SFmode);
2032 fp_lo = gen_reg_rtx (SFmode);
2033 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
2034 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
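  /* INPUT == hi * 2^16 + lo with hi, lo < 2^16, so both halves convert
     to SFmode exactly; the result is hi * 0x1p16 + lo, computed with a
     single fma when available.  */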
2035 if (TARGET_FMA)
2036 {
2037 x = validize_mem (force_const_mem (SFmode, x));
2038 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
2039 emit_move_insn (target, fp_hi);
2040 }
2041 else
2042 {
2043 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
2044 0, OPTAB_DIRECT);
2045 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
2046 0, OPTAB_DIRECT);
2047 if (!rtx_equal_p (target, fp_hi))
2048 emit_move_insn (target, fp_hi);
2049 }
2050}
2051
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to a vector of floats TARGET.  */
2054
2055void
2056ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
2057{
2058 rtx tmp[8];
2059 REAL_VALUE_TYPE TWO16r;
2060 machine_mode intmode = GET_MODE (val);
2061 machine_mode fltmode = GET_MODE (target);
2062 rtx (*cvt) (rtx, rtx);
2063
2064 if (intmode == V4SImode)
2065 cvt = gen_floatv4siv4sf2;
2066 else
2067 cvt = gen_floatv8siv8sf2;
2068 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
2069 tmp[0] = force_reg (intmode, tmp[0]);
2070 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
2071 OPTAB_DIRECT);
2072 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
2073 NULL_RTX, 1, OPTAB_DIRECT);
2074 tmp[3] = gen_reg_rtx (fltmode);
2075 emit_insn (cvt (tmp[3], tmp[1]));
2076 tmp[4] = gen_reg_rtx (fltmode);
2077 emit_insn (cvt (tmp[4], tmp[2]));
2078 real_ldexp (&TWO16r, &dconst1, 16);
2079 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
2080 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
2081 if (TARGET_FMA)
2082 {
2083 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
2084 emit_move_insn (target, tmp[6]);
2085 }
2086 else
2087 {
2088 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
2089 NULL_RTX, 1, OPTAB_DIRECT);
2090 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
2091 target, 1, OPTAB_DIRECT);
2092 if (tmp[7] != target)
2093 emit_move_insn (target, tmp[7]);
2094 }
2095}
2096
/* Adjust a V*SFmode/V*DFmode value VAL so that the *sfix_trunc* resp.
   fix_trunc* pattern can be used on it instead of fixuns_trunc*.
   This is done by doing just a signed conversion if VAL < 0x1p31, and
   otherwise by subtracting 0x1p31 first and xoring in 0x80000000 from
   *XORP afterwards.  */
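/* Roughly, per element:

     mask  = (val >= 0x1p31) ? -1 : 0;
     val   = val - (mask & 0x1p31);
     *XORP = mask & 0x80000000;

   i.e. values >= 2^31 are brought into signed range first and the sign
   bit of the truncated result is fixed up by the caller via *XORP.  */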
2101
2102rtx
2103ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
2104{
2105 REAL_VALUE_TYPE TWO31r;
2106 rtx two31r, tmp[4];
2107 machine_mode mode = GET_MODE (val);
2108 machine_mode scalarmode = GET_MODE_INNER (mode);
2109 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2110 rtx (*cmp) (rtx, rtx, rtx, rtx);
2111 int i;
2112
2113 for (i = 0; i < 3; i++)
2114 tmp[i] = gen_reg_rtx (mode);
2115 real_ldexp (&TWO31r, &dconst1, 31);
2116 two31r = const_double_from_real_value (TWO31r, scalarmode);
2117 two31r = ix86_build_const_vector (mode, 1, two31r);
2118 two31r = force_reg (mode, two31r);
2119 switch (mode)
2120 {
2121 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2122 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2123 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2124 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2125 default: gcc_unreachable ();
2126 }
2127 tmp[3] = gen_rtx_LE (mode, two31r, val);
2128 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2129 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2130 0, OPTAB_DIRECT);
2131 if (intmode == V4SImode || TARGET_AVX2)
2132 *xorp = expand_simple_binop (intmode, ASHIFT,
2133 gen_lowpart (intmode, tmp[0]),
2134 GEN_INT (31), NULL_RTX, 0,
2135 OPTAB_DIRECT);
2136 else
2137 {
2138 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2139 two31 = ix86_build_const_vector (intmode, 1, two31);
2140 *xorp = expand_simple_binop (intmode, AND,
2141 gen_lowpart (intmode, tmp[0]),
2142 two31, NULL_RTX, 0,
2143 OPTAB_DIRECT);
2144 }
2145 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2146 0, OPTAB_DIRECT);
2147}
2148
2149/* Generate code for floating point ABS or NEG. */
2150
2151void
2152ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2153 rtx operands[])
2154{
2155 rtx set, dst, src;
2156 bool use_sse = false;
2157 bool vector_mode = VECTOR_MODE_P (mode);
2158 machine_mode vmode = mode;
2159 rtvec par;
2160
2161 if (vector_mode || mode == TFmode || mode == HFmode)
2162 {
2163 use_sse = true;
2164 if (mode == HFmode)
2165 vmode = V8HFmode;
2166 }
2167 else if (TARGET_SSE_MATH)
2168 {
2169 use_sse = SSE_FLOAT_MODE_P (mode);
2170 if (mode == SFmode)
2171 vmode = V4SFmode;
2172 else if (mode == DFmode)
2173 vmode = V2DFmode;
2174 }
2175
2176 dst = operands[0];
2177 src = operands[1];
2178
2179 set = gen_rtx_fmt_e (code, mode, src);
2180 set = gen_rtx_SET (dst, set);
2181
2182 if (use_sse)
2183 {
2184 rtx mask, use, clob;
2185
2186 /* NEG and ABS performed with SSE use bitwise mask operations.
2187 Create the appropriate mask now. */
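      /* Effectively ABS clears the sign bit (AND with the inverted
	 sign-bit mask) and NEG flips it (XOR with the sign-bit mask);
	 the absneg patterns emitted below expand to those bitwise
	 operations.  */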
2188 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2189 use = gen_rtx_USE (VOIDmode, mask);
2190 if (vector_mode || mode == TFmode)
2191 par = gen_rtvec (2, set, use);
2192 else
2193 {
2194 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2195 par = gen_rtvec (3, set, use, clob);
2196 }
2197 }
2198 else
2199 {
2200 rtx clob;
2201
      /* Changing the sign of FP values can be done with the integer unit
	 too.  */
2203 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2204 par = gen_rtvec (2, set, clob);
2205 }
2206
2207 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2208}
2209
2210/* Deconstruct a floating point ABS or NEG operation
2211 with integer registers into integer operations. */
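/* The masks used below: SFmode operates on the SImode lowpart with
   0x7fffffff (ABS) or 0x80000000 (NEG); DFmode uses bit 63 via
   ZERO_EXTRACT on 64-bit targets, otherwise the same SImode masks on the
   high word; XFmode masks the sign bit of the 16-bit sign/exponent word
   with 0x7fff or 0x8000.  */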
2212
2213void
2214ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2215 rtx operands[])
2216{
2217 enum rtx_code absneg_op;
2218 rtx dst, set;
2219
2220 gcc_assert (operands_match_p (operands[0], operands[1]));
2221
2222 switch (mode)
2223 {
2224 case E_SFmode:
2225 dst = gen_lowpart (SImode, operands[0]);
2226
2227 if (code == ABS)
2228 {
2229 set = gen_int_mode (0x7fffffff, SImode);
2230 absneg_op = AND;
2231 }
2232 else
2233 {
2234 set = gen_int_mode (0x80000000, SImode);
2235 absneg_op = XOR;
2236 }
2237 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2238 break;
2239
2240 case E_DFmode:
2241 if (TARGET_64BIT)
2242 {
2243 dst = gen_lowpart (DImode, operands[0]);
2244 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2245
2246 if (code == ABS)
2247 set = const0_rtx;
2248 else
2249 set = gen_rtx_NOT (DImode, dst);
2250 }
2251 else
2252 {
2253 dst = gen_highpart (SImode, operands[0]);
2254
2255 if (code == ABS)
2256 {
2257 set = gen_int_mode (0x7fffffff, SImode);
2258 absneg_op = AND;
2259 }
2260 else
2261 {
2262 set = gen_int_mode (0x80000000, SImode);
2263 absneg_op = XOR;
2264 }
2265 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2266 }
2267 break;
2268
2269 case E_XFmode:
2270 dst = gen_rtx_REG (SImode,
2271 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2272 if (code == ABS)
2273 {
2274 set = GEN_INT (0x7fff);
2275 absneg_op = AND;
2276 }
2277 else
2278 {
2279 set = GEN_INT (0x8000);
2280 absneg_op = XOR;
2281 }
2282 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2283 break;
2284
2285 default:
2286 gcc_unreachable ();
2287 }
2288
2289 set = gen_rtx_SET (dst, set);
2290
2291 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2292 rtvec par = gen_rtvec (2, set, clob);
2293
2294 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2295}
2296
2297/* Expand a copysign operation. Special case operand 0 being a constant. */
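/* Bitwise, copysign (x, y) = (x & ~signmask) | (y & signmask), with x in
   operands[1] (the magnitude) and y in operands[2] (the sign source).  */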
2298
2299void
2300ix86_expand_copysign (rtx operands[])
2301{
2302 machine_mode mode, vmode;
2303 rtx dest, vdest, op0, op1, mask, op2, op3;
2304
2305 mode = GET_MODE (operands[0]);
2306
2307 if (mode == HFmode)
2308 vmode = V8HFmode;
2309 else if (mode == SFmode)
2310 vmode = V4SFmode;
2311 else if (mode == DFmode)
2312 vmode = V2DFmode;
2313 else if (mode == TFmode)
2314 vmode = mode;
2315 else
2316 gcc_unreachable ();
2317
2318 if (rtx_equal_p (operands[1], operands[2]))
2319 {
2320 emit_move_insn (operands[0], operands[1]);
2321 return;
2322 }
2323
2324 dest = operands[0];
  vdest = lowpart_subreg (vmode, dest, mode);
2326 if (vdest == NULL_RTX)
2327 vdest = gen_reg_rtx (vmode);
2328 else
2329 dest = NULL_RTX;
  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
2331 mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
2332
2333 if (CONST_DOUBLE_P (operands[1]))
2334 {
      op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
      /* Optimize for 0: simplify b = copysignf (0.0f, a) to b = mask & a.  */
2337 if (op0 == CONST0_RTX (mode))
2338 {
2339 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2340 if (dest)
	    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2342 return;
2343 }
2344
2345 if (GET_MODE_SIZE (mode) < 16)
2346 op0 = ix86_build_const_vector (vmode, false, op0);
2347 op0 = force_reg (vmode, op0);
2348 }
2349 else
    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
2351
2352 op2 = gen_reg_rtx (vmode);
2353 op3 = gen_reg_rtx (vmode);
2354 emit_move_insn (op2, gen_rtx_AND (vmode,
2355 gen_rtx_NOT (vmode, mask),
2356 op0));
2357 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2358 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2359 if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2361}
2362
2363/* Expand an xorsign operation. */
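/* Bitwise, xorsign (x, y) = x ^ (y & signmask), i.e. x with its sign
   flipped iff y is negative.  */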
2364
2365void
2366ix86_expand_xorsign (rtx operands[])
2367{
2368 machine_mode mode, vmode;
2369 rtx dest, vdest, op0, op1, mask, x, temp;
2370
2371 dest = operands[0];
2372 op0 = operands[1];
2373 op1 = operands[2];
2374
2375 mode = GET_MODE (dest);
2376
2377 if (mode == HFmode)
2378 vmode = V8HFmode;
2379 else if (mode == SFmode)
2380 vmode = V4SFmode;
2381 else if (mode == DFmode)
2382 vmode = V2DFmode;
2383 else
2384 gcc_unreachable ();
2385
2386 temp = gen_reg_rtx (vmode);
2387 mask = ix86_build_signbit_mask (vmode, 0, 0);
2388
  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
2390 x = gen_rtx_AND (vmode, op1, mask);
2391 emit_insn (gen_rtx_SET (temp, x));
2392
  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
2394 x = gen_rtx_XOR (vmode, temp, op0);
2395
  vdest = lowpart_subreg (vmode, dest, mode);
2397 if (vdest == NULL_RTX)
2398 vdest = gen_reg_rtx (vmode);
2399 else
2400 dest = NULL_RTX;
2401 emit_insn (gen_rtx_SET (vdest, x));
2402
2403 if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2405}
2406
2407static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2408
2409void
2410ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2411{
2412 machine_mode mode = GET_MODE (op0);
2413 rtx tmp;
2414
  /* Handle special case - vector comparison with boolean result, transform
     it using the ptest instruction or vpcmpeq + kortest.  */
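  /* For 128/256-bit operands this computes tmp = op0 ^ op1 and uses
     PTEST tmp, tmp, which sets ZF iff tmp is all zeros, so EQ/NE becomes
     a ZF test.  For 512-bit operands it uses VPCMPEQD into a mask
     register plus KORTEST, which sets CF iff the mask is all ones.  */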
2417 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2418 || (mode == TImode && !TARGET_64BIT)
2419 || mode == OImode
2420 || GET_MODE_SIZE (mode) == 64)
2421 {
2422 unsigned msize = GET_MODE_SIZE (mode);
2423 machine_mode p_mode
2424 = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
      /* kortest sets CF when the result is 0xFFFF (op0 == op1).  */
2426 rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
2427
2428 gcc_assert (code == EQ || code == NE);
2429
2430 /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors. */
2431 if (msize == 64)
2432 {
2433 if (mode != V16SImode)
2434 {
	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2437 }
2438
2439 tmp = gen_reg_rtx (HImode);
2440 emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
2441 emit_insn (gen_kortesthi_ccc (tmp, tmp));
2442 }
2443 /* Using ptest for 128/256-bit vectors. */
2444 else
2445 {
2446 if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
2447 {
	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2450 mode = p_mode;
2451 }
2452
	  /* Generate XOR since we can't check that one operand is a zero
	     vector.  */
2455 tmp = gen_reg_rtx (mode);
2456 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2457 tmp = gen_lowpart (p_mode, tmp);
2458 emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
2459 gen_rtx_UNSPEC (CCZmode,
2460 gen_rtvec (2, tmp, tmp),
2461 UNSPEC_PTEST)));
2462 }
2463 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2464 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2465 gen_rtx_LABEL_REF (VOIDmode, label),
2466 pc_rtx);
2467 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2468 return;
2469 }
2470
2471 switch (mode)
2472 {
2473 case E_HFmode:
2474 case E_SFmode:
2475 case E_DFmode:
2476 case E_XFmode:
2477 case E_QImode:
2478 case E_HImode:
2479 case E_SImode:
2480 simple:
2481 tmp = ix86_expand_compare (code, op0, op1);
2482 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2483 gen_rtx_LABEL_REF (VOIDmode, label),
2484 pc_rtx);
2485 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2486 return;
2487
2488 case E_DImode:
2489 if (TARGET_64BIT)
2490 goto simple;
2491 /* FALLTHRU */
2492 case E_TImode:
2493 /* DI and TI mode equality/inequality comparisons may be performed
2494 on SSE registers. Avoid splitting them, except when optimizing
2495 for size. */
2496 if ((code == EQ || code == NE)
2497 && !optimize_insn_for_size_p ())
2498 goto simple;
2499
2500 /* Expand DImode branch into multiple compare+branch. */
2501 {
2502 rtx lo[2], hi[2];
2503 rtx_code_label *label2;
2504 enum rtx_code code1, code2, code3;
2505 machine_mode submode;
2506
2507 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2508 {
	    std::swap (op0, op1);
2510 code = swap_condition (code);
2511 }
2512
	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);
2515
2516 submode = mode == DImode ? SImode : DImode;
2517
	/* If we are doing a less-than or greater-or-equal comparison,
	   op1 is a constant and its low word is zero, then we can just
	   examine the high word.  Similarly for a low word of -1 and a
	   less-or-equal or greater-than comparison.  */
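	/* E.g. (illustrative) x <u 0x500000000 (low word zero) reduces to
	   hi(x) <u 5, and x <=u 0x4ffffffff (low word all-ones) reduces
	   to hi(x) <=u 4.  */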
2522
2523 if (CONST_INT_P (hi[1]))
2524 switch (code)
2525 {
2526 case LT: case LTU: case GE: case GEU:
2527 if (lo[1] == const0_rtx)
2528 {
		ix86_expand_branch (code, hi[0], hi[1], label);
2530 return;
2531 }
2532 break;
2533 case LE: case LEU: case GT: case GTU:
2534 if (lo[1] == constm1_rtx)
2535 {
		ix86_expand_branch (code, hi[0], hi[1], label);
2537 return;
2538 }
2539 break;
2540 default:
2541 break;
2542 }
2543
2544 /* Emulate comparisons that do not depend on Zero flag with
2545 double-word subtraction. Note that only Overflow, Sign
2546 and Carry flags are valid, so swap arguments and condition
2547 of comparisons that would otherwise test Zero flag. */
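	/* The sequence below is a CMP of the low halves followed by an
	   SBB of the high halves into a scratch; only the O/S/C flags of
	   the double-word subtraction are meaningful (CCCmode for
	   unsigned, CCGZmode for signed), and the final branch tests
	   those.  */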
2548
2549 switch (code)
2550 {
2551 case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
2554 code = swap_condition (code);
2555 /* FALLTHRU */
2556
2557 case LT: case LTU: case GE: case GEU:
2558 {
2559 bool uns = (code == LTU || code == GEU);
2560 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2561 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2562
2563 if (!nonimmediate_operand (lo[0], submode))
2564 lo[0] = force_reg (submode, lo[0]);
2565 if (!x86_64_general_operand (lo[1], submode))
2566 lo[1] = force_reg (submode, lo[1]);
2567
2568 if (!register_operand (hi[0], submode))
2569 hi[0] = force_reg (submode, hi[0]);
2570 if ((uns && !nonimmediate_operand (hi[1], submode))
2571 || (!uns && !x86_64_general_operand (hi[1], submode)))
2572 hi[1] = force_reg (submode, hi[1]);
2573
	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2575
2576 tmp = gen_rtx_SCRATCH (submode);
2577 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2578
2579 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
2581 return;
2582 }
2583
2584 default:
2585 break;
2586 }
2587
2588 /* Otherwise, we need two or three jumps. */
2589
2590 label2 = gen_label_rtx ();
2591
2592 code1 = code;
2593 code2 = swap_condition (code);
2594 code3 = unsigned_condition (code);
2595
2596 switch (code)
2597 {
2598 case LT: case GT: case LTU: case GTU:
2599 break;
2600
2601 case LE: code1 = LT; code2 = GT; break;
2602 case GE: code1 = GT; code2 = LT; break;
2603 case LEU: code1 = LTU; code2 = GTU; break;
2604 case GEU: code1 = GTU; code2 = LTU; break;
2605
2606 case EQ: code1 = UNKNOWN; code2 = NE; break;
2607 case NE: code2 = UNKNOWN; break;
2608
2609 default:
2610 gcc_unreachable ();
2611 }
2612
2613 /*
2614 * a < b =>
2615 * if (hi(a) < hi(b)) goto true;
2616 * if (hi(a) > hi(b)) goto false;
2617 * if (lo(a) < lo(b)) goto true;
2618 * false:
2619 */
2620
2621 if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	ix86_expand_branch (code3, lo[0], lo[1], label);
2627
2628 if (code2 != UNKNOWN)
2629 emit_label (label2);
2630 return;
2631 }
2632
2633 default:
2634 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2635 goto simple;
2636 }
2637}
2638
2639/* Figure out whether to use unordered fp comparisons. */
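/* Rationale (per IEEE 754): LT/LE/GT/GE/LTGT are treated here as
   signaling predicates, so an ordered, potentially trapping compare is
   acceptable for them; EQ/NE, ORDERED/UNORDERED and the UN<cc>
   predicates are quiet and must therefore be expanded with an unordered
   (non-trapping) compare when TARGET_IEEE_FP.  */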
2640
2641static bool
2642ix86_unordered_fp_compare (enum rtx_code code)
2643{
2644 if (!TARGET_IEEE_FP)
2645 return false;
2646
2647 switch (code)
2648 {
2649 case LT:
2650 case LE:
2651 case GT:
2652 case GE:
2653 case LTGT:
2654 return false;
2655
2656 case EQ:
2657 case NE:
2658
2659 case UNORDERED:
2660 case ORDERED:
2661 case UNLT:
2662 case UNLE:
2663 case UNGT:
2664 case UNGE:
2665 case UNEQ:
2666 return true;
2667
2668 default:
2669 gcc_unreachable ();
2670 }
2671}
2672
/* Return a comparison we can do that is equivalent to
   swap_condition (code), except possibly for orderedness.
   Never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */
2677
2678static enum rtx_code
2679ix86_fp_swap_condition (enum rtx_code code)
2680{
2681 switch (code)
2682 {
2683 case GT: /* GTU - CF=0 & ZF=0 */
2684 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2685 case GE: /* GEU - CF=0 */
2686 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2687 case UNLT: /* LTU - CF=1 */
2688 return TARGET_IEEE_FP ? UNKNOWN : GT;
2689 case UNLE: /* LEU - CF=1 | ZF=1 */
2690 return TARGET_IEEE_FP ? UNKNOWN : GE;
2691 default:
2692 return swap_condition (code);
2693 }
2694}
2695
/* Return the cost of comparison CODE using the best strategy for
   performance.  All following functions use the number of instructions
   as the cost metric.  In the future this should be tweaked to compute
   bytes for optimize_size and take into account performance of various
   instructions on various CPUs.  */
2700
2701static int
2702ix86_fp_comparison_cost (enum rtx_code code)
2703{
2704 int arith_cost;
2705
2706 /* The cost of code using bit-twiddling on %ah. */
2707 switch (code)
2708 {
2709 case UNLE:
2710 case UNLT:
2711 case LTGT:
2712 case GT:
2713 case GE:
2714 case UNORDERED:
2715 case ORDERED:
2716 case UNEQ:
2717 arith_cost = 4;
2718 break;
2719 case LT:
2720 case NE:
2721 case EQ:
2722 case UNGE:
2723 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2724 break;
2725 case LE:
2726 case UNGT:
2727 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2728 break;
2729 default:
2730 gcc_unreachable ();
2731 }
2732
2733 switch (ix86_fp_comparison_strategy (code))
2734 {
2735 case IX86_FPCMP_COMI:
2736 return arith_cost > 4 ? 3 : 2;
2737 case IX86_FPCMP_SAHF:
2738 return arith_cost > 4 ? 4 : 3;
2739 default:
2740 return arith_cost;
2741 }
2742}
2743
2744/* Swap, force into registers, or otherwise massage the two operands
2745 to a fp comparison. The operands are updated in place; the new
2746 comparison code is returned. */
2747
2748static enum rtx_code
2749ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2750{
2751 bool unordered_compare = ix86_unordered_fp_compare (code);
2752 rtx op0 = *pop0, op1 = *pop1;
2753 machine_mode op_mode = GET_MODE (op0);
2754 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2755
2756 if (op_mode == BFmode)
2757 {
2758 rtx op = gen_lowpart (HImode, op0);
2759 if (CONST_INT_P (op))
2760 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2761 op0, BFmode);
2762 else
2763 {
2764 rtx t1 = gen_reg_rtx (SImode);
2765 emit_insn (gen_zero_extendhisi2 (t1, op));
2766 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2767 op = gen_lowpart (SFmode, t1);
2768 }
2769 *pop0 = op;
2770 op = gen_lowpart (HImode, op1);
2771 if (CONST_INT_P (op))
2772 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2773 op1, BFmode);
2774 else
2775 {
2776 rtx t1 = gen_reg_rtx (SImode);
2777 emit_insn (gen_zero_extendhisi2 (t1, op));
2778 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2779 op = gen_lowpart (SFmode, t1);
2780 }
2781 *pop1 = op;
2782 return ix86_prepare_fp_compare_args (code, pop0, pop1);
2783 }
2784
2785 /* All of the unordered compare instructions only work on registers.
2786 The same is true of the fcomi compare instructions. The XFmode
2787 compare instructions require registers except when comparing
2788 against zero or when converting operand 1 from fixed point to
2789 floating point. */
2790
2791 if (!is_sse
2792 && (unordered_compare
2793 || (op_mode == XFmode
2794 && ! (standard_80387_constant_p (op0) == 1
2795 || standard_80387_constant_p (op1) == 1)
2796 && GET_CODE (op1) != FLOAT)
2797 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2798 {
2799 op0 = force_reg (op_mode, op0);
2800 op1 = force_reg (op_mode, op1);
2801 }
2802 else
2803 {
2804 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2805 things around if they appear profitable, otherwise force op0
2806 into a register. */
2807
2808 if (standard_80387_constant_p (op0) == 0
2809 || (MEM_P (op0)
2810 && ! (standard_80387_constant_p (op1) == 0
2811 || MEM_P (op1))))
2812 {
2813 enum rtx_code new_code = ix86_fp_swap_condition (code);
2814 if (new_code != UNKNOWN)
2815 {
	      std::swap (op0, op1);
2817 code = new_code;
2818 }
2819 }
2820
2821 if (!REG_P (op0))
2822 op0 = force_reg (op_mode, op0);
2823
2824 if (CONSTANT_P (op1))
2825 {
2826 int tmp = standard_80387_constant_p (op1);
2827 if (tmp == 0)
2828 op1 = validize_mem (force_const_mem (op_mode, op1));
2829 else if (tmp == 1)
2830 {
2831 if (TARGET_CMOVE)
2832 op1 = force_reg (op_mode, op1);
2833 }
2834 else
2835 op1 = force_reg (op_mode, op1);
2836 }
2837 }
2838
2839 /* Try to rearrange the comparison to make it cheaper. */
2840 if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
2842 && (REG_P (op1) || can_create_pseudo_p ()))
2843 {
      std::swap (op0, op1);
2845 code = swap_condition (code);
2846 if (!REG_P (op0))
2847 op0 = force_reg (op_mode, op0);
2848 }
2849
2850 *pop0 = op0;
2851 *pop1 = op1;
2852 return code;
2853}
2854
2855/* Generate insn patterns to do a floating point compare of OPERANDS. */
2856
2857static rtx
2858ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2859{
2860 bool unordered_compare = ix86_unordered_fp_compare (code);
2861 machine_mode cmp_mode;
2862 rtx tmp, scratch;
2863
  code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2865
2866 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2867 if (unordered_compare)
2868 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2869
2870 /* Do fcomi/sahf based test when profitable. */
2871 switch (ix86_fp_comparison_strategy (code))
2872 {
2873 case IX86_FPCMP_COMI:
2874 cmp_mode = CCFPmode;
2875 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2876 break;
2877
2878 case IX86_FPCMP_SAHF:
2879 cmp_mode = CCFPmode;
2880 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2881 scratch = gen_reg_rtx (HImode);
2882 emit_insn (gen_rtx_SET (scratch, tmp));
2883 emit_insn (gen_x86_sahf_1 (scratch));
2884 break;
2885
2886 case IX86_FPCMP_ARITH:
2887 cmp_mode = CCNOmode;
2888 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2889 scratch = gen_reg_rtx (HImode);
2890 emit_insn (gen_rtx_SET (scratch, tmp));
2891
      /* In the unordered case, we have to check C2 for NaNs, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */
2896
2897 switch (code)
2898 {
2899 case GT:
2900 case UNGT:
2901 if (code == GT || !TARGET_IEEE_FP)
2902 {
2903 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2904 code = EQ;
2905 }
2906 else
2907 {
2908 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2909 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2910 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2911 cmp_mode = CCmode;
2912 code = GEU;
2913 }
2914 break;
2915 case LT:
2916 case UNLT:
2917 if (code == LT && TARGET_IEEE_FP)
2918 {
2919 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2920 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2921 cmp_mode = CCmode;
2922 code = EQ;
2923 }
2924 else
2925 {
2926 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2927 code = NE;
2928 }
2929 break;
2930 case GE:
2931 case UNGE:
2932 if (code == GE || !TARGET_IEEE_FP)
2933 {
2934 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2935 code = EQ;
2936 }
2937 else
2938 {
2939 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2940 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2941 code = NE;
2942 }
2943 break;
2944 case LE:
2945 case UNLE:
2946 if (code == LE && TARGET_IEEE_FP)
2947 {
2948 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2949 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2950 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2951 cmp_mode = CCmode;
2952 code = LTU;
2953 }
2954 else
2955 {
2956 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2957 code = NE;
2958 }
2959 break;
2960 case EQ:
2961 case UNEQ:
2962 if (code == EQ && TARGET_IEEE_FP)
2963 {
2964 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2965 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2966 cmp_mode = CCmode;
2967 code = EQ;
2968 }
2969 else
2970 {
2971 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2972 code = NE;
2973 }
2974 break;
2975 case NE:
2976 case LTGT:
2977 if (code == NE && TARGET_IEEE_FP)
2978 {
2979 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2980 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2981 GEN_INT (0x40)));
2982 code = NE;
2983 }
2984 else
2985 {
2986 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2987 code = EQ;
2988 }
2989 break;
2990
2991 case UNORDERED:
2992 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2993 code = NE;
2994 break;
2995 case ORDERED:
2996 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2997 code = EQ;
2998 break;
2999
3000 default:
3001 gcc_unreachable ();
3002 }
3003 break;
3004
3005 default:
      gcc_unreachable ();
3007 }
3008
3009 /* Return the test that should be put into the flags user, i.e.
3010 the bcc, scc, or cmov instruction. */
3011 return gen_rtx_fmt_ee (code, VOIDmode,
3012 gen_rtx_REG (cmp_mode, FLAGS_REG),
3013 const0_rtx);
3014}
3015
3016/* Generate insn patterns to do an integer compare of OPERANDS. */
3017
3018static rtx
3019ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
3020{
3021 machine_mode cmpmode;
3022 rtx tmp, flags;
3023
3024 /* Swap operands to emit carry flag comparison. */
3025 if ((code == GTU || code == LEU)
3026 && nonimmediate_operand (op1, VOIDmode))
3027 {
      std::swap (op0, op1);
3029 code = swap_condition (code);
3030 }
3031
3032 cmpmode = SELECT_CC_MODE (code, op0, op1);
3033 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
3034
3035 /* Attempt to use PTEST, if available, when testing vector modes for
3036 equality/inequality against zero. */
3037 if (op1 == const0_rtx
3038 && SUBREG_P (op0)
3039 && cmpmode == CCZmode
3040 && SUBREG_BYTE (op0) == 0
3041 && REG_P (SUBREG_REG (op0))
3042 && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
3043 && TARGET_SSE4_1
3044 && GET_MODE (op0) == TImode
3045 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
3046 {
3047 tmp = SUBREG_REG (op0);
3048 tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
3049 }
3050 else
3051 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
3052
3053 /* This is very simple, but making the interface the same as in the
3054 FP case makes the rest of the code easier. */
3055 emit_insn (gen_rtx_SET (flags, tmp));
3056
3057 /* Return the test that should be put into the flags user, i.e.
3058 the bcc, scc, or cmov instruction. */
3059 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
3060}
3061
3062static rtx
3063ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
3064{
3065 rtx ret;
3066
3067 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
3068 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
3069
3070 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
3071 {
3072 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
3073 ret = ix86_expand_fp_compare (code, op0, op1);
3074 }
3075 else
3076 ret = ix86_expand_int_compare (code, op0, op1);
3077
3078 return ret;
3079}
3080
3081void
3082ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
3083{
3084 rtx ret;
3085
3086 gcc_assert (GET_MODE (dest) == QImode);
3087
3088 ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
3090 emit_insn (gen_rtx_SET (dest, ret));
3091}
3092
3093/* Expand floating point op0 <=> op1, i.e.
3094 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
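/* Control flow emitted below: a single FP compare, then a branch to l2
   on UNORDERED (result 2, TARGET_IEEE_FP only), to l0 on UNEQ (i.e.
   equal, result 0), to l1 on GT (result 1), otherwise fall through to
   the -1 case.  */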
3095
3096void
3097ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
3098{
3099 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
3101 rtx l0 = gen_label_rtx ();
3102 rtx l1 = gen_label_rtx ();
3103 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
3104 rtx lend = gen_label_rtx ();
3105 rtx tmp;
3106 rtx_insn *jmp;
3107 if (l2)
3108 {
3109 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
3110 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3111 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
3112 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
3113 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
3115 }
3116 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
3117 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3118 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
3119 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
3120 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3121 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
3122 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
3123 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
3124 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3125 add_reg_br_prob_note (jmp, profile_probability::even ());
3126 emit_move_insn (dest, constm1_rtx);
3127 emit_jump (lend);
3128 emit_label (l0);
3129 emit_move_insn (dest, const0_rtx);
3130 emit_jump (lend);
3131 emit_label (l1);
3132 emit_move_insn (dest, const1_rtx);
3133 emit_jump (lend);
3134 if (l2)
3135 {
3136 emit_label (l2);
3137 emit_move_insn (dest, const2_rtx);
3138 }
3139 emit_label (lend);
3140}
3141
/* Expand a comparison setting or clearing the carry flag.  Return true
   when successful and set *POP to the comparison operation.  */
3144static bool
3145ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3146{
3147 machine_mode mode
3148 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3149
  /* Do not handle double-mode compares that go through a special path.  */
3151 if (mode == (TARGET_64BIT ? TImode : DImode))
3152 return false;
3153
3154 if (SCALAR_FLOAT_MODE_P (mode))
3155 {
3156 rtx compare_op;
3157 rtx_insn *compare_seq;
3158
3159 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3160
      /* Shortcut: the following common codes never translate
	 into carry-flag compares.  */
3163 if (code == EQ || code == NE || code == UNEQ || code == LTGT
3164 || code == ORDERED || code == UNORDERED)
3165 return false;
3166
      /* These comparisons require the zero flag; swap operands so they
	 won't.  */
3168 if ((code == GT || code == UNLE || code == LE || code == UNGT)
3169 && !TARGET_IEEE_FP)
3170 {
	  std::swap (op0, op1);
3172 code = swap_condition (code);
3173 }
3174
      /* Try to expand the comparison and verify that we end up with a
	 carry-flag-based comparison.  This fails to be true only when
	 we decide to expand the comparison using arithmetic, which is
	 not a common scenario.  */
3179 start_sequence ();
3180 compare_op = ix86_expand_fp_compare (code, op0, op1);
3181 compare_seq = get_insns ();
3182 end_sequence ();
3183
3184 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3185 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3186 else
3187 code = GET_CODE (compare_op);
3188
3189 if (code != LTU && code != GEU)
3190 return false;
3191
3192 emit_insn (compare_seq);
3193 *pop = compare_op;
3194 return true;
3195 }
3196
3197 if (!INTEGRAL_MODE_P (mode))
3198 return false;
3199
3200 switch (code)
3201 {
3202 case LTU:
3203 case GEU:
3204 break;
3205
3206 /* Convert a==0 into (unsigned)a<1. */
3207 case EQ:
3208 case NE:
3209 if (op1 != const0_rtx)
3210 return false;
3211 op1 = const1_rtx;
3212 code = (code == EQ ? LTU : GEU);
3213 break;
3214
    /* Convert a>b into b<a or a>=b+1.  */
3216 case GTU:
3217 case LEU:
3218 if (CONST_INT_P (op1))
3219 {
3220 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into a register.  */
3223 if (op1 == const0_rtx
3224 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3225 return false;
3226 code = (code == GTU ? GEU : LTU);
3227 }
3228 else
3229 {
	  std::swap (op0, op1);
3231 code = (code == GTU ? LTU : GEU);
3232 }
3233 break;
3234
3235 /* Convert a>=0 into (unsigned)a<0x80000000. */
3236 case LT:
3237 case GE:
3238 if (mode == DImode || op1 != const0_rtx)
3239 return false;
3240 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3241 code = (code == LT ? GEU : LTU);
3242 break;
3243 case LE:
3244 case GT:
3245 if (mode == DImode || op1 != constm1_rtx)
3246 return false;
3247 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3248 code = (code == LE ? GEU : LTU);
3249 break;
3250
3251 default:
3252 return false;
3253 }
3254 /* Swapping operands may cause constant to appear as first operand. */
3255 if (!nonimmediate_operand (op0, VOIDmode))
3256 {
3257 if (!can_create_pseudo_p ())
3258 return false;
3259 op0 = force_reg (mode, op0);
3260 }
3261 *pop = ix86_expand_compare (code, op0, op1);
3262 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3263 return true;
3264}
3265
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */
3269bool
3270ix86_expand_int_addcc (rtx operands[])
3271{
3272 enum rtx_code code = GET_CODE (operands[1]);
3273 rtx flags;
3274 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3275 rtx compare_op;
3276 rtx val = const0_rtx;
3277 bool fpcmp = false;
3278 machine_mode mode;
3279 rtx op0 = XEXP (operands[1], 0);
3280 rtx op1 = XEXP (operands[1], 1);
3281
3282 if (operands[3] != const1_rtx
3283 && operands[3] != constm1_rtx)
3284 return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3286 return false;
3287 code = GET_CODE (compare_op);
3288
3289 flags = XEXP (compare_op, 0);
3290
3291 if (GET_MODE (flags) == CCFPmode)
3292 {
3293 fpcmp = true;
3294 code = ix86_fp_compare_code_to_integer (code);
3295 }
3296
3297 if (code != LTU)
3298 {
3299 val = constm1_rtx;
3300 if (fpcmp)
3301 PUT_CODE (compare_op,
3302 reverse_condition_maybe_unordered
3303 (GET_CODE (compare_op)));
3304 else
3305 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3306 }
3307
3308 mode = GET_MODE (operands[0]);
3309
3310 /* Construct either adc or sbb insn. */
3311 if ((code == LTU) == (operands[3] == constm1_rtx))
3312 insn = gen_sub3_carry;
3313 else
3314 insn = gen_add3_carry;
3315
3316 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3317
3318 return true;
3319}
3320
3321bool
3322ix86_expand_int_movcc (rtx operands[])
3323{
3324 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3325 rtx_insn *compare_seq;
3326 rtx compare_op;
3327 machine_mode mode = GET_MODE (operands[0]);
3328 bool sign_bit_compare_p = false;
3329 bool negate_cc_compare_p = false;
3330 rtx op0 = XEXP (operands[1], 0);
3331 rtx op1 = XEXP (operands[1], 1);
3332 rtx op2 = operands[2];
3333 rtx op3 = operands[3];
3334
3335 if (GET_MODE (op0) == TImode
3336 || (GET_MODE (op0) == DImode
3337 && !TARGET_64BIT))
3338 return false;
3339
3340 if (GET_MODE (op0) == BFmode
3341 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3342 return false;
3343
3344 start_sequence ();
3345 compare_op = ix86_expand_compare (code, op0, op1);
3346 compare_seq = get_insns ();
3347 end_sequence ();
3348
3349 compare_code = GET_CODE (compare_op);
3350
3351 if ((op1 == const0_rtx && (code == GE || code == LT))
3352 || (op1 == constm1_rtx && (code == GT || code == LE)))
3353 sign_bit_compare_p = true;
3354
  /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
     but if op1 is a constant, the latter form allows more optimizations,
     either through the last two operands both being constants, or through
     the one-constant, one-variable cases.  On the other hand, for cmov
     the former might be better as we don't need to load the constant
     into another register.  */
3361 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3362 op2 = op1;
3363 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3364 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3365 op3 = op1;
3366
3367 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3368 HImode insns, we'd be swallowed in word prefix ops. */
3369
3370 if ((mode != HImode || TARGET_FAST_PREFIX)
3371 && (mode != (TARGET_64BIT ? TImode : DImode))
3372 && CONST_INT_P (op2)
3373 && CONST_INT_P (op3))
3374 {
3375 rtx out = operands[0];
3376 HOST_WIDE_INT ct = INTVAL (op2);
3377 HOST_WIDE_INT cf = INTVAL (op3);
3378 HOST_WIDE_INT diff;
3379
3380 if ((mode == SImode
3381 || (TARGET_64BIT && mode == DImode))
3382 && (GET_MODE (op0) == SImode
3383 || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3384 {
3385 /* Special case x != 0 ? -1 : y. */
3386 if (code == NE && op1 == const0_rtx && ct == -1)
3387 {
3388 negate_cc_compare_p = true;
	      std::swap (ct, cf);
3390 code = EQ;
3391 }
3392 else if (code == EQ && op1 == const0_rtx && cf == -1)
3393 negate_cc_compare_p = true;
3394 }
3395
3396 diff = ct - cf;
      /* Sign-bit compares are better done using shifts than with sbb.  */
3399 if (sign_bit_compare_p
3400 || negate_cc_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3402 {
3403 /* Detect overlap between destination and compare sources. */
3404 rtx tmp = out;
3405
3406 if (negate_cc_compare_p)
3407 {
3408 if (GET_MODE (op0) == DImode)
3409 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3410 else
3411 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3412 gen_lowpart (SImode, op0)));
3413
3414 tmp = gen_reg_rtx (mode);
3415 if (mode == DImode)
3416 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3417 else
3418 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3419 tmp)));
3420 }
3421 else if (!sign_bit_compare_p)
3422 {
3423 rtx flags;
3424 bool fpcmp = false;
3425
3426 compare_code = GET_CODE (compare_op);
3427
3428 flags = XEXP (compare_op, 0);
3429
3430 if (GET_MODE (flags) == CCFPmode)
3431 {
3432 fpcmp = true;
3433 compare_code
3434 = ix86_fp_compare_code_to_integer (compare_code);
3435 }
3436
3437 /* To simplify rest of code, restrict to the GEU case. */
3438 if (compare_code == LTU)
3439 {
		  std::swap (ct, cf);
3441 compare_code = reverse_condition (compare_code);
3442 code = reverse_condition (code);
3443 }
3444 else
3445 {
3446 if (fpcmp)
3447 PUT_CODE (compare_op,
3448 reverse_condition_maybe_unordered
3449 (GET_CODE (compare_op)));
3450 else
3451 PUT_CODE (compare_op,
3452 reverse_condition (GET_CODE (compare_op)));
3453 }
3454 diff = ct - cf;
3455
3456 if (reg_overlap_mentioned_p (out, compare_op))
3457 tmp = gen_reg_rtx (mode);
3458
3459 if (mode == DImode)
3460 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3461 else
3462 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3463 flags, compare_op));
3464 }
3465 else
3466 {
3467 if (code == GT || code == GE)
3468 code = reverse_condition (code);
3469 else
3470 {
		  std::swap (ct, cf);
3472 diff = ct - cf;
3473 }
3474 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3475 }
3476
3477 if (diff == 1)
3478 {
3479 /*
3480 * cmpl op0,op1
3481 * sbbl dest,dest
3482 * [addl dest, ct]
3483 *
3484 * Size 5 - 8.
3485 */
3486 if (ct)
3487 tmp = expand_simple_binop (mode, PLUS,
3488 tmp, GEN_INT (ct),
3489 copy_rtx (tmp), 1, OPTAB_DIRECT);
3490 }
3491 else if (cf == -1)
3492 {
3493 /*
3494 * cmpl op0,op1
3495 * sbbl dest,dest
3496 * orl $ct, dest
3497 *
3498 * Size 8.
3499 */
3500 tmp = expand_simple_binop (mode, IOR,
3501 tmp, GEN_INT (ct),
3502 copy_rtx (tmp), 1, OPTAB_DIRECT);
3503 }
3504 else if (diff == -1 && ct)
3505 {
3506 /*
3507 * cmpl op0,op1
3508 * sbbl dest,dest
3509 * notl dest
3510 * [addl dest, cf]
3511 *
3512 * Size 8 - 11.
3513 */
3514 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3515 if (cf)
3516 tmp = expand_simple_binop (mode, PLUS,
3517 copy_rtx (tmp), GEN_INT (cf),
3518 copy_rtx (tmp), 1, OPTAB_DIRECT);
3519 }
3520 else
3521 {
3522 /*
3523 * cmpl op0,op1
3524 * sbbl dest,dest
3525 * [notl dest]
3526 * andl cf - ct, dest
3527 * [addl dest, ct]
3528 *
3529 * Size 8 - 11.
3530 */
3531
3532 if (cf == 0)
3533 {
3534 cf = ct;
3535 ct = 0;
3536 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3537 }
3538
3539 tmp = expand_simple_binop (mode, AND,
3540 copy_rtx (tmp),
3541 gen_int_mode (cf - ct, mode),
3542 copy_rtx (tmp), 1, OPTAB_DIRECT);
3543 if (ct)
3544 tmp = expand_simple_binop (mode, PLUS,
3545 copy_rtx (tmp), GEN_INT (ct),
3546 copy_rtx (tmp), 1, OPTAB_DIRECT);
3547 }
3548
3549 if (!rtx_equal_p (tmp, out))
3550 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3551
3552 return true;
3553 }
3554
3555 if (diff < 0)
3556 {
3557 machine_mode cmp_mode = GET_MODE (op0);
3558 enum rtx_code new_code;
3559
3560 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3561 {
3562 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3563
3564 /* We may be reversing a non-trapping
3565 comparison to a trapping comparison. */
3566 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3567 && code != EQ && code != NE
3568 && code != ORDERED && code != UNORDERED)
3569 new_code = UNKNOWN;
3570 else
3571 new_code = reverse_condition_maybe_unordered (code);
3572 }
3573 else
3574 new_code = ix86_reverse_condition (code, cmp_mode);
3575 if (new_code != UNKNOWN)
3576 {
	      std::swap (ct, cf);
3578 diff = -diff;
3579 code = new_code;
3580 }
3581 }
3582
3583 compare_code = UNKNOWN;
3584 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3585 && CONST_INT_P (op1))
3586 {
3587 if (op1 == const0_rtx
3588 && (code == LT || code == GE))
3589 compare_code = code;
3590 else if (op1 == constm1_rtx)
3591 {
3592 if (code == LE)
3593 compare_code = LT;
3594 else if (code == GT)
3595 compare_code = GE;
3596 }
3597 }
3598
3599 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3600 if (compare_code != UNKNOWN
3601 && GET_MODE (op0) == GET_MODE (out)
3602 && (cf == -1 || ct == -1))
3603 {
3604 /* If lea code below could be used, only optimize
3605 if it results in a 2 insn sequence. */
3606
3607 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3608 || diff == 3 || diff == 5 || diff == 9)
3609 || (compare_code == LT && ct == -1)
3610 || (compare_code == GE && cf == -1))
3611 {
3612 /*
3613 * notl op1 (if necessary)
3614 * sarl $31, op1
3615 * orl cf, op1
3616 */
3617 if (ct != -1)
3618 {
3619 cf = ct;
3620 ct = -1;
3621 code = reverse_condition (code);
3622 }
3623
3624 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3625
3626 out = expand_simple_binop (mode, IOR,
3627 out, GEN_INT (cf),
3628 out, 1, OPTAB_DIRECT);
3629 if (out != operands[0])
3630 emit_move_insn (operands[0], out);
3631
3632 return true;
3633 }
3634 }
3635
3636
3637 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3638 || diff == 3 || diff == 5 || diff == 9)
3639 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3640 && (mode != DImode
3641 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3642 {
3643 /*
3644 * xorl dest,dest
3645 * cmpl op1,op2
3646 * setcc dest
3647 * lea cf(dest*(ct-cf)),dest
3648 *
3649 * Size 14.
3650 *
3651 * This also catches the degenerate setcc-only case.
3652 */
3653
3654 rtx tmp;
3655 int nops;
3656
3657 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3658
3659 nops = 0;
	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to do the arithmetic in the proper mode to match.  */
3662 if (diff == 1)
3663 tmp = copy_rtx (out);
3664 else
3665 {
3666 rtx out1;
3667 out1 = copy_rtx (out);
3668 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3669 nops++;
3670 if (diff & 1)
3671 {
3672 tmp = gen_rtx_PLUS (mode, tmp, out1);
3673 nops++;
3674 }
3675 }
3676 if (cf != 0)
3677 {
3678 tmp = plus_constant (mode, tmp, cf);
3679 nops++;
3680 }
3681 if (!rtx_equal_p (tmp, out))
3682 {
3683 if (nops == 1)
3684 out = force_operand (tmp, copy_rtx (out));
3685 else
3686 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3687 }
3688 if (!rtx_equal_p (out, operands[0]))
3689 emit_move_insn (operands[0], copy_rtx (out));
3690
3691 return true;
3692 }
3693
3694 /*
3695 * General case: Jumpful:
3696 * xorl dest,dest cmpl op1, op2
3697 * cmpl op1, op2 movl ct, dest
3698 * setcc dest jcc 1f
3699 * decl dest movl cf, dest
3700 * andl (cf-ct),dest 1:
3701 * addl ct,dest
3702 *
3703 * Size 20. Size 14.
3704 *
   * This is reasonably steep, but branch mispredict costs are
   * high on modern CPUs, so consider failing only if optimizing
   * for space.
3708 */
3709
3710 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3711 && BRANCH_COST (optimize_insn_for_speed_p (),
3712 false) >= 2)
3713 {
3714 if (cf == 0)
3715 {
3716 machine_mode cmp_mode = GET_MODE (op0);
3717 enum rtx_code new_code;
3718
3719 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3720 {
3721 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3722
3723 /* We may be reversing a non-trapping
3724 comparison to a trapping comparison. */
3725 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3726 && code != EQ && code != NE
3727 && code != ORDERED && code != UNORDERED)
3728 new_code = UNKNOWN;
3729 else
3730 new_code = reverse_condition_maybe_unordered (code);
3731
3732 }
3733 else
3734 {
3735 new_code = ix86_reverse_condition (code, cmp_mode);
3736 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3737 compare_code = reverse_condition (compare_code);
3738 }
3739
3740 if (new_code != UNKNOWN)
3741 {
3742 cf = ct;
3743 ct = 0;
3744 code = new_code;
3745 }
3746 }
3747
3748 if (compare_code != UNKNOWN)
3749 {
3750 /* notl op1 (if needed)
3751 sarl $31, op1
3752 andl (cf-ct), op1
3753 addl ct, op1
3754
3755 For x < 0 (resp. x <= -1) there will be no notl,
3756 so if possible swap the constants to get rid of the
3757 complement.
3758 True/false will be -1/0 while code below (store flag
3759 followed by decrement) is 0/-1, so the constants need
3760 to be exchanged once more. */
3761
3762 if (compare_code == GE || !cf)
3763 {
3764 code = reverse_condition (code);
3765 compare_code = LT;
3766 }
3767 else
	    std::swap (ct, cf);
3769
3770 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3771 }
3772 else
3773 {
3774 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3775
3776 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3777 constm1_rtx,
3778 copy_rtx (out), 1, OPTAB_DIRECT);
3779 }
3780
3781 out = expand_simple_binop (mode, AND, copy_rtx (out),
3782 gen_int_mode (cf - ct, mode),
3783 copy_rtx (out), 1, OPTAB_DIRECT);
3784 if (ct)
3785 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3786 copy_rtx (out), 1, OPTAB_DIRECT);
3787 if (!rtx_equal_p (out, operands[0]))
3788 emit_move_insn (operands[0], copy_rtx (out));
3789
3790 return true;
3791 }
3792 }
3793
3794 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3795 {
3796 /* Try a few things more with specific constants and a variable. */
3797
3798 optab op;
3799 rtx var, orig_out, out, tmp;
3800
3801 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3802 return false;
3803
3804 operands[2] = op2;
3805 operands[3] = op3;
3806
3807 /* If one of the two operands is an interesting constant, load a
3808 constant with the above and mask it in with a logical operation. */
3809
3810 if (CONST_INT_P (operands[2]))
3811 {
3812 var = operands[3];
3813 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3814 operands[3] = constm1_rtx, op = and_optab;
3815 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3816 operands[3] = const0_rtx, op = ior_optab;
3817 else
3818 return false;
3819 }
3820 else if (CONST_INT_P (operands[3]))
3821 {
3822 var = operands[2];
3823 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3824 {
3825 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3826 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3827 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
		operands[1] = simplify_gen_relational (LT, VOIDmode,
3829 GET_MODE (op0),
3830 op0, const0_rtx);
3831
3832 operands[2] = constm1_rtx;
3833 op = and_optab;
3834 }
	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3836 operands[2] = const0_rtx, op = ior_optab;
3837 else
3838 return false;
3839 }
3840 else
3841 return false;
3842
3843 orig_out = operands[0];
3844 tmp = gen_reg_rtx (mode);
3845 operands[0] = tmp;
3846
3847 /* Recurse to get the constant loaded. */
3848 if (!ix86_expand_int_movcc (operands))
3849 return false;
3850
3851 /* Mask in the interesting variable. */
3852 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3853 OPTAB_WIDEN);
3854 if (!rtx_equal_p (out, orig_out))
3855 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3856
3857 return true;
3858 }
3859
3860 /*
3861 * For comparison with above,
3862 *
3863 * movl cf,dest
3864 * movl ct,tmp
3865 * cmpl op1,op2
3866 * cmovcc tmp,dest
3867 *
3868 * Size 15.
3869 */
3870
3871 if (! nonimmediate_operand (operands[2], mode))
3872 operands[2] = force_reg (mode, operands[2]);
3873 if (! nonimmediate_operand (operands[3], mode))
3874 operands[3] = force_reg (mode, operands[3]);
3875
3876 if (! register_operand (operands[2], VOIDmode)
3877 && (mode == QImode
3878 || ! register_operand (operands[3], VOIDmode)))
3879 operands[2] = force_reg (mode, operands[2]);
3880
3881 if (mode == QImode
3882 && ! register_operand (operands[3], VOIDmode))
3883 operands[3] = force_reg (mode, operands[3]);
3884
3885 emit_insn (compare_seq);
3886 emit_insn (gen_rtx_SET (operands[0],
3887 gen_rtx_IF_THEN_ELSE (mode,
3888 compare_op, operands[2],
3889 operands[3])));
3890 return true;
3891}
3892
3893/* Detect conditional moves that exactly match min/max operational
3894 semantics. Note that this is IEEE safe, as long as we don't
3895 interchange the operands.
3896
3897 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3898 and TRUE if the operation is successful and instructions are emitted. */
3899
3900static bool
3901ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3902 rtx cmp_op1, rtx if_true, rtx if_false)
3903{
3904 machine_mode mode;
3905 bool is_min;
3906 rtx tmp;
3907
3908 if (code == LT)
3909 ;
3910 else if (code == UNGE)
    std::swap (if_true, if_false);
3912 else
3913 return false;
3914
3915 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3916 is_min = true;
3917 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3918 is_min = false;
3919 else
3920 return false;
3921
3922 mode = GET_MODE (dest);
3923
3924 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3925 but MODE may be a vector mode and thus not appropriate. */
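  /* When NaNs or signed zeros can occur we must keep the operand order:
     the SSE min/max instructions return their second operand when the
     inputs are unordered or compare equal (e.g. -0.0 vs 0.0), so the
     selection is expressed with UNSPEC_IEEE_MIN/MAX rather than the
     commutative SMIN/SMAX.  */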
3926 if (!flag_finite_math_only || flag_signed_zeros)
3927 {
3928 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3929 rtvec v;
3930
3931 if_true = force_reg (mode, if_true);
3932 v = gen_rtvec (2, if_true, if_false);
3933 tmp = gen_rtx_UNSPEC (mode, v, u);
3934 }
3935 else
3936 {
3937 code = is_min ? SMIN : SMAX;
3938 if (MEM_P (if_true) && MEM_P (if_false))
3939 if_true = force_reg (mode, if_true);
3940 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3941 }
3942
3943 emit_insn (gen_rtx_SET (dest, tmp));
3944 return true;
3945}
3946
/* Return true if MODE is valid for vector compare to mask register;
   the same holds for a conditional vector move with a mask register.  */
3949static bool
3950ix86_valid_mask_cmp_mode (machine_mode mode)
3951{
3952 /* XOP has its own vector conditional movement. */
3953 if (TARGET_XOP && !TARGET_AVX512F)
3954 return false;
3955
3956 /* HFmode only supports vcmpsh whose dest is mask register. */
3957 if (TARGET_AVX512FP16 && mode == HFmode)
3958 return true;
3959
3960 /* AVX512F is needed for mask operation. */
3961 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3962 return false;
3963
3964 /* AVX512BW is needed for vector QI/HImode,
3965     AVX512VL is needed for 128/256-bit vectors.  */
3966 machine_mode inner_mode = GET_MODE_INNER (mode);
3967 int vector_size = GET_MODE_SIZE (mode);
3968 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3969 return false;
3970
3971 return (vector_size == 64 && TARGET_EVEX512) || TARGET_AVX512VL;
3972}
3973
3974/* Return true if integer mask comparison should be used. */
3975static bool
3976ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3977 rtx op_true, rtx op_false)
3978{
3979 int vector_size = GET_MODE_SIZE (mode);
3980
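  /* Scalar and vector HFmode compares exist only as mask compares; vectors
     narrower than 16 bytes never use mask compares, while 64-byte vectors
     always do.  */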
3981 if (cmp_mode == HFmode)
3982 return true;
3983 else if (vector_size < 16)
3984 return false;
3985 else if (vector_size == 64)
3986 return true;
3987 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3988 return true;
3989
3990 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3991 gcc_assert (!op_true == !op_false);
3992
3993 /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
3994 vector dest is required. */
3995  if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3996 return false;
3997
3998 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3999 if (op_false == CONST0_RTX (mode)
4000 || op_true == CONST0_RTX (mode)
4001 || (INTEGRAL_MODE_P (mode)
4002 && (op_true == CONSTM1_RTX (mode)
4003 || op_false == CONSTM1_RTX (mode))))
4004 return false;
4005
4006 return true;
4007}
4008
4009/* Expand an SSE comparison. Return the register with the result. */
4010
4011static rtx
4012ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
4013 rtx op_true, rtx op_false)
4014{
4015 machine_mode mode = GET_MODE (dest);
4016 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
4017
4018  /* In the general case the result of a comparison can differ from the operands' type.  */
4019 machine_mode cmp_mode;
4020
4021 /* In AVX512F the result of comparison is an integer mask. */
4022 bool maskcmp = false;
4023 rtx x;
4024
4025  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
4026 {
4027 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
4028 maskcmp = true;
4029      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
4030 }
4031 else
4032 cmp_mode = cmp_ops_mode;
4033
4034 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
4035
4036 bool (*op1_predicate)(rtx, machine_mode)
4037 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
4038
4039 if (!op1_predicate (cmp_op1, cmp_ops_mode))
4040 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
4041
4042 if (optimize
4043 || (maskcmp && cmp_mode != mode)
4044 || (op_true && reg_overlap_mentioned_p (dest, op_true))
4045 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
4046 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
4047
4048 if (maskcmp)
4049 {
4050 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
4051 gcc_assert (ok);
4052 return dest;
4053 }
4054
4055 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
4056
4057 if (cmp_mode != mode)
4058 {
4059 x = force_reg (cmp_ops_mode, x);
4060 convert_move (dest, x, false);
4061 }
4062 else
4063 emit_insn (gen_rtx_SET (dest, x));
4064
4065 return dest;
4066}
4067
4068/* Emit x86 binary operand CODE in mode MODE for SSE vector
4069 instructions that can be performed using GP registers. */
4070
4071static void
4072ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
4073 rtx dst, rtx src1, rtx src2)
4074{
4075 rtx tmp;
4076
4077 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
4078
4079 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
4080 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4081 {
4082 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
4083 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
4084 }
4085
4086 emit_insn (tmp);
4087}
4088
4089/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
4090 operations. This is used for both scalar and vector conditional moves. */
4091
4092void
4093ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
4094{
4095 machine_mode mode = GET_MODE (dest);
4096 machine_mode cmpmode = GET_MODE (cmp);
4097 rtx x;
4098
4099 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
4100 if (rtx_equal_p (op_true, op_false))
4101 {
4102 emit_move_insn (dest, op_true);
4103 return;
4104 }
4105
4106 /* If we have an integer mask and FP value then we need
4107 to cast mask to FP mode. */
4108 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
4109 {
4110 cmp = force_reg (cmpmode, cmp);
4111 cmp = gen_rtx_SUBREG (mode, cmp, 0);
4112 }
4113
4114 /* In AVX512F the result of comparison is an integer mask. */
4115 if (mode != cmpmode
4116 && GET_MODE_CLASS (cmpmode) == MODE_INT)
4117 {
4118 gcc_assert (ix86_valid_mask_cmp_mode (mode));
4119 /* Using scalar/vector move with mask register. */
4120 cmp = force_reg (cmpmode, cmp);
4121 /* Optimize for mask zero. */
4122 op_true = (op_true != CONST0_RTX (mode)
4123 ? force_reg (mode, op_true) : op_true);
4124 op_false = (op_false != CONST0_RTX (mode)
4125 ? force_reg (mode, op_false) : op_false);
4126 if (op_true == CONST0_RTX (mode))
4127 {
4128 if (cmpmode == E_DImode && !TARGET_64BIT)
4129 {
4130 x = gen_reg_rtx (cmpmode);
4131 emit_insn (gen_knotdi (x, cmp));
4132 }
4133 else
4134 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
4135 cmp = x;
4136	  /* Swap op_true and op_false.  */
4137	  std::swap (op_true, op_false);
4138 }
4139
4140 if (mode == HFmode)
4141 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
4142 else
4143 emit_insn (gen_rtx_SET (dest,
4144 gen_rtx_VEC_MERGE (mode,
4145 op_true, op_false, cmp)));
4146 return;
4147 }
4148
4149 if (vector_all_ones_operand (op_true, mode)
4150 && op_false == CONST0_RTX (mode))
4151 {
4152 emit_move_insn (dest, cmp);
4153 return;
4154 }
4155 else if (op_false == CONST0_RTX (mode))
4156 {
4157 x = expand_simple_binop (mode, AND, cmp, op_true,
4158 dest, 1, OPTAB_DIRECT);
4159 if (x != dest)
4160 emit_move_insn (dest, x);
4161 return;
4162 }
4163 else if (op_true == CONST0_RTX (mode))
4164 {
4165 op_false = force_reg (mode, op_false);
4166 x = gen_rtx_NOT (mode, cmp);
4167      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
4168 return;
4169 }
4170 else if (vector_all_ones_operand (op_true, mode))
4171 {
4172 x = expand_simple_binop (mode, IOR, cmp, op_false,
4173 dest, 1, OPTAB_DIRECT);
4174 if (x != dest)
4175 emit_move_insn (dest, x);
4176 return;
4177 }
4178
4179 if (TARGET_XOP)
4180 {
4181 op_true = force_reg (mode, op_true);
4182
4183 if (GET_MODE_SIZE (mode) < 16
4184 || !nonimmediate_operand (op_false, mode))
4185 op_false = force_reg (mode, op_false);
4186
4187 emit_insn (gen_rtx_SET (dest,
4188 gen_rtx_IF_THEN_ELSE (mode, cmp,
4189 op_true, op_false)));
4190 return;
4191 }
4192
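  /* Otherwise pick a variable blend instruction keyed on the mode: the
     SSE4.1 [p]blendv* forms, their AVX/AVX2 256-bit counterparts, or the
     AVX-512 blendm* patterns; if none is available, fall back to the generic
     (cmp & op_true) | (~cmp & op_false) sequence at the end.  */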
4193 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4194 machine_mode blend_mode = mode;
4195
4196 if (GET_MODE_SIZE (mode) < 16
4197 || !vector_operand (op_true, mode))
4198 op_true = force_reg (mode, op_true);
4199
4200 op_false = force_reg (mode, op_false);
4201
4202 switch (mode)
4203 {
4204 case E_V2SFmode:
4205 if (TARGET_SSE4_1)
4206 gen = gen_mmx_blendvps;
4207 break;
4208 case E_V4SFmode:
4209 if (TARGET_SSE4_1)
4210 gen = gen_sse4_1_blendvps;
4211 break;
4212 case E_V2DFmode:
4213 if (TARGET_SSE4_1)
4214 gen = gen_sse4_1_blendvpd;
4215 break;
4216 case E_SFmode:
4217 if (TARGET_SSE4_1)
4218 gen = gen_sse4_1_blendvss;
4219 break;
4220 case E_DFmode:
4221 if (TARGET_SSE4_1)
4222 gen = gen_sse4_1_blendvsd;
4223 break;
4224 case E_V8QImode:
4225 case E_V4HImode:
4226 case E_V4HFmode:
4227 case E_V4BFmode:
4228 case E_V2SImode:
4229 if (TARGET_SSE4_1)
4230 {
4231 gen = gen_mmx_pblendvb_v8qi;
4232 blend_mode = V8QImode;
4233 }
4234 break;
4235 case E_V4QImode:
4236 case E_V2HImode:
4237 case E_V2HFmode:
4238 case E_V2BFmode:
4239 if (TARGET_SSE4_1)
4240 {
4241 gen = gen_mmx_pblendvb_v4qi;
4242 blend_mode = V4QImode;
4243 }
4244 break;
4245 case E_V2QImode:
4246 if (TARGET_SSE4_1)
4247 gen = gen_mmx_pblendvb_v2qi;
4248 break;
4249 case E_V16QImode:
4250 case E_V8HImode:
4251 case E_V8HFmode:
4252 case E_V8BFmode:
4253 case E_V4SImode:
4254 case E_V2DImode:
4255 case E_V1TImode:
4256 if (TARGET_SSE4_1)
4257 {
4258 gen = gen_sse4_1_pblendvb;
4259 blend_mode = V16QImode;
4260 }
4261 break;
4262 case E_V8SFmode:
4263 if (TARGET_AVX)
4264 gen = gen_avx_blendvps256;
4265 break;
4266 case E_V4DFmode:
4267 if (TARGET_AVX)
4268 gen = gen_avx_blendvpd256;
4269 break;
4270 case E_V32QImode:
4271 case E_V16HImode:
4272 case E_V16HFmode:
4273 case E_V16BFmode:
4274 case E_V8SImode:
4275 case E_V4DImode:
4276 if (TARGET_AVX2)
4277 {
4278 gen = gen_avx2_pblendvb;
4279 blend_mode = V32QImode;
4280 }
4281 break;
4282
4283 case E_V64QImode:
4284 gen = gen_avx512bw_blendmv64qi;
4285 break;
4286 case E_V32HImode:
4287 gen = gen_avx512bw_blendmv32hi;
4288 break;
4289 case E_V32HFmode:
4290 gen = gen_avx512bw_blendmv32hf;
4291 break;
4292 case E_V32BFmode:
4293 gen = gen_avx512bw_blendmv32bf;
4294 break;
4295 case E_V16SImode:
4296 gen = gen_avx512f_blendmv16si;
4297 break;
4298 case E_V8DImode:
4299 gen = gen_avx512f_blendmv8di;
4300 break;
4301 case E_V8DFmode:
4302 gen = gen_avx512f_blendmv8df;
4303 break;
4304 case E_V16SFmode:
4305 gen = gen_avx512f_blendmv16sf;
4306 break;
4307
4308 default:
4309 break;
4310 }
4311
4312 if (gen != NULL)
4313 {
4314 if (blend_mode == mode)
4315 x = dest;
4316 else
4317 {
4318 x = gen_reg_rtx (blend_mode);
4319 op_false = gen_lowpart (blend_mode, op_false);
4320 op_true = gen_lowpart (blend_mode, op_true);
4321 cmp = gen_lowpart (blend_mode, cmp);
4322 }
4323
4324 emit_insn (gen (x, op_false, op_true, cmp));
4325
4326 if (x != dest)
4327 emit_move_insn (dest, gen_lowpart (mode, x));
4328 }
4329 else
4330 {
4331 rtx t2, t3;
4332
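      /* No blend instruction available for this mode: compute
	 dest = (op_true & cmp) | (op_false & ~cmp) by hand.  */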
4333 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4334 NULL, 1, OPTAB_DIRECT);
4335
4336 t3 = gen_reg_rtx (mode);
4337 x = gen_rtx_NOT (mode, cmp);
4338      ix86_emit_vec_binop (AND, mode, t3, x, op_false);
4339
4340 x = expand_simple_binop (mode, IOR, t3, t2,
4341 dest, 1, OPTAB_DIRECT);
4342 if (x != dest)
4343 emit_move_insn (dest, x);
4344 }
4345}
4346
4347/* Swap, force into registers, or otherwise massage the two operands
4348 to an sse comparison with a mask result. Thus we differ a bit from
4349 ix86_prepare_fp_compare_args which expects to produce a flags result.
4350
4351 The DEST operand exists to help determine whether to commute commutative
4352 operators. The POP0/POP1 operands are updated in place. The new
4353 comparison code is returned, or UNKNOWN if not implementable. */
4354
4355static enum rtx_code
4356ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4357 rtx *pop0, rtx *pop1)
4358{
4359 switch (code)
4360 {
4361 case LTGT:
4362 case UNEQ:
4363 /* AVX supports all the needed comparisons. */
4364 if (TARGET_AVX)
4365 break;
4366 /* We have no LTGT as an operator. We could implement it with
4367 NE & ORDERED, but this requires an extra temporary. It's
4368 not clear that it's worth it. */
4369 return UNKNOWN;
4370
4371 case LT:
4372 case LE:
4373 case UNGT:
4374 case UNGE:
4375 /* These are supported directly. */
4376 break;
4377
4378 case EQ:
4379 case NE:
4380 case UNORDERED:
4381 case ORDERED:
4382 /* AVX has 3 operand comparisons, no need to swap anything. */
4383 if (TARGET_AVX)
4384 break;
4385 /* For commutative operators, try to canonicalize the destination
4386 operand to be first in the comparison - this helps reload to
4387 avoid extra moves. */
4388 if (!dest || !rtx_equal_p (dest, *pop1))
4389 break;
4390 /* FALLTHRU */
4391
4392 case GE:
4393 case GT:
4394 case UNLE:
4395 case UNLT:
4396 /* These are not supported directly before AVX, and furthermore
4397 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4398 comparison operands to transform into something that is
4399 supported. */
4400      std::swap (*pop0, *pop1);
4401 code = swap_condition (code);
4402 break;
4403
4404 default:
4405 gcc_unreachable ();
4406 }
4407
4408 return code;
4409}
4410
4411/* Expand a floating-point conditional move. Return true if successful. */
4412
4413bool
4414ix86_expand_fp_movcc (rtx operands[])
4415{
4416 machine_mode mode = GET_MODE (operands[0]);
4417 enum rtx_code code = GET_CODE (operands[1]);
4418 rtx tmp, compare_op;
4419 rtx op0 = XEXP (operands[1], 0);
4420 rtx op1 = XEXP (operands[1], 1);
4421
4422 if (GET_MODE (op0) == BFmode
4423 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
4424 return false;
4425
4426 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4427 {
4428 machine_mode cmode;
4429
4430 /* Since we've no cmove for sse registers, don't force bad register
4431 allocation just to gain access to it. Deny movcc when the
4432 comparison mode doesn't match the move mode. */
4433 cmode = GET_MODE (op0);
4434 if (cmode == VOIDmode)
4435 cmode = GET_MODE (op1);
4436 if (cmode != mode)
4437 return false;
4438
4439      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4440 if (code == UNKNOWN)
4441 return false;
4442
4443      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4444				     operands[2], operands[3]))
4445 return true;
4446
4447      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4448				 operands[2], operands[3]);
4449      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4450 return true;
4451 }
4452
4453 if (GET_MODE (op0) == TImode
4454 || (GET_MODE (op0) == DImode
4455 && !TARGET_64BIT))
4456 return false;
4457
4458 /* The floating point conditional move instructions don't directly
4459 support conditions resulting from a signed integer comparison. */
4460
4461 compare_op = ix86_expand_compare (code, op0, op1);
4462 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4463 {
4464 tmp = gen_reg_rtx (QImode);
4465      ix86_expand_setcc (tmp, code, op0, op1);
4466
4467      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4468 }
4469
4470 emit_insn (gen_rtx_SET (operands[0],
4471 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4472 operands[2], operands[3])));
4473
4474 return true;
4475}
4476
4477/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4478
4479static int
4480ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4481{
4482 switch (code)
4483 {
4484 case EQ:
4485 return 0;
4486 case LT:
4487 case LTU:
4488 return 1;
4489 case LE:
4490 case LEU:
4491 return 2;
4492 case NE:
4493 return 4;
4494 case GE:
4495 case GEU:
4496 return 5;
4497 case GT:
4498 case GTU:
4499 return 6;
4500 default:
4501 gcc_unreachable ();
4502 }
4503}
4504
4505/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4506
4507static int
4508ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4509{
4510 switch (code)
4511 {
4512 case EQ:
4513 return 0x00;
4514 case NE:
4515 return 0x04;
4516 case GT:
4517 return 0x0e;
4518 case LE:
4519 return 0x02;
4520 case GE:
4521 return 0x0d;
4522 case LT:
4523 return 0x01;
4524 case UNLE:
4525 return 0x0a;
4526 case UNLT:
4527 return 0x09;
4528 case UNGE:
4529 return 0x05;
4530 case UNGT:
4531 return 0x06;
4532 case UNEQ:
4533 return 0x18;
4534 case LTGT:
4535 return 0x0c;
4536 case ORDERED:
4537 return 0x07;
4538 case UNORDERED:
4539 return 0x03;
4540 default:
4541 gcc_unreachable ();
4542 }
4543}
4544
4545/* Return immediate value to be used in UNSPEC_PCMP
4546 for comparison CODE in MODE. */
4547
4548static int
4549ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4550{
4551 if (FLOAT_MODE_P (mode))
4552 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4553 return ix86_int_cmp_code_to_pcmp_immediate (code);
4554}
4555
4556/* Expand AVX-512 vector comparison. */
4557
4558bool
4559ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4560{
4561 machine_mode mask_mode = GET_MODE (dest);
4562 machine_mode cmp_mode = GET_MODE (cmp_op0);
4563 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4564 int unspec_code;
4565 rtx unspec;
4566
4567 switch (code)
4568 {
4569 case LEU:
4570 case GTU:
4571 case GEU:
4572 case LTU:
4573 unspec_code = UNSPEC_UNSIGNED_PCMP;
4574 break;
4575
4576 default:
4577 unspec_code = UNSPEC_PCMP;
4578 }
4579
4580 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4581 unspec_code);
4582 emit_insn (gen_rtx_SET (dest, unspec));
4583
4584 return true;
4585}
4586
4587/* Expand fp vector comparison. */
4588
4589bool
4590ix86_expand_fp_vec_cmp (rtx operands[])
4591{
4592 enum rtx_code code = GET_CODE (operands[1]);
4593 rtx cmp;
4594
4595  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4596					    &operands[2], &operands[3]);
4597 if (code == UNKNOWN)
4598 {
4599 rtx temp;
4600 switch (GET_CODE (operands[1]))
4601 {
4602 case LTGT:
4603	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4604				      operands[3], NULL, NULL);
4605	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4606				     operands[3], NULL, NULL);
4607 code = AND;
4608 break;
4609 case UNEQ:
4610	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4611				      operands[3], NULL, NULL);
4612	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4613				     operands[3], NULL, NULL);
4614 code = IOR;
4615 break;
4616 default:
4617 gcc_unreachable ();
4618 }
4619 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4620 OPTAB_DIRECT);
4621 }
4622 else
4623    cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4624 NULL, NULL);
4625
4626 if (operands[0] != cmp)
4627 emit_move_insn (operands[0], cmp);
4628
4629 return true;
4630}
4631
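/* Helper for ix86_expand_int_vec_cmp and ix86_expand_int_vcond.  Emit an
   integer vector comparison of COP0 and COP1 with rtx code CODE, producing
   a result in the mode of DEST that can select between OP_TRUE and OP_FALSE
   (either may be NULL).  Because SSE only implements EQ/GT/GTU directly,
   the comparison may be canonicalized; when the canonical form computes the
   logical inverse, *NEGATE is set and the caller must swap the values being
   selected.  Return the comparison result, or NULL_RTX if the comparison
   cannot be expanded for this mode.  */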
4632static rtx
4633ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4634 rtx op_true, rtx op_false, bool *negate)
4635{
4636 machine_mode data_mode = GET_MODE (dest);
4637 machine_mode mode = GET_MODE (cop0);
4638 rtx x;
4639
4640 *negate = false;
4641
4642 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4643 if (TARGET_XOP
4644 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4645 && GET_MODE_SIZE (mode) <= 16)
4646 ;
4647  /* AVX512F supports all of the comparisons
4648     on all 128/256/512-bit vector int types.  */
4649  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4650 ;
4651 else
4652 {
4653 /* Canonicalize the comparison to EQ, GT, GTU. */
4654 switch (code)
4655 {
4656 case EQ:
4657 case GT:
4658 case GTU:
4659 break;
4660
4661 case LE:
4662 case LEU:
4663 /* x <= cst can be handled as x < cst + 1 unless there is
4664 wrap around in cst + 1. */
4665 if (GET_CODE (cop1) == CONST_VECTOR
4666 && GET_MODE_INNER (mode) != TImode)
4667 {
4668 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4669 machine_mode eltmode = GET_MODE_INNER (mode);
4670 for (i = 0; i < n_elts; ++i)
4671 {
4672 rtx elt = CONST_VECTOR_ELT (cop1, i);
4673 if (!CONST_INT_P (elt))
4674 break;
4675		  if (code == LE)
4676 {
4677 /* For LE punt if some element is signed maximum. */
4678 if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4679 == (GET_MODE_MASK (eltmode) >> 1))
4680 break;
4681 }
4682 /* For LEU punt if some element is unsigned maximum. */
4683 else if (elt == constm1_rtx)
4684 break;
4685 }
4686 if (i == n_elts)
4687 {
4688 rtvec v = rtvec_alloc (n_elts);
4689 for (i = 0; i < n_elts; ++i)
4690 RTVEC_ELT (v, i)
4691 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
4692 eltmode);
4693 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4694	      std::swap (cop0, cop1);
4695 code = code == LE ? GT : GTU;
4696 break;
4697 }
4698 }
4699 /* FALLTHRU */
4700 case NE:
4701 code = reverse_condition (code);
4702 *negate = true;
4703 break;
4704
4705 case GE:
4706 case GEU:
4707 /* x >= cst can be handled as x > cst - 1 unless there is
4708 wrap around in cst - 1. */
4709 if (GET_CODE (cop1) == CONST_VECTOR
4710 && GET_MODE_INNER (mode) != TImode)
4711 {
4712 unsigned int n_elts = GET_MODE_NUNITS (mode), i;
4713 machine_mode eltmode = GET_MODE_INNER (mode);
4714 for (i = 0; i < n_elts; ++i)
4715 {
4716 rtx elt = CONST_VECTOR_ELT (cop1, i);
4717 if (!CONST_INT_P (elt))
4718 break;
4719 if (code == GE)
4720 {
4721 /* For GE punt if some element is signed minimum. */
4722 if (INTVAL (elt) < 0
4723 && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
4724 == 0))
4725 break;
4726 }
4727 /* For GEU punt if some element is zero. */
4728 else if (elt == const0_rtx)
4729 break;
4730 }
4731 if (i == n_elts)
4732 {
4733 rtvec v = rtvec_alloc (n_elts);
4734 for (i = 0; i < n_elts; ++i)
4735 RTVEC_ELT (v, i)
4736 = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
4737 eltmode);
4738 cop1 = gen_rtx_CONST_VECTOR (mode, v);
4739 code = code == GE ? GT : GTU;
4740 break;
4741 }
4742 }
4743 code = reverse_condition (code);
4744 *negate = true;
4745 /* FALLTHRU */
4746
4747 case LT:
4748 case LTU:
4749	  std::swap (cop0, cop1);
4750 code = swap_condition (code);
4751 break;
4752
4753 default:
4754 gcc_unreachable ();
4755 }
4756
4757 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4758 if (mode == V2DImode)
4759 {
4760 switch (code)
4761 {
4762 case EQ:
4763 /* SSE4.1 supports EQ. */
4764 if (!TARGET_SSE4_1)
4765 return NULL;
4766 break;
4767
4768 case GT:
4769 case GTU:
4770 /* SSE4.2 supports GT/GTU. */
4771 if (!TARGET_SSE4_2)
4772 return NULL;
4773 break;
4774
4775 default:
4776 gcc_unreachable ();
4777 }
4778 }
4779
4780 if (GET_CODE (cop0) == CONST_VECTOR)
4781 cop0 = force_reg (mode, cop0);
4782 else if (GET_CODE (cop1) == CONST_VECTOR)
4783 cop1 = force_reg (mode, cop1);
4784
4785 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4786 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4787 if (*negate)
4788	std::swap (optrue, opfalse);
4789
4790 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4791 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4792 min (x, y) == x). While we add one instruction (the minimum),
4793 we remove the need for two instructions in the negation, as the
4794 result is done this way.
4795 When using masks, do it for SI/DImode element types, as it is shorter
4796 than the two subtractions. */
4797 if ((code != EQ
4798 && GET_MODE_SIZE (mode) != 64
4799 && vector_all_ones_operand (opfalse, data_mode)
4800 && optrue == CONST0_RTX (data_mode))
4801 || (code == GTU
4802 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4803 /* Don't do it if not using integer masks and we'd end up with
4804 the right values in the registers though. */
4805 && ((GET_MODE_SIZE (mode) == 64 && TARGET_EVEX512)
4806 || !vector_all_ones_operand (optrue, data_mode)
4807 || opfalse != CONST0_RTX (data_mode))))
4808 {
4809 rtx (*gen) (rtx, rtx, rtx) = NULL;
4810
4811 switch (mode)
4812 {
4813 case E_V16SImode:
4814 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4815 break;
4816 case E_V8DImode:
4817 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4818 cop0 = force_reg (mode, cop0);
4819 cop1 = force_reg (mode, cop1);
4820 break;
4821 case E_V32QImode:
4822 if (TARGET_AVX2)
4823 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4824 break;
4825 case E_V16HImode:
4826 if (TARGET_AVX2)
4827 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4828 break;
4829 case E_V8SImode:
4830 if (TARGET_AVX2)
4831 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4832 break;
4833 case E_V4DImode:
4834 if (TARGET_AVX512VL)
4835 {
4836 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4837 cop0 = force_reg (mode, cop0);
4838 cop1 = force_reg (mode, cop1);
4839 }
4840 break;
4841 case E_V16QImode:
4842 if (code == GTU && TARGET_SSE2)
4843 gen = gen_uminv16qi3;
4844 else if (code == GT && TARGET_SSE4_1)
4845 gen = gen_sminv16qi3;
4846 break;
4847 case E_V8QImode:
4848 if (code == GTU && TARGET_SSE2)
4849 gen = gen_uminv8qi3;
4850 else if (code == GT && TARGET_SSE4_1)
4851 gen = gen_sminv8qi3;
4852 break;
4853 case E_V4QImode:
4854 if (code == GTU && TARGET_SSE2)
4855 gen = gen_uminv4qi3;
4856 else if (code == GT && TARGET_SSE4_1)
4857 gen = gen_sminv4qi3;
4858 break;
4859 case E_V2QImode:
4860 if (code == GTU && TARGET_SSE2)
4861 gen = gen_uminv2qi3;
4862 else if (code == GT && TARGET_SSE4_1)
4863 gen = gen_sminv2qi3;
4864 break;
4865 case E_V8HImode:
4866 if (code == GTU && TARGET_SSE4_1)
4867 gen = gen_uminv8hi3;
4868 else if (code == GT && TARGET_SSE2)
4869 gen = gen_sminv8hi3;
4870 break;
4871 case E_V4HImode:
4872 if (code == GTU && TARGET_SSE4_1)
4873 gen = gen_uminv4hi3;
4874 else if (code == GT && TARGET_SSE2)
4875 gen = gen_sminv4hi3;
4876 break;
4877 case E_V2HImode:
4878 if (code == GTU && TARGET_SSE4_1)
4879 gen = gen_uminv2hi3;
4880 else if (code == GT && TARGET_SSE2)
4881 gen = gen_sminv2hi3;
4882 break;
4883 case E_V4SImode:
4884 if (TARGET_SSE4_1)
4885 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4886 break;
4887 case E_V2SImode:
4888 if (TARGET_SSE4_1)
4889 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4890 break;
4891 case E_V2DImode:
4892 if (TARGET_AVX512VL)
4893 {
4894 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4895 cop0 = force_reg (mode, cop0);
4896 cop1 = force_reg (mode, cop1);
4897 }
4898 break;
4899 default:
4900 break;
4901 }
4902
4903 if (gen)
4904 {
4905 rtx tem = gen_reg_rtx (mode);
4906 if (!vector_operand (cop0, mode))
4907 cop0 = force_reg (mode, cop0);
4908 if (!vector_operand (cop1, mode))
4909 cop1 = force_reg (mode, cop1);
4910 *negate = !*negate;
4911 emit_insn (gen (tem, cop0, cop1));
4912 cop1 = tem;
4913 code = EQ;
4914 }
4915 }
4916
4917 /* Unsigned parallel compare is not supported by the hardware.
4918 Play some tricks to turn this into a signed comparison
4919 against 0. */
4920 if (code == GTU)
4921 {
4922 cop0 = force_reg (mode, cop0);
4923
4924 switch (mode)
4925 {
4926 case E_V16SImode:
4927 case E_V8DImode:
4928 case E_V8SImode:
4929 case E_V4DImode:
4930 case E_V4SImode:
4931 case E_V2SImode:
4932 case E_V2DImode:
4933 {
4934 rtx t1, t2, mask;
4935
4936 /* Subtract (-(INT MAX) - 1) from both operands to make
4937 them signed. */
4938 mask = ix86_build_signbit_mask (mode, true, false);
4939 t1 = gen_reg_rtx (mode);
4940 emit_insn (gen_sub3_insn (t1, cop0, mask));
4941
4942 t2 = gen_reg_rtx (mode);
4943 emit_insn (gen_sub3_insn (t2, cop1, mask));
4944
4945 cop0 = t1;
4946 cop1 = t2;
4947 code = GT;
4948 }
4949 break;
4950
4951 case E_V64QImode:
4952 case E_V32HImode:
4953 case E_V32QImode:
4954 case E_V16HImode:
4955 case E_V16QImode:
4956 case E_V8QImode:
4957 case E_V4QImode:
4958 case E_V2QImode:
4959 case E_V8HImode:
4960 case E_V4HImode:
4961 case E_V2HImode:
4962 /* Perform a parallel unsigned saturating subtraction. */
4963 x = gen_reg_rtx (mode);
4964 emit_insn (gen_rtx_SET
4965 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4966 cop0 = x;
4967 cop1 = CONST0_RTX (mode);
4968 code = EQ;
4969 *negate = !*negate;
4970 break;
4971
4972 default:
4973 gcc_unreachable ();
4974 }
4975 }
4976 }
4977
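  /* If the canonicalization above inverted the sense of the comparison,
     swap the values to be selected so the final result is unchanged.  */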
4978 if (*negate)
4979    std::swap (op_true, op_false);
4980
4981 if (GET_CODE (cop1) == CONST_VECTOR)
4982 cop1 = force_reg (mode, cop1);
4983
4984 /* Allow the comparison to be done in one mode, but the movcc to
4985 happen in another mode. */
4986 if (data_mode == mode)
4987    x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
4988 else
4989 {
4990 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4991      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4992 op_true, op_false);
4993 if (GET_MODE (x) == mode)
4994 x = gen_lowpart (data_mode, x);
4995 }
4996
4997 return x;
4998}
4999
5000/* Expand integer vector comparison. */
5001
5002bool
5003ix86_expand_int_vec_cmp (rtx operands[])
5004{
5005 rtx_code code = GET_CODE (operands[1]);
5006 bool negate = false;
5007  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
5008				     operands[3], NULL, NULL, &negate);
5009
5010 if (!cmp)
5011 return false;
5012
5013 if (negate)
5014    cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
5015				   CONST0_RTX (GET_MODE (cmp)),
5016				   NULL, NULL, &negate);
5017
5018 gcc_assert (!negate);
5019
5020 if (operands[0] != cmp)
5021 emit_move_insn (operands[0], cmp);
5022
5023 return true;
5024}
5025
5026/* Expand a floating-point vector conditional move; a vcond operation
5027 rather than a movcc operation. */
5028
5029bool
5030ix86_expand_fp_vcond (rtx operands[])
5031{
5032 enum rtx_code code = GET_CODE (operands[3]);
5033 rtx cmp;
5034
5035  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
5036					    &operands[4], &operands[5]);
5037 if (code == UNKNOWN)
5038 {
5039 rtx temp;
5040 switch (GET_CODE (operands[3]))
5041 {
5042 case LTGT:
5043	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
5044				      operands[5], operands[0], operands[0]);
5045	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
5046				     operands[5], operands[1], operands[2]);
5047 code = AND;
5048 break;
5049 case UNEQ:
5050	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
5051				      operands[5], operands[0], operands[0]);
5052	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
5053				     operands[5], operands[1], operands[2]);
5054 code = IOR;
5055 break;
5056 default:
5057 gcc_unreachable ();
5058 }
5059 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
5060 OPTAB_DIRECT);
5061      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5062 return true;
5063 }
5064
5065  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
5066				 operands[5], operands[1], operands[2]))
5067 return true;
5068
5069  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
5070			     operands[1], operands[2]);
5071  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5072 return true;
5073}
5074
5075/* Expand a signed/unsigned integral vector conditional move. */
5076
5077bool
5078ix86_expand_int_vcond (rtx operands[])
5079{
5080 machine_mode data_mode = GET_MODE (operands[0]);
5081 machine_mode mode = GET_MODE (operands[4]);
5082 enum rtx_code code = GET_CODE (operands[3]);
5083 bool negate = false;
5084 rtx x, cop0, cop1;
5085
5086 cop0 = operands[4];
5087 cop1 = operands[5];
5088
5089 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
5090 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
5091 if ((code == LT || code == GE)
5092 && data_mode == mode
5093 && cop1 == CONST0_RTX (mode)
5094 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
5095 && GET_MODE_UNIT_SIZE (data_mode) > 1
5096 && GET_MODE_UNIT_SIZE (data_mode) <= 8
5097 && (GET_MODE_SIZE (data_mode) == 16
5098 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
5099 {
5100 rtx negop = operands[2 - (code == LT)];
5101 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
5102 if (negop == CONST1_RTX (data_mode))
5103 {
5104 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
5105 operands[0], 1, OPTAB_DIRECT);
5106 if (res != operands[0])
5107 emit_move_insn (operands[0], res);
5108 return true;
5109 }
5110 else if (GET_MODE_INNER (data_mode) != DImode
5111 && vector_all_ones_operand (negop, data_mode))
5112 {
5113 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
5114 operands[0], 0, OPTAB_DIRECT);
5115 if (res != operands[0])
5116 emit_move_insn (operands[0], res);
5117 return true;
5118 }
5119 }
5120
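  /* No shift shortcut: legitimize the operands and expand a full vector
     compare followed by a conditional move.  */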
5121 if (!nonimmediate_operand (cop1, mode))
5122 cop1 = force_reg (mode, cop1);
5123 if (!general_operand (operands[1], data_mode))
5124 operands[1] = force_reg (data_mode, operands[1]);
5125 if (!general_operand (operands[2], data_mode))
5126 operands[2] = force_reg (data_mode, operands[2]);
5127
5128  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
5129			       operands[1], operands[2], &negate);
5130
5131 if (!x)
5132 return false;
5133
5134  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
5135			 operands[2-negate]);
5136 return true;
5137}
5138
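/* Try to expand a two-source permutation with a variable selector using an
   AVX-512 vpermt2 instruction.  The arguments come either from D (for the
   constant permutation expander) or from TARGET/MASK/OP0/OP1 (for the
   variable case).  Return true if a suitable instruction exists for the
   mode (and emit it unless D->testing_p), false otherwise.  */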
5139static bool
5140ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
5141 struct expand_vec_perm_d *d)
5142{
5143 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5144 expander, so args are either in d, or in op0, op1 etc. */
5145 machine_mode mode = GET_MODE (d ? d->op0 : op0);
5146 machine_mode maskmode = mode;
5147 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
5148
5149 switch (mode)
5150 {
5151 case E_V16QImode:
5152 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5153 gen = gen_avx512vl_vpermt2varv16qi3;
5154 break;
5155 case E_V32QImode:
5156 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5157 gen = gen_avx512vl_vpermt2varv32qi3;
5158 break;
5159 case E_V64QImode:
5160 if (TARGET_AVX512VBMI)
5161 gen = gen_avx512bw_vpermt2varv64qi3;
5162 break;
5163 case E_V8HImode:
5164 if (TARGET_AVX512VL && TARGET_AVX512BW)
5165 gen = gen_avx512vl_vpermt2varv8hi3;
5166 break;
5167 case E_V16HImode:
5168 if (TARGET_AVX512VL && TARGET_AVX512BW)
5169 gen = gen_avx512vl_vpermt2varv16hi3;
5170 break;
5171 case E_V32HImode:
5172 if (TARGET_AVX512BW)
5173 gen = gen_avx512bw_vpermt2varv32hi3;
5174 break;
5175 case E_V4SImode:
5176 if (TARGET_AVX512VL)
5177 gen = gen_avx512vl_vpermt2varv4si3;
5178 break;
5179 case E_V8SImode:
5180 if (TARGET_AVX512VL)
5181 gen = gen_avx512vl_vpermt2varv8si3;
5182 break;
5183 case E_V16SImode:
5184 if (TARGET_AVX512F)
5185 gen = gen_avx512f_vpermt2varv16si3;
5186 break;
5187 case E_V4SFmode:
5188 if (TARGET_AVX512VL)
5189 {
5190 gen = gen_avx512vl_vpermt2varv4sf3;
5191 maskmode = V4SImode;
5192 }
5193 break;
5194 case E_V8SFmode:
5195 if (TARGET_AVX512VL)
5196 {
5197 gen = gen_avx512vl_vpermt2varv8sf3;
5198 maskmode = V8SImode;
5199 }
5200 break;
5201 case E_V16SFmode:
5202 if (TARGET_AVX512F)
5203 {
5204 gen = gen_avx512f_vpermt2varv16sf3;
5205 maskmode = V16SImode;
5206 }
5207 break;
5208 case E_V2DImode:
5209 if (TARGET_AVX512VL)
5210 gen = gen_avx512vl_vpermt2varv2di3;
5211 break;
5212 case E_V4DImode:
5213 if (TARGET_AVX512VL)
5214 gen = gen_avx512vl_vpermt2varv4di3;
5215 break;
5216 case E_V8DImode:
5217 if (TARGET_AVX512F)
5218 gen = gen_avx512f_vpermt2varv8di3;
5219 break;
5220 case E_V2DFmode:
5221 if (TARGET_AVX512VL)
5222 {
5223 gen = gen_avx512vl_vpermt2varv2df3;
5224 maskmode = V2DImode;
5225 }
5226 break;
5227 case E_V4DFmode:
5228 if (TARGET_AVX512VL)
5229 {
5230 gen = gen_avx512vl_vpermt2varv4df3;
5231 maskmode = V4DImode;
5232 }
5233 break;
5234 case E_V8DFmode:
5235 if (TARGET_AVX512F)
5236 {
5237 gen = gen_avx512f_vpermt2varv8df3;
5238 maskmode = V8DImode;
5239 }
5240 break;
5241 default:
5242 break;
5243 }
5244
5245 if (gen == NULL)
5246 return false;
5247
5248 if (d && d->testing_p)
5249 return true;
5250
5251 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5252 expander, so args are either in d, or in op0, op1 etc. */
5253 if (d)
5254 {
5255 rtx vec[64];
5256 target = d->target;
5257 op0 = d->op0;
5258 op1 = d->op1;
5259 for (int i = 0; i < d->nelt; ++i)
5260 vec[i] = GEN_INT (d->perm[i]);
5261 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
5262 }
5263
5264 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5265 return true;
5266}
5267
5268/* Expand a variable vector permutation. */
5269
5270void
5271ix86_expand_vec_perm (rtx operands[])
5272{
5273 rtx target = operands[0];
5274 rtx op0 = operands[1];
5275 rtx op1 = operands[2];
5276 rtx mask = operands[3];
5277 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5278 machine_mode mode = GET_MODE (op0);
5279 machine_mode maskmode = GET_MODE (mask);
5280 int w, e, i;
5281 bool one_operand_shuffle = rtx_equal_p (op0, op1);
5282
5283 /* Number of elements in the vector. */
5284 w = GET_MODE_NUNITS (mode);
5285 e = GET_MODE_UNIT_SIZE (mode);
5286 gcc_assert (w <= 64);
5287
5288 /* For HF mode vector, convert it to HI using subreg. */
5289 if (GET_MODE_INNER (mode) == HFmode)
5290 {
5291 machine_mode orig_mode = mode;
5292 mode = mode_for_vector (HImode, w).require ();
5293      target = lowpart_subreg (mode, target, orig_mode);
5294      op0 = lowpart_subreg (mode, op0, orig_mode);
5295      op1 = lowpart_subreg (mode, op1, orig_mode);
5296 }
5297
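  /* A one-operand shuffle of a 512-bit vector can be done with a single
     vperm{d,q,ps,pd} (permvar) instruction for the modes handled below.  */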
5298 if (TARGET_AVX512F && one_operand_shuffle)
5299 {
5300 rtx (*gen) (rtx, rtx, rtx) = NULL;
5301 switch (mode)
5302 {
5303 case E_V16SImode:
5304	  gen = gen_avx512f_permvarv16si;
5305 break;
5306 case E_V16SFmode:
5307 gen = gen_avx512f_permvarv16sf;
5308 break;
5309 case E_V8DImode:
5310 gen = gen_avx512f_permvarv8di;
5311 break;
5312 case E_V8DFmode:
5313 gen = gen_avx512f_permvarv8df;
5314 break;
5315 default:
5316 break;
5317 }
5318 if (gen != NULL)
5319 {
5320 emit_insn (gen (target, op0, mask));
5321 return;
5322 }
5323 }
5324
5325 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5326 return;
5327
5328 if (TARGET_AVX2)
5329 {
5330 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5331 {
5332	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5333	     a constant shuffle operand.  With a tiny bit of effort we can
5334	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
5335	     unfortunate but there's no avoiding it.
5336	     Similarly for V16HImode we don't have instructions for variable
5337	     shuffling, while for V32QImode we can, after preparing suitable
5338	     masks, use vpshufb; vpshufb; vpermq; vpor.  */
5339
5340 if (mode == V16HImode)
5341 {
5342 maskmode = mode = V32QImode;
5343 w = 32;
5344 e = 1;
5345 }
5346 else
5347 {
5348 maskmode = mode = V8SImode;
5349 w = 8;
5350 e = 4;
5351 }
5352 t1 = gen_reg_rtx (maskmode);
5353
5354 /* Replicate the low bits of the V4DImode mask into V8SImode:
5355 mask = { A B C D }
5356 t1 = { A A B B C C D D }. */
5357 for (i = 0; i < w / 2; ++i)
5358 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5359 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5360 vt = force_reg (maskmode, vt);
5361 mask = gen_lowpart (maskmode, mask);
5362 if (maskmode == V8SImode)
5363 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5364 else
5365 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5366
5367	  /* Multiply the shuffle indices by two.  */
5368 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5369 OPTAB_DIRECT);
5370
5371	  /* Add one to the odd shuffle indices:
5372 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5373 for (i = 0; i < w / 2; ++i)
5374 {
5375 vec[i * 2] = const0_rtx;
5376 vec[i * 2 + 1] = const1_rtx;
5377 }
5378 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5379 vt = validize_mem (force_const_mem (maskmode, vt));
5380 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5381 OPTAB_DIRECT);
5382
5383 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5384 operands[3] = mask = t1;
5385 target = gen_reg_rtx (mode);
5386 op0 = gen_lowpart (mode, op0);
5387 op1 = gen_lowpart (mode, op1);
5388 }
5389
5390 switch (mode)
5391 {
5392 case E_V8SImode:
5393 /* The VPERMD and VPERMPS instructions already properly ignore
5394 the high bits of the shuffle elements. No need for us to
5395 perform an AND ourselves. */
5396 if (one_operand_shuffle)
5397 {
5398 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5399 if (target != operands[0])
5400 emit_move_insn (operands[0],
5401 gen_lowpart (GET_MODE (operands[0]), target));
5402 }
5403 else
5404 {
5405 t1 = gen_reg_rtx (V8SImode);
5406 t2 = gen_reg_rtx (V8SImode);
5407 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5408 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5409 goto merge_two;
5410 }
5411 return;
5412
5413 case E_V8SFmode:
5414 mask = gen_lowpart (V8SImode, mask);
5415 if (one_operand_shuffle)
5416 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5417 else
5418 {
5419 t1 = gen_reg_rtx (V8SFmode);
5420 t2 = gen_reg_rtx (V8SFmode);
5421 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5422 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5423 goto merge_two;
5424 }
5425 return;
5426
5427 case E_V4SImode:
5428 /* By combining the two 128-bit input vectors into one 256-bit
5429 input vector, we can use VPERMD and VPERMPS for the full
5430 two-operand shuffle. */
5431 t1 = gen_reg_rtx (V8SImode);
5432 t2 = gen_reg_rtx (V8SImode);
5433 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5434 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5435 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5436 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5437 return;
5438
5439 case E_V4SFmode:
5440 t1 = gen_reg_rtx (V8SFmode);
5441 t2 = gen_reg_rtx (V8SImode);
5442 mask = gen_lowpart (V4SImode, mask);
5443 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5444 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5445 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5446 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5447 return;
5448
5449 case E_V32QImode:
5450 t1 = gen_reg_rtx (V32QImode);
5451 t2 = gen_reg_rtx (V32QImode);
5452 t3 = gen_reg_rtx (V32QImode);
5453 vt2 = GEN_INT (-128);
5454 vt = gen_const_vec_duplicate (V32QImode, vt2);
5455 vt = force_reg (V32QImode, vt);
5456 for (i = 0; i < 32; i++)
5457 vec[i] = i < 16 ? vt2 : const0_rtx;
5458 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5459 vt2 = force_reg (V32QImode, vt2);
5460 /* From mask create two adjusted masks, which contain the same
5461 bits as mask in the low 7 bits of each vector element.
5462 The first mask will have the most significant bit clear
5463 if it requests element from the same 128-bit lane
5464 and MSB set if it requests element from the other 128-bit lane.
5465 The second mask will have the opposite values of the MSB,
5466 and additionally will have its 128-bit lanes swapped.
5467 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5468 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5469 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5470	 stands for the other 12 bytes.  */
5471      /* Whether an element comes from the same lane or the other lane is
5472	 bit 4 of the mask, so shift it up by 3 to the MSB position.  */
5473 t5 = gen_reg_rtx (V4DImode);
5474 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5475 GEN_INT (3)));
5476 /* Clear MSB bits from the mask just in case it had them set. */
5477 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5478 /* After this t1 will have MSB set for elements from other lane. */
5479 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5480 /* Clear bits other than MSB. */
5481 emit_insn (gen_andv32qi3 (t1, t1, vt));
5482 /* Or in the lower bits from mask into t3. */
5483 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5484 /* And invert MSB bits in t1, so MSB is set for elements from the same
5485 lane. */
5486 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5487 /* Swap 128-bit lanes in t3. */
5488 t6 = gen_reg_rtx (V4DImode);
5489 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5490 const2_rtx, GEN_INT (3),
5491 const0_rtx, const1_rtx));
5492 /* And or in the lower bits from mask into t1. */
5493 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5494 if (one_operand_shuffle)
5495 {
5496 /* Each of these shuffles will put 0s in places where
5497 element from the other 128-bit lane is needed, otherwise
5498 will shuffle in the requested value. */
5499 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5500 gen_lowpart (V32QImode, t6)));
5501 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5502 /* For t3 the 128-bit lanes are swapped again. */
5503 t7 = gen_reg_rtx (V4DImode);
5504 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5505 const2_rtx, GEN_INT (3),
5506 const0_rtx, const1_rtx));
5507 /* And oring both together leads to the result. */
5508 emit_insn (gen_iorv32qi3 (target, t1,
5509 gen_lowpart (V32QImode, t7)));
5510 if (target != operands[0])
5511 emit_move_insn (operands[0],
5512 gen_lowpart (GET_MODE (operands[0]), target));
5513 return;
5514 }
5515
5516 t4 = gen_reg_rtx (V32QImode);
5517      /* Similar to the one_operand_shuffle code above, just repeated twice
5518	 for each operand.  The merge_two: code below merges the two results
5519	 together.  */
5520 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5521 gen_lowpart (V32QImode, t6)));
5522 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5523 gen_lowpart (V32QImode, t6)));
5524 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5525 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5526 t7 = gen_reg_rtx (V4DImode);
5527 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5528 const2_rtx, GEN_INT (3),
5529 const0_rtx, const1_rtx));
5530 t8 = gen_reg_rtx (V4DImode);
5531 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5532 const2_rtx, GEN_INT (3),
5533 const0_rtx, const1_rtx));
5534 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5535 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5536 t1 = t4;
5537 t2 = t3;
5538 goto merge_two;
5539
5540 default:
5541 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5542 break;
5543 }
5544 }
5545
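  /* The remaining cases are handled as a V16QImode byte permutation:
     either a single XOP vpperm, a single pshufb for a one-operand shuffle,
     or two pshufbs whose results are merged below.  */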
5546 if (TARGET_XOP)
5547 {
5548 /* The XOP VPPERM insn supports three inputs. By ignoring the
5549 one_operand_shuffle special case, we avoid creating another
5550 set of constant vectors in memory. */
5551 one_operand_shuffle = false;
5552
5553 /* mask = mask & {2*w-1, ...} */
5554 vt = GEN_INT (2*w - 1);
5555 }
5556 else
5557 {
5558 /* mask = mask & {w-1, ...} */
5559 vt = GEN_INT (w - 1);
5560 }
5561
5562 vt = gen_const_vec_duplicate (maskmode, vt);
5563 mask = expand_simple_binop (maskmode, AND, mask, vt,
5564 NULL_RTX, 0, OPTAB_DIRECT);
5565
5566 /* For non-QImode operations, convert the word permutation control
5567 into a byte permutation control. */
5568 if (mode != V16QImode)
5569 {
5570 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5571 GEN_INT (exact_log2 (e)),
5572 NULL_RTX, 0, OPTAB_DIRECT);
5573
5574 /* Convert mask to vector of chars. */
5575 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5576
5577 /* Replicate each of the input bytes into byte positions:
5578 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5579 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5580 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5581 for (i = 0; i < 16; ++i)
5582 vec[i] = GEN_INT (i/e * e);
5583 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5584 vt = validize_mem (force_const_mem (V16QImode, vt));
5585 if (TARGET_XOP)
5586 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5587 else
5588 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5589
5590 /* Convert it into the byte positions by doing
5591 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5592 for (i = 0; i < 16; ++i)
5593 vec[i] = GEN_INT (i % e);
5594 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5595 vt = validize_mem (force_const_mem (V16QImode, vt));
5596 emit_insn (gen_addv16qi3 (mask, mask, vt));
5597 }
5598
5599 /* The actual shuffle operations all operate on V16QImode. */
5600 op0 = gen_lowpart (V16QImode, op0);
5601 op1 = gen_lowpart (V16QImode, op1);
5602
5603 if (TARGET_XOP)
5604 {
5605 if (GET_MODE (target) != V16QImode)
5606 target = gen_reg_rtx (V16QImode);
5607 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5608 if (target != operands[0])
5609 emit_move_insn (operands[0],
5610 gen_lowpart (GET_MODE (operands[0]), target));
5611 }
5612 else if (one_operand_shuffle)
5613 {
5614 if (GET_MODE (target) != V16QImode)
5615 target = gen_reg_rtx (V16QImode);
5616 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5617 if (target != operands[0])
5618 emit_move_insn (operands[0],
5619 gen_lowpart (GET_MODE (operands[0]), target));
5620 }
5621 else
5622 {
5623 rtx xops[6];
5624 bool ok;
5625
5626 /* Shuffle the two input vectors independently. */
5627 t1 = gen_reg_rtx (V16QImode);
5628 t2 = gen_reg_rtx (V16QImode);
5629 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5630 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5631
5632 merge_two:
5633 /* Then merge them together. The key is whether any given control
5634 element contained a bit set that indicates the second word. */
5635 mask = operands[3];
5636 vt = GEN_INT (w);
5637 if (maskmode == V2DImode && !TARGET_SSE4_1)
5638 {
5639 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5640 more shuffle to convert the V2DI input mask into a V4SI
5641 input mask. At which point the masking that expand_int_vcond
5642 will work as desired. */
5643 rtx t3 = gen_reg_rtx (V4SImode);
5644 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5645 const0_rtx, const0_rtx,
5646 const2_rtx, const2_rtx));
5647 mask = t3;
5648 maskmode = V4SImode;
5649 e = w = 4;
5650 }
5651
5652 vt = gen_const_vec_duplicate (maskmode, vt);
5653 vt = force_reg (maskmode, vt);
5654 mask = expand_simple_binop (maskmode, AND, mask, vt,
5655 NULL_RTX, 0, OPTAB_DIRECT);
5656
5657 if (GET_MODE (target) != mode)
5658 target = gen_reg_rtx (mode);
5659 xops[0] = target;
5660 xops[1] = gen_lowpart (mode, t2);
5661 xops[2] = gen_lowpart (mode, t1);
5662 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5663 xops[4] = mask;
5664 xops[5] = vt;
5665      ok = ix86_expand_int_vcond (xops);
5666 gcc_assert (ok);
5667 if (target != operands[0])
5668 emit_move_insn (operands[0],
5669 gen_lowpart (GET_MODE (operands[0]), target));
5670 }
5671}
5672
5673/* Extend SRC into next wider integer vector type. UNSIGNED_P is
5674 true if we should do zero extension, else sign extension. */
5675
5676void
5677ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
5678{
5679 machine_mode imode = GET_MODE (src);
5680 rtx ops[3];
5681
5682 switch (imode)
5683 {
5684 case E_V8QImode:
5685 case E_V4QImode:
5686 case E_V2QImode:
5687 case E_V4HImode:
5688 case E_V2HImode:
5689 case E_V2SImode:
5690 break;
5691 default:
5692 gcc_unreachable ();
5693 }
5694
5695 ops[0] = dest;
5696
5697 ops[1] = force_reg (imode, src);
5698
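  /* For zero extension interleave with zero; for sign extension interleave
     with a sign mask, i.e. the result of 0 > src (all ones exactly for the
     negative elements).  */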
5699 if (unsigned_p)
5700 ops[2] = force_reg (imode, CONST0_RTX (imode));
5701 else
5702    ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5703				  ops[1], pc_rtx, pc_rtx);
5704
5705  ix86_split_mmx_punpck (ops, false);
5706}
5707
5708/* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
5709 true if we should do zero extension, else sign extension. HIGH_P is
5710 true if we want the N/2 high elements, else the low elements. */
5711
5712void
5713ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5714{
5715 machine_mode imode = GET_MODE (src);
5716 rtx tmp;
5717
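  /* With SSE4.1 use the pmovsx/pmovzx style extension instructions; for the
     high half, first extract it (for 32/64-byte vectors) or shift it down
     into the low half.  Without SSE4.1 fall back to the interleave
     (punpckl/punpckh) instructions against zero or a sign mask.  */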
5718 if (TARGET_SSE4_1)
5719 {
5720 rtx (*unpack)(rtx, rtx);
5721 rtx (*extract)(rtx, rtx) = NULL;
5722 machine_mode halfmode = BLKmode;
5723
5724 switch (imode)
5725 {
5726 case E_V64QImode:
5727 if (unsigned_p)
5728 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5729 else
5730 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5731 halfmode = V32QImode;
5732 extract
5733 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5734 break;
5735 case E_V32QImode:
5736 if (unsigned_p)
5737 unpack = gen_avx2_zero_extendv16qiv16hi2;
5738 else
5739 unpack = gen_avx2_sign_extendv16qiv16hi2;
5740 halfmode = V16QImode;
5741 extract
5742 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5743 break;
5744 case E_V32HImode:
5745 if (unsigned_p)
5746 unpack = gen_avx512f_zero_extendv16hiv16si2;
5747 else
5748 unpack = gen_avx512f_sign_extendv16hiv16si2;
5749 halfmode = V16HImode;
5750 extract
5751 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5752 break;
5753 case E_V16HImode:
5754 if (unsigned_p)
5755 unpack = gen_avx2_zero_extendv8hiv8si2;
5756 else
5757 unpack = gen_avx2_sign_extendv8hiv8si2;
5758 halfmode = V8HImode;
5759 extract
5760 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5761 break;
5762 case E_V16SImode:
5763 if (unsigned_p)
5764 unpack = gen_avx512f_zero_extendv8siv8di2;
5765 else
5766 unpack = gen_avx512f_sign_extendv8siv8di2;
5767 halfmode = V8SImode;
5768 extract
5769 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5770 break;
5771 case E_V8SImode:
5772 if (unsigned_p)
5773 unpack = gen_avx2_zero_extendv4siv4di2;
5774 else
5775 unpack = gen_avx2_sign_extendv4siv4di2;
5776 halfmode = V4SImode;
5777 extract
5778 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5779 break;
5780 case E_V16QImode:
5781 if (unsigned_p)
5782 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5783 else
5784 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5785 break;
5786 case E_V8HImode:
5787 if (unsigned_p)
5788 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5789 else
5790 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5791 break;
5792 case E_V4SImode:
5793 if (unsigned_p)
5794 unpack = gen_sse4_1_zero_extendv2siv2di2;
5795 else
5796 unpack = gen_sse4_1_sign_extendv2siv2di2;
5797 break;
5798 case E_V8QImode:
5799 if (unsigned_p)
5800 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5801 else
5802 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5803 break;
5804 case E_V4HImode:
5805 if (unsigned_p)
5806 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5807 else
5808 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5809 break;
5810 case E_V4QImode:
5811 if (unsigned_p)
5812 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5813 else
5814 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5815 break;
5816 default:
5817 gcc_unreachable ();
5818 }
5819
5820 if (GET_MODE_SIZE (imode) >= 32)
5821 {
5822 tmp = gen_reg_rtx (halfmode);
5823 emit_insn (extract (tmp, src));
5824 }
5825 else if (high_p)
5826 {
5827 switch (GET_MODE_SIZE (imode))
5828 {
5829 case 16:
5830 /* Shift higher 8 bytes to lower 8 bytes. */
5831 tmp = gen_reg_rtx (V1TImode);
5832 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5833 GEN_INT (64)));
5834 break;
5835 case 8:
5836 /* Shift higher 4 bytes to lower 4 bytes. */
5837 tmp = gen_reg_rtx (V1DImode);
5838 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5839 GEN_INT (32)));
5840 break;
5841 case 4:
5842 /* Shift higher 2 bytes to lower 2 bytes. */
5843 tmp = gen_reg_rtx (V1SImode);
5844 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5845 GEN_INT (16)));
5846 break;
5847 default:
5848 gcc_unreachable ();
5849 }
5850
5851 tmp = gen_lowpart (imode, tmp);
5852 }
5853 else
5854 tmp = src;
5855
5856 emit_insn (unpack (dest, tmp));
5857 }
5858 else
5859 {
5860 rtx (*unpack)(rtx, rtx, rtx);
5861
5862 switch (imode)
5863 {
5864 case E_V16QImode:
5865 if (high_p)
5866 unpack = gen_vec_interleave_highv16qi;
5867 else
5868 unpack = gen_vec_interleave_lowv16qi;
5869 break;
5870 case E_V8HImode:
5871 if (high_p)
5872 unpack = gen_vec_interleave_highv8hi;
5873 else
5874 unpack = gen_vec_interleave_lowv8hi;
5875 break;
5876 case E_V4SImode:
5877 if (high_p)
5878 unpack = gen_vec_interleave_highv4si;
5879 else
5880 unpack = gen_vec_interleave_lowv4si;
5881 break;
5882 case E_V8QImode:
5883 if (high_p)
5884 unpack = gen_mmx_punpckhbw;
5885 else
5886 unpack = gen_mmx_punpcklbw;
5887 break;
5888 case E_V4HImode:
5889 if (high_p)
5890 unpack = gen_mmx_punpckhwd;
5891 else
5892 unpack = gen_mmx_punpcklwd;
5893 break;
5894 case E_V4QImode:
5895 if (high_p)
5896 unpack = gen_mmx_punpckhbw_low;
5897 else
5898 unpack = gen_mmx_punpcklbw_low;
5899 break;
5900 default:
5901 gcc_unreachable ();
5902 }
5903
5904 if (unsigned_p)
5905 tmp = force_reg (imode, CONST0_RTX (imode));
5906 else
5907    tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5908			       src, pc_rtx, pc_rtx);
5909
5910 rtx tmp2 = gen_reg_rtx (imode);
5911 emit_insn (unpack (tmp2, src, tmp));
5912 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5913 }
5914}
5915
5916/* Return true if MEM is a constant pool reference containing a CONST_VECTOR
5917   permutation index; if so, assign the index to PERM.  */
5918bool
5919ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5920{
5921 machine_mode mode = GET_MODE (mem);
5922 int nelt = GET_MODE_NUNITS (mode);
5923
5924 if (!INTEGRAL_MODE_P (mode))
5925 return false;
5926
5927 /* Needs to be constant pool. */
5928 if (!(MEM_P (mem))
5929 || !SYMBOL_REF_P (XEXP (mem, 0))
5930 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5931 return false;
5932
5933 rtx constant = get_pool_constant (XEXP (mem, 0));
5934
5935 if (GET_CODE (constant) != CONST_VECTOR)
5936 return false;
5937
  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" referring to a V2DI constant vector.  */
5941 if (GET_MODE (constant) != mode)
5942 {
      constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5944
5945 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5946 return false;
5947 }
5948
5949 for (int i = 0; i != nelt; i++)
5950 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5951
5952 return true;
5953}
5954
/* Split OPERAND, of mode MODE, into parts and store them in PARTS.  Similar
   to split_double_mode, but also works for floating-point values and
   non-offsettable memories.  For pushes, it returns just stack offsets; the
   values will be saved in the right order.  At most four parts are
   generated.  */
5959
5960static int
5961ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5962{
5963 int size;
5964
5965 if (!TARGET_64BIT)
5966 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5967 else
5968 size = (GET_MODE_SIZE (mode) + 4) / 8;
5969
5970 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5971 gcc_assert (size >= 2 && size <= 4);
5972
5973 /* Optimize constant pool reference to immediates. This is used by fp
5974 moves, that force all constants to memory to allow combining. */
5975 if (MEM_P (operand) && MEM_READONLY_P (operand))
5976 operand = avoid_constant_pool_reference (operand);
5977
5978 if (MEM_P (operand) && !offsettable_memref_p (operand))
5979 {
      /* The only non-offsettable memories we handle are pushes.  */
5981 int ok = push_operand (operand, VOIDmode);
5982
5983 gcc_assert (ok);
5984
5985 operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
5987 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5988 return size;
5989 }
5990
5991 if (GET_CODE (operand) == CONST_VECTOR)
5992 {
5993 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5994 /* Caution: if we looked through a constant pool memory above,
5995 the operand may actually have a different mode now. That's
5996 ok, since we want to pun this all the way back to an integer. */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5998 gcc_assert (operand != NULL);
5999 mode = imode;
6000 }
6001
6002 if (!TARGET_64BIT)
6003 {
6004 if (mode == DImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
6006 else
6007 {
6008 int i;
6009
6010 if (REG_P (operand))
6011 {
6012 gcc_assert (reload_completed);
6013 for (i = 0; i < size; i++)
6014 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
6015 }
6016 else if (offsettable_memref_p (operand))
6017 {
6018 operand = adjust_address (operand, SImode, 0);
6019 parts[0] = operand;
6020 for (i = 1; i < size; i++)
6021 parts[i] = adjust_address (operand, SImode, 4 * i);
6022 }
6023 else if (CONST_DOUBLE_P (operand))
6024 {
6025 const REAL_VALUE_TYPE *r;
6026 long l[4];
6027
6028 r = CONST_DOUBLE_REAL_VALUE (operand);
6029 switch (mode)
6030 {
6031 case E_TFmode:
6032 real_to_target (l, r, mode);
6033 parts[3] = gen_int_mode (l[3], SImode);
6034 parts[2] = gen_int_mode (l[2], SImode);
6035 break;
6036 case E_XFmode:
6037 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
6038 long double may not be 80-bit. */
6039 real_to_target (l, r, mode);
6040 parts[2] = gen_int_mode (l[2], SImode);
6041 break;
6042 case E_DFmode:
6043 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
6044 break;
6045 default:
6046 gcc_unreachable ();
6047 }
6048 parts[1] = gen_int_mode (l[1], SImode);
6049 parts[0] = gen_int_mode (l[0], SImode);
6050 }
6051 else
6052 gcc_unreachable ();
6053 }
6054 }
6055 else
6056 {
6057 if (mode == TImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
6059 if (mode == XFmode || mode == TFmode)
6060 {
6061 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
6062 if (REG_P (operand))
6063 {
6064 gcc_assert (reload_completed);
6065 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
6066 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
6067 }
6068 else if (offsettable_memref_p (operand))
6069 {
6070 operand = adjust_address (operand, DImode, 0);
6071 parts[0] = operand;
6072 parts[1] = adjust_address (operand, upper_mode, 8);
6073 }
6074 else if (CONST_DOUBLE_P (operand))
6075 {
6076 long l[4];
6077
6078 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
6079
6080 /* real_to_target puts 32-bit pieces in each long. */
6081 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
6082 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
6083 << 32), DImode);
6084
6085 if (upper_mode == SImode)
6086 parts[1] = gen_int_mode (l[2], SImode);
6087 else
6088 parts[1]
6089 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
6090 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
6091 << 32), DImode);
6092 }
6093 else
6094 gcc_unreachable ();
6095 }
6096 }
6097
6098 return size;
6099}
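
/* For illustration (not an exhaustive list of cases): on a 32-bit target a
   DFmode CONST_DOUBLE is returned as two SImode immediates, an XFmode hard
   register as three consecutive SImode registers, and a TFmode offsettable
   MEM as four SImode MEMs at offsets 0, 4, 8 and 12.  */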
6100
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   All required insns are emitted here; no normal moves are left for the
   caller.  Entries 2 onwards of OPERANDS are used as scratch storage for
   the destination parts and entries 6 onwards for the source parts, in
   the correct order.  */
6105
6106void
6107ix86_split_long_move (rtx operands[])
6108{
6109 rtx part[2][4];
6110 int nparts, i, j;
6111 int push = 0;
6112 int collisions = 0;
6113 machine_mode mode = GET_MODE (operands[0]);
6114 bool collisionparts[4];
6115
  /* The DFmode expanders may ask us to move a double.
     For a 64-bit target this is a single move.  By hiding that fact here
     we simplify the i386.md splitters.  */
6119 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
6120 {
6121 /* Optimize constant pool reference to immediates. This is used by
6122 fp moves, that force all constants to memory to allow combining. */
6123
6124 if (MEM_P (operands[1])
6125 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
6126 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
6127 operands[1] = get_pool_constant (XEXP (operands[1], 0));
6128 if (push_operand (operands[0], VOIDmode))
6129 {
6130 operands[0] = copy_rtx (operands[0]);
          PUT_MODE (operands[0], word_mode);
6132 }
6133 else
6134 operands[0] = gen_lowpart (DImode, operands[0]);
6135 operands[1] = gen_lowpart (DImode, operands[1]);
6136 emit_move_insn (operands[0], operands[1]);
6137 return;
6138 }
6139
  /* The only non-offsettable memory we handle is a push.  */
6141 if (push_operand (operands[0], VOIDmode))
6142 push = 1;
6143 else
6144 gcc_assert (!MEM_P (operands[0])
6145 || offsettable_memref_p (operands[0]));
6146
  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
6149
6150 /* When emitting push, take care for source operands on the stack. */
6151 if (push && MEM_P (operands[1])
6152 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
6153 {
6154 rtx src_base = XEXP (part[1][nparts - 1], 0);
6155
6156 /* Compensate for the stack decrement by 4. */
6157 if (!TARGET_64BIT && nparts == 3
6158 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
6159 src_base = plus_constant (Pmode, src_base, 4);
6160
6161 /* src_base refers to the stack pointer and is
6162 automatically decreased by emitted push. */
6163 for (i = 0; i < nparts; i++)
6164 part[1][i] = change_address (part[1][i],
6165 GET_MODE (part[1][i]), src_base);
6166 }
6167
  /* We need to do the copy in the right order in case an address register
     of the source overlaps the destination.  */
6170 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
6171 {
6172 rtx tmp;
6173
6174 for (i = 0; i < nparts; i++)
6175 {
6176 collisionparts[i]
6177 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
6178 if (collisionparts[i])
6179 collisions++;
6180 }
6181
6182 /* Collision in the middle part can be handled by reordering. */
6183 if (collisions == 1 && nparts == 3 && collisionparts [1])
6184 {
          std::swap (part[0][1], part[0][2]);
          std::swap (part[1][1], part[1][2]);
6187 }
6188 else if (collisions == 1
6189 && nparts == 4
6190 && (collisionparts [1] || collisionparts [2]))
6191 {
6192 if (collisionparts [1])
6193 {
            std::swap (part[0][1], part[0][2]);
            std::swap (part[1][1], part[1][2]);
6196 }
6197 else
6198 {
            std::swap (part[0][2], part[0][3]);
            std::swap (part[1][2], part[1][3]);
6201 }
6202 }
6203
6204 /* If there are more collisions, we can't handle it by reordering.
6205 Do an lea to the last part and use only one colliding move. */
6206 else if (collisions > 1)
6207 {
6208 rtx base, addr;
6209
6210 collisions = 1;
6211
6212 base = part[0][nparts - 1];
6213
6214 /* Handle the case when the last part isn't valid for lea.
6215 Happens in 64-bit mode storing the 12-byte XFmode. */
6216 if (GET_MODE (base) != Pmode)
6217 base = gen_rtx_REG (Pmode, REGNO (base));
6218
6219 addr = XEXP (part[1][0], 0);
6220 if (TARGET_TLS_DIRECT_SEG_REFS)
6221 {
6222 struct ix86_address parts;
6223 int ok = ix86_decompose_address (addr, &parts);
6224 gcc_assert (ok);
6225 /* It is not valid to use %gs: or %fs: in lea. */
6226 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
6227 }
6228 emit_insn (gen_rtx_SET (base, addr));
6229 part[1][0] = replace_equiv_address (part[1][0], base);
6230 for (i = 1; i < nparts; i++)
6231 {
6232 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
6233 part[1][i] = replace_equiv_address (part[1][i], tmp);
6234 }
6235 }
6236 }
6237
6238 if (push)
6239 {
6240 if (!TARGET_64BIT)
6241 {
6242 if (nparts == 3)
6243 {
6244 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
6245 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
6246 emit_move_insn (part[0][2], part[1][2]);
6247 }
6248 else if (nparts == 4)
6249 {
6250 emit_move_insn (part[0][3], part[1][3]);
6251 emit_move_insn (part[0][2], part[1][2]);
6252 }
6253 }
6254 else
6255 {
          /* In 64-bit mode we don't have a 32-bit push available.  If the
             operand is a register that is fine - we just use the larger
             counterpart.  We also retype memory - this comes from an attempt
             to avoid a REX prefix when moving the second half of a TFmode
             value.  */
6260 if (GET_MODE (part[1][1]) == SImode)
6261 {
6262 switch (GET_CODE (part[1][1]))
6263 {
6264 case MEM:
6265 part[1][1] = adjust_address (part[1][1], DImode, 0);
6266 break;
6267
6268 case REG:
6269 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
6270 break;
6271
6272 default:
6273 gcc_unreachable ();
6274 }
6275
6276 if (GET_MODE (part[1][0]) == SImode)
6277 part[1][0] = part[1][1];
6278 }
6279 }
6280 emit_move_insn (part[0][1], part[1][1]);
6281 emit_move_insn (part[0][0], part[1][0]);
6282 return;
6283 }
6284
6285 /* Choose correct order to not overwrite the source before it is copied. */
6286 if ((REG_P (part[0][0])
6287 && REG_P (part[1][1])
6288 && (REGNO (part[0][0]) == REGNO (part[1][1])
6289 || (nparts == 3
6290 && REGNO (part[0][0]) == REGNO (part[1][2]))
6291 || (nparts == 4
6292 && REGNO (part[0][0]) == REGNO (part[1][3]))))
6293 || (collisions > 0
6294 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
6295 {
6296 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
6297 {
6298 operands[2 + i] = part[0][j];
6299 operands[6 + i] = part[1][j];
6300 }
6301 }
6302 else
6303 {
6304 for (i = 0; i < nparts; i++)
6305 {
6306 operands[2 + i] = part[0][i];
6307 operands[6 + i] = part[1][i];
6308 }
6309 }
6310
6311 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6312 if (optimize_insn_for_size_p ())
6313 {
6314 for (j = 0; j < nparts - 1; j++)
6315 if (CONST_INT_P (operands[6 + j])
6316 && operands[6 + j] != const0_rtx
6317 && REG_P (operands[2 + j]))
6318 for (i = j; i < nparts - 1; i++)
6319 if (CONST_INT_P (operands[7 + i])
6320 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6321 operands[7 + i] = operands[2 + j];
6322 }
6323
6324 for (i = 0; i < nparts; i++)
6325 emit_move_insn (operands[2 + i], operands[6 + i]);
6326
6327 return;
6328}
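
/* A concrete example of the ordering logic above: for a 32-bit DImode copy
   whose destination registers overlap the source address, e.g. loading
   ax:dx from a MEM addressed by ax, the parts are emitted in reverse order
   (or, when several parts collide, through an lea of the address) so that
   the address register is not clobbered before its last use.  */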
6329
6330/* Helper function of ix86_split_ashl used to generate an SImode/DImode
6331 left shift by a constant, either using a single shift or
6332 a sequence of add instructions. */
6333
6334static void
6335ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6336{
6337 if (count == 1
6338 || (count * ix86_cost->add <= ix86_cost->shift_const
6339 && !optimize_insn_for_size_p ()))
6340 {
6341 while (count-- > 0)
6342 emit_insn (gen_add2_insn (operand, operand));
6343 }
6344 else
6345 {
6346 rtx (*insn)(rtx, rtx, rtx);
6347
6348 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6349 emit_insn (insn (operand, operand, GEN_INT (count)));
6350 }
6351}
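
/* Illustration of the trade-off above: a count of 1 is always a single add
   of the operand to itself; a count of 2 would likewise become two adds on
   a hypothetical tuning where two adds cost no more than one constant
   shift (and we are not optimizing for size), and otherwise a single shift
   insn is used.  */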
6352
6353void
6354ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6355{
6356 rtx (*gen_ashl3)(rtx, rtx, rtx);
6357 rtx (*gen_shld)(rtx, rtx, rtx);
6358 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6359 machine_mode half_mode;
6360
6361 rtx low[2], high[2];
6362 int count;
6363
6364 if (CONST_INT_P (operands[2]))
6365 {
      split_double_mode (mode, operands, 2, low, high);
6367 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6368
6369 if (count >= half_width)
6370 {
6371 emit_move_insn (high[0], low[1]);
          ix86_expand_clear (low[0]);
6373
6374 if (count > half_width)
            ix86_expand_ashl_const (high[0], count - half_width, mode);
6376 }
6377 else if (count == 1)
6378 {
6379 if (!rtx_equal_p (operands[0], operands[1]))
6380 emit_move_insn (operands[0], operands[1]);
6381 rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
6382 rtx x4 = gen_rtx_LTU (mode, x3, const0_rtx);
6383 half_mode = mode == DImode ? SImode : DImode;
          emit_insn (gen_add3_cc_overflow_1 (half_mode, low[0],
                                             low[0], low[0]));
          emit_insn (gen_add3_carry (half_mode, high[0], high[0], high[0],
                                     x3, x4));
6388 }
6389 else
6390 {
6391 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6392
6393 if (!rtx_equal_p (operands[0], operands[1]))
6394 emit_move_insn (operands[0], operands[1]);
6395
6396 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
          ix86_expand_ashl_const (low[0], count, mode);
6398 }
6399 return;
6400 }
6401
  split_double_mode (mode, operands, 1, low, high);
6403 half_mode = mode == DImode ? SImode : DImode;
6404
6405 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6406
6407 if (operands[1] == const1_rtx)
6408 {
      /* Assuming we've chosen QImode-capable registers, then 1 << N
         can be done with two 32/64-bit shifts, no branches, no cmoves.  */
6411 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6412 {
6413 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6414
          ix86_expand_clear (low[0]);
          ix86_expand_clear (high[0]);
6417 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6418
6419 d = gen_lowpart (QImode, low[0]);
6420 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6421 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6422 emit_insn (gen_rtx_SET (d, s));
6423
6424 d = gen_lowpart (QImode, high[0]);
6425 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6426 s = gen_rtx_NE (QImode, flags, const0_rtx);
6427 emit_insn (gen_rtx_SET (d, s));
6428 }
6429
6430 /* Otherwise, we can get the same results by manually performing
6431 a bit extract operation on bit 5/6, and then performing the two
6432 shifts. The two methods of getting 0/1 into low/high are exactly
6433 the same size. Avoiding the shift in the bit extract case helps
6434 pentium4 a bit; no one else seems to care much either way. */
6435 else
6436 {
6437 rtx (*gen_lshr3)(rtx, rtx, rtx);
6438 rtx (*gen_and3)(rtx, rtx, rtx);
6439 rtx (*gen_xor3)(rtx, rtx, rtx);
6440 HOST_WIDE_INT bits;
6441 rtx x;
6442
6443 if (mode == DImode)
6444 {
6445 gen_lshr3 = gen_lshrsi3;
6446 gen_and3 = gen_andsi3;
6447 gen_xor3 = gen_xorsi3;
6448 bits = 5;
6449 }
6450 else
6451 {
6452 gen_lshr3 = gen_lshrdi3;
6453 gen_and3 = gen_anddi3;
6454 gen_xor3 = gen_xordi3;
6455 bits = 6;
6456 }
6457
6458 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6459 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6460 else
6461 x = gen_lowpart (half_mode, operands[2]);
6462 emit_insn (gen_rtx_SET (high[0], x));
6463
6464 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6465 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6466 emit_move_insn (low[0], high[0]);
6467 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6468 }
6469
6470 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6471 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6472 return;
6473 }
6474
6475 if (operands[1] == constm1_rtx)
6476 {
6477 /* For -1 << N, we can avoid the shld instruction, because we
6478 know that we're shifting 0...31/63 ones into a -1. */
6479 emit_move_insn (low[0], constm1_rtx);
6480 if (optimize_insn_for_size_p ())
6481 emit_move_insn (high[0], low[0]);
6482 else
6483 emit_move_insn (high[0], constm1_rtx);
6484 }
6485 else
6486 {
6487 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6488
6489 if (!rtx_equal_p (operands[0], operands[1]))
6490 emit_move_insn (operands[0], operands[1]);
6491
      split_double_mode (mode, operands, 1, low, high);
6493 emit_insn (gen_shld (high[0], low[0], operands[2]));
6494 }
6495
6496 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6497
6498 if (TARGET_CMOVE && scratch)
6499 {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
                 (half_mode, high[0], low[0], operands[2], scratch));
6503 }
6504 else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
6506}
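
/* Roughly, for a variable count the sequence emitted above computes, on a
   32-bit target with DImode operands (a sketch that ignores the flags
   register and the cmove-versus-branch difference in the final adjustment,
   and is only valid as C for (count & 31) != 0):

     high = (op1.hi << (count & 31)) | (op1.lo >> (32 - (count & 31)));
     low  = op1.lo << (count & 31);
     if (count & 32)
       {
         high = low;
         low = 0;
       }

   with special-cased expansions when op1 is 1 or -1.  */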
6507
6508void
6509ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6510{
6511 rtx (*gen_ashr3)(rtx, rtx, rtx)
6512 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6513 rtx (*gen_shrd)(rtx, rtx, rtx);
6514 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6515
6516 rtx low[2], high[2];
6517 int count;
6518
6519 if (CONST_INT_P (operands[2]))
6520 {
      split_double_mode (mode, operands, 2, low, high);
6522 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6523
6524 if (count == GET_MODE_BITSIZE (mode) - 1)
6525 {
6526 emit_move_insn (high[0], high[1]);
6527 emit_insn (gen_ashr3 (high[0], high[0],
6528 GEN_INT (half_width - 1)));
6529 emit_move_insn (low[0], high[0]);
6530
6531 }
6532 else if (count >= half_width)
6533 {
6534 emit_move_insn (low[0], high[1]);
6535 emit_move_insn (high[0], low[0]);
6536 emit_insn (gen_ashr3 (high[0], high[0],
6537 GEN_INT (half_width - 1)));
6538
6539 if (count > half_width)
6540 emit_insn (gen_ashr3 (low[0], low[0],
6541 GEN_INT (count - half_width)));
6542 }
6543 else if (count == 1
6544 && (TARGET_USE_RCR || optimize_size > 1))
6545 {
6546 if (!rtx_equal_p (operands[0], operands[1]))
6547 emit_move_insn (operands[0], operands[1]);
6548 if (mode == DImode)
6549 {
6550 emit_insn (gen_ashrsi3_carry (high[0], high[0]));
6551 emit_insn (gen_rcrsi2 (low[0], low[0]));
6552 }
6553 else
6554 {
6555 emit_insn (gen_ashrdi3_carry (high[0], high[0]));
6556 emit_insn (gen_rcrdi2 (low[0], low[0]));
6557 }
6558 }
6559 else
6560 {
6561 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6562
6563 if (!rtx_equal_p (operands[0], operands[1]))
6564 emit_move_insn (operands[0], operands[1]);
6565
6566 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6567 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6568 }
6569 }
6570 else
6571 {
6572 machine_mode half_mode;
6573
6574 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6575
6576 if (!rtx_equal_p (operands[0], operands[1]))
6577 emit_move_insn (operands[0], operands[1]);
6578
      split_double_mode (mode, operands, 1, low, high);
6580 half_mode = mode == DImode ? SImode : DImode;
6581
6582 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6583 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6584
6585 if (TARGET_CMOVE && scratch)
6586 {
6587 emit_move_insn (scratch, high[0]);
6588 emit_insn (gen_ashr3 (scratch, scratch,
6589 GEN_INT (half_width - 1)));
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
6592 }
6593 else
        emit_insn (gen_x86_shift_adj_3
                   (half_mode, low[0], high[0], operands[2]));
6596 }
6597}
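
/* The variable-count case above is the arithmetic-shift analogue of the
   sketch after ix86_split_ashl: shrd/sar on the two halves using count & 31,
   followed by an adjustment that, when bit 5 of the count (bit 6 for
   TImode) is set, moves the high half into the low half and fills the high
   half with copies of the sign bit.  */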
6598
6599void
6600ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6601{
6602 rtx (*gen_lshr3)(rtx, rtx, rtx)
6603 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6604 rtx (*gen_shrd)(rtx, rtx, rtx);
6605 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6606
6607 rtx low[2], high[2];
6608 int count;
6609
6610 if (CONST_INT_P (operands[2]))
6611 {
      split_double_mode (mode, operands, 2, low, high);
6613 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6614
6615 if (count >= half_width)
6616 {
6617 emit_move_insn (low[0], high[1]);
          ix86_expand_clear (high[0]);
6619
6620 if (count > half_width)
6621 emit_insn (gen_lshr3 (low[0], low[0],
6622 GEN_INT (count - half_width)));
6623 }
6624 else if (count == 1
6625 && (TARGET_USE_RCR || optimize_size > 1))
6626 {
6627 if (!rtx_equal_p (operands[0], operands[1]))
6628 emit_move_insn (operands[0], operands[1]);
6629 if (mode == DImode)
6630 {
6631 emit_insn (gen_lshrsi3_carry (high[0], high[0]));
6632 emit_insn (gen_rcrsi2 (low[0], low[0]));
6633 }
6634 else
6635 {
6636 emit_insn (gen_lshrdi3_carry (high[0], high[0]));
6637 emit_insn (gen_rcrdi2 (low[0], low[0]));
6638 }
6639 }
6640 else
6641 {
6642 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6643
6644 if (!rtx_equal_p (operands[0], operands[1]))
6645 emit_move_insn (operands[0], operands[1]);
6646
6647 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6648 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6649 }
6650 }
6651 else
6652 {
6653 machine_mode half_mode;
6654
6655 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6656
6657 if (!rtx_equal_p (operands[0], operands[1]))
6658 emit_move_insn (operands[0], operands[1]);
6659
      split_double_mode (mode, operands, 1, low, high);
6661 half_mode = mode == DImode ? SImode : DImode;
6662
6663 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6664 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6665
6666 if (TARGET_CMOVE && scratch)
6667 {
          ix86_expand_clear (scratch);
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
6671 }
6672 else
        emit_insn (gen_x86_shift_adj_2
                   (half_mode, low[0], high[0], operands[2]));
6675 }
6676}
6677
6678/* Expand move of V1TI mode register X to a new TI mode register. */
6679static rtx
6680ix86_expand_v1ti_to_ti (rtx x)
6681{
6682 rtx result = gen_reg_rtx (TImode);
6683 if (TARGET_SSE2)
6684 {
6685 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
6686 rtx lo = gen_lowpart (DImode, result);
6687 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6688 rtx hi = gen_highpart (DImode, result);
6689 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6690 }
6691 else
6692 emit_move_insn (result, gen_lowpart (TImode, x));
6693 return result;
6694}
6695
6696/* Expand move of TI mode register X to a new V1TI mode register. */
6697static rtx
6698ix86_expand_ti_to_v1ti (rtx x)
6699{
6700 if (TARGET_SSE2)
6701 {
6702 rtx lo = gen_lowpart (DImode, x);
6703 rtx hi = gen_highpart (DImode, x);
6704 rtx tmp = gen_reg_rtx (V2DImode);
6705 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6706 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
6707 }
6708
6709 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
6710}
6711
6712/* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6713void
6714ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6715{
6716 rtx op1 = force_reg (V1TImode, operands[1]);
6717
6718 if (!CONST_INT_P (operands[2]))
6719 {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6721 rtx tmp2 = gen_reg_rtx (TImode);
6722 rtx (*shift) (rtx, rtx, rtx)
6723 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6724 emit_insn (shift (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6726 emit_move_insn (operands[0], tmp3);
6727 return;
6728 }
6729
6730 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6731
6732 if (bits == 0)
6733 {
6734 emit_move_insn (operands[0], op1);
6735 return;
6736 }
6737
6738 if ((bits & 7) == 0)
6739 {
6740 rtx tmp = gen_reg_rtx (V1TImode);
6741 if (code == ASHIFT)
6742 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6743 else
6744 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6745 emit_move_insn (operands[0], tmp);
6746 return;
6747 }
6748
6749 rtx tmp1 = gen_reg_rtx (V1TImode);
6750 if (code == ASHIFT)
6751 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6752 else
6753 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6754
6755 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6756 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6757
6758 /* tmp3 will be the V2DImode result. */
6759 rtx tmp3 = gen_reg_rtx (V2DImode);
6760
6761 if (bits > 64)
6762 {
6763 if (code == ASHIFT)
6764 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6765 else
6766 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6767 }
6768 else
6769 {
6770 /* tmp4 is operands[1], in V2DImode. */
6771 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6772
6773 rtx tmp5 = gen_reg_rtx (V2DImode);
6774 if (code == ASHIFT)
6775 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6776 else
6777 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6778
6779 rtx tmp6 = gen_reg_rtx (V2DImode);
6780 if (code == ASHIFT)
6781 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6782 else
6783 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6784
6785 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6786 }
6787
6788 /* Convert the result back to V1TImode and store in operands[0]. */
6789 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6790 emit_move_insn (operands[0], tmp7);
6791}
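
/* A worked example of the general case above: for a logical right shift of
   a V1TImode value by 5, tmp5 holds each 64-bit lane shifted right by 5,
   tmp6 holds the value first shifted right by 64 bits and then left by 59
   within each lane (i.e. the five bits that cross the lane boundary moved
   into place), and their IOR is the 128-bit result.  */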
6792
6793/* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6794void
6795ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6796{
6797 rtx op1 = force_reg (V1TImode, operands[1]);
6798
6799 if (!CONST_INT_P (operands[2]))
6800 {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6802 rtx tmp2 = gen_reg_rtx (TImode);
6803 rtx (*rotate) (rtx, rtx, rtx)
6804 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6805 emit_insn (rotate (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6807 emit_move_insn (operands[0], tmp3);
6808 return;
6809 }
6810
6811 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6812
6813 if (bits == 0)
6814 {
6815 emit_move_insn (operands[0], op1);
6816 return;
6817 }
6818
6819 if (code == ROTATERT)
6820 bits = 128 - bits;
6821
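  /* For rotates by a multiple of 32 bits a single pshufd suffices: viewing
     the operand as four 32-bit words w0..w3 (w0 least significant),
     selector 0x93 produces (w3,w0,w1,w2), i.e. a rotate left by 32;
     0x4e produces (w2,w3,w0,w1), a rotate by 64; and 0x39 produces
     (w1,w2,w3,w0), a rotate left by 96.  */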
6822 if ((bits & 31) == 0)
6823 {
6824 rtx tmp2 = gen_reg_rtx (V4SImode);
6825 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6826 if (bits == 32)
6827 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6828 else if (bits == 64)
6829 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6830 else
6831 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
6832 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6833 return;
6834 }
6835
6836 if ((bits & 7) == 0)
6837 {
6838 rtx tmp1 = gen_reg_rtx (V1TImode);
6839 rtx tmp2 = gen_reg_rtx (V1TImode);
6840 rtx tmp3 = gen_reg_rtx (V1TImode);
6841
6842 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6843 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6844 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6845 emit_move_insn (operands[0], tmp3);
6846 return;
6847 }
6848
6849 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6850
6851 rtx lobits;
6852 rtx hibits;
6853
6854 switch (bits >> 5)
6855 {
6856 case 0:
6857 lobits = op1_v4si;
6858 hibits = gen_reg_rtx (V4SImode);
6859 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6860 break;
6861
6862 case 1:
6863 lobits = gen_reg_rtx (V4SImode);
6864 hibits = gen_reg_rtx (V4SImode);
6865 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6866 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6867 break;
6868
6869 case 2:
6870 lobits = gen_reg_rtx (V4SImode);
6871 hibits = gen_reg_rtx (V4SImode);
6872 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6873 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6874 break;
6875
6876 default:
6877 lobits = gen_reg_rtx (V4SImode);
6878 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6879 hibits = op1_v4si;
6880 break;
6881 }
6882
6883 rtx tmp1 = gen_reg_rtx (V4SImode);
6884 rtx tmp2 = gen_reg_rtx (V4SImode);
6885 rtx tmp3 = gen_reg_rtx (V4SImode);
6886
6887 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6888 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6889 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
6890
6891 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6892}
6893
6894/* Expand V1TI mode ashiftrt by constant. */
6895void
6896ix86_expand_v1ti_ashiftrt (rtx operands[])
6897{
6898 rtx op1 = force_reg (V1TImode, operands[1]);
6899
6900 if (!CONST_INT_P (operands[2]))
6901 {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6903 rtx tmp2 = gen_reg_rtx (TImode);
6904 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6906 emit_move_insn (operands[0], tmp3);
6907 return;
6908 }
6909
6910 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6911
6912 if (bits == 0)
6913 {
6914 emit_move_insn (operands[0], op1);
6915 return;
6916 }
6917
6918 if (bits == 127)
6919 {
6920 /* Two operations. */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6922 rtx tmp2 = gen_reg_rtx (V4SImode);
6923 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6924
6925 rtx tmp3 = gen_reg_rtx (V4SImode);
6926 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6927
6928 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6929 return;
6930 }
6931
6932 if (bits == 64)
6933 {
6934 /* Three operations. */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6936 rtx tmp2 = gen_reg_rtx (V4SImode);
6937 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6938
6939 rtx tmp3 = gen_reg_rtx (V4SImode);
6940 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6941
6942 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6943 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6944 rtx tmp6 = gen_reg_rtx (V2DImode);
6945 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6946
6947 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6948 return;
6949 }
6950
6951 if (bits == 96)
6952 {
6953 /* Three operations. */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6955 rtx tmp2 = gen_reg_rtx (V4SImode);
6956 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6957
6958 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6959 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6960 rtx tmp5 = gen_reg_rtx (V2DImode);
6961 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6962
      rtx tmp6 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp5));
6964 rtx tmp7 = gen_reg_rtx (V4SImode);
6965 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6966
6967 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6968 return;
6969 }
6970
6971 if (bits >= 111)
6972 {
6973 /* Three operations. */
6974 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6975 rtx tmp2 = gen_reg_rtx (V4SImode);
6976 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6977
6978 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6979 rtx tmp4 = gen_reg_rtx (V8HImode);
6980 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6981
6982 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6983 rtx tmp6 = gen_reg_rtx (V4SImode);
6984 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6985
6986 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6987 return;
6988 }
6989
6990 if (TARGET_AVX2 || TARGET_SSE4_1)
6991 {
6992 /* Three operations. */
6993 if (bits == 32)
6994 {
6995 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6996 rtx tmp2 = gen_reg_rtx (V4SImode);
6997 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6998
6999 rtx tmp3 = gen_reg_rtx (V1TImode);
7000 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
7001
7002 if (TARGET_AVX2)
7003 {
7004 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
7005 rtx tmp5 = gen_reg_rtx (V4SImode);
7006 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
7007 GEN_INT (7)));
7008
7009 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
7010 }
7011 else
7012 {
7013 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7014 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7015 rtx tmp6 = gen_reg_rtx (V8HImode);
7016 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
7017 GEN_INT (0x3f)));
7018
7019 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
7020 }
7021 return;
7022 }
7023
7024 /* Three operations. */
7025 if (bits == 8 || bits == 16 || bits == 24)
7026 {
7027 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7028 rtx tmp2 = gen_reg_rtx (V4SImode);
7029 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7030
7031 rtx tmp3 = gen_reg_rtx (V1TImode);
7032 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
7033
7034 if (TARGET_AVX2)
7035 {
7036 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
7037 rtx tmp5 = gen_reg_rtx (V4SImode);
7038 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
7039 GEN_INT (7)));
7040
7041 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
7042 }
7043 else
7044 {
7045 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7046 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7047 rtx tmp6 = gen_reg_rtx (V8HImode);
7048 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
7049 GEN_INT (0x3f)));
7050
7051 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
7052 }
7053 return;
7054 }
7055 }
7056
7057 if (bits > 96)
7058 {
7059 /* Four operations. */
7060 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7061 rtx tmp2 = gen_reg_rtx (V4SImode);
7062 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
7063
7064 rtx tmp3 = gen_reg_rtx (V4SImode);
7065 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
7066
7067 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
7068 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7069 rtx tmp6 = gen_reg_rtx (V2DImode);
7070 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
7071
7072 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
7073 rtx tmp8 = gen_reg_rtx (V4SImode);
7074 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
7075
7076 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
7077 return;
7078 }
7079
7080 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
7081 {
7082 /* Four operations. */
7083 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7084 rtx tmp2 = gen_reg_rtx (V4SImode);
7085 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7086
7087 rtx tmp3 = gen_reg_rtx (V4SImode);
7088 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7089
7090 rtx tmp4 = gen_reg_rtx (V1TImode);
7091 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
7092
7093 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7094 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
7095 rtx tmp7 = gen_reg_rtx (V8HImode);
7096 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
7097 GEN_INT (bits == 48 ? 0x1f : 0x07)));
7098
7099 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
7100 return;
7101 }
7102
7103 if ((bits & 7) == 0)
7104 {
7105 /* Five operations. */
7106 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7107 rtx tmp2 = gen_reg_rtx (V4SImode);
7108 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7109
7110 rtx tmp3 = gen_reg_rtx (V4SImode);
7111 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7112
7113 rtx tmp4 = gen_reg_rtx (V1TImode);
7114 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
7115
7116 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7117 rtx tmp6 = gen_reg_rtx (V1TImode);
7118 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
7119
7120 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7121 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
7122 rtx tmp9 = gen_reg_rtx (V2DImode);
7123 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
7124
7125 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
7126 return;
7127 }
7128
7129 if (TARGET_AVX2 && bits < 32)
7130 {
7131 /* Six operations. */
7132 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7133 rtx tmp2 = gen_reg_rtx (V4SImode);
7134 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7135
7136 rtx tmp3 = gen_reg_rtx (V1TImode);
7137 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
7138
7139 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7140 rtx tmp5 = gen_reg_rtx (V2DImode);
7141 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7142
7143 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7144 rtx tmp7 = gen_reg_rtx (V2DImode);
7145 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
7146
7147 rtx tmp8 = gen_reg_rtx (V2DImode);
7148 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7149
7150 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
7151 rtx tmp10 = gen_reg_rtx (V4SImode);
7152 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
7153
7154 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
7155 return;
7156 }
7157
7158 if (TARGET_SSE4_1 && bits < 15)
7159 {
7160 /* Six operations. */
7161 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7162 rtx tmp2 = gen_reg_rtx (V4SImode);
7163 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7164
7165 rtx tmp3 = gen_reg_rtx (V1TImode);
7166 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
7167
7168 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7169 rtx tmp5 = gen_reg_rtx (V2DImode);
7170 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7171
7172 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7173 rtx tmp7 = gen_reg_rtx (V2DImode);
7174 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
7175
7176 rtx tmp8 = gen_reg_rtx (V2DImode);
7177 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7178
7179 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7180 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
7181 rtx tmp11 = gen_reg_rtx (V8HImode);
7182 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
7183
7184 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
7185 return;
7186 }
7187
7188 if (bits == 1)
7189 {
7190 /* Eight operations. */
7191 rtx tmp1 = gen_reg_rtx (V1TImode);
7192 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
7193
7194 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7195 rtx tmp3 = gen_reg_rtx (V2DImode);
7196 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
7197
7198 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7199 rtx tmp5 = gen_reg_rtx (V2DImode);
7200 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
7201
7202 rtx tmp6 = gen_reg_rtx (V2DImode);
7203 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
7204
7205 rtx tmp7 = gen_reg_rtx (V2DImode);
7206 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
7207
7208 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
7209 rtx tmp9 = gen_reg_rtx (V4SImode);
7210 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
7211
7212 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
7213 rtx tmp11 = gen_reg_rtx (V2DImode);
7214 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
7215
7216 rtx tmp12 = gen_reg_rtx (V2DImode);
7217 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
7218
7219 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
7220 return;
7221 }
7222
7223 if (bits > 64)
7224 {
7225 /* Eight operations. */
7226 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7227 rtx tmp2 = gen_reg_rtx (V4SImode);
7228 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7229
7230 rtx tmp3 = gen_reg_rtx (V4SImode);
7231 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7232
7233 rtx tmp4 = gen_reg_rtx (V1TImode);
7234 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7235
7236 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7237 rtx tmp6 = gen_reg_rtx (V2DImode);
7238 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
7239
7240 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7241 rtx tmp8 = gen_reg_rtx (V1TImode);
7242 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
7243
7244 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7245 rtx tmp10 = gen_reg_rtx (V2DImode);
7246 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
7247
7248 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
7249 rtx tmp12 = gen_reg_rtx (V2DImode);
7250 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
7251
7252 rtx tmp13 = gen_reg_rtx (V2DImode);
7253 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
7254
7255 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
7256 }
7257 else
7258 {
7259 /* Nine operations. */
7260 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7261 rtx tmp2 = gen_reg_rtx (V4SImode);
7262 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7263
7264 rtx tmp3 = gen_reg_rtx (V4SImode);
7265 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7266
7267 rtx tmp4 = gen_reg_rtx (V1TImode);
7268 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7269
7270 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7271 rtx tmp6 = gen_reg_rtx (V2DImode);
7272 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
7273
7274 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7275 rtx tmp8 = gen_reg_rtx (V2DImode);
7276 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
7277
7278 rtx tmp9 = gen_reg_rtx (V2DImode);
7279 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
7280
7281 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7282 rtx tmp11 = gen_reg_rtx (V1TImode);
7283 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
7284
7285 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
7286 rtx tmp13 = gen_reg_rtx (V2DImode);
7287 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
7288
7289 rtx tmp14 = gen_reg_rtx (V2DImode);
7290 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
7291
7292 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
7293 }
7294}
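
/* For instance, in the bits == 127 case above the result is just the sign
   bit replicated across all 128 bits, which is why broadcasting the top
   word with pshufd and shifting it right arithmetically by 31 is enough.  */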
7295
7296/* Replace all occurrences of REG FROM with REG TO in X, including
7297 occurrences with different modes. */
7298
7299rtx
7300ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
7301{
7302 gcc_checking_assert (REG_P (from)
7303 && REG_P (to)
7304 && GET_MODE (from) == GET_MODE (to));
7305 if (!reg_overlap_mentioned_p (from, x))
7306 return x;
7307 rtx ret = copy_rtx (x);
7308 subrtx_ptr_iterator::array_type array;
7309 FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
7310 {
7311 rtx *loc = *iter;
7312 x = *loc;
7313 if (REG_P (x) && REGNO (x) == REGNO (from))
7314 {
7315 if (x == from)
7316 *loc = to;
7317 else
7318 {
7319 gcc_checking_assert (REG_NREGS (x) == 1);
7320 *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
7321 }
7322 }
7323 }
7324 return ret;
7325}
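
/* For example, replacing (reg:DI ax) with (reg:DI cx) also rewrites an
   embedded (reg:SI ax) or (reg:QI ax) use as the corresponding SImode or
   QImode reference to cx, provided the use occupies a single hard
   register.  */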
7326
7327/* Return mode for the memcpy/memset loop counter. Prefer SImode over
7328 DImode for constant loop counts. */
7329
7330static machine_mode
7331counter_mode (rtx count_exp)
7332{
7333 if (GET_MODE (count_exp) != VOIDmode)
7334 return GET_MODE (count_exp);
7335 if (!CONST_INT_P (count_exp))
7336 return Pmode;
7337 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
7338 return DImode;
7339 return SImode;
7340}
7341
/* When ISSETMEM is FALSE, output a simple loop that copies memory from SRCPTR
   to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is
   COUNT bytes.  When ISSETMEM is TRUE, output the equivalent loop that fills
   the memory with VALUE (which is expected to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide the MEM rtxes used to supply aliasing info.  */
7349
7350
7351static void
7352expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
7353 rtx destptr, rtx srcptr, rtx value,
7354 rtx count, machine_mode mode, int unroll,
7355 int expected_size, bool issetmem)
7356{
7357 rtx_code_label *out_label, *top_label;
7358 rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
7360 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
7361 rtx piece_size = GEN_INT (piece_size_n);
7362 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7363 rtx size;
7364 int i;
7365
7366 top_label = gen_label_rtx ();
7367 out_label = gen_label_rtx ();
7368 iter = gen_reg_rtx (iter_mode);
7369
7370 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7371 NULL, 1, OPTAB_DIRECT);
7372 /* Those two should combine. */
7373 if (piece_size == const1_rtx)
7374 {
7375 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7376 true, out_label);
7377 predict_jump (REG_BR_PROB_BASE * 10 / 100);
7378 }
7379 emit_move_insn (iter, const0_rtx);
7380
7381 emit_label (top_label);
7382
  tmp = convert_modes (Pmode, iter_mode, iter, true);
7384
  /* This assert could be relaxed - in that case we'll need to compute
     the smallest power of two containing PIECE_SIZE_N and pass it to
     offset_address.  */
7388 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7389 destmem = offset_address (destmem, tmp, piece_size_n);
7390 destmem = adjust_address (destmem, mode, 0);
7391
7392 if (!issetmem)
7393 {
7394 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7395 srcmem = adjust_address (srcmem, mode, 0);
7396
      /* When unrolling for chips that reorder memory reads and writes,
         we can save registers by using a single temporary.  Also, using
         four temporaries is overkill in 32-bit mode.  */
7400 if (!TARGET_64BIT && 0)
7401 {
7402 for (i = 0; i < unroll; i++)
7403 {
7404 if (i)
7405 {
7406 destmem = adjust_address (copy_rtx (destmem), mode,
7407 GET_MODE_SIZE (mode));
7408 srcmem = adjust_address (copy_rtx (srcmem), mode,
7409 GET_MODE_SIZE (mode));
7410 }
7411 emit_move_insn (destmem, srcmem);
7412 }
7413 }
7414 else
7415 {
7416 rtx tmpreg[4];
7417 gcc_assert (unroll <= 4);
7418 for (i = 0; i < unroll; i++)
7419 {
7420 tmpreg[i] = gen_reg_rtx (mode);
7421 if (i)
7422 srcmem = adjust_address (copy_rtx (srcmem), mode,
7423 GET_MODE_SIZE (mode));
7424 emit_move_insn (tmpreg[i], srcmem);
7425 }
7426 for (i = 0; i < unroll; i++)
7427 {
7428 if (i)
7429 destmem = adjust_address (copy_rtx (destmem), mode,
7430 GET_MODE_SIZE (mode));
7431 emit_move_insn (destmem, tmpreg[i]);
7432 }
7433 }
7434 }
7435 else
7436 for (i = 0; i < unroll; i++)
7437 {
7438 if (i)
7439 destmem = adjust_address (copy_rtx (destmem), mode,
7440 GET_MODE_SIZE (mode));
7441 emit_move_insn (destmem, value);
7442 }
7443
7444 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7445 true, OPTAB_LIB_WIDEN);
7446 if (tmp != iter)
7447 emit_move_insn (iter, tmp);
7448
7449 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7450 true, top_label);
7451 if (expected_size != -1)
7452 {
7453 expected_size /= GET_MODE_SIZE (mode) * unroll;
7454 if (expected_size == 0)
        predict_jump (0);
7456 else if (expected_size > REG_BR_PROB_BASE)
7457 predict_jump (REG_BR_PROB_BASE - 1);
7458 else
7459 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7460 / expected_size);
7461 }
7462 else
7463 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7464 iter = ix86_zero_extend_to_Pmode (iter);
7465 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7466 true, OPTAB_LIB_WIDEN);
7467 if (tmp != destptr)
7468 emit_move_insn (destptr, tmp);
7469 if (!issetmem)
7470 {
7471 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7472 true, OPTAB_LIB_WIDEN);
7473 if (tmp != srcptr)
7474 emit_move_insn (srcptr, tmp);
7475 }
7476 emit_label (out_label);
7477}
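
/* Loosely, the code emitted above has the following shape in the copy case,
   where "piece" stands for GET_MODE_SIZE (MODE); the single-byte-chunk
   variant additionally skips the loop when SIZE is zero:

     size = count & ~(piece * unroll - 1);
     iter = 0;
     do
       {
         ... copy piece * unroll bytes from srcmem + iter to destmem + iter,
             going through UNROLL temporary registers ...
         iter += piece * unroll;
       }
     while (iter < size);
     destptr += iter;  srcptr += iter;  */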
7478
7479/* Divide COUNTREG by SCALE. */
7480static rtx
7481scale_counter (rtx countreg, int scale)
7482{
7483 rtx sc;
7484
7485 if (scale == 1)
7486 return countreg;
7487 if (CONST_INT_P (countreg))
7488 return GEN_INT (INTVAL (countreg) / scale);
7489 gcc_assert (REG_P (countreg));
7490
7491 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7492 GEN_INT (exact_log2 (scale)),
7493 NULL, 1, OPTAB_DIRECT);
7494 return sc;
7495}
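
/* For example, scaling by 4 emits a logical shift right by 2, so a runtime
   byte count of 37 yields 9 chunks; a constant count is simply divided at
   expand time.  */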
7496
7497/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7498 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7499 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */
7503
7504static void
7505expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
7506 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7507 rtx count,
7508 machine_mode mode, bool issetmem)
7509{
7510 rtx destexp;
7511 rtx srcexp;
7512 rtx countreg;
7513 HOST_WIDE_INT rounded_count;
7514
7515 /* If possible, it is shorter to use rep movs.
7516 TODO: Maybe it is better to move this logic to decide_alg. */
7517 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
7518 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7519 && (!issetmem || orig_value == const0_rtx))
7520 mode = SImode;
7521
7522 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7523 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7524
  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
                                                       GET_MODE_SIZE (mode)));
7527 if (mode != QImode)
7528 {
7529 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7530 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7531 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7532 }
7533 else
7534 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7535 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7536 {
7537 rounded_count
7538 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7539 destmem = shallow_copy_rtx (destmem);
7540 set_mem_size (destmem, rounded_count);
7541 }
7542 else if (MEM_SIZE_KNOWN_P (destmem))
7543 clear_mem_size (destmem);
7544
7545 if (issetmem)
7546 {
7547 value = force_reg (mode, gen_lowpart (mode, value));
7548 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7549 }
7550 else
7551 {
7552 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7553 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7554 if (mode != QImode)
7555 {
7556 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7557 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7558 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7559 }
7560 else
7561 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7562 if (CONST_INT_P (count))
7563 {
7564 rounded_count
7565 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7566 srcmem = shallow_copy_rtx (srcmem);
7567 set_mem_size (srcmem, rounded_count);
7568 }
7569 else
7570 {
7571 if (MEM_SIZE_KNOWN_P (srcmem))
7572 clear_mem_size (srcmem);
7573 }
7574 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7575 destexp, srcexp));
7576 }
7577}
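
/* As an example of the QImode-to-SImode promotion above: a constant-size
   memset of 64 zero bytes is emitted as "rep stosd" with a count of 16
   rather than "rep stosb" with a count of 64, unless the tuning prefers
   movsb/stosb for known sizes.  */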
7578
7579/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7580 DESTMEM.
   SRCMEM is passed by pointer so that it can be updated on return.
   The return value is the updated DESTMEM.  */
7583static rtx
7584emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7585 HOST_WIDE_INT size_to_move)
7586{
7587 rtx dst = destmem, src = *srcmem, tempreg;
7588 enum insn_code code;
7589 machine_mode move_mode;
7590 int piece_size, i;
7591
7592 /* Find the widest mode in which we could perform moves.
7593 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
7594 it until move of such size is supported. */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
         || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7598 {
7599 gcc_assert (piece_size > 1);
7600 piece_size >>= 1;
7601 }
7602
7603 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7604 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7605 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7606 {
7607 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
          || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7610 {
7611 move_mode = word_mode;
7612 piece_size = GET_MODE_SIZE (move_mode);
          code = optab_handler (mov_optab, move_mode);
7614 }
7615 }
7616 gcc_assert (code != CODE_FOR_nothing);
7617
7618 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7619 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7620
  /* Emit the moves.  We'll need SIZE_TO_MOVE / PIECE_SIZE of them.  */
7622 gcc_assert (size_to_move % piece_size == 0);
7623
7624 for (i = 0; i < size_to_move; i += piece_size)
7625 {
7626 /* We move from memory to memory, so we'll need to do it via
7627 a temporary register. */
7628 tempreg = gen_reg_rtx (move_mode);
7629 emit_insn (GEN_FCN (code) (tempreg, src));
7630 emit_insn (GEN_FCN (code) (dst, tempreg));
7631
7632 emit_move_insn (destptr,
7633 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7634 emit_move_insn (srcptr,
7635 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7636
7637 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7638 piece_size);
7639 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7640 piece_size);
7641 }
7642
7643 /* Update DST and SRC rtx. */
7644 *srcmem = src;
7645 return dst;
7646}
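
/* For example, on x86-64 with SSE2 a SIZE_TO_MOVE of 32 is expected to be
   emitted as two 16-byte loads and stores through an SSE temporary
   (V2DImode), each pair followed by a 16-byte increment of DESTPTR and
   SRCPTR; without a suitable vector mode it falls back to word_mode
   moves.  */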
7647
/* Helper function for the string operations below.  Test whether VARIABLE
   is aligned to VALUE bytes (i.e. whether the bits of VARIABLE selected by
   VALUE are clear); if so, jump to the label that is returned.  EPILOGUE
   selects the branch probability hint.  */
7650
7651static rtx_code_label *
7652ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7653{
7654 rtx_code_label *label = gen_label_rtx ();
7655 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7656 if (GET_MODE (variable) == DImode)
7657 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7658 else
7659 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7660 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7661 1, label);
7662 if (epilogue)
7663 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7664 else
7665 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7666 return label;
7667}
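
/* Typical use in the epilogues below: ix86_expand_aligntest (count, 4, true)
   emits the equivalent of "test $4, count; je <label>", the caller then
   emits the 4-byte chunk handling and finally places <label> with
   emit_label, so the chunk is processed only when that bit of COUNT is
   set.  */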
7668
7669
7670/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7671
7672static void
7673expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7674 rtx destptr, rtx srcptr, rtx count, int max_size)
7675{
7676 rtx src, dest;
7677 if (CONST_INT_P (count))
7678 {
7679 HOST_WIDE_INT countval = INTVAL (count);
7680 HOST_WIDE_INT epilogue_size = countval % max_size;
7681 int i;
7682
7683 /* For now MAX_SIZE should be a power of 2. This assert could be
7684 relaxed, but it'll require a bit more complicated epilogue
7685 expanding. */
7686 gcc_assert ((max_size & (max_size - 1)) == 0);
7687 for (i = max_size; i >= 1; i >>= 1)
7688 {
7689 if (epilogue_size & i)
7690 destmem = emit_memmov (destmem, srcmem: &srcmem, destptr, srcptr, size_to_move: i);
7691 }
7692 return;
7693 }
7694 if (max_size > 8)
7695 {
7696 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7697 count, 1, OPTAB_DIRECT);
7698 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7699 count, QImode, unroll: 1, expected_size: 4, issetmem: false);
7700 return;
7701 }
7702
7703 /* When single stringop instructions are available, we can cheaply advance
7704 the dest and src pointers. Otherwise we save code size by maintaining an
7705 offset (zero is readily available from the preceding rep operation) and
7706 using x86 addressing modes. */
7707 if (TARGET_SINGLE_STRINGOP)
7708 {
7709 if (max_size > 4)
7710 {
7711 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 4, epilogue: true);
7712 src = change_address (srcmem, SImode, srcptr);
7713 dest = change_address (destmem, SImode, destptr);
7714 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7715 emit_label (label);
7716 LABEL_NUSES (label) = 1;
7717 }
7718 if (max_size > 2)
7719 {
7720 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 2, epilogue: true);
7721 src = change_address (srcmem, HImode, srcptr);
7722 dest = change_address (destmem, HImode, destptr);
7723 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7724 emit_label (label);
7725 LABEL_NUSES (label) = 1;
7726 }
7727 if (max_size > 1)
7728 {
7729 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 1, epilogue: true);
7730 src = change_address (srcmem, QImode, srcptr);
7731 dest = change_address (destmem, QImode, destptr);
7732 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7733 emit_label (label);
7734 LABEL_NUSES (label) = 1;
7735 }
7736 }
7737 else
7738 {
7739 rtx offset = force_reg (Pmode, const0_rtx);
7740 rtx tmp;
7741
7742 if (max_size > 4)
7743 {
7744 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 4, epilogue: true);
7745 src = change_address (srcmem, SImode, srcptr);
7746 dest = change_address (destmem, SImode, destptr);
7747 emit_move_insn (dest, src);
7748 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7749 true, OPTAB_LIB_WIDEN);
7750 if (tmp != offset)
7751 emit_move_insn (offset, tmp);
7752 emit_label (label);
7753 LABEL_NUSES (label) = 1;
7754 }
7755 if (max_size > 2)
7756 {
7757 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 2, epilogue: true);
7758 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7759 src = change_address (srcmem, HImode, tmp);
7760 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7761 dest = change_address (destmem, HImode, tmp);
7762 emit_move_insn (dest, src);
7763 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7764 true, OPTAB_LIB_WIDEN);
7765 if (tmp != offset)
7766 emit_move_insn (offset, tmp);
7767 emit_label (label);
7768 LABEL_NUSES (label) = 1;
7769 }
7770 if (max_size > 1)
7771 {
7772 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 1, epilogue: true);
7773 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7774 src = change_address (srcmem, QImode, tmp);
7775 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7776 dest = change_address (destmem, QImode, tmp);
7777 emit_move_insn (dest, src);
7778 emit_label (label);
7779 LABEL_NUSES (label) = 1;
7780 }
7781 }
7782}
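
/* Illustrative sketch, not part of GCC: a plain-C model of the epilogue
 emitted above for a non-constant COUNT with MAX_SIZE == 8 and
 TARGET_SINGLE_STRINGOP. Only count & (max_size - 1) bytes remain, so each
 power-of-two test copies at most one chunk. The helper name is
 hypothetical. */
static void
cpymem_epilogue_model (unsigned char *dst, const unsigned char *src,
                       unsigned long count)
{
  unsigned long i = 0;
  if (count & 4)
    {
      for (int j = 0; j < 4; j++)
        dst[i + j] = src[i + j];     /* SImode move.  */
      i += 4;
    }
  if (count & 2)
    {
      for (int j = 0; j < 2; j++)
        dst[i + j] = src[i + j];     /* HImode move.  */
      i += 2;
    }
  if (count & 1)
    dst[i] = src[i];                 /* QImode move.  */
}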
7783
7784/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7785 with value PROMOTED_VAL.
7786 DESTPTR is advanced past the stored bytes.
7787 Return value is the updated DST. */
7788static rtx
7789emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7790 HOST_WIDE_INT size_to_move)
7791{
7792 rtx dst = destmem;
7793 enum insn_code code;
7794 machine_mode move_mode;
7795 int piece_size, i;
7796
7797 /* Find the widest mode in which we could perform moves.
7798 Start from the mode of PROMOTED_VAL and narrow it if SIZE_TO_MOVE
7799 is smaller than that mode's size. */
7800 move_mode = GET_MODE (promoted_val);
7801 if (move_mode == VOIDmode)
7802 move_mode = QImode;
7803 if (size_to_move < GET_MODE_SIZE (move_mode))
7804 {
7805 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7806 move_mode = int_mode_for_size (size: move_bits, limit: 0).require ();
7807 promoted_val = gen_lowpart (move_mode, promoted_val);
7808 }
7809 piece_size = GET_MODE_SIZE (move_mode);
7810 code = optab_handler (op: mov_optab, mode: move_mode);
7811 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7812
7813 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7814
7815 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
7816 gcc_assert (size_to_move % piece_size == 0);
7817
7818 for (i = 0; i < size_to_move; i += piece_size)
7819 {
7820 if (piece_size <= GET_MODE_SIZE (word_mode))
7821 {
7822 emit_insn (gen_strset (destptr, dst, promoted_val));
7823 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7824 piece_size);
7825 continue;
7826 }
7827
7828 emit_insn (GEN_FCN (code) (dst, promoted_val));
7829
7830 emit_move_insn (destptr,
7831 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7832
7833 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7834 piece_size);
7835 }
7836
7837 /* Update DST rtx. */
7838 return dst;
7839}
7840/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7841static void
7842expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7843 rtx count, int max_size)
7844{
7845 count = expand_simple_binop (counter_mode (count_exp: count), AND, count,
7846 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7847 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7848 gen_lowpart (QImode, value), count, QImode,
7849 unroll: 1, expected_size: max_size / 2, issetmem: true);
7850}
7851
7852/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
7853static void
7854expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7855 rtx count, int max_size)
7856{
7857 rtx dest;
7858
7859 if (CONST_INT_P (count))
7860 {
7861 HOST_WIDE_INT countval = INTVAL (count);
7862 HOST_WIDE_INT epilogue_size = countval % max_size;
7863 int i;
7864
7865 /* For now MAX_SIZE should be a power of 2. This assert could be
7866 relaxed, but it'll require a bit more complicated epilogue
7867 expanding. */
7868 gcc_assert ((max_size & (max_size - 1)) == 0);
7869 for (i = max_size; i >= 1; i >>= 1)
7870 {
7871 if (epilogue_size & i)
7872 {
7873 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7874 destmem = emit_memset (destmem, destptr, promoted_val: vec_value, size_to_move: i);
7875 else
7876 destmem = emit_memset (destmem, destptr, promoted_val: value, size_to_move: i);
7877 }
7878 }
7879 return;
7880 }
7881 if (max_size > 32)
7882 {
7883 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7884 return;
7885 }
7886 if (max_size > 16)
7887 {
7888 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 16, epilogue: true);
7889 if (TARGET_64BIT)
7890 {
7891 dest = change_address (destmem, DImode, destptr);
7892 emit_insn (gen_strset (destptr, dest, value));
7893 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7894 emit_insn (gen_strset (destptr, dest, value));
7895 }
7896 else
7897 {
7898 dest = change_address (destmem, SImode, destptr);
7899 emit_insn (gen_strset (destptr, dest, value));
7900 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7901 emit_insn (gen_strset (destptr, dest, value));
7902 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7903 emit_insn (gen_strset (destptr, dest, value));
7904 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7905 emit_insn (gen_strset (destptr, dest, value));
7906 }
7907 emit_label (label);
7908 LABEL_NUSES (label) = 1;
7909 }
7910 if (max_size > 8)
7911 {
7912 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 8, epilogue: true);
7913 if (TARGET_64BIT)
7914 {
7915 dest = change_address (destmem, DImode, destptr);
7916 emit_insn (gen_strset (destptr, dest, value));
7917 }
7918 else
7919 {
7920 dest = change_address (destmem, SImode, destptr);
7921 emit_insn (gen_strset (destptr, dest, value));
7922 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7923 emit_insn (gen_strset (destptr, dest, value));
7924 }
7925 emit_label (label);
7926 LABEL_NUSES (label) = 1;
7927 }
7928 if (max_size > 4)
7929 {
7930 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 4, epilogue: true);
7931 dest = change_address (destmem, SImode, destptr);
7932 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7933 emit_label (label);
7934 LABEL_NUSES (label) = 1;
7935 }
7936 if (max_size > 2)
7937 {
7938 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 2, epilogue: true);
7939 dest = change_address (destmem, HImode, destptr);
7940 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7941 emit_label (label);
7942 LABEL_NUSES (label) = 1;
7943 }
7944 if (max_size > 1)
7945 {
7946 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: 1, epilogue: true);
7947 dest = change_address (destmem, QImode, destptr);
7948 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7949 emit_label (label);
7950 LABEL_NUSES (label) = 1;
7951 }
7952}
7953
7954/* Decrease COUNTREG by VALUE. */
7955static void
7956ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7957{
7958 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7959}
7960
7961/* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or store
7962 enough bytes into DESTMEM, to align it to DESIRED_ALIGNMENT. The original
7963 alignment is ALIGN. Depending on ISSETMEM, either SRCMEM/SRCPTR or
7964 VALUE/VEC_VALUE are ignored.
7965 Return value is the updated DESTMEM. */
7966
7967static rtx
7968expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7969 rtx destptr, rtx srcptr, rtx value,
7970 rtx vec_value, rtx count, int align,
7971 int desired_alignment, bool issetmem)
7972{
7973 int i;
7974 for (i = 1; i < desired_alignment; i <<= 1)
7975 {
7976 if (align <= i)
7977 {
7978 rtx_code_label *label = ix86_expand_aligntest (variable: destptr, value: i, epilogue: false);
7979 if (issetmem)
7980 {
7981 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7982 destmem = emit_memset (destmem, destptr, promoted_val: vec_value, size_to_move: i);
7983 else
7984 destmem = emit_memset (destmem, destptr, promoted_val: value, size_to_move: i);
7985 }
7986 else
7987 destmem = emit_memmov (destmem, srcmem: &srcmem, destptr, srcptr, size_to_move: i);
7988 ix86_adjust_counter (countreg: count, value: i);
7989 emit_label (label);
7990 LABEL_NUSES (label) = 1;
7991 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7992 }
7993 }
7994 return destmem;
7995}
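
/* Illustrative sketch, not part of GCC: the alignment prologue above in
 plain C for the memcpy case with ALIGN == 1 and DESIRED_ALIGNMENT == 8.
 It conditionally copies a 1-, 2- and 4-byte chunk so that *dst becomes
 8-byte aligned, advancing both pointers and decreasing the count, much as
 emit_memmov and ix86_adjust_counter do above. The helper name is
 hypothetical and the pointer-to-integer cast assumes an LP64 target. */
static void
align_prologue_model (unsigned char **dst, const unsigned char **src,
                      unsigned long *count)
{
  for (unsigned long i = 1; i < 8; i <<= 1)
    if (((unsigned long) *dst & i) != 0)
      {
        for (unsigned long j = 0; j < i; j++)
          (*dst)[j] = (*src)[j];
        *dst += i;
        *src += i;
        *count -= i;
      }
}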
7996
7997/* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
7998 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7999 and jump to DONE_LABEL. */
8000static void
8001expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
8002 rtx destptr, rtx srcptr,
8003 rtx value, rtx vec_value,
8004 rtx count, int size,
8005 rtx done_label, bool issetmem)
8006{
8007 rtx_code_label *label = ix86_expand_aligntest (variable: count, value: size, epilogue: false);
8008 machine_mode mode = int_mode_for_size (size: size * BITS_PER_UNIT, limit: 1).else_blk ();
8009 rtx modesize;
8010 int n;
8011
8012 /* If we do not have a vector value to copy, we must reduce the mode size. */
8013 if (issetmem)
8014 {
8015 if (!vec_value)
8016 {
8017 if (GET_MODE (value) == VOIDmode && size > 8)
8018 mode = Pmode;
8019 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
8020 mode = GET_MODE (value);
8021 }
8022 else
8023 mode = GET_MODE (vec_value), value = vec_value;
8024 }
8025 else
8026 {
8027 /* Choose appropriate vector mode. */
8028 if (size >= 32)
8029 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
8030 else if (size >= 16)
8031 mode = TARGET_SSE ? V16QImode : DImode;
8032 srcmem = change_address (srcmem, mode, srcptr);
8033 }
8034 destmem = change_address (destmem, mode, destptr);
8035 modesize = GEN_INT (GET_MODE_SIZE (mode));
8036 gcc_assert (GET_MODE_SIZE (mode) <= size);
8037 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
8038 {
8039 if (issetmem)
8040 emit_move_insn (destmem, gen_lowpart (mode, value));
8041 else
8042 {
8043 emit_move_insn (destmem, srcmem);
8044 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8045 }
8046 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8047 }
8048
8049 destmem = offset_address (destmem, count, 1);
8050 destmem = offset_address (destmem, GEN_INT (-2 * size),
8051 GET_MODE_SIZE (mode));
8052 if (!issetmem)
8053 {
8054 srcmem = offset_address (srcmem, count, 1);
8055 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
8056 GET_MODE_SIZE (mode));
8057 }
8058 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
8059 {
8060 if (issetmem)
8061 emit_move_insn (destmem, gen_lowpart (mode, value));
8062 else
8063 {
8064 emit_move_insn (destmem, srcmem);
8065 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8066 }
8067 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8068 }
8069 emit_jump_insn (gen_jump (done_label));
8070 emit_barrier ();
8071
8072 emit_label (label);
8073 LABEL_NUSES (label) = 1;
8074}
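
/* Illustrative sketch, not part of GCC: the core trick used above. For a
 block whose length is known to be in [SIZE, 2*SIZE), copying SIZE bytes
 from the start and SIZE bytes ending at COUNT covers the whole block
 (possibly with overlap) without any loop or precise length computation.
 SIZE == 4 here; the helper name is hypothetical. */
static void
small_cpymem_model (unsigned char *dst, const unsigned char *src,
                    unsigned long count)
{
  /* Caller guarantees 4 <= count < 8.  */
  for (int j = 0; j < 4; j++)
    dst[j] = src[j];                         /* First SIZE bytes.  */
  for (int j = 0; j < 4; j++)
    dst[count - 4 + j] = src[count - 4 + j]; /* Last SIZE bytes.  */
}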
8075
8076/* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
8077 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
8078 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that we can
8079 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
8080 DONE_LABEL is a label after the whole copying sequence. The label is created
8081 on demand if *DONE_LABEL is NULL.
8082 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
8083 bounds after the initial copies.
8084
8085 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
8086 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
8087 we will dispatch to a library call for large blocks.
8088
8089 In pseudocode we do:
8090
8091 if (COUNT < SIZE)
8092 {
8093 Assume that SIZE is 4. Bigger sizes are handled analogously
8094 if (COUNT & 4)
8095 {
8096 copy 4 bytes from SRCPTR to DESTPTR
8097 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
8098 goto done_label
8099 }
8100 if (!COUNT)
8101 goto done_label;
8102 copy 1 byte from SRCPTR to DESTPTR
8103 if (COUNT & 2)
8104 {
8105 copy 2 bytes from SRCPTR to DESTPTR
8106 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
8107 }
8108 }
8109 else
8110 {
8111 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
8112 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
8113
8114 OLD_DESTPTR = DESTPTR;
8115 Align DESTPTR up to DESIRED_ALIGN
8116 SRCPTR += DESTPTR - OLD_DESTPTR
8117 COUNT -= DESTPTR - OLD_DESTPTR
8118 if (DYNAMIC_CHECK)
8119 Round COUNT down to multiple of SIZE
8120 << optional caller supplied zero size guard is here >>
8121 << optional caller supplied dynamic check is here >>
8122 << caller supplied main copy loop is here >>
8123 }
8124 done_label:
8125 */
8126static void
8127expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
8128 rtx *destptr, rtx *srcptr,
8129 machine_mode mode,
8130 rtx value, rtx vec_value,
8131 rtx *count,
8132 rtx_code_label **done_label,
8133 int size,
8134 int desired_align,
8135 int align,
8136 unsigned HOST_WIDE_INT *min_size,
8137 bool dynamic_check,
8138 bool issetmem)
8139{
8140 rtx_code_label *loop_label = NULL, *label;
8141 int n;
8142 rtx modesize;
8143 int prolog_size = 0;
8144 rtx mode_value;
8145
8146 /* Choose the proper value to copy. */
8147 if (issetmem && VECTOR_MODE_P (mode))
8148 mode_value = vec_value;
8149 else
8150 mode_value = value;
8151 gcc_assert (GET_MODE_SIZE (mode) <= size);
8152
8153 /* See if block is big or small, handle small blocks. */
8154 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
8155 {
8156 int size2 = size;
8157 loop_label = gen_label_rtx ();
8158
8159 if (!*done_label)
8160 *done_label = gen_label_rtx ();
8161
8162 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
8163 1, loop_label);
8164 size2 >>= 1;
8165
8166 /* Handle sizes > 3. */
8167 for (;size2 > 2; size2 >>= 1)
8168 expand_small_cpymem_or_setmem (destmem, srcmem,
8169 destptr: *destptr, srcptr: *srcptr,
8170 value, vec_value,
8171 count: *count,
8172 size: size2, done_label: *done_label, issetmem);
8173 /* Nothing to copy? Jump to DONE_LABEL if so */
8174 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
8175 1, *done_label);
8176
8177 /* Do a byte copy. */
8178 destmem = change_address (destmem, QImode, *destptr);
8179 if (issetmem)
8180 emit_move_insn (destmem, gen_lowpart (QImode, value));
8181 else
8182 {
8183 srcmem = change_address (srcmem, QImode, *srcptr);
8184 emit_move_insn (destmem, srcmem);
8185 }
8186
8187 /* Handle sizes 2 and 3. */
8188 label = ix86_expand_aligntest (variable: *count, value: 2, epilogue: false);
8189 destmem = change_address (destmem, HImode, *destptr);
8190 destmem = offset_address (destmem, *count, 1);
8191 destmem = offset_address (destmem, GEN_INT (-2), 2);
8192 if (issetmem)
8193 emit_move_insn (destmem, gen_lowpart (HImode, value));
8194 else
8195 {
8196 srcmem = change_address (srcmem, HImode, *srcptr);
8197 srcmem = offset_address (srcmem, *count, 1);
8198 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
8199 emit_move_insn (destmem, srcmem);
8200 }
8201
8202 emit_label (label);
8203 LABEL_NUSES (label) = 1;
8204 emit_jump_insn (gen_jump (*done_label));
8205 emit_barrier ();
8206 }
8207 else
8208 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
8209 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
8210
8211 /* Start memcpy for COUNT >= SIZE. */
8212 if (loop_label)
8213 {
8214 emit_label (loop_label);
8215 LABEL_NUSES (loop_label) = 1;
8216 }
8217
8218 /* Copy first desired_align bytes. */
8219 if (!issetmem)
8220 srcmem = change_address (srcmem, mode, *srcptr);
8221 destmem = change_address (destmem, mode, *destptr);
8222 modesize = GEN_INT (GET_MODE_SIZE (mode));
8223 for (n = 0; prolog_size < desired_align - align; n++)
8224 {
8225 if (issetmem)
8226 emit_move_insn (destmem, mode_value);
8227 else
8228 {
8229 emit_move_insn (destmem, srcmem);
8230 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8231 }
8232 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8233 prolog_size += GET_MODE_SIZE (mode);
8234 }
8235
8236
8237 /* Copy last SIZE bytes. */
8238 destmem = offset_address (destmem, *count, 1);
8239 destmem = offset_address (destmem,
8240 GEN_INT (-size - prolog_size),
8241 1);
8242 if (issetmem)
8243 emit_move_insn (destmem, mode_value);
8244 else
8245 {
8246 srcmem = offset_address (srcmem, *count, 1);
8247 srcmem = offset_address (srcmem,
8248 GEN_INT (-size - prolog_size),
8249 1);
8250 emit_move_insn (destmem, srcmem);
8251 }
8252 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
8253 {
8254 destmem = offset_address (destmem, modesize, 1);
8255 if (issetmem)
8256 emit_move_insn (destmem, mode_value);
8257 else
8258 {
8259 srcmem = offset_address (srcmem, modesize, 1);
8260 emit_move_insn (destmem, srcmem);
8261 }
8262 }
8263
8264 /* Align destination. */
8265 if (desired_align > 1 && desired_align > align)
8266 {
8267 rtx saveddest = *destptr;
8268
8269 gcc_assert (desired_align <= size);
8270 /* Align destptr up, place it to new register. */
8271 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
8272 GEN_INT (prolog_size),
8273 NULL_RTX, 1, OPTAB_DIRECT);
8274 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
8275 REG_POINTER (*destptr) = 1;
8276 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
8277 GEN_INT (-desired_align),
8278 *destptr, 1, OPTAB_DIRECT);
8279 /* See how many bytes we skipped. */
8280 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
8281 *destptr,
8282 saveddest, 1, OPTAB_DIRECT);
8283 /* Adjust srcptr and count. */
8284 if (!issetmem)
8285 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
8286 saveddest, *srcptr, 1, OPTAB_DIRECT);
8287 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8288 saveddest, *count, 1, OPTAB_DIRECT);
8289 /* We copied at most size + prolog_size. */
8290 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
8291 *min_size
8292 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
8293 else
8294 *min_size = 0;
8295
8296 /* Our loops always round down the block size, but for dispatch to
8297 the library call we need the precise value. */
8298 if (dynamic_check)
8299 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
8300 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
8301 }
8302 else
8303 {
8304 gcc_assert (prolog_size == 0);
8305 /* Decrease count, so we won't end up copying last word twice. */
8306 if (!CONST_INT_P (*count))
8307 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8308 constm1_rtx, *count, 1, OPTAB_DIRECT);
8309 else
8310 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
8311 (unsigned HOST_WIDE_INT)size));
8312 if (*min_size)
8313 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
8314 }
8315}
8316
8317
8318/* This function is like the previous one, except here we know how many bytes
8319 need to be copied. That allows us to update alignment not only of DST, which
8320 is returned, but also of SRC, which is passed as a pointer for that
8321 reason. */
8322static rtx
8323expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
8324 rtx srcreg, rtx value, rtx vec_value,
8325 int desired_align, int align_bytes,
8326 bool issetmem)
8327{
8328 rtx src = NULL;
8329 rtx orig_dst = dst;
8330 rtx orig_src = NULL;
8331 int piece_size = 1;
8332 int copied_bytes = 0;
8333
8334 if (!issetmem)
8335 {
8336 gcc_assert (srcp != NULL);
8337 src = *srcp;
8338 orig_src = src;
8339 }
8340
8341 for (piece_size = 1;
8342 piece_size <= desired_align && copied_bytes < align_bytes;
8343 piece_size <<= 1)
8344 {
8345 if (align_bytes & piece_size)
8346 {
8347 if (issetmem)
8348 {
8349 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
8350 dst = emit_memset (destmem: dst, destptr: destreg, promoted_val: vec_value, size_to_move: piece_size);
8351 else
8352 dst = emit_memset (destmem: dst, destptr: destreg, promoted_val: value, size_to_move: piece_size);
8353 }
8354 else
8355 dst = emit_memmov (destmem: dst, srcmem: &src, destptr: destreg, srcptr: srcreg, size_to_move: piece_size);
8356 copied_bytes += piece_size;
8357 }
8358 }
8359 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
8360 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8361 if (MEM_SIZE_KNOWN_P (orig_dst))
8362 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
8363
8364 if (!issetmem)
8365 {
8366 int src_align_bytes = get_mem_align_offset (src, desired_align
8367 * BITS_PER_UNIT);
8368 if (src_align_bytes >= 0)
8369 src_align_bytes = desired_align - src_align_bytes;
8370 if (src_align_bytes >= 0)
8371 {
8372 unsigned int src_align;
8373 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
8374 {
8375 if ((src_align_bytes & (src_align - 1))
8376 == (align_bytes & (src_align - 1)))
8377 break;
8378 }
8379 if (src_align > (unsigned int) desired_align)
8380 src_align = desired_align;
8381 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
8382 set_mem_align (src, src_align * BITS_PER_UNIT);
8383 }
8384 if (MEM_SIZE_KNOWN_P (orig_src))
8385 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
8386 *srcp = src;
8387 }
8388
8389 return dst;
8390}
8391
8392/* Return true if ALG can be used in current context.
8393 Assume we expand memset if MEMSET is true. */
8394static bool
8395alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
8396{
8397 if (alg == no_stringop)
8398 return false;
8399 /* It is not possible to use a library call if we have non-default
8400 address space. We can do better than the generic byte-at-a-time
8401 loop, used as a fallback. */
8402 if (alg == libcall && have_as)
8403 return false;
8404 if (alg == vector_loop)
8405 return TARGET_SSE || TARGET_AVX;
8406 /* Algorithms using the rep prefix want at least edi and ecx;
8407 additionally, memset wants eax and memcpy wants esi. Don't
8408 consider such algorithms if the user has appropriated those
8409 registers for their own purposes, or if we have a non-default
8410 address space, since some string insns cannot override the segment. */
8411 if (alg == rep_prefix_1_byte
8412 || alg == rep_prefix_4_byte
8413 || alg == rep_prefix_8_byte)
8414 {
8415 if (have_as)
8416 return false;
8417 if (fixed_regs[CX_REG]
8418 || fixed_regs[DI_REG]
8419 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8420 return false;
8421 }
8422 return true;
8423}
8424
8425/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
8426static enum stringop_alg
8427decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8428 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8429 bool memset, bool zero_memset, bool have_as,
8430 int *dynamic_check, bool *noalign, bool recur)
8431{
8432 const struct stringop_algs *algs;
8433 bool optimize_for_speed;
8434 int max = 0;
8435 const struct processor_costs *cost;
8436 int i;
8437 bool any_alg_usable_p = false;
8438
8439 *noalign = false;
8440 *dynamic_check = -1;
8441
8442 /* Even if the string operation call is cold, we still might spend a lot
8443 of time processing large blocks. */
8444 if (optimize_function_for_size_p (cfun)
8445 || (optimize_insn_for_size_p ()
8446 && (max_size < 256
8447 || (expected_size != -1 && expected_size < 256))))
8448 optimize_for_speed = false;
8449 else
8450 optimize_for_speed = true;
8451
8452 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8453 if (memset)
8454 algs = &cost->memset[TARGET_64BIT != 0];
8455 else
8456 algs = &cost->memcpy[TARGET_64BIT != 0];
8457
8458 /* Find the maximal size covered by a usable non-libcall algorithm. */
8459 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8460 {
8461 enum stringop_alg candidate = algs->size[i].alg;
8462 bool usable = alg_usable_p (alg: candidate, memset, have_as);
8463 any_alg_usable_p |= usable;
8464
8465 if (candidate != libcall && candidate && usable)
8466 max = algs->size[i].max;
8467 }
8468
8469 /* If the expected size is not known but the max size is small enough
8470 that the inline version is a win, set the expected size to the middle
8471 of the range. */
8472 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8473 && expected_size == -1)
8474 expected_size = min_size / 2 + max_size / 2;
8475
8476 /* If user specified the algorithm, honor it if possible. */
8477 if (ix86_stringop_alg != no_stringop
8478 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8479 return ix86_stringop_alg;
8480 /* rep; movq or rep; movl is the smallest variant. */
8481 else if (!optimize_for_speed)
8482 {
8483 *noalign = true;
8484 if (!count || (count & 3) || (memset && !zero_memset))
8485 return alg_usable_p (alg: rep_prefix_1_byte, memset, have_as)
8486 ? rep_prefix_1_byte : loop_1_byte;
8487 else
8488 return alg_usable_p (alg: rep_prefix_4_byte, memset, have_as)
8489 ? rep_prefix_4_byte : loop;
8490 }
8491 /* Very tiny blocks are best handled via the loop, REP is expensive to
8492 setup. */
8493 else if (expected_size != -1 && expected_size < 4)
8494 return loop_1_byte;
8495 else if (expected_size != -1)
8496 {
8497 enum stringop_alg alg = libcall;
8498 bool alg_noalign = false;
8499 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8500 {
8501 /* We get here if the algorithms that were not libcall-based
8502 were rep-prefix based and we are unable to use rep prefixes
8503 based on global register usage. Break out of the loop and
8504 use the heuristic below. */
8505 if (algs->size[i].max == 0)
8506 break;
8507 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8508 {
8509 enum stringop_alg candidate = algs->size[i].alg;
8510
8511 if (candidate != libcall
8512 && alg_usable_p (alg: candidate, memset, have_as))
8513 {
8514 alg = candidate;
8515 alg_noalign = algs->size[i].noalign;
8516 }
8517 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8518 last non-libcall inline algorithm. */
8519 if (TARGET_INLINE_ALL_STRINGOPS)
8520 {
8521 /* When the current size is best to be copied by a libcall,
8522 but we are still forced to inline, run the heuristic below
8523 that will pick code for medium sized blocks. */
8524 if (alg != libcall)
8525 {
8526 *noalign = alg_noalign;
8527 return alg;
8528 }
8529 else if (!any_alg_usable_p)
8530 break;
8531 }
8532 else if (alg_usable_p (alg: candidate, memset, have_as)
8533 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8534 && candidate == rep_prefix_1_byte
8535 /* NB: If min_size != max_size, size is
8536 unknown. */
8537 && min_size != max_size))
8538 {
8539 *noalign = algs->size[i].noalign;
8540 return candidate;
8541 }
8542 }
8543 }
8544 }
8545 /* When asked to inline the call anyway, try to pick a meaningful choice.
8546 We look for the maximal size of block that is faster to copy by hand and
8547 take blocks of at most that size, guessing that the average size will
8548 be roughly half of the block.
8549
8550 If this turns out to be bad, we might simply specify the preferred
8551 choice in ix86_costs. */
8552 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8553 && (algs->unknown_size == libcall
8554 || !alg_usable_p (alg: algs->unknown_size, memset, have_as)))
8555 {
8556 enum stringop_alg alg;
8557 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8558
8559 /* If there aren't any usable algorithms or if recursing already,
8560 then recursing on smaller sizes or same size isn't going to
8561 find anything. Just return the simple byte-at-a-time copy loop. */
8562 if (!any_alg_usable_p || recur)
8563 {
8564 /* Pick something reasonable. */
8565 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8566 *dynamic_check = 128;
8567 return loop_1_byte;
8568 }
8569 alg = decide_alg (count, expected_size: new_expected_size, min_size, max_size, memset,
8570 zero_memset, have_as, dynamic_check, noalign, recur: true);
8571 gcc_assert (*dynamic_check == -1);
8572 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8573 *dynamic_check = max;
8574 else
8575 gcc_assert (alg != libcall);
8576 return alg;
8577 }
8578
8579 /* Try to use some reasonable fallback algorithm. Note that for
8580 non-default address spaces we default to a loop instead of
8581 a libcall. */
8582 return (alg_usable_p (alg: algs->unknown_size, memset, have_as)
8583 ? algs->unknown_size : have_as ? loop : libcall);
8584}
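
/* Illustrative sketch, not part of GCC: how the per-processor stringop
 table is consulted when the expected size is known, ignoring the usability
 and TARGET_INLINE_* checks done above. The layout mirrors the size entries
 of struct stringop_algs, but the thresholds and algorithm numbers below are
 made up for illustration only. */
static int
decide_alg_model (long expected_size)
{
  /* 0 = libcall, 1 = rep_prefix_1_byte, 2 = unrolled_loop (hypothetical
     encoding).  A max of -1 means "any larger size".  */
  static const struct { long max; int alg; } size_table[] = {
    { 64, 2 },
    { 4096, 1 },
    { -1, 0 },
  };
  for (unsigned int i = 0; i < sizeof size_table / sizeof size_table[0]; i++)
    if (size_table[i].max >= expected_size || size_table[i].max == -1)
      return size_table[i].alg;
  return 0;
}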
8585
8586/* Decide on alignment. We know that the operand is already aligned to ALIGN
8587 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8588static int
8589decide_alignment (int align,
8590 enum stringop_alg alg,
8591 int expected_size,
8592 machine_mode move_mode)
8593{
8594 int desired_align = 0;
8595
8596 gcc_assert (alg != no_stringop);
8597
8598 if (alg == libcall)
8599 return 0;
8600 if (move_mode == VOIDmode)
8601 return 0;
8602
8603 desired_align = GET_MODE_SIZE (move_mode);
8604 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
8605 copying a whole cacheline at once. */
8606 if (TARGET_CPU_P (PENTIUMPRO)
8607 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8608 desired_align = 8;
8609
8610 if (optimize_size)
8611 desired_align = 1;
8612 if (desired_align < align)
8613 desired_align = align;
8614 if (expected_size != -1 && expected_size < 4)
8615 desired_align = align;
8616
8617 return desired_align;
8618}
8619
8620
8621/* Helper function for memset. For QImode value 0xXY produce
8622 0xXYXYXYXY of the width specified by MODE. This is essentially
8623 a * 0x01010101, but we can do slightly better than
8624 synth_mult by unwinding the sequence by hand on CPUs with
8625 slow multiply. */
8626static rtx
8627promote_duplicated_reg (machine_mode mode, rtx val)
8628{
8629 machine_mode valmode = GET_MODE (val);
8630 rtx tmp;
8631 int nops = mode == DImode ? 3 : 2;
8632
8633 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8634 if (val == const0_rtx)
8635 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8636 if (CONST_INT_P (val))
8637 {
8638 HOST_WIDE_INT v = INTVAL (val) & 255;
8639
8640 v |= v << 8;
8641 v |= v << 16;
8642 if (mode == DImode)
8643 v |= (v << 16) << 16;
8644 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8645 }
8646
8647 if (valmode == VOIDmode)
8648 valmode = QImode;
8649 if (valmode != QImode)
8650 val = gen_lowpart (QImode, val);
8651 if (mode == QImode)
8652 return val;
8653 if (!TARGET_PARTIAL_REG_STALL)
8654 nops--;
8655 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8656 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8657 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8658 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8659 {
8660 rtx reg = convert_modes (mode, QImode, x: val, unsignedp: true);
8661 tmp = promote_duplicated_reg (mode, const1_rtx);
8662 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8663 OPTAB_DIRECT);
8664 }
8665 else
8666 {
8667 rtx reg = convert_modes (mode, QImode, x: val, unsignedp: true);
8668
8669 if (!TARGET_PARTIAL_REG_STALL)
8670 emit_insn (gen_insv_1 (arg0: mode, x0: reg, x1: reg));
8671 else
8672 {
8673 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8674 NULL, 1, OPTAB_DIRECT);
8675 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8676 OPTAB_DIRECT);
8677 }
8678 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8679 NULL, 1, OPTAB_DIRECT);
8680 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8681 if (mode == SImode)
8682 return reg;
8683 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8684 NULL, 1, OPTAB_DIRECT);
8685 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8686 return reg;
8687 }
8688}
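
/* Illustrative sketch, not part of GCC: what promote_duplicated_reg
 computes, written as the shift-and-or unwinding for a 64-bit mode.
 E.g. 0x5A becomes 0x5A5A5A5A5A5A5A5A. The helper name is hypothetical. */
static unsigned long long
broadcast_byte_model (unsigned char val)
{
  unsigned long long v = val;  /* 0x00000000000000XY  */
  v |= v << 8;                 /* 0x000000000000XYXY  */
  v |= v << 16;                /* 0x00000000XYXYXYXY  */
  v |= v << 32;                /* 0xXYXYXYXYXYXYXYXY  */
  return v;
}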
8689
8690/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
8691 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
8692 alignment from ALIGN to DESIRED_ALIGN. */
8693static rtx
8694promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8695 int align)
8696{
8697 rtx promoted_val;
8698
8699 if (TARGET_64BIT
8700 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8701 promoted_val = promote_duplicated_reg (DImode, val);
8702 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8703 promoted_val = promote_duplicated_reg (SImode, val);
8704 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8705 promoted_val = promote_duplicated_reg (HImode, val);
8706 else
8707 promoted_val = val;
8708
8709 return promoted_val;
8710}
8711
8712/* Copy the address to a Pmode register. This is used for x32 to
8713 truncate DImode TLS address to a SImode register. */
8714
8715static rtx
8716ix86_copy_addr_to_reg (rtx addr)
8717{
8718 rtx reg;
8719 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8720 {
8721 reg = copy_addr_to_reg (addr);
8722 REG_POINTER (reg) = 1;
8723 return reg;
8724 }
8725 else
8726 {
8727 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8728 reg = copy_to_mode_reg (DImode, addr);
8729 REG_POINTER (reg) = 1;
8730 return gen_rtx_SUBREG (SImode, reg, 0);
8731 }
8732}
8733
8734/* Expand string move (memcpy) or store (memset) operation. Use i386 string
8735 operations when profitable. The code depends upon architecture, block size
8736 and alignment, but always has one of the following overall structures:
8737
8738 Aligned move sequence:
8739
8740 1) Prologue guard: Conditional that jumps up to epilogues for small
8741 blocks that can be handled by the epilogue alone. This is faster
8742 but also needed for correctness, since the prologue assumes the block
8743 is larger than the desired alignment.
8744
8745 Optional dynamic check for size and libcall for large
8746 blocks is emitted here too, with -minline-stringops-dynamically.
8747
8748 2) Prologue: copy first few bytes in order to get destination
8749 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8750 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8751 copied. We emit either a jump tree on power of two sized
8752 blocks, or a byte loop.
8753
8754 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8755 with specified algorithm.
8756
8757 4) Epilogue: code copying tail of the block that is too small to be
8758 handled by main body (or up to size guarded by prologue guard).
8759
8760 Misaligned move sequence
8761
8762 1) Misaligned move prologue/epilogue containing:
8763 a) Prologue handling small memory blocks and jumping to done_label
8764 (skipped if blocks are known to be large enough)
8765 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
8766 needed by a single possibly misaligned move
8767 (skipped if alignment is not needed)
8768 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8769
8770 2) Zero size guard dispatching to done_label, if needed
8771
8772 3) Dispatch to a library call, if needed.
8773
8774 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8775 with the specified algorithm. */
8776bool
8777ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8778 rtx align_exp, rtx expected_align_exp,
8779 rtx expected_size_exp, rtx min_size_exp,
8780 rtx max_size_exp, rtx probable_max_size_exp,
8781 bool issetmem)
8782{
8783 rtx destreg;
8784 rtx srcreg = NULL;
8785 rtx_code_label *label = NULL;
8786 rtx tmp;
8787 rtx_code_label *jump_around_label = NULL;
8788 HOST_WIDE_INT align = 1;
8789 unsigned HOST_WIDE_INT count = 0;
8790 HOST_WIDE_INT expected_size = -1;
8791 int size_needed = 0, epilogue_size_needed;
8792 int desired_align = 0, align_bytes = 0;
8793 enum stringop_alg alg;
8794 rtx promoted_val = NULL;
8795 rtx vec_promoted_val = NULL;
8796 bool force_loopy_epilogue = false;
8797 int dynamic_check;
8798 bool need_zero_guard = false;
8799 bool noalign;
8800 machine_mode move_mode = VOIDmode;
8801 machine_mode wider_mode;
8802 int unroll_factor = 1;
8803 /* TODO: Once value ranges are available, fill in proper data. */
8804 unsigned HOST_WIDE_INT min_size = 0;
8805 unsigned HOST_WIDE_INT max_size = -1;
8806 unsigned HOST_WIDE_INT probable_max_size = -1;
8807 bool misaligned_prologue_used = false;
8808 bool have_as;
8809
8810 if (CONST_INT_P (align_exp))
8811 align = INTVAL (align_exp);
8812 /* i386 can do misaligned access at a reasonably increased cost. */
8813 if (CONST_INT_P (expected_align_exp)
8814 && INTVAL (expected_align_exp) > align)
8815 align = INTVAL (expected_align_exp);
8816 /* ALIGN is the minimum of destination and source alignment, but we care here
8817 just about destination alignment. */
8818 else if (!issetmem
8819 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8820 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8821
8822 if (CONST_INT_P (count_exp))
8823 {
8824 min_size = max_size = probable_max_size = count = expected_size
8825 = INTVAL (count_exp);
8826 /* When COUNT is 0, there is nothing to do. */
8827 if (!count)
8828 return true;
8829 }
8830 else
8831 {
8832 if (min_size_exp)
8833 min_size = INTVAL (min_size_exp);
8834 if (max_size_exp)
8835 max_size = INTVAL (max_size_exp);
8836 if (probable_max_size_exp)
8837 probable_max_size = INTVAL (probable_max_size_exp);
8838 if (CONST_INT_P (expected_size_exp))
8839 expected_size = INTVAL (expected_size_exp);
8840 }
8841
8842 /* Make sure we don't need to care about overflow later on. */
8843 if (count > (HOST_WIDE_INT_1U << 30))
8844 return false;
8845
8846 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8847 if (!issetmem)
8848 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8849
8850 /* Step 0: Decide on preferred algorithm, desired alignment and
8851 size of chunks to be copied by main loop. */
8852 alg = decide_alg (count, expected_size, min_size, max_size: probable_max_size,
8853 memset: issetmem,
8854 zero_memset: issetmem && val_exp == const0_rtx, have_as,
8855 dynamic_check: &dynamic_check, noalign: &noalign, recur: false);
8856
8857 if (dump_file)
8858 fprintf (stream: dump_file, format: "Selected stringop expansion strategy: %s\n",
8859 stringop_alg_names[alg]);
8860
8861 if (alg == libcall)
8862 return false;
8863 gcc_assert (alg != no_stringop);
8864
8865 /* For now the vector version of memset is generated only for memory zeroing, as
8866 creating the promoted vector value is very cheap in this case. */
8867 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8868 alg = unrolled_loop;
8869
8870 if (!count)
8871 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8872 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8873 if (!issetmem)
8874 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8875
8876 unroll_factor = 1;
8877 move_mode = word_mode;
8878 switch (alg)
8879 {
8880 case libcall:
8881 case no_stringop:
8882 case last_alg:
8883 gcc_unreachable ();
8884 case loop_1_byte:
8885 need_zero_guard = true;
8886 move_mode = QImode;
8887 break;
8888 case loop:
8889 need_zero_guard = true;
8890 break;
8891 case unrolled_loop:
8892 need_zero_guard = true;
8893 unroll_factor = (TARGET_64BIT ? 4 : 2);
8894 break;
8895 case vector_loop:
8896 need_zero_guard = true;
8897 unroll_factor = 4;
8898 /* Find the widest supported mode. */
8899 move_mode = word_mode;
8900 while (GET_MODE_WIDER_MODE (m: move_mode).exists (mode: &wider_mode)
8901 && optab_handler (op: mov_optab, mode: wider_mode) != CODE_FOR_nothing)
8902 move_mode = wider_mode;
8903
8904 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8905 move_mode = TImode;
8906 if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
8907 move_mode = OImode;
8908
8909 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8910 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8911 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8912 {
8913 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8914 if (!mode_for_vector (word_mode, nunits).exists (mode: &move_mode)
8915 || optab_handler (op: mov_optab, mode: move_mode) == CODE_FOR_nothing)
8916 move_mode = word_mode;
8917 }
8918 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8919 break;
8920 case rep_prefix_8_byte:
8921 move_mode = DImode;
8922 break;
8923 case rep_prefix_4_byte:
8924 move_mode = SImode;
8925 break;
8926 case rep_prefix_1_byte:
8927 move_mode = QImode;
8928 break;
8929 }
8930 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8931 epilogue_size_needed = size_needed;
8932
8933 /* If we are going to emit any library calls conditionally, make sure any
8934 pending stack adjustments happen before the first conditional branch,
8935 otherwise they will be emitted only before the library call and won't
8936 happen on the other branches. */
8937 if (dynamic_check != -1)
8938 do_pending_stack_adjust ();
8939
8940 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8941 if (!TARGET_ALIGN_STRINGOPS || noalign)
8942 align = desired_align;
8943
8944 /* Step 1: Prologue guard. */
8945
8946 /* Alignment code needs count to be in register. */
8947 if (CONST_INT_P (count_exp) && desired_align > align)
8948 {
8949 if (INTVAL (count_exp) > desired_align
8950 && INTVAL (count_exp) > size_needed)
8951 {
8952 align_bytes
8953 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8954 if (align_bytes <= 0)
8955 align_bytes = 0;
8956 else
8957 align_bytes = desired_align - align_bytes;
8958 }
8959 if (align_bytes == 0)
8960 count_exp = force_reg (counter_mode (count_exp), count_exp);
8961 }
8962 gcc_assert (desired_align >= 1 && align >= 1);
8963
8964 /* Misaligned move sequences handle both prologue and epilogue at once.
8965 Default code generation results in smaller code for large alignments
8966 and also avoids redundant work when sizes are known precisely. */
8967 misaligned_prologue_used
8968 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8969 && MAX (desired_align, epilogue_size_needed) <= 32
8970 && desired_align <= epilogue_size_needed
8971 && ((desired_align > align && !align_bytes)
8972 || (!count && epilogue_size_needed > 1)));
8973
8974 /* Do the cheap promotion to allow better CSE across the
8975 main loop and epilogue (i.e. one load of the big constant in
8976 front of all the code).
8977 For now the misaligned move sequences do not have a fast path
8978 without broadcasting. */
8979 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8980 {
8981 if (alg == vector_loop)
8982 {
8983 gcc_assert (val_exp == const0_rtx);
8984 vec_promoted_val = promote_duplicated_reg (mode: move_mode, val: val_exp);
8985 promoted_val = promote_duplicated_reg_to_size (val: val_exp,
8986 GET_MODE_SIZE (word_mode),
8987 desired_align, align);
8988 }
8989 else
8990 {
8991 promoted_val = promote_duplicated_reg_to_size (val: val_exp, size_needed,
8992 desired_align, align);
8993 }
8994 }
8995 /* Misaligned move sequences handle both prologues and epilogues at once.
8996 Default code generation results in smaller code for large alignments and
8997 also avoids redundant work when sizes are known precisely. */
8998 if (misaligned_prologue_used)
8999 {
9000 /* The misaligned move prologue handles small blocks by itself. */
9001 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
9002 (destmem: dst, srcmem: src, destptr: &destreg, srcptr: &srcreg,
9003 mode: move_mode, value: promoted_val, vec_value: vec_promoted_val,
9004 count: &count_exp,
9005 done_label: &jump_around_label,
9006 size: desired_align < align
9007 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
9008 desired_align, align, min_size: &min_size, dynamic_check, issetmem);
9009 if (!issetmem)
9010 src = change_address (src, BLKmode, srcreg);
9011 dst = change_address (dst, BLKmode, destreg);
9012 set_mem_align (dst, desired_align * BITS_PER_UNIT);
9013 epilogue_size_needed = 0;
9014 if (need_zero_guard
9015 && min_size < (unsigned HOST_WIDE_INT) size_needed)
9016 {
9017 /* It is possible that we copied enough so the main loop will not
9018 execute. */
9019 gcc_assert (size_needed > 1);
9020 if (jump_around_label == NULL_RTX)
9021 jump_around_label = gen_label_rtx ();
9022 emit_cmp_and_jump_insns (count_exp,
9023 GEN_INT (size_needed),
9024 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
9025 if (expected_size == -1
9026 || expected_size < (desired_align - align) / 2 + size_needed)
9027 predict_jump (REG_BR_PROB_BASE * 20 / 100);
9028 else
9029 predict_jump (REG_BR_PROB_BASE * 60 / 100);
9030 }
9031 }
9032 /* Ensure that alignment prologue won't copy past end of block. */
9033 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
9034 {
9035 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
9036 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
9037 Make sure it is power of 2. */
9038 epilogue_size_needed = 1 << (floor_log2 (x: epilogue_size_needed) + 1);
9039
9040 /* To improve performance of small blocks, we jump around the VAL
9041 promoting code. This means that if the promoted VAL is not constant,
9042 we might not use it in the epilogue and have to use the byte
9043 loop variant. */
9044 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
9045 force_loopy_epilogue = true;
9046 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9047 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9048 {
9049 /* If main algorithm works on QImode, no epilogue is needed.
9050 For small sizes just don't align anything. */
9051 if (size_needed == 1)
9052 desired_align = align;
9053 else
9054 goto epilogue;
9055 }
9056 else if (!count
9057 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9058 {
9059 label = gen_label_rtx ();
9060 emit_cmp_and_jump_insns (count_exp,
9061 GEN_INT (epilogue_size_needed),
9062 LTU, 0, counter_mode (count_exp), 1, label);
9063 if (expected_size == -1 || expected_size < epilogue_size_needed)
9064 predict_jump (REG_BR_PROB_BASE * 60 / 100);
9065 else
9066 predict_jump (REG_BR_PROB_BASE * 20 / 100);
9067 }
9068 }
9069
9070 /* Emit code to decide at runtime whether a library call or inline code
9071 should be used. */
9072 if (dynamic_check != -1)
9073 {
9074 if (!issetmem && CONST_INT_P (count_exp))
9075 {
9076 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
9077 {
9078 emit_block_copy_via_libcall (dst, src, size: count_exp);
9079 count_exp = const0_rtx;
9080 goto epilogue;
9081 }
9082 }
9083 else
9084 {
9085 rtx_code_label *hot_label = gen_label_rtx ();
9086 if (jump_around_label == NULL_RTX)
9087 jump_around_label = gen_label_rtx ();
9088 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
9089 LEU, 0, counter_mode (count_exp),
9090 1, hot_label);
9091 predict_jump (REG_BR_PROB_BASE * 90 / 100);
9092 if (issetmem)
9093 set_storage_via_libcall (dst, count_exp, val_exp);
9094 else
9095 emit_block_copy_via_libcall (dst, src, size: count_exp);
9096 emit_jump (jump_around_label);
9097 emit_label (hot_label);
9098 }
9099 }
9100
9101 /* Step 2: Alignment prologue. */
9102 /* Do the expensive promotion once we branched off the small blocks. */
9103 if (issetmem && !promoted_val)
9104 promoted_val = promote_duplicated_reg_to_size (val: val_exp, size_needed,
9105 desired_align, align);
9106
9107 if (desired_align > align && !misaligned_prologue_used)
9108 {
9109 if (align_bytes == 0)
9110 {
9111 /* Except for the first move in the prologue, we no longer know
9112 the constant offset in aliasing info. It doesn't seem worth
9113 the pain to maintain it for the first move, so throw away
9114 the info early. */
9115 dst = change_address (dst, BLKmode, destreg);
9116 if (!issetmem)
9117 src = change_address (src, BLKmode, srcreg);
9118 dst = expand_set_or_cpymem_prologue (destmem: dst, srcmem: src, destptr: destreg, srcptr: srcreg,
9119 value: promoted_val, vec_value: vec_promoted_val,
9120 count: count_exp, align, desired_alignment: desired_align,
9121 issetmem);
9122 /* At most desired_align - align bytes are copied. */
9123 if (min_size < (unsigned)(desired_align - align))
9124 min_size = 0;
9125 else
9126 min_size -= desired_align - align;
9127 }
9128 else
9129 {
9130 /* If we know how many bytes need to be stored before dst is
9131 sufficiently aligned, maintain aliasing info accurately. */
9132 dst = expand_set_or_cpymem_constant_prologue (dst, srcp: &src, destreg,
9133 srcreg,
9134 value: promoted_val,
9135 vec_value: vec_promoted_val,
9136 desired_align,
9137 align_bytes,
9138 issetmem);
9139
9140 count_exp = plus_constant (counter_mode (count_exp),
9141 count_exp, -align_bytes);
9142 count -= align_bytes;
9143 min_size -= align_bytes;
9144 max_size -= align_bytes;
9145 }
9146 if (need_zero_guard
9147 && min_size < (unsigned HOST_WIDE_INT) size_needed
9148 && (count < (unsigned HOST_WIDE_INT) size_needed
9149 || (align_bytes == 0
9150 && count < ((unsigned HOST_WIDE_INT) size_needed
9151 + desired_align - align))))
9152 {
9153 /* It is possible that we copied enough so the main loop will not
9154 execute. */
9155 gcc_assert (size_needed > 1);
9156 if (label == NULL_RTX)
9157 label = gen_label_rtx ();
9158 emit_cmp_and_jump_insns (count_exp,
9159 GEN_INT (size_needed),
9160 LTU, 0, counter_mode (count_exp), 1, label);
9161 if (expected_size == -1
9162 || expected_size < (desired_align - align) / 2 + size_needed)
9163 predict_jump (REG_BR_PROB_BASE * 20 / 100);
9164 else
9165 predict_jump (REG_BR_PROB_BASE * 60 / 100);
9166 }
9167 }
9168 if (label && size_needed == 1)
9169 {
9170 emit_label (label);
9171 LABEL_NUSES (label) = 1;
9172 label = NULL;
9173 epilogue_size_needed = 1;
9174 if (issetmem)
9175 promoted_val = val_exp;
9176 }
9177 else if (label == NULL_RTX && !misaligned_prologue_used)
9178 epilogue_size_needed = size_needed;
9179
9180 /* Step 3: Main loop. */
9181
9182 switch (alg)
9183 {
9184 case libcall:
9185 case no_stringop:
9186 case last_alg:
9187 gcc_unreachable ();
9188 case loop_1_byte:
9189 case loop:
9190 case unrolled_loop:
9191 expand_set_or_cpymem_via_loop (destmem: dst, srcmem: src, destptr: destreg, srcptr: srcreg, value: promoted_val,
9192 count: count_exp, mode: move_mode, unroll: unroll_factor,
9193 expected_size, issetmem);
9194 break;
9195 case vector_loop:
9196 expand_set_or_cpymem_via_loop (destmem: dst, srcmem: src, destptr: destreg, srcptr: srcreg,
9197 value: vec_promoted_val, count: count_exp, mode: move_mode,
9198 unroll: unroll_factor, expected_size, issetmem);
9199 break;
9200 case rep_prefix_8_byte:
9201 case rep_prefix_4_byte:
9202 case rep_prefix_1_byte:
9203 expand_set_or_cpymem_via_rep (destmem: dst, srcmem: src, destptr: destreg, srcptr: srcreg, value: promoted_val,
9204 orig_value: val_exp, count: count_exp, mode: move_mode, issetmem);
9205 break;
9206 }
9207 /* Adjust properly the offset of src and dest memory for aliasing. */
9208 if (CONST_INT_P (count_exp))
9209 {
9210 if (!issetmem)
9211 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
9212 (count / size_needed) * size_needed);
9213 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
9214 (count / size_needed) * size_needed);
9215 }
9216 else
9217 {
9218 if (!issetmem)
9219 src = change_address (src, BLKmode, srcreg);
9220 dst = change_address (dst, BLKmode, destreg);
9221 }
9222
9223 /* Step 4: Epilogue to copy the remaining bytes. */
9224 epilogue:
9225 if (label)
9226 {
9227 /* When the main loop is done, COUNT_EXP might hold original count,
9228 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
9229 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
9230 bytes. Compensate if needed. */
9231
9232 if (size_needed < epilogue_size_needed)
9233 {
9234 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
9235 GEN_INT (size_needed - 1), count_exp, 1,
9236 OPTAB_DIRECT);
9237 if (tmp != count_exp)
9238 emit_move_insn (count_exp, tmp);
9239 }
9240 emit_label (label);
9241 LABEL_NUSES (label) = 1;
9242 }
9243
9244 if (count_exp != const0_rtx && epilogue_size_needed > 1)
9245 {
9246 if (force_loopy_epilogue)
9247 expand_setmem_epilogue_via_loop (destmem: dst, destptr: destreg, value: val_exp, count: count_exp,
9248 max_size: epilogue_size_needed);
9249 else
9250 {
9251 if (issetmem)
9252 expand_setmem_epilogue (destmem: dst, destptr: destreg, value: promoted_val,
9253 vec_value: vec_promoted_val, count: count_exp,
9254 max_size: epilogue_size_needed);
9255 else
9256 expand_cpymem_epilogue (destmem: dst, srcmem: src, destptr: destreg, srcptr: srcreg, count: count_exp,
9257 max_size: epilogue_size_needed);
9258 }
9259 }
9260 if (jump_around_label)
9261 emit_label (jump_around_label);
9262 return true;
9263}
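
/* Illustrative sketch, not part of GCC: a plain-C model of the "aligned
 move sequence" described in the comment before ix86_expand_set_or_cpymem,
 for the memset case with DESIRED_ALIGN == SIZE_NEEDED == 16 (e.g. an SSE
 vector loop). Real code generation uses promoted/vector stores and
 power-of-two prologue chunks; the byte-wise loops, the helper name and
 the LP64 pointer cast are simplifying assumptions. */
static void
setmem_structure_model (unsigned char *dst, unsigned char val,
                        unsigned long count)
{
  unsigned long done = 0;

  /* 1) Prologue guard: small blocks are handled by the epilogue alone.  */
  if (count >= 16)
    {
      /* 2) Alignment prologue: store until DST is 16-byte aligned.  */
      while (((unsigned long) (dst + done) & 15) != 0)
        dst[done++] = val;

      /* 3) Main body: SIZE_NEEDED (16) bytes per iteration.  */
      for (; done + 16 <= count; done += 16)
        for (int j = 0; j < 16; j++)
          dst[done + j] = val;
    }

  /* 4) Epilogue: whatever remains (fewer than 16 bytes after the main
     body, or the whole small block).  */
  for (; done < count; done++)
    dst[done] = val;
}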
9264
9265/* Expand cmpstrn or memcmp. */
9266
9267bool
9268ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
9269 rtx length, rtx align, bool is_cmpstrn)
9270{
9271 /* Expand strncmp and memcmp only with -minline-all-stringops since
9272 "repz cmpsb" can be much slower than strncmp and memcmp functions
9273 implemented with vector instructions, see
9274
9275 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
9276 */
9277 if (!TARGET_INLINE_ALL_STRINGOPS)
9278 return false;
9279
9280 /* Can't use this if the user has appropriated ecx, esi or edi. */
9281 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
9282 return false;
9283
9284 if (is_cmpstrn)
9285 {
9286 /* For strncmp, length is the maximum length, which can be larger
9287 than actual string lengths. We can expand the cmpstrn pattern
9288 to "repz cmpsb" only if one of the strings is a constant so
9289 that expand_builtin_strncmp() can write the length argument to
9290 be the minimum of the const string length and the actual length
9291 argument. Otherwise, "repz cmpsb" may scan past the terminating 0 byte. */
9292 tree t1 = MEM_EXPR (src1);
9293 tree t2 = MEM_EXPR (src2);
9294 if (!((t1 && TREE_CODE (t1) == MEM_REF
9295 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
9296 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
9297 == STRING_CST))
9298 || (t2 && TREE_CODE (t2) == MEM_REF
9299 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
9300 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
9301 == STRING_CST))))
9302 return false;
9303 }
9304
9305 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
9306 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
9307 if (addr1 != XEXP (src1, 0))
9308 src1 = replace_equiv_address_nv (src1, addr1);
9309 if (addr2 != XEXP (src2, 0))
9310 src2 = replace_equiv_address_nv (src2, addr2);
9311
9312 /* NB: Make a copy of the data length to avoid changing the original
9313 data length by cmpstrnqi patterns. */
9314 length = ix86_zero_extend_to_Pmode (length);
9315 rtx lengthreg = gen_reg_rtx (Pmode);
9316 emit_move_insn (lengthreg, length);
9317
9318 /* If we are testing strict equality, we can use known alignment to
9319 good advantage. This may be possible with combine, particularly
9320 once cc0 is dead. */
9321 if (CONST_INT_P (length))
9322 {
9323 if (length == const0_rtx)
9324 {
9325 emit_move_insn (result, const0_rtx);
9326 return true;
9327 }
9328 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
9329 src1, src2));
9330 }
9331 else
9332 {
9333 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
9334 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
9335 src1, src2));
9336 }
9337
9338 rtx out = gen_lowpart (QImode, result);
9339 emit_insn (gen_cmpintqi (out));
9340 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
9341
9342 return true;
9343}
9344
9345/* Expand the appropriate insns for doing strlen if not just doing
9346 repnz; scasb
9347
9348 out = result, initialized with the start address
9349 align_rtx = alignment of the address.
9350 scratch = scratch register, initialized with the start address when
9351 not aligned, otherwise undefined
9352
9353 This is just the body. It needs the initializations mentioned above and
9354 some address computing at the end. These things are done in i386.md. */
9355
9356static void
9357ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
9358{
9359 int align;
9360 rtx tmp;
9361 rtx_code_label *align_2_label = NULL;
9362 rtx_code_label *align_3_label = NULL;
9363 rtx_code_label *align_4_label = gen_label_rtx ();
9364 rtx_code_label *end_0_label = gen_label_rtx ();
9365 rtx mem;
9366 rtx tmpreg = gen_reg_rtx (SImode);
9367 rtx scratch = gen_reg_rtx (SImode);
9368 rtx cmp;
9369
9370 align = 0;
9371 if (CONST_INT_P (align_rtx))
9372 align = INTVAL (align_rtx);
9373
9374 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9375
9376 /* Is there a known alignment and is it less than 4? */
9377 if (align < 4)
9378 {
9379 rtx scratch1 = gen_reg_rtx (Pmode);
9380 emit_move_insn (scratch1, out);
9381 /* Is there a known alignment and is it not 2? */
9382 if (align != 2)
9383 {
9384 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
9385 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
9386
9387 /* Leave just the 3 lower bits. */
9388 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
9389 NULL_RTX, 0, OPTAB_WIDEN);
9390
9391 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9392 Pmode, 1, align_4_label);
9393 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
9394 Pmode, 1, align_2_label);
9395 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
9396 Pmode, 1, align_3_label);
9397 }
9398 else
9399 {
9400 /* Since the alignment is 2, we have to check 2 or 0 bytes;
9401 check whether it is aligned to a 4-byte boundary. */
9402
9403 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
9404 NULL_RTX, 0, OPTAB_WIDEN);
9405
9406 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9407 Pmode, 1, align_4_label);
9408 }
9409
9410 mem = change_address (src, QImode, out);
9411
9412 /* Now compare the bytes. */
9413
9414 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
9415 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
9416 QImode, 1, end_0_label);
9417
9418 /* Increment the address. */
9419 emit_insn (gen_add2_insn (out, const1_rtx));
9420
9421 /* Not needed with an alignment of 2 */
9422 if (align != 2)
9423 {
9424 emit_label (align_2_label);
9425
9426 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9427 end_0_label);
9428
9429 emit_insn (gen_add2_insn (out, const1_rtx));
9430
9431 emit_label (align_3_label);
9432 }
9433
9434 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9435 end_0_label);
9436
9437 emit_insn (gen_add2_insn (out, const1_rtx));
9438 }
9439
9440 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
9441 align this loop: that only makes the program larger and does not
9442 make it any faster. */
9443 emit_label (align_4_label);
9444
9445 mem = change_address (src, SImode, out);
9446 emit_move_insn (scratch, mem);
9447 emit_insn (gen_add2_insn (out, GEN_INT (4)));
9448
9449 /* This formula yields a nonzero result iff one of the bytes is zero.
9450 This saves three branches inside the loop and many cycles. */
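 /* With x = scratch this computes (x - 0x01010101) & ~x & 0x80808080.
 The lowest zero byte of x becomes 0xff after the subtraction while
 its complement is 0xff as well, so its 0x80 bit survives both ANDs;
 a word without zero bytes yields 0. E.g. x = 0x11002233 gives
 0x00800000, flagging the zero byte. */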
9451
9452 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9453 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9454 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9455 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9456 gen_int_mode (0x80808080, SImode)));
9457 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9458 align_4_label);
9459
9460 if (TARGET_CMOVE)
9461 {
9462 rtx reg = gen_reg_rtx (SImode);
9463 rtx reg2 = gen_reg_rtx (Pmode);
9464 emit_move_insn (reg, tmpreg);
9465 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9466
9467 /* If zero is not in the first two bytes, move two bytes forward. */
9468 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9469 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9470 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9471 emit_insn (gen_rtx_SET (tmpreg,
9472 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9473 reg,
9474 tmpreg)));
9475 /* Emit lea manually to avoid clobbering of flags. */
9476 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9477
9478 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9479 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9480 emit_insn (gen_rtx_SET (out,
9481 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9482 reg2,
9483 out)));
9484 }
9485 else
9486 {
9487 rtx_code_label *end_2_label = gen_label_rtx ();
9488 /* Is zero in the first two bytes? */
9489
9490 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9491 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9492 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9493 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9494 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9495 pc_rtx);
9496 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9497 JUMP_LABEL (tmp) = end_2_label;
9498
9499 /* Not in the first two. Move two bytes forward. */
9500 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
9501 emit_insn (gen_add2_insn (out, const2_rtx));
9502
9503 emit_label (end_2_label);
9504
9505 }
9506
9507 /* Avoid branch in fixing the byte. */
9508 tmpreg = gen_lowpart (QImode, tmpreg);
9509 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9510 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9511 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
9512 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9513
9514 emit_label (end_0_label);
9515}
9516
9517/* Expand strlen. */
9518
9519bool
9520ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9521{
9522 if (TARGET_UNROLL_STRLEN
9523 && TARGET_INLINE_ALL_STRINGOPS
9524 && eoschar == const0_rtx
9525 && optimize > 1)
9526 {
9527 /* The generic case of the strlen expander is long. Avoid its
9528 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
9529 rtx addr = force_reg (Pmode, XEXP (src, 0));
9530 /* It seems that some optimizers do not combine a call like
9531 foo (strlen (bar), strlen (bar));
9532 when the move and the subtraction are done here, although the
9533 length is calculated just once when these instructions are emitted
9534 inside of output_strlen_unroll (). But since &bar[strlen (bar)] is
9535 often used, and this uses one fewer register for the lifetime of
9536 output_strlen_unroll (), it is better this way. */
9537
9538 emit_move_insn (out, addr);
9539
9540 ix86_expand_strlensi_unroll_1 (out, src, align);
9541
9542 /* strlensi_unroll_1 returns the address of the zero at the end of
9543 the string, like memchr(), so compute the length by subtracting
9544 the start address. */
9545 emit_insn (gen_sub2_insn (out, addr));
9546 return true;
9547 }
9548 else
9549 return false;
9550}
9551
9552/* For a given symbol (function), construct code to compute the address of its PLT
9553 entry in the large x86-64 PIC model. */
9554
9555static rtx
9556construct_plt_address (rtx symbol)
9557{
9558 rtx tmp, unspec;
9559
9560 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9561 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9562 gcc_assert (Pmode == DImode);
9563
9564 tmp = gen_reg_rtx (Pmode);
9565 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9566
9567 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9568 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9569 return tmp;
9570}
9571
9572/* Additional registers that are clobbered by SYSV calls. */
9573
9574static int const x86_64_ms_sysv_extra_clobbered_registers
9575 [NUM_X86_64_MS_CLOBBERED_REGS] =
9576{
9577 SI_REG, DI_REG,
9578 XMM6_REG, XMM7_REG,
9579 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9580 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9581};
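/* These are registers that the Microsoft x64 ABI treats as callee-saved
 but the System V AMD64 ABI treats as call-clobbered, so an MS-ABI
 caller has to assume that a SYSV callee clobbers them. */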
9582
9583rtx_insn *
9584ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9585 rtx callarg2,
9586 rtx pop, bool sibcall)
9587{
9588 rtx vec[3];
9589 rtx use = NULL, call;
9590 unsigned int vec_len = 0;
9591 tree fndecl;
9592
9593 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9594 {
9595 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9596 if (fndecl
9597 && (lookup_attribute ("interrupt",
9598 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
9599 error ("interrupt service routine cannot be called directly");
9600 }
9601 else
9602 fndecl = NULL_TREE;
9603
9604 if (pop == const0_rtx)
9605 pop = NULL;
9606 gcc_assert (!TARGET_64BIT || !pop);
9607
9608 rtx addr = XEXP (fnaddr, 0);
9609 if (TARGET_MACHO && !TARGET_64BIT)
9610 {
9611#if TARGET_MACHO
9612 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9613 fnaddr = machopic_indirect_call_target (fnaddr);
9614#endif
9615 }
9616 else
9617 {
9618 /* Static functions and indirect calls don't need the pic register. Also,
9619 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9620 it an indirect call. */
9621 if (flag_pic
9622 && GET_CODE (addr) == SYMBOL_REF
9623 && ix86_call_use_plt_p (addr))
9624 {
9625 if (flag_plt
9626 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9627 || !lookup_attribute ("noplt",
9628 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9629 {
9630 if (!TARGET_64BIT
9631 || (ix86_cmodel == CM_LARGE_PIC
9632 && DEFAULT_ABI != MS_ABI))
9633 {
9634 use_reg (&use, gen_rtx_REG (Pmode,
9635 REAL_PIC_OFFSET_TABLE_REGNUM));
9636 if (ix86_use_pseudo_pic_reg ())
9637 emit_move_insn (gen_rtx_REG (Pmode,
9638 REAL_PIC_OFFSET_TABLE_REGNUM),
9639 pic_offset_table_rtx);
9640 }
9641 }
9642 else if (!TARGET_PECOFF && !TARGET_MACHO)
9643 {
9644 if (TARGET_64BIT
9645 && ix86_cmodel == CM_LARGE_PIC
9646 && DEFAULT_ABI != MS_ABI)
9647 {
9648 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9649 UNSPEC_GOT);
9650 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9651 fnaddr = force_reg (Pmode, fnaddr);
9652 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9653 }
9654 else if (TARGET_64BIT)
9655 {
9656 fnaddr = gen_rtx_UNSPEC (Pmode,
9657 gen_rtvec (1, addr),
9658 UNSPEC_GOTPCREL);
9659 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9660 }
9661 else
9662 {
9663 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9664 UNSPEC_GOT);
9665 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9666 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9667 fnaddr);
9668 }
9669 fnaddr = gen_const_mem (Pmode, fnaddr);
9670 /* Pmode may not be the same as word_mode for x32, which
9671 doesn't support indirect branch via 32-bit memory slot.
9672 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9673 indirect branch via x32 GOT slot is OK. */
9674 if (GET_MODE (fnaddr) != word_mode)
9675 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9676 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9677 }
9678 }
9679 }
9680
9681 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9682 parameters passed in vector registers. */
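 /* In the 64-bit SysV ABI, %al carries an upper bound on the number of
 vector registers used to pass arguments to a varargs function, which
 is why CALLARG2 is copied into AL before the call. */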
9683 if (TARGET_64BIT
9684 && (INTVAL (callarg2) > 0
9685 || (INTVAL (callarg2) == 0
9686 && (TARGET_SSE || !flag_skip_rax_setup))))
9687 {
9688 rtx al = gen_rtx_REG (QImode, AX_REG);
9689 emit_move_insn (al, callarg2);
9690 use_reg (&use, al);
9691 }
9692
9693 if (ix86_cmodel == CM_LARGE_PIC
9694 && !TARGET_PECOFF
9695 && MEM_P (fnaddr)
9696 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9697 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9698 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9699 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9700 branch via x32 GOT slot is OK. */
9701 else if (!(TARGET_X32
9702 && MEM_P (fnaddr)
9703 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9704 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9705 && (sibcall
9706 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9707 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9708 {
9709 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9710 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9711 }
9712
9713 /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
9714 mask off code pointers here.
9715 TODO: also need to handle indirect jump. */
9716 if (ix86_memtag_can_tag_addresses () && !fndecl
9717 && sanitize_flags_p (SANITIZE_HWADDRESS))
9718 {
9719 rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
9720 NULL_RTX);
9721 fnaddr = gen_rtx_MEM (QImode, untagged_addr);
9722 }
9723
9724 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9725
9726 if (retval)
9727 call = gen_rtx_SET (retval, call);
9728 vec[vec_len++] = call;
9729
9730 if (pop)
9731 {
9732 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9733 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9734 vec[vec_len++] = pop;
9735 }
9736
9737 if (cfun->machine->no_caller_saved_registers
9738 && (!fndecl
9739 || (!TREE_THIS_VOLATILE (fndecl)
9740 && !lookup_attribute ("no_caller_saved_registers",
9741 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9742 {
9743 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9744 bool is_64bit_ms_abi = (TARGET_64BIT
9745 && ix86_function_abi (fndecl) == MS_ABI);
9746 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9747
9748 /* If there are no caller-saved registers, add all registers
9749 that are clobbered by the call which returns. */
9750 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9751 if (!fixed_regs[i]
9752 && (ix86_call_used_regs[i] == 1
9753 || (ix86_call_used_regs[i] & c_mask))
9754 && !STACK_REGNO_P (i)
9755 && !MMX_REGNO_P (i))
9756 clobber_reg (&use,
9757 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9758 }
9759 else if (TARGET_64BIT_MS_ABI
9760 && (!callarg2 || INTVAL (callarg2) != -2))
9761 {
9762 unsigned i;
9763
9764 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9765 {
9766 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9767 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9768
9769 clobber_reg (&use, gen_rtx_REG (mode, regno));
9770 }
9771
9772 /* Set here, but it may get cleared later. */
9773 if (TARGET_CALL_MS2SYSV_XLOGUES)
9774 {
9775 if (!TARGET_SSE)
9776 ;
9777
9778 /* Don't break hot-patched functions. */
9779 else if (ix86_function_ms_hook_prologue (current_function_decl))
9780 ;
9781
9782 /* TODO: Cases not yet examined. */
9783 else if (flag_split_stack)
9784 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9785
9786 else
9787 {
9788 gcc_assert (!reload_completed);
9789 cfun->machine->call_ms2sysv = true;
9790 }
9791 }
9792 }
9793
9794 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9795 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9796 || !fndecl || TREE_PUBLIC (fndecl)))
9797 {
9798 /* We allow public functions defined in a TU to bind locally for PIC
9799 code (the default) on 64bit Mach-O.
9800 If such functions are not inlined, we cannot tell at compile-time if
9801 they will be called via the lazy symbol resolver (this can depend on
9802 options given at link-time). Therefore, we must assume that the lazy
9803 resolver could be used which clobbers R11 and R10. */
9804 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9805 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9806 }
9807
9808 if (vec_len > 1)
9809 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9810 rtx_insn *call_insn = emit_call_insn (call);
9811 if (use)
9812 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9813
9814 return call_insn;
9815}
9816
9817/* Split a simple return popping POPC bytes from the stack into an
9818 indirect branch with a stack adjustment. */
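/* That is, instead of "ret N" this emits a "pop %ecx", an
 "add $N, %esp" and an indirect jump through %ecx, with CFI notes
 recording the CFA adjustment and that the return address now lives
 in ECX. */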
9819
9820void
9821ix86_split_simple_return_pop_internal (rtx popc)
9822{
9823 struct machine_function *m = cfun->machine;
9824 rtx ecx = gen_rtx_REG (SImode, CX_REG);
9825 rtx_insn *insn;
9826
9827 /* There is no "pascal" calling convention in any 64bit ABI. */
9828 gcc_assert (!TARGET_64BIT);
9829
9830 insn = emit_insn (gen_pop (ecx));
9831 m->fs.cfa_offset -= UNITS_PER_WORD;
9832 m->fs.sp_offset -= UNITS_PER_WORD;
9833
9834 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9835 x = gen_rtx_SET (stack_pointer_rtx, x);
9836 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9837 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9838 RTX_FRAME_RELATED_P (insn) = 1;
9839
9840 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9841 x = gen_rtx_SET (stack_pointer_rtx, x);
9842 insn = emit_insn (x);
9843 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9844 RTX_FRAME_RELATED_P (insn) = 1;
9845
9846 /* Now return address is in ECX. */
9847 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9848}
9849
9850/* Errors in the source file can cause expand_expr to return const0_rtx
9851 where we expect a vector. To avoid crashing, use one of the vector
9852 clear instructions. */
9853
9854static rtx
9855safe_vector_operand (rtx x, machine_mode mode)
9856{
9857 if (x == const0_rtx)
9858 x = CONST0_RTX (mode);
9859 return x;
9860}
9861
9862/* Subroutine of ix86_expand_builtin to take care of binop insns. */
9863
9864static rtx
9865ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9866{
9867 rtx pat;
9868 tree arg0 = CALL_EXPR_ARG (exp, 0);
9869 tree arg1 = CALL_EXPR_ARG (exp, 1);
9870 rtx op0 = expand_normal (arg0);
9871 rtx op1 = expand_normal (arg1);
9872 machine_mode tmode = insn_data[icode].operand[0].mode;
9873 machine_mode mode0 = insn_data[icode].operand[1].mode;
9874 machine_mode mode1 = insn_data[icode].operand[2].mode;
9875
9876 if (VECTOR_MODE_P (mode0))
9877 op0 = safe_vector_operand (op0, mode0);
9878 if (VECTOR_MODE_P (mode1))
9879 op1 = safe_vector_operand (op1, mode1);
9880
9881 if (optimize || !target
9882 || GET_MODE (target) != tmode
9883 || !insn_data[icode].operand[0].predicate (target, tmode))
9884 target = gen_reg_rtx (tmode);
9885
9886 if (GET_MODE (op1) == SImode && mode1 == TImode)
9887 {
9888 rtx x = gen_reg_rtx (V4SImode);
9889 emit_insn (gen_sse2_loadd (x, op1));
9890 op1 = gen_lowpart (TImode, x);
9891 }
9892
9893 if (!insn_data[icode].operand[1].predicate (op0, mode0))
9894 op0 = copy_to_mode_reg (mode0, op0);
9895 if (!insn_data[icode].operand[2].predicate (op1, mode1))
9896 op1 = copy_to_mode_reg (mode1, op1);
9897
9898 pat = GEN_FCN (icode) (target, op0, op1);
9899 if (! pat)
9900 return 0;
9901
9902 emit_insn (pat);
9903
9904 return target;
9905}
9906
9907/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9908
9909static rtx
9910ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9911 enum ix86_builtin_func_type m_type,
9912 enum rtx_code sub_code)
9913{
9914 rtx pat;
9915 unsigned int i, nargs;
9916 bool comparison_p = false;
9917 bool tf_p = false;
9918 bool last_arg_constant = false;
9919 int num_memory = 0;
9920 rtx xops[4];
9921
9922 machine_mode tmode = insn_data[icode].operand[0].mode;
9923
9924 switch (m_type)
9925 {
9926 case MULTI_ARG_4_DF2_DI_I:
9927 case MULTI_ARG_4_DF2_DI_I1:
9928 case MULTI_ARG_4_SF2_SI_I:
9929 case MULTI_ARG_4_SF2_SI_I1:
9930 nargs = 4;
9931 last_arg_constant = true;
9932 break;
9933
9934 case MULTI_ARG_3_SF:
9935 case MULTI_ARG_3_DF:
9936 case MULTI_ARG_3_SF2:
9937 case MULTI_ARG_3_DF2:
9938 case MULTI_ARG_3_DI:
9939 case MULTI_ARG_3_SI:
9940 case MULTI_ARG_3_SI_DI:
9941 case MULTI_ARG_3_HI:
9942 case MULTI_ARG_3_HI_SI:
9943 case MULTI_ARG_3_QI:
9944 case MULTI_ARG_3_DI2:
9945 case MULTI_ARG_3_SI2:
9946 case MULTI_ARG_3_HI2:
9947 case MULTI_ARG_3_QI2:
9948 nargs = 3;
9949 break;
9950
9951 case MULTI_ARG_2_SF:
9952 case MULTI_ARG_2_DF:
9953 case MULTI_ARG_2_DI:
9954 case MULTI_ARG_2_SI:
9955 case MULTI_ARG_2_HI:
9956 case MULTI_ARG_2_QI:
9957 nargs = 2;
9958 break;
9959
9960 case MULTI_ARG_2_DI_IMM:
9961 case MULTI_ARG_2_SI_IMM:
9962 case MULTI_ARG_2_HI_IMM:
9963 case MULTI_ARG_2_QI_IMM:
9964 nargs = 2;
9965 last_arg_constant = true;
9966 break;
9967
9968 case MULTI_ARG_1_SF:
9969 case MULTI_ARG_1_DF:
9970 case MULTI_ARG_1_SF2:
9971 case MULTI_ARG_1_DF2:
9972 case MULTI_ARG_1_DI:
9973 case MULTI_ARG_1_SI:
9974 case MULTI_ARG_1_HI:
9975 case MULTI_ARG_1_QI:
9976 case MULTI_ARG_1_SI_DI:
9977 case MULTI_ARG_1_HI_DI:
9978 case MULTI_ARG_1_HI_SI:
9979 case MULTI_ARG_1_QI_DI:
9980 case MULTI_ARG_1_QI_SI:
9981 case MULTI_ARG_1_QI_HI:
9982 nargs = 1;
9983 break;
9984
9985 case MULTI_ARG_2_DI_CMP:
9986 case MULTI_ARG_2_SI_CMP:
9987 case MULTI_ARG_2_HI_CMP:
9988 case MULTI_ARG_2_QI_CMP:
9989 nargs = 2;
9990 comparison_p = true;
9991 break;
9992
9993 case MULTI_ARG_2_SF_TF:
9994 case MULTI_ARG_2_DF_TF:
9995 case MULTI_ARG_2_DI_TF:
9996 case MULTI_ARG_2_SI_TF:
9997 case MULTI_ARG_2_HI_TF:
9998 case MULTI_ARG_2_QI_TF:
9999 nargs = 2;
10000 tf_p = true;
10001 break;
10002
10003 default:
10004 gcc_unreachable ();
10005 }
10006
10007 if (optimize || !target
10008 || GET_MODE (target) != tmode
10009 || !insn_data[icode].operand[0].predicate (target, tmode))
10010 target = gen_reg_rtx (tmode);
10011 else if (memory_operand (target, tmode))
10012 num_memory++;
10013
10014 gcc_assert (nargs <= ARRAY_SIZE (xops));
10015
10016 for (i = 0; i < nargs; i++)
10017 {
10018 tree arg = CALL_EXPR_ARG (exp, i);
10019 rtx op = expand_normal (arg);
10020 int adjust = (comparison_p) ? 1 : 0;
10021 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
10022
10023 if (last_arg_constant && i == nargs - 1)
10024 {
10025 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
10026 {
10027 enum insn_code new_icode = icode;
10028 switch (icode)
10029 {
10030 case CODE_FOR_xop_vpermil2v2df3:
10031 case CODE_FOR_xop_vpermil2v4sf3:
10032 case CODE_FOR_xop_vpermil2v4df3:
10033 case CODE_FOR_xop_vpermil2v8sf3:
10034 error ("the last argument must be a 2-bit immediate");
10035 return gen_reg_rtx (tmode);
10036 case CODE_FOR_xop_rotlv2di3:
10037 new_icode = CODE_FOR_rotlv2di3;
10038 goto xop_rotl;
10039 case CODE_FOR_xop_rotlv4si3:
10040 new_icode = CODE_FOR_rotlv4si3;
10041 goto xop_rotl;
10042 case CODE_FOR_xop_rotlv8hi3:
10043 new_icode = CODE_FOR_rotlv8hi3;
10044 goto xop_rotl;
10045 case CODE_FOR_xop_rotlv16qi3:
10046 new_icode = CODE_FOR_rotlv16qi3;
10047 xop_rotl:
10048 if (CONST_INT_P (op))
10049 {
10050 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
10051 op = GEN_INT (INTVAL (op) & mask);
10052 gcc_checking_assert
10053 (insn_data[icode].operand[i + 1].predicate (op, mode));
10054 }
10055 else
10056 {
10057 gcc_checking_assert
10058 (nargs == 2
10059 && insn_data[new_icode].operand[0].mode == tmode
10060 && insn_data[new_icode].operand[1].mode == tmode
10061 && insn_data[new_icode].operand[2].mode == mode
10062 && insn_data[new_icode].operand[0].predicate
10063 == insn_data[icode].operand[0].predicate
10064 && insn_data[new_icode].operand[1].predicate
10065 == insn_data[icode].operand[1].predicate);
10066 icode = new_icode;
10067 goto non_constant;
10068 }
10069 break;
10070 default:
10071 gcc_unreachable ();
10072 }
10073 }
10074 }
10075 else
10076 {
10077 non_constant:
10078 if (VECTOR_MODE_P (mode))
10079 op = safe_vector_operand (op, mode);
10080
10081 /* If we aren't optimizing, only allow one memory operand to be
10082 generated. */
10083 if (memory_operand (op, mode))
10084 num_memory++;
10085
10086 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
10087
10088 if (optimize
10089 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
10090 || num_memory > 1)
10091 op = force_reg (mode, op);
10092 }
10093
10094 xops[i] = op;
10095 }
10096
10097 switch (nargs)
10098 {
10099 case 1:
10100 pat = GEN_FCN (icode) (target, xops[0]);
10101 break;
10102
10103 case 2:
10104 if (tf_p)
10105 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10106 GEN_INT ((int)sub_code));
10107 else if (! comparison_p)
10108 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10109 else
10110 {
10111 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
10112 xops[0], xops[1]);
10113
10114 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
10115 }
10116 break;
10117
10118 case 3:
10119 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10120 break;
10121
10122 case 4:
10123 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
10124 break;
10125
10126 default:
10127 gcc_unreachable ();
10128 }
10129
10130 if (! pat)
10131 return 0;
10132
10133 emit_insn (pat);
10134 return target;
10135}
10136
10137/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
10138 insns with vec_merge. */
10139
10140static rtx
10141ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
10142 rtx target)
10143{
10144 rtx pat;
10145 tree arg0 = CALL_EXPR_ARG (exp, 0);
10146 rtx op1, op0 = expand_normal (arg0);
10147 machine_mode tmode = insn_data[icode].operand[0].mode;
10148 machine_mode mode0 = insn_data[icode].operand[1].mode;
10149
10150 if (optimize || !target
10151 || GET_MODE (target) != tmode
10152 || !insn_data[icode].operand[0].predicate (target, tmode))
10153 target = gen_reg_rtx (tmode);
10154
10155 if (VECTOR_MODE_P (mode0))
10156 op0 = safe_vector_operand (op0, mode0);
10157
10158 if ((optimize && !register_operand (op0, mode0))
10159 || !insn_data[icode].operand[1].predicate (op0, mode0))
10160 op0 = copy_to_mode_reg (mode0, op0);
10161
10162 op1 = op0;
10163 if (!insn_data[icode].operand[2].predicate (op1, mode0))
10164 op1 = copy_to_mode_reg (mode0, op1);
10165
10166 pat = GEN_FCN (icode) (target, op0, op1);
10167 if (! pat)
10168 return 0;
10169 emit_insn (pat);
10170 return target;
10171}
10172
10173/* Subroutine of ix86_expand_builtin to take care of comparison insns. */
10174
10175static rtx
10176ix86_expand_sse_compare (const struct builtin_description *d,
10177 tree exp, rtx target, bool swap)
10178{
10179 rtx pat;
10180 tree arg0 = CALL_EXPR_ARG (exp, 0);
10181 tree arg1 = CALL_EXPR_ARG (exp, 1);
10182 rtx op0 = expand_normal (arg0);
10183 rtx op1 = expand_normal (arg1);
10184 rtx op2;
10185 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10186 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10187 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10188 enum rtx_code comparison = d->comparison;
10189
10190 if (VECTOR_MODE_P (mode0))
10191 op0 = safe_vector_operand (op0, mode0);
10192 if (VECTOR_MODE_P (mode1))
10193 op1 = safe_vector_operand (op1, mode1);
10194
10195 /* Swap operands if we have a comparison that isn't available in
10196 hardware. */
10197 if (swap)
10198 std::swap (op0, op1);
10199
10200 if (optimize || !target
10201 || GET_MODE (target) != tmode
10202 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10203 target = gen_reg_rtx (tmode);
10204
10205 if ((optimize && !register_operand (op0, mode0))
10206 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
10207 op0 = copy_to_mode_reg (mode0, op0);
10208 if ((optimize && !register_operand (op1, mode1))
10209 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
10210 op1 = copy_to_mode_reg (mode1, op1);
10211
10212 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
10213 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10214 if (! pat)
10215 return 0;
10216 emit_insn (pat);
10217 return target;
10218}
10219
10220/* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
10221 * ordered EQ or unordered NE, generate PF jump. */
10222
10223static rtx
10224ix86_ssecom_setcc (const enum rtx_code comparison,
10225 bool check_unordered, machine_mode mode,
10226 rtx set_dst, rtx target)
10227{
10228
10229 rtx_code_label *label = NULL;
10230
10231 /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
10232 with NAN operands. */
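 /* comis/ucomis report an unordered result (a NaN operand) as
 ZF = PF = CF = 1, so testing ZF alone would make NaN compare equal.
 When PF signals unordered we jump past the SETCC below and TARGET
 keeps the value the caller preset it to (0 for EQ, 1 for NE). */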
10233 if (check_unordered)
10234 {
10235 gcc_assert (comparison == EQ || comparison == NE);
10236
10237 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10238 label = gen_label_rtx ();
10239 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10240 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10241 gen_rtx_LABEL_REF (VOIDmode, label),
10242 pc_rtx);
10243 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10244 }
10245
10246 /* NB: Set CCFPmode and check a different CCmode which is in subset
10247 of CCFPmode. */
10248 if (GET_MODE (set_dst) != mode)
10249 {
10250 gcc_assert (mode == CCAmode || mode == CCCmode
10251 || mode == CCOmode || mode == CCPmode
10252 || mode == CCSmode || mode == CCZmode);
10253 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10254 }
10255
10256 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10257 gen_rtx_fmt_ee (comparison, QImode,
10258 set_dst,
10259 const0_rtx)));
10260
10261 if (label)
10262 emit_label (label);
10263
10264 return SUBREG_REG (target);
10265}
10266
10267/* Subroutine of ix86_expand_builtin to take care of comi insns. */
10268
10269static rtx
10270ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
10271 rtx target)
10272{
10273 rtx pat, set_dst;
10274 tree arg0 = CALL_EXPR_ARG (exp, 0);
10275 tree arg1 = CALL_EXPR_ARG (exp, 1);
10276 rtx op0 = expand_normal (arg0);
10277 rtx op1 = expand_normal (arg1);
10278 enum insn_code icode = d->icode;
10279 const struct insn_data_d *insn_p = &insn_data[icode];
10280 machine_mode mode0 = insn_p->operand[0].mode;
10281 machine_mode mode1 = insn_p->operand[1].mode;
10282
10283 if (VECTOR_MODE_P (mode0))
10284 op0 = safe_vector_operand (op0, mode0);
10285 if (VECTOR_MODE_P (mode1))
10286 op1 = safe_vector_operand (op1, mode1);
10287
10288 enum rtx_code comparison = d->comparison;
10289 rtx const_val = const0_rtx;
10290
10291 bool check_unordered = false;
10292 machine_mode mode = CCFPmode;
10293 switch (comparison)
10294 {
10295 case LE: /* -> GE */
10296 case LT: /* -> GT */
10297 std::swap (op0, op1);
10298 comparison = swap_condition (comparison);
10299 /* FALLTHRU */
10300 case GT:
10301 case GE:
10302 break;
10303 case EQ:
10304 check_unordered = true;
10305 mode = CCZmode;
10306 break;
10307 case NE:
10308 check_unordered = true;
10309 mode = CCZmode;
10310 const_val = const1_rtx;
10311 break;
10312 default:
10313 gcc_unreachable ();
10314 }
10315
10316 target = gen_reg_rtx (SImode);
10317 emit_move_insn (target, const_val);
10318 target = gen_rtx_SUBREG (QImode, target, 0);
10319
10320 if ((optimize && !register_operand (op0, mode0))
10321 || !insn_p->operand[0].predicate (op0, mode0))
10322 op0 = copy_to_mode_reg (mode0, op0);
10323 if ((optimize && !register_operand (op1, mode1))
10324 || !insn_p->operand[1].predicate (op1, mode1))
10325 op1 = copy_to_mode_reg (mode1, op1);
10326
10327 pat = GEN_FCN (icode) (op0, op1);
10328 if (! pat)
10329 return 0;
10330
10331 set_dst = SET_DEST (pat);
10332 emit_insn (pat);
10333 return ix86_ssecom_setcc (comparison, check_unordered, mode,
10334 set_dst, target);
10335}
10336
10337/* Subroutines of ix86_expand_args_builtin to take care of round insns. */
10338
10339static rtx
10340ix86_expand_sse_round (const struct builtin_description *d, tree exp,
10341 rtx target)
10342{
10343 rtx pat;
10344 tree arg0 = CALL_EXPR_ARG (exp, 0);
10345 rtx op1, op0 = expand_normal (arg0);
10346 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10347 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10348
10349 if (optimize || target == 0
10350 || GET_MODE (target) != tmode
10351 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10352 target = gen_reg_rtx (tmode);
10353
10354 if (VECTOR_MODE_P (mode0))
10355 op0 = safe_vector_operand (op0, mode0);
10356
10357 if ((optimize && !register_operand (op0, mode0))
10358 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10359 op0 = copy_to_mode_reg (mode0, op0);
10360
10361 op1 = GEN_INT (d->comparison);
10362
10363 pat = GEN_FCN (d->icode) (target, op0, op1);
10364 if (! pat)
10365 return 0;
10366 emit_insn (pat);
10367 return target;
10368}
10369
10370static rtx
10371ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
10372 tree exp, rtx target)
10373{
10374 rtx pat;
10375 tree arg0 = CALL_EXPR_ARG (exp, 0);
10376 tree arg1 = CALL_EXPR_ARG (exp, 1);
10377 rtx op0 = expand_normal (arg0);
10378 rtx op1 = expand_normal (arg1);
10379 rtx op2;
10380 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10381 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10382 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10383
10384 if (optimize || target == 0
10385 || GET_MODE (target) != tmode
10386 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10387 target = gen_reg_rtx (tmode);
10388
10389 op0 = safe_vector_operand (op0, mode0);
10390 op1 = safe_vector_operand (op1, mode1);
10391
10392 if ((optimize && !register_operand (op0, mode0))
10393 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10394 op0 = copy_to_mode_reg (mode0, op0);
10395 if ((optimize && !register_operand (op1, mode1))
10396 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10397 op1 = copy_to_mode_reg (mode1, op1);
10398
10399 op2 = GEN_INT (d->comparison);
10400
10401 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10402 if (! pat)
10403 return 0;
10404 emit_insn (pat);
10405 return target;
10406}
10407
10408/* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10409
10410static rtx
10411ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
10412 rtx target)
10413{
10414 rtx pat;
10415 tree arg0 = CALL_EXPR_ARG (exp, 0);
10416 tree arg1 = CALL_EXPR_ARG (exp, 1);
10417 rtx op0 = expand_normal (arg0);
10418 rtx op1 = expand_normal (arg1);
10419 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
10420 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
10421 enum rtx_code comparison = d->comparison;
10422
10423 /* ptest reg, reg sets the carry flag. */
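 /* PTEST sets CF when (SRC & ~DEST) is all zeros, so with identical
 operands CF is always 1 and the testc builtins can be folded to the
 constant 1 without emitting the instruction. */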
10424 if (comparison == LTU
10425 && (d->code == IX86_BUILTIN_PTESTC
10426 || d->code == IX86_BUILTIN_PTESTC256)
10427 && rtx_equal_p (op0, op1))
10428 {
10429 if (!target)
10430 target = gen_reg_rtx (SImode);
10431 emit_move_insn (target, const1_rtx);
10432 return target;
10433 }
10434
10435 if (VECTOR_MODE_P (mode0))
10436 op0 = safe_vector_operand (op0, mode0);
10437 if (VECTOR_MODE_P (mode1))
10438 op1 = safe_vector_operand (op1, mode1);
10439
10440 target = gen_reg_rtx (SImode);
10441 emit_move_insn (target, const0_rtx);
10442 target = gen_rtx_SUBREG (QImode, target, 0);
10443
10444 if ((optimize && !register_operand (op0, mode0))
10445 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10446 op0 = copy_to_mode_reg (mode0, op0);
10447 if ((optimize && !register_operand (op1, mode1))
10448 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10449 op1 = copy_to_mode_reg (mode1, op1);
10450
10451 pat = GEN_FCN (d->icode) (op0, op1);
10452 if (! pat)
10453 return 0;
10454 emit_insn (pat);
10455 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10456 gen_rtx_fmt_ee (comparison, QImode,
10457 SET_DEST (pat),
10458 const0_rtx)));
10459
10460 return SUBREG_REG (target);
10461}
10462
10463/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10464
10465static rtx
10466ix86_expand_sse_pcmpestr (const struct builtin_description *d,
10467 tree exp, rtx target)
10468{
10469 rtx pat;
10470 tree arg0 = CALL_EXPR_ARG (exp, 0);
10471 tree arg1 = CALL_EXPR_ARG (exp, 1);
10472 tree arg2 = CALL_EXPR_ARG (exp, 2);
10473 tree arg3 = CALL_EXPR_ARG (exp, 3);
10474 tree arg4 = CALL_EXPR_ARG (exp, 4);
10475 rtx scratch0, scratch1;
10476 rtx op0 = expand_normal (arg0);
10477 rtx op1 = expand_normal (arg1);
10478 rtx op2 = expand_normal (arg2);
10479 rtx op3 = expand_normal (arg3);
10480 rtx op4 = expand_normal (arg4);
10481 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
10482
10483 tmode0 = insn_data[d->icode].operand[0].mode;
10484 tmode1 = insn_data[d->icode].operand[1].mode;
10485 modev2 = insn_data[d->icode].operand[2].mode;
10486 modei3 = insn_data[d->icode].operand[3].mode;
10487 modev4 = insn_data[d->icode].operand[4].mode;
10488 modei5 = insn_data[d->icode].operand[5].mode;
10489 modeimm = insn_data[d->icode].operand[6].mode;
10490
10491 if (VECTOR_MODE_P (modev2))
10492 op0 = safe_vector_operand (op0, modev2);
10493 if (VECTOR_MODE_P (modev4))
10494 op2 = safe_vector_operand (op2, modev4);
10495
10496 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10497 op0 = copy_to_mode_reg (modev2, op0);
10498 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
10499 op1 = copy_to_mode_reg (modei3, op1);
10500 if ((optimize && !register_operand (op2, modev4))
10501 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
10502 op2 = copy_to_mode_reg (modev4, op2);
10503 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
10504 op3 = copy_to_mode_reg (modei5, op3);
10505
10506 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
10507 {
10508 error ("the fifth argument must be an 8-bit immediate");
10509 return const0_rtx;
10510 }
10511
10512 if (d->code == IX86_BUILTIN_PCMPESTRI128)
10513 {
10514 if (optimize || !target
10515 || GET_MODE (target) != tmode0
10516 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10517 target = gen_reg_rtx (tmode0);
10518
10519 scratch1 = gen_reg_rtx (tmode1);
10520
10521 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10522 }
10523 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10524 {
10525 if (optimize || !target
10526 || GET_MODE (target) != tmode1
10527 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10528 target = gen_reg_rtx (tmode1);
10529
10530 scratch0 = gen_reg_rtx (tmode0);
10531
10532 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10533 }
10534 else
10535 {
10536 gcc_assert (d->flag);
10537
10538 scratch0 = gen_reg_rtx (tmode0);
10539 scratch1 = gen_reg_rtx (tmode1);
10540
10541 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10542 }
10543
10544 if (! pat)
10545 return 0;
10546
10547 emit_insn (pat);
10548
10549 if (d->flag)
10550 {
10551 target = gen_reg_rtx (SImode);
10552 emit_move_insn (target, const0_rtx);
10553 target = gen_rtx_SUBREG (QImode, target, 0);
10554
10555 emit_insn
10556 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10557 gen_rtx_fmt_ee (EQ, QImode,
10558 gen_rtx_REG ((machine_mode) d->flag,
10559 FLAGS_REG),
10560 const0_rtx)));
10561 return SUBREG_REG (target);
10562 }
10563 else
10564 return target;
10565}
10566
10567
10568/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10569
10570static rtx
10571ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10572 tree exp, rtx target)
10573{
10574 rtx pat;
10575 tree arg0 = CALL_EXPR_ARG (exp, 0);
10576 tree arg1 = CALL_EXPR_ARG (exp, 1);
10577 tree arg2 = CALL_EXPR_ARG (exp, 2);
10578 rtx scratch0, scratch1;
10579 rtx op0 = expand_normal (arg0);
10580 rtx op1 = expand_normal (arg1);
10581 rtx op2 = expand_normal (arg2);
10582 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10583
10584 tmode0 = insn_data[d->icode].operand[0].mode;
10585 tmode1 = insn_data[d->icode].operand[1].mode;
10586 modev2 = insn_data[d->icode].operand[2].mode;
10587 modev3 = insn_data[d->icode].operand[3].mode;
10588 modeimm = insn_data[d->icode].operand[4].mode;
10589
10590 if (VECTOR_MODE_P (modev2))
10591 op0 = safe_vector_operand (op0, modev2);
10592 if (VECTOR_MODE_P (modev3))
10593 op1 = safe_vector_operand (op1, modev3);
10594
10595 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10596 op0 = copy_to_mode_reg (modev2, op0);
10597 if ((optimize && !register_operand (op1, modev3))
10598 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10599 op1 = copy_to_mode_reg (modev3, op1);
10600
10601 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10602 {
10603 error ("the third argument must be an 8-bit immediate");
10604 return const0_rtx;
10605 }
10606
10607 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10608 {
10609 if (optimize || !target
10610 || GET_MODE (target) != tmode0
10611 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10612 target = gen_reg_rtx (tmode0);
10613
10614 scratch1 = gen_reg_rtx (tmode1);
10615
10616 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10617 }
10618 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10619 {
10620 if (optimize || !target
10621 || GET_MODE (target) != tmode1
10622 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10623 target = gen_reg_rtx (tmode1);
10624
10625 scratch0 = gen_reg_rtx (tmode0);
10626
10627 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10628 }
10629 else
10630 {
10631 gcc_assert (d->flag);
10632
10633 scratch0 = gen_reg_rtx (tmode0);
10634 scratch1 = gen_reg_rtx (tmode1);
10635
10636 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10637 }
10638
10639 if (! pat)
10640 return 0;
10641
10642 emit_insn (pat);
10643
10644 if (d->flag)
10645 {
10646 target = gen_reg_rtx (SImode);
10647 emit_move_insn (target, const0_rtx);
10648 target = gen_rtx_SUBREG (QImode, target, 0);
10649
10650 emit_insn
10651 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10652 gen_rtx_fmt_ee (EQ, QImode,
10653 gen_rtx_REG ((machine_mode) d->flag,
10654 FLAGS_REG),
10655 const0_rtx)));
10656 return SUBREG_REG (target);
10657 }
10658 else
10659 return target;
10660}
10661
10662/* Fixup modeless constants to fit required mode. */
10663
10664static rtx
10665fixup_modeless_constant (rtx x, machine_mode mode)
10666{
10667 if (GET_MODE (x) == VOIDmode)
10668 x = convert_to_mode (mode, x, 1);
10669 return x;
10670}
10671
10672/* Subroutine of ix86_expand_builtin to take care of insns with
10673 variable number of operands. */
10674
10675static rtx
10676ix86_expand_args_builtin (const struct builtin_description *d,
10677 tree exp, rtx target)
10678{
10679 rtx pat, real_target;
10680 unsigned int i, nargs;
10681 unsigned int nargs_constant = 0;
10682 unsigned int mask_pos = 0;
10683 int num_memory = 0;
10684 rtx xops[6];
10685 bool second_arg_count = false;
10686 enum insn_code icode = d->icode;
10687 const struct insn_data_d *insn_p = &insn_data[icode];
10688 machine_mode tmode = insn_p->operand[0].mode;
10689 machine_mode rmode = VOIDmode;
10690 bool swap = false;
10691 enum rtx_code comparison = d->comparison;
10692
10693 switch ((enum ix86_builtin_func_type) d->flag)
10694 {
10695 case V2DF_FTYPE_V2DF_ROUND:
10696 case V4DF_FTYPE_V4DF_ROUND:
10697 case V8DF_FTYPE_V8DF_ROUND:
10698 case V4SF_FTYPE_V4SF_ROUND:
10699 case V8SF_FTYPE_V8SF_ROUND:
10700 case V16SF_FTYPE_V16SF_ROUND:
10701 case V8HF_FTYPE_V8HF_ROUND:
10702 case V16HF_FTYPE_V16HF_ROUND:
10703 case V32HF_FTYPE_V32HF_ROUND:
10704 case V4SI_FTYPE_V4SF_ROUND:
10705 case V8SI_FTYPE_V8SF_ROUND:
10706 case V16SI_FTYPE_V16SF_ROUND:
10707 return ix86_expand_sse_round (d, exp, target);
10708 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10709 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10710 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10711 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10712 case INT_FTYPE_V8SF_V8SF_PTEST:
10713 case INT_FTYPE_V4DI_V4DI_PTEST:
10714 case INT_FTYPE_V4DF_V4DF_PTEST:
10715 case INT_FTYPE_V4SF_V4SF_PTEST:
10716 case INT_FTYPE_V2DI_V2DI_PTEST:
10717 case INT_FTYPE_V2DF_V2DF_PTEST:
10718 return ix86_expand_sse_ptest (d, exp, target);
10719 case FLOAT128_FTYPE_FLOAT128:
10720 case FLOAT_FTYPE_FLOAT:
10721 case FLOAT_FTYPE_BFLOAT16:
10722 case INT_FTYPE_INT:
10723 case UINT_FTYPE_UINT:
10724 case UINT16_FTYPE_UINT16:
10725 case UINT64_FTYPE_INT:
10726 case UINT64_FTYPE_UINT64:
10727 case INT64_FTYPE_INT64:
10728 case INT64_FTYPE_V4SF:
10729 case INT64_FTYPE_V2DF:
10730 case INT_FTYPE_V16QI:
10731 case INT_FTYPE_V8QI:
10732 case INT_FTYPE_V8SF:
10733 case INT_FTYPE_V4DF:
10734 case INT_FTYPE_V4SF:
10735 case INT_FTYPE_V2DF:
10736 case INT_FTYPE_V32QI:
10737 case V16QI_FTYPE_V16QI:
10738 case V8SI_FTYPE_V8SF:
10739 case V8SI_FTYPE_V4SI:
10740 case V8HI_FTYPE_V8HI:
10741 case V8HI_FTYPE_V16QI:
10742 case V8QI_FTYPE_V8QI:
10743 case V8SF_FTYPE_V8SF:
10744 case V8SF_FTYPE_V8SI:
10745 case V8SF_FTYPE_V4SF:
10746 case V8SF_FTYPE_V8HI:
10747 case V4SI_FTYPE_V4SI:
10748 case V4SI_FTYPE_V16QI:
10749 case V4SI_FTYPE_V4SF:
10750 case V4SI_FTYPE_V8SI:
10751 case V4SI_FTYPE_V8HI:
10752 case V4SI_FTYPE_V4DF:
10753 case V4SI_FTYPE_V2DF:
10754 case V4HI_FTYPE_V4HI:
10755 case V4DF_FTYPE_V4DF:
10756 case V4DF_FTYPE_V4SI:
10757 case V4DF_FTYPE_V4SF:
10758 case V4DF_FTYPE_V2DF:
10759 case V4SF_FTYPE_V4SF:
10760 case V4SF_FTYPE_V4SI:
10761 case V4SF_FTYPE_V8SF:
10762 case V4SF_FTYPE_V4DF:
10763 case V4SF_FTYPE_V8HI:
10764 case V4SF_FTYPE_V2DF:
10765 case V2DI_FTYPE_V2DI:
10766 case V2DI_FTYPE_V16QI:
10767 case V2DI_FTYPE_V8HI:
10768 case V2DI_FTYPE_V4SI:
10769 case V2DF_FTYPE_V2DF:
10770 case V2DF_FTYPE_V4SI:
10771 case V2DF_FTYPE_V4DF:
10772 case V2DF_FTYPE_V4SF:
10773 case V2DF_FTYPE_V2SI:
10774 case V2SI_FTYPE_V2SI:
10775 case V2SI_FTYPE_V4SF:
10776 case V2SI_FTYPE_V2SF:
10777 case V2SI_FTYPE_V2DF:
10778 case V2SF_FTYPE_V2SF:
10779 case V2SF_FTYPE_V2SI:
10780 case V32QI_FTYPE_V32QI:
10781 case V32QI_FTYPE_V16QI:
10782 case V16HI_FTYPE_V16HI:
10783 case V16HI_FTYPE_V8HI:
10784 case V8SI_FTYPE_V8SI:
10785 case V16HI_FTYPE_V16QI:
10786 case V8SI_FTYPE_V16QI:
10787 case V4DI_FTYPE_V16QI:
10788 case V8SI_FTYPE_V8HI:
10789 case V4DI_FTYPE_V8HI:
10790 case V4DI_FTYPE_V4SI:
10791 case V4DI_FTYPE_V2DI:
10792 case UQI_FTYPE_UQI:
10793 case UHI_FTYPE_UHI:
10794 case USI_FTYPE_USI:
10795 case USI_FTYPE_UQI:
10796 case USI_FTYPE_UHI:
10797 case UDI_FTYPE_UDI:
10798 case UHI_FTYPE_V16QI:
10799 case USI_FTYPE_V32QI:
10800 case UDI_FTYPE_V64QI:
10801 case V16QI_FTYPE_UHI:
10802 case V32QI_FTYPE_USI:
10803 case V64QI_FTYPE_UDI:
10804 case V8HI_FTYPE_UQI:
10805 case V16HI_FTYPE_UHI:
10806 case V32HI_FTYPE_USI:
10807 case V4SI_FTYPE_UQI:
10808 case V8SI_FTYPE_UQI:
10809 case V4SI_FTYPE_UHI:
10810 case V8SI_FTYPE_UHI:
10811 case UQI_FTYPE_V8HI:
10812 case UHI_FTYPE_V16HI:
10813 case USI_FTYPE_V32HI:
10814 case UQI_FTYPE_V4SI:
10815 case UQI_FTYPE_V8SI:
10816 case UHI_FTYPE_V16SI:
10817 case UQI_FTYPE_V2DI:
10818 case UQI_FTYPE_V4DI:
10819 case UQI_FTYPE_V8DI:
10820 case V16SI_FTYPE_UHI:
10821 case V2DI_FTYPE_UQI:
10822 case V4DI_FTYPE_UQI:
10823 case V16SI_FTYPE_INT:
10824 case V16SF_FTYPE_V8SF:
10825 case V16SI_FTYPE_V8SI:
10826 case V16SF_FTYPE_V4SF:
10827 case V16SI_FTYPE_V4SI:
10828 case V16SI_FTYPE_V16SF:
10829 case V16SI_FTYPE_V16SI:
10830 case V64QI_FTYPE_V64QI:
10831 case V32HI_FTYPE_V32HI:
10832 case V16SF_FTYPE_V16SF:
10833 case V8DI_FTYPE_UQI:
10834 case V8DI_FTYPE_V8DI:
10835 case V8DF_FTYPE_V4DF:
10836 case V8DF_FTYPE_V2DF:
10837 case V8DF_FTYPE_V8DF:
10838 case V4DI_FTYPE_V4DI:
10839 case V16BF_FTYPE_V16SF:
10840 case V8BF_FTYPE_V8SF:
10841 case V8BF_FTYPE_V4SF:
10842 nargs = 1;
10843 break;
10844 case V4SF_FTYPE_V4SF_VEC_MERGE:
10845 case V2DF_FTYPE_V2DF_VEC_MERGE:
10846 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10847 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10848 case V16QI_FTYPE_V16QI_V16QI:
10849 case V16QI_FTYPE_V8HI_V8HI:
10850 case V16HF_FTYPE_V16HF_V16HF:
10851 case V16SF_FTYPE_V16SF_V16SF:
10852 case V8QI_FTYPE_V8QI_V8QI:
10853 case V8QI_FTYPE_V4HI_V4HI:
10854 case V8HI_FTYPE_V8HI_V8HI:
10855 case V8HI_FTYPE_V16QI_V16QI:
10856 case V8HI_FTYPE_V4SI_V4SI:
10857 case V8HF_FTYPE_V8HF_V8HF:
10858 case V8SF_FTYPE_V8SF_V8SF:
10859 case V8SF_FTYPE_V8SF_V8SI:
10860 case V8DF_FTYPE_V8DF_V8DF:
10861 case V4SI_FTYPE_V4SI_V4SI:
10862 case V4SI_FTYPE_V8HI_V8HI:
10863 case V4SI_FTYPE_V2DF_V2DF:
10864 case V4HI_FTYPE_V4HI_V4HI:
10865 case V4HI_FTYPE_V8QI_V8QI:
10866 case V4HI_FTYPE_V2SI_V2SI:
10867 case V4DF_FTYPE_V4DF_V4DF:
10868 case V4DF_FTYPE_V4DF_V4DI:
10869 case V4SF_FTYPE_V4SF_V4SF:
10870 case V4SF_FTYPE_V4SF_V4SI:
10871 case V4SF_FTYPE_V4SF_V2SI:
10872 case V4SF_FTYPE_V4SF_V2DF:
10873 case V4SF_FTYPE_V4SF_UINT:
10874 case V4SF_FTYPE_V4SF_DI:
10875 case V4SF_FTYPE_V4SF_SI:
10876 case V4DI_FTYPE_V4DI_V2DI:
10877 case V2DI_FTYPE_V2DI_V2DI:
10878 case V2DI_FTYPE_V16QI_V16QI:
10879 case V2DI_FTYPE_V4SI_V4SI:
10880 case V2DI_FTYPE_V2DI_V16QI:
10881 case V2SI_FTYPE_V2SI_V2SI:
10882 case V2SI_FTYPE_V4HI_V4HI:
10883 case V2SI_FTYPE_V2SF_V2SF:
10884 case V2DF_FTYPE_V2DF_V2DF:
10885 case V2DF_FTYPE_V2DF_V4SF:
10886 case V2DF_FTYPE_V2DF_V2DI:
10887 case V2DF_FTYPE_V2DF_DI:
10888 case V2DF_FTYPE_V2DF_SI:
10889 case V2DF_FTYPE_V2DF_UINT:
10890 case V2SF_FTYPE_V2SF_V2SF:
10891 case V1DI_FTYPE_V1DI_V1DI:
10892 case V1DI_FTYPE_V8QI_V8QI:
10893 case V1DI_FTYPE_V2SI_V2SI:
10894 case V32QI_FTYPE_V16HI_V16HI:
10895 case V16HI_FTYPE_V8SI_V8SI:
10896 case V64QI_FTYPE_V64QI_V64QI:
10897 case V32QI_FTYPE_V32QI_V32QI:
10898 case V16HI_FTYPE_V32QI_V32QI:
10899 case V16HI_FTYPE_V16HI_V16HI:
10900 case V8SI_FTYPE_V4DF_V4DF:
10901 case V8SI_FTYPE_V8SI_V8SI:
10902 case V8SI_FTYPE_V16HI_V16HI:
10903 case V4DI_FTYPE_V4DI_V4DI:
10904 case V4DI_FTYPE_V8SI_V8SI:
10905 case V4DI_FTYPE_V32QI_V32QI:
10906 case V8DI_FTYPE_V64QI_V64QI:
10907 if (comparison == UNKNOWN)
10908 return ix86_expand_binop_builtin (icode, exp, target);
10909 nargs = 2;
10910 break;
10911 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10912 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10913 gcc_assert (comparison != UNKNOWN);
10914 nargs = 2;
10915 swap = true;
10916 break;
10917 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10918 case V16HI_FTYPE_V16HI_SI_COUNT:
10919 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10920 case V8SI_FTYPE_V8SI_SI_COUNT:
10921 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10922 case V4DI_FTYPE_V4DI_INT_COUNT:
10923 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10924 case V8HI_FTYPE_V8HI_SI_COUNT:
10925 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10926 case V4SI_FTYPE_V4SI_SI_COUNT:
10927 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10928 case V4HI_FTYPE_V4HI_SI_COUNT:
10929 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10930 case V2DI_FTYPE_V2DI_SI_COUNT:
10931 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10932 case V2SI_FTYPE_V2SI_SI_COUNT:
10933 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10934 case V1DI_FTYPE_V1DI_SI_COUNT:
10935 nargs = 2;
10936 second_arg_count = true;
10937 break;
10938 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10939 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10940 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10941 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10942 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10943 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10944 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10945 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10946 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10947 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10948 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10949 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10950 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10951 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10952 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10953 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10954 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10955 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10956 nargs = 4;
10957 second_arg_count = true;
10958 break;
10959 case UINT64_FTYPE_UINT64_UINT64:
10960 case UINT_FTYPE_UINT_UINT:
10961 case UINT_FTYPE_UINT_USHORT:
10962 case UINT_FTYPE_UINT_UCHAR:
10963 case UINT16_FTYPE_UINT16_INT:
10964 case UINT8_FTYPE_UINT8_INT:
10965 case UQI_FTYPE_UQI_UQI:
10966 case UHI_FTYPE_UHI_UHI:
10967 case USI_FTYPE_USI_USI:
10968 case UDI_FTYPE_UDI_UDI:
10969 case V16SI_FTYPE_V8DF_V8DF:
10970 case V32BF_FTYPE_V16SF_V16SF:
10971 case V16BF_FTYPE_V8SF_V8SF:
10972 case V8BF_FTYPE_V4SF_V4SF:
10973 case V16BF_FTYPE_V16SF_UHI:
10974 case V8BF_FTYPE_V8SF_UQI:
10975 case V8BF_FTYPE_V4SF_UQI:
10976 nargs = 2;
10977 break;
10978 case V2DI_FTYPE_V2DI_INT_CONVERT:
10979 nargs = 2;
10980 rmode = V1TImode;
10981 nargs_constant = 1;
10982 break;
10983 case V4DI_FTYPE_V4DI_INT_CONVERT:
10984 nargs = 2;
10985 rmode = V2TImode;
10986 nargs_constant = 1;
10987 break;
10988 case V8DI_FTYPE_V8DI_INT_CONVERT:
10989 nargs = 2;
10990 rmode = V4TImode;
10991 nargs_constant = 1;
10992 break;
10993 case V8HI_FTYPE_V8HI_INT:
10994 case V8HI_FTYPE_V8SF_INT:
10995 case V16HI_FTYPE_V16SF_INT:
10996 case V8HI_FTYPE_V4SF_INT:
10997 case V8SF_FTYPE_V8SF_INT:
10998 case V4SF_FTYPE_V16SF_INT:
10999 case V16SF_FTYPE_V16SF_INT:
11000 case V4SI_FTYPE_V4SI_INT:
11001 case V4SI_FTYPE_V8SI_INT:
11002 case V4HI_FTYPE_V4HI_INT:
11003 case V4DF_FTYPE_V4DF_INT:
11004 case V4DF_FTYPE_V8DF_INT:
11005 case V4SF_FTYPE_V4SF_INT:
11006 case V4SF_FTYPE_V8SF_INT:
11007 case V2DI_FTYPE_V2DI_INT:
11008 case V2DF_FTYPE_V2DF_INT:
11009 case V2DF_FTYPE_V4DF_INT:
11010 case V16HI_FTYPE_V16HI_INT:
11011 case V8SI_FTYPE_V8SI_INT:
11012 case V16SI_FTYPE_V16SI_INT:
11013 case V4SI_FTYPE_V16SI_INT:
11014 case V4DI_FTYPE_V4DI_INT:
11015 case V2DI_FTYPE_V4DI_INT:
11016 case V4DI_FTYPE_V8DI_INT:
11017 case UQI_FTYPE_UQI_UQI_CONST:
11018 case UHI_FTYPE_UHI_UQI:
11019 case USI_FTYPE_USI_UQI:
11020 case UDI_FTYPE_UDI_UQI:
11021 nargs = 2;
11022 nargs_constant = 1;
11023 break;
11024 case V16QI_FTYPE_V16QI_V16QI_V16QI:
11025 case V8SF_FTYPE_V8SF_V8SF_V8SF:
11026 case V4DF_FTYPE_V4DF_V4DF_V4DF:
11027 case V4SF_FTYPE_V4SF_V4SF_V4SF:
11028 case V2DF_FTYPE_V2DF_V2DF_V2DF:
11029 case V32QI_FTYPE_V32QI_V32QI_V32QI:
11030 case UHI_FTYPE_V16SI_V16SI_UHI:
11031 case UQI_FTYPE_V8DI_V8DI_UQI:
11032 case V16HI_FTYPE_V16SI_V16HI_UHI:
11033 case V16QI_FTYPE_V16SI_V16QI_UHI:
11034 case V16QI_FTYPE_V8DI_V16QI_UQI:
11035 case V32HF_FTYPE_V32HF_V32HF_USI:
11036 case V16SF_FTYPE_V16SF_V16SF_UHI:
11037 case V16SF_FTYPE_V4SF_V16SF_UHI:
11038 case V16SI_FTYPE_SI_V16SI_UHI:
11039 case V16SI_FTYPE_V16HI_V16SI_UHI:
11040 case V16SI_FTYPE_V16QI_V16SI_UHI:
11041 case V8SF_FTYPE_V4SF_V8SF_UQI:
11042 case V4DF_FTYPE_V2DF_V4DF_UQI:
11043 case V8SI_FTYPE_V4SI_V8SI_UQI:
11044 case V8SI_FTYPE_SI_V8SI_UQI:
11045 case V4SI_FTYPE_V4SI_V4SI_UQI:
11046 case V4SI_FTYPE_SI_V4SI_UQI:
11047 case V4DI_FTYPE_V2DI_V4DI_UQI:
11048 case V4DI_FTYPE_DI_V4DI_UQI:
11049 case V2DI_FTYPE_V2DI_V2DI_UQI:
11050 case V2DI_FTYPE_DI_V2DI_UQI:
11051 case V64QI_FTYPE_V64QI_V64QI_UDI:
11052 case V64QI_FTYPE_V16QI_V64QI_UDI:
11053 case V64QI_FTYPE_QI_V64QI_UDI:
11054 case V32QI_FTYPE_V32QI_V32QI_USI:
11055 case V32QI_FTYPE_V16QI_V32QI_USI:
11056 case V32QI_FTYPE_QI_V32QI_USI:
11057 case V16QI_FTYPE_V16QI_V16QI_UHI:
11058 case V16QI_FTYPE_QI_V16QI_UHI:
11059 case V32HI_FTYPE_V8HI_V32HI_USI:
11060 case V32HI_FTYPE_HI_V32HI_USI:
11061 case V16HI_FTYPE_V8HI_V16HI_UHI:
11062 case V16HI_FTYPE_HI_V16HI_UHI:
11063 case V8HI_FTYPE_V8HI_V8HI_UQI:
11064 case V8HI_FTYPE_HI_V8HI_UQI:
11065 case V16HF_FTYPE_V16HF_V16HF_UHI:
11066 case V8SF_FTYPE_V8HI_V8SF_UQI:
11067 case V4SF_FTYPE_V8HI_V4SF_UQI:
11068 case V8SI_FTYPE_V8HF_V8SI_UQI:
11069 case V8SF_FTYPE_V8HF_V8SF_UQI:
11070 case V8SI_FTYPE_V8SF_V8SI_UQI:
11071 case V4SI_FTYPE_V4SF_V4SI_UQI:
11072 case V4SI_FTYPE_V8HF_V4SI_UQI:
11073 case V4SF_FTYPE_V8HF_V4SF_UQI:
11074 case V4DI_FTYPE_V8HF_V4DI_UQI:
11075 case V4DI_FTYPE_V4SF_V4DI_UQI:
11076 case V2DI_FTYPE_V8HF_V2DI_UQI:
11077 case V2DI_FTYPE_V4SF_V2DI_UQI:
11078 case V8HF_FTYPE_V8HF_V8HF_UQI:
11079 case V8HF_FTYPE_V8HF_V8HF_V8HF:
11080 case V8HF_FTYPE_V8HI_V8HF_UQI:
11081 case V8HF_FTYPE_V8SI_V8HF_UQI:
11082 case V8HF_FTYPE_V8SF_V8HF_UQI:
11083 case V8HF_FTYPE_V4SI_V8HF_UQI:
11084 case V8HF_FTYPE_V4SF_V8HF_UQI:
11085 case V8HF_FTYPE_V4DI_V8HF_UQI:
11086 case V8HF_FTYPE_V4DF_V8HF_UQI:
11087 case V8HF_FTYPE_V2DI_V8HF_UQI:
11088 case V8HF_FTYPE_V2DF_V8HF_UQI:
11089 case V4SF_FTYPE_V4DI_V4SF_UQI:
11090 case V4SF_FTYPE_V2DI_V4SF_UQI:
11091 case V4DF_FTYPE_V4DI_V4DF_UQI:
11092 case V4DF_FTYPE_V8HF_V4DF_UQI:
11093 case V2DF_FTYPE_V8HF_V2DF_UQI:
11094 case V2DF_FTYPE_V2DI_V2DF_UQI:
11095 case V16QI_FTYPE_V8HI_V16QI_UQI:
11096 case V16QI_FTYPE_V16HI_V16QI_UHI:
11097 case V16QI_FTYPE_V4SI_V16QI_UQI:
11098 case V16QI_FTYPE_V8SI_V16QI_UQI:
11099 case V8HI_FTYPE_V8HF_V8HI_UQI:
11100 case V8HI_FTYPE_V4SI_V8HI_UQI:
11101 case V8HI_FTYPE_V8SI_V8HI_UQI:
11102 case V16QI_FTYPE_V2DI_V16QI_UQI:
11103 case V16QI_FTYPE_V4DI_V16QI_UQI:
11104 case V8HI_FTYPE_V2DI_V8HI_UQI:
11105 case V8HI_FTYPE_V4DI_V8HI_UQI:
11106 case V4SI_FTYPE_V2DI_V4SI_UQI:
11107 case V4SI_FTYPE_V4DI_V4SI_UQI:
11108 case V32QI_FTYPE_V32HI_V32QI_USI:
11109 case UHI_FTYPE_V16QI_V16QI_UHI:
11110 case USI_FTYPE_V32QI_V32QI_USI:
11111 case UDI_FTYPE_V64QI_V64QI_UDI:
11112 case UQI_FTYPE_V8HI_V8HI_UQI:
11113 case UHI_FTYPE_V16HI_V16HI_UHI:
11114 case USI_FTYPE_V32HI_V32HI_USI:
11115 case UQI_FTYPE_V4SI_V4SI_UQI:
11116 case UQI_FTYPE_V8SI_V8SI_UQI:
11117 case UQI_FTYPE_V2DI_V2DI_UQI:
11118 case UQI_FTYPE_V4DI_V4DI_UQI:
11119 case V4SF_FTYPE_V2DF_V4SF_UQI:
11120 case V4SF_FTYPE_V4DF_V4SF_UQI:
11121 case V16SI_FTYPE_V16SI_V16SI_UHI:
11122 case V16SI_FTYPE_V4SI_V16SI_UHI:
11123 case V2DI_FTYPE_V4SI_V2DI_UQI:
11124 case V2DI_FTYPE_V8HI_V2DI_UQI:
11125 case V2DI_FTYPE_V16QI_V2DI_UQI:
11126 case V4DI_FTYPE_V4DI_V4DI_UQI:
11127 case V4DI_FTYPE_V4SI_V4DI_UQI:
11128 case V4DI_FTYPE_V8HI_V4DI_UQI:
11129 case V4DI_FTYPE_V16QI_V4DI_UQI:
11130 case V4DI_FTYPE_V4DF_V4DI_UQI:
11131 case V2DI_FTYPE_V2DF_V2DI_UQI:
11132 case V4SI_FTYPE_V4DF_V4SI_UQI:
11133 case V4SI_FTYPE_V2DF_V4SI_UQI:
11134 case V4SI_FTYPE_V8HI_V4SI_UQI:
11135 case V4SI_FTYPE_V16QI_V4SI_UQI:
11136 case V4DI_FTYPE_V4DI_V4DI_V4DI:
11137 case V8DF_FTYPE_V2DF_V8DF_UQI:
11138 case V8DF_FTYPE_V4DF_V8DF_UQI:
11139 case V8DF_FTYPE_V8DF_V8DF_UQI:
11140 case V8SF_FTYPE_V8SF_V8SF_UQI:
11141 case V8SF_FTYPE_V8SI_V8SF_UQI:
11142 case V4DF_FTYPE_V4DF_V4DF_UQI:
11143 case V4SF_FTYPE_V4SF_V4SF_UQI:
11144 case V2DF_FTYPE_V2DF_V2DF_UQI:
11145 case V2DF_FTYPE_V4SF_V2DF_UQI:
11146 case V2DF_FTYPE_V4SI_V2DF_UQI:
11147 case V4SF_FTYPE_V4SI_V4SF_UQI:
11148 case V4DF_FTYPE_V4SF_V4DF_UQI:
11149 case V4DF_FTYPE_V4SI_V4DF_UQI:
11150 case V8SI_FTYPE_V8SI_V8SI_UQI:
11151 case V8SI_FTYPE_V8HI_V8SI_UQI:
11152 case V8SI_FTYPE_V16QI_V8SI_UQI:
11153 case V8DF_FTYPE_V8SI_V8DF_UQI:
11154 case V8DI_FTYPE_DI_V8DI_UQI:
11155 case V16SF_FTYPE_V8SF_V16SF_UHI:
11156 case V16SI_FTYPE_V8SI_V16SI_UHI:
11157 case V16HF_FTYPE_V16HI_V16HF_UHI:
11158 case V16HF_FTYPE_V16HF_V16HF_V16HF:
11159 case V16HI_FTYPE_V16HF_V16HI_UHI:
11160 case V16HI_FTYPE_V16HI_V16HI_UHI:
11161 case V8HI_FTYPE_V16QI_V8HI_UQI:
11162 case V16HI_FTYPE_V16QI_V16HI_UHI:
11163 case V32HI_FTYPE_V32HI_V32HI_USI:
11164 case V32HI_FTYPE_V32QI_V32HI_USI:
11165 case V8DI_FTYPE_V16QI_V8DI_UQI:
11166 case V8DI_FTYPE_V2DI_V8DI_UQI:
11167 case V8DI_FTYPE_V4DI_V8DI_UQI:
11168 case V8DI_FTYPE_V8DI_V8DI_UQI:
11169 case V8DI_FTYPE_V8HI_V8DI_UQI:
11170 case V8DI_FTYPE_V8SI_V8DI_UQI:
11171 case V8HI_FTYPE_V8DI_V8HI_UQI:
11172 case V8SI_FTYPE_V8DI_V8SI_UQI:
11173 case V4SI_FTYPE_V4SI_V4SI_V4SI:
11174 case V4DI_FTYPE_V4DI_V4DI_V2DI:
11175 case V16SI_FTYPE_V16SI_V16SI_V16SI:
11176 case V8DI_FTYPE_V8DI_V8DI_V8DI:
11177 case V32HI_FTYPE_V32HI_V32HI_V32HI:
11178 case V2DI_FTYPE_V2DI_V2DI_V2DI:
11179 case V16HI_FTYPE_V16HI_V16HI_V16HI:
11180 case V8SI_FTYPE_V8SI_V8SI_V8SI:
11181 case V8HI_FTYPE_V8HI_V8HI_V8HI:
11182 case V32BF_FTYPE_V16SF_V16SF_USI:
11183 case V16BF_FTYPE_V8SF_V8SF_UHI:
11184 case V8BF_FTYPE_V4SF_V4SF_UQI:
11185 case V16BF_FTYPE_V16SF_V16BF_UHI:
11186 case V8BF_FTYPE_V8SF_V8BF_UQI:
11187 case V8BF_FTYPE_V4SF_V8BF_UQI:
11188 case V16SF_FTYPE_V16SF_V32BF_V32BF:
11189 case V8SF_FTYPE_V8SF_V16BF_V16BF:
11190 case V4SF_FTYPE_V4SF_V8BF_V8BF:
11191 nargs = 3;
11192 break;
11193 case V32QI_FTYPE_V32QI_V32QI_INT:
11194 case V16HI_FTYPE_V16HI_V16HI_INT:
11195 case V16QI_FTYPE_V16QI_V16QI_INT:
11196 case V4DI_FTYPE_V4DI_V4DI_INT:
11197 case V8HI_FTYPE_V8HI_V8HI_INT:
11198 case V8SI_FTYPE_V8SI_V8SI_INT:
11199 case V8SI_FTYPE_V8SI_V4SI_INT:
11200 case V8SF_FTYPE_V8SF_V8SF_INT:
11201 case V8SF_FTYPE_V8SF_V4SF_INT:
11202 case V4SI_FTYPE_V4SI_V4SI_INT:
11203 case V4DF_FTYPE_V4DF_V4DF_INT:
11204 case V16SF_FTYPE_V16SF_V16SF_INT:
11205 case V16SF_FTYPE_V16SF_V4SF_INT:
11206 case V16SI_FTYPE_V16SI_V4SI_INT:
11207 case V4DF_FTYPE_V4DF_V2DF_INT:
11208 case V4SF_FTYPE_V4SF_V4SF_INT:
11209 case V2DI_FTYPE_V2DI_V2DI_INT:
11210 case V4DI_FTYPE_V4DI_V2DI_INT:
11211 case V2DF_FTYPE_V2DF_V2DF_INT:
11212 case UQI_FTYPE_V8DI_V8UDI_INT:
11213 case UQI_FTYPE_V8DF_V8DF_INT:
11214 case UQI_FTYPE_V2DF_V2DF_INT:
11215 case UQI_FTYPE_V4SF_V4SF_INT:
11216 case UHI_FTYPE_V16SI_V16SI_INT:
11217 case UHI_FTYPE_V16SF_V16SF_INT:
11218 case V64QI_FTYPE_V64QI_V64QI_INT:
11219 case V32HI_FTYPE_V32HI_V32HI_INT:
11220 case V16SI_FTYPE_V16SI_V16SI_INT:
11221 case V8DI_FTYPE_V8DI_V8DI_INT:
11222 nargs = 3;
11223 nargs_constant = 1;
11224 break;
11225 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
11226 nargs = 3;
11227 rmode = V4DImode;
11228 nargs_constant = 1;
11229 break;
11230 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
11231 nargs = 3;
11232 rmode = V2DImode;
11233 nargs_constant = 1;
11234 break;
11235 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
11236 nargs = 3;
11237 rmode = DImode;
11238 nargs_constant = 1;
11239 break;
11240 case V2DI_FTYPE_V2DI_UINT_UINT:
11241 nargs = 3;
11242 nargs_constant = 2;
11243 break;
11244 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
11245 nargs = 3;
11246 rmode = V8DImode;
11247 nargs_constant = 1;
11248 break;
11249 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
11250 nargs = 5;
11251 rmode = V8DImode;
11252 mask_pos = 2;
11253 nargs_constant = 1;
11254 break;
11255 case QI_FTYPE_V8DF_INT_UQI:
11256 case QI_FTYPE_V4DF_INT_UQI:
11257 case QI_FTYPE_V2DF_INT_UQI:
11258 case HI_FTYPE_V16SF_INT_UHI:
11259 case QI_FTYPE_V8SF_INT_UQI:
11260 case QI_FTYPE_V4SF_INT_UQI:
11261 case QI_FTYPE_V8HF_INT_UQI:
11262 case HI_FTYPE_V16HF_INT_UHI:
11263 case SI_FTYPE_V32HF_INT_USI:
11264 case V4SI_FTYPE_V4SI_V4SI_UHI:
11265 case V8SI_FTYPE_V8SI_V8SI_UHI:
11266 nargs = 3;
11267 mask_pos = 1;
11268 nargs_constant = 1;
11269 break;
11270 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
11271 nargs = 5;
11272 rmode = V4DImode;
11273 mask_pos = 2;
11274 nargs_constant = 1;
11275 break;
11276 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
11277 nargs = 5;
11278 rmode = V2DImode;
11279 mask_pos = 2;
11280 nargs_constant = 1;
11281 break;
11282 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
11283 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
11284 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
11285 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
11286 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
11287 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
11288 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
11289 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
11290 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
11291 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
11292 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
11293 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
11294 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
11295 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
11296 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
11297 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
11298 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
11299 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
11300 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
11301 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
11302 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
11303 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
11304 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
11305 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
11306 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
11307 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
11308 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
11309 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
11310 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
11311 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
11312 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
11313 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
11314 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
11315 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
11316 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
11317 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
11318 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
11319 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
11320 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
11321 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
11322 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
11323 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
11324 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
11325 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
11326 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
11327 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
11328 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
11329 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
11330 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
11331 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
11332 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
11333 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
11334 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
11335 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
11336 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
11337 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
11338 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
11339 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
11340 nargs = 4;
11341 break;
11342 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
11343 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
11344 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
11345 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
11346 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
11347 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
11348 nargs = 4;
11349 nargs_constant = 1;
11350 break;
11351 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
11352 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
11353 case QI_FTYPE_V4DF_V4DF_INT_UQI:
11354 case QI_FTYPE_V8SF_V8SF_INT_UQI:
11355 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
11356 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
11357 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
11358 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
11359 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
11360 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
11361 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
11362 case USI_FTYPE_V32QI_V32QI_INT_USI:
11363 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
11364 case USI_FTYPE_V32HI_V32HI_INT_USI:
11365 case USI_FTYPE_V32HF_V32HF_INT_USI:
11366 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
11367 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
11368 nargs = 4;
11369 mask_pos = 1;
11370 nargs_constant = 1;
11371 break;
11372 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
11373 nargs = 4;
11374 nargs_constant = 2;
11375 break;
11376 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
11377 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
11378 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
11379 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
11380 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
11381 nargs = 4;
11382 break;
11383 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
11384 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
11385 mask_pos = 1;
11386 nargs = 4;
11387 nargs_constant = 1;
11388 break;
11389 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
11390 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
11391 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
11392 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
11393 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
11394 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
11395 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
11396 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
11397 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
11398 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
11399 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
11400 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
11401 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
11402 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
11403 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
11404 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
11405 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
11406 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
11407 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
11408 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
11409 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
11410 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
11411 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
11412 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
11413 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
11414 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
11415 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
11416 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
11417 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
11418 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
11419 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
11420 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
11421 nargs = 4;
11422 mask_pos = 2;
11423 nargs_constant = 1;
11424 break;
11425 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
11426 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
11427 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
11428 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
11429 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
11430 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
11431 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
11432 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
11433 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
11434 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
11435 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
11436 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
11437 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
11438 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
11439 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
11440 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
11441 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
11442 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
11443 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
11444 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
11445 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
11446 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
11447 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
11448 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
11449 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
11450 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
11451 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
11452 nargs = 5;
11453 mask_pos = 2;
11454 nargs_constant = 1;
11455 break;
11456 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
11457 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
11458 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
11459 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
11460 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
11461 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
11462 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
11463 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
11464 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
11465 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
11466 nargs = 5;
11467 mask_pos = 1;
11468 nargs_constant = 1;
11469 break;
11470 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
11471 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
11472 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
11473 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
11474 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
11475 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
11476 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
11477 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
11478 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
11479 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
11480 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
11481 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
11482 nargs = 5;
11483 mask_pos = 1;
11484 nargs_constant = 2;
11485 break;
11486
11487 default:
11488 gcc_unreachable ();
11489 }
11490
11491 gcc_assert (nargs <= ARRAY_SIZE (xops));
11492
11493 if (comparison != UNKNOWN)
11494 {
11495 gcc_assert (nargs == 2);
11496 return ix86_expand_sse_compare (d, exp, target, swap);
11497 }
11498
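  /* If the builtin's return mode RMODE differs from the insn's output mode
     TMODE, let the insn write a fresh TMODE register and hand the caller a
     lowpart subreg of it in RMODE.  */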
11499 if (rmode == VOIDmode || rmode == tmode)
11500 {
11501 if (optimize
11502 || target == 0
11503 || GET_MODE (target) != tmode
11504 || !insn_p->operand[0].predicate (target, tmode))
11505 target = gen_reg_rtx (tmode);
11506 else if (memory_operand (target, tmode))
11507 num_memory++;
11508 real_target = target;
11509 }
11510 else
11511 {
11512 real_target = gen_reg_rtx (tmode);
      target = lowpart_subreg (rmode, real_target, tmode);
11514 }
11515
11516 for (i = 0; i < nargs; i++)
11517 {
11518 tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
11520 machine_mode mode = insn_p->operand[i + 1].mode;
11521 bool match = insn_p->operand[i + 1].predicate (op, mode);
11522
11523 if (second_arg_count && i == 1)
11524 {
          /* SIMD shift insns take either an 8-bit immediate or a register
             as the count, but the builtin functions take an int.  If the
             count doesn't match the operand predicate, put it in a register.
             The instructions use a 64-bit count; if OP is only 32 bits,
             zero-extend it, since negative shift counts are undefined
             behavior and zero extension is more efficient.  */
          if (!match)
            {
              if (SCALAR_INT_MODE_P (GET_MODE (op)))
                op = convert_modes (mode, GET_MODE (op), op, 1);
              else
                op = lowpart_subreg (mode, op, GET_MODE (op));
              if (!insn_p->operand[i + 1].predicate (op, mode))
                op = copy_to_reg (op);
            }
11541 }
11542 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11543 (!mask_pos && (nargs - i) <= nargs_constant))
11544 {
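          /* Operand I is expected to be a CONST_INT immediate; which
             positions are immediates is derived from NARGS_CONSTANT and
             MASK_POS, counting from the end of the argument list.  */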
11545 if (!match)
11546 switch (icode)
11547 {
11548 case CODE_FOR_avx_vinsertf128v4di:
11549 case CODE_FOR_avx_vextractf128v4di:
                error ("the last argument must be a 1-bit immediate");
11551 return const0_rtx;
11552
11553 case CODE_FOR_avx512f_cmpv8di3_mask:
11554 case CODE_FOR_avx512f_cmpv16si3_mask:
11555 case CODE_FOR_avx512f_ucmpv8di3_mask:
11556 case CODE_FOR_avx512f_ucmpv16si3_mask:
11557 case CODE_FOR_avx512vl_cmpv4di3_mask:
11558 case CODE_FOR_avx512vl_cmpv8si3_mask:
11559 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11560 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11561 case CODE_FOR_avx512vl_cmpv2di3_mask:
11562 case CODE_FOR_avx512vl_cmpv4si3_mask:
11563 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11564 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11565 error ("the last argument must be a 3-bit immediate");
11566 return const0_rtx;
11567
11568 case CODE_FOR_sse4_1_roundsd:
11569 case CODE_FOR_sse4_1_roundss:
11570
11571 case CODE_FOR_sse4_1_roundpd:
11572 case CODE_FOR_sse4_1_roundps:
11573 case CODE_FOR_avx_roundpd256:
11574 case CODE_FOR_avx_roundps256:
11575
11576 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11577 case CODE_FOR_sse4_1_roundps_sfix:
11578 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11579 case CODE_FOR_avx_roundps_sfix256:
11580
11581 case CODE_FOR_sse4_1_blendps:
11582 case CODE_FOR_avx_blendpd256:
11583 case CODE_FOR_avx_vpermilv4df:
11584 case CODE_FOR_avx_vpermilv4df_mask:
11585 case CODE_FOR_avx512f_getmantv8df_mask:
11586 case CODE_FOR_avx512f_getmantv16sf_mask:
11587 case CODE_FOR_avx512vl_getmantv16hf_mask:
11588 case CODE_FOR_avx512vl_getmantv8sf_mask:
11589 case CODE_FOR_avx512vl_getmantv4df_mask:
11590 case CODE_FOR_avx512fp16_getmantv8hf_mask:
11591 case CODE_FOR_avx512vl_getmantv4sf_mask:
11592 case CODE_FOR_avx512vl_getmantv2df_mask:
11593 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11594 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11595 case CODE_FOR_avx512dq_rangepv4df_mask:
11596 case CODE_FOR_avx512dq_rangepv8sf_mask:
11597 case CODE_FOR_avx512dq_rangepv2df_mask:
11598 case CODE_FOR_avx512dq_rangepv4sf_mask:
11599 case CODE_FOR_avx_shufpd256_mask:
11600 error ("the last argument must be a 4-bit immediate");
11601 return const0_rtx;
11602
11603 case CODE_FOR_sha1rnds4:
11604 case CODE_FOR_sse4_1_blendpd:
11605 case CODE_FOR_avx_vpermilv2df:
11606 case CODE_FOR_avx_vpermilv2df_mask:
11607 case CODE_FOR_xop_vpermil2v2df3:
11608 case CODE_FOR_xop_vpermil2v4sf3:
11609 case CODE_FOR_xop_vpermil2v4df3:
11610 case CODE_FOR_xop_vpermil2v8sf3:
11611 case CODE_FOR_avx512f_vinsertf32x4_mask:
11612 case CODE_FOR_avx512f_vinserti32x4_mask:
11613 case CODE_FOR_avx512f_vextractf32x4_mask:
11614 case CODE_FOR_avx512f_vextracti32x4_mask:
11615 case CODE_FOR_sse2_shufpd:
11616 case CODE_FOR_sse2_shufpd_mask:
11617 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11618 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11619 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11620 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11621 error ("the last argument must be a 2-bit immediate");
11622 return const0_rtx;
11623
11624 case CODE_FOR_avx_vextractf128v4df:
11625 case CODE_FOR_avx_vextractf128v8sf:
11626 case CODE_FOR_avx_vextractf128v8si:
11627 case CODE_FOR_avx_vinsertf128v4df:
11628 case CODE_FOR_avx_vinsertf128v8sf:
11629 case CODE_FOR_avx_vinsertf128v8si:
11630 case CODE_FOR_avx512f_vinsertf64x4_mask:
11631 case CODE_FOR_avx512f_vinserti64x4_mask:
11632 case CODE_FOR_avx512f_vextractf64x4_mask:
11633 case CODE_FOR_avx512f_vextracti64x4_mask:
11634 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11635 case CODE_FOR_avx512dq_vinserti32x8_mask:
11636 case CODE_FOR_avx512vl_vinsertv4df:
11637 case CODE_FOR_avx512vl_vinsertv4di:
11638 case CODE_FOR_avx512vl_vinsertv8sf:
11639 case CODE_FOR_avx512vl_vinsertv8si:
11640 error ("the last argument must be a 1-bit immediate");
11641 return const0_rtx;
11642
11643 case CODE_FOR_avx_vmcmpv2df3:
11644 case CODE_FOR_avx_vmcmpv4sf3:
11645 case CODE_FOR_avx_cmpv2df3:
11646 case CODE_FOR_avx_cmpv4sf3:
11647 case CODE_FOR_avx_cmpv4df3:
11648 case CODE_FOR_avx_cmpv8sf3:
11649 case CODE_FOR_avx512f_cmpv8df3_mask:
11650 case CODE_FOR_avx512f_cmpv16sf3_mask:
11651 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11652 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11653 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11654 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11655 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11656 error ("the last argument must be a 5-bit immediate");
11657 return const0_rtx;
11658
11659 default:
11660 switch (nargs_constant)
11661 {
11662 case 2:
11663 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11664 (!mask_pos && (nargs - i) == nargs_constant))
11665 {
11666 error ("the next to last argument must be an 8-bit immediate");
11667 break;
11668 }
11669 /* FALLTHRU */
11670 case 1:
11671 error ("the last argument must be an 8-bit immediate");
11672 break;
11673 default:
11674 gcc_unreachable ();
11675 }
11676 return const0_rtx;
11677 }
11678 }
11679 else
11680 {
11681 if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);
11683
11684 /* If we aren't optimizing, only allow one memory operand to
11685 be generated. */
11686 if (memory_operand (op, mode))
11687 num_memory++;
11688
          op = fixup_modeless_constant (op, mode);
11690
11691 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11692 {
11693 if (optimize || !match || num_memory > 1)
11694 op = copy_to_mode_reg (mode, op);
11695 }
11696 else
11697 {
11698 op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
11700 }
11701 }
11702
11703 xops[i] = op;
11704 }
11705
11706 switch (nargs)
11707 {
11708 case 1:
11709 pat = GEN_FCN (icode) (real_target, xops[0]);
11710 break;
11711 case 2:
11712 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11713 break;
11714 case 3:
11715 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11716 break;
11717 case 4:
11718 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11719 xops[2], xops[3]);
11720 break;
11721 case 5:
11722 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11723 xops[2], xops[3], xops[4]);
11724 break;
11725 case 6:
11726 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11727 xops[2], xops[3], xops[4], xops[5]);
11728 break;
11729 default:
11730 gcc_unreachable ();
11731 }
11732
11733 if (! pat)
11734 return 0;
11735
11736 emit_insn (pat);
11737 return target;
11738}
11739
/* Transform a pattern of the following layout:
     (set A
          (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set A B)  */
11746
11747static rtx
11748ix86_erase_embedded_rounding (rtx pat)
11749{
11750 if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);
11752
11753 gcc_assert (GET_CODE (pat) == SET);
11754 rtx src = SET_SRC (pat);
11755 gcc_assert (XVECLEN (src, 0) == 2);
11756 rtx p0 = XVECEXP (src, 0, 0);
11757 gcc_assert (GET_CODE (src) == UNSPEC
11758 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11759 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11760 return res;
11761}
11762
11763/* Subroutine of ix86_expand_round_builtin to take care of comi insns
11764 with rounding. */
11765static rtx
11766ix86_expand_sse_comi_round (const struct builtin_description *d,
11767 tree exp, rtx target)
11768{
11769 rtx pat, set_dst;
11770 tree arg0 = CALL_EXPR_ARG (exp, 0);
11771 tree arg1 = CALL_EXPR_ARG (exp, 1);
11772 tree arg2 = CALL_EXPR_ARG (exp, 2);
11773 tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
11778 enum insn_code icode = d->icode;
11779 const struct insn_data_d *insn_p = &insn_data[icode];
11780 machine_mode mode0 = insn_p->operand[0].mode;
11781 machine_mode mode1 = insn_p->operand[1].mode;
11782
11783 /* See avxintrin.h for values. */
11784 static const enum rtx_code comparisons[32] =
11785 {
11786 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11787 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11788 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11789 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
11790 };
11791 static const bool ordereds[32] =
11792 {
11793 true, true, true, false, false, false, false, true,
11794 false, false, false, true, true, true, true, false,
11795 true, true, true, false, false, false, false, true,
11796 false, false, false, true, true, true, true, false
11797 };
11798 static const bool non_signalings[32] =
11799 {
11800 true, false, false, true, true, false, false, true,
11801 true, false, false, true, true, false, false, true,
11802 false, true, true, false, false, true, true, false,
11803 false, true, true, false, false, true, true, false
11804 };
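  /* For example, index 0 (_CMP_EQ_OQ in avxintrin.h) selects EQ, ordered
     and non-signaling, while index 0x11 (_CMP_LT_OQ) selects LT, ordered
     and non-signaling.  */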
11805
11806 if (!CONST_INT_P (op2))
11807 {
      error ("the third argument must be a comparison constant");
11809 return const0_rtx;
11810 }
11811 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11812 {
11813 error ("incorrect comparison mode");
11814 return const0_rtx;
11815 }
11816
11817 if (!insn_p->operand[2].predicate (op3, SImode))
11818 {
11819 error ("incorrect rounding operand");
11820 return const0_rtx;
11821 }
11822
  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);
11827
11828 enum rtx_code comparison = comparisons[INTVAL (op2)];
11829 bool ordered = ordereds[INTVAL (op2)];
11830 bool non_signaling = non_signalings[INTVAL (op2)];
11831 rtx const_val = const0_rtx;
11832
11833 bool check_unordered = false;
11834 machine_mode mode = CCFPmode;
11835 switch (comparison)
11836 {
11837 case ORDERED:
11838 if (!ordered)
11839 {
11840 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11841 if (!non_signaling)
11842 ordered = true;
11843 mode = CCSmode;
11844 }
11845 else
11846 {
11847 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11848 if (non_signaling)
11849 ordered = false;
11850 mode = CCPmode;
11851 }
11852 comparison = NE;
11853 break;
11854 case UNORDERED:
11855 if (ordered)
11856 {
11857 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11858 if (non_signaling)
11859 ordered = false;
11860 mode = CCSmode;
11861 }
11862 else
11863 {
11864 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11865 if (!non_signaling)
11866 ordered = true;
11867 mode = CCPmode;
11868 }
11869 comparison = EQ;
11870 break;
11871
11872 case LE: /* -> GE */
11873 case LT: /* -> GT */
11874 case UNGE: /* -> UNLE */
11875 case UNGT: /* -> UNLT */
      std::swap (op0, op1);
11877 comparison = swap_condition (comparison);
11878 /* FALLTHRU */
11879 case GT:
11880 case GE:
11881 case UNEQ:
11882 case UNLT:
11883 case UNLE:
11884 case LTGT:
11885 /* These are supported by CCFPmode. NB: Use ordered/signaling
11886 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11887 with NAN operands. */
11888 if (ordered == non_signaling)
11889 ordered = !ordered;
11890 break;
11891 case EQ:
11892 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11893 _CMP_EQ_OQ/_CMP_EQ_OS. */
11894 check_unordered = true;
11895 mode = CCZmode;
11896 break;
11897 case NE:
11898 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11899 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11900 gcc_assert (!ordered);
11901 check_unordered = true;
11902 mode = CCZmode;
11903 const_val = const1_rtx;
11904 break;
11905 default:
11906 gcc_unreachable ();
11907 }
11908
11909 target = gen_reg_rtx (SImode);
11910 emit_move_insn (target, const_val);
11911 target = gen_rtx_SUBREG (QImode, target, 0);
11912
11913 if ((optimize && !register_operand (op0, mode0))
11914 || !insn_p->operand[0].predicate (op0, mode0))
11915 op0 = copy_to_mode_reg (mode0, op0);
11916 if ((optimize && !register_operand (op1, mode1))
11917 || !insn_p->operand[1].predicate (op1, mode1))
11918 op1 = copy_to_mode_reg (mode1, op1);
11919
11920 /*
11921 1. COMI: ordered and signaling.
11922 2. UCOMI: unordered and non-signaling.
11923 */
11924 if (non_signaling)
11925 icode = (icode == CODE_FOR_sse_comi_round
11926 ? CODE_FOR_sse_ucomi_round
11927 : CODE_FOR_sse2_ucomi_round);
11928
11929 pat = GEN_FCN (icode) (op0, op1, op3);
11930 if (! pat)
11931 return 0;
11932
11933 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11934 if (INTVAL (op3) == NO_ROUND)
11935 {
11936 pat = ix86_erase_embedded_rounding (pat);
11937 if (! pat)
11938 return 0;
11939
11940 set_dst = SET_DEST (pat);
11941 }
11942 else
11943 {
11944 gcc_assert (GET_CODE (pat) == SET);
11945 set_dst = SET_DEST (pat);
11946 }
11947
11948 emit_insn (pat);
11949
11950 return ix86_ssecom_setcc (comparison, check_unordered, mode,
11951 set_dst, target);
11952}
11953
11954static rtx
11955ix86_expand_round_builtin (const struct builtin_description *d,
11956 tree exp, rtx target)
11957{
11958 rtx pat;
11959 unsigned int i, nargs;
11960 rtx xops[6];
11961 enum insn_code icode = d->icode;
11962 const struct insn_data_d *insn_p = &insn_data[icode];
11963 machine_mode tmode = insn_p->operand[0].mode;
11964 unsigned int nargs_constant = 0;
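  /* Nonzero when the rounding argument turns out to be NO_ROUND, in which
     case the UNSPEC_EMBEDDED_ROUNDING wrapper produced by the pattern is
     stripped again before the insn is emitted.  */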
11965 unsigned int redundant_embed_rnd = 0;
11966
11967 switch ((enum ix86_builtin_func_type) d->flag)
11968 {
11969 case UINT64_FTYPE_V2DF_INT:
11970 case UINT64_FTYPE_V4SF_INT:
11971 case UINT64_FTYPE_V8HF_INT:
11972 case UINT_FTYPE_V2DF_INT:
11973 case UINT_FTYPE_V4SF_INT:
11974 case UINT_FTYPE_V8HF_INT:
11975 case INT64_FTYPE_V2DF_INT:
11976 case INT64_FTYPE_V4SF_INT:
11977 case INT64_FTYPE_V8HF_INT:
11978 case INT_FTYPE_V2DF_INT:
11979 case INT_FTYPE_V4SF_INT:
11980 case INT_FTYPE_V8HF_INT:
11981 nargs = 2;
11982 break;
11983 case V32HF_FTYPE_V32HF_V32HF_INT:
11984 case V8HF_FTYPE_V8HF_V8HF_INT:
11985 case V8HF_FTYPE_V8HF_INT_INT:
11986 case V8HF_FTYPE_V8HF_UINT_INT:
11987 case V8HF_FTYPE_V8HF_INT64_INT:
11988 case V8HF_FTYPE_V8HF_UINT64_INT:
11989 case V4SF_FTYPE_V4SF_UINT_INT:
11990 case V4SF_FTYPE_V4SF_UINT64_INT:
11991 case V2DF_FTYPE_V2DF_UINT64_INT:
11992 case V4SF_FTYPE_V4SF_INT_INT:
11993 case V4SF_FTYPE_V4SF_INT64_INT:
11994 case V2DF_FTYPE_V2DF_INT64_INT:
11995 case V4SF_FTYPE_V4SF_V4SF_INT:
11996 case V2DF_FTYPE_V2DF_V2DF_INT:
11997 case V4SF_FTYPE_V4SF_V2DF_INT:
11998 case V2DF_FTYPE_V2DF_V4SF_INT:
11999 nargs = 3;
12000 break;
12001 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
12002 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
12003 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
12004 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
12005 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
12006 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
12007 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
12008 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
12009 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
12010 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
12011 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
12012 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
12013 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
12014 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
12015 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
12016 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
12017 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
12018 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
12019 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
12020 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
12021 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
12022 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
12023 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
12024 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
12025 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
12026 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
12027 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
12028 nargs = 4;
12029 break;
12030 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
12031 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
12032 nargs_constant = 2;
12033 nargs = 4;
12034 break;
12035 case INT_FTYPE_V4SF_V4SF_INT_INT:
12036 case INT_FTYPE_V2DF_V2DF_INT_INT:
12037 return ix86_expand_sse_comi_round (d, exp, target);
12038 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
12039 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
12040 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
12041 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
12042 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
12043 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
12044 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
12045 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
12046 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
12047 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
12048 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
12049 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
12050 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
12051 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
12052 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
12053 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
12054 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
12055 nargs = 5;
12056 break;
12057 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
12058 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
12059 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
12060 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
12061 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
12062 nargs_constant = 4;
12063 nargs = 5;
12064 break;
12065 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
12066 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
12067 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
12068 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
12069 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
12070 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
12071 nargs_constant = 3;
12072 nargs = 5;
12073 break;
12074 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
12075 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
12076 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
12077 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
12078 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
12079 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
12080 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
12081 nargs = 6;
12082 nargs_constant = 4;
12083 break;
12084 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
12085 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
12086 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
12087 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
12088 nargs = 6;
12089 nargs_constant = 3;
12090 break;
12091 default:
12092 gcc_unreachable ();
12093 }
12094 gcc_assert (nargs <= ARRAY_SIZE (xops));
12095
12096 if (optimize
12097 || target == 0
12098 || GET_MODE (target) != tmode
12099 || !insn_p->operand[0].predicate (target, tmode))
12100 target = gen_reg_rtx (tmode);
12101
12102 for (i = 0; i < nargs; i++)
12103 {
12104 tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
12106 machine_mode mode = insn_p->operand[i + 1].mode;
12107 bool match = insn_p->operand[i + 1].predicate (op, mode);
12108
12109 if (i == nargs - nargs_constant)
12110 {
12111 if (!match)
12112 {
12113 switch (icode)
12114 {
12115 case CODE_FOR_avx512f_getmantv8df_mask_round:
12116 case CODE_FOR_avx512f_getmantv16sf_mask_round:
12117 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
12118 case CODE_FOR_avx512f_vgetmantv2df_round:
12119 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
12120 case CODE_FOR_avx512f_vgetmantv4sf_round:
12121 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
12122 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
12123 error ("the immediate argument must be a 4-bit immediate");
12124 return const0_rtx;
12125 case CODE_FOR_avx512f_cmpv8df3_mask_round:
12126 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
12127 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
12128 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
12129 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
12130 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
12131 error ("the immediate argument must be a 5-bit immediate");
12132 return const0_rtx;
12133 default:
12134 error ("the immediate argument must be an 8-bit immediate");
12135 return const0_rtx;
12136 }
12137 }
12138 }
      else if (i == nargs - 1)
12140 {
12141 if (!insn_p->operand[nargs].predicate (op, SImode))
12142 {
12143 error ("incorrect rounding operand");
12144 return const0_rtx;
12145 }
12146
          /* If there is no rounding, use the normal version of the pattern.  */
12148 if (INTVAL (op) == NO_ROUND)
12149 {
              /* Skip erasing the embedded rounding for the expanders below,
                 which generate multiple insns.  In ix86_erase_embedded_rounding
                 the pattern would be transformed into a single SET, and
                 emit_insn would append that SET instead of inserting it into
                 the chain, so the insns emitted inside the define_expand
                 would be lost.  */
12155 switch (icode)
12156 {
12157 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
12158 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
12159 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
12160 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
12161 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
12162 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
12163 redundant_embed_rnd = 0;
12164 break;
12165 default:
12166 redundant_embed_rnd = 1;
12167 break;
12168 }
12169 }
12170 }
12171 else
12172 {
12173 if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);
12175
          op = fixup_modeless_constant (op, mode);
12177
12178 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12179 {
12180 if (optimize || !match)
12181 op = copy_to_mode_reg (mode, op);
12182 }
12183 else
12184 {
12185 op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
12187 }
12188 }
12189
12190 xops[i] = op;
12191 }
12192
12193 switch (nargs)
12194 {
12195 case 1:
12196 pat = GEN_FCN (icode) (target, xops[0]);
12197 break;
12198 case 2:
12199 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12200 break;
12201 case 3:
12202 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12203 break;
12204 case 4:
12205 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12206 xops[2], xops[3]);
12207 break;
12208 case 5:
12209 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12210 xops[2], xops[3], xops[4]);
12211 break;
12212 case 6:
12213 pat = GEN_FCN (icode) (target, xops[0], xops[1],
12214 xops[2], xops[3], xops[4], xops[5]);
12215 break;
12216 default:
12217 gcc_unreachable ();
12218 }
12219
12220 if (!pat)
12221 return 0;
12222
12223 if (redundant_embed_rnd)
12224 pat = ix86_erase_embedded_rounding (pat);
12225
12226 emit_insn (pat);
12227 return target;
12228}
12229
12230/* Subroutine of ix86_expand_builtin to take care of special insns
12231 with variable number of operands. */
12232
12233static rtx
12234ix86_expand_special_args_builtin (const struct builtin_description *d,
12235 tree exp, rtx target)
12236{
12237 tree arg;
12238 rtx pat, op;
12239 unsigned int i, nargs, arg_adjust, memory;
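  /* Index of the constant argument, if any; 100 is a sentinel larger than
     any real argument index, meaning the builtin has none.  */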
12240 unsigned int constant = 100;
12241 bool aligned_mem = false;
12242 rtx xops[4];
12243 enum insn_code icode = d->icode;
12244 const struct insn_data_d *insn_p = &insn_data[icode];
12245 machine_mode tmode = insn_p->operand[0].mode;
12246 enum { load, store } klass;
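  /* KLASS selects between builtins whose result is loaded into TARGET and
     builtins that consume their first argument as the destination and
     return nothing.  */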
12247
12248 switch ((enum ix86_builtin_func_type) d->flag)
12249 {
12250 case VOID_FTYPE_VOID:
12251 emit_insn (GEN_FCN (icode) (target));
12252 return 0;
12253 case VOID_FTYPE_UINT64:
12254 case VOID_FTYPE_UNSIGNED:
12255 nargs = 0;
12256 klass = store;
12257 memory = 0;
12258 break;
12259
12260 case INT_FTYPE_VOID:
12261 case USHORT_FTYPE_VOID:
12262 case UINT64_FTYPE_VOID:
12263 case UINT_FTYPE_VOID:
12264 case UINT8_FTYPE_VOID:
12265 case UNSIGNED_FTYPE_VOID:
12266 nargs = 0;
12267 klass = load;
12268 memory = 0;
12269 break;
12270 case UINT64_FTYPE_PUNSIGNED:
12271 case V2DI_FTYPE_PV2DI:
12272 case V4DI_FTYPE_PV4DI:
12273 case V32QI_FTYPE_PCCHAR:
12274 case V16QI_FTYPE_PCCHAR:
12275 case V8SF_FTYPE_PCV4SF:
12276 case V8SF_FTYPE_PCFLOAT:
12277 case V4SF_FTYPE_PCFLOAT:
12278 case V4SF_FTYPE_PCFLOAT16:
12279 case V4SF_FTYPE_PCBFLOAT16:
12280 case V4SF_FTYPE_PCV8BF:
12281 case V4SF_FTYPE_PCV8HF:
12282 case V8SF_FTYPE_PCFLOAT16:
12283 case V8SF_FTYPE_PCBFLOAT16:
12284 case V8SF_FTYPE_PCV16HF:
12285 case V8SF_FTYPE_PCV16BF:
12286 case V4DF_FTYPE_PCV2DF:
12287 case V4DF_FTYPE_PCDOUBLE:
12288 case V2DF_FTYPE_PCDOUBLE:
12289 case VOID_FTYPE_PVOID:
12290 case V8DI_FTYPE_PV8DI:
12291 nargs = 1;
12292 klass = load;
12293 memory = 0;
12294 switch (icode)
12295 {
12296 case CODE_FOR_sse4_1_movntdqa:
12297 case CODE_FOR_avx2_movntdqa:
12298 case CODE_FOR_avx512f_movntdqa:
12299 aligned_mem = true;
12300 break;
12301 default:
12302 break;
12303 }
12304 break;
12305 case VOID_FTYPE_PV2SF_V4SF:
12306 case VOID_FTYPE_PV8DI_V8DI:
12307 case VOID_FTYPE_PV4DI_V4DI:
12308 case VOID_FTYPE_PV2DI_V2DI:
12309 case VOID_FTYPE_PCHAR_V32QI:
12310 case VOID_FTYPE_PCHAR_V16QI:
12311 case VOID_FTYPE_PFLOAT_V16SF:
12312 case VOID_FTYPE_PFLOAT_V8SF:
12313 case VOID_FTYPE_PFLOAT_V4SF:
12314 case VOID_FTYPE_PDOUBLE_V8DF:
12315 case VOID_FTYPE_PDOUBLE_V4DF:
12316 case VOID_FTYPE_PDOUBLE_V2DF:
12317 case VOID_FTYPE_PLONGLONG_LONGLONG:
12318 case VOID_FTYPE_PULONGLONG_ULONGLONG:
12319 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
12320 case VOID_FTYPE_PINT_INT:
12321 nargs = 1;
12322 klass = store;
12323 /* Reserve memory operand for target. */
12324 memory = ARRAY_SIZE (xops);
12325 switch (icode)
12326 {
12327 /* These builtins and instructions require the memory
12328 to be properly aligned. */
12329 case CODE_FOR_avx_movntv4di:
12330 case CODE_FOR_sse2_movntv2di:
12331 case CODE_FOR_avx_movntv8sf:
12332 case CODE_FOR_sse_movntv4sf:
12333 case CODE_FOR_sse4a_vmmovntv4sf:
12334 case CODE_FOR_avx_movntv4df:
12335 case CODE_FOR_sse2_movntv2df:
12336 case CODE_FOR_sse4a_vmmovntv2df:
12337 case CODE_FOR_sse2_movntidi:
12338 case CODE_FOR_sse_movntq:
12339 case CODE_FOR_sse2_movntisi:
12340 case CODE_FOR_avx512f_movntv16sf:
12341 case CODE_FOR_avx512f_movntv8df:
12342 case CODE_FOR_avx512f_movntv8di:
12343 aligned_mem = true;
12344 break;
12345 default:
12346 break;
12347 }
12348 break;
12349 case VOID_FTYPE_PVOID_PCVOID:
12350 nargs = 1;
12351 klass = store;
12352 memory = 0;
12354 break;
12355 case V4SF_FTYPE_V4SF_PCV2SF:
12356 case V2DF_FTYPE_V2DF_PCDOUBLE:
12357 nargs = 2;
12358 klass = load;
12359 memory = 1;
12360 break;
12361 case V8SF_FTYPE_PCV8SF_V8SI:
12362 case V4DF_FTYPE_PCV4DF_V4DI:
12363 case V4SF_FTYPE_PCV4SF_V4SI:
12364 case V2DF_FTYPE_PCV2DF_V2DI:
12365 case V8SI_FTYPE_PCV8SI_V8SI:
12366 case V4DI_FTYPE_PCV4DI_V4DI:
12367 case V4SI_FTYPE_PCV4SI_V4SI:
12368 case V2DI_FTYPE_PCV2DI_V2DI:
12369 case VOID_FTYPE_INT_INT64:
12370 nargs = 2;
12371 klass = load;
12372 memory = 0;
12373 break;
12374 case VOID_FTYPE_PV8DF_V8DF_UQI:
12375 case VOID_FTYPE_PV4DF_V4DF_UQI:
12376 case VOID_FTYPE_PV2DF_V2DF_UQI:
12377 case VOID_FTYPE_PV16SF_V16SF_UHI:
12378 case VOID_FTYPE_PV8SF_V8SF_UQI:
12379 case VOID_FTYPE_PV4SF_V4SF_UQI:
12380 case VOID_FTYPE_PV8DI_V8DI_UQI:
12381 case VOID_FTYPE_PV4DI_V4DI_UQI:
12382 case VOID_FTYPE_PV2DI_V2DI_UQI:
12383 case VOID_FTYPE_PV16SI_V16SI_UHI:
12384 case VOID_FTYPE_PV8SI_V8SI_UQI:
12385 case VOID_FTYPE_PV4SI_V4SI_UQI:
12386 case VOID_FTYPE_PV64QI_V64QI_UDI:
12387 case VOID_FTYPE_PV32HI_V32HI_USI:
12388 case VOID_FTYPE_PV32QI_V32QI_USI:
12389 case VOID_FTYPE_PV16QI_V16QI_UHI:
12390 case VOID_FTYPE_PV16HI_V16HI_UHI:
12391 case VOID_FTYPE_PV8HI_V8HI_UQI:
12392 switch (icode)
12393 {
12394 /* These builtins and instructions require the memory
12395 to be properly aligned. */
12396 case CODE_FOR_avx512f_storev16sf_mask:
12397 case CODE_FOR_avx512f_storev16si_mask:
12398 case CODE_FOR_avx512f_storev8df_mask:
12399 case CODE_FOR_avx512f_storev8di_mask:
12400 case CODE_FOR_avx512vl_storev8sf_mask:
12401 case CODE_FOR_avx512vl_storev8si_mask:
12402 case CODE_FOR_avx512vl_storev4df_mask:
12403 case CODE_FOR_avx512vl_storev4di_mask:
12404 case CODE_FOR_avx512vl_storev4sf_mask:
12405 case CODE_FOR_avx512vl_storev4si_mask:
12406 case CODE_FOR_avx512vl_storev2df_mask:
12407 case CODE_FOR_avx512vl_storev2di_mask:
12408 aligned_mem = true;
12409 break;
12410 default:
12411 break;
12412 }
12413 /* FALLTHRU */
12414 case VOID_FTYPE_PV8SF_V8SI_V8SF:
12415 case VOID_FTYPE_PV4DF_V4DI_V4DF:
12416 case VOID_FTYPE_PV4SF_V4SI_V4SF:
12417 case VOID_FTYPE_PV2DF_V2DI_V2DF:
12418 case VOID_FTYPE_PV8SI_V8SI_V8SI:
12419 case VOID_FTYPE_PV4DI_V4DI_V4DI:
12420 case VOID_FTYPE_PV4SI_V4SI_V4SI:
12421 case VOID_FTYPE_PV2DI_V2DI_V2DI:
12422 case VOID_FTYPE_PV8SI_V8DI_UQI:
12423 case VOID_FTYPE_PV8HI_V8DI_UQI:
12424 case VOID_FTYPE_PV16HI_V16SI_UHI:
12425 case VOID_FTYPE_PUDI_V8DI_UQI:
12426 case VOID_FTYPE_PV16QI_V16SI_UHI:
12427 case VOID_FTYPE_PV4SI_V4DI_UQI:
12428 case VOID_FTYPE_PUDI_V2DI_UQI:
12429 case VOID_FTYPE_PUDI_V4DI_UQI:
12430 case VOID_FTYPE_PUSI_V2DI_UQI:
12431 case VOID_FTYPE_PV8HI_V8SI_UQI:
12432 case VOID_FTYPE_PUDI_V4SI_UQI:
12433 case VOID_FTYPE_PUSI_V4DI_UQI:
12434 case VOID_FTYPE_PUHI_V2DI_UQI:
12435 case VOID_FTYPE_PUDI_V8SI_UQI:
12436 case VOID_FTYPE_PUSI_V4SI_UQI:
12437 case VOID_FTYPE_PCHAR_V64QI_UDI:
12438 case VOID_FTYPE_PCHAR_V32QI_USI:
12439 case VOID_FTYPE_PCHAR_V16QI_UHI:
12440 case VOID_FTYPE_PSHORT_V32HI_USI:
12441 case VOID_FTYPE_PSHORT_V16HI_UHI:
12442 case VOID_FTYPE_PSHORT_V8HI_UQI:
12443 case VOID_FTYPE_PINT_V16SI_UHI:
12444 case VOID_FTYPE_PINT_V8SI_UQI:
12445 case VOID_FTYPE_PINT_V4SI_UQI:
12446 case VOID_FTYPE_PINT64_V8DI_UQI:
12447 case VOID_FTYPE_PINT64_V4DI_UQI:
12448 case VOID_FTYPE_PINT64_V2DI_UQI:
12449 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
12450 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
12451 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
12452 case VOID_FTYPE_PFLOAT_V16SF_UHI:
12453 case VOID_FTYPE_PFLOAT_V8SF_UQI:
12454 case VOID_FTYPE_PFLOAT_V4SF_UQI:
12455 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
12456 case VOID_FTYPE_PV32QI_V32HI_USI:
12457 case VOID_FTYPE_PV16QI_V16HI_UHI:
12458 case VOID_FTYPE_PUDI_V8HI_UQI:
12459 nargs = 2;
12460 klass = store;
12461 /* Reserve memory operand for target. */
12462 memory = ARRAY_SIZE (xops);
12463 break;
12464 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
12465 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
12466 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
12467 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
12468 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
12469 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
12470 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
12471 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
12472 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
12473 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
12474 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
12475 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
12476 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
12477 case V32HI_FTYPE_PCV32HI_V32HI_USI:
12478 case V32QI_FTYPE_PCV32QI_V32QI_USI:
12479 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
12480 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
12481 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
12482 switch (icode)
12483 {
12484 /* These builtins and instructions require the memory
12485 to be properly aligned. */
12486 case CODE_FOR_avx512f_loadv16sf_mask:
12487 case CODE_FOR_avx512f_loadv16si_mask:
12488 case CODE_FOR_avx512f_loadv8df_mask:
12489 case CODE_FOR_avx512f_loadv8di_mask:
12490 case CODE_FOR_avx512vl_loadv8sf_mask:
12491 case CODE_FOR_avx512vl_loadv8si_mask:
12492 case CODE_FOR_avx512vl_loadv4df_mask:
12493 case CODE_FOR_avx512vl_loadv4di_mask:
12494 case CODE_FOR_avx512vl_loadv4sf_mask:
12495 case CODE_FOR_avx512vl_loadv4si_mask:
12496 case CODE_FOR_avx512vl_loadv2df_mask:
12497 case CODE_FOR_avx512vl_loadv2di_mask:
12498 case CODE_FOR_avx512bw_loadv64qi_mask:
12499 case CODE_FOR_avx512vl_loadv32qi_mask:
12500 case CODE_FOR_avx512vl_loadv16qi_mask:
12501 case CODE_FOR_avx512bw_loadv32hi_mask:
12502 case CODE_FOR_avx512vl_loadv16hi_mask:
12503 case CODE_FOR_avx512vl_loadv8hi_mask:
12504 aligned_mem = true;
12505 break;
12506 default:
12507 break;
12508 }
12509 /* FALLTHRU */
12510 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12511 case V32QI_FTYPE_PCCHAR_V32QI_USI:
12512 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12513 case V32HI_FTYPE_PCSHORT_V32HI_USI:
12514 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12515 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12516 case V16SI_FTYPE_PCINT_V16SI_UHI:
12517 case V8SI_FTYPE_PCINT_V8SI_UQI:
12518 case V4SI_FTYPE_PCINT_V4SI_UQI:
12519 case V8DI_FTYPE_PCINT64_V8DI_UQI:
12520 case V4DI_FTYPE_PCINT64_V4DI_UQI:
12521 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12522 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12523 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12524 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12525 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12526 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12527 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
12528 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
12529 nargs = 3;
12530 klass = load;
12531 memory = 0;
12532 break;
12533 case INT_FTYPE_PINT_INT_INT_INT:
12534 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
12535 nargs = 4;
12536 klass = load;
12537 memory = 0;
12538 constant = 3;
12539 break;
12540 default:
12541 gcc_unreachable ();
12542 }
12543
12544 gcc_assert (nargs <= ARRAY_SIZE (xops));
12545
12546 if (klass == store)
12547 {
12548 arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
12550 gcc_assert (target == 0);
12551 if (memory)
12552 {
12553 op = ix86_zero_extend_to_Pmode (op);
12554 target = gen_rtx_MEM (tmode, op);
12555 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12556 on it. Try to improve it using get_pointer_alignment,
12557 and if the special builtin is one that requires strict
             mode alignment, also from its GET_MODE_ALIGNMENT.
12559 Failure to do so could lead to ix86_legitimate_combined_insn
12560 rejecting all changes to such insns. */
12561 unsigned int align = get_pointer_alignment (arg);
12562 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12563 align = GET_MODE_ALIGNMENT (tmode);
12564 if (MEM_ALIGN (target) < align)
12565 set_mem_align (target, align);
12566 }
12567 else
12568 target = force_reg (tmode, op);
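      /* The first call argument has been consumed as the destination above;
         the insn's remaining operands come from call arguments 1 onward.  */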
12569 arg_adjust = 1;
12570 }
12571 else
12572 {
12573 arg_adjust = 0;
12574 if (optimize
12575 || target == 0
12576 || !register_operand (target, tmode)
12577 || GET_MODE (target) != tmode)
12578 target = gen_reg_rtx (tmode);
12579 }
12580
12581 for (i = 0; i < nargs; i++)
12582 {
12583 machine_mode mode = insn_p->operand[i + 1].mode;
12584
12585 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = expand_normal (arg);
12587
12588 if (i == memory)
12589 {
12590 /* This must be the memory operand. */
12591 op = ix86_zero_extend_to_Pmode (op);
12592 op = gen_rtx_MEM (mode, op);
12593 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12594 on it. Try to improve it using get_pointer_alignment,
12595 and if the special builtin is one that requires strict
             mode alignment, also from its GET_MODE_ALIGNMENT.
12597 Failure to do so could lead to ix86_legitimate_combined_insn
12598 rejecting all changes to such insns. */
12599 unsigned int align = get_pointer_alignment (arg);
12600 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12601 align = GET_MODE_ALIGNMENT (mode);
12602 if (MEM_ALIGN (op) < align)
12603 set_mem_align (op, align);
12604 }
12605 else if (i == constant)
12606 {
12607 /* This must be the constant. */
          if (!insn_p->operand[nargs].predicate (op, SImode))
12609 {
12610 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12611 return const0_rtx;
12612 }
12613 }
12614 else
12615 {
12616 /* This must be register. */
12617 if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);
12619
          op = fixup_modeless_constant (op, mode);
12621
          /* NB: A 3-operand load implies a mask load or v{p}expand*,
             and the mask operand should be at the end.
             Keep an all-ones mask, which would be simplified away by the
             expander.  */
12625 if (nargs == 3 && i == 2 && klass == load
12626 && constm1_operand (op, mode)
12627 && insn_p->operand[i].predicate (op, mode))
12628 ;
12629 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12630 op = copy_to_mode_reg (mode, op);
12631 else
12632 {
12633 op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
12635 }
12636 }
12637
      xops[i] = op;
12639 }
12640
12641 switch (nargs)
12642 {
12643 case 0:
12644 pat = GEN_FCN (icode) (target);
12645 break;
12646 case 1:
12647 pat = GEN_FCN (icode) (target, xops[0]);
12648 break;
12649 case 2:
12650 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12651 break;
12652 case 3:
12653 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12654 break;
12655 case 4:
12656 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
12657 break;
12658 default:
12659 gcc_unreachable ();
12660 }
12661
12662 if (! pat)
12663 return 0;
12664
12665 emit_insn (pat);
12666 return klass == store ? 0 : target;
12667}
12668
12669/* Return the integer constant in ARG. Constrain it to be in the range
12670 of the subparts of VEC_TYPE; issue an error if not. */
12671
12672static int
12673get_element_number (tree vec_type, tree arg)
12674{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12676
12677 if (!tree_fits_uhwi_p (arg)
12678 || (elt = tree_to_uhwi (arg), elt > max))
12679 {
12680 error ("selector must be an integer constant in the range "
12681 "[0, %wi]", max);
12682 return 0;
12683 }
12684
12685 return elt;
12686}
12687
12688/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12689 ix86_expand_vector_init. We DO have language-level syntax for this, in
12690 the form of (type){ init-list }. Except that since we can't place emms
12691 instructions from inside the compiler, we can't allow the use of MMX
12692 registers unless the user explicitly asks for it. So we do *not* define
12693 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12694 we have builtins invoked by mmintrin.h that gives us license to emit
12695 these sorts of instructions. */
12696
12697static rtx
12698ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12699{
12700 machine_mode tmode = TYPE_MODE (type);
12701 machine_mode inner_mode = GET_MODE_INNER (tmode);
12702 int i, n_elt = GET_MODE_NUNITS (tmode);
12703 rtvec v = rtvec_alloc (n_elt);
12704
12705 gcc_assert (VECTOR_MODE_P (tmode));
12706 gcc_assert (call_expr_nargs (exp) == n_elt);
12707
12708 for (i = 0; i < n_elt; ++i)
12709 {
12710 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12711 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12712 }
12713
12714 if (!target || !register_operand (target, tmode))
12715 target = gen_reg_rtx (tmode);
12716
12717 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12718 return target;
12719}
12720
12721/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12722 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12723 had a language-level syntax for referencing vector elements. */
12724
12725static rtx
12726ix86_expand_vec_ext_builtin (tree exp, rtx target)
12727{
12728 machine_mode tmode, mode0;
12729 tree arg0, arg1;
12730 int elt;
12731 rtx op0;
12732
12733 arg0 = CALL_EXPR_ARG (exp, 0);
12734 arg1 = CALL_EXPR_ARG (exp, 1);
12735
  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);
12738
12739 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12740 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12741 gcc_assert (VECTOR_MODE_P (mode0));
12742
12743 op0 = force_reg (mode0, op0);
12744
12745 if (optimize || !target || !register_operand (target, tmode))
12746 target = gen_reg_rtx (tmode);
12747
12748 ix86_expand_vector_extract (true, target, op0, elt);
12749
12750 return target;
12751}
12752
12753/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12754 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12755 a language-level syntax for referencing vector elements. */
12756
12757static rtx
12758ix86_expand_vec_set_builtin (tree exp)
12759{
12760 machine_mode tmode, mode1;
12761 tree arg0, arg1, arg2;
12762 int elt;
12763 rtx op0, op1, target;
12764
12765 arg0 = CALL_EXPR_ARG (exp, 0);
12766 arg1 = CALL_EXPR_ARG (exp, 1);
12767 arg2 = CALL_EXPR_ARG (exp, 2);
12768
12769 tmode = TYPE_MODE (TREE_TYPE (arg0));
12770 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12771 gcc_assert (VECTOR_MODE_P (tmode));
12772
  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);
12776
12777 if (GET_MODE (op1) != mode1)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12779
12780 op0 = force_reg (tmode, op0);
12781 op1 = force_reg (mode1, op1);
12782
12783 /* OP0 is the source of these builtin functions and shouldn't be
12784 modified. Create a copy, use it and return it as target. */
12785 target = gen_reg_rtx (tmode);
12786 emit_move_insn (target, op0);
12787 ix86_expand_vector_set (true, target, op1, elt);
12788
12789 return target;
12790}
12791
12792/* Return true if the necessary isa options for this builtin exist,
12793 else false.
12794 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12795bool
12796ix86_check_builtin_isa_match (unsigned int fcode,
12797 HOST_WIDE_INT* pbisa,
12798 HOST_WIDE_INT* pbisa2)
12799{
12800 HOST_WIDE_INT isa = ix86_isa_flags;
12801 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12802 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12803 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12804 HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
12805 /* The general case is we require all the ISAs specified in bisa{,2}
12806 to be enabled.
12807 The exceptions are:
12808 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12809 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12810 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
12811 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12812 OPTION_MASK_ISA2_AVXVNNI
12813 (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
12814 OPTION_MASK_ISA2_AVXIFMA
12815 (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
12816 OPTION_MASK_ISA2_AVXNECONVERT
12817 OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
12818 where for each such pair it is sufficient if either of the ISAs is
12819 enabled; any other options ORed into the requirement must be enabled too.
12820 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
12821
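/* SHARE_BUILTIN (A1, A2, B1, B2): if the builtin requires both the (A1, A2)
 and (B1, B2) sets and at least one of the two sets is currently enabled,
 pretend both are enabled for the final check below. */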
12822#define SHARE_BUILTIN(A1, A2, B1, B2) \
12823 if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
12824 && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
12825 && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
12826 || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
12827 { \
12828 tmp_isa |= (A1) | (B1); \
12829 tmp_isa2 |= (A2) | (B2); \
12830 }
12831
12832 SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
12833 SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
12834 SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
12835 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
12836 OPTION_MASK_ISA2_AVXVNNI);
12837 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
12838 OPTION_MASK_ISA2_AVXIFMA);
12839 SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
12840 OPTION_MASK_ISA2_AVXNECONVERT);
12841 SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
12842 OPTION_MASK_ISA2_VAES);
12843 isa = tmp_isa;
12844 isa2 = tmp_isa2;
12845
12846 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12847 /* __builtin_ia32_maskmovq requires MMX registers. */
12848 && fcode != IX86_BUILTIN_MASKMOVQ)
12849 {
12850 bisa &= ~OPTION_MASK_ISA_MMX;
12851 bisa |= OPTION_MASK_ISA_SSE2;
12852 }
12853
12854 if (pbisa)
12855 *pbisa = bisa;
12856 if (pbisa2)
12857 *pbisa2 = bisa2;
12858
12859 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12860}
12861
12862/* Emit instructions to set the carry flag from ARG. */
12863
12864void
12865ix86_expand_carry (rtx arg)
12866{
12867 if (!CONST_INT_P (arg) || arg == const0_rtx)
12868 {
12869 arg = convert_to_mode (QImode, arg, 1);
12870 arg = copy_to_mode_reg (QImode, arg);
12871 emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
12872 }
12873 else
12874 emit_insn (gen_x86_stc ());
12875}
12876
12877/* Expand an expression EXP that calls a built-in function,
12878 with result going to TARGET if that's convenient
12879 (and in mode MODE if that's convenient).
12880 SUBTARGET may be used as the target for computing one of EXP's operands.
12881 IGNORE is nonzero if the value is to be ignored. */
12882
12883rtx
12884ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12885 machine_mode mode, int ignore)
12886{
12887 size_t i;
12888 enum insn_code icode, icode2;
12889 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12890 tree arg0, arg1, arg2, arg3, arg4;
12891 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12892 machine_mode mode0, mode1, mode2, mode3, mode4;
12893 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12894 HOST_WIDE_INT bisa, bisa2;
12895
12896 /* For CPU builtins that can be folded, fold first and expand the fold. */
12897 switch (fcode)
12898 {
12899 case IX86_BUILTIN_CPU_INIT:
12900 {
12901 /* Make it call __cpu_indicator_init in libgcc. */
12902 tree call_expr, fndecl, type;
12903 type = build_function_type_list (integer_type_node, NULL_TREE);
12904 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12905 call_expr = build_call_expr (fndecl, 0);
12906 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12907 }
12908 case IX86_BUILTIN_CPU_IS:
12909 case IX86_BUILTIN_CPU_SUPPORTS:
12910 {
12911 tree arg0 = CALL_EXPR_ARG (exp, 0);
12912 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12913 gcc_assert (fold_expr != NULL_TREE);
12914 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12915 }
12916 }
12917
12918 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
12919 {
12920 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12921 if (TARGET_ABI_X32)
12922 bisa |= OPTION_MASK_ABI_X32;
12923 else
12924 bisa |= OPTION_MASK_ABI_64;
12925 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12926 (enum fpmath_unit) 0,
12927 (enum prefer_vector_width) 0,
12928 PVW_NONE, PVW_NONE,
12929 false, add_abi_p);
12930 if (!opts)
12931 error ("%qE needs unknown isa option", fndecl);
12932 else
12933 {
12934 gcc_assert (opts != NULL);
12935 error ("%qE needs isa option %s", fndecl, opts);
12936 free (opts);
12937 }
12938 return expand_call (exp, target, ignore);
12939 }
12940
12941 switch (fcode)
12942 {
12943 case IX86_BUILTIN_MASKMOVQ:
12944 case IX86_BUILTIN_MASKMOVDQU:
12945 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12946 ? CODE_FOR_mmx_maskmovq
12947 : CODE_FOR_sse2_maskmovdqu);
12948 /* Note the arg order is different from the operand order. */
12949 arg1 = CALL_EXPR_ARG (exp, 0);
12950 arg2 = CALL_EXPR_ARG (exp, 1);
12951 arg0 = CALL_EXPR_ARG (exp, 2);
12952 op0 = expand_normal (arg0);
12953 op1 = expand_normal (arg1);
12954 op2 = expand_normal (arg2);
12955 mode0 = insn_data[icode].operand[0].mode;
12956 mode1 = insn_data[icode].operand[1].mode;
12957 mode2 = insn_data[icode].operand[2].mode;
12958
12959 op0 = ix86_zero_extend_to_Pmode (op0);
12960 op0 = gen_rtx_MEM (mode1, op0);
12961
12962 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12963 op0 = copy_to_mode_reg (mode0, op0);
12964 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12965 op1 = copy_to_mode_reg (mode1, op1);
12966 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12967 op2 = copy_to_mode_reg (mode2, op2);
12968 pat = GEN_FCN (icode) (op0, op1, op2);
12969 if (! pat)
12970 return 0;
12971 emit_insn (pat);
12972 return 0;
12973
12974 case IX86_BUILTIN_LDMXCSR:
12975 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12976 target = assign_386_stack_local (SImode, SLOT_TEMP);
12977 emit_move_insn (target, op0);
12978 emit_insn (gen_sse_ldmxcsr (target));
12979 return 0;
12980
12981 case IX86_BUILTIN_STMXCSR:
12982 target = assign_386_stack_local (SImode, SLOT_TEMP);
12983 emit_insn (gen_sse_stmxcsr (target));
12984 return copy_to_mode_reg (SImode, target);
12985
12986 case IX86_BUILTIN_CLFLUSH:
12987 arg0 = CALL_EXPR_ARG (exp, 0);
12988 op0 = expand_normal (arg0);
12989 icode = CODE_FOR_sse2_clflush;
12990 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12991 op0 = ix86_zero_extend_to_Pmode (op0);
12992
12993 emit_insn (gen_sse2_clflush (op0));
12994 return 0;
12995
12996 case IX86_BUILTIN_CLWB:
12997 arg0 = CALL_EXPR_ARG (exp, 0);
12998 op0 = expand_normal (arg0);
12999 icode = CODE_FOR_clwb;
13000 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13001 op0 = ix86_zero_extend_to_Pmode (op0);
13002
13003 emit_insn (gen_clwb (op0));
13004 return 0;
13005
13006 case IX86_BUILTIN_CLFLUSHOPT:
13007 arg0 = CALL_EXPR_ARG (exp, 0);
13008 op0 = expand_normal (arg0);
13009 icode = CODE_FOR_clflushopt;
13010 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13011 op0 = ix86_zero_extend_to_Pmode (op0);
13012
13013 emit_insn (gen_clflushopt (op0));
13014 return 0;
13015
13016 case IX86_BUILTIN_MONITOR:
13017 case IX86_BUILTIN_MONITORX:
13018 arg0 = CALL_EXPR_ARG (exp, 0);
13019 arg1 = CALL_EXPR_ARG (exp, 1);
13020 arg2 = CALL_EXPR_ARG (exp, 2);
13021 op0 = expand_normal (arg0);
13022 op1 = expand_normal (arg1);
13023 op2 = expand_normal (arg2);
13024 if (!REG_P (op0))
13025 op0 = ix86_zero_extend_to_Pmode (op0);
13026 if (!REG_P (op1))
13027 op1 = copy_to_mode_reg (SImode, op1);
13028 if (!REG_P (op2))
13029 op2 = copy_to_mode_reg (SImode, op2);
13030
13031 emit_insn (fcode == IX86_BUILTIN_MONITOR
13032 ? gen_sse3_monitor (Pmode, op0, op1, op2)
13033 : gen_monitorx (Pmode, op0, op1, op2));
13034 return 0;
13035
13036 case IX86_BUILTIN_MWAIT:
13037 arg0 = CALL_EXPR_ARG (exp, 0);
13038 arg1 = CALL_EXPR_ARG (exp, 1);
13039 op0 = expand_normal (arg0);
13040 op1 = expand_normal (arg1);
13041 if (!REG_P (op0))
13042 op0 = copy_to_mode_reg (SImode, op0);
13043 if (!REG_P (op1))
13044 op1 = copy_to_mode_reg (SImode, op1);
13045 emit_insn (gen_sse3_mwait (op0, op1));
13046 return 0;
13047
13048 case IX86_BUILTIN_MWAITX:
13049 arg0 = CALL_EXPR_ARG (exp, 0);
13050 arg1 = CALL_EXPR_ARG (exp, 1);
13051 arg2 = CALL_EXPR_ARG (exp, 2);
13052 op0 = expand_normal (arg0);
13053 op1 = expand_normal (arg1);
13054 op2 = expand_normal (arg2);
13055 if (!REG_P (op0))
13056 op0 = copy_to_mode_reg (SImode, op0);
13057 if (!REG_P (op1))
13058 op1 = copy_to_mode_reg (SImode, op1);
13059 if (!REG_P (op2))
13060 op2 = copy_to_mode_reg (SImode, op2);
13061 emit_insn (gen_mwaitx (op0, op1, op2));
13062 return 0;
13063
13064 case IX86_BUILTIN_UMONITOR:
13065 arg0 = CALL_EXPR_ARG (exp, 0);
13066 op0 = expand_normal (arg0);
13067
13068 op0 = ix86_zero_extend_to_Pmode (op0);
13069 emit_insn (gen_umonitor (Pmode, op0));
13070 return 0;
13071
13072 case IX86_BUILTIN_UMWAIT:
13073 case IX86_BUILTIN_TPAUSE:
13074 arg0 = CALL_EXPR_ARG (exp, 0);
13075 arg1 = CALL_EXPR_ARG (exp, 1);
13076 op0 = expand_normal (arg0);
13077 op1 = expand_normal (arg1);
13078
13079 if (!REG_P (op0))
13080 op0 = copy_to_mode_reg (SImode, op0);
13081
13082 op1 = force_reg (DImode, op1);
13083
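 /* umwait/tpause take the 64-bit TSC deadline in EDX:EAX; on 64-bit targets
 split the DImode argument into the two SImode halves the insn patterns
 expect. */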
13084 if (TARGET_64BIT)
13085 {
13086 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13087 NULL, 1, OPTAB_DIRECT);
13088 switch (fcode)
13089 {
13090 case IX86_BUILTIN_UMWAIT:
13091 icode = CODE_FOR_umwait_rex64;
13092 break;
13093 case IX86_BUILTIN_TPAUSE:
13094 icode = CODE_FOR_tpause_rex64;
13095 break;
13096 default:
13097 gcc_unreachable ();
13098 }
13099
13100 op2 = gen_lowpart (SImode, op2);
13101 op1 = gen_lowpart (SImode, op1);
13102 pat = GEN_FCN (icode) (op0, op1, op2);
13103 }
13104 else
13105 {
13106 switch (fcode)
13107 {
13108 case IX86_BUILTIN_UMWAIT:
13109 icode = CODE_FOR_umwait;
13110 break;
13111 case IX86_BUILTIN_TPAUSE:
13112 icode = CODE_FOR_tpause;
13113 break;
13114 default:
13115 gcc_unreachable ();
13116 }
13117 pat = GEN_FCN (icode) (op0, op1);
13118 }
13119
13120 if (!pat)
13121 return 0;
13122
13123 emit_insn (pat);
13124
13125 if (target == 0
13126 || !register_operand (target, QImode))
13127 target = gen_reg_rtx (QImode);
13128
13129 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13130 const0_rtx);
13131 emit_insn (gen_rtx_SET (target, pat));
13132
13133 return target;
13134
13135 case IX86_BUILTIN_TESTUI:
13136 emit_insn (gen_testui ());
13137
13138 if (target == 0
13139 || !register_operand (target, QImode))
13140 target = gen_reg_rtx (QImode);
13141
13142 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13143 const0_rtx);
13144 emit_insn (gen_rtx_SET (target, pat));
13145
13146 return target;
13147
13148 case IX86_BUILTIN_CLZERO:
13149 arg0 = CALL_EXPR_ARG (exp, 0);
13150 op0 = expand_normal (arg0);
13151 if (!REG_P (op0))
13152 op0 = ix86_zero_extend_to_Pmode (op0);
13153 emit_insn (gen_clzero (Pmode, op0));
13154 return 0;
13155
13156 case IX86_BUILTIN_CLDEMOTE:
13157 arg0 = CALL_EXPR_ARG (exp, 0);
13158 op0 = expand_normal (arg0);
13159 icode = CODE_FOR_cldemote;
13160 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13161 op0 = ix86_zero_extend_to_Pmode (op0);
13162
13163 emit_insn (gen_cldemote (op0));
13164 return 0;
13165
13166 case IX86_BUILTIN_LOADIWKEY:
13167 {
13168 arg0 = CALL_EXPR_ARG (exp, 0);
13169 arg1 = CALL_EXPR_ARG (exp, 1);
13170 arg2 = CALL_EXPR_ARG (exp, 2);
13171 arg3 = CALL_EXPR_ARG (exp, 3);
13172
13173 op0 = expand_normal (arg0);
13174 op1 = expand_normal (arg1);
13175 op2 = expand_normal (arg2);
13176 op3 = expand_normal (arg3);
13177
13178 if (!REG_P (op0))
13179 op0 = copy_to_mode_reg (V2DImode, op0);
13180 if (!REG_P (op1))
13181 op1 = copy_to_mode_reg (V2DImode, op1);
13182 if (!REG_P (op2))
13183 op2 = copy_to_mode_reg (V2DImode, op2);
13184 if (!REG_P (op3))
13185 op3 = copy_to_mode_reg (SImode, op3);
13186
13187 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
13188
13189 return 0;
13190 }
13191
13192 case IX86_BUILTIN_AESDEC128KLU8:
13193 icode = CODE_FOR_aesdec128klu8;
13194 goto aesdecenc_expand;
13195
13196 case IX86_BUILTIN_AESDEC256KLU8:
13197 icode = CODE_FOR_aesdec256klu8;
13198 goto aesdecenc_expand;
13199
13200 case IX86_BUILTIN_AESENC128KLU8:
13201 icode = CODE_FOR_aesenc128klu8;
13202 goto aesdecenc_expand;
13203
13204 case IX86_BUILTIN_AESENC256KLU8:
13205 icode = CODE_FOR_aesenc256klu8;
13206
13207 aesdecenc_expand:
13208
13209 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
13210 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
13211 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13212
13213 op0 = expand_normal (arg0);
13214 op1 = expand_normal (arg1);
13215 op2 = expand_normal (arg2);
13216
13217 if (!address_operand (op0, V2DImode))
13218 {
13219 op0 = convert_memory_address (Pmode, op0);
13220 op0 = copy_addr_to_reg (op0);
13221 }
13222 op0 = gen_rtx_MEM (V2DImode, op0);
13223
13224 if (!REG_P (op1))
13225 op1 = copy_to_mode_reg (V2DImode, op1);
13226
13227 if (!address_operand (op2, VOIDmode))
13228 {
13229 op2 = convert_memory_address (Pmode, op2);
13230 op2 = copy_addr_to_reg (op2);
13231 }
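 /* The key handle at P has no fixed machine mode; reference it as BLKmode
 memory so the whole handle is treated as used. */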
13232 op2 = gen_rtx_MEM (BLKmode, op2);
13233
13234 emit_insn (GEN_FCN (icode) (op1, op1, op2));
13235
13236 if (target == 0)
13237 target = gen_reg_rtx (QImode);
13238
13239 /* NB: For the aesenc/aesdec keylocker insns, ZF is set when a runtime
13240 error occurs; the output is then cleared for safety. */
13241 rtx_code_label *ok_label;
13242 rtx tmp;
13243
13244 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13245 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13246 ok_label = gen_label_rtx ();
13247 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13248 true, ok_label);
13249 /* The runtime error seldom occurs, so predict the OK path as hot
13250 and lay it out as the fallthrough block. */
13251 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13252
13253 emit_insn (gen_rtx_SET (op1, const0_rtx));
13254
13255 emit_label (ok_label);
13256 emit_insn (gen_rtx_SET (target, pat));
13257 emit_insn (gen_rtx_SET (op0, op1));
13258
13259 return target;
13260
13261 case IX86_BUILTIN_AESDECWIDE128KLU8:
13262 icode = CODE_FOR_aesdecwide128klu8;
13263 goto wideaesdecenc_expand;
13264
13265 case IX86_BUILTIN_AESDECWIDE256KLU8:
13266 icode = CODE_FOR_aesdecwide256klu8;
13267 goto wideaesdecenc_expand;
13268
13269 case IX86_BUILTIN_AESENCWIDE128KLU8:
13270 icode = CODE_FOR_aesencwide128klu8;
13271 goto wideaesdecenc_expand;
13272
13273 case IX86_BUILTIN_AESENCWIDE256KLU8:
13274 icode = CODE_FOR_aesencwide256klu8;
13275
13276 wideaesdecenc_expand:
13277
13278 rtx xmm_regs[8];
13279 rtx op;
13280
13281 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
13282 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
13283 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
13284
13285 op0 = expand_normal (arg0);
13286 op1 = expand_normal (arg1);
13287 op2 = expand_normal (arg2);
13288
13289 if (!address_operand (op2, VOIDmode))
13290 {
13291 op2 = convert_memory_address (Pmode, op2);
13292 op2 = copy_addr_to_reg (op2);
13293 }
13294 op2 = gen_rtx_MEM (BLKmode, op2);
13295
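 /* The wide keylocker instructions implicitly operate on xmm0..xmm7; load
 the eight input blocks into those hard registers first. */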
13296 for (i = 0; i < 8; i++)
13297 {
13298 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13299
13300 op = gen_rtx_MEM (V2DImode,
13301 plus_constant (Pmode, op1, (i * 16)));
13302
13303 emit_move_insn (xmm_regs[i], op);
13304 }
13305
13306 emit_insn (GEN_FCN (icode) (op2));
13307
13308 if (target == 0)
13309 target = gen_reg_rtx (QImode);
13310
13311 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
13312 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
13313 ok_label = gen_label_rtx ();
13314 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
13315 true, ok_label);
13316 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13317
13318 for (i = 0; i < 8; i++)
13319 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
13320
13321 emit_label (ok_label);
13322 emit_insn (gen_rtx_SET (target, pat));
13323
13324 for (i = 0; i < 8; i++)
13325 {
13326 op = gen_rtx_MEM (V2DImode,
13327 plus_constant (Pmode, op0, (i * 16)));
13328 emit_move_insn (op, xmm_regs[i]);
13329 }
13330
13331 return target;
13332
13333 case IX86_BUILTIN_ENCODEKEY128U32:
13334 {
13335 rtx op, xmm_regs[7];
13336
13337 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13338 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
13339 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
13340
13341 op0 = expand_normal (arg0);
13342 op1 = expand_normal (arg1);
13343 op2 = expand_normal (arg2);
13344
13345 if (!REG_P (op0))
13346 op0 = copy_to_mode_reg (SImode, op0);
13347
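 /* encodekey128 takes the input key in xmm0 and returns the handle in
 xmm0..xmm2. */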
13348 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13349 emit_move_insn (op, op1);
13350
13351 for (i = 0; i < 3; i++)
13352 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13353
13354 if (target == 0)
13355 target = gen_reg_rtx (SImode);
13356
13357 emit_insn (gen_encodekey128u32 (target, op0));
13358
13359 for (i = 0; i < 3; i++)
13360 {
13361 op = gen_rtx_MEM (V2DImode,
13362 plus_constant (Pmode, op2, (i * 16)));
13363 emit_move_insn (op, xmm_regs[i]);
13364 }
13365
13366 return target;
13367 }
13368 case IX86_BUILTIN_ENCODEKEY256U32:
13369 {
13370 rtx op, xmm_regs[7];
13371
13372 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13373 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
13374 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
13375 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
13376
13377 op0 = expand_normal (arg0);
13378 op1 = expand_normal (arg1);
13379 op2 = expand_normal (arg2);
13380 op3 = expand_normal (arg3);
13381
13382 if (!REG_P (op0))
13383 op0 = copy_to_mode_reg (SImode, op0);
13384
13385 /* Force keylow and keyhi into xmm0 and xmm1. */
13386 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13387 emit_move_insn (op, op1);
13388 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
13389 emit_move_insn (op, op2);
13390
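 /* The resulting handle comes back in xmm0..xmm3. */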
13391 for (i = 0; i < 4; i++)
13392 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13393
13394 if (target == 0)
13395 target = gen_reg_rtx (SImode);
13396
13397 emit_insn (gen_encodekey256u32 (target, op0));
13398
13399 for (i = 0; i < 4; i++)
13400 {
13401 op = gen_rtx_MEM (V2DImode,
13402 plus_constant (Pmode, op3, (i * 16)));
13403 emit_move_insn (op, xmm_regs[i]);
13404 }
13405
13406 return target;
13407 }
13408
13409 case IX86_BUILTIN_PREFETCH:
13410 {
13411 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13412 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13413 arg2 = CALL_EXPR_ARG (exp, 2); // const int
13414 arg3 = CALL_EXPR_ARG (exp, 3); // const int
13415
13416 op0 = expand_normal (arg0);
13417 op1 = expand_normal (arg1);
13418 op2 = expand_normal (arg2);
13419 op3 = expand_normal (arg3);
13420
13421 if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
13422 {
13423 error ("second, third and fourth argument must be a const");
13424 return const0_rtx;
13425 }
13426
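 /* A fourth argument of 1 requests an instruction prefetch; any other
 value is handled as a data prefetch. */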
13427 if (INTVAL (op3) == 1)
13428 {
13429 if (INTVAL (op2) < 2 || INTVAL (op2) > 3)
13430 {
13431 error ("invalid third argument");
13432 return const0_rtx;
13433 }
13434
13435 if (TARGET_64BIT && TARGET_PREFETCHI
13436 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13437 emit_insn (gen_prefetchi (op0, op2));
13438 else
13439 {
13440 warning (0, "instruction prefetch applies when in 64-bit mode"
13441 " with RIP-relative addressing and"
13442 " option %<-mprefetchi%>;"
13443 " they stay NOPs otherwise");
13444 emit_insn (gen_nop ());
13445 }
13446 }
13447 else
13448 {
13449 if (!address_operand (op0, VOIDmode))
13450 {
13451 op0 = convert_memory_address (Pmode, op0);
13452 op0 = copy_addr_to_reg (op0);
13453 }
13454
13455 if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
13456 {
13457 warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
13458 op2 = const0_rtx;
13459 }
13460
13461 if (TARGET_3DNOW || TARGET_PREFETCH_SSE
13462 || TARGET_PRFCHW || TARGET_PREFETCHWT1)
13463 emit_insn (gen_prefetch (op0, op1, op2));
13464 else if (!MEM_P (op0) && side_effects_p (op0))
13465 /* Don't do anything with direct references to volatile memory,
13466 but generate code to handle other side effects. */
13467 emit_insn (op0);
13468 }
13469
13470 return 0;
13471 }
13472
13473 case IX86_BUILTIN_PREFETCHI:
13474 {
13475 arg0 = CALL_EXPR_ARG (exp, 0); // const void *
13476 arg1 = CALL_EXPR_ARG (exp, 1); // const int
13477
13478 op0 = expand_normal (arg0);
13479 op1 = expand_normal (arg1);
13480
13481 if (!CONST_INT_P (op1))
13482 {
13483 error ("second argument must be a const");
13484 return const0_rtx;
13485 }
13486
13487 /* GOT/PLT_PIC should not be available for instruction prefetch;
13488 it must be a real instruction address. */
13489 if (TARGET_64BIT
13490 && local_func_symbolic_operand (op0, GET_MODE (op0)))
13491 emit_insn (gen_prefetchi (op0, op1));
13492 else
13493 {
13494 /* Ignore the hint. */
13495 warning (0, "instruction prefetch applies when in 64-bit mode"
13496 " with RIP-relative addressing and"
13497 " option %<-mprefetchi%>;"
13498 " they stay NOPs otherwise");
13499 emit_insn (gen_nop ());
13500 }
13501
13502 return 0;
13503 }
13504
13505 case IX86_BUILTIN_URDMSR:
13506 case IX86_BUILTIN_UWRMSR:
13507 {
13508 arg0 = CALL_EXPR_ARG (exp, 0);
13509 op0 = expand_normal (arg0);
13510
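 /* An MSR address that fits in 32 bits may stay a constant operand;
 otherwise it has to live in a register. */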
13511 if (CONST_INT_P (op0))
13512 {
13513 unsigned HOST_WIDE_INT val = UINTVAL (op0);
13514 if (val > 0xffffffff)
13515 op0 = force_reg (DImode, op0);
13516 }
13517 else
13518 op0 = force_reg (DImode, op0);
13519
13520 if (fcode == IX86_BUILTIN_UWRMSR)
13521 {
13522 arg1 = CALL_EXPR_ARG (exp, 1);
13523 op1 = expand_normal (arg1);
13524 op1 = force_reg (DImode, op1);
13525 icode = CODE_FOR_uwrmsr;
13526 target = 0;
13527 }
13528 else
13529 {
13530 if (target == 0)
13531 target = gen_reg_rtx (DImode);
13532 icode = CODE_FOR_urdmsr;
13533 op1 = op0;
13534 op0 = target;
13535 }
13536 emit_insn (GEN_FCN (icode) (op0, op1));
13537 return target;
13538 }
13539
13540 case IX86_BUILTIN_VEC_INIT_V2SI:
13541 case IX86_BUILTIN_VEC_INIT_V4HI:
13542 case IX86_BUILTIN_VEC_INIT_V8QI:
13543 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
13544
13545 case IX86_BUILTIN_VEC_EXT_V2DF:
13546 case IX86_BUILTIN_VEC_EXT_V2DI:
13547 case IX86_BUILTIN_VEC_EXT_V4SF:
13548 case IX86_BUILTIN_VEC_EXT_V4SI:
13549 case IX86_BUILTIN_VEC_EXT_V8HI:
13550 case IX86_BUILTIN_VEC_EXT_V2SI:
13551 case IX86_BUILTIN_VEC_EXT_V4HI:
13552 case IX86_BUILTIN_VEC_EXT_V16QI:
13553 return ix86_expand_vec_ext_builtin (exp, target);
13554
13555 case IX86_BUILTIN_VEC_SET_V2DI:
13556 case IX86_BUILTIN_VEC_SET_V4SF:
13557 case IX86_BUILTIN_VEC_SET_V4SI:
13558 case IX86_BUILTIN_VEC_SET_V8HI:
13559 case IX86_BUILTIN_VEC_SET_V4HI:
13560 case IX86_BUILTIN_VEC_SET_V16QI:
13561 return ix86_expand_vec_set_builtin (exp);
13562
13563 case IX86_BUILTIN_NANQ:
13564 case IX86_BUILTIN_NANSQ:
13565 return expand_call (exp, target, ignore);
13566
13567 case IX86_BUILTIN_RDPID:
13568
13569 op0 = gen_reg_rtx (word_mode);
13570
13571 if (TARGET_64BIT)
13572 {
13573 insn = gen_rdpid_rex64 (op0);
13574 op0 = convert_to_mode (SImode, op0, 1);
13575 }
13576 else
13577 insn = gen_rdpid (op0);
13578
13579 emit_insn (insn);
13580
13581 if (target == 0
13582 || !register_operand (target, SImode))
13583 target = gen_reg_rtx (SImode);
13584
13585 emit_move_insn (target, op0);
13586 return target;
13587
13588 case IX86_BUILTIN_2INTERSECTD512:
13589 case IX86_BUILTIN_2INTERSECTQ512:
13590 case IX86_BUILTIN_2INTERSECTD256:
13591 case IX86_BUILTIN_2INTERSECTQ256:
13592 case IX86_BUILTIN_2INTERSECTD128:
13593 case IX86_BUILTIN_2INTERSECTQ128:
13594 arg0 = CALL_EXPR_ARG (exp, 0);
13595 arg1 = CALL_EXPR_ARG (exp, 1);
13596 arg2 = CALL_EXPR_ARG (exp, 2);
13597 arg3 = CALL_EXPR_ARG (exp, 3);
13598 op0 = expand_normal (arg0);
13599 op1 = expand_normal (arg1);
13600 op2 = expand_normal (arg2);
13601 op3 = expand_normal (arg3);
13602
13603 if (!address_operand (op0, VOIDmode))
13604 {
13605 op0 = convert_memory_address (Pmode, op0);
13606 op0 = copy_addr_to_reg (op0);
13607 }
13608 if (!address_operand (op1, VOIDmode))
13609 {
13610 op1 = convert_memory_address (Pmode, op1);
13611 op1 = copy_addr_to_reg (op1);
13612 }
13613
13614 switch (fcode)
13615 {
13616 case IX86_BUILTIN_2INTERSECTD512:
13617 mode4 = P2HImode;
13618 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
13619 break;
13620 case IX86_BUILTIN_2INTERSECTQ512:
13621 mode4 = P2QImode;
13622 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
13623 break;
13624 case IX86_BUILTIN_2INTERSECTD256:
13625 mode4 = P2QImode;
13626 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
13627 break;
13628 case IX86_BUILTIN_2INTERSECTQ256:
13629 mode4 = P2QImode;
13630 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
13631 break;
13632 case IX86_BUILTIN_2INTERSECTD128:
13633 mode4 = P2QImode;
13634 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
13635 break;
13636 case IX86_BUILTIN_2INTERSECTQ128:
13637 mode4 = P2QImode;
13638 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
13639 break;
13640 default:
13641 gcc_unreachable ();
13642 }
13643
13644 mode2 = insn_data[icode].operand[1].mode;
13645 mode3 = insn_data[icode].operand[2].mode;
13646 if (!insn_data[icode].operand[1].predicate (op2, mode2))
13647 op2 = copy_to_mode_reg (mode2, op2);
13648 if (!insn_data[icode].operand[2].predicate (op3, mode3))
13649 op3 = copy_to_mode_reg (mode3, op3);
13650
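 /* The 2intersect patterns produce a pair of masks held in one
 partial-integer pseudo; store its low and high halves through the two
 result pointers. */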
13651 op4 = gen_reg_rtx (mode4);
13652 emit_insn (GEN_FCN (icode) (op4, op2, op3));
13653 mode0 = mode4 == P2HImode ? HImode : QImode;
13654 emit_move_insn (gen_rtx_MEM (mode0, op0),
13655 gen_lowpart (mode0, op4));
13656 emit_move_insn (gen_rtx_MEM (mode0, op1),
13657 gen_highpart (mode0, op4));
13658
13659 return 0;
13660
13661 case IX86_BUILTIN_RDPMC:
13662 case IX86_BUILTIN_RDTSC:
13663 case IX86_BUILTIN_RDTSCP:
13664 case IX86_BUILTIN_XGETBV:
13665
13666 op0 = gen_reg_rtx (DImode);
13667 op1 = gen_reg_rtx (DImode);
13668
13669 if (fcode == IX86_BUILTIN_RDPMC)
13670 {
13671 arg0 = CALL_EXPR_ARG (exp, 0);
13672 op2 = expand_normal (arg0);
13673 if (!register_operand (op2, SImode))
13674 op2 = copy_to_mode_reg (SImode, op2);
13675
13676 insn = (TARGET_64BIT
13677 ? gen_rdpmc_rex64 (op0, op1, op2)
13678 : gen_rdpmc (op0, op2));
13679 emit_insn (insn);
13680 }
13681 else if (fcode == IX86_BUILTIN_XGETBV)
13682 {
13683 arg0 = CALL_EXPR_ARG (exp, 0);
13684 op2 = expand_normal (arg0);
13685 if (!register_operand (op2, SImode))
13686 op2 = copy_to_mode_reg (SImode, op2);
13687
13688 insn = (TARGET_64BIT
13689 ? gen_xgetbv_rex64 (op0, op1, op2)
13690 : gen_xgetbv (op0, op2));
13691 emit_insn (insn);
13692 }
13693 else if (fcode == IX86_BUILTIN_RDTSC)
13694 {
13695 insn = (TARGET_64BIT
13696 ? gen_rdtsc_rex64 (op0, op1)
13697 : gen_rdtsc (op0));
13698 emit_insn (insn);
13699 }
13700 else
13701 {
13702 op2 = gen_reg_rtx (SImode);
13703
13704 insn = (TARGET_64BIT
13705 ? gen_rdtscp_rex64 (op0, op1, op2)
13706 : gen_rdtscp (op0, op2));
13707 emit_insn (insn);
13708
13709 arg0 = CALL_EXPR_ARG (exp, 0);
13710 op4 = expand_normal (arg0);
13711 if (!address_operand (op4, VOIDmode))
13712 {
13713 op4 = convert_memory_address (Pmode, op4);
13714 op4 = copy_addr_to_reg (op4);
13715 }
13716 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13717 }
13718
13719 if (target == 0
13720 || !register_operand (target, DImode))
13721 target = gen_reg_rtx (DImode);
13722
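 /* On 64-bit targets the counter value comes back split across EDX:EAX;
 merge the two halves into a single DImode result. */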
13723 if (TARGET_64BIT)
13724 {
13725 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13726 op1, 1, OPTAB_DIRECT);
13727 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13728 op0, 1, OPTAB_DIRECT);
13729 }
13730
13731 emit_move_insn (target, op0);
13732 return target;
13733
13734 case IX86_BUILTIN_ENQCMD:
13735 case IX86_BUILTIN_ENQCMDS:
13736 case IX86_BUILTIN_MOVDIR64B:
13737
13738 arg0 = CALL_EXPR_ARG (exp, 0);
13739 arg1 = CALL_EXPR_ARG (exp, 1);
13740 op0 = expand_normal (arg0);
13741 op1 = expand_normal (arg1);
13742
13743 op0 = ix86_zero_extend_to_Pmode (op0);
13744 if (!address_operand (op1, VOIDmode))
13745 {
13746 op1 = convert_memory_address (Pmode, op1);
13747 op1 = copy_addr_to_reg (op1);
13748 }
13749 op1 = gen_rtx_MEM (XImode, op1);
13750
13751 if (fcode == IX86_BUILTIN_MOVDIR64B)
13752 {
13753 emit_insn (gen_movdir64b (Pmode, op0, op1));
13754 return 0;
13755 }
13756 else
13757 {
13758 if (target == 0
13759 || !register_operand (target, SImode))
13760 target = gen_reg_rtx (SImode);
13761
13762 emit_move_insn (target, const0_rtx);
13763 target = gen_rtx_SUBREG (QImode, target, 0);
13764
13765 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13766 ? UNSPECV_ENQCMD
13767 : UNSPECV_ENQCMDS);
13768 icode = code_for_enqcmd (unspecv, Pmode);
13769 emit_insn (GEN_FCN (icode) (op0, op1));
13770
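 /* The enqueue status is reported in ZF; copy that flag into the low byte
 of the zero-extended result via a STRICT_LOW_PART set. */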
13771 emit_insn
13772 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13773 gen_rtx_fmt_ee (EQ, QImode,
13774 gen_rtx_REG (CCZmode, FLAGS_REG),
13775 const0_rtx)));
13776 return SUBREG_REG (target);
13777 }
13778
13779 case IX86_BUILTIN_FXSAVE:
13780 case IX86_BUILTIN_FXRSTOR:
13781 case IX86_BUILTIN_FXSAVE64:
13782 case IX86_BUILTIN_FXRSTOR64:
13783 case IX86_BUILTIN_FNSTENV:
13784 case IX86_BUILTIN_FLDENV:
13785 mode0 = BLKmode;
13786 switch (fcode)
13787 {
13788 case IX86_BUILTIN_FXSAVE:
13789 icode = CODE_FOR_fxsave;
13790 break;
13791 case IX86_BUILTIN_FXRSTOR:
13792 icode = CODE_FOR_fxrstor;
13793 break;
13794 case IX86_BUILTIN_FXSAVE64:
13795 icode = CODE_FOR_fxsave64;
13796 break;
13797 case IX86_BUILTIN_FXRSTOR64:
13798 icode = CODE_FOR_fxrstor64;
13799 break;
13800 case IX86_BUILTIN_FNSTENV:
13801 icode = CODE_FOR_fnstenv;
13802 break;
13803 case IX86_BUILTIN_FLDENV:
13804 icode = CODE_FOR_fldenv;
13805 break;
13806 default:
13807 gcc_unreachable ();
13808 }
13809
13810 arg0 = CALL_EXPR_ARG (exp, 0);
13811 op0 = expand_normal (arg0);
13812
13813 if (!address_operand (op0, VOIDmode))
13814 {
13815 op0 = convert_memory_address (Pmode, op0);
13816 op0 = copy_addr_to_reg (op0);
13817 }
13818 op0 = gen_rtx_MEM (mode0, op0);
13819
13820 pat = GEN_FCN (icode) (op0);
13821 if (pat)
13822 emit_insn (pat);
13823 return 0;
13824
13825 case IX86_BUILTIN_XSETBV:
13826 arg0 = CALL_EXPR_ARG (exp, 0);
13827 arg1 = CALL_EXPR_ARG (exp, 1);
13828 op0 = expand_normal (arg0);
13829 op1 = expand_normal (arg1);
13830
13831 if (!REG_P (op0))
13832 op0 = copy_to_mode_reg (SImode, op0);
13833
13834 op1 = force_reg (DImode, op1);
13835
13836 if (TARGET_64BIT)
13837 {
13838 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13839 NULL, 1, OPTAB_DIRECT);
13840
13841 icode = CODE_FOR_xsetbv_rex64;
13842
13843 op2 = gen_lowpart (SImode, op2);
13844 op1 = gen_lowpart (SImode, op1);
13845 pat = GEN_FCN (icode) (op0, op1, op2);
13846 }
13847 else
13848 {
13849 icode = CODE_FOR_xsetbv;
13850
13851 pat = GEN_FCN (icode) (op0, op1);
13852 }
13853 if (pat)
13854 emit_insn (pat);
13855 return 0;
13856
13857 case IX86_BUILTIN_XSAVE:
13858 case IX86_BUILTIN_XRSTOR:
13859 case IX86_BUILTIN_XSAVE64:
13860 case IX86_BUILTIN_XRSTOR64:
13861 case IX86_BUILTIN_XSAVEOPT:
13862 case IX86_BUILTIN_XSAVEOPT64:
13863 case IX86_BUILTIN_XSAVES:
13864 case IX86_BUILTIN_XRSTORS:
13865 case IX86_BUILTIN_XSAVES64:
13866 case IX86_BUILTIN_XRSTORS64:
13867 case IX86_BUILTIN_XSAVEC:
13868 case IX86_BUILTIN_XSAVEC64:
13869 arg0 = CALL_EXPR_ARG (exp, 0);
13870 arg1 = CALL_EXPR_ARG (exp, 1);
13871 op0 = expand_normal (arg0);
13872 op1 = expand_normal (arg1);
13873
13874 if (!address_operand (op0, VOIDmode))
13875 {
13876 op0 = convert_memory_address (Pmode, op0);
13877 op0 = copy_addr_to_reg (op0);
13878 }
13879 op0 = gen_rtx_MEM (BLKmode, op0);
13880
13881 op1 = force_reg (DImode, op1);
13882
13883 if (TARGET_64BIT)
13884 {
13885 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13886 NULL, 1, OPTAB_DIRECT);
13887 switch (fcode)
13888 {
13889 case IX86_BUILTIN_XSAVE:
13890 icode = CODE_FOR_xsave_rex64;
13891 break;
13892 case IX86_BUILTIN_XRSTOR:
13893 icode = CODE_FOR_xrstor_rex64;
13894 break;
13895 case IX86_BUILTIN_XSAVE64:
13896 icode = CODE_FOR_xsave64;
13897 break;
13898 case IX86_BUILTIN_XRSTOR64:
13899 icode = CODE_FOR_xrstor64;
13900 break;
13901 case IX86_BUILTIN_XSAVEOPT:
13902 icode = CODE_FOR_xsaveopt_rex64;
13903 break;
13904 case IX86_BUILTIN_XSAVEOPT64:
13905 icode = CODE_FOR_xsaveopt64;
13906 break;
13907 case IX86_BUILTIN_XSAVES:
13908 icode = CODE_FOR_xsaves_rex64;
13909 break;
13910 case IX86_BUILTIN_XRSTORS:
13911 icode = CODE_FOR_xrstors_rex64;
13912 break;
13913 case IX86_BUILTIN_XSAVES64:
13914 icode = CODE_FOR_xsaves64;
13915 break;
13916 case IX86_BUILTIN_XRSTORS64:
13917 icode = CODE_FOR_xrstors64;
13918 break;
13919 case IX86_BUILTIN_XSAVEC:
13920 icode = CODE_FOR_xsavec_rex64;
13921 break;
13922 case IX86_BUILTIN_XSAVEC64:
13923 icode = CODE_FOR_xsavec64;
13924 break;
13925 default:
13926 gcc_unreachable ();
13927 }
13928
13929 op2 = gen_lowpart (SImode, op2);
13930 op1 = gen_lowpart (SImode, op1);
13931 pat = GEN_FCN (icode) (op0, op1, op2);
13932 }
13933 else
13934 {
13935 switch (fcode)
13936 {
13937 case IX86_BUILTIN_XSAVE:
13938 icode = CODE_FOR_xsave;
13939 break;
13940 case IX86_BUILTIN_XRSTOR:
13941 icode = CODE_FOR_xrstor;
13942 break;
13943 case IX86_BUILTIN_XSAVEOPT:
13944 icode = CODE_FOR_xsaveopt;
13945 break;
13946 case IX86_BUILTIN_XSAVES:
13947 icode = CODE_FOR_xsaves;
13948 break;
13949 case IX86_BUILTIN_XRSTORS:
13950 icode = CODE_FOR_xrstors;
13951 break;
13952 case IX86_BUILTIN_XSAVEC:
13953 icode = CODE_FOR_xsavec;
13954 break;
13955 default:
13956 gcc_unreachable ();
13957 }
13958 pat = GEN_FCN (icode) (op0, op1);
13959 }
13960
13961 if (pat)
13962 emit_insn (pat);
13963 return 0;
13964
13965 case IX86_BUILTIN_LLWPCB:
13966 arg0 = CALL_EXPR_ARG (exp, 0);
13967 op0 = expand_normal (arg0);
13968
13969 if (!register_operand (op0, Pmode))
13970 op0 = ix86_zero_extend_to_Pmode (op0);
13971 emit_insn (gen_lwp_llwpcb (Pmode, op0));
13972 return 0;
13973
13974 case IX86_BUILTIN_SLWPCB:
13975 if (!target
13976 || !register_operand (target, Pmode))
13977 target = gen_reg_rtx (Pmode);
13978 emit_insn (gen_lwp_slwpcb (Pmode, target));
13979 return target;
13980
13981 case IX86_BUILTIN_LWPVAL32:
13982 case IX86_BUILTIN_LWPVAL64:
13983 case IX86_BUILTIN_LWPINS32:
13984 case IX86_BUILTIN_LWPINS64:
13985 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13986 || fcode == IX86_BUILTIN_LWPINS32)
13987 ? SImode : DImode);
13988
13989 if (fcode == IX86_BUILTIN_LWPVAL32
13990 || fcode == IX86_BUILTIN_LWPVAL64)
13991 icode = code_for_lwp_lwpval (mode);
13992 else
13993 icode = code_for_lwp_lwpins (mode);
13994
13995 arg0 = CALL_EXPR_ARG (exp, 0);
13996 arg1 = CALL_EXPR_ARG (exp, 1);
13997 arg2 = CALL_EXPR_ARG (exp, 2);
13998 op0 = expand_normal (arg0);
13999 op1 = expand_normal (arg1);
14000 op2 = expand_normal (arg2);
14001 mode0 = insn_data[icode].operand[0].mode;
14002
14003 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14004 op0 = copy_to_mode_reg (mode0, op0);
14005 if (!insn_data[icode].operand[1].predicate (op1, SImode))
14006 op1 = copy_to_mode_reg (SImode, op1);
14007
14008 if (!CONST_INT_P (op2))
14009 {
14010 error ("the last argument must be a 32-bit immediate");
14011 return const0_rtx;
14012 }
14013
14014 emit_insn (GEN_FCN (icode) (op0, op1, op2));
14015
14016 if (fcode == IX86_BUILTIN_LWPINS32
14017 || fcode == IX86_BUILTIN_LWPINS64)
14018 {
14019 if (target == 0
14020 || !nonimmediate_operand (target, QImode))
14021 target = gen_reg_rtx (QImode);
14022
14023 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14024 const0_rtx);
14025 emit_insn (gen_rtx_SET (target, pat));
14026
14027 return target;
14028 }
14029 else
14030 return 0;
14031
14032 case IX86_BUILTIN_BEXTRI32:
14033 case IX86_BUILTIN_BEXTRI64:
14034 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
14035
14036 arg0 = CALL_EXPR_ARG (exp, 0);
14037 arg1 = CALL_EXPR_ARG (exp, 1);
14038 op0 = expand_normal (arg0);
14039 op1 = expand_normal (arg1);
14040
14041 if (!CONST_INT_P (op1))
14042 {
14043 error ("last argument must be an immediate");
14044 return const0_rtx;
14045 }
14046 else
14047 {
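 /* The control immediate encodes the start bit in its low byte and the
 field length in the byte above it. */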
14048 unsigned char lsb_index = UINTVAL (op1);
14049 unsigned char length = UINTVAL (op1) >> 8;
14050
14051 unsigned char bitsize = GET_MODE_BITSIZE (mode);
14052
14053 icode = code_for_tbm_bextri (mode);
14054
14055 mode1 = insn_data[icode].operand[1].mode;
14056 if (!insn_data[icode].operand[1].predicate (op0, mode1))
14057 op0 = copy_to_mode_reg (mode1, op0);
14058
14059 mode0 = insn_data[icode].operand[0].mode;
14060 if (target == 0
14061 || !register_operand (target, mode0))
14062 target = gen_reg_rtx (mode0);
14063
14064 if (length == 0 || lsb_index >= bitsize)
14065 {
14066 emit_move_insn (target, const0_rtx);
14067 return target;
14068 }
14069
14070 if (length + lsb_index > bitsize)
14071 length = bitsize - lsb_index;
14072
14073 op1 = GEN_INT (length);
14074 op2 = GEN_INT (lsb_index);
14075
14076 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
14077 return target;
14078 }
14079
14080 case IX86_BUILTIN_RDRAND16_STEP:
14081 mode = HImode;
14082 goto rdrand_step;
14083
14084 case IX86_BUILTIN_RDRAND32_STEP:
14085 mode = SImode;
14086 goto rdrand_step;
14087
14088 case IX86_BUILTIN_RDRAND64_STEP:
14089 mode = DImode;
14090
14091rdrand_step:
14092 arg0 = CALL_EXPR_ARG (exp, 0);
14093 op1 = expand_normal (arg0);
14094 if (!address_operand (op1, VOIDmode))
14095 {
14096 op1 = convert_memory_address (Pmode, op1);
14097 op1 = copy_addr_to_reg (op1);
14098 }
14099
14100 op0 = gen_reg_rtx (mode);
14101 emit_insn (gen_rdrand (mode, op0));
14102
14103 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
14104
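 /* rdrand signals success in CF and leaves zero in the destination on
 failure, so a carry-based conditional move between const1 and the value
 read yields the builtin's 0/1 status. */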
14105 op1 = force_reg (SImode, const1_rtx);
14106
14107 /* Emit SImode conditional move. */
14108 if (mode == HImode)
14109 {
14110 if (TARGET_ZERO_EXTEND_WITH_AND
14111 && optimize_function_for_speed_p (cfun))
14112 {
14113 op2 = force_reg (SImode, const0_rtx);
14114
14115 emit_insn (gen_movstricthi
14116 (gen_lowpart (HImode, op2), op0));
14117 }
14118 else
14119 {
14120 op2 = gen_reg_rtx (SImode);
14121
14122 emit_insn (gen_zero_extendhisi2 (op2, op0));
14123 }
14124 }
14125 else if (mode == SImode)
14126 op2 = op0;
14127 else
14128 op2 = gen_rtx_SUBREG (SImode, op0, 0);
14129
14130 if (target == 0
14131 || !register_operand (target, SImode))
14132 target = gen_reg_rtx (SImode);
14133
14134 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
14135 const0_rtx);
14136 emit_insn (gen_rtx_SET (target,
14137 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
14138 return target;
14139
14140 case IX86_BUILTIN_RDSEED16_STEP:
14141 mode = HImode;
14142 goto rdseed_step;
14143
14144 case IX86_BUILTIN_RDSEED32_STEP:
14145 mode = SImode;
14146 goto rdseed_step;
14147
14148 case IX86_BUILTIN_RDSEED64_STEP:
14149 mode = DImode;
14150
14151rdseed_step:
14152 arg0 = CALL_EXPR_ARG (exp, 0);
14153 op1 = expand_normal (arg0);
14154 if (!address_operand (op1, VOIDmode))
14155 {
14156 op1 = convert_memory_address (Pmode, op1);
14157 op1 = copy_addr_to_reg (op1);
14158 }
14159
14160 op0 = gen_reg_rtx (mode);
14161 emit_insn (gen_rdseed (mode, op0));
14162
14163 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
14164
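 /* rdseed reports success in the carry flag; materialize CF with a setcc
 and zero-extend it to the SImode result. */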
14165 op2 = gen_reg_rtx (QImode);
14166
14167 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14168 const0_rtx);
14169 emit_insn (gen_rtx_SET (op2, pat));
14170
14171 if (target == 0
14172 || !register_operand (target, SImode))
14173 target = gen_reg_rtx (SImode);
14174
14175 emit_insn (gen_zero_extendqisi2 (target, op2));
14176 return target;
14177
14178 case IX86_BUILTIN_SBB32:
14179 icode = CODE_FOR_subborrowsi;
14180 icode2 = CODE_FOR_subborrowsi_0;
14181 mode0 = SImode;
14182 mode1 = DImode;
14183 mode2 = CCmode;
14184 goto handlecarry;
14185
14186 case IX86_BUILTIN_SBB64:
14187 icode = CODE_FOR_subborrowdi;
14188 icode2 = CODE_FOR_subborrowdi_0;
14189 mode0 = DImode;
14190 mode1 = TImode;
14191 mode2 = CCmode;
14192 goto handlecarry;
14193
14194 case IX86_BUILTIN_ADDCARRYX32:
14195 icode = CODE_FOR_addcarrysi;
14196 icode2 = CODE_FOR_addcarrysi_0;
14197 mode0 = SImode;
14198 mode1 = DImode;
14199 mode2 = CCCmode;
14200 goto handlecarry;
14201
14202 case IX86_BUILTIN_ADDCARRYX64:
14203 icode = CODE_FOR_addcarrydi;
14204 icode2 = CODE_FOR_addcarrydi_0;
14205 mode0 = DImode;
14206 mode1 = TImode;
14207 mode2 = CCCmode;
14208
14209 handlecarry:
14210 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
14211 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
14212 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
14213 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
14214
14215 op1 = expand_normal (arg0);
14216
14217 op2 = expand_normal (arg1);
14218 if (!register_operand (op2, mode0))
14219 op2 = copy_to_mode_reg (mode0, op2);
14220
14221 op3 = expand_normal (arg2);
14222 if (!register_operand (op3, mode0))
14223 op3 = copy_to_mode_reg (mode0, op3);
14224
14225 op4 = expand_normal (arg3);
14226 if (!address_operand (op4, VOIDmode))
14227 {
14228 op4 = convert_memory_address (Pmode, op4);
14229 op4 = copy_addr_to_reg (op4);
14230 }
14231
14232 op0 = gen_reg_rtx (mode0);
14233 if (op1 == const0_rtx)
14234 {
14235 /* If arg0 is 0, optimize right away into add or sub
14236 instruction that sets CCCmode flags. */
14237 op1 = gen_rtx_REG (mode2, FLAGS_REG);
14238 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
14239 }
14240 else
14241 {
14242 /* Generate CF from input operand. */
14243 ix86_expand_carry (op1);
14244
14245 /* Generate instruction that consumes CF. */
14246 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
14247 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
14248 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
14249 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
14250 }
14251
14252 /* Return current CF value. */
14253 if (target == 0)
14254 target = gen_reg_rtx (QImode);
14255
14256 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
14257 emit_insn (gen_rtx_SET (target, pat));
14258
14259 /* Store the result. */
14260 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
14261
14262 return target;
14263
14264 case IX86_BUILTIN_READ_FLAGS:
14265 if (ignore)
14266 return const0_rtx;
14267
14268 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
14269
14270 if (optimize
14271 || target == NULL_RTX
14272 || !nonimmediate_operand (target, word_mode)
14273 || GET_MODE (target) != word_mode)
14274 target = gen_reg_rtx (word_mode);
14275
14276 emit_insn (gen_pop (target));
14277 return target;
14278
14279 case IX86_BUILTIN_WRITE_FLAGS:
14280
14281 arg0 = CALL_EXPR_ARG (exp, 0);
14282 op0 = expand_normal (arg0);
14283 if (!general_no_elim_operand (op0, word_mode))
14284 op0 = copy_to_mode_reg (word_mode, op0);
14285
14286 emit_insn (gen_push (op0));
14287 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
14288 return 0;
14289
14290 case IX86_BUILTIN_KTESTC8:
14291 icode = CODE_FOR_ktestqi;
14292 mode3 = CCCmode;
14293 goto kortest;
14294
14295 case IX86_BUILTIN_KTESTZ8:
14296 icode = CODE_FOR_ktestqi;
14297 mode3 = CCZmode;
14298 goto kortest;
14299
14300 case IX86_BUILTIN_KTESTC16:
14301 icode = CODE_FOR_ktesthi;
14302 mode3 = CCCmode;
14303 goto kortest;
14304
14305 case IX86_BUILTIN_KTESTZ16:
14306 icode = CODE_FOR_ktesthi;
14307 mode3 = CCZmode;
14308 goto kortest;
14309
14310 case IX86_BUILTIN_KTESTC32:
14311 icode = CODE_FOR_ktestsi;
14312 mode3 = CCCmode;
14313 goto kortest;
14314
14315 case IX86_BUILTIN_KTESTZ32:
14316 icode = CODE_FOR_ktestsi;
14317 mode3 = CCZmode;
14318 goto kortest;
14319
14320 case IX86_BUILTIN_KTESTC64:
14321 icode = CODE_FOR_ktestdi;
14322 mode3 = CCCmode;
14323 goto kortest;
14324
14325 case IX86_BUILTIN_KTESTZ64:
14326 icode = CODE_FOR_ktestdi;
14327 mode3 = CCZmode;
14328 goto kortest;
14329
14330 case IX86_BUILTIN_KORTESTC8:
14331 icode = CODE_FOR_kortestqi;
14332 mode3 = CCCmode;
14333 goto kortest;
14334
14335 case IX86_BUILTIN_KORTESTZ8:
14336 icode = CODE_FOR_kortestqi;
14337 mode3 = CCZmode;
14338 goto kortest;
14339
14340 case IX86_BUILTIN_KORTESTC16:
14341 icode = CODE_FOR_kortesthi;
14342 mode3 = CCCmode;
14343 goto kortest;
14344
14345 case IX86_BUILTIN_KORTESTZ16:
14346 icode = CODE_FOR_kortesthi;
14347 mode3 = CCZmode;
14348 goto kortest;
14349
14350 case IX86_BUILTIN_KORTESTC32:
14351 icode = CODE_FOR_kortestsi;
14352 mode3 = CCCmode;
14353 goto kortest;
14354
14355 case IX86_BUILTIN_KORTESTZ32:
14356 icode = CODE_FOR_kortestsi;
14357 mode3 = CCZmode;
14358 goto kortest;
14359
14360 case IX86_BUILTIN_KORTESTC64:
14361 icode = CODE_FOR_kortestdi;
14362 mode3 = CCCmode;
14363 goto kortest;
14364
14365 case IX86_BUILTIN_KORTESTZ64:
14366 icode = CODE_FOR_kortestdi;
14367 mode3 = CCZmode;
14368
14369 kortest:
14370 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
14371 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
14372 op0 = expand_normal (arg0);
14373 op1 = expand_normal (arg1);
14374
14375 mode0 = insn_data[icode].operand[0].mode;
14376 mode1 = insn_data[icode].operand[1].mode;
14377
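 /* The mask arguments may arrive as integers in a wider mode; force them
 into registers and take the lowpart in the mask mode the insn wants. */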
14378 if (GET_MODE (op0) != VOIDmode)
14379 op0 = force_reg (GET_MODE (op0), op0);
14380
14381 op0 = gen_lowpart (mode0, op0);
14382
14383 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14384 op0 = copy_to_mode_reg (mode0, op0);
14385
14386 if (GET_MODE (op1) != VOIDmode)
14387 op1 = force_reg (GET_MODE (op1), op1);
14388
14389 op1 = gen_lowpart (mode1, op1);
14390
14391 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14392 op1 = copy_to_mode_reg (mode1, op1);
14393
14394 target = gen_reg_rtx (QImode);
14395
14396 /* Emit kortest. */
14397 emit_insn (GEN_FCN (icode) (op0, op1));
14398 /* And use setcc to return result from flags. */
14399 ix86_expand_setcc (target, EQ,
14400 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
14401 return target;
14402
14403 case IX86_BUILTIN_GATHERSIV2DF:
14404 icode = CODE_FOR_avx2_gathersiv2df;
14405 goto gather_gen;
14406 case IX86_BUILTIN_GATHERSIV4DF:
14407 icode = CODE_FOR_avx2_gathersiv4df;
14408 goto gather_gen;
14409 case IX86_BUILTIN_GATHERDIV2DF:
14410 icode = CODE_FOR_avx2_gatherdiv2df;
14411 goto gather_gen;
14412 case IX86_BUILTIN_GATHERDIV4DF:
14413 icode = CODE_FOR_avx2_gatherdiv4df;
14414 goto gather_gen;
14415 case IX86_BUILTIN_GATHERSIV4SF:
14416 icode = CODE_FOR_avx2_gathersiv4sf;
14417 goto gather_gen;
14418 case IX86_BUILTIN_GATHERSIV8SF:
14419 icode = CODE_FOR_avx2_gathersiv8sf;
14420 goto gather_gen;
14421 case IX86_BUILTIN_GATHERDIV4SF:
14422 icode = CODE_FOR_avx2_gatherdiv4sf;
14423 goto gather_gen;
14424 case IX86_BUILTIN_GATHERDIV8SF:
14425 icode = CODE_FOR_avx2_gatherdiv8sf;
14426 goto gather_gen;
14427 case IX86_BUILTIN_GATHERSIV2DI:
14428 icode = CODE_FOR_avx2_gathersiv2di;
14429 goto gather_gen;
14430 case IX86_BUILTIN_GATHERSIV4DI:
14431 icode = CODE_FOR_avx2_gathersiv4di;
14432 goto gather_gen;
14433 case IX86_BUILTIN_GATHERDIV2DI:
14434 icode = CODE_FOR_avx2_gatherdiv2di;
14435 goto gather_gen;
14436 case IX86_BUILTIN_GATHERDIV4DI:
14437 icode = CODE_FOR_avx2_gatherdiv4di;
14438 goto gather_gen;
14439 case IX86_BUILTIN_GATHERSIV4SI:
14440 icode = CODE_FOR_avx2_gathersiv4si;
14441 goto gather_gen;
14442 case IX86_BUILTIN_GATHERSIV8SI:
14443 icode = CODE_FOR_avx2_gathersiv8si;
14444 goto gather_gen;
14445 case IX86_BUILTIN_GATHERDIV4SI:
14446 icode = CODE_FOR_avx2_gatherdiv4si;
14447 goto gather_gen;
14448 case IX86_BUILTIN_GATHERDIV8SI:
14449 icode = CODE_FOR_avx2_gatherdiv8si;
14450 goto gather_gen;
14451 case IX86_BUILTIN_GATHERALTSIV4DF:
14452 icode = CODE_FOR_avx2_gathersiv4df;
14453 goto gather_gen;
14454 case IX86_BUILTIN_GATHERALTDIV8SF:
14455 icode = CODE_FOR_avx2_gatherdiv8sf;
14456 goto gather_gen;
14457 case IX86_BUILTIN_GATHERALTSIV4DI:
14458 icode = CODE_FOR_avx2_gathersiv4di;
14459 goto gather_gen;
14460 case IX86_BUILTIN_GATHERALTDIV8SI:
14461 icode = CODE_FOR_avx2_gatherdiv8si;
14462 goto gather_gen;
14463 case IX86_BUILTIN_GATHER3SIV16SF:
14464 icode = CODE_FOR_avx512f_gathersiv16sf;
14465 goto gather_gen;
14466 case IX86_BUILTIN_GATHER3SIV8DF:
14467 icode = CODE_FOR_avx512f_gathersiv8df;
14468 goto gather_gen;
14469 case IX86_BUILTIN_GATHER3DIV16SF:
14470 icode = CODE_FOR_avx512f_gatherdiv16sf;
14471 goto gather_gen;
14472 case IX86_BUILTIN_GATHER3DIV8DF:
14473 icode = CODE_FOR_avx512f_gatherdiv8df;
14474 goto gather_gen;
14475 case IX86_BUILTIN_GATHER3SIV16SI:
14476 icode = CODE_FOR_avx512f_gathersiv16si;
14477 goto gather_gen;
14478 case IX86_BUILTIN_GATHER3SIV8DI:
14479 icode = CODE_FOR_avx512f_gathersiv8di;
14480 goto gather_gen;
14481 case IX86_BUILTIN_GATHER3DIV16SI:
14482 icode = CODE_FOR_avx512f_gatherdiv16si;
14483 goto gather_gen;
14484 case IX86_BUILTIN_GATHER3DIV8DI:
14485 icode = CODE_FOR_avx512f_gatherdiv8di;
14486 goto gather_gen;
14487 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14488 icode = CODE_FOR_avx512f_gathersiv8df;
14489 goto gather_gen;
14490 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14491 icode = CODE_FOR_avx512f_gatherdiv16sf;
14492 goto gather_gen;
14493 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14494 icode = CODE_FOR_avx512f_gathersiv8di;
14495 goto gather_gen;
14496 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14497 icode = CODE_FOR_avx512f_gatherdiv16si;
14498 goto gather_gen;
14499 case IX86_BUILTIN_GATHER3SIV2DF:
14500 icode = CODE_FOR_avx512vl_gathersiv2df;
14501 goto gather_gen;
14502 case IX86_BUILTIN_GATHER3SIV4DF:
14503 icode = CODE_FOR_avx512vl_gathersiv4df;
14504 goto gather_gen;
14505 case IX86_BUILTIN_GATHER3DIV2DF:
14506 icode = CODE_FOR_avx512vl_gatherdiv2df;
14507 goto gather_gen;
14508 case IX86_BUILTIN_GATHER3DIV4DF:
14509 icode = CODE_FOR_avx512vl_gatherdiv4df;
14510 goto gather_gen;
14511 case IX86_BUILTIN_GATHER3SIV4SF:
14512 icode = CODE_FOR_avx512vl_gathersiv4sf;
14513 goto gather_gen;
14514 case IX86_BUILTIN_GATHER3SIV8SF:
14515 icode = CODE_FOR_avx512vl_gathersiv8sf;
14516 goto gather_gen;
14517 case IX86_BUILTIN_GATHER3DIV4SF:
14518 icode = CODE_FOR_avx512vl_gatherdiv4sf;
14519 goto gather_gen;
14520 case IX86_BUILTIN_GATHER3DIV8SF:
14521 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14522 goto gather_gen;
14523 case IX86_BUILTIN_GATHER3SIV2DI:
14524 icode = CODE_FOR_avx512vl_gathersiv2di;
14525 goto gather_gen;
14526 case IX86_BUILTIN_GATHER3SIV4DI:
14527 icode = CODE_FOR_avx512vl_gathersiv4di;
14528 goto gather_gen;
14529 case IX86_BUILTIN_GATHER3DIV2DI:
14530 icode = CODE_FOR_avx512vl_gatherdiv2di;
14531 goto gather_gen;
14532 case IX86_BUILTIN_GATHER3DIV4DI:
14533 icode = CODE_FOR_avx512vl_gatherdiv4di;
14534 goto gather_gen;
14535 case IX86_BUILTIN_GATHER3SIV4SI:
14536 icode = CODE_FOR_avx512vl_gathersiv4si;
14537 goto gather_gen;
14538 case IX86_BUILTIN_GATHER3SIV8SI:
14539 icode = CODE_FOR_avx512vl_gathersiv8si;
14540 goto gather_gen;
14541 case IX86_BUILTIN_GATHER3DIV4SI:
14542 icode = CODE_FOR_avx512vl_gatherdiv4si;
14543 goto gather_gen;
14544 case IX86_BUILTIN_GATHER3DIV8SI:
14545 icode = CODE_FOR_avx512vl_gatherdiv8si;
14546 goto gather_gen;
14547 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14548 icode = CODE_FOR_avx512vl_gathersiv4df;
14549 goto gather_gen;
14550 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14551 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14552 goto gather_gen;
14553 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14554 icode = CODE_FOR_avx512vl_gathersiv4di;
14555 goto gather_gen;
14556 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14557 icode = CODE_FOR_avx512vl_gatherdiv8si;
14558 goto gather_gen;
14559 case IX86_BUILTIN_SCATTERSIV16SF:
14560 icode = CODE_FOR_avx512f_scattersiv16sf;
14561 goto scatter_gen;
14562 case IX86_BUILTIN_SCATTERSIV8DF:
14563 icode = CODE_FOR_avx512f_scattersiv8df;
14564 goto scatter_gen;
14565 case IX86_BUILTIN_SCATTERDIV16SF:
14566 icode = CODE_FOR_avx512f_scatterdiv16sf;
14567 goto scatter_gen;
14568 case IX86_BUILTIN_SCATTERDIV8DF:
14569 icode = CODE_FOR_avx512f_scatterdiv8df;
14570 goto scatter_gen;
14571 case IX86_BUILTIN_SCATTERSIV16SI:
14572 icode = CODE_FOR_avx512f_scattersiv16si;
14573 goto scatter_gen;
14574 case IX86_BUILTIN_SCATTERSIV8DI:
14575 icode = CODE_FOR_avx512f_scattersiv8di;
14576 goto scatter_gen;
14577 case IX86_BUILTIN_SCATTERDIV16SI:
14578 icode = CODE_FOR_avx512f_scatterdiv16si;
14579 goto scatter_gen;
14580 case IX86_BUILTIN_SCATTERDIV8DI:
14581 icode = CODE_FOR_avx512f_scatterdiv8di;
14582 goto scatter_gen;
14583 case IX86_BUILTIN_SCATTERSIV8SF:
14584 icode = CODE_FOR_avx512vl_scattersiv8sf;
14585 goto scatter_gen;
14586 case IX86_BUILTIN_SCATTERSIV4SF:
14587 icode = CODE_FOR_avx512vl_scattersiv4sf;
14588 goto scatter_gen;
14589 case IX86_BUILTIN_SCATTERSIV4DF:
14590 icode = CODE_FOR_avx512vl_scattersiv4df;
14591 goto scatter_gen;
14592 case IX86_BUILTIN_SCATTERSIV2DF:
14593 icode = CODE_FOR_avx512vl_scattersiv2df;
14594 goto scatter_gen;
14595 case IX86_BUILTIN_SCATTERDIV8SF:
14596 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14597 goto scatter_gen;
14598 case IX86_BUILTIN_SCATTERDIV4SF:
14599 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14600 goto scatter_gen;
14601 case IX86_BUILTIN_SCATTERDIV4DF:
14602 icode = CODE_FOR_avx512vl_scatterdiv4df;
14603 goto scatter_gen;
14604 case IX86_BUILTIN_SCATTERDIV2DF:
14605 icode = CODE_FOR_avx512vl_scatterdiv2df;
14606 goto scatter_gen;
14607 case IX86_BUILTIN_SCATTERSIV8SI:
14608 icode = CODE_FOR_avx512vl_scattersiv8si;
14609 goto scatter_gen;
14610 case IX86_BUILTIN_SCATTERSIV4SI:
14611 icode = CODE_FOR_avx512vl_scattersiv4si;
14612 goto scatter_gen;
14613 case IX86_BUILTIN_SCATTERSIV4DI:
14614 icode = CODE_FOR_avx512vl_scattersiv4di;
14615 goto scatter_gen;
14616 case IX86_BUILTIN_SCATTERSIV2DI:
14617 icode = CODE_FOR_avx512vl_scattersiv2di;
14618 goto scatter_gen;
14619 case IX86_BUILTIN_SCATTERDIV8SI:
14620 icode = CODE_FOR_avx512vl_scatterdiv8si;
14621 goto scatter_gen;
14622 case IX86_BUILTIN_SCATTERDIV4SI:
14623 icode = CODE_FOR_avx512vl_scatterdiv4si;
14624 goto scatter_gen;
14625 case IX86_BUILTIN_SCATTERDIV4DI:
14626 icode = CODE_FOR_avx512vl_scatterdiv4di;
14627 goto scatter_gen;
14628 case IX86_BUILTIN_SCATTERDIV2DI:
14629 icode = CODE_FOR_avx512vl_scatterdiv2di;
14630 goto scatter_gen;
14631 case IX86_BUILTIN_GATHERPFDPD:
14632 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
14633 goto vec_prefetch_gen;
14634 case IX86_BUILTIN_SCATTERALTSIV8DF:
14635 icode = CODE_FOR_avx512f_scattersiv8df;
14636 goto scatter_gen;
14637 case IX86_BUILTIN_SCATTERALTDIV16SF:
14638 icode = CODE_FOR_avx512f_scatterdiv16sf;
14639 goto scatter_gen;
14640 case IX86_BUILTIN_SCATTERALTSIV8DI:
14641 icode = CODE_FOR_avx512f_scattersiv8di;
14642 goto scatter_gen;
14643 case IX86_BUILTIN_SCATTERALTDIV16SI:
14644 icode = CODE_FOR_avx512f_scatterdiv16si;
14645 goto scatter_gen;
14646 case IX86_BUILTIN_SCATTERALTSIV4DF:
14647 icode = CODE_FOR_avx512vl_scattersiv4df;
14648 goto scatter_gen;
14649 case IX86_BUILTIN_SCATTERALTDIV8SF:
14650 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14651 goto scatter_gen;
14652 case IX86_BUILTIN_SCATTERALTSIV4DI:
14653 icode = CODE_FOR_avx512vl_scattersiv4di;
14654 goto scatter_gen;
14655 case IX86_BUILTIN_SCATTERALTDIV8SI:
14656 icode = CODE_FOR_avx512vl_scatterdiv8si;
14657 goto scatter_gen;
14658 case IX86_BUILTIN_SCATTERALTSIV2DF:
14659 icode = CODE_FOR_avx512vl_scattersiv2df;
14660 goto scatter_gen;
14661 case IX86_BUILTIN_SCATTERALTDIV4SF:
14662 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14663 goto scatter_gen;
14664 case IX86_BUILTIN_SCATTERALTSIV2DI:
14665 icode = CODE_FOR_avx512vl_scattersiv2di;
14666 goto scatter_gen;
14667 case IX86_BUILTIN_SCATTERALTDIV4SI:
14668 icode = CODE_FOR_avx512vl_scatterdiv4si;
14669 goto scatter_gen;
14670 case IX86_BUILTIN_GATHERPFDPS:
14671 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14672 goto vec_prefetch_gen;
14673 case IX86_BUILTIN_GATHERPFQPD:
14674 icode = CODE_FOR_avx512pf_gatherpfv8didf;
14675 goto vec_prefetch_gen;
14676 case IX86_BUILTIN_GATHERPFQPS:
14677 icode = CODE_FOR_avx512pf_gatherpfv8disf;
14678 goto vec_prefetch_gen;
14679 case IX86_BUILTIN_SCATTERPFDPD:
14680 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14681 goto vec_prefetch_gen;
14682 case IX86_BUILTIN_SCATTERPFDPS:
14683 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14684 goto vec_prefetch_gen;
14685 case IX86_BUILTIN_SCATTERPFQPD:
14686 icode = CODE_FOR_avx512pf_scatterpfv8didf;
14687 goto vec_prefetch_gen;
14688 case IX86_BUILTIN_SCATTERPFQPS:
14689 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14690 goto vec_prefetch_gen;
14691
14692 gather_gen:
14693 rtx half;
14694 rtx (*gen) (rtx, rtx);
14695
14696 arg0 = CALL_EXPR_ARG (exp, 0);
14697 arg1 = CALL_EXPR_ARG (exp, 1);
14698 arg2 = CALL_EXPR_ARG (exp, 2);
14699 arg3 = CALL_EXPR_ARG (exp, 3);
14700 arg4 = CALL_EXPR_ARG (exp, 4);
14701 op0 = expand_normal (arg0);
14702 op1 = expand_normal (arg1);
14703 op2 = expand_normal (arg2);
14704 op3 = expand_normal (arg3);
14705 op4 = expand_normal (arg4);
14706 /* Note the arg order is different from the operand order. */
14707 mode0 = insn_data[icode].operand[1].mode;
14708 mode2 = insn_data[icode].operand[3].mode;
14709 mode3 = insn_data[icode].operand[4].mode;
14710 mode4 = insn_data[icode].operand[5].mode;
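/* A sketch of the mapping implied by the operand indices used below:
   the builtin arguments are (src/merge, base pointer, index vector,
   mask, scale) and correspond to insn operands 1..5, operand 0 being
   the destination; operand 2, the base pointer, is always Pmode and
   therefore needs no mode lookup here.  */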
14711
14712 if (target == NULL_RTX
14713 || GET_MODE (target) != insn_data[icode].operand[0].mode
14714 || !insn_data[icode].operand[0].predicate (target,
14715 GET_MODE (target)))
14716 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14717 else
14718 subtarget = target;
14719
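/* The GATHER*ALT* variants pair an index vector with a data vector of
   a different element count; only the low half of the wider of the two
   is meaningful, so the switch below extracts that low half before the
   common expansion code runs.  */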
14720 switch (fcode)
14721 {
14722 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14723 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14724 half = gen_reg_rtx (V8SImode);
14725 if (!nonimmediate_operand (op2, V16SImode))
14726 op2 = copy_to_mode_reg (V16SImode, op2);
14727 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14728 op2 = half;
14729 break;
14730 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14731 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14732 case IX86_BUILTIN_GATHERALTSIV4DF:
14733 case IX86_BUILTIN_GATHERALTSIV4DI:
14734 half = gen_reg_rtx (V4SImode);
14735 if (!nonimmediate_operand (op2, V8SImode))
14736 op2 = copy_to_mode_reg (V8SImode, op2);
14737 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14738 op2 = half;
14739 break;
14740 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14741 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14742 half = gen_reg_rtx (mode0);
14743 if (mode0 == V8SFmode)
14744 gen = gen_vec_extract_lo_v16sf;
14745 else
14746 gen = gen_vec_extract_lo_v16si;
14747 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14748 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14749 emit_insn (gen (half, op0));
14750 op0 = half;
14751 op3 = lowpart_subreg (QImode, op3, HImode);
14752 break;
14753 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14754 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14755 case IX86_BUILTIN_GATHERALTDIV8SF:
14756 case IX86_BUILTIN_GATHERALTDIV8SI:
14757 half = gen_reg_rtx (mode0);
14758 if (mode0 == V4SFmode)
14759 gen = gen_vec_extract_lo_v8sf;
14760 else
14761 gen = gen_vec_extract_lo_v8si;
14762 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14763 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14764 emit_insn (gen (half, op0));
14765 op0 = half;
14766 if (VECTOR_MODE_P (GET_MODE (op3)))
14767 {
14768 half = gen_reg_rtx (mode0);
14769 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14770 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14771 emit_insn (gen (half, op3));
14772 op3 = half;
14773 }
14774 break;
14775 default:
14776 break;
14777 }
14778
14779 /* Force the memory operand to use only a base register here; we
14780 don't want to do this for the memory operands of other builtin
14781 functions. */
14782 op1 = ix86_zero_extend_to_Pmode (op1);
14783
14784 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14785 op0 = copy_to_mode_reg (mode0, op0);
14786 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14787 op1 = copy_to_mode_reg (Pmode, op1);
14788 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14789 op2 = copy_to_mode_reg (mode2, op2);
14790
14791 op3 = fixup_modeless_constant (op3, mode3);
14792
14793 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14794 {
14795 if (!insn_data[icode].operand[4].predicate (op3, mode3))
14796 op3 = copy_to_mode_reg (mode3, op3);
14797 }
14798 else
14799 {
14800 op3 = copy_to_reg (op3);
14801 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14802 }
14803 if (!insn_data[icode].operand[5].predicate (op4, mode4))
14804 {
14805 error ("the last argument must be scale 1, 2, 4, 8");
14806 return const0_rtx;
14807 }
14808
14809 /* Optimize. If mask is known to have all high bits set,
14810 replace op0 with pc_rtx to signal that the instruction
14811 overwrites the whole destination and doesn't use its
14812 previous contents. */
14813 if (optimize)
14814 {
14815 if (TREE_CODE (arg3) == INTEGER_CST)
14816 {
14817 if (integer_all_onesp (arg3))
14818 op0 = pc_rtx;
14819 }
14820 else if (TREE_CODE (arg3) == VECTOR_CST)
14821 {
14822 unsigned int negative = 0;
14823 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14824 {
14825 tree cst = VECTOR_CST_ELT (arg3, i);
14826 if (TREE_CODE (cst) == INTEGER_CST
14827 && tree_int_cst_sign_bit (cst))
14828 negative++;
14829 else if (TREE_CODE (cst) == REAL_CST
14830 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14831 negative++;
14832 }
14833 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14834 op0 = pc_rtx;
14835 }
14836 else if (TREE_CODE (arg3) == SSA_NAME
14837 && VECTOR_TYPE_P (TREE_TYPE (arg3)))
14838 {
14839 /* Recognize also when mask is like:
14840 __v2df src = _mm_setzero_pd ();
14841 __v2df mask = _mm_cmpeq_pd (src, src);
14842 or
14843 __v8sf src = _mm256_setzero_ps ();
14844 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14845 as that is a cheaper way to load all ones into
14846 a register than having to load a constant from
14847 memory. */
14848 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14849 if (is_gimple_call (def_stmt))
14850 {
14851 tree fndecl = gimple_call_fndecl (def_stmt);
14852 if (fndecl
14853 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
14854 switch (DECL_MD_FUNCTION_CODE (fndecl))
14855 {
14856 case IX86_BUILTIN_CMPPD:
14857 case IX86_BUILTIN_CMPPS:
14858 case IX86_BUILTIN_CMPPD256:
14859 case IX86_BUILTIN_CMPPS256:
14860 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14861 break;
14862 /* FALLTHRU */
14863 case IX86_BUILTIN_CMPEQPD:
14864 case IX86_BUILTIN_CMPEQPS:
14865 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14866 && initializer_zerop (gimple_call_arg (def_stmt,
14867 1)))
14868 op0 = pc_rtx;
14869 break;
14870 default:
14871 break;
14872 }
14873 }
14874 }
14875 }
14876
14877 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14878 if (! pat)
14879 return const0_rtx;
14880 emit_insn (pat);
14881
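/* For the *DIV16* and *DIV8* gathers the pattern's destination mode is
   wider than the vector the builtin returns; only the low half holds
   gathered elements, so copy that half into the user-visible target.  */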
14882 switch (fcode)
14883 {
14884 case IX86_BUILTIN_GATHER3DIV16SF:
14885 if (target == NULL_RTX)
14886 target = gen_reg_rtx (V8SFmode);
14887 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14888 break;
14889 case IX86_BUILTIN_GATHER3DIV16SI:
14890 if (target == NULL_RTX)
14891 target = gen_reg_rtx (V8SImode);
14892 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14893 break;
14894 case IX86_BUILTIN_GATHER3DIV8SF:
14895 case IX86_BUILTIN_GATHERDIV8SF:
14896 if (target == NULL_RTX)
14897 target = gen_reg_rtx (V4SFmode);
14898 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14899 break;
14900 case IX86_BUILTIN_GATHER3DIV8SI:
14901 case IX86_BUILTIN_GATHERDIV8SI:
14902 if (target == NULL_RTX)
14903 target = gen_reg_rtx (V4SImode);
14904 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14905 break;
14906 default:
14907 target = subtarget;
14908 break;
14909 }
14910 return target;
14911
14912 scatter_gen:
14913 arg0 = CALL_EXPR_ARG (exp, 0);
14914 arg1 = CALL_EXPR_ARG (exp, 1);
14915 arg2 = CALL_EXPR_ARG (exp, 2);
14916 arg3 = CALL_EXPR_ARG (exp, 3);
14917 arg4 = CALL_EXPR_ARG (exp, 4);
14918 op0 = expand_normal (arg0);
14919 op1 = expand_normal (arg1);
14920 op2 = expand_normal (arg2);
14921 op3 = expand_normal (arg3);
14922 op4 = expand_normal (arg4);
14923 mode1 = insn_data[icode].operand[1].mode;
14924 mode2 = insn_data[icode].operand[2].mode;
14925 mode3 = insn_data[icode].operand[3].mode;
14926 mode4 = insn_data[icode].operand[4].mode;
14927
14928 /* The scatter instruction stores operand op3 to memory using
14929 indices from op2 and scale from op4 under writemask op1.
14930 If index operand op2 has more elements than source operand
14931 op3, only its low half is used, and vice versa. */
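/* For example, SCATTERALTSIV8DF stores eight DFmode elements but is
   given a 16-element SImode index vector, so only the low half of op2
   is used (and is extracted below).  */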
14932 switch (fcode)
14933 {
14934 case IX86_BUILTIN_SCATTERALTSIV8DF:
14935 case IX86_BUILTIN_SCATTERALTSIV8DI:
14936 half = gen_reg_rtx (V8SImode);
14937 if (!nonimmediate_operand (op2, V16SImode))
14938 op2 = copy_to_mode_reg (V16SImode, op2);
14939 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14940 op2 = half;
14941 break;
14942 case IX86_BUILTIN_SCATTERALTDIV16SF:
14943 case IX86_BUILTIN_SCATTERALTDIV16SI:
14944 half = gen_reg_rtx (mode3);
14945 if (mode3 == V8SFmode)
14946 gen = gen_vec_extract_lo_v16sf;
14947 else
14948 gen = gen_vec_extract_lo_v16si;
14949 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14950 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14951 emit_insn (gen (half, op3));
14952 op3 = half;
14953 break;
14954 case IX86_BUILTIN_SCATTERALTSIV4DF:
14955 case IX86_BUILTIN_SCATTERALTSIV4DI:
14956 half = gen_reg_rtx (V4SImode);
14957 if (!nonimmediate_operand (op2, V8SImode))
14958 op2 = copy_to_mode_reg (V8SImode, op2);
14959 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14960 op2 = half;
14961 break;
14962 case IX86_BUILTIN_SCATTERALTDIV8SF:
14963 case IX86_BUILTIN_SCATTERALTDIV8SI:
14964 half = gen_reg_rtx (mode3);
14965 if (mode3 == V4SFmode)
14966 gen = gen_vec_extract_lo_v8sf;
14967 else
14968 gen = gen_vec_extract_lo_v8si;
14969 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14970 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14971 emit_insn (gen (half, op3));
14972 op3 = half;
14973 break;
14974 case IX86_BUILTIN_SCATTERALTSIV2DF:
14975 case IX86_BUILTIN_SCATTERALTSIV2DI:
14976 if (!nonimmediate_operand (op2, V4SImode))
14977 op2 = copy_to_mode_reg (V4SImode, op2);
14978 break;
14979 case IX86_BUILTIN_SCATTERALTDIV4SF:
14980 case IX86_BUILTIN_SCATTERALTDIV4SI:
14981 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14982 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14983 break;
14984 default:
14985 break;
14986 }
14987
14988 /* Force the memory operand to use only a base register here; we
14989 don't want to do this for the memory operands of other builtin
14990 functions. */
14991 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14992
14993 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14994 op0 = copy_to_mode_reg (Pmode, op0);
14995
14996 op1 = fixup_modeless_constant (op1, mode1);
14997
14998 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14999 {
15000 if (!insn_data[icode].operand[1].predicate (op1, mode1))
15001 op1 = copy_to_mode_reg (mode1, op1);
15002 }
15003 else
15004 {
15005 op1 = copy_to_reg (op1);
15006 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
15007 }
15008
15009 if (!insn_data[icode].operand[2].predicate (op2, mode2))
15010 op2 = copy_to_mode_reg (mode2, op2);
15011
15012 if (!insn_data[icode].operand[3].predicate (op3, mode3))
15013 op3 = copy_to_mode_reg (mode3, op3);
15014
15015 if (!insn_data[icode].operand[4].predicate (op4, mode4))
15016 {
15017 error ("the last argument must be scale 1, 2, 4, 8");
15018 return const0_rtx;
15019 }
15020
15021 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
15022 if (! pat)
15023 return const0_rtx;
15024
15025 emit_insn (pat);
15026 return 0;
15027
15028 vec_prefetch_gen:
15029 arg0 = CALL_EXPR_ARG (exp, 0);
15030 arg1 = CALL_EXPR_ARG (exp, 1);
15031 arg2 = CALL_EXPR_ARG (exp, 2);
15032 arg3 = CALL_EXPR_ARG (exp, 3);
15033 arg4 = CALL_EXPR_ARG (exp, 4);
15034 op0 = expand_normal (arg0);
15035 op1 = expand_normal (arg1);
15036 op2 = expand_normal (arg2);
15037 op3 = expand_normal (arg3);
15038 op4 = expand_normal (arg4);
15039 mode0 = insn_data[icode].operand[0].mode;
15040 mode1 = insn_data[icode].operand[1].mode;
15041 mode3 = insn_data[icode].operand[3].mode;
15042 mode4 = insn_data[icode].operand[4].mode;
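/* A sketch of the mapping used below: the prefetch builtins take
   (mask, index vector, base pointer, scale, hint); the base pointer is
   forced into a Pmode register and the remaining arguments map
   directly onto insn operands 0, 1, 3 and 4.  */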
15043
15044 op0 = fixup_modeless_constant (op0, mode0);
15045
15046 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
15047 {
15048 if (!insn_data[icode].operand[0].predicate (op0, mode0))
15049 op0 = copy_to_mode_reg (mode0, op0);
15050 }
15051 else
15052 {
15053 op0 = copy_to_reg (op0);
15054 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
15055 }
15056
15057 if (!insn_data[icode].operand[1].predicate (op1, mode1))
15058 op1 = copy_to_mode_reg (mode1, op1);
15059
15060 /* Force the memory operand to use only a base register here; we
15061 don't want to do this for the memory operands of other builtin
15062 functions. */
15063 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
15064
15065 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
15066 op2 = copy_to_mode_reg (Pmode, op2);
15067
15068 if (!insn_data[icode].operand[3].predicate (op3, mode3))
15069 {
15070 error ("the fourth argument must be scale 1, 2, 4, 8");
15071 return const0_rtx;
15072 }
15073
15074 if (!insn_data[icode].operand[4].predicate (op4, mode4))
15075 {
15076 error ("incorrect hint operand");
15077 return const0_rtx;
15078 }
15079
15080 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
15081 if (! pat)
15082 return const0_rtx;
15083
15084 emit_insn (pat);
15085
15086 return 0;
15087
15088 case IX86_BUILTIN_XABORT:
15089 icode = CODE_FOR_xabort;
15090 arg0 = CALL_EXPR_ARG (exp, 0);
15091 op0 = expand_normal (arg0);
15092 mode0 = insn_data[icode].operand[0].mode;
15093 if (!insn_data[icode].operand[0].predicate (op0, mode0))
15094 {
15095 error ("the argument to %<xabort%> intrinsic must "
15096 "be an 8-bit immediate");
15097 return const0_rtx;
15098 }
15099 emit_insn (gen_xabort (op0));
15100 return 0;
15101
15102 case IX86_BUILTIN_RDSSPD:
15103 case IX86_BUILTIN_RDSSPQ:
15104 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
15105
15106 if (target == 0
15107 || !register_operand (target, mode))
15108 target = gen_reg_rtx (mode);
15109
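/* RDSSP architecturally leaves its destination unchanged when shadow
   stacks are not enabled, so seed the operand with zero; the builtin
   then returns 0 in that case.  */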
15110 op0 = force_reg (mode, const0_rtx);
15111
15112 emit_insn (gen_rdssp (mode, target, op0));
15113 return target;
15114
15115 case IX86_BUILTIN_INCSSPD:
15116 case IX86_BUILTIN_INCSSPQ:
15117 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
15118
15119 arg0 = CALL_EXPR_ARG (exp, 0);
15120 op0 = expand_normal (arg0);
15121
15122 op0 = force_reg (mode, op0);
15123
15124 emit_insn (gen_incssp (mode, op0));
15125 return 0;
15126
15127 case IX86_BUILTIN_HRESET:
15128 icode = CODE_FOR_hreset;
15129 arg0 = CALL_EXPR_ARG (exp, 0);
15130 op0 = expand_normal (arg0);
15131 op0 = force_reg (SImode, op0);
15132 emit_insn (gen_hreset (op0));
15133 return 0;
15134
15135 case IX86_BUILTIN_RSTORSSP:
15136 case IX86_BUILTIN_CLRSSBSY:
15137 arg0 = CALL_EXPR_ARG (exp, 0);
15138 op0 = expand_normal (arg0);
15139 icode = (fcode == IX86_BUILTIN_RSTORSSP
15140 ? CODE_FOR_rstorssp
15141 : CODE_FOR_clrssbsy);
15142
15143 if (!address_operand (op0, VOIDmode))
15144 {
15145 op0 = convert_memory_address (Pmode, op0);
15146 op0 = copy_addr_to_reg (op0);
15147 }
15148 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
15149 return 0;
15150
15151 case IX86_BUILTIN_WRSSD:
15152 case IX86_BUILTIN_WRSSQ:
15153 case IX86_BUILTIN_WRUSSD:
15154 case IX86_BUILTIN_WRUSSQ:
15155 mode = ((fcode == IX86_BUILTIN_WRSSD
15156 || fcode == IX86_BUILTIN_WRUSSD)
15157 ? SImode : DImode);
15158
15159 arg0 = CALL_EXPR_ARG (exp, 0);
15160 op0 = expand_normal (arg0);
15161 arg1 = CALL_EXPR_ARG (exp, 1);
15162 op1 = expand_normal (arg1);
15163
15164 op0 = force_reg (mode, op0);
15165
15166 if (!address_operand (op1, VOIDmode))
15167 {
15168 op1 = convert_memory_address (Pmode, op1);
15169 op1 = copy_addr_to_reg (op1);
15170 }
15171 op1 = gen_rtx_MEM (mode, op1);
15172
15173 icode = ((fcode == IX86_BUILTIN_WRSSD
15174 || fcode == IX86_BUILTIN_WRSSQ)
15175 ? code_for_wrss (mode)
15176 : code_for_wruss (mode));
15177 emit_insn (GEN_FCN (icode) (op0, op1));
15178
15179 return 0;
15180
15181 default:
15182 break;
15183 }
15184
15185 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
15186 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
15187 {
15188 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
15189 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
15190 target);
15191 }
15192
15193 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
15194 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
15195 {
15196 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
15197 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
15198 target);
15199 }
15200
15201 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
15202 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
15203 {
15204 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
15205 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
15206 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
15207 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
15208 int masked = 1;
15209 machine_mode mode, wide_mode, nar_mode;
15210
15211 nar_mode = V4SFmode;
15212 mode = V16SFmode;
15213 wide_mode = V64SFmode;
15214 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
15215 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
15216
15217 switch (fcode)
15218 {
15219 case IX86_BUILTIN_4FMAPS:
15220 fcn = gen_avx5124fmaddps_4fmaddps;
15221 masked = 0;
15222 goto v4fma_expand;
15223
15224 case IX86_BUILTIN_4DPWSSD:
15225 nar_mode = V4SImode;
15226 mode = V16SImode;
15227 wide_mode = V64SImode;
15228 fcn = gen_avx5124vnniw_vp4dpwssd;
15229 masked = 0;
15230 goto v4fma_expand;
15231
15232 case IX86_BUILTIN_4DPWSSDS:
15233 nar_mode = V4SImode;
15234 mode = V16SImode;
15235 wide_mode = V64SImode;
15236 fcn = gen_avx5124vnniw_vp4dpwssds;
15237 masked = 0;
15238 goto v4fma_expand;
15239
15240 case IX86_BUILTIN_4FNMAPS:
15241 fcn = gen_avx5124fmaddps_4fnmaddps;
15242 masked = 0;
15243 goto v4fma_expand;
15244
15245 case IX86_BUILTIN_4FNMAPS_MASK:
15246 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
15247 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
15248 goto v4fma_expand;
15249
15250 case IX86_BUILTIN_4DPWSSD_MASK:
15251 nar_mode = V4SImode;
15252 mode = V16SImode;
15253 wide_mode = V64SImode;
15254 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
15255 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
15256 goto v4fma_expand;
15257
15258 case IX86_BUILTIN_4DPWSSDS_MASK:
15259 nar_mode = V4SImode;
15260 mode = V16SImode;
15261 wide_mode = V64SImode;
15262 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
15263 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
15264 goto v4fma_expand;
15265
15266 case IX86_BUILTIN_4FMAPS_MASK:
15267 {
15268 tree args[4];
15269 rtx ops[4];
15270 rtx wide_reg;
15271 rtx accum;
15272 rtx addr;
15273 rtx mem;
15274
15275v4fma_expand:
15276 wide_reg = gen_reg_rtx (wide_mode);
15277 for (i = 0; i < 4; i++)
15278 {
15279 args[i] = CALL_EXPR_ARG (exp, i);
15280 ops[i] = expand_normal (args[i]);
15281
15282 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
15283 ops[i]);
15284 }
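/* The 4FMAPS/4VNNIW instructions read a block of four consecutive
   vector registers; packing the four inputs into one wide (4x512-bit)
   pseudo lets the register allocator assign them to consecutive hard
   registers.  */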
15285
15286 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15287 accum = force_reg (mode, accum);
15288
15289 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15290 addr = force_reg (Pmode, addr);
15291
15292 mem = gen_rtx_MEM (nar_mode, addr);
15293
15294 target = gen_reg_rtx (mode);
15295
15296 emit_move_insn (target, accum);
15297
15298 if (! masked)
15299 emit_insn (fcn (target, accum, wide_reg, mem));
15300 else
15301 {
15302 rtx merge, mask;
15303 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15304
15305 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15306
15307 if (CONST_INT_P (mask))
15308 mask = fixup_modeless_constant (mask, HImode);
15309
15310 mask = force_reg (HImode, mask);
15311
15312 if (GET_MODE (mask) != HImode)
15313 mask = gen_rtx_SUBREG (HImode, mask, 0);
15314
15315 /* If merge is 0 then we're about to emit z-masked variant. */
15316 if (const0_operand (merge, mode))
15317 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15318 /* If merge is the same as accum then emit merge-masked variant. */
15319 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15320 {
15321 merge = force_reg (mode, merge);
15322 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15323 }
15324 /* Merge with something unknown might happen if we z-mask w/ -O0. */
15325 else
15326 {
15327 target = gen_reg_rtx (mode);
15328 emit_move_insn (target, merge);
15329 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15330 }
15331 }
15332 return target;
15333 }
15334
15335 case IX86_BUILTIN_4FNMASS:
15336 fcn = gen_avx5124fmaddps_4fnmaddss;
15337 masked = 0;
15338 goto s4fma_expand;
15339
15340 case IX86_BUILTIN_4FMASS:
15341 fcn = gen_avx5124fmaddps_4fmaddss;
15342 masked = 0;
15343 goto s4fma_expand;
15344
15345 case IX86_BUILTIN_4FNMASS_MASK:
15346 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
15347 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
15348 goto s4fma_expand;
15349
15350 case IX86_BUILTIN_4FMASS_MASK:
15351 {
15352 tree args[4];
15353 rtx ops[4];
15354 rtx wide_reg;
15355 rtx accum;
15356 rtx addr;
15357 rtx mem;
15358
15359 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
15360 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
15361
15362s4fma_expand:
15363 mode = V4SFmode;
15364 wide_reg = gen_reg_rtx (V64SFmode);
15365 for (i = 0; i < 4; i++)
15366 {
15367 rtx tmp;
15368 args[i] = CALL_EXPR_ARG (exp, i);
15369 ops[i] = expand_normal (args[i]);
15370
15371 tmp = gen_reg_rtx (SFmode);
15372 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
15373
15374 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
15375 gen_rtx_SUBREG (V16SFmode, tmp, 0));
15376 }
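/* Only element 0 of each input matters for the scalar 4FMASS forms, so
   moving the low SFmode value into each 512-bit slice of the wide
   register block is sufficient.  */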
15377
15378 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
15379 accum = force_reg (V4SFmode, accum);
15380
15381 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
15382 addr = force_reg (Pmode, addr);
15383
15384 mem = gen_rtx_MEM (V4SFmode, addr);
15385
15386 target = gen_reg_rtx (V4SFmode);
15387
15388 emit_move_insn (target, accum);
15389
15390 if (! masked)
15391 emit_insn (fcn (target, accum, wide_reg, mem));
15392 else
15393 {
15394 rtx merge, mask;
15395 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
15396
15397 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
15398
15399 if (CONST_INT_P (mask))
15400 mask = fixup_modeless_constant (mask, QImode);
15401
15402 mask = force_reg (QImode, mask);
15403
15404 if (GET_MODE (mask) != QImode)
15405 mask = gen_rtx_SUBREG (QImode, mask, 0);
15406
15407 /* If merge is 0 then we're about to emit z-masked variant. */
15408 if (const0_operand (merge, mode))
15409 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
15410 /* If merge is the same as accum then emit merge-masked
15411 variant. */
15412 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
15413 {
15414 merge = force_reg (mode, merge);
15415 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
15416 }
15417 /* Merge with something unknown might happen if we z-mask
15418 w/ -O0. */
15419 else
15420 {
15421 target = gen_reg_rtx (mode);
15422 emit_move_insn (target, merge);
15423 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
15424 }
15425 }
15426 return target;
15427 }
15428 case IX86_BUILTIN_RDPID:
15429 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
15430 target);
15431 case IX86_BUILTIN_FABSQ:
15432 case IX86_BUILTIN_COPYSIGNQ:
15433 if (!TARGET_SSE)
15434 /* Emit a normal call if SSE isn't available. */
15435 return expand_call (exp, target, ignore);
15436 /* FALLTHRU */
15437 default:
15438 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
15439 }
15440 }
15441
15442 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
15443 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
15444 {
15445 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
15446 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
15447 }
15448
15449 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
15450 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
15451 {
15452 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
15453 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
15454 }
15455
15456 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
15457 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
15458 {
15459 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
15460 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
15461 }
15462
15463 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
15464 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
15465 {
15466 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
15467 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
15468 }
15469
15470 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
15471 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
15472 {
15473 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
15474 const struct builtin_description *d = bdesc_multi_arg + i;
15475 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
15476 (enum ix86_builtin_func_type)
15477 d->flag, d->comparison);
15478 }
15479
15480 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
15481 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
15482 {
15483 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
15484 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
15485 target);
15486 }
15487
15488 gcc_unreachable ();
15489}
15490
15491/* A subroutine of ix86_expand_vector_init_duplicate. Tries to
15492 fill target with val via vec_duplicate. */
15493
15494static bool
15495ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
15496{
15497 bool ok;
15498 rtx_insn *insn;
15499 rtx dup;
15500 /* Save/restore recog_data in case this is called from splitters
15501 or other routines where recog_data needs to stay valid across
15502 force_reg. See PR106577. */
15503 recog_data_d recog_data_save = recog_data;
15504
15505 /* First attempt to recognize VAL as-is. */
15506 dup = gen_vec_duplicate (mode, val);
15507 insn = emit_insn (gen_rtx_SET (target, dup));
15508 if (recog_memoized (insn) < 0)
15509 {
15510 rtx_insn *seq;
15511 machine_mode innermode = GET_MODE_INNER (mode);
15512 rtx reg;
15513
15514 /* If that fails, force VAL into a register. */
15515
15516 start_sequence ();
15517 reg = force_reg (innermode, val);
15518 if (GET_MODE (reg) != innermode)
15519 reg = gen_lowpart (innermode, reg);
15520 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
15521 seq = get_insns ();
15522 end_sequence ();
15523 if (seq)
15524 emit_insn_before (seq, insn);
15525
15526 ok = recog_memoized (insn) >= 0;
15527 gcc_assert (ok);
15528 }
15529 recog_data = recog_data_save;
15530 return true;
15531}
15532
15533/* Get a vector mode of the same size as the original but with elements
15534 twice as wide. This is only guaranteed to apply to integral vectors. */
15535
15536static machine_mode
15537get_mode_wider_vector (machine_mode o)
15538{
15539 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
15540 machine_mode n = GET_MODE_NEXT_MODE (o).require ();
15541 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
15542 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
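/* For example, starting from V16QImode this yields V8HImode: the same
   16-byte size with elements twice as wide.  */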
15543 return n;
15544}
15545
15546static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
15547static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
15548
15549/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15550 with all elements equal to VAR. Return true if successful. */
15551
15552bool
15553ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
15554 rtx target, rtx val)
15555{
15556 bool ok;
15557
15558 switch (mode)
15559 {
15560 case E_V2SImode:
15561 case E_V2SFmode:
15562 if (!mmx_ok)
15563 return false;
15564 /* FALLTHRU */
15565
15566 case E_V4DFmode:
15567 case E_V4DImode:
15568 case E_V8SFmode:
15569 case E_V8SImode:
15570 case E_V2DFmode:
15571 case E_V2DImode:
15572 case E_V4SFmode:
15573 case E_V4SImode:
15574 case E_V16SImode:
15575 case E_V8DImode:
15576 case E_V16SFmode:
15577 case E_V8DFmode:
15578 return ix86_vector_duplicate_value (mode, target, val);
15579
15580 case E_V4HImode:
15581 if (!mmx_ok)
15582 return false;
15583 if (TARGET_SSE || TARGET_3DNOW_A)
15584 {
15585 rtx x;
15586
15587 val = gen_lowpart (SImode, val);
15588 x = gen_rtx_TRUNCATE (HImode, val);
15589 x = gen_rtx_VEC_DUPLICATE (mode, x);
15590 emit_insn (gen_rtx_SET (target, x));
15591 return true;
15592 }
15593 goto widen;
15594
15595 case E_V2HImode:
15596 if (TARGET_SSE2)
15597 {
15598 rtx x;
15599
15600 val = gen_lowpart (SImode, val);
15601 x = gen_rtx_TRUNCATE (HImode, val);
15602 x = gen_rtx_VEC_DUPLICATE (mode, x);
15603 emit_insn (gen_rtx_SET (target, x));
15604 return true;
15605 }
15606 return false;
15607
15608 case E_V8QImode:
15609 case E_V4QImode:
15610 if (!mmx_ok)
15611 return false;
15612 goto widen;
15613
15614 case E_V8HImode:
15615 case E_V8HFmode:
15616 case E_V8BFmode:
15617 if (TARGET_AVX2)
15618 return ix86_vector_duplicate_value (mode, target, val);
15619
15620 if (TARGET_SSE2)
15621 {
15622 struct expand_vec_perm_d dperm;
15623 rtx tmp1, tmp2;
15624
15625 permute:
15626 memset (&dperm, 0, sizeof (dperm));
15627 dperm.target = target;
15628 dperm.vmode = mode;
15629 dperm.nelt = GET_MODE_NUNITS (mode);
15630 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
15631 dperm.one_operand_p = true;
15632
15633 if (mode == V8HFmode || mode == V8BFmode)
15634 {
15635 tmp1 = force_reg (GET_MODE_INNER (mode), val);
15636 tmp2 = gen_reg_rtx (mode);
15637 emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
15638 tmp1 = gen_lowpart (mode, tmp2);
15639 }
15640 else
15641 {
15642 /* Extend to SImode using a paradoxical SUBREG. */
15643 tmp1 = gen_reg_rtx (SImode);
15644 emit_move_insn (tmp1, gen_lowpart (SImode, val));
15645
15646 /* Insert the SImode value as
15647 low element of a V4SImode vector. */
15648 tmp2 = gen_reg_rtx (V4SImode);
15649 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
15650 tmp1 = gen_lowpart (mode, tmp2);
15651 }
15652
15653 emit_move_insn (dperm.op0, tmp1);
15654 ok = (expand_vec_perm_1 (&dperm)
15655 || expand_vec_perm_broadcast_1 (&dperm));
15656 gcc_assert (ok);
15657 return ok;
15658 }
15659 goto widen;
15660
15661 case E_V16QImode:
15662 if (TARGET_AVX2)
15663 return ix86_vector_duplicate_value (mode, target, val);
15664
15665 if (TARGET_SSE2)
15666 goto permute;
15667 goto widen;
15668
15669 widen:
15670 /* Replicate the value once into the next wider mode and recurse. */
15671 {
15672 machine_mode smode, wsmode, wvmode;
15673 rtx x;
15674
15675 smode = GET_MODE_INNER (mode);
15676 wvmode = get_mode_wider_vector (mode);
15677 wsmode = GET_MODE_INNER (wvmode);
15678
15679 val = convert_modes (wsmode, smode, val, true);
15680
15681 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15682 emit_insn (gen_insv_1 (wsmode, val, val));
15683 else
15684 {
15685 x = expand_simple_binop (wsmode, ASHIFT, val,
15686 GEN_INT (GET_MODE_BITSIZE (smode)),
15687 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15688 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15689 OPTAB_LIB_WIDEN);
15690 }
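/* For example, for V8QImode the QImode value has now been widened to an
   HImode scalar equal to (val | (val << 8)), and the recursion below
   continues with V4HImode.  */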
15691
15692 x = gen_reg_rtx (wvmode);
15693 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15694 gcc_assert (ok);
15695 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15696 return ok;
15697 }
15698
15699 case E_V16HImode:
15700 case E_V16HFmode:
15701 case E_V16BFmode:
15702 case E_V32QImode:
15703 if (TARGET_AVX2)
15704 return ix86_vector_duplicate_value (mode, target, val);
15705 else
15706 {
15707 machine_mode hvmode;
15708 switch (mode)
15709 {
15710 case E_V16HImode:
15711 hvmode = V8HImode;
15712 break;
15713 case E_V16HFmode:
15714 hvmode = V8HFmode;
15715 break;
15716 case E_V16BFmode:
15717 hvmode = V8BFmode;
15718 break;
15719 case E_V32QImode:
15720 hvmode = V16QImode;
15721 break;
15722 default:
15723 gcc_unreachable ();
15724 }
15725 rtx x = gen_reg_rtx (hvmode);
15726
15727 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15728 gcc_assert (ok);
15729
15730 x = gen_rtx_VEC_CONCAT (mode, x, x);
15731 emit_insn (gen_rtx_SET (target, x));
15732 }
15733 return true;
15734
15735 case E_V32HImode:
15736 case E_V32HFmode:
15737 case E_V32BFmode:
15738 case E_V64QImode:
15739 gcc_assert (TARGET_EVEX512);
15740 if (TARGET_AVX512BW)
15741 return ix86_vector_duplicate_value (mode, target, val);
15742 else
15743 {
15744 machine_mode hvmode;
15745 switch (mode)
15746 {
15747 case E_V32HImode:
15748 hvmode = V16HImode;
15749 break;
15750 case E_V32HFmode:
15751 hvmode = V16HFmode;
15752 break;
15753 case E_V32BFmode:
15754 hvmode = V16BFmode;
15755 break;
15756 case E_V64QImode:
15757 hvmode = V32QImode;
15758 break;
15759 default:
15760 gcc_unreachable ();
15761 }
15762 rtx x = gen_reg_rtx (hvmode);
15763
15764 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15765 gcc_assert (ok);
15766
15767 x = gen_rtx_VEC_CONCAT (mode, x, x);
15768 emit_insn (gen_rtx_SET (target, x));
15769 }
15770 return true;
15771
15772 default:
15773 return false;
15774 }
15775}
15776
15777/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15778 whose ONE_VAR element is VAR, and other elements are zero. Return true
15779 if successful. */
15780
15781static bool
15782ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15783 rtx target, rtx var, int one_var)
15784{
15785 machine_mode vsimode;
15786 rtx new_target;
15787 rtx x, tmp;
15788 bool use_vector_set = false;
15789 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15790
15791 if (GET_MODE_SIZE (mode) == 64 && !TARGET_EVEX512)
15792 return false;
15793
15794 switch (mode)
15795 {
15796 case E_V2DImode:
15797 /* For SSE4.1, we normally use vector set. But if the second
15798 element is zero and inter-unit moves are OK, we use movq
15799 instead. */
15800 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15801 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15802 && one_var == 0));
15803 break;
15804 case E_V16QImode:
15805 case E_V4SImode:
15806 case E_V4SFmode:
15807 use_vector_set = TARGET_SSE4_1;
15808 break;
15809 case E_V8HImode:
15810 use_vector_set = TARGET_SSE2;
15811 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15812 ? gen_vec_setv8hi_0 : NULL;
15813 break;
15814 case E_V8QImode:
15815 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15816 break;
15817 case E_V4HImode:
15818 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15819 break;
15820 case E_V4QImode:
15821 use_vector_set = TARGET_SSE4_1;
15822 break;
15823 case E_V32QImode:
15824 use_vector_set = TARGET_AVX;
15825 break;
15826 case E_V16HImode:
15827 use_vector_set = TARGET_AVX;
15828 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15829 ? gen_vec_setv16hi_0 : NULL;
15830 break;
15831 case E_V8SImode:
15832 use_vector_set = TARGET_AVX;
15833 gen_vec_set_0 = gen_vec_setv8si_0;
15834 break;
15835 case E_V8SFmode:
15836 use_vector_set = TARGET_AVX;
15837 gen_vec_set_0 = gen_vec_setv8sf_0;
15838 break;
15839 case E_V4DFmode:
15840 use_vector_set = TARGET_AVX;
15841 gen_vec_set_0 = gen_vec_setv4df_0;
15842 break;
15843 case E_V4DImode:
15844 /* Use ix86_expand_vector_set in 64bit mode only. */
15845 use_vector_set = TARGET_AVX && TARGET_64BIT;
15846 gen_vec_set_0 = gen_vec_setv4di_0;
15847 break;
15848 case E_V16SImode:
15849 use_vector_set = TARGET_AVX512F && one_var == 0;
15850 gen_vec_set_0 = gen_vec_setv16si_0;
15851 break;
15852 case E_V16SFmode:
15853 use_vector_set = TARGET_AVX512F && one_var == 0;
15854 gen_vec_set_0 = gen_vec_setv16sf_0;
15855 break;
15856 case E_V8DFmode:
15857 use_vector_set = TARGET_AVX512F && one_var == 0;
15858 gen_vec_set_0 = gen_vec_setv8df_0;
15859 break;
15860 case E_V8DImode:
15861 /* Use ix86_expand_vector_set in 64bit mode only. */
15862 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15863 gen_vec_set_0 = gen_vec_setv8di_0;
15864 break;
15865 case E_V8HFmode:
15866 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15867 gen_vec_set_0 = gen_vec_setv8hf_0;
15868 break;
15869 case E_V16HFmode:
15870 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15871 gen_vec_set_0 = gen_vec_setv16hf_0;
15872 break;
15873 case E_V32HFmode:
15874 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15875 gen_vec_set_0 = gen_vec_setv32hf_0;
15876 break;
15877 case E_V8BFmode:
15878 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15879 gen_vec_set_0 = gen_vec_setv8bf_0;
15880 break;
15881 case E_V16BFmode:
15882 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15883 gen_vec_set_0 = gen_vec_setv16bf_0;
15884 break;
15885 case E_V32BFmode:
15886 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15887 gen_vec_set_0 = gen_vec_setv32bf_0;
15888 break;
15889 case E_V32HImode:
15890 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15891 gen_vec_set_0 = gen_vec_setv32hi_0;
15892 default:
15893 break;
15894 }
15895
15896 if (use_vector_set)
15897 {
15898 if (gen_vec_set_0 && one_var == 0)
15899 {
15900 var = force_reg (GET_MODE_INNER (mode), var);
15901 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15902 return true;
15903 }
15904 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15905 var = force_reg (GET_MODE_INNER (mode), var);
15906 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15907 return true;
15908 }
15909
15910 switch (mode)
15911 {
15912 case E_V2SFmode:
15913 case E_V2SImode:
15914 if (!mmx_ok)
15915 return false;
15916 /* FALLTHRU */
15917
15918 case E_V2DFmode:
15919 case E_V2DImode:
15920 if (one_var != 0)
15921 return false;
15922 var = force_reg (GET_MODE_INNER (mode), var);
15923 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15924 emit_insn (gen_rtx_SET (target, x));
15925 return true;
15926
15927 case E_V4SFmode:
15928 case E_V4SImode:
15929 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15930 new_target = gen_reg_rtx (mode);
15931 else
15932 new_target = target;
15933 var = force_reg (GET_MODE_INNER (mode), var);
15934 x = gen_rtx_VEC_DUPLICATE (mode, var);
15935 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15936 emit_insn (gen_rtx_SET (new_target, x));
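/* The VEC_MERGE with mask 1 leaves VAR in element 0 and zeros in the
   remaining elements; if ONE_VAR is not 0, the value still has to be
   shuffled into its final position below.  */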
15937 if (one_var != 0)
15938 {
15939 /* We need to shuffle the value to the correct position, so
15940 create a new pseudo to store the intermediate result. */
15941
15942 /* With SSE2, we can use the integer shuffle insns. */
15943 if (mode != V4SFmode && TARGET_SSE2)
15944 {
15945 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15946 const1_rtx,
15947 GEN_INT (one_var == 1 ? 0 : 1),
15948 GEN_INT (one_var == 2 ? 0 : 1),
15949 GEN_INT (one_var == 3 ? 0 : 1)));
15950 if (target != new_target)
15951 emit_move_insn (target, new_target);
15952 return true;
15953 }
15954
15955 /* Otherwise convert the intermediate result to V4SFmode and
15956 use the SSE1 shuffle instructions. */
15957 if (mode != V4SFmode)
15958 {
15959 tmp = gen_reg_rtx (V4SFmode);
15960 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15961 }
15962 else
15963 tmp = new_target;
15964
15965 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15966 const1_rtx,
15967 GEN_INT (one_var == 1 ? 0 : 1),
15968 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15969 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15970
15971 if (mode != V4SFmode)
15972 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15973 else if (tmp != target)
15974 emit_move_insn (target, tmp);
15975 }
15976 else if (target != new_target)
15977 emit_move_insn (target, new_target);
15978 return true;
15979
15980 case E_V8HImode:
15981 case E_V16QImode:
15982 vsimode = V4SImode;
15983 goto widen;
15984 case E_V4HImode:
15985 case E_V8QImode:
15986 if (!mmx_ok)
15987 return false;
15988 vsimode = V2SImode;
15989 goto widen;
15990 widen:
15991 if (one_var != 0)
15992 return false;
15993
15994 /* Zero extend the variable element to SImode and recurse. */
15995 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15996
15997 x = gen_reg_rtx (vsimode);
15998 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15999 var, one_var))
16000 gcc_unreachable ();
16001
16002 emit_move_insn (target, gen_lowpart (mode, x));
16003 return true;
16004
16005 default:
16006 return false;
16007 }
16008}
16009
16010/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
16011 consisting of the values in VALS. It is known that all elements
16012 except ONE_VAR are constants. Return true if successful. */
16013
16014static bool
16015ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
16016 rtx target, rtx vals, int one_var)
16017{
16018 rtx var = XVECEXP (vals, 0, one_var);
16019 machine_mode wmode;
16020 rtx const_vec, x;
16021
16022 const_vec = copy_rtx (vals);
16023 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
16024 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
16025
16026 switch (mode)
16027 {
16028 case E_V2DFmode:
16029 case E_V2DImode:
16030 case E_V2SFmode:
16031 case E_V2SImode:
16032 /* For the two element vectors, it's just as easy to use
16033 the general case. */
16034 return false;
16035
16036 case E_V4DImode:
16037 /* Use ix86_expand_vector_set in 64bit mode only. */
16038 if (!TARGET_64BIT)
16039 return false;
16040 /* FALLTHRU */
16041 case E_V8HFmode:
16042 case E_V16HFmode:
16043 case E_V8BFmode:
16044 case E_V16BFmode:
16045 case E_V4DFmode:
16046 case E_V8SFmode:
16047 case E_V8SImode:
16048 case E_V16HImode:
16049 case E_V32QImode:
16050 case E_V4SFmode:
16051 case E_V4SImode:
16052 case E_V8HImode:
16053 case E_V4HImode:
16054 break;
16055
16056 case E_V16QImode:
16057 if (TARGET_SSE4_1)
16058 break;
16059 wmode = V8HImode;
16060 goto widen;
16061 case E_V8QImode:
16062 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
16063 break;
16064 wmode = V4HImode;
16065 goto widen;
16066 case E_V4QImode:
16067 if (TARGET_SSE4_1)
16068 break;
16069 wmode = V2HImode;
16070 widen:
16071 /* There's no way to set one QImode entry easily. Combine
16072 the variable value with its adjacent constant value, and
16073 promote to an HImode set. */
16074 x = XVECEXP (vals, 0, one_var ^ 1);
16075 if (one_var & 1)
16076 {
16077 var = convert_modes (HImode, QImode, var, true);
16078 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
16079 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16080 x = GEN_INT (INTVAL (x) & 0xff);
16081 }
16082 else
16083 {
16084 var = convert_modes (HImode, QImode, var, true);
16085 x = gen_int_mode (UINTVAL (x) << 8, HImode);
16086 }
16087 if (x != const0_rtx)
16088 var = expand_simple_binop (HImode, IOR, var, x, var,
16089 1, OPTAB_LIB_WIDEN);
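/* For example, for ONE_VAR == 3 in a V16QImode vector the variable byte
   is combined with constant byte 2 into one HImode value, and the
   problem becomes setting element 1 of the V8HImode vector built
   below.  */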
16090
16091 x = gen_reg_rtx (wmode);
16092 emit_move_insn (x, gen_lowpart (wmode, const_vec));
16093 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
16094
16095 emit_move_insn (target, gen_lowpart (mode, x));
16096 return true;
16097
16098 default:
16099 return false;
16100 }
16101
16102 emit_move_insn (target, const_vec);
16103 ix86_expand_vector_set (mmx_ok, target, var, one_var);
16104 return true;
16105}
16106
16107/* A subroutine of ix86_expand_vector_init_general. Use vector
16108 concatenate to handle the most general case: all values variable,
16109 and none identical. */
16110
16111static void
16112ix86_expand_vector_init_concat (machine_mode mode,
16113 rtx target, rtx *ops, int n)
16114{
16115 machine_mode half_mode = VOIDmode;
16116 rtx half[2];
16117 rtvec v;
16118 int i, j;
16119
16120 switch (n)
16121 {
16122 case 2:
16123 switch (mode)
16124 {
16125 case E_V32HFmode:
16126 half_mode = V16HFmode;
16127 break;
16128 case E_V32BFmode:
16129 half_mode = V16BFmode;
16130 break;
16131 case E_V16SImode:
16132 half_mode = V8SImode;
16133 break;
16134 case E_V16SFmode:
16135 half_mode = V8SFmode;
16136 break;
16137 case E_V8DImode:
16138 half_mode = V4DImode;
16139 break;
16140 case E_V8DFmode:
16141 half_mode = V4DFmode;
16142 break;
16143 case E_V16HFmode:
16144 half_mode = V8HFmode;
16145 break;
16146 case E_V16BFmode:
16147 half_mode = V8BFmode;
16148 break;
16149 case E_V8SImode:
16150 half_mode = V4SImode;
16151 break;
16152 case E_V8SFmode:
16153 half_mode = V4SFmode;
16154 break;
16155 case E_V4DImode:
16156 half_mode = V2DImode;
16157 break;
16158 case E_V4DFmode:
16159 half_mode = V2DFmode;
16160 break;
16161 case E_V4SImode:
16162 half_mode = V2SImode;
16163 break;
16164 case E_V4SFmode:
16165 half_mode = V2SFmode;
16166 break;
16167 case E_V2DImode:
16168 half_mode = DImode;
16169 break;
16170 case E_V2SImode:
16171 half_mode = SImode;
16172 break;
16173 case E_V2DFmode:
16174 half_mode = DFmode;
16175 break;
16176 case E_V2SFmode:
16177 half_mode = SFmode;
16178 break;
16179 default:
16180 gcc_unreachable ();
16181 }
16182
16183 if (!register_operand (ops[1], half_mode))
16184 ops[1] = force_reg (half_mode, ops[1]);
16185 if (!register_operand (ops[0], half_mode))
16186 ops[0] = force_reg (half_mode, ops[0]);
16187 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
16188 ops[1])));
16189 break;
16190
16191 case 4:
16192 switch (mode)
16193 {
16194 case E_V4DImode:
16195 half_mode = V2DImode;
16196 break;
16197 case E_V4DFmode:
16198 half_mode = V2DFmode;
16199 break;
16200 case E_V4SImode:
16201 half_mode = V2SImode;
16202 break;
16203 case E_V4SFmode:
16204 half_mode = V2SFmode;
16205 break;
16206 default:
16207 gcc_unreachable ();
16208 }
16209 goto half;
16210
16211 case 8:
16212 switch (mode)
16213 {
16214 case E_V8DImode:
16215 half_mode = V4DImode;
16216 break;
16217 case E_V8DFmode:
16218 half_mode = V4DFmode;
16219 break;
16220 case E_V8SImode:
16221 half_mode = V4SImode;
16222 break;
16223 case E_V8SFmode:
16224 half_mode = V4SFmode;
16225 break;
16226 default:
16227 gcc_unreachable ();
16228 }
16229 goto half;
16230
16231 case 16:
16232 switch (mode)
16233 {
16234 case E_V16SImode:
16235 half_mode = V8SImode;
16236 break;
16237 case E_V16SFmode:
16238 half_mode = V8SFmode;
16239 break;
16240 default:
16241 gcc_unreachable ();
16242 }
16243 goto half;
16244
16245half:
16246 /* FIXME: We process inputs backward to help RA. PR 36222. */
16247 i = n - 1;
16248 for (j = 1; j != -1; j--)
16249 {
16250 half[j] = gen_reg_rtx (half_mode);
16251 switch (n >> 1)
16252 {
16253 case 2:
16254 v = gen_rtvec (2, ops[i-1], ops[i]);
16255 i -= 2;
16256 break;
16257 case 4:
16258 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
16259 i -= 4;
16260 break;
16261 case 8:
16262 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
16263 ops[i-3], ops[i-2], ops[i-1], ops[i]);
16264 i -= 8;
16265 break;
16266 default:
16267 gcc_unreachable ();
16268 }
16269 ix86_expand_vector_init (false, half[j],
16270 gen_rtx_PARALLEL (half_mode, v));
16271 }
16272
16273 ix86_expand_vector_init_concat (mode, target, half, 2);
16274 break;
16275
16276 default:
16277 gcc_unreachable ();
16278 }
16279}
16280
16281/* A subroutine of ix86_expand_vector_init_general. Use vector
16282 interleave to handle the most general case: all values variable,
16283 and none identical. */
16284
16285static void
16286ix86_expand_vector_init_interleave (machine_mode mode,
16287 rtx target, rtx *ops, int n)
16288{
16289 machine_mode first_imode, second_imode, third_imode, inner_mode;
16290 int i, j;
16291 rtx op, op0, op1;
16292 rtx (*gen_load_even) (rtx, rtx, rtx);
16293 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
16294 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
16295
16296 switch (mode)
16297 {
16298 case E_V8HFmode:
16299 gen_load_even = gen_vec_interleave_lowv8hf;
16300 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16301 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16302 inner_mode = HFmode;
16303 first_imode = V4SImode;
16304 second_imode = V2DImode;
16305 third_imode = VOIDmode;
16306 break;
16307 case E_V8BFmode:
16308 gen_load_even = gen_vec_interleave_lowv8bf;
16309 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16310 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16311 inner_mode = BFmode;
16312 first_imode = V4SImode;
16313 second_imode = V2DImode;
16314 third_imode = VOIDmode;
16315 break;
16316 case E_V8HImode:
16317 gen_load_even = gen_vec_setv8hi;
16318 gen_interleave_first_low = gen_vec_interleave_lowv4si;
16319 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16320 inner_mode = HImode;
16321 first_imode = V4SImode;
16322 second_imode = V2DImode;
16323 third_imode = VOIDmode;
16324 break;
16325 case E_V16QImode:
16326 gen_load_even = gen_vec_setv16qi;
16327 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
16328 gen_interleave_second_low = gen_vec_interleave_lowv4si;
16329 inner_mode = QImode;
16330 first_imode = V8HImode;
16331 second_imode = V4SImode;
16332 third_imode = V2DImode;
16333 break;
16334 default:
16335 gcc_unreachable ();
16336 }
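/* For example, for V16QImode each (even, odd) input pair is first placed
   into elements 0 and 1 of its own vector; those vectors are then
   interleaved as V8HImode, V4SImode and finally V2DImode values to
   assemble the result.  */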
16337
16338 for (i = 0; i < n; i++)
16339 {
16340 op = ops [i + i];
16341 if (inner_mode == HFmode || inner_mode == BFmode)
16342 {
16343 rtx even, odd;
16344 /* Use vpunpcklwd to pack 2 HFmode or BFmode values. */
16345 machine_mode vec_mode =
16346 (inner_mode == HFmode) ? V8HFmode : V8BFmode;
16347 op0 = gen_reg_rtx (vec_mode);
16348 even = lowpart_subreg (vec_mode,
16349 force_reg (inner_mode, op), inner_mode);
16350 odd = lowpart_subreg (vec_mode,
16351 force_reg (inner_mode, ops[i + i + 1]),
16352 inner_mode);
16353 emit_insn (gen_load_even (op0, even, odd));
16354 }
16355 else
16356 {
16357 /* Extend the odd element to SImode using a paradoxical SUBREG. */
16358 op0 = gen_reg_rtx (SImode);
16359 emit_move_insn (op0, gen_lowpart (SImode, op));
16360
16361 /* Insert the SImode value as low element of V4SImode vector. */
16362 op1 = gen_reg_rtx (V4SImode);
16363 op0 = gen_rtx_VEC_MERGE (V4SImode,
16364 gen_rtx_VEC_DUPLICATE (V4SImode,
16365 op0),
16366 CONST0_RTX (V4SImode),
16367 const1_rtx);
16368 emit_insn (gen_rtx_SET (op1, op0));
16369
16370 /* Cast the V4SImode vector back to a vector in the original mode. */
16371 op0 = gen_reg_rtx (mode);
16372 emit_move_insn (op0, gen_lowpart (mode, op1));
16373
16374 /* Load even elements into the second position. */
16375 emit_insn (gen_load_even (op0,
16376 force_reg (inner_mode,
16377 ops[i + i + 1]),
16378 const1_rtx));
16379 }
16380
16381 /* Cast vector to FIRST_IMODE vector. */
16382 ops[i] = gen_reg_rtx (first_imode);
16383 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
16384 }
16385
16386 /* Interleave low FIRST_IMODE vectors. */
16387 for (i = j = 0; i < n; i += 2, j++)
16388 {
16389 op0 = gen_reg_rtx (first_imode);
16390 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
16391
16392 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
16393 ops[j] = gen_reg_rtx (second_imode);
16394 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
16395 }
16396
16397 /* Interleave low SECOND_IMODE vectors. */
16398 switch (second_imode)
16399 {
16400 case E_V4SImode:
16401 for (i = j = 0; i < n / 2; i += 2, j++)
16402 {
16403 op0 = gen_reg_rtx (second_imode);
16404 emit_insn (gen_interleave_second_low (op0, ops[i],
16405 ops[i + 1]));
16406
16407 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
16408 vector. */
16409 ops[j] = gen_reg_rtx (third_imode);
16410 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
16411 }
16412 second_imode = V2DImode;
16413 gen_interleave_second_low = gen_vec_interleave_lowv2di;
16414 /* FALLTHRU */
16415
16416 case E_V2DImode:
16417 op0 = gen_reg_rtx (second_imode);
16418 emit_insn (gen_interleave_second_low (op0, ops[0],
16419 ops[1]));
16420
16421 /* Cast the SECOND_IMODE vector back to a vector in the original
16422 mode. */
16423 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
16424 break;
16425
16426 default:
16427 gcc_unreachable ();
16428 }
16429}
16430
16431/* A subroutine of ix86_expand_vector_init. Handle the most general case:
16432 all values variable, and none identical. */
16433
16434static void
16435ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
16436 rtx target, rtx vals)
16437{
16438 rtx ops[64], op0, op1, op2, op3, op4, op5;
16439 machine_mode half_mode = VOIDmode;
16440 machine_mode quarter_mode = VOIDmode;
16441 int n, i;
16442
16443 switch (mode)
16444 {
16445 case E_V2SFmode:
16446 case E_V2SImode:
16447 if (!mmx_ok && !TARGET_SSE)
16448 break;
16449 /* FALLTHRU */
16450
16451 case E_V16SImode:
16452 case E_V16SFmode:
16453 case E_V8DFmode:
16454 case E_V8DImode:
16455 case E_V8SFmode:
16456 case E_V8SImode:
16457 case E_V4DFmode:
16458 case E_V4DImode:
16459 case E_V4SFmode:
16460 case E_V4SImode:
16461 case E_V2DFmode:
16462 case E_V2DImode:
16463 n = GET_MODE_NUNITS (mode);
16464 for (i = 0; i < n; i++)
16465 ops[i] = XVECEXP (vals, 0, i);
16466 ix86_expand_vector_init_concat (mode, target, ops, n);
16467 return;
16468
16469 case E_V2TImode:
16470 for (i = 0; i < 2; i++)
16471 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16472 op0 = gen_reg_rtx (V4DImode);
16473 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
16474 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16475 return;
16476
16477 case E_V4TImode:
16478 for (i = 0; i < 4; i++)
16479 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
16480 ops[4] = gen_reg_rtx (V4DImode);
16481 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
16482 ops[5] = gen_reg_rtx (V4DImode);
16483 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
16484 op0 = gen_reg_rtx (V8DImode);
16485 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
16486 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
16487 return;
16488
16489 case E_V32QImode:
16490 half_mode = V16QImode;
16491 goto half;
16492
16493 case E_V16HImode:
16494 half_mode = V8HImode;
16495 goto half;
16496
16497 case E_V16HFmode:
16498 half_mode = V8HFmode;
16499 goto half;
16500
16501 case E_V16BFmode:
16502 half_mode = V8BFmode;
16503 goto half;
16504
16505half:
16506 n = GET_MODE_NUNITS (mode);
16507 for (i = 0; i < n; i++)
16508 ops[i] = XVECEXP (vals, 0, i);
16509 op0 = gen_reg_rtx (half_mode);
16510 op1 = gen_reg_rtx (half_mode);
16511 ix86_expand_vector_init_interleave (half_mode, op0, ops,
16512 n >> 2);
16513 ix86_expand_vector_init_interleave (half_mode, op1,
16514 &ops [n >> 1], n >> 2);
16515 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
16516 return;
16517
16518 case E_V64QImode:
16519 quarter_mode = V16QImode;
16520 half_mode = V32QImode;
16521 goto quarter;
16522
16523 case E_V32HImode:
16524 quarter_mode = V8HImode;
16525 half_mode = V16HImode;
16526 goto quarter;
16527
16528 case E_V32HFmode:
16529 quarter_mode = V8HFmode;
16530 half_mode = V16HFmode;
16531 goto quarter;
16532
16533 case E_V32BFmode:
16534 quarter_mode = V8BFmode;
16535 half_mode = V16BFmode;
16536 goto quarter;
16537
16538quarter:
16539 n = GET_MODE_NUNITS (mode);
16540 for (i = 0; i < n; i++)
16541 ops[i] = XVECEXP (vals, 0, i);
16542 op0 = gen_reg_rtx (quarter_mode);
16543 op1 = gen_reg_rtx (quarter_mode);
16544 op2 = gen_reg_rtx (quarter_mode);
16545 op3 = gen_reg_rtx (quarter_mode);
16546 op4 = gen_reg_rtx (half_mode);
16547 op5 = gen_reg_rtx (half_mode);
16548 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
16549 n >> 3);
16550 ix86_expand_vector_init_interleave (quarter_mode, op1,
16551 &ops [n >> 2], n >> 3);
16552 ix86_expand_vector_init_interleave (quarter_mode, op2,
16553 &ops [n >> 1], n >> 3);
16554 ix86_expand_vector_init_interleave (quarter_mode, op3,
16555 &ops [(n >> 1) | (n >> 2)], n >> 3);
16556 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
16557 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
16558 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
16559 return;
16560
16561 case E_V16QImode:
16562 if (!TARGET_SSE4_1)
16563 break;
16564 /* FALLTHRU */
16565
16566 case E_V8HImode:
16567 if (!TARGET_SSE2)
16568 break;
16569
16570 /* Don't use ix86_expand_vector_init_interleave if we can't
16571 move from GPR to SSE register directly. */
16572 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
16573 break;
16574 /* FALLTHRU */
16575
16576 case E_V8HFmode:
16577 case E_V8BFmode:
16578
16579 n = GET_MODE_NUNITS (mode);
16580 for (i = 0; i < n; i++)
16581 ops[i] = XVECEXP (vals, 0, i);
16582 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
16583 return;
16584
16585 case E_V4HImode:
16586 case E_V8QImode:
16587
16588 case E_V2HImode:
16589 case E_V4QImode:
16590 break;
16591
16592 default:
16593 gcc_unreachable ();
16594 }
16595
16596 {
16597 int i, j, n_elts, n_words, n_elt_per_word;
16598 machine_mode tmp_mode, inner_mode;
16599 rtx words[4], shift;
16600
16601 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
16602
16603 inner_mode = GET_MODE_INNER (mode);
16604 n_elts = GET_MODE_NUNITS (mode);
16605 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
16606 n_elt_per_word = n_elts / n_words;
16607 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
16608
16609 for (i = 0; i < n_words; ++i)
16610 {
16611 rtx word = NULL_RTX;
16612
16613 for (j = 0; j < n_elt_per_word; ++j)
16614 {
16615 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
16616 elt = convert_modes (tmp_mode, inner_mode, elt, true);
16617
16618 if (j == 0)
16619 word = elt;
16620 else
16621 {
16622 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
16623 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16624 word = expand_simple_binop (tmp_mode, IOR, word, elt,
16625 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16626 }
16627 }
16628
16629 words[i] = word;
16630 }
16631
16632 if (n_words == 1)
16633 emit_move_insn (target, gen_lowpart (mode, words[0]));
16634 else if (n_words == 2)
16635 {
16636 gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
16637 machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
16638 rtx tmp = gen_reg_rtx (concat_mode);
16639 vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
16640 ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
16641 emit_move_insn (target, gen_lowpart (mode, tmp));
16642 }
16643 else if (n_words == 4)
16644 {
16645 rtx tmp = gen_reg_rtx (V4SImode);
16646 gcc_assert (tmp_mode == SImode);
16647 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
16648 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
16649 emit_move_insn (target, gen_lowpart (mode, tmp));
16650 }
16651 else
16652 gcc_unreachable ();
16653 }
16654}
16655
16656/* Initialize vector TARGET via VALS. Suppress the use of MMX
16657 instructions unless MMX_OK is true. */
16658
16659void
16660ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
16661{
16662 machine_mode mode = GET_MODE (target);
16663 machine_mode inner_mode = GET_MODE_INNER (mode);
16664 int n_elts = GET_MODE_NUNITS (mode);
16665 int n_var = 0, one_var = -1;
16666 bool all_same = true, all_const_zero = true;
16667 int i;
16668 rtx x;
16669
16670 /* Handle first initialization from vector elts. */
16671 if (n_elts != XVECLEN (vals, 0))
16672 {
16673 rtx subtarget = target;
16674 x = XVECEXP (vals, 0, 0);
16675 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
16676 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
16677 {
16678 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
16679 if (inner_mode == QImode
16680 || inner_mode == HImode
16681 || inner_mode == TImode
16682 || inner_mode == HFmode
16683 || inner_mode == BFmode)
16684 {
16685 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
16686 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
16687 n_bits /= GET_MODE_SIZE (elt_mode);
16688 mode = mode_for_vector (elt_mode, n_bits).require ();
16689 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
16690 ops[0] = gen_lowpart (inner_mode, ops[0]);
16691 ops[1] = gen_lowpart (inner_mode, ops[1]);
16692 subtarget = gen_reg_rtx (mode);
16693 }
16694 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
16695 if (subtarget != target)
16696 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
16697 return;
16698 }
16699 gcc_unreachable ();
16700 }
16701
16702 for (i = 0; i < n_elts; ++i)
16703 {
16704 x = XVECEXP (vals, 0, i);
16705 if (!(CONST_SCALAR_INT_P (x)
16706 || CONST_DOUBLE_P (x)
16707 || CONST_FIXED_P (x)))
16708 n_var++, one_var = i;
16709 else if (x != CONST0_RTX (inner_mode))
16710 all_const_zero = false;
16711 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
16712 all_same = false;
16713 }
16714
16715 /* Constants are best loaded from the constant pool. */
16716 if (n_var == 0)
16717 {
16718 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
16719 return;
16720 }
16721
16722 /* If all values are identical, broadcast the value. */
16723 if (all_same
16724 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
16725 XVECEXP (vals, 0, 0)))
16726 return;
16727
16728 /* Values where only one field is non-constant are best loaded from
16729 the pool and then overwritten with a move. */
16730 if (n_var == 1)
16731 {
16732 if (all_const_zero
16733 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
16734 XVECEXP (vals, 0, one_var),
16735 one_var))
16736 return;
16737
16738 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
16739 return;
16740 }
16741
16742 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
16743}
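
/* Summary of the strategy above: a fully constant vector is loaded from the
   constant pool; a uniform vector is broadcast; a vector with exactly one
   variable element is built from a constant (or zero) vector plus a single
   element insertion; everything else falls through to the general
   word-at-a-time / concat expansion.  */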
16744
16745/* Implemented as
16746 V setg (V v, int idx, T val)
16747 {
16748 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16749 V valv = (V){val, val, val, val, val, val, val, val};
16750 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16751 v = (v & ~mask) | (valv & mask);
16752 return v;
16753 }. */
16754void
16755ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
16756{
16757 rtx vec[64];
16758 machine_mode mode = GET_MODE (target);
16759 machine_mode cmp_mode = mode;
16760 int n_elts = GET_MODE_NUNITS (mode);
16761 rtx valv, idxv, constv, idx_tmp;
16762 bool ok = false;
16763
16764 /* 512-bit vector byte/word broadcast and comparison are only available
16765 with TARGET_AVX512BW; without it, break the 512-bit vector into two
16766 256-bit halves. */
16767 if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
16768 || mode == V64QImode)
16769 && !TARGET_AVX512BW)
16770 {
16771 gcc_assert (TARGET_AVX512F);
16772 rtx vhi, vlo, idx_hi;
16773 machine_mode half_mode;
16774 rtx (*extract_hi)(rtx, rtx);
16775 rtx (*extract_lo)(rtx, rtx);
16776
16777 if (mode == V32HImode)
16778 {
16779 half_mode = V16HImode;
16780 extract_hi = gen_vec_extract_hi_v32hi;
16781 extract_lo = gen_vec_extract_lo_v32hi;
16782 }
16783 else if (mode == V32HFmode)
16784 {
16785 half_mode = V16HFmode;
16786 extract_hi = gen_vec_extract_hi_v32hf;
16787 extract_lo = gen_vec_extract_lo_v32hf;
16788 }
16789 else if (mode == V32BFmode)
16790 {
16791 half_mode = V16BFmode;
16792 extract_hi = gen_vec_extract_hi_v32bf;
16793 extract_lo = gen_vec_extract_lo_v32bf;
16794 }
16795 else
16796 {
16797 half_mode = V32QImode;
16798 extract_hi = gen_vec_extract_hi_v64qi;
16799 extract_lo = gen_vec_extract_lo_v64qi;
16800 }
16801
16802 vhi = gen_reg_rtx (half_mode);
16803 vlo = gen_reg_rtx (half_mode);
16804 idx_hi = gen_reg_rtx (GET_MODE (idx));
16805 emit_insn (extract_hi (vhi, target));
16806 emit_insn (extract_lo (vlo, target));
16807 vec[0] = idx_hi;
16808 vec[1] = idx;
16809 vec[2] = GEN_INT (n_elts/2);
16810 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
16811 ix86_expand_vector_set_var (vhi, val, idx_hi);
16812 ix86_expand_vector_set_var (vlo, val, idx);
16813 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16814 return;
16815 }
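
 /* Note on the split above: idx - n_elts/2 is used for the high half while
    the low half keeps idx.  The half that does not contain the index sees
    an out-of-range (wrapped) value, so its broadcast/compare produces an
    all-zero mask and that half is left unchanged, which makes the
    unconditional update of both halves safe.  */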
16816
16817 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16818 {
16819 switch (mode)
16820 {
16821 case E_V2DFmode:
16822 cmp_mode = V2DImode;
16823 break;
16824 case E_V4DFmode:
16825 cmp_mode = V4DImode;
16826 break;
16827 case E_V8DFmode:
16828 cmp_mode = V8DImode;
16829 break;
16830 case E_V2SFmode:
16831 cmp_mode = V2SImode;
16832 break;
16833 case E_V4SFmode:
16834 cmp_mode = V4SImode;
16835 break;
16836 case E_V8SFmode:
16837 cmp_mode = V8SImode;
16838 break;
16839 case E_V16SFmode:
16840 cmp_mode = V16SImode;
16841 break;
16842 case E_V8HFmode:
16843 cmp_mode = V8HImode;
16844 break;
16845 case E_V16HFmode:
16846 cmp_mode = V16HImode;
16847 break;
16848 case E_V32HFmode:
16849 cmp_mode = V32HImode;
16850 break;
16851 case E_V8BFmode:
16852 cmp_mode = V8HImode;
16853 break;
16854 case E_V16BFmode:
16855 cmp_mode = V16HImode;
16856 break;
16857 case E_V32BFmode:
16858 cmp_mode = V32HImode;
16859 break;
16860 default:
16861 gcc_unreachable ();
16862 }
16863 }
16864
16865 for (int i = 0; i != n_elts; i++)
16866 vec[i] = GEN_INT (i);
16867 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16868 valv = gen_reg_rtx (mode);
16869 idxv = gen_reg_rtx (cmp_mode);
16870 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16871
16872 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16873 mode, valv, val);
16874 gcc_assert (ok);
16875 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16876 cmp_mode, idxv, idx_tmp);
16877 gcc_assert (ok);
16878 vec[0] = target;
16879 vec[1] = valv;
16880 vec[2] = target;
16881 vec[3] = gen_rtx_EQ (mode, idxv, constv);
16882 vec[4] = idxv;
16883 vec[5] = constv;
16884 ok = ix86_expand_int_vcond (vec);
16885 gcc_assert (ok);
16886}
16887
16888void
16889ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16890{
16891 machine_mode mode = GET_MODE (target);
16892 machine_mode inner_mode = GET_MODE_INNER (mode);
16893 machine_mode half_mode;
16894 bool use_vec_merge = false;
16895 bool blendm_const = false;
16896 rtx tmp;
16897 static rtx (*gen_extract[8][2]) (rtx, rtx)
16898 = {
16899 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16900 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16901 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16902 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16903 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
16904 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
16905 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
16906 { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
16907 };
16908 static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
16909 = {
16910 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16911 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16912 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16913 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16914 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
16915 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16916 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
16917 { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
16918 };
16919 int i, j, n;
16920 machine_mode mmode = VOIDmode;
16921 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16922
16923 switch (mode)
16924 {
16925 case E_V2SImode:
16926 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16927 if (use_vec_merge)
16928 break;
16929 /* FALLTHRU */
16930
16931 case E_V2SFmode:
16932 if (mmx_ok)
16933 {
16934 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16935 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16936 if (elt == 0)
16937 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16938 else
16939 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16940 emit_insn (gen_rtx_SET (target, tmp));
16941 return;
16942 }
16943 break;
16944
16945 case E_V2DImode:
16946 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16947 if (use_vec_merge)
16948 break;
16949
16950 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16951 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16952 if (elt == 0)
16953 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16954 else
16955 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16956 emit_insn (gen_rtx_SET (target, tmp));
16957 return;
16958
16959 case E_V2DFmode:
16960 /* NB: For ELT == 0, use standard scalar operation patterns which
16961 preserve the rest of the vector for combiner:
16962
16963 (vec_merge:V2DF
16964 (vec_duplicate:V2DF (reg:DF))
16965 (reg:V2DF)
16966 (const_int 1))
16967 */
16968 if (elt == 0)
16969 goto do_vec_merge;
16970
16971 {
16972 rtx op0, op1;
16973
16974 /* For the two element vectors, we implement a VEC_CONCAT with
16975 the extraction of the other element. */
16976
16977 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16978 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16979
16980 if (elt == 0)
16981 op0 = val, op1 = tmp;
16982 else
16983 op0 = tmp, op1 = val;
16984
16985 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16986 emit_insn (gen_rtx_SET (target, tmp));
16987 }
16988 return;
16989
16990 case E_V4SFmode:
16991 use_vec_merge = TARGET_SSE4_1;
16992 if (use_vec_merge)
16993 break;
16994
16995 switch (elt)
16996 {
16997 case 0:
16998 use_vec_merge = true;
16999 break;
17000
17001 case 1:
17002 /* tmp = target = A B C D */
17003 tmp = copy_to_reg (target);
17004 /* target = A A B B */
17005 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
17006 /* target = X A B B */
17007 ix86_expand_vector_set (false, target, val, 0);
17008 /* target = A X C D */
17009 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
17010 const1_rtx, const0_rtx,
17011 GEN_INT (2+4), GEN_INT (3+4)));
17012 return;
17013
17014 case 2:
17015 /* tmp = target = A B C D */
17016 tmp = copy_to_reg (target);
17017 /* tmp = X B C D */
17018 ix86_expand_vector_set (false, tmp, val, 0);
17019 /* target = A B X D */
17020 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
17021 const0_rtx, const1_rtx,
17022 GEN_INT (0+4), GEN_INT (3+4)));
17023 return;
17024
17025 case 3:
17026 /* tmp = target = A B C D */
17027 tmp = copy_to_reg (target);
17028 /* tmp = X B C D */
17029 ix86_expand_vector_set (false, tmp, val, 0);
17030 /* target = A B C X */
17031 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
17032 const0_rtx, const1_rtx,
17033 GEN_INT (2+4), GEN_INT (0+4)));
17034 return;
17035
17036 default:
17037 gcc_unreachable ();
17038 }
17039 break;
17040
17041 case E_V4SImode:
17042 use_vec_merge = TARGET_SSE4_1;
17043 if (use_vec_merge)
17044 break;
17045
17046 /* Element 0 handled by vec_merge below. */
17047 if (elt == 0)
17048 {
17049 use_vec_merge = true;
17050 break;
17051 }
17052
17053 if (TARGET_SSE2)
17054 {
17055 /* With SSE2, use integer shuffles to swap element 0 and ELT,
17056 store into element 0, then shuffle them back. */
17057
17058 rtx order[4];
17059
17060 order[0] = GEN_INT (elt);
17061 order[1] = const1_rtx;
17062 order[2] = const2_rtx;
17063 order[3] = GEN_INT (3);
17064 order[elt] = const0_rtx;
17065
17066 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
17067 order[1], order[2], order[3]));
17068
17069 ix86_expand_vector_set (false, target, val, 0);
17070
17071 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
17072 order[1], order[2], order[3]));
17073 }
17074 else
17075 {
17076 /* For SSE1, we have to reuse the V4SF code. */
17077 rtx t = gen_reg_rtx (V4SFmode);
17078 emit_move_insn (t, gen_lowpart (V4SFmode, target));
17079 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
17080 emit_move_insn (target, gen_lowpart (mode, t));
17081 }
17082 return;
17083
17084 case E_V8HImode:
17085 case E_V8HFmode:
17086 case E_V8BFmode:
17087 case E_V2HImode:
17088 use_vec_merge = TARGET_SSE2;
17089 break;
17090 case E_V4HImode:
17091 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
17092 break;
17093
17094 case E_V16QImode:
17095 case E_V4QImode:
17096 use_vec_merge = TARGET_SSE4_1;
17097 break;
17098
17099 case E_V8QImode:
17100 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17101 break;
17102
17103 case E_V32QImode:
17104 half_mode = V16QImode;
17105 j = 0;
17106 n = 16;
17107 goto half;
17108
17109 case E_V16HFmode:
17110 case E_V16BFmode:
17111 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
17112 if (TARGET_AVX2 && elt != 0)
17113 {
17114 mmode = SImode;
17115 gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
17116 : gen_avx2_pblendbf_1);
17117 blendm_const = true;
17118 break;
17119 }
17120 else
17121 {
17122 half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
17123 j = ((mode == E_V16HFmode) ? 6 : 7);
17124 n = 8;
17125 goto half;
17126 }
17127
17128 case E_V16HImode:
17129 half_mode = V8HImode;
17130 j = 1;
17131 n = 8;
17132 goto half;
17133
17134 case E_V8SImode:
17135 half_mode = V4SImode;
17136 j = 2;
17137 n = 4;
17138 goto half;
17139
17140 case E_V4DImode:
17141 half_mode = V2DImode;
17142 j = 3;
17143 n = 2;
17144 goto half;
17145
17146 case E_V8SFmode:
17147 half_mode = V4SFmode;
17148 j = 4;
17149 n = 4;
17150 goto half;
17151
17152 case E_V4DFmode:
17153 half_mode = V2DFmode;
17154 j = 5;
17155 n = 2;
17156 goto half;
17157
17158half:
17159 /* Compute offset. */
17160 i = elt / n;
17161 elt %= n;
17162
17163 gcc_assert (i <= 1);
17164
17165 /* Extract the half. */
17166 tmp = gen_reg_rtx (half_mode);
17167 emit_insn (gen_extract[j][i] (tmp, target));
17168
17169 /* Put val in tmp at elt. */
17170 ix86_expand_vector_set (false, tmp, val, elt);
17171
17172 /* Put it back. */
17173 emit_insn (gen_insert[j][i] (target, target, tmp));
17174 return;
17175
17176 case E_V8DFmode:
17177 if (TARGET_AVX512F)
17178 {
17179 mmode = QImode;
17180 gen_blendm = gen_avx512f_blendmv8df;
17181 }
17182 break;
17183
17184 case E_V8DImode:
17185 if (TARGET_AVX512F)
17186 {
17187 mmode = QImode;
17188 gen_blendm = gen_avx512f_blendmv8di;
17189 }
17190 break;
17191
17192 case E_V16SFmode:
17193 if (TARGET_AVX512F)
17194 {
17195 mmode = HImode;
17196 gen_blendm = gen_avx512f_blendmv16sf;
17197 }
17198 break;
17199
17200 case E_V16SImode:
17201 if (TARGET_AVX512F)
17202 {
17203 mmode = HImode;
17204 gen_blendm = gen_avx512f_blendmv16si;
17205 }
17206 break;
17207
17208 case E_V32HFmode:
17209 if (TARGET_AVX512BW)
17210 {
17211 mmode = SImode;
17212 gen_blendm = gen_avx512bw_blendmv32hf;
17213 }
17214 break;
17215 case E_V32BFmode:
17216 if (TARGET_AVX512BW)
17217 {
17218 mmode = SImode;
17219 gen_blendm = gen_avx512bw_blendmv32bf;
17220 }
17221 break;
17222 case E_V32HImode:
17223 if (TARGET_AVX512BW)
17224 {
17225 mmode = SImode;
17226 gen_blendm = gen_avx512bw_blendmv32hi;
17227 }
17228 else if (TARGET_AVX512F)
17229 {
17230 half_mode = E_V8HImode;
17231 n = 8;
17232 goto quarter;
17233 }
17234 break;
17235
17236 case E_V64QImode:
17237 if (TARGET_AVX512BW)
17238 {
17239 mmode = DImode;
17240 gen_blendm = gen_avx512bw_blendmv64qi;
17241 }
17242 else if (TARGET_AVX512F)
17243 {
17244 half_mode = E_V16QImode;
17245 n = 16;
17246 goto quarter;
17247 }
17248 break;
17249
17250quarter:
17251 /* Compute offset. */
17252 i = elt / n;
17253 elt %= n;
17254
17255 gcc_assert (i <= 3);
17256
17257 {
17258 /* Extract the quarter. */
17259 tmp = gen_reg_rtx (V4SImode);
17260 rtx tmp2 = gen_lowpart (V16SImode, target);
17261 rtx mask = gen_reg_rtx (QImode);
17262
17263 emit_move_insn (mask, constm1_rtx);
17264 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
17265 tmp, mask));
17266
17267 tmp2 = gen_reg_rtx (half_mode);
17268 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
17269 tmp = tmp2;
17270
17271 /* Put val in tmp at elt. */
17272 ix86_expand_vector_set (false, tmp, val, elt);
17273
17274 /* Put it back. */
17275 tmp2 = gen_reg_rtx (V16SImode);
17276 rtx tmp3 = gen_lowpart (V16SImode, target);
17277 mask = gen_reg_rtx (HImode);
17278 emit_move_insn (mask, constm1_rtx);
17279 tmp = gen_lowpart (V4SImode, tmp);
17280 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
17281 tmp3, mask));
17282 emit_move_insn (target, gen_lowpart (mode, tmp2));
17283 }
17284 return;
17285
17286 default:
17287 break;
17288 }
17289
17290 if (mmode != VOIDmode)
17291 {
17292 tmp = gen_reg_rtx (mode);
17293 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
17294 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
17295 /* The avx512*_blendm<mode> expanders have different operand order
17296 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
17297 elements where the mask is set and second input operand otherwise,
17298 in {sse,avx}*_*blend* the first input operand is used for elements
17299 where the mask is clear and second input operand otherwise. */
17300 if (!blendm_const)
17301 merge_mask = force_reg (mmode, merge_mask);
17302 emit_insn (gen_blendm (target, target, tmp, merge_mask));
17303 }
17304 else if (use_vec_merge)
17305 {
17306do_vec_merge:
17307 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
17308 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
17309 GEN_INT (HOST_WIDE_INT_1U << elt));
17310 emit_insn (gen_rtx_SET (target, tmp));
17311 }
17312 else
17313 {
17314 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17315
17316 emit_move_insn (mem, target);
17317
17318 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
17319 emit_move_insn (tmp, val);
17320
17321 emit_move_insn (target, mem);
17322 }
17323}
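
/* All of the cases above implement, in C terms, target[elt] = val for a
   constant ELT: directly with a vec_merge or blend, via shuffles, by
   recursing on a 128-bit half or quarter of the vector, or, as a last
   resort, through a stack temporary.  */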
17324
17325void
17326ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
17327{
17328 machine_mode mode = GET_MODE (vec);
17329 machine_mode inner_mode = GET_MODE_INNER (mode);
17330 bool use_vec_extr = false;
17331 rtx tmp;
17332
17333 switch (mode)
17334 {
17335 case E_V2SImode:
17336 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17337 if (use_vec_extr)
17338 break;
17339 /* FALLTHRU */
17340
17341 case E_V2SFmode:
17342 if (!mmx_ok)
17343 break;
17344 /* FALLTHRU */
17345
17346 case E_V2DFmode:
17347 case E_V2DImode:
17348 case E_V2TImode:
17349 case E_V4TImode:
17350 use_vec_extr = true;
17351 break;
17352
17353 case E_V4SFmode:
17354 use_vec_extr = TARGET_SSE4_1;
17355 if (use_vec_extr)
17356 break;
17357
17358 switch (elt)
17359 {
17360 case 0:
17361 tmp = vec;
17362 break;
17363
17364 case 1:
17365 case 3:
17366 tmp = gen_reg_rtx (mode);
17367 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
17368 GEN_INT (elt), GEN_INT (elt),
17369 GEN_INT (elt+4), GEN_INT (elt+4)));
17370 break;
17371
17372 case 2:
17373 tmp = gen_reg_rtx (mode);
17374 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
17375 break;
17376
17377 default:
17378 gcc_unreachable ();
17379 }
17380 vec = tmp;
17381 use_vec_extr = true;
17382 elt = 0;
17383 break;
17384
17385 case E_V4SImode:
17386 use_vec_extr = TARGET_SSE4_1;
17387 if (use_vec_extr)
17388 break;
17389
17390 if (TARGET_SSE2)
17391 {
17392 switch (elt)
17393 {
17394 case 0:
17395 tmp = vec;
17396 break;
17397
17398 case 1:
17399 case 3:
17400 tmp = gen_reg_rtx (mode);
17401 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
17402 GEN_INT (elt), GEN_INT (elt),
17403 GEN_INT (elt), GEN_INT (elt)));
17404 break;
17405
17406 case 2:
17407 tmp = gen_reg_rtx (mode);
17408 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
17409 break;
17410
17411 default:
17412 gcc_unreachable ();
17413 }
17414 vec = tmp;
17415 use_vec_extr = true;
17416 elt = 0;
17417 }
17418 else
17419 {
17420 /* For SSE1, we have to reuse the V4SF code. */
17421 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
17422 gen_lowpart (V4SFmode, vec), elt);
17423 return;
17424 }
17425 break;
17426
17427 case E_V8HImode:
17428 case E_V8HFmode:
17429 case E_V8BFmode:
17430 case E_V2HImode:
17431 use_vec_extr = TARGET_SSE2;
17432 break;
17433 case E_V4HImode:
17434 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
17435 break;
17436
17437 case E_V16QImode:
17438 use_vec_extr = TARGET_SSE4_1;
17439 if (!use_vec_extr
17440 && TARGET_SSE2
17441 && elt == 0
17442 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
17443 {
17444 tmp = gen_reg_rtx (SImode);
17445 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
17446 0);
17447 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
17448 return;
17449 }
17450 break;
17451 case E_V4QImode:
17452 use_vec_extr = TARGET_SSE4_1;
17453 break;
17454
17455 case E_V8SFmode:
17456 if (TARGET_AVX)
17457 {
17458 tmp = gen_reg_rtx (V4SFmode);
17459 if (elt < 4)
17460 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
17461 else
17462 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
17463 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17464 return;
17465 }
17466 break;
17467
17468 case E_V4DFmode:
17469 if (TARGET_AVX)
17470 {
17471 tmp = gen_reg_rtx (V2DFmode);
17472 if (elt < 2)
17473 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
17474 else
17475 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
17476 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17477 return;
17478 }
17479 break;
17480
17481 case E_V32QImode:
17482 if (TARGET_AVX)
17483 {
17484 tmp = gen_reg_rtx (V16QImode);
17485 if (elt < 16)
17486 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
17487 else
17488 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
17489 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17490 return;
17491 }
17492 break;
17493
17494 case E_V16HImode:
17495 if (TARGET_AVX)
17496 {
17497 tmp = gen_reg_rtx (V8HImode);
17498 if (elt < 8)
17499 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
17500 else
17501 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
17502 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17503 return;
17504 }
17505 break;
17506
17507 case E_V8SImode:
17508 if (TARGET_AVX)
17509 {
17510 tmp = gen_reg_rtx (V4SImode);
17511 if (elt < 4)
17512 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
17513 else
17514 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
17515 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17516 return;
17517 }
17518 break;
17519
17520 case E_V4DImode:
17521 if (TARGET_AVX)
17522 {
17523 tmp = gen_reg_rtx (V2DImode);
17524 if (elt < 2)
17525 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
17526 else
17527 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
17528 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17529 return;
17530 }
17531 break;
17532
17533 case E_V32HImode:
17534 if (TARGET_AVX512BW)
17535 {
17536 tmp = gen_reg_rtx (V16HImode);
17537 if (elt < 16)
17538 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
17539 else
17540 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
17541 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17542 return;
17543 }
17544 break;
17545
17546 case E_V64QImode:
17547 if (TARGET_AVX512BW)
17548 {
17549 tmp = gen_reg_rtx (V32QImode);
17550 if (elt < 32)
17551 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
17552 else
17553 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
17554 ix86_expand_vector_extract (false, target, tmp, elt & 31);
17555 return;
17556 }
17557 break;
17558
17559 case E_V16SFmode:
17560 tmp = gen_reg_rtx (V8SFmode);
17561 if (elt < 8)
17562 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
17563 else
17564 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
17565 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17566 return;
17567
17568 case E_V8DFmode:
17569 tmp = gen_reg_rtx (V4DFmode);
17570 if (elt < 4)
17571 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
17572 else
17573 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
17574 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17575 return;
17576
17577 case E_V16SImode:
17578 tmp = gen_reg_rtx (V8SImode);
17579 if (elt < 8)
17580 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
17581 else
17582 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
17583 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17584 return;
17585
17586 case E_V8DImode:
17587 tmp = gen_reg_rtx (V4DImode);
17588 if (elt < 4)
17589 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
17590 else
17591 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
17592 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17593 return;
17594
17595 case E_V32HFmode:
17596 case E_V32BFmode:
17597 if (TARGET_AVX512BW)
17598 {
17599 tmp = (mode == E_V32HFmode
17600 ? gen_reg_rtx (V16HFmode)
17601 : gen_reg_rtx (V16BFmode));
17602 if (elt < 16)
17603 emit_insn (gen_vec_extract_lo (mode, tmp, vec));
17604 else
17605 emit_insn (gen_vec_extract_hi (mode, tmp, vec));
17606 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17607 return;
17608 }
17609 break;
17610
17611 case E_V16HFmode:
17612 case E_V16BFmode:
17613 if (TARGET_AVX)
17614 {
17615 tmp = (mode == E_V16HFmode
17616 ? gen_reg_rtx (V8HFmode)
17617 : gen_reg_rtx (V8BFmode));
17618 if (elt < 8)
17619 emit_insn (gen_vec_extract_lo (mode, tmp, vec));
17620 else
17621 emit_insn (gen_vec_extract_hi (mode, tmp, vec));
17622 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17623 return;
17624 }
17625 break;
17626
17627 case E_V8QImode:
17628 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17629 /* ??? Could extract the appropriate HImode element and shift. */
17630 break;
17631
17632 default:
17633 break;
17634 }
17635
17636 if (use_vec_extr)
17637 {
17638 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
17639 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
17640
17641 /* Let the rtl optimizers know about the zero extension performed. */
17642 if (inner_mode == QImode || inner_mode == HImode)
17643 {
17644 rtx reg = gen_reg_rtx (SImode);
17645 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
17646 emit_move_insn (reg, tmp);
17647 tmp = gen_lowpart (inner_mode, reg);
17648 SUBREG_PROMOTED_VAR_P (tmp) = 1;
17649 SUBREG_PROMOTED_SET (tmp, 1);
17650 }
17651
17652 emit_move_insn (target, tmp);
17653 }
17654 else
17655 {
17656 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17657
17658 emit_move_insn (mem, vec);
17659
17660 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
17661 emit_move_insn (target, tmp);
17662 }
17663}
17664
17665/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17666 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17667 The upper bits of DEST are undefined, though they shouldn't cause
17668 exceptions (some bits from src or all zeros are ok). */
17669
17670static void
17671emit_reduc_half (rtx dest, rtx src, int i)
17672{
17673 rtx tem, d = dest;
17674 switch (GET_MODE (src))
17675 {
17676 case E_V4SFmode:
17677 if (i == 128)
17678 tem = gen_sse_movhlps (dest, src, src);
17679 else
17680 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
17681 GEN_INT (1 + 4), GEN_INT (1 + 4));
17682 break;
17683 case E_V2DFmode:
17684 tem = gen_vec_interleave_highv2df (dest, src, src);
17685 break;
17686 case E_V4QImode:
17687 d = gen_reg_rtx (V1SImode);
17688 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
17689 GEN_INT (i / 2));
17690 break;
17691 case E_V4HImode:
17692 d = gen_reg_rtx (V1DImode);
17693 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
17694 GEN_INT (i / 2));
17695 break;
17696 case E_V16QImode:
17697 case E_V8HImode:
17698 case E_V8HFmode:
17699 case E_V4SImode:
17700 case E_V2DImode:
17701 d = gen_reg_rtx (V1TImode);
17702 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
17703 GEN_INT (i / 2));
17704 break;
17705 case E_V8SFmode:
17706 if (i == 256)
17707 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
17708 else
17709 tem = gen_avx_shufps256 (dest, src, src,
17710 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
17711 break;
17712 case E_V4DFmode:
17713 if (i == 256)
17714 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
17715 else
17716 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
17717 break;
17718 case E_V32QImode:
17719 case E_V16HImode:
17720 case E_V16HFmode:
17721 case E_V8SImode:
17722 case E_V4DImode:
17723 if (i == 256)
17724 {
17725 if (GET_MODE (dest) != V4DImode)
17726 d = gen_reg_rtx (V4DImode);
17727 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
17728 gen_lowpart (V4DImode, src),
17729 const1_rtx);
17730 }
17731 else
17732 {
17733 d = gen_reg_rtx (V2TImode);
17734 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
17735 GEN_INT (i / 2));
17736 }
17737 break;
17738 case E_V64QImode:
17739 case E_V32HImode:
17740 case E_V32HFmode:
17741 if (i < 64)
17742 {
17743 d = gen_reg_rtx (V4TImode);
17744 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
17745 GEN_INT (i / 2));
17746 break;
17747 }
17748 /* FALLTHRU */
17749 case E_V16SImode:
17750 case E_V16SFmode:
17751 case E_V8DImode:
17752 case E_V8DFmode:
17753 if (i > 128)
17754 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
17755 gen_lowpart (V16SImode, src),
17756 gen_lowpart (V16SImode, src),
17757 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
17758 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
17759 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
17760 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
17761 GEN_INT (0xC), GEN_INT (0xD),
17762 GEN_INT (0xE), GEN_INT (0xF),
17763 GEN_INT (0x10), GEN_INT (0x11),
17764 GEN_INT (0x12), GEN_INT (0x13),
17765 GEN_INT (0x14), GEN_INT (0x15),
17766 GEN_INT (0x16), GEN_INT (0x17));
17767 else
17768 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
17769 gen_lowpart (V16SImode, src),
17770 GEN_INT (i == 128 ? 0x2 : 0x1),
17771 GEN_INT (0x3),
17772 GEN_INT (0x3),
17773 GEN_INT (0x3),
17774 GEN_INT (i == 128 ? 0x6 : 0x5),
17775 GEN_INT (0x7),
17776 GEN_INT (0x7),
17777 GEN_INT (0x7),
17778 GEN_INT (i == 128 ? 0xA : 0x9),
17779 GEN_INT (0xB),
17780 GEN_INT (0xB),
17781 GEN_INT (0xB),
17782 GEN_INT (i == 128 ? 0xE : 0xD),
17783 GEN_INT (0xF),
17784 GEN_INT (0xF),
17785 GEN_INT (0xF));
17786 break;
17787 default:
17788 gcc_unreachable ();
17789 }
17790 emit_insn (tem);
17791 if (d != dest)
17792 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
17793}
17794
17795/* Expand a vector reduction. FN is the binary pattern to reduce;
17796 DEST is the destination; IN is the input vector. */
17797
17798void
17799ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17800{
17801 rtx half, dst, vec = in;
17802 machine_mode mode = GET_MODE (in);
17803 int i;
17804
17805 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17806 if (TARGET_SSE4_1
17807 && mode == V8HImode
17808 && fn == gen_uminv8hi3)
17809 {
17810 emit_insn (gen_sse4_1_phminposuw (dest, in));
17811 return;
17812 }
17813
17814 for (i = GET_MODE_BITSIZE (mode);
17815 i > GET_MODE_UNIT_BITSIZE (mode);
17816 i >>= 1)
17817 {
17818 half = gen_reg_rtx (mode);
17819 emit_reduc_half (half, vec, i);
17820 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17821 dst = dest;
17822 else
17823 dst = gen_reg_rtx (mode);
17824 emit_insn (fn (dst, half, vec));
17825 vec = dst;
17826 }
17827}
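
/* For reference, a scalar C model of the ladder above for a V4SImode signed
   maximum reduction (an illustrative sketch only; the real code emits
   emit_reduc_half for the data movement and FN, e.g. gen_smaxv4si3, for the
   combining step):

     static int imax (int a, int b) { return a > b ? a : b; }

     int reduc_smax_v4si (const int v[4])
     {
       // i == 128: emit_reduc_half folds the upper half onto the lower half.
       int r0 = imax (v[0], v[2]);
       int r1 = imax (v[1], v[3]);
       // i == 64: fold element 1 onto element 0; the result is in element 0.
       return imax (r0, r1);
     }  */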
17828
17829/* Output code to perform a conditional jump to LABEL if the C2 flag in
17830 the FP status register is set. */
17831
17832void
17833ix86_emit_fp_unordered_jump (rtx label)
17834{
17835 rtx reg = gen_reg_rtx (HImode);
17836 rtx_insn *insn;
17837 rtx temp;
17838
17839 emit_insn (gen_x86_fnstsw_1 (reg));
17840
17841 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17842 {
17843 emit_insn (gen_x86_sahf_1 (reg));
17844
17845 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17846 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17847 }
17848 else
17849 {
17850 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17851
17852 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17853 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17854 }
17855
17856 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17857 gen_rtx_LABEL_REF (VOIDmode, label),
17858 pc_rtx);
17859 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17860 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17861 JUMP_LABEL (insn) = label;
17862}
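
/* In C terms the non-SAHF path above is roughly (a sketch; fnstsw () stands
   for the x87 "fnstsw %ax" status-word read and is not a real function):

     unsigned short sw = fnstsw ();
     if (sw & 0x0400)       // C2 is bit 10, i.e. bit 2 of the high byte,
       goto label;          // which is what the 0x04 test above checks.

   The TARGET_SAHF variant instead copies the status-word high byte into
   EFLAGS, where C2 lands in PF, and branches on the unordered condition.  */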
17863
17864/* Output code to perform a sinh XFmode calculation. */
17865
17866void
17867ix86_emit_i387_sinh (rtx op0, rtx op1)
17868{
17869 rtx e1 = gen_reg_rtx (XFmode);
17870 rtx e2 = gen_reg_rtx (XFmode);
17871 rtx scratch = gen_reg_rtx (HImode);
17872 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17873 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17874 rtx cst1, tmp;
17875 rtx_code_label *jump_label = gen_label_rtx ();
17876 rtx_insn *insn;
17877
17878 /* scratch = fxam (op1) */
17879 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17880
17881 /* e1 = expm1 (|op1|) */
17882 emit_insn (gen_absxf2 (e2, op1));
17883 emit_insn (gen_expm1xf2 (e1, e2));
17884
17885 /* e2 = e1 / (e1 + 1.0) + e1 */
17886 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17887 emit_insn (gen_addxf3 (e2, e1, cst1));
17888 emit_insn (gen_divxf3 (e2, e1, e2));
17889 emit_insn (gen_addxf3 (e2, e2, e1));
17890
17891 /* flags = signbit (op1) */
17892 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17893
17894 /* if (flags) then e2 = -e2 */
17895 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17896 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17897 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17898 pc_rtx);
17899 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17900 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17901 JUMP_LABEL (insn) = jump_label;
17902
17903 emit_insn (gen_negxf2 (e2, e2));
17904
17905 emit_label (jump_label);
17906 LABEL_NUSES (jump_label) = 1;
17907
17908 /* op0 = 0.5 * e2 */
17909 half = force_reg (XFmode, half);
17910 emit_insn (gen_mulxf3 (op0, e2, half));
17911}
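
/* Reference model of the identity used above (a plain C sketch assuming
   <math.h>; the emitted code does the same in XFmode with i387 insns):

     double sinh_model (double x)
     {
       double e = expm1 (fabs (x));      // e1
       double r = e / (e + 1.0) + e;     // e2 == exp (|x|) - exp (-|x|)
       return copysign (0.5 * r, x);     // op0 = 0.5 * e2 with the sign of x
     }  */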
17912
17913/* Output code to perform a cosh XFmode calculation. */
17914
17915void
17916ix86_emit_i387_cosh (rtx op0, rtx op1)
17917{
17918 rtx e1 = gen_reg_rtx (XFmode);
17919 rtx e2 = gen_reg_rtx (XFmode);
17920 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17921 rtx cst1;
17922
17923 /* e1 = exp (op1) */
17924 emit_insn (gen_expxf2 (e1, op1));
17925
17926 /* e2 = e1 + 1.0 / e1 */
17927 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17928 emit_insn (gen_divxf3 (e2, cst1, e1));
17929 emit_insn (gen_addxf3 (e2, e1, e2));
17930
17931 /* op0 = 0.5 * e2 */
17932 half = force_reg (XFmode, half);
17933 emit_insn (gen_mulxf3 (op0, e2, half));
17934}
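
/* Reference model (a plain C sketch assuming <math.h>):

     double cosh_model (double x)
     {
       double e = exp (x);               // e1
       return 0.5 * (e + 1.0 / e);       // op0 = 0.5 * e2
     }  */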
17935
17936/* Output code to perform a tanh XFmode calculation. */
17937
17938void
17939ix86_emit_i387_tanh (rtx op0, rtx op1)
17940{
17941 rtx e1 = gen_reg_rtx (XFmode);
17942 rtx e2 = gen_reg_rtx (XFmode);
17943 rtx scratch = gen_reg_rtx (HImode);
17944 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17945 rtx cst2, tmp;
17946 rtx_code_label *jump_label = gen_label_rtx ();
17947 rtx_insn *insn;
17948
17949 /* scratch = fxam (op1) */
17950 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17951
17952 /* e1 = expm1 (-|2 * op1|) */
17953 emit_insn (gen_addxf3 (e2, op1, op1));
17954 emit_insn (gen_absxf2 (e2, e2));
17955 emit_insn (gen_negxf2 (e2, e2));
17956 emit_insn (gen_expm1xf2 (e1, e2));
17957
17958 /* e2 = e1 / (e1 + 2.0) */
17959 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17960 emit_insn (gen_addxf3 (e2, e1, cst2));
17961 emit_insn (gen_divxf3 (e2, e1, e2));
17962
17963 /* flags = signbit (op1) */
17964 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17965
17966 /* if (!flags) then e2 = -e2 */
17967 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17968 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17969 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17970 pc_rtx);
17971 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17972 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17973 JUMP_LABEL (insn) = jump_label;
17974
17975 emit_insn (gen_negxf2 (e2, e2));
17976
17977 emit_label (jump_label);
17978 LABEL_NUSES (jump_label) = 1;
17979
17980 emit_move_insn (op0, e2);
17981}
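
/* Reference model (a plain C sketch assuming <math.h>):

     double tanh_model (double x)
     {
       double e = expm1 (-fabs (x + x)); // e1
       double r = e / (e + 2.0);         // e2 == -tanh (|x|)
       return signbit (x) ? r : -r;      // negate unless x is negative
     }  */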
17982
17983/* Output code to perform an asinh XFmode calculation. */
17984
17985void
17986ix86_emit_i387_asinh (rtx op0, rtx op1)
17987{
17988 rtx e1 = gen_reg_rtx (XFmode);
17989 rtx e2 = gen_reg_rtx (XFmode);
17990 rtx scratch = gen_reg_rtx (HImode);
17991 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17992 rtx cst1, tmp;
17993 rtx_code_label *jump_label = gen_label_rtx ();
17994 rtx_insn *insn;
17995
17996 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17997 emit_insn (gen_mulxf3 (e1, op1, op1));
17998 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17999 emit_insn (gen_addxf3 (e2, e1, cst1));
18000 emit_insn (gen_sqrtxf2 (e2, e2));
18001 emit_insn (gen_addxf3 (e2, e2, cst1));
18002
18003 /* e1 = e1 / e2 */
18004 emit_insn (gen_divxf3 (e1, e1, e2));
18005
18006 /* scratch = fxam (op1) */
18007 emit_insn (gen_fxamxf2_i387 (scratch, op1));
18008
18009 /* e1 = e1 + |op1| */
18010 emit_insn (gen_absxf2 (e2, op1));
18011 emit_insn (gen_addxf3 (e1, e1, e2));
18012
18013 /* e2 = log1p (e1) */
18014 ix86_emit_i387_log1p (e2, e1);
18015
18016 /* flags = signbit (op1) */
18017 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18018
18019 /* if (flags) then e2 = -e2 */
18020 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18021 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
18022 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18023 pc_rtx);
18024 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18025 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18026 JUMP_LABEL (insn) = jump_label;
18027
18028 emit_insn (gen_negxf2 (e2, e2));
18029
18030 emit_label (jump_label);
18031 LABEL_NUSES (jump_label) = 1;
18032
18033 emit_move_insn (op0, e2);
18034}
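
/* Reference model (a plain C sketch assuming <math.h>):

     double asinh_model (double x)
     {
       double t = x * x / (sqrt (x * x + 1.0) + 1.0);  // == sqrt (x*x+1) - 1
       double r = log1p (t + fabs (x));                // e2
       return signbit (x) ? -r : r;
     }  */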
18035
18036/* Output code to perform an acosh XFmode calculation. */
18037
18038void
18039ix86_emit_i387_acosh (rtx op0, rtx op1)
18040{
18041 rtx e1 = gen_reg_rtx (XFmode);
18042 rtx e2 = gen_reg_rtx (XFmode);
18043 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18044
18045 /* e2 = sqrt (op1 + 1.0) */
18046 emit_insn (gen_addxf3 (e2, op1, cst1));
18047 emit_insn (gen_sqrtxf2 (e2, e2));
18048
18049 /* e1 = sqrt (op1 - 1.0) */
18050 emit_insn (gen_subxf3 (e1, op1, cst1));
18051 emit_insn (gen_sqrtxf2 (e1, e1));
18052
18053 /* e1 = e1 * e2 */
18054 emit_insn (gen_mulxf3 (e1, e1, e2));
18055
18056 /* e1 = e1 + op1 */
18057 emit_insn (gen_addxf3 (e1, e1, op1));
18058
18059 /* op0 = log (e1) */
18060 emit_insn (gen_logxf2 (op0, e1));
18061}
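
/* Reference model (a plain C sketch assuming <math.h>, valid for x >= 1.0):

     double acosh_model (double x)
     {
       return log (x + sqrt (x - 1.0) * sqrt (x + 1.0));
     }  */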
18062
18063/* Output code to perform an atanh XFmode calculation. */
18064
18065void
18066ix86_emit_i387_atanh (rtx op0, rtx op1)
18067{
18068 rtx e1 = gen_reg_rtx (XFmode);
18069 rtx e2 = gen_reg_rtx (XFmode);
18070 rtx scratch = gen_reg_rtx (HImode);
18071 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
18072 rtx half = const_double_from_real_value (dconsthalf, XFmode);
18073 rtx cst1, tmp;
18074 rtx_code_label *jump_label = gen_label_rtx ();
18075 rtx_insn *insn;
18076
18077 /* scratch = fxam (op1) */
18078 emit_insn (gen_fxamxf2_i387 (scratch, op1));
18079
18080 /* e2 = |op1| */
18081 emit_insn (gen_absxf2 (e2, op1));
18082
18083 /* e1 = -(e2 + e2) / (e2 + 1.0) */
18084 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18085 emit_insn (gen_addxf3 (e1, e2, cst1));
18086 emit_insn (gen_addxf3 (e2, e2, e2));
18087 emit_insn (gen_negxf2 (e2, e2));
18088 emit_insn (gen_divxf3 (e1, e2, e1));
18089
18090 /* e2 = log1p (e1) */
18091 ix86_emit_i387_log1p (e2, e1);
18092
18093 /* flags = signbit (op1) */
18094 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18095
18096 /* if (!flags) then e2 = -e2 */
18097 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18098 gen_rtx_NE (VOIDmode, flags, const0_rtx),
18099 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18100 pc_rtx);
18101 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18102 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18103 JUMP_LABEL (insn) = jump_label;
18104
18105 emit_insn (gen_negxf2 (e2, e2));
18106
18107 emit_label (jump_label);
18108 LABEL_NUSES (jump_label) = 1;
18109
18110 /* op0 = 0.5 * e2 */
18111 half = force_reg (XFmode, half);
18112 emit_insn (gen_mulxf3 (op0, e2, half));
18113}
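
/* Reference model (a plain C sketch assuming <math.h>):

     double atanh_model (double x)
     {
       double a = fabs (x);
       double r = log1p (-(a + a) / (a + 1.0));  // e2 == -2 * atanh (|x|)
       return signbit (x) ? 0.5 * r : -0.5 * r;
     }  */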
18114
18115/* Output code to perform a log1p XFmode calculation. */
18116
18117void
18118ix86_emit_i387_log1p (rtx op0, rtx op1)
18119{
18120 rtx_code_label *label1 = gen_label_rtx ();
18121 rtx_code_label *label2 = gen_label_rtx ();
18122
18123 rtx tmp = gen_reg_rtx (XFmode);
18124 rtx res = gen_reg_rtx (XFmode);
18125 rtx cst, cstln2, cst1;
18126 rtx_insn *insn;
18127
18128 /* The emit_jump call emits any pending stack adjustment; make sure it is
18129 emitted before the conditional jump, otherwise the stack adjustment
18130 would only happen conditionally. */
18131 do_pending_stack_adjust ();
18132
18133 cst = const_double_from_real_value
18134 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
18135 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
18136
18137 emit_insn (gen_absxf2 (tmp, op1));
18138
18139 cst = force_reg (XFmode, cst);
18140 ix86_expand_branch (GE, tmp, cst, label1);
18141 predict_jump (REG_BR_PROB_BASE * 10 / 100);
18142 insn = get_last_insn ();
18143 JUMP_LABEL (insn) = label1;
18144
18145 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
18146 emit_jump (label2);
18147
18148 emit_label (label1);
18149 LABEL_NUSES (label1) = 1;
18150
18151 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
18152 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
18153 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
18154
18155 emit_label (label2);
18156 LABEL_NUSES (label2) = 1;
18157
18158 emit_move_insn (op0, res);
18159}
18160
18161/* Emit code for round calculation. */
18162void
18163ix86_emit_i387_round (rtx op0, rtx op1)
18164{
18165 machine_mode inmode = GET_MODE (op1);
18166 machine_mode outmode = GET_MODE (op0);
18167 rtx e1 = gen_reg_rtx (XFmode);
18168 rtx e2 = gen_reg_rtx (XFmode);
18169 rtx scratch = gen_reg_rtx (HImode);
18170 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
18171 rtx half = const_double_from_real_value (dconsthalf, XFmode);
18172 rtx res = gen_reg_rtx (outmode);
18173 rtx_code_label *jump_label = gen_label_rtx ();
18174 rtx (*floor_insn) (rtx, rtx);
18175 rtx (*neg_insn) (rtx, rtx);
18176 rtx_insn *insn;
18177 rtx tmp;
18178
18179 switch (inmode)
18180 {
18181 case E_SFmode:
18182 case E_DFmode:
18183 tmp = gen_reg_rtx (XFmode);
18184
18185 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
18186 op1 = tmp;
18187 break;
18188 case E_XFmode:
18189 break;
18190 default:
18191 gcc_unreachable ();
18192 }
18193
18194 switch (outmode)
18195 {
18196 case E_SFmode:
18197 floor_insn = gen_frndintxf2_floor;
18198 neg_insn = gen_negsf2;
18199 break;
18200 case E_DFmode:
18201 floor_insn = gen_frndintxf2_floor;
18202 neg_insn = gen_negdf2;
18203 break;
18204 case E_XFmode:
18205 floor_insn = gen_frndintxf2_floor;
18206 neg_insn = gen_negxf2;
18207 break;
18208 case E_HImode:
18209 floor_insn = gen_lfloorxfhi2;
18210 neg_insn = gen_neghi2;
18211 break;
18212 case E_SImode:
18213 floor_insn = gen_lfloorxfsi2;
18214 neg_insn = gen_negsi2;
18215 break;
18216 case E_DImode:
18217 floor_insn = gen_lfloorxfdi2;
18218 neg_insn = gen_negdi2;
18219 break;
18220 default:
18221 gcc_unreachable ();
18222 }
18223
18224 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
18225
18226 /* scratch = fxam(op1) */
18227 emit_insn (gen_fxamxf2_i387 (scratch, op1));
18228
18229 /* e1 = fabs(op1) */
18230 emit_insn (gen_absxf2 (e1, op1));
18231
18232 /* e2 = e1 + 0.5 */
18233 half = force_reg (XFmode, half);
18234 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
18235
18236 /* res = floor(e2) */
18237 switch (outmode)
18238 {
18239 case E_SFmode:
18240 case E_DFmode:
18241 {
18242 tmp = gen_reg_rtx (XFmode);
18243
18244 emit_insn (floor_insn (tmp, e2));
18245 emit_insn (gen_rtx_SET (res,
18246 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
18247 UNSPEC_TRUNC_NOOP)));
18248 }
18249 break;
18250 default:
18251 emit_insn (floor_insn (res, e2));
18252 }
18253
18254 /* flags = signbit(a) */
18255 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
18256
18257 /* if (flags) then res = -res */
18258 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
18259 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
18260 gen_rtx_LABEL_REF (VOIDmode, jump_label),
18261 pc_rtx);
18262 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18263 predict_jump (REG_BR_PROB_BASE * 50 / 100);
18264 JUMP_LABEL (insn) = jump_label;
18265
18266 emit_insn (neg_insn (res, res));
18267
18268 emit_label (jump_label);
18269 LABEL_NUSES (jump_label) = 1;
18270
18271 emit_move_insn (op0, res);
18272}
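
/* Reference model of the sequence above (a plain C sketch assuming <math.h>;
   when OUTMODE is an integer mode the floor step is an lfloor pattern
   instead):

     double round_model (double x)
     {
       double r = floor (fabs (x) + 0.5);
       return signbit (x) ? -r : r;
     }  */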
18273
18274/* Output code to perform a Newton-Raphson approximation of a single precision
18275 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
18276
18277void
18278ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
18279{
18280 rtx x0, x1, e0, e1;
18281
18282 x0 = gen_reg_rtx (mode);
18283 e0 = gen_reg_rtx (mode);
18284 e1 = gen_reg_rtx (mode);
18285 x1 = gen_reg_rtx (mode);
18286
18287 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
18288
18289 b = force_reg (mode, b);
18290
18291 /* x0 = rcp(b) estimate */
18292 if (mode == V16SFmode || mode == V8DFmode)
18293 {
18294 if (TARGET_AVX512ER)
18295 {
18296 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18297 UNSPEC_RCP28)));
18298 /* res = a * x0 */
18299 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
18300 return;
18301 }
18302 else
18303 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18304 UNSPEC_RCP14)));
18305 }
18306 else
18307 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
18308 UNSPEC_RCP)));
18309
18310 /* e0 = x0 * b */
18311 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
18312
18313 /* e0 = x0 * e0 */
18314 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
18315
18316 /* e1 = x0 + x0 */
18317 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
18318
18319 /* x1 = e1 - e0 */
18320 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
18321
18322 /* res = a * x1 */
18323 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
18324}
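
/* Reference model of the sequence above (a plain C sketch; x0 stands for the
   low-precision rcpps/rcp14 estimate, modelled here by an exact divide):

     float swdiv_model (float a, float b)
     {
       float x0 = 1.0f / b;                  // rcp estimate
       float x1 = (x0 + x0) - b * x0 * x0;   // one Newton-Raphson step
       return a * x1;
     }  */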
18325
18326/* Output code to perform a Newton-Raphson approximation of a
18327 single precision floating point [reciprocal] square root. */
18328
18329void
18330ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
18331{
18332 rtx x0, e0, e1, e2, e3, mthree, mhalf;
18333 REAL_VALUE_TYPE r;
18334 int unspec;
18335
18336 x0 = gen_reg_rtx (mode);
18337 e0 = gen_reg_rtx (mode);
18338 e1 = gen_reg_rtx (mode);
18339 e2 = gen_reg_rtx (mode);
18340 e3 = gen_reg_rtx (mode);
18341
18342 if (TARGET_AVX512ER && mode == V16SFmode)
18343 {
18344 if (recip)
18345 /* res = rsqrt28(a) estimate */
18346 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18347 UNSPEC_RSQRT28)));
18348 else
18349 {
18350 /* x0 = rsqrt28(a) estimate */
18351 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18352 UNSPEC_RSQRT28)));
18353 /* res = rcp28(x0) estimate */
18354 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
18355 UNSPEC_RCP28)));
18356 }
18357 return;
18358 }
18359
18360 real_from_integer (&r, VOIDmode, -3, SIGNED);
18361 mthree = const_double_from_real_value (r, SFmode);
18362
18363 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
18364 mhalf = const_double_from_real_value (r, SFmode);
18365 unspec = UNSPEC_RSQRT;
18366
18367 if (VECTOR_MODE_P (mode))
18368 {
18369 mthree = ix86_build_const_vector (mode, true, mthree);
18370 mhalf = ix86_build_const_vector (mode, true, mhalf);
18371 /* There is no 512-bit rsqrt. There is however rsqrt14. */
18372 if (GET_MODE_SIZE (mode) == 64)
18373 unspec = UNSPEC_RSQRT14;
18374 }
18375
18376 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
18377 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
18378
18379 a = force_reg (mode, a);
18380
18381 /* x0 = rsqrt(a) estimate */
18382 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
18383 unspec)));
18384
18385 /* If a == 0.0, filter out the infinite estimate to prevent NaN for sqrt (0.0). */
18386 if (!recip)
18387 {
18388 rtx zero = force_reg (mode, CONST0_RTX(mode));
18389 rtx mask;
18390
18391 /* Handle masked compare. */
18392 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
18393 {
18394 mask = gen_reg_rtx (HImode);
18395 /* Imm value 0x4 corresponds to not-equal comparison. */
18396 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
18397 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
18398 }
18399 else
18400 {
18401 mask = gen_reg_rtx (mode);
18402 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
18403 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
18404 }
18405 }
18406
18407 mthree = force_reg (mode, mthree);
18408
18409 /* e0 = x0 * a */
18410 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
18411
18412 unsigned vector_size = GET_MODE_SIZE (mode);
18413 if (TARGET_FMA
18414 || (TARGET_AVX512F && TARGET_EVEX512 && vector_size == 64)
18415 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
18416 emit_insn (gen_rtx_SET (e2,
18417 gen_rtx_FMA (mode, e0, x0, mthree)));
18418 else
18419 {
18420 /* e1 = e0 * x0 */
18421 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
18422
18423 /* e2 = e1 - 3. */
18424 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
18425 }
18426
18427 mhalf = force_reg (mode, mhalf);
18428 if (recip)
18429 /* e3 = -.5 * x0 */
18430 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
18431 else
18432 /* e3 = -.5 * e0 */
18433 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
18434 /* ret = e2 * e3 */
18435 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
18436}
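
/* Reference model of the sequence above (a sketch assuming <math.h>; x0
   stands for the low-precision rsqrtps/rsqrt14 estimate, modelled here by an
   exact computation):

     float swsqrt_model (float a, bool recip)
     {
       float x0 = 1.0f / sqrtf (a);          // rsqrt estimate
       if (!recip && a == 0.0f)
         x0 = 0.0f;                          // avoid inf * 0 -> NaN
       float e0 = x0 * a;
       float e2 = e0 * x0 - 3.0f;            // a * x0^2 - 3
       float e3 = -0.5f * (recip ? x0 : e0);
       return e2 * e3;                       // rsqrt (a), or sqrt (a)
     }  */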
18437
18438/* Expand fabs (OP0) and return a new rtx that holds the result. The
18439 mask for masking out the sign-bit is stored in *SMASK, if that is
18440 non-null. */
18441
18442static rtx
18443ix86_expand_sse_fabs (rtx op0, rtx *smask)
18444{
18445 machine_mode vmode, mode = GET_MODE (op0);
18446 rtx xa, mask;
18447
18448 xa = gen_reg_rtx (mode);
18449 if (mode == SFmode)
18450 vmode = V4SFmode;
18451 else if (mode == DFmode)
18452 vmode = V2DFmode;
18453 else
18454 vmode = mode;
18455 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
18456 if (!VECTOR_MODE_P (mode))
18457 {
18458 /* We need to generate a scalar mode mask in this case. */
18459 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18460 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18461 mask = gen_reg_rtx (mode);
18462 emit_insn (gen_rtx_SET (mask, tmp));
18463 }
18464 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
18465
18466 if (smask)
18467 *smask = mask;
18468
18469 return xa;
18470}
18471
18472/* Expands a comparison of OP0 with OP1 using comparison code CODE,
18473 swapping the operands if SWAP_OPERANDS is true. The expanded
18474 code is a forward jump to a newly created label in case the
18475 comparison is true. The generated label rtx is returned. */
18476static rtx_code_label *
18477ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
18478 bool swap_operands)
18479{
18480 bool unordered_compare = ix86_unordered_fp_compare (code);
18481 rtx_code_label *label;
18482 rtx tmp, reg;
18483
18484 if (swap_operands)
18485 std::swap (op0, op1);
18486
18487 label = gen_label_rtx ();
18488 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
18489 if (unordered_compare)
18490 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
18491 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
18492 emit_insn (gen_rtx_SET (reg, tmp));
18493 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
18494 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18495 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
18496 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
18497 JUMP_LABEL (tmp) = label;
18498
18499 return label;
18500}
18501
18502/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
18503 using comparison code CODE. Operands are swapped for the comparison if
18504 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
18505static rtx
18506ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
18507 bool swap_operands)
18508{
18509 rtx (*insn)(rtx, rtx, rtx, rtx);
18510 machine_mode mode = GET_MODE (op0);
18511 rtx mask = gen_reg_rtx (mode);
18512
18513 if (swap_operands)
18514 std::swap (op0, op1);
18515
18516 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
18517
18518 emit_insn (insn (mask, op0, op1,
18519 gen_rtx_fmt_ee (code, mode, op0, op1)));
18520 return mask;
18521}
18522
18523/* Expand copysign from SIGN to the positive value ABS_VALUE
18524 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
18525 the sign-bit. */
18526
18527static void
18528ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
18529{
18530 machine_mode mode = GET_MODE (sign);
18531 rtx sgn = gen_reg_rtx (mode);
18532 if (mask == NULL_RTX)
18533 {
18534 machine_mode vmode;
18535
18536 if (mode == SFmode)
18537 vmode = V4SFmode;
18538 else if (mode == DFmode)
18539 vmode = V2DFmode;
18540 else if (mode == HFmode)
18541 vmode = V8HFmode;
18542 else
18543 vmode = mode;
18544
18545 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
18546 if (!VECTOR_MODE_P (mode))
18547 {
18548 /* We need to generate a scalar mode mask in this case. */
18549 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18550 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18551 mask = gen_reg_rtx (mode);
18552 emit_insn (gen_rtx_SET (mask, tmp));
18553 }
18554 }
18555 else
18556 mask = gen_rtx_NOT (mode, mask);
18557 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
18558 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
18559}
18560
18561/* Expand SSE sequence for computing lround from OP1 storing
18562 into OP0. */
18563
18564void
18565ix86_expand_lround (rtx op0, rtx op1)
18566{
18567 /* C code for the stuff we're doing below:
18568 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
18569 return (long)tmp;
18570 */
18571 machine_mode mode = GET_MODE (op1);
18572 const struct real_format *fmt;
18573 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18574 rtx adj;
18575
18576 /* load nextafter (0.5, 0.0) */
18577 fmt = REAL_MODE_FORMAT (mode);
18578 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18579 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18580
18581 /* adj = copysign (0.5, op1) */
18582 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
18583 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
18584
18585 /* adj = op1 + adj */
18586 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
18587
18588 /* op0 = (imode)adj */
18589 expand_fix (op0, adj, 0);
18590}
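
/* Illustrative note, not part of the expander above: the reason for
   using nextafter (0.5, 0.0) rather than 0.5 is to avoid a second
   rounding in the addition.  Assuming round-to-nearest and DFmode:

     x = 1.4999999999999998;        // 1.5 - 2**-53, so lround (x) == 1
     x + 0.5;                       // rounds up to 2.0 -> wrong result 2
     x + 0.49999999999999994;       // rounds to 1.9999999999999998,
                                    // which the conversion truncates to 1
*/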
18591
18592/* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
18593 into OP0. */
18594
18595void
18596ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
18597{
18598 /* C code for the stuff we're doing below (for do_floor):
18599 xi = (long)op1;
18600 xi -= (double)xi > op1 ? 1 : 0;
18601 return xi;
18602 */
18603 machine_mode fmode = GET_MODE (op1);
18604 machine_mode imode = GET_MODE (op0);
18605 rtx ireg, freg, tmp;
18606 rtx_code_label *label;
18607
18608 /* reg = (long)op1 */
18609 ireg = gen_reg_rtx (imode);
18610 expand_fix (ireg, op1, 0);
18611
18612 /* freg = (double)reg */
18613 freg = gen_reg_rtx (fmode);
18614 expand_float (freg, ireg, 0);
18615
18616 /* ireg = (freg > op1) ? ireg - 1 : ireg */
18617 label = ix86_expand_sse_compare_and_jump (UNLE,
18618 freg, op1, !do_floor);
18619 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
18620 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
18621 emit_move_insn (ireg, tmp);
18622
18623 emit_label (label);
18624 LABEL_NUSES (label) = 1;
18625
18626 emit_move_insn (op0, ireg);
18627}
18628
18629/* Generate and return a rtx of mode MODE for 2**n where n is the number
18630 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
18631
18632static rtx
18633ix86_gen_TWO52 (machine_mode mode)
18634{
18635 const struct real_format *fmt;
18636 REAL_VALUE_TYPE TWO52r;
18637 rtx TWO52;
18638
18639 fmt = REAL_MODE_FORMAT (mode);
18640 real_2expN (&TWO52r, fmt->p - 1, mode);
18641 TWO52 = const_double_from_real_value (TWO52r, mode);
18642 TWO52 = force_reg (mode, TWO52);
18643
18644 return TWO52;
18645}
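
/* Illustrative note, not part of the expander code: for DFmode the
   constant built above is 2**52 == 4503599627370496.0.  At that
   magnitude adjacent doubles are exactly 1.0 apart, so for
   0 <= x < 2**52 the sequence

     (x + 2**52) - 2**52

   rounds x to an integer in the current rounding mode and then
   subtracts 2**52 back out exactly; e.g. under round-to-nearest
   3.7 + 2**52 rounds to 2**52 + 4, and the subtraction yields 4.0.  */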
18646
18647/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18648
18649void
18650ix86_expand_rint (rtx operand0, rtx operand1)
18651{
18652 /* C code for the stuff we're doing below:
18653 xa = fabs (operand1);
18654 if (!isless (xa, 2**52))
18655 return operand1;
18656 two52 = 2**52;
18657 if (flag_rounding_math)
18658 {
18659 two52 = copysign (two52, operand1);
18660 xa = operand1;
18661 }
18662 xa = xa + two52 - two52;
18663 return copysign (xa, operand1);
18664 */
18665 machine_mode mode = GET_MODE (operand0);
18666 rtx res, xa, TWO52, mask;
18667 rtx_code_label *label;
18668
18669 TWO52 = ix86_gen_TWO52 (mode);
18670
18671 /* Temporary for holding the result, initialized to the input
18672 operand to ease control flow. */
18673 res = copy_to_reg (operand1);
18674
18675 /* xa = abs (operand1) */
18676 xa = ix86_expand_sse_fabs (res, &mask);
18677
18678 /* if (!isless (xa, TWO52)) goto label; */
18679 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18680
18681 if (flag_rounding_math)
18682 {
18683 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
18684 xa = res;
18685 }
18686
18687 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18688 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18689
18690 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18691 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18692 xa = ix86_expand_sse_fabs (xa, NULL);
18693
18694 ix86_sse_copysign_to_positive (res, xa, res, mask);
18695
18696 emit_label (label);
18697 LABEL_NUSES (label) = 1;
18698
18699 emit_move_insn (operand0, res);
18700}
18701
18702/* Expand SSE2 sequence for computing floor or ceil
18703 from OPERAND1 storing into OPERAND0. */
18704void
18705ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
18706{
18707 /* C code for the stuff we expand below.
18708 double xa = fabs (x), x2;
18709 if (!isless (xa, TWO52))
18710 return x;
18711 x2 = (double)(long)x;
18712
18713 Compensate. Floor:
18714 if (x2 > x)
18715 x2 -= 1;
18716 Compensate. Ceil:
18717 if (x2 < x)
18718 x2 += 1;
18719
18720 if (HONOR_SIGNED_ZEROS (mode))
18721 return copysign (x2, x);
18722 return x2;
18723 */
18724 machine_mode mode = GET_MODE (operand0);
18725 rtx xa, xi, TWO52, tmp, one, res, mask;
18726 rtx_code_label *label;
18727
18728 TWO52 = ix86_gen_TWO52 (mode);
18729
18730 /* Temporary for holding the result, initialized to the input
18731 operand to ease control flow. */
18732 res = copy_to_reg (operand1);
18733
18734 /* xa = abs (operand1) */
18735 xa = ix86_expand_sse_fabs (res, &mask);
18736
18737 /* if (!isless (xa, TWO52)) goto label; */
18738 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18739
18740 /* xa = (double)(long)x */
18741 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18742 expand_fix (xi, res, 0);
18743 expand_float (xa, xi, 0);
18744
18745 /* generate 1.0 */
18746 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18747
18748 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18749 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18750 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18751 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18752 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18753 if (HONOR_SIGNED_ZEROS (mode))
18754 {
18755 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18756 if (do_floor && flag_rounding_math)
18757 tmp = ix86_expand_sse_fabs (tmp, NULL);
18758
18759 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18760 }
18761 emit_move_insn (res, tmp);
18762
18763 emit_label (label);
18764 LABEL_NUSES (label) = 1;
18765
18766 emit_move_insn (operand0, res);
18767}
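
/* Illustrative note, not part of the expander above: the compensation
   is branchless because the SSE compare produces an all-ones or
   all-zeros element mask rather than a boolean.  Roughly, for floor:

     mask = (x2 > x) ? ~0 : 0;      // bit pattern, not a value
     adj  = mask & 1.0;             // 1.0 or +0.0
     x2   = x2 - adj;               // conditional decrement

   e.g. x = -3.7 truncates to x2 = -3.0, the compare is true, and
   x2 - 1.0 == -4.0 == floor (-3.7).  */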
18768
18769/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18770 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18771 that is only available on 64bit targets. */
18772void
18773ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
18774{
18775 /* C code for the stuff we expand below.
18776 double xa = fabs (x), x2;
18777 if (!isless (xa, TWO52))
18778 return x;
18779 xa = xa + TWO52 - TWO52;
18780 x2 = copysign (xa, x);
18781
18782 Compensate. Floor:
18783 if (x2 > x)
18784 x2 -= 1;
18785 Compensate. Ceil:
18786 if (x2 < x)
18787 x2 += 1;
18788
18789 if (HONOR_SIGNED_ZEROS (mode))
18790 x2 = copysign (x2, x);
18791 return x2;
18792 */
18793 machine_mode mode = GET_MODE (operand0);
18794 rtx xa, TWO52, tmp, one, res, mask;
18795 rtx_code_label *label;
18796
18797 TWO52 = ix86_gen_TWO52 (mode);
18798
18799 /* Temporary for holding the result, initialized to the input
18800 operand to ease control flow. */
18801 res = copy_to_reg (operand1);
18802
18803 /* xa = abs (operand1) */
18804 xa = ix86_expand_sse_fabs (res, &mask);
18805
18806 /* if (!isless (xa, TWO52)) goto label; */
18807 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18808
18809 /* xa = xa + TWO52 - TWO52; */
18810 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18811 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18812
18813 /* xa = copysign (xa, operand1) */
18814 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18815
18816 /* generate 1.0 */
18817 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18818
18819 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18820 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18821 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18822 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18823 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18824 if (HONOR_SIGNED_ZEROS (mode))
18825 {
18826 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18827 if (do_floor && flag_rounding_math)
18828 tmp = ix86_expand_sse_fabs (tmp, NULL);
18829
18830 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18831 }
18832 emit_move_insn (res, tmp);
18833
18834 emit_label (label);
18835 LABEL_NUSES (label) = 1;
18836
18837 emit_move_insn (operand0, res);
18838}
18839
18840/* Expand SSE sequence for computing trunc
18841 from OPERAND1 storing into OPERAND0. */
18842void
18843ix86_expand_trunc (rtx operand0, rtx operand1)
18844{
18845 /* C code for SSE variant we expand below.
18846 double xa = fabs (x), x2;
18847 if (!isless (xa, TWO52))
18848 return x;
18849 x2 = (double)(long)x;
18850 if (HONOR_SIGNED_ZEROS (mode))
18851 return copysign (x2, x);
18852 return x2;
18853 */
18854 machine_mode mode = GET_MODE (operand0);
18855 rtx xa, xi, TWO52, res, mask;
18856 rtx_code_label *label;
18857
18858 TWO52 = ix86_gen_TWO52 (mode);
18859
18860 /* Temporary for holding the result, initialized to the input
18861 operand to ease control flow. */
18862 res = copy_to_reg (operand1);
18863
18864 /* xa = abs (operand1) */
18865 xa = ix86_expand_sse_fabs (res, &mask);
18866
18867 /* if (!isless (xa, TWO52)) goto label; */
18868 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18869
18870 /* xa = (double)(long)x */
18871 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18872 expand_fix (xi, res, 0);
18873 expand_float (xa, xi, 0);
18874
18875 if (HONOR_SIGNED_ZEROS (mode))
18876 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18877
18878 emit_move_insn (res, xa);
18879
18880 emit_label (label);
18881 LABEL_NUSES (label) = 1;
18882
18883 emit_move_insn (operand0, res);
18884}
18885
18886/* Expand SSE sequence for computing trunc from OPERAND1 storing
18887 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18888 that is only available on 64bit targets. */
18889void
18890ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18891{
18892 machine_mode mode = GET_MODE (operand0);
18893 rtx xa, xa2, TWO52, tmp, one, res, mask;
18894 rtx_code_label *label;
18895
18896 /* C code for SSE variant we expand below.
18897 double xa = fabs (x), x2;
18898 if (!isless (xa, TWO52))
18899 return x;
18900 xa2 = xa + TWO52 - TWO52;
18901 Compensate:
18902 if (xa2 > xa)
18903 xa2 -= 1.0;
18904 x2 = copysign (xa2, x);
18905 return x2;
18906 */
18907
18908 TWO52 = ix86_gen_TWO52 (mode);
18909
18910 /* Temporary for holding the result, initialized to the input
18911 operand to ease control flow. */
18912 res = copy_to_reg (operand1);
18913
18914 /* xa = abs (operand1) */
18915 xa = ix86_expand_sse_fabs (res, &mask);
18916
18917 /* if (!isless (xa, TWO52)) goto label; */
18918 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18919
18920 /* xa2 = xa + TWO52 - TWO52; */
18921 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18922 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18923
18924 /* generate 1.0 */
18925 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18926
18927 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18928 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18929 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18930 tmp = expand_simple_binop (mode, MINUS,
18931 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18932 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18933 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18934 tmp = ix86_expand_sse_fabs (tmp, NULL);
18935
18936 /* res = copysign (xa2, operand1) */
18937 ix86_sse_copysign_to_positive (res, tmp, res, mask);
18938
18939 emit_label (label);
18940 LABEL_NUSES (label) = 1;
18941
18942 emit_move_insn (operand0, res);
18943}
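
/* Illustrative walk-through of the sequence above, not part of the
   expander, assuming round-to-nearest:

     x   = -3.7
     xa  = 3.7                        // fabs
     xa2 = (3.7 + 2**52) - 2**52      // rounds up to 4.0
     xa2 > xa, so xa2 -= 1.0          // 3.0
     res = copysign (3.0, -3.7)       // -3.0 == trunc (-3.7)
*/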
18944
18945/* Expand SSE sequence for computing round
18946 from OPERAND1 storing into OPERAND0. */
18947void
18948ix86_expand_round (rtx operand0, rtx operand1)
18949{
18950 /* C code for the stuff we're doing below:
18951 double xa = fabs (x);
18952 if (!isless (xa, TWO52))
18953 return x;
18954 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18955 return copysign (xa, x);
18956 */
18957 machine_mode mode = GET_MODE (operand0);
18958 rtx res, TWO52, xa, xi, half, mask;
18959 rtx_code_label *label;
18960 const struct real_format *fmt;
18961 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18962
18963 /* Temporary for holding the result, initialized to the input
18964 operand to ease control flow. */
18965 res = copy_to_reg (operand1);
18966
18967 TWO52 = ix86_gen_TWO52 (mode);
18968 xa = ix86_expand_sse_fabs (res, &mask);
18969 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18970
18971 /* load nextafter (0.5, 0.0) */
18972 fmt = REAL_MODE_FORMAT (mode);
18973 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18974 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18975
18976 /* xa = xa + 0.5 */
18977 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18978 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18979
18980 /* xa = (double)(int64_t)xa */
18981 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18982 expand_fix (xi, xa, 0);
18983 expand_float (xa, xi, 0);
18984
18985 /* res = copysign (xa, operand1) */
18986 ix86_sse_copysign_to_positive (res, xa, res, mask);
18987
18988 emit_label (label);
18989 LABEL_NUSES (label) = 1;
18990
18991 emit_move_insn (operand0, res);
18992}
18993
18994/* Expand SSE sequence for computing round from OPERAND1 storing
18995 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18996 that is only available on 64bit targets. */
18997void
18998ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18999{
19000 /* C code for the stuff we expand below.
19001 double xa = fabs (x), xa2, x2;
19002 if (!isless (xa, TWO52))
19003 return x;
19004 Using the absolute value and copying back sign makes
19005 -0.0 -> -0.0 correct.
19006 xa2 = xa + TWO52 - TWO52;
19007 Compensate.
19008 dxa = xa2 - xa;
19009 if (dxa <= -0.5)
19010 xa2 += 1;
19011 else if (dxa > 0.5)
19012 xa2 -= 1;
19013 x2 = copysign (xa2, x);
19014 return x2;
19015 */
19016 machine_mode mode = GET_MODE (operand0);
19017 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
19018 rtx_code_label *label;
19019
19020 TWO52 = ix86_gen_TWO52 (mode);
19021
19022 /* Temporary for holding the result, initialized to the input
19023 operand to ease control flow. */
19024 res = copy_to_reg (operand1);
19025
19026 /* xa = abs (operand1) */
19027 xa = ix86_expand_sse_fabs (res, &mask);
19028
19029 /* if (!isless (xa, TWO52)) goto label; */
19030 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
19031
19032 /* xa2 = xa + TWO52 - TWO52; */
19033 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
19034 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
19035
19036 /* dxa = xa2 - xa; */
19037 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
19038
19039 /* generate 0.5, 1.0 and -0.5 */
19040 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
19041 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
19042 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
19043 0, OPTAB_DIRECT);
19044
19045 /* Compensate. */
19046 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
19047 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
19048 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
19049 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
19050 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
19051 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
19052 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
19053 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
19054
19055 /* res = copysign (xa2, operand1) */
19056 ix86_sse_copysign_to_positive (res, xa2, res, mask);
19057
19058 emit_label (label);
19059 LABEL_NUSES (label) = 1;
19060
19061 emit_move_insn (operand0, res);
19062}
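
/* Illustrative note, not part of the expander above: the dxa
   compensation handles the halfway cases that xa + TWO52 - TWO52
   resolves with ties-to-even rather than away from zero:

     xa = 2.5: xa2 = 2.0, dxa = -0.5 <= -0.5, so xa2 += 1 -> 3.0
     xa = 3.5: xa2 = 4.0, dxa = 0.5, not > 0.5, so xa2 stays 4.0

   Both halfway cases therefore end up rounded away from zero, as
   round () requires.  */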
19063
19064/* Expand SSE sequence for computing round
19065 from OP1 storing into OP0 using sse4 round insn. */
19066void
19067ix86_expand_round_sse4 (rtx op0, rtx op1)
19068{
19069 machine_mode mode = GET_MODE (op0);
19070 rtx e1, e2, res, half;
19071 const struct real_format *fmt;
19072 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
19073 rtx (*gen_copysign) (rtx, rtx, rtx);
19074 rtx (*gen_round) (rtx, rtx, rtx);
19075
19076 switch (mode)
19077 {
19078 case E_HFmode:
19079 gen_copysign = gen_copysignhf3;
19080 gen_round = gen_sse4_1_roundhf2;
19081 break;
19082 case E_SFmode:
19083 gen_copysign = gen_copysignsf3;
19084 gen_round = gen_sse4_1_roundsf2;
19085 break;
19086 case E_DFmode:
19087 gen_copysign = gen_copysigndf3;
19088 gen_round = gen_sse4_1_rounddf2;
19089 break;
19090 default:
19091 gcc_unreachable ();
19092 }
19093
19094 /* round (a) = trunc (a + copysign (0.5, a)) */
19095
19096 /* load nextafter (0.5, 0.0) */
19097 fmt = REAL_MODE_FORMAT (mode);
19098 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
19099 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
19100 half = const_double_from_real_value (pred_half, mode);
19101
19102 /* e1 = copysign (0.5, op1) */
19103 e1 = gen_reg_rtx (mode);
19104 emit_insn (gen_copysign (e1, half, op1));
19105
19106 /* e2 = op1 + e1 */
19107 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
19108
19109 /* res = trunc (e2) */
19110 res = gen_reg_rtx (mode);
19111 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
19112
19113 emit_move_insn (op0, res);
19114}
19115
19116/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
19117 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
19118 insn every time. */
19119
19120static GTY(()) rtx_insn *vselect_insn;
19121
19122/* Initialize vselect_insn. */
19123
19124static void
19125init_vselect_insn (void)
19126{
19127 unsigned i;
19128 rtx x;
19129
19130 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
19131 for (i = 0; i < MAX_VECT_LEN; ++i)
19132 XVECEXP (x, 0, i) = const0_rtx;
19133 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
19134 const0_rtx), x);
19135 x = gen_rtx_SET (const0_rtx, x);
19136 start_sequence ();
19137 vselect_insn = emit_insn (x);
19138 end_sequence ();
19139}
19140
19141/* Construct (set target (vec_select op0 (parallel perm))) and
19142 return true if that's a valid instruction in the active ISA. */
19143
19144static bool
19145expand_vselect (rtx target, rtx op0, const unsigned char *perm,
19146 unsigned nelt, bool testing_p)
19147{
19148 unsigned int i;
19149 rtx x, save_vconcat;
19150 int icode;
19151
19152 if (vselect_insn == NULL_RTX)
19153 init_vselect_insn ();
19154
19155 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
19156 PUT_NUM_ELEM (XVEC (x, 0), nelt);
19157 for (i = 0; i < nelt; ++i)
19158 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
19159 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
19160 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
19161 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
19162 SET_DEST (PATTERN (vselect_insn)) = target;
19163 icode = recog_memoized (vselect_insn);
19164
19165 if (icode >= 0 && !testing_p)
19166 emit_insn (copy_rtx (PATTERN (vselect_insn)));
19167
19168 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
19169 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
19170 INSN_CODE (vselect_insn) = -1;
19171
19172 return icode >= 0;
19173}
19174
19175/* Similar, but generate a vec_concat from op0 and op1 as well. */
19176
19177static bool
19178expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
19179 const unsigned char *perm, unsigned nelt,
19180 bool testing_p)
19181{
19182 machine_mode v2mode;
19183 rtx x;
19184 bool ok;
19185
19186 if (vselect_insn == NULL_RTX)
19187 init_vselect_insn ();
19188
19189 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
19190 return false;
19191 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
19192 PUT_MODE (x, v2mode);
19193 XEXP (x, 0) = op0;
19194 XEXP (x, 1) = op1;
19195 ok = expand_vselect (target, x, perm, nelt, testing_p);
19196 XEXP (x, 0) = const0_rtx;
19197 XEXP (x, 1) = const0_rtx;
19198 return ok;
19199}
19200
19201/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19202 using movss or movsd. */
19203static bool
19204expand_vec_perm_movs (struct expand_vec_perm_d *d)
19205{
19206 machine_mode vmode = d->vmode;
19207 unsigned i, nelt = d->nelt;
19208 rtx x;
19209
19210 if (d->one_operand_p)
19211 return false;
19212
19213 if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
19214 && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
19215 && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
19216 return false;
19217
19218 /* Only the first element is changed. */
19219 if (d->perm[0] != nelt && d->perm[0] != 0)
19220 return false;
19221 for (i = 1; i < nelt; ++i)
19222 if (d->perm[i] != i + nelt - d->perm[0])
19223 return false;
19224
19225 if (d->testing_p)
19226 return true;
19227
19228 if (d->perm[0] == nelt)
19229 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
19230 else
19231 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
19232
19233 emit_insn (gen_rtx_SET (d->target, x));
19234
19235 return true;
19236}
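
/* Illustrative example, not part of the code above: on V4SFmode the
   permutation { 4, 1, 2, 3 } takes element 0 from op1 and elements
   1..3 from op0, which is exactly a movss, so it is accepted via the
   perm[0] == nelt case; { 4, 5, 2, 3 } is rejected because more than
   the first element changes.  */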
19237
19238/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19239 using insertps. */
19240static bool
19241expand_vec_perm_insertps (struct expand_vec_perm_d *d)
19242{
19243 machine_mode vmode = d->vmode;
19244 unsigned i, cnt_s, nelt = d->nelt;
19245 int cnt_d = -1;
19246 rtx src, dst;
19247
19248 if (d->one_operand_p)
19249 return false;
19250
19251 if (!(TARGET_SSE4_1
19252 && (vmode == V4SFmode || vmode == V4SImode
19253 || (TARGET_MMX_WITH_SSE
19254 && (vmode == V2SFmode || vmode == V2SImode)))))
19255 return false;
19256
19257 for (i = 0; i < nelt; ++i)
19258 {
19259 if (d->perm[i] == i)
19260 continue;
19261 if (cnt_d != -1)
19262 {
19263 cnt_d = -1;
19264 break;
19265 }
19266 cnt_d = i;
19267 }
19268
19269 if (cnt_d == -1)
19270 {
19271 for (i = 0; i < nelt; ++i)
19272 {
19273 if (d->perm[i] == i + nelt)
19274 continue;
19275 if (cnt_d != -1)
19276 return false;
19277 cnt_d = i;
19278 }
19279
19280 if (cnt_d == -1)
19281 return false;
19282 }
19283
19284 if (d->testing_p)
19285 return true;
19286
19287 gcc_assert (cnt_d != -1);
19288
19289 cnt_s = d->perm[cnt_d];
19290 if (cnt_s < nelt)
19291 {
19292 src = d->op0;
19293 dst = d->op1;
19294 }
19295 else
19296 {
19297 cnt_s -= nelt;
19298 src = d->op1;
19299 dst = d->op0;
19300 }
19301 gcc_assert (cnt_s < nelt);
19302
19303 rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
19304 GEN_INT (cnt_s << 6 | cnt_d << 4));
19305 emit_insn (x);
19306
19307 return true;
19308}
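
/* Illustrative example, not part of the code above: the insertps
   immediate places the source element index in bits 7:6 and the
   destination slot in bits 5:4 (the zero mask in bits 3:0 stays 0).
   A V4SFmode permutation { 0, 6, 2, 3 } therefore has cnt_d == 1 and
   cnt_s == 2, and the emitted immediate is (2 << 6) | (1 << 4) == 0x90.  */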
19309
19310/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19311 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
19312
19313static bool
19314expand_vec_perm_blend (struct expand_vec_perm_d *d)
19315{
19316 machine_mode mmode, vmode = d->vmode;
19317 unsigned i, nelt = d->nelt;
19318 unsigned HOST_WIDE_INT mask;
19319 rtx target, op0, op1, maskop, x;
19320 rtx rperm[32], vperm;
19321
19322 if (d->one_operand_p)
19323 return false;
19324 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
19325 && (TARGET_AVX512BW
19326 || GET_MODE_UNIT_SIZE (vmode) >= 4))
19327 ;
19328 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19329 ;
19330 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19331 ;
19332 else if (TARGET_SSE4_1
19333 && (GET_MODE_SIZE (vmode) == 16
19334 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
19335 || GET_MODE_SIZE (vmode) == 4))
19336 ;
19337 else
19338 return false;
19339
19340 /* This is a blend, not a permute. Elements must stay in their
19341 respective lanes. */
19342 for (i = 0; i < nelt; ++i)
19343 {
19344 unsigned e = d->perm[i];
19345 if (!(e == i || e == i + nelt))
19346 return false;
19347 }
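 /* For example (illustrative only): on V8HImode the permutation
 { 0, 9, 2, 11, 4, 13, 6, 15 } keeps the even elements of op0 and
 takes the odd elements from op1, so every element stays in its
 lane and the mask computed below is 0xaa.  */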
19348
19349 if (d->testing_p)
19350 return true;
19351
19352 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
19353 decision should be extracted elsewhere, so that we only try that
19354 sequence once all budget==3 options have been tried. */
19355 target = d->target;
19356 op0 = d->op0;
19357 op1 = d->op1;
19358 mask = 0;
19359
19360 switch (vmode)
19361 {
19362 case E_V8DFmode:
19363 case E_V16SFmode:
19364 case E_V4DFmode:
19365 case E_V8SFmode:
19366 case E_V2DFmode:
19367 case E_V4SFmode:
19368 case E_V2SFmode:
19369 case E_V2HImode:
19370 case E_V4HImode:
19371 case E_V8HImode:
19372 case E_V8SImode:
19373 case E_V32HImode:
19374 case E_V64QImode:
19375 case E_V16SImode:
19376 case E_V8DImode:
19377 for (i = 0; i < nelt; ++i)
19378 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19379 break;
19380
19381 case E_V2DImode:
19382 for (i = 0; i < 2; ++i)
19383 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
19384 vmode = V8HImode;
19385 goto do_subreg;
19386
19387 case E_V2SImode:
19388 for (i = 0; i < 2; ++i)
19389 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
19390 vmode = V4HImode;
19391 goto do_subreg;
19392
19393 case E_V4SImode:
19394 if (TARGET_AVX2)
19395 {
19396 /* Use vpblendd instead of vpblendw. */
19397 for (i = 0; i < nelt; ++i)
19398 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
19399 break;
19400 }
19401 else
19402 {
19403 for (i = 0; i < 4; ++i)
19404 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19405 vmode = V8HImode;
19406 goto do_subreg;
19407 }
19408
19409 case E_V16QImode:
19410 /* See if bytes move in pairs so we can use pblendw with
19411 an immediate argument, rather than pblendvb with a vector
19412 argument. */
19413 for (i = 0; i < 16; i += 2)
19414 if (d->perm[i] + 1 != d->perm[i + 1])
19415 {
19416 use_pblendvb:
19417 for (i = 0; i < nelt; ++i)
19418 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
19419
19420 finish_pblendvb:
19421 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19422 vperm = force_reg (vmode, vperm);
19423
19424 if (GET_MODE_SIZE (vmode) == 4)
19425 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
19426 else if (GET_MODE_SIZE (vmode) == 8)
19427 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
19428 else if (GET_MODE_SIZE (vmode) == 16)
19429 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
19430 else
19431 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
19432 if (target != d->target)
19433 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19434 return true;
19435 }
19436
19437 for (i = 0; i < 8; ++i)
19438 mask |= (d->perm[i * 2] >= 16) << i;
19439 vmode = V8HImode;
19440 /* FALLTHRU */
19441
19442 do_subreg:
19443 target = gen_reg_rtx (vmode);
19444 op0 = gen_lowpart (vmode, op0);
19445 op1 = gen_lowpart (vmode, op1);
19446 break;
19447
19448 case E_V8QImode:
19449 for (i = 0; i < 8; i += 2)
19450 if (d->perm[i] + 1 != d->perm[i + 1])
19451 goto use_pblendvb;
19452
19453 for (i = 0; i < 4; ++i)
19454 mask |= (d->perm[i * 2] >= 8) << i;
19455 vmode = V4HImode;
19456 goto do_subreg;
19457
19458 case E_V4QImode:
19459 for (i = 0; i < 4; i += 2)
19460 if (d->perm[i] + 1 != d->perm[i + 1])
19461 goto use_pblendvb;
19462
19463 for (i = 0; i < 2; ++i)
19464 mask |= (d->perm[i * 2] >= 4) << i;
19465 vmode = V2HImode;
19466 goto do_subreg;
19467
19468 case E_V32QImode:
19469 /* See if bytes move in pairs. If not, vpblendvb must be used. */
19470 for (i = 0; i < 32; i += 2)
19471 if (d->perm[i] + 1 != d->perm[i + 1])
19472 goto use_pblendvb;
19473 /* See if bytes move in quadruplets. If yes, vpblendd
19474 with immediate can be used. */
19475 for (i = 0; i < 32; i += 4)
19476 if (d->perm[i] + 2 != d->perm[i + 2])
19477 break;
19478 if (i < 32)
19479 {
19480 /* See if bytes move the same in both lanes. If yes,
19481 vpblendw with immediate can be used. */
19482 for (i = 0; i < 16; i += 2)
19483 if (d->perm[i] + 16 != d->perm[i + 16])
19484 goto use_pblendvb;
19485
19486 /* Use vpblendw. */
19487 for (i = 0; i < 16; ++i)
19488 mask |= (d->perm[i * 2] >= 32) << i;
19489 vmode = V16HImode;
19490 goto do_subreg;
19491 }
19492
19493 /* Use vpblendd. */
19494 for (i = 0; i < 8; ++i)
19495 mask |= (d->perm[i * 4] >= 32) << i;
19496 vmode = V8SImode;
19497 goto do_subreg;
19498
19499 case E_V16HImode:
19500 /* See if words move in pairs. If yes, vpblendd can be used. */
19501 for (i = 0; i < 16; i += 2)
19502 if (d->perm[i] + 1 != d->perm[i + 1])
19503 break;
19504 if (i < 16)
19505 {
19506 /* See if words move the same in both lanes. If not,
19507 vpblendvb must be used. */
19508 for (i = 0; i < 8; i++)
19509 if (d->perm[i] + 8 != d->perm[i + 8])
19510 {
19511 /* Use vpblendvb. */
19512 for (i = 0; i < 32; ++i)
19513 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
19514
19515 vmode = V32QImode;
19516 nelt = 32;
19517 target = gen_reg_rtx (vmode);
19518 op0 = gen_lowpart (vmode, op0);
19519 op1 = gen_lowpart (vmode, op1);
19520 goto finish_pblendvb;
19521 }
19522
19523 /* Use vpblendw. */
19524 for (i = 0; i < 16; ++i)
19525 mask |= (d->perm[i] >= 16) << i;
19526 break;
19527 }
19528
19529 /* Use vpblendd. */
19530 for (i = 0; i < 8; ++i)
19531 mask |= (d->perm[i * 2] >= 16) << i;
19532 vmode = V8SImode;
19533 goto do_subreg;
19534
19535 case E_V4DImode:
19536 /* Use vpblendd. */
19537 for (i = 0; i < 4; ++i)
19538 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
19539 vmode = V8SImode;
19540 goto do_subreg;
19541
19542 default:
19543 gcc_unreachable ();
19544 }
19545
19546 switch (vmode)
19547 {
19548 case E_V8DFmode:
19549 case E_V8DImode:
19550 mmode = QImode;
19551 break;
19552 case E_V16SFmode:
19553 case E_V16SImode:
19554 mmode = HImode;
19555 break;
19556 case E_V32HImode:
19557 mmode = SImode;
19558 break;
19559 case E_V64QImode:
19560 mmode = DImode;
19561 break;
19562 default:
19563 mmode = VOIDmode;
19564 }
19565
19566 /* Canonicalize vec_merge. */
19567 if (swap_commutative_operands_p (op1, op0)
19568 /* If the two operands have the same precedence, the
19569 first bit of the mask selects the first operand. */
19570 || (!swap_commutative_operands_p (op0, op1)
19571 && !(mask & 1)))
19572 {
19573 unsigned n_elts = GET_MODE_NUNITS (vmode);
19574 std::swap (op0, op1);
19575 unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
19576 if (n_elts == HOST_BITS_PER_WIDE_INT)
19577 mask_all = -1;
19578 else
19579 mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
19580 mask = ~mask & mask_all;
19581 }
19582
19583 if (mmode != VOIDmode)
19584 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
19585 else
19586 maskop = GEN_INT (mask);
19587
19588 /* This matches five different patterns with the different modes. */
19589 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
19590 x = gen_rtx_SET (target, x);
19591 emit_insn (x);
19592 if (target != d->target)
19593 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19594
19595 return true;
19596}
19597
19598/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19599 in terms of the variable form of vpermilps.
19600
19601 Note that we will have already failed the immediate input vpermilps,
19602 which requires that the high and low part shuffle be identical; the
19603 variable form doesn't require that. */
19604
19605static bool
19606expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
19607{
19608 rtx rperm[8], vperm;
19609 unsigned i;
19610
19611 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
19612 return false;
19613
19614 /* We can only permute within the 128-bit lane. */
19615 for (i = 0; i < 8; ++i)
19616 {
19617 unsigned e = d->perm[i];
19618 if (i < 4 ? e >= 4 : e < 4)
19619 return false;
19620 }
19621
19622 if (d->testing_p)
19623 return true;
19624
19625 for (i = 0; i < 8; ++i)
19626 {
19627 unsigned e = d->perm[i];
19628
19629 /* Within each 128-bit lane, the elements of op0 are numbered
19630 from 0 and the elements of op1 are numbered from 4. */
19631 if (e >= 8 + 4)
19632 e -= 8;
19633 else if (e >= 4)
19634 e -= 4;
19635
19636 rperm[i] = GEN_INT (e);
19637 }
19638
19639 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
19640 vperm = force_reg (V8SImode, vperm);
19641 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
19642
19643 return true;
19644}
19645
19646/* For V*[QHS]Imode permutations, check whether the same permutation
19647 can instead be performed in a 2x, 4x or 8x wider inner mode. */
19648
19649static bool
19650canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
19651 struct expand_vec_perm_d *nd)
19652{
19653 int i;
19654 machine_mode mode = VOIDmode;
19655
19656 switch (d->vmode)
19657 {
19658 case E_V8QImode: mode = V4HImode; break;
19659 case E_V16QImode: mode = V8HImode; break;
19660 case E_V32QImode: mode = V16HImode; break;
19661 case E_V64QImode: mode = V32HImode; break;
19662 case E_V4HImode: mode = V2SImode; break;
19663 case E_V8HImode: mode = V4SImode; break;
19664 case E_V16HImode: mode = V8SImode; break;
19665 case E_V32HImode: mode = V16SImode; break;
19666 case E_V4SImode: mode = V2DImode; break;
19667 case E_V8SImode: mode = V4DImode; break;
19668 case E_V16SImode: mode = V8DImode; break;
19669 default: return false;
19670 }
19671 for (i = 0; i < d->nelt; i += 2)
19672 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
19673 return false;
19674 nd->vmode = mode;
19675 nd->nelt = d->nelt / 2;
19676 for (i = 0; i < nd->nelt; i++)
19677 nd->perm[i] = d->perm[2 * i] / 2;
19678 if (GET_MODE_INNER (mode) != DImode)
19679 canonicalize_vector_int_perm (nd, nd);
19680 if (nd != d)
19681 {
19682 nd->one_operand_p = d->one_operand_p;
19683 nd->testing_p = d->testing_p;
19684 if (d->op0 == d->op1)
19685 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
19686 else
19687 {
19688 nd->op0 = gen_lowpart (nd->vmode, d->op0);
19689 nd->op1 = gen_lowpart (nd->vmode, d->op1);
19690 }
19691 if (d->testing_p)
19692 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
19693 else
19694 nd->target = gen_reg_rtx (nd->vmode);
19695 }
19696 return true;
19697}
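
/* Illustrative example, not part of the code above: the V16QImode
   permutation { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 }
   moves bytes in aligned pairs, so it is narrowed to the V8HImode
   permutation { 1, 0, 3, 2, 5, 4, 7, 6 }; the recursive attempt to
   narrow further to V4SImode fails because those halfword indices no
   longer move in aligned pairs.  */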
19698
19699/* Return true if permutation D can be performed as VMODE permutation
19700 instead. */
19701
19702static bool
19703valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
19704{
19705 unsigned int i, j, chunk;
19706
19707 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
19708 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
19709 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
19710 return false;
19711
19712 if (GET_MODE_NUNITS (vmode) >= d->nelt)
19713 return true;
19714
19715 chunk = d->nelt / GET_MODE_NUNITS (vmode);
19716 for (i = 0; i < d->nelt; i += chunk)
19717 if (d->perm[i] & (chunk - 1))
19718 return false;
19719 else
19720 for (j = 1; j < chunk; ++j)
19721 if (d->perm[i] + j != d->perm[i + j])
19722 return false;
19723
19724 return true;
19725}
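
/* Illustrative example, not part of the code above: the V16QImode
   permutation { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 }
   moves bytes in aligned groups of four (chunk == 4), so it passes the
   check above for V4SImode and can be done as the V4SImode permutation
   { 1, 0, 3, 2 }.  */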
19726
19727/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19728 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
19729
19730static bool
19731expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
19732{
19733 unsigned i, nelt, eltsz, mask;
19734 unsigned char perm[64];
19735 machine_mode vmode;
19736 struct expand_vec_perm_d nd;
19737 rtx rperm[64], vperm, target, op0, op1;
19738
19739 nelt = d->nelt;
19740
19741 if (!d->one_operand_p)
19742 switch (GET_MODE_SIZE (d->vmode))
19743 {
19744 case 4:
19745 if (!TARGET_XOP)
19746 return false;
19747 vmode = V4QImode;
19748 break;
19749
19750 case 8:
19751 if (!TARGET_XOP)
19752 return false;
19753 vmode = V8QImode;
19754 break;
19755
19756 case 16:
19757 if (!TARGET_XOP)
19758 return false;
19759 vmode = V16QImode;
19760 break;
19761
19762 case 32:
19763 if (!TARGET_AVX2)
19764 return false;
19765
19766 if (valid_perm_using_mode_p (V2TImode, d))
19767 {
19768 if (d->testing_p)
19769 return true;
19770
19771 /* Use vperm2i128 insn. The pattern uses
19772 V4DImode instead of V2TImode. */
19773 target = d->target;
19774 if (d->vmode != V4DImode)
19775 target = gen_reg_rtx (V4DImode);
19776 op0 = gen_lowpart (V4DImode, d->op0);
19777 op1 = gen_lowpart (V4DImode, d->op1);
19778 rperm[0]
19779 = GEN_INT ((d->perm[0] / (nelt / 2))
19780 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
19781 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
19782 if (target != d->target)
19783 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19784 return true;
19785 }
19786 /* FALLTHRU */
19787
19788 default:
19789 return false;
19790 }
19791 else
19792 switch (GET_MODE_SIZE (d->vmode))
19793 {
19794 case 4:
19795 if (!TARGET_SSSE3)
19796 return false;
19797 vmode = V4QImode;
19798 break;
19799
19800 case 8:
19801 if (!TARGET_SSSE3)
19802 return false;
19803 vmode = V8QImode;
19804 break;
19805
19806 case 16:
19807 if (!TARGET_SSSE3)
19808 return false;
19809 vmode = V16QImode;
19810 break;
19811
19812 case 32:
19813 if (!TARGET_AVX2)
19814 return false;
19815
19816 /* V4DImode should be already handled through
19817 expand_vselect by vpermq instruction. */
19818 gcc_assert (d->vmode != V4DImode);
19819
19820 vmode = V32QImode;
19821 if (d->vmode == V8SImode
19822 || d->vmode == V16HImode
19823 || d->vmode == V32QImode)
19824 {
19825 /* First see if vpermq can be used for
19826 V8SImode/V16HImode/V32QImode. */
19827 if (valid_perm_using_mode_p (V4DImode, d))
19828 {
19829 for (i = 0; i < 4; i++)
19830 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
19831 if (d->testing_p)
19832 return true;
19833 target = gen_reg_rtx (V4DImode);
19834 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
19835 perm, 4, false))
19836 {
19837 emit_move_insn (d->target,
19838 gen_lowpart (d->vmode, target));
19839 return true;
19840 }
19841 return false;
19842 }
19843
19844 /* Next see if vpermd can be used. */
19845 if (valid_perm_using_mode_p (V8SImode, d))
19846 vmode = V8SImode;
19847 }
19848 /* Or if vpermps can be used. */
19849 else if (d->vmode == V8SFmode)
19850 vmode = V8SImode;
19851
19852 if (vmode == V32QImode)
19853 {
19854 /* vpshufb only works within lanes; it is not
19855 possible to shuffle bytes between the lanes. */
19856 for (i = 0; i < nelt; ++i)
19857 if ((d->perm[i] ^ i) & (nelt / 2))
19858 return false;
19859 }
19860 break;
19861
19862 case 64:
19863 if (!TARGET_AVX512BW)
19864 return false;
19865
19866 /* If vpermq didn't work, vpshufb won't work either. */
19867 if (d->vmode == V8DFmode || d->vmode == V8DImode)
19868 return false;
19869
19870 vmode = V64QImode;
19871 if (d->vmode == V16SImode
19872 || d->vmode == V32HImode
19873 || d->vmode == V64QImode)
19874 {
19875 /* First see if vpermq can be used for
19876 V16SImode/V32HImode/V64QImode. */
19877 if (valid_perm_using_mode_p (V8DImode, d))
19878 {
19879 for (i = 0; i < 8; i++)
19880 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
19881 if (d->testing_p)
19882 return true;
19883 target = gen_reg_rtx (V8DImode);
19884 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
19885 perm, 8, false))
19886 {
19887 emit_move_insn (d->target,
19888 gen_lowpart (d->vmode, target));
19889 return true;
19890 }
19891 return false;
19892 }
19893
19894 /* Next see if vpermd can be used. */
19895 if (valid_perm_using_mode_p (V16SImode, d))
19896 vmode = V16SImode;
19897 }
19898 /* Or if vpermps can be used. */
19899 else if (d->vmode == V16SFmode)
19900 vmode = V16SImode;
19901
19902 if (vmode == V64QImode)
19903 {
19904 /* vpshufb only works within lanes; it is not
19905 possible to shuffle bytes between the lanes. */
19906 for (i = 0; i < nelt; ++i)
19907 if ((d->perm[i] ^ i) & (3 * nelt / 4))
19908 return false;
19909 }
19910 break;
19911
19912 default:
19913 return false;
19914 }
19915
19916 if (d->testing_p)
19917 return true;
19918
19919 /* Try to avoid variable permutation instruction. */
19920 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19921 {
19922 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19923 return true;
19924 }
19925
19926 if (vmode == V8SImode)
19927 for (i = 0; i < 8; ++i)
19928 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19929 else if (vmode == V16SImode)
19930 for (i = 0; i < 16; ++i)
19931 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19932 else
19933 {
19934 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19935 if (!d->one_operand_p)
19936 mask = 2 * nelt - 1;
19937 else if (vmode == V64QImode)
19938 mask = nelt / 4 - 1;
19939 else if (vmode == V32QImode)
19940 mask = nelt / 2 - 1;
19941 else
19942 mask = nelt - 1;
19943
19944 for (i = 0; i < nelt; ++i)
19945 {
19946 unsigned j, e = d->perm[i] & mask;
19947 for (j = 0; j < eltsz; ++j)
19948 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19949 }
19950 }
19951
19952 machine_mode vpmode = vmode;
19953
19954 nelt = GET_MODE_SIZE (vmode);
19955
19956 /* Emulate narrow modes with V16QI instructions. */
19957 if (nelt < 16)
19958 {
19959 rtx m128 = GEN_INT (-128);
19960
19961 /* Remap elements from the second operand, as we have to
19962 account for inactive top elements from the first operand. */
19963 if (!d->one_operand_p)
19964 {
19965 for (i = 0; i < nelt; ++i)
19966 {
19967 unsigned ival = UINTVAL (rperm[i]);
19968 if (ival >= nelt)
19969 rperm[i] = GEN_INT (ival + 16 - nelt);
19970 }
19971 }
19972
19973 /* Fill inactive elements in the top positions with zeros. */
19974 for (i = nelt; i < 16; ++i)
19975 rperm[i] = m128;
19976
19977 vpmode = V16QImode;
19978 }
19979
19980 vperm = gen_rtx_CONST_VECTOR (vpmode,
19981 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19982 vperm = force_reg (vpmode, vperm);
19983
19984 if (vmode == d->vmode)
19985 target = d->target;
19986 else
19987 target = gen_reg_rtx (vmode);
19988
19989 op0 = gen_lowpart (vmode, d->op0);
19990
19991 if (d->one_operand_p)
19992 {
19993 rtx (*gen) (rtx, rtx, rtx);
19994
19995 if (vmode == V4QImode)
19996 gen = gen_mmx_pshufbv4qi3;
19997 else if (vmode == V8QImode)
19998 gen = gen_mmx_pshufbv8qi3;
19999 else if (vmode == V16QImode)
20000 gen = gen_ssse3_pshufbv16qi3;
20001 else if (vmode == V32QImode)
20002 gen = gen_avx2_pshufbv32qi3;
20003 else if (vmode == V64QImode)
20004 gen = gen_avx512bw_pshufbv64qi3;
20005 else if (vmode == V8SFmode)
20006 gen = gen_avx2_permvarv8sf;
20007 else if (vmode == V8SImode)
20008 gen = gen_avx2_permvarv8si;
20009 else if (vmode == V16SFmode)
20010 gen = gen_avx512f_permvarv16sf;
20011 else if (vmode == V16SImode)
20012 gen = gen_avx512f_permvarv16si;
20013 else
20014 gcc_unreachable ();
20015
20016 emit_insn (gen (target, op0, vperm));
20017 }
20018 else
20019 {
20020 rtx (*gen) (rtx, rtx, rtx, rtx);
20021
20022 op1 = gen_lowpart (vmode, d->op1);
20023
20024 if (vmode == V4QImode)
20025 gen = gen_mmx_ppermv32;
20026 else if (vmode == V8QImode)
20027 gen = gen_mmx_ppermv64;
20028 else if (vmode == V16QImode)
20029 gen = gen_xop_pperm;
20030 else
20031 gcc_unreachable ();
20032
20033 emit_insn (gen (target, op0, op1, vperm));
20034 }
20035
20036 if (target != d->target)
20037 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
20038
20039 return true;
20040}
20041
20042/* Try to expand one-operand permutation with constant mask. */
20043
20044static bool
20045ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
20046{
20047 machine_mode mode = GET_MODE (d->op0);
20048 machine_mode maskmode = mode;
20049 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
20050 rtx (*gen) (rtx, rtx, rtx) = NULL;
20051 rtx target, op0, mask;
20052 rtx vec[64];
20053
20054 if (!rtx_equal_p (d->op0, d->op1))
20055 return false;
20056
20057 if (!TARGET_AVX512F)
20058 return false;
20059
20060 /* Accept VNxHImode and VNxQImode now. */
20061 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
20062 return false;
20063
20064 /* vpermw. */
20065 if (!TARGET_AVX512BW && inner_size == 2)
20066 return false;
20067
20068 /* vpermb. */
20069 if (!TARGET_AVX512VBMI && inner_size == 1)
20070 return false;
20071
20072 switch (mode)
20073 {
20074 case E_V16SImode:
20075 gen = gen_avx512f_permvarv16si;
20076 break;
20077 case E_V16SFmode:
20078 gen = gen_avx512f_permvarv16sf;
20079 maskmode = V16SImode;
20080 break;
20081 case E_V8DImode:
20082 gen = gen_avx512f_permvarv8di;
20083 break;
20084 case E_V8DFmode:
20085 gen = gen_avx512f_permvarv8df;
20086 maskmode = V8DImode;
20087 break;
20088 case E_V32HImode:
20089 gen = gen_avx512bw_permvarv32hi;
20090 break;
20091 case E_V16HImode:
20092 gen = gen_avx512vl_permvarv16hi;
20093 break;
20094 case E_V8HImode:
20095 gen = gen_avx512vl_permvarv8hi;
20096 break;
20097 case E_V64QImode:
20098 gen = gen_avx512bw_permvarv64qi;
20099 break;
20100 case E_V32QImode:
20101 gen = gen_avx512vl_permvarv32qi;
20102 break;
20103 case E_V16QImode:
20104 gen = gen_avx512vl_permvarv16qi;
20105 break;
20106
20107 default:
20108 return false;
20109 }
20110
20111 if (d->testing_p)
20112 return true;
20113
20114 target = d->target;
20115 op0 = d->op0;
20116 for (int i = 0; i < d->nelt; ++i)
20117 vec[i] = GEN_INT (d->perm[i]);
20118 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
20119 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
20120 return true;
20121}
20122
20123static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
20124
20125/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
20126 in a single instruction. */
20127
20128static bool
20129expand_vec_perm_1 (struct expand_vec_perm_d *d)
20130{
20131 unsigned i, nelt = d->nelt;
20132 struct expand_vec_perm_d nd;
20133
20134 /* Check plain VEC_SELECT first, because AVX has instructions that could
20135 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
20136 input where SEL+CONCAT may not. */
20137 if (d->one_operand_p)
20138 {
20139 int mask = nelt - 1;
20140 bool identity_perm = true;
20141 bool broadcast_perm = true;
20142
20143 for (i = 0; i < nelt; i++)
20144 {
20145 nd.perm[i] = d->perm[i] & mask;
20146 if (nd.perm[i] != i)
20147 identity_perm = false;
20148 if (nd.perm[i])
20149 broadcast_perm = false;
20150 }
20151
20152 if (identity_perm)
20153 {
20154 if (!d->testing_p)
20155 emit_move_insn (d->target, d->op0);
20156 return true;
20157 }
20158 else if (broadcast_perm && TARGET_AVX2)
20159 {
20160 /* Use vpbroadcast{b,w,d}. */
20161 rtx (*gen) (rtx, rtx) = NULL;
20162 switch (d->vmode)
20163 {
20164 case E_V64QImode:
20165 if (TARGET_AVX512BW)
20166 gen = gen_avx512bw_vec_dupv64qi_1;
20167 break;
20168 case E_V32QImode:
20169 gen = gen_avx2_pbroadcastv32qi_1;
20170 break;
20171 case E_V32HImode:
20172 if (TARGET_AVX512BW)
20173 gen = gen_avx512bw_vec_dupv32hi_1;
20174 break;
20175 case E_V16HImode:
20176 gen = gen_avx2_pbroadcastv16hi_1;
20177 break;
20178 case E_V16SImode:
20179 if (TARGET_AVX512F)
20180 gen = gen_avx512f_vec_dupv16si_1;
20181 break;
20182 case E_V8SImode:
20183 gen = gen_avx2_pbroadcastv8si_1;
20184 break;
20185 case E_V16QImode:
20186 gen = gen_avx2_pbroadcastv16qi;
20187 break;
20188 case E_V8HImode:
20189 gen = gen_avx2_pbroadcastv8hi;
20190 break;
20191 case E_V16SFmode:
20192 if (TARGET_AVX512F)
20193 gen = gen_avx512f_vec_dupv16sf_1;
20194 break;
20195 case E_V8SFmode:
20196 gen = gen_avx2_vec_dupv8sf_1;
20197 break;
20198 case E_V8DFmode:
20199 if (TARGET_AVX512F)
20200 gen = gen_avx512f_vec_dupv8df_1;
20201 break;
20202 case E_V8DImode:
20203 if (TARGET_AVX512F)
20204 gen = gen_avx512f_vec_dupv8di_1;
20205 break;
20206 /* For other modes prefer other shuffles this function creates. */
20207 default: break;
20208 }
20209 if (gen != NULL)
20210 {
20211 if (!d->testing_p)
20212 emit_insn (gen (d->target, d->op0));
20213 return true;
20214 }
20215 }
20216
20217 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
20218 return true;
20219
20220 /* There are plenty of patterns in sse.md that are written for
20221 SEL+CONCAT and are not replicated for a single op. Perhaps
20222 that should be changed, to avoid the nastiness here. */
20223
20224 /* Recognize interleave style patterns, which means incrementing
20225 every other permutation operand. */
20226 for (i = 0; i < nelt; i += 2)
20227 {
20228 nd.perm[i] = d->perm[i] & mask;
20229 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
20230 }
20231 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
20232 d->testing_p))
20233 return true;
20234
20235 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
20236 if (nelt >= 4)
20237 {
20238 for (i = 0; i < nelt; i += 4)
20239 {
20240 nd.perm[i + 0] = d->perm[i + 0] & mask;
20241 nd.perm[i + 1] = d->perm[i + 1] & mask;
20242 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
20243 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
20244 }
20245
20246 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
20247 d->testing_p))
20248 return true;
20249 }
20250 }
20251
20252 /* Try the SSE4.1 blend variable merge instructions. */
20253 if (expand_vec_perm_blend (d))
20254 return true;
20255
20256 /* Try movss/movsd instructions. */
20257 if (expand_vec_perm_movs (d))
20258 return true;
20259
20260 /* Try the SSE4.1 insertps instruction. */
20261 if (expand_vec_perm_insertps (d))
20262 return true;
20263
20264 /* Try the fully general two operand permute. */
20265 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
20266 d->testing_p))
20267 return true;
20268
20269 /* Recognize interleave style patterns with reversed operands. */
20270 if (!d->one_operand_p)
20271 {
20272 for (i = 0; i < nelt; ++i)
20273 {
20274 unsigned e = d->perm[i];
20275 if (e >= nelt)
20276 e -= nelt;
20277 else
20278 e += nelt;
20279 nd.perm[i] = e;
20280 }
20281
20282 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
20283 d->testing_p))
20284 return true;
20285 }
20286
20287 /* Try one of the AVX vpermil variable permutations. */
20288 if (expand_vec_perm_vpermil (d))
20289 return true;
20290
20291 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
20292 vpshufb, vpermd, vpermps or vpermq variable permutation. */
20293 if (expand_vec_perm_pshufb (d))
20294 return true;
20295
20296 /* Try the AVX2 vpalignr instruction. */
20297 if (expand_vec_perm_palignr (d, true))
20298 return true;
20299
20300 /* Try the AVX512F vperm{w,b,s,d} instructions */
20301 if (ix86_expand_vec_one_operand_perm_avx512 (d))
20302 return true;
20303
20304 /* Try the AVX512F vpermt2/vpermi2 instructions. */
20305 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
20306 return true;
20307
20308 /* See if we can get the same permutation in different vector integer
20309 mode. */
20310 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
20311 {
20312 if (!d->testing_p)
20313 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
20314 return true;
20315 }
20316 return false;
20317}
20318
20319/* Canonicalize the vec_perm index so that the first index
20320 always comes from the first vector. */
20321static void
20322ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
20323{
20324 unsigned nelt = d->nelt;
20325 if (d->perm[0] < nelt)
20326 return;
20327
20328 for (unsigned i = 0; i != nelt; i++)
20329 d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
20330
20331 std::swap (d->op0, d->op1);
20332 return;
20333}
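
/* Illustrative example, not part of the code above: with nelt == 4 the
   permutation { 6, 1, 2, 7 } starts in the second vector, so it is
   rewritten to { 2, 5, 6, 3 } and op0/op1 are swapped, which selects
   exactly the same elements.  */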
20334
20335/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20336 in terms of a pair of shufps + shufps/pshufd instructions. */
20337static bool
20338expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
20339{
20340 unsigned char perm1[4];
20341 machine_mode vmode = d->vmode;
20342 bool ok;
20343 unsigned i, j, k, count = 0;
20344
20345 if (d->one_operand_p
20346 || (vmode != V4SImode && vmode != V4SFmode))
20347 return false;
20348
20349 if (d->testing_p)
20350 return true;
20351
20352 ix86_vec_perm_index_canon (d);
20353 for (i = 0; i < 4; ++i)
20354 count += d->perm[i] > 3 ? 1 : 0;
20355
20356 gcc_assert (count & 3);
20357
20358 rtx tmp = gen_reg_rtx (vmode);
20359 /* 2 from op0 and 2 from op1. */
20360 if (count == 2)
20361 {
20362 unsigned char perm2[4];
20363 for (i = 0, j = 0, k = 2; i < 4; ++i)
20364 if (d->perm[i] & 4)
20365 {
20366 perm1[k++] = d->perm[i];
20367 perm2[i] = k - 1;
20368 }
20369 else
20370 {
20371 perm1[j++] = d->perm[i];
20372 perm2[i] = j - 1;
20373 }
20374
20375 /* shufps. */
20376 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20377 perm1, d->nelt, false);
20378 gcc_assert (ok);
20379 if (vmode == V4SImode && TARGET_SSE2)
20380 /* pshufd. */
20381 ok = expand_vselect (d->target, tmp,
20382 perm2, d->nelt, false);
20383 else
20384 {
20385 /* shufps. */
20386 perm2[2] += 4;
20387 perm2[3] += 4;
20388 ok = expand_vselect_vconcat (d->target, tmp, tmp,
20389 perm2, d->nelt, false);
20390 }
20391 gcc_assert (ok);
20392 }
20393 /* 3 from one op and 1 from another. */
20394 else
20395 {
20396 unsigned pair_idx = 8, lone_idx = 8, shift;
20397
20398 /* Find the lone index. */
20399 for (i = 0; i < 4; ++i)
20400 if ((d->perm[i] > 3 && count == 1)
20401 || (d->perm[i] < 4 && count == 3))
20402 lone_idx = i;
20403
20404 /* When lone_idx is not 0, it must come from the second op (count == 1). */
20405 gcc_assert (count == (lone_idx ? 1 : 3));
20406
20407 /* Find the pair index that sits in the same half as the lone index. */
20408 shift = lone_idx & 2;
20409 pair_idx = 1 - lone_idx + 2 * shift;
20410
20411 /* First permute the lone index and the pair index into the same vector as
20412 [ lone, lone, pair, pair ]. */
20413 perm1[1] = perm1[0]
20414 = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
20415 perm1[3] = perm1[2]
20416 = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
20417
20418 /* Always put the vector that contains the lone index first. */
20419 if (count == 1)
20420 std::swap (d->op0, d->op1);
20421
20422 /* shufps. */
20423 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
20424 perm1, d->nelt, false);
20425 gcc_assert (ok);
20426
20427 /* Refine lone and pair index to original order. */
20428 perm1[shift] = lone_idx << 1;
20429 perm1[shift + 1] = pair_idx << 1;
20430
20431 /* Select the remaining 2 elements in another vector. */
20432 for (i = 2 - shift; i < 4 - shift; ++i)
20433 perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
20434
20435 /* Adjust to original selector. */
20436 if (lone_idx > 1)
20437 std::swap (tmp, d->op1);
20438
20439 /* shufps. */
20440 ok = expand_vselect_vconcat (d->target, tmp, d->op1,
20441 perm1, d->nelt, false);
20442
20443 gcc_assert (ok);
20444 }
20445
20446 return true;
20447}
20448
20449/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
20450 in terms of a pair of pshuflw + pshufhw instructions. */
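/* For instance, d->perm = { 2, 0, 3, 1, 5, 7, 4, 6 } on V8HImode keeps the
   low four and high four elements within their own 64-bit halves, so a
   pshuflw applying { 2, 0, 3, 1 } to the low half followed by a pshufhw
   applying { 5, 7, 4, 6 } to the high half can implement it.  */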
20451
20452static bool
20453expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
20454{
20455 unsigned char perm2[MAX_VECT_LEN];
20456 unsigned i;
20457 bool ok;
20458
20459 if (d->vmode != V8HImode || !d->one_operand_p)
20460 return false;
20461
20462 /* The two permutations only operate in 64-bit lanes. */
20463 for (i = 0; i < 4; ++i)
20464 if (d->perm[i] >= 4)
20465 return false;
20466 for (i = 4; i < 8; ++i)
20467 if (d->perm[i] < 4)
20468 return false;
20469
20470 if (d->testing_p)
20471 return true;
20472
20473 /* Emit the pshuflw. */
20474 memcpy (perm2, d->perm, 4);
20475 for (i = 4; i < 8; ++i)
20476 perm2[i] = i;
20477 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
20478 gcc_assert (ok);
20479
20480 /* Emit the pshufhw. */
20481 memcpy (perm2 + 4, d->perm + 4, 4);
20482 for (i = 0; i < 4; ++i)
20483 perm2[i] = i;
20484 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
20485 gcc_assert (ok);
20486
20487 return true;
20488}
20489
20490/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20491 the permutation using the SSSE3 palignr instruction. This succeeds
20492 when all of the elements in PERM fit within one vector and we merely
20493 need to shift them down so that a single vector permutation has a
20494 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
20495 the vpalignr instruction itself can perform the requested permutation. */
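/* For instance, with two V16QImode operands and d->perm[i] = i + 5, all
   selected bytes form the contiguous range 5..20 of the concatenation, so a
   single palignr by 5 bytes already produces the desired result and no
   follow-up shuffle should be needed.  */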
20496
20497static bool
20498expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
20499{
20500 unsigned i, nelt = d->nelt;
20501 unsigned min, max, minswap, maxswap;
20502 bool in_order, ok, swap = false;
20503 rtx shift, target;
20504 struct expand_vec_perm_d dcopy;
20505
20506 /* Even with AVX, palignr only operates on 128-bit vectors;
20507 with AVX2, palignr operates within each of the two 128-bit lanes. */
20508 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
20509 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
20510 return false;
20511
20512 min = 2 * nelt;
20513 max = 0;
20514 minswap = 2 * nelt;
20515 maxswap = 0;
20516 for (i = 0; i < nelt; ++i)
20517 {
20518 unsigned e = d->perm[i];
20519 unsigned eswap = d->perm[i] ^ nelt;
20520 if (GET_MODE_SIZE (d->vmode) == 32)
20521 {
20522 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
20523 eswap = e ^ (nelt / 2);
20524 }
20525 if (e < min)
20526 min = e;
20527 if (e > max)
20528 max = e;
20529 if (eswap < minswap)
20530 minswap = eswap;
20531 if (eswap > maxswap)
20532 maxswap = eswap;
20533 }
20534 if (min == 0
20535 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
20536 {
20537 if (d->one_operand_p
20538 || minswap == 0
20539 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
20540 ? nelt / 2 : nelt))
20541 return false;
20542 swap = true;
20543 min = minswap;
20544 max = maxswap;
20545 }
20546
20547 /* Given that we have SSSE3, we know we'll be able to implement the
20548 single operand permutation after the palignr with pshufb for
20549 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
20550 first. */
20551 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
20552 return true;
20553
20554 dcopy = *d;
20555 if (swap)
20556 {
20557 dcopy.op0 = d->op1;
20558 dcopy.op1 = d->op0;
20559 for (i = 0; i < nelt; ++i)
20560 dcopy.perm[i] ^= nelt;
20561 }
20562
20563 in_order = true;
20564 for (i = 0; i < nelt; ++i)
20565 {
20566 unsigned e = dcopy.perm[i];
20567 if (GET_MODE_SIZE (d->vmode) == 32
20568 && e >= nelt
20569 && (e & (nelt / 2 - 1)) < min)
20570 e = e - min - (nelt / 2);
20571 else
20572 e = e - min;
20573 if (e != i)
20574 in_order = false;
20575 dcopy.perm[i] = e;
20576 }
20577 dcopy.one_operand_p = true;
20578
20579 if (single_insn_only_p && !in_order)
20580 return false;
20581
20582 /* For AVX2, test whether we can permute the result in one instruction. */
20583 if (d->testing_p)
20584 {
20585 if (in_order)
20586 return true;
20587 dcopy.op1 = dcopy.op0;
20588 return expand_vec_perm_1 (&dcopy);
20589 }
20590
20591 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
20592 if (GET_MODE_SIZE (d->vmode) == 16)
20593 {
20594 target = gen_reg_rtx (V1TImode);
20595 emit_insn (gen_ssse3_palignrv1ti (target,
20596 gen_lowpart (V1TImode, dcopy.op1),
20597 gen_lowpart (V1TImode, dcopy.op0),
20598 shift));
20599 }
20600 else
20601 {
20602 target = gen_reg_rtx (V2TImode);
20603 emit_insn (gen_avx2_palignrv2ti (target,
20604 gen_lowpart (V2TImode, dcopy.op1),
20605 gen_lowpart (V2TImode, dcopy.op0),
20606 shift));
20607 }
20608
20609 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
20610
20611 /* Test for the degenerate case where the alignment by itself
20612 produces the desired permutation. */
20613 if (in_order)
20614 {
20615 emit_move_insn (d->target, dcopy.op0);
20616 return true;
20617 }
20618
20619 ok = expand_vec_perm_1 (&dcopy);
20620 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
20621
20622 return ok;
20623}
20624
20625/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20626 the permutation using the SSE4_1 pblendv instruction. Potentially
20627 reduces the permutation from 2 pshufb plus an or to 1 pshufb plus a pblendv. */
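/* Taking the { 0 1 8 3 4 5 9 7 } example used below: only elements 8 and 9
   are out of place and both come from the second operand, so a one-operand
   shuffle can first move op1[0] and op1[1] into positions 2 and 6, after
   which a pblend selecting positions 2 and 6 from that shuffle and
   everything else from op0 completes the permutation.  */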
20628
20629static bool
20630expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
20631{
20632 unsigned i, which, nelt = d->nelt;
20633 struct expand_vec_perm_d dcopy, dcopy1;
20634 machine_mode vmode = d->vmode;
20635 bool ok;
20636
20637 /* Use the same checks as in expand_vec_perm_blend. */
20638 if (d->one_operand_p)
20639 return false;
20640 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20641 ;
20642 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20643 ;
20644 else if (TARGET_SSE4_1
20645 && (GET_MODE_SIZE (vmode) == 16
20646 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
20647 || GET_MODE_SIZE (vmode) == 4))
20648 ;
20649 else
20650 return false;
20651
20652 /* Figure out which permutation elements do not stay in their
20653 respective lanes. */
20654 for (i = 0, which = 0; i < nelt; ++i)
20655 {
20656 unsigned e = d->perm[i];
20657 if (e != i)
20658 which |= (e < nelt ? 1 : 2);
20659 }
20660 /* We can pblend the elements that do not stay in their
20661 respective lanes only when they all come from one half
20662 of the permutation.
20663 {0 1 8 3 4 5 9 7} is ok as 8, 9 are not in their respective
20664 lanes, but both 8 and 9 >= 8;
20665 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not in their
20666 respective lanes and 8 >= 8, but 2 is not. */
20667 if (which != 1 && which != 2)
20668 return false;
20669 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
20670 return true;
20671
20672 /* First we apply a one-operand permutation to the elements that
20673 do not stay in their respective lanes. */
20674 dcopy = *d;
20675 if (which == 2)
20676 dcopy.op0 = dcopy.op1 = d->op1;
20677 else
20678 dcopy.op0 = dcopy.op1 = d->op0;
20679 if (!d->testing_p)
20680 dcopy.target = gen_reg_rtx (vmode);
20681 dcopy.one_operand_p = true;
20682
20683 for (i = 0; i < nelt; ++i)
20684 dcopy.perm[i] = d->perm[i] & (nelt - 1);
20685
20686 ok = expand_vec_perm_1 (&dcopy);
20687 if (GET_MODE_SIZE (vmode) != 16 && !ok)
20688 return false;
20689 else
20690 gcc_assert (ok);
20691 if (d->testing_p)
20692 return true;
20693
20694 /* Next we put permuted elements into their positions. */
20695 dcopy1 = *d;
20696 if (which == 2)
20697 dcopy1.op1 = dcopy.target;
20698 else
20699 dcopy1.op0 = dcopy.target;
20700
20701 for (i = 0; i < nelt; ++i)
20702 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
20703
20704 ok = expand_vec_perm_blend (&dcopy1);
20705 gcc_assert (ok);
20706
20707 return true;
20708}
20709
20710static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
20711
20712/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20713 a two vector permutation into a single vector permutation by using
20714 an interleave operation to merge the vectors. */
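/* For instance, V4SImode with d->perm = { 1, 4, 0, 5 } only uses the low
   halves of both operands, so a punpckldq can first form
   { op0[0], op1[0], op0[1], op1[1] }, after which a single pshufd applying
   { 2, 1, 0, 3 } to that result gives the requested order.  */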
20715
20716static bool
20717expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
20718{
20719 struct expand_vec_perm_d dremap, dfinal;
20720 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
20721 unsigned HOST_WIDE_INT contents;
20722 unsigned char remap[2 * MAX_VECT_LEN];
20723 rtx_insn *seq;
20724 bool ok, same_halves = false;
20725
20726 if (GET_MODE_SIZE (d->vmode) == 4
20727 || GET_MODE_SIZE (d->vmode) == 8
20728 || GET_MODE_SIZE (d->vmode) == 16)
20729 {
20730 if (d->one_operand_p)
20731 return false;
20732 }
20733 else if (GET_MODE_SIZE (d->vmode) == 32)
20734 {
20735 if (!TARGET_AVX)
20736 return false;
20737 /* For 32-byte modes allow even d->one_operand_p.
20738 The lack of cross-lane shuffling in some instructions
20739 might prevent a single insn shuffle. */
20740 dfinal = *d;
20741 dfinal.testing_p = true;
20742 /* If expand_vec_perm_interleave3 can expand this into
20743 a 3 insn sequence, give up and let it be expanded as
20744 3 insn sequence. While that is one insn longer,
20745 it doesn't need a memory operand and in the common
20746 case that both interleave low and high permutations
20747 with the same operands are adjacent needs 4 insns
20748 for both after CSE. */
20749 if (expand_vec_perm_interleave3 (&dfinal))
20750 return false;
20751 }
20752 else
20753 return false;
20754
20755 /* Examine from whence the elements come. */
20756 contents = 0;
20757 for (i = 0; i < nelt; ++i)
20758 contents |= HOST_WIDE_INT_1U << d->perm[i];
20759
20760 memset (remap, 0xff, sizeof (remap));
20761 dremap = *d;
20762
20763 if (GET_MODE_SIZE (d->vmode) == 4
20764 || GET_MODE_SIZE (d->vmode) == 8)
20765 {
20766 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20767
20768 /* Split the two input vectors into 4 halves. */
20769 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20770 h2 = h1 << nelt2;
20771 h3 = h2 << nelt2;
20772 h4 = h3 << nelt2;
20773
20774 /* If all elements come from the low halves, use interleave low;
20775 similarly for interleave high. */
20776 if ((contents & (h1 | h3)) == contents)
20777 {
20778 /* punpckl* */
20779 for (i = 0; i < nelt2; ++i)
20780 {
20781 remap[i] = i * 2;
20782 remap[i + nelt] = i * 2 + 1;
20783 dremap.perm[i * 2] = i;
20784 dremap.perm[i * 2 + 1] = i + nelt;
20785 }
20786 }
20787 else if ((contents & (h2 | h4)) == contents)
20788 {
20789 /* punpckh* */
20790 for (i = 0; i < nelt2; ++i)
20791 {
20792 remap[i + nelt2] = i * 2;
20793 remap[i + nelt + nelt2] = i * 2 + 1;
20794 dremap.perm[i * 2] = i + nelt2;
20795 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20796 }
20797 }
20798 else
20799 return false;
20800 }
20801 else if (GET_MODE_SIZE (d->vmode) == 16)
20802 {
20803 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20804
20805 /* Split the two input vectors into 4 halves. */
20806 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20807 h2 = h1 << nelt2;
20808 h3 = h2 << nelt2;
20809 h4 = h3 << nelt2;
20810
20811 /* If all elements come from the low halves, use interleave low; similarly
20812 for interleave high. If the elements come from mismatched halves, we
20813 can use shufps for V4SF/V4SI or do a DImode shuffle. */
20814 if ((contents & (h1 | h3)) == contents)
20815 {
20816 /* punpckl* */
20817 for (i = 0; i < nelt2; ++i)
20818 {
20819 remap[i] = i * 2;
20820 remap[i + nelt] = i * 2 + 1;
20821 dremap.perm[i * 2] = i;
20822 dremap.perm[i * 2 + 1] = i + nelt;
20823 }
20824 if (!TARGET_SSE2 && d->vmode == V4SImode)
20825 dremap.vmode = V4SFmode;
20826 }
20827 else if ((contents & (h2 | h4)) == contents)
20828 {
20829 /* punpckh* */
20830 for (i = 0; i < nelt2; ++i)
20831 {
20832 remap[i + nelt2] = i * 2;
20833 remap[i + nelt + nelt2] = i * 2 + 1;
20834 dremap.perm[i * 2] = i + nelt2;
20835 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20836 }
20837 if (!TARGET_SSE2 && d->vmode == V4SImode)
20838 dremap.vmode = V4SFmode;
20839 }
20840 else if ((contents & (h1 | h4)) == contents)
20841 {
20842 /* shufps */
20843 for (i = 0; i < nelt2; ++i)
20844 {
20845 remap[i] = i;
20846 remap[i + nelt + nelt2] = i + nelt2;
20847 dremap.perm[i] = i;
20848 dremap.perm[i + nelt2] = i + nelt + nelt2;
20849 }
20850 if (nelt != 4)
20851 {
20852 /* shufpd */
20853 dremap.vmode = V2DImode;
20854 dremap.nelt = 2;
20855 dremap.perm[0] = 0;
20856 dremap.perm[1] = 3;
20857 }
20858 }
20859 else if ((contents & (h2 | h3)) == contents)
20860 {
20861 /* shufps */
20862 for (i = 0; i < nelt2; ++i)
20863 {
20864 remap[i + nelt2] = i;
20865 remap[i + nelt] = i + nelt2;
20866 dremap.perm[i] = i + nelt2;
20867 dremap.perm[i + nelt2] = i + nelt;
20868 }
20869 if (nelt != 4)
20870 {
20871 /* shufpd */
20872 dremap.vmode = V2DImode;
20873 dremap.nelt = 2;
20874 dremap.perm[0] = 1;
20875 dremap.perm[1] = 2;
20876 }
20877 }
20878 else
20879 return false;
20880 }
20881 else
20882 {
20883 unsigned int nelt4 = nelt / 4, nzcnt = 0;
20884 unsigned HOST_WIDE_INT q[8];
20885 unsigned int nonzero_halves[4];
20886
20887 /* Split the two input vectors into 8 quarters. */
20888 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
20889 for (i = 1; i < 8; ++i)
20890 q[i] = q[0] << (nelt4 * i);
20891 for (i = 0; i < 4; ++i)
20892 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
20893 {
20894 nonzero_halves[nzcnt] = i;
20895 ++nzcnt;
20896 }
20897
20898 if (nzcnt == 1)
20899 {
20900 gcc_assert (d->one_operand_p);
20901 nonzero_halves[1] = nonzero_halves[0];
20902 same_halves = true;
20903 }
20904 else if (d->one_operand_p)
20905 {
20906 gcc_assert (nonzero_halves[0] == 0);
20907 gcc_assert (nonzero_halves[1] == 1);
20908 }
20909
20910 if (nzcnt <= 2)
20911 {
20912 if (d->perm[0] / nelt2 == nonzero_halves[1])
20913 {
20914 /* Attempt to increase the likelihood that dfinal
20915 shuffle will be intra-lane. */
20916 std::swap (nonzero_halves[0], nonzero_halves[1]);
20917 }
20918
20919 /* vperm2f128 or vperm2i128. */
20920 for (i = 0; i < nelt2; ++i)
20921 {
20922 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
20923 remap[i + nonzero_halves[0] * nelt2] = i;
20924 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
20925 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
20926 }
20927
20928 if (d->vmode != V8SFmode
20929 && d->vmode != V4DFmode
20930 && d->vmode != V8SImode)
20931 {
20932 dremap.vmode = V8SImode;
20933 dremap.nelt = 8;
20934 for (i = 0; i < 4; ++i)
20935 {
20936 dremap.perm[i] = i + nonzero_halves[0] * 4;
20937 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
20938 }
20939 }
20940 }
20941 else if (d->one_operand_p)
20942 return false;
20943 else if (TARGET_AVX2
20944 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
20945 {
20946 /* vpunpckl* */
20947 for (i = 0; i < nelt4; ++i)
20948 {
20949 remap[i] = i * 2;
20950 remap[i + nelt] = i * 2 + 1;
20951 remap[i + nelt2] = i * 2 + nelt2;
20952 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
20953 dremap.perm[i * 2] = i;
20954 dremap.perm[i * 2 + 1] = i + nelt;
20955 dremap.perm[i * 2 + nelt2] = i + nelt2;
20956 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
20957 }
20958 }
20959 else if (TARGET_AVX2
20960 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
20961 {
20962 /* vpunpckh* */
20963 for (i = 0; i < nelt4; ++i)
20964 {
20965 remap[i + nelt4] = i * 2;
20966 remap[i + nelt + nelt4] = i * 2 + 1;
20967 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
20968 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
20969 dremap.perm[i * 2] = i + nelt4;
20970 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
20971 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
20972 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
20973 }
20974 }
20975 else
20976 return false;
20977 }
20978
20979 /* Use the remapping array set up above to move the elements from their
20980 swizzled locations into their final destinations. */
20981 dfinal = *d;
20982 for (i = 0; i < nelt; ++i)
20983 {
20984 unsigned e = remap[d->perm[i]];
20985 gcc_assert (e < nelt);
20986 /* If same_halves is true, both halves of the remapped vector are the
20987 same. Avoid cross-lane accesses if possible. */
20988 if (same_halves && i >= nelt2)
20989 {
20990 gcc_assert (e < nelt2);
20991 dfinal.perm[i] = e + nelt2;
20992 }
20993 else
20994 dfinal.perm[i] = e;
20995 }
20996 if (!d->testing_p)
20997 {
20998 dremap.target = gen_reg_rtx (dremap.vmode);
20999 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
21000 }
21001 dfinal.op1 = dfinal.op0;
21002 dfinal.one_operand_p = true;
21003
21004 /* Test if the final remap can be done with a single insn. For V4SFmode or
21005 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
21006 start_sequence ();
21007 ok = expand_vec_perm_1 (&dfinal);
21008 seq = get_insns ();
21009 end_sequence ();
21010
21011 if (!ok)
21012 return false;
21013
21014 if (d->testing_p)
21015 return true;
21016
21017 if (dremap.vmode != dfinal.vmode)
21018 {
21019 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
21020 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
21021 }
21022
21023 ok = expand_vec_perm_1 (&dremap);
21024 gcc_assert (ok);
21025
21026 emit_insn (seq);
21027 return true;
21028}
21029
21030/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21031 a single vector cross-lane permutation into vpermq followed
21032 by any of the single insn permutations. */
21033
21034static bool
21035expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
21036{
21037 struct expand_vec_perm_d dremap, dfinal;
21038 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
21039 unsigned contents[2];
21040 bool ok;
21041
21042 if (!(TARGET_AVX2
21043 && (d->vmode == V32QImode || d->vmode == V16HImode)
21044 && d->one_operand_p))
21045 return false;
21046
21047 contents[0] = 0;
21048 contents[1] = 0;
21049 for (i = 0; i < nelt2; ++i)
21050 {
21051 contents[0] |= 1u << (d->perm[i] / nelt4);
21052 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
21053 }
21054
21055 for (i = 0; i < 2; ++i)
21056 {
21057 unsigned int cnt = 0;
21058 for (j = 0; j < 4; ++j)
21059 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
21060 return false;
21061 }
21062
21063 if (d->testing_p)
21064 return true;
21065
21066 dremap = *d;
21067 dremap.vmode = V4DImode;
21068 dremap.nelt = 4;
21069 dremap.target = gen_reg_rtx (V4DImode);
21070 dremap.op0 = gen_lowpart (V4DImode, d->op0);
21071 dremap.op1 = dremap.op0;
21072 dremap.one_operand_p = true;
21073 for (i = 0; i < 2; ++i)
21074 {
21075 unsigned int cnt = 0;
21076 for (j = 0; j < 4; ++j)
21077 if ((contents[i] & (1u << j)) != 0)
21078 dremap.perm[2 * i + cnt++] = j;
21079 for (; cnt < 2; ++cnt)
21080 dremap.perm[2 * i + cnt] = 0;
21081 }
21082
21083 dfinal = *d;
21084 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
21085 dfinal.op1 = dfinal.op0;
21086 dfinal.one_operand_p = true;
21087 for (i = 0, j = 0; i < nelt; ++i)
21088 {
21089 if (i == nelt2)
21090 j = 2;
21091 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
21092 if ((d->perm[i] / nelt4) == dremap.perm[j])
21093 ;
21094 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
21095 dfinal.perm[i] |= nelt4;
21096 else
21097 gcc_unreachable ();
21098 }
21099
21100 ok = expand_vec_perm_1 (&dremap);
21101 gcc_assert (ok);
21102
21103 ok = expand_vec_perm_1 (&dfinal);
21104 gcc_assert (ok);
21105
21106 return true;
21107}
21108
21109static bool canonicalize_perm (struct expand_vec_perm_d *d);
21110
21111/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
21112 a vector permutation using two instructions, vperm2f128 resp.
21113 vperm2i128 followed by any single in-lane permutation. */
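/* For instance, V4DFmode with d->perm = { 1, 3, 2, 4 } can be handled by a
   vperm2f128 that builds { op0[2], op0[3], op1[0], op1[1] } (immediate 0x21)
   followed by an in-lane vshufpd of d->op0 with that result selecting
   { op0[1], tmp[1], op0[2], tmp[2] }.  */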
21114
21115static bool
21116expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
21117{
21118 struct expand_vec_perm_d dfirst, dsecond;
21119 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
21120 bool ok;
21121
21122 if (!TARGET_AVX
21123 || GET_MODE_SIZE (d->vmode) != 32
21124 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
21125 return false;
21126
21127 dsecond = *d;
21128 dsecond.one_operand_p = false;
21129 dsecond.testing_p = true;
21130
21131 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
21132 immediate. For perm < 16 the second permutation uses
21133 d->op0 as first operand, for perm >= 16 it uses d->op1
21134 as first operand. The second operand is the result of
21135 vperm2[fi]128. */
21136 for (perm = 0; perm < 32; perm++)
21137 {
21138 /* Ignore permutations which do not move anything cross-lane. */
21139 if (perm < 16)
21140 {
21141 /* The second shuffle for e.g. V4DFmode has
21142 0123 and ABCD operands.
21143 Ignore AB23, as 23 is already in the second lane
21144 of the first operand. */
21145 if ((perm & 0xc) == (1 << 2)) continue;
21146 /* And 01CD, as 01 is in the first lane of the first
21147 operand. */
21148 if ((perm & 3) == 0) continue;
21149 /* And 4567, as then the vperm2[fi]128 doesn't change
21150 anything on the original 4567 second operand. */
21151 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
21152 }
21153 else
21154 {
21155 /* The second shuffle for e.g. V4DFmode has
21156 4567 and ABCD operands.
21157 Ignore AB67, as 67 is already in the second lane
21158 of the first operand. */
21159 if ((perm & 0xc) == (3 << 2)) continue;
21160 /* And 45CD, as 45 is in the first lane of the first
21161 operand. */
21162 if ((perm & 3) == 2) continue;
21163 /* And 0123, as then the vperm2[fi]128 doesn't change
21164 anything on the original 0123 first operand. */
21165 if ((perm & 0xf) == (1 << 2)) continue;
21166 }
21167
21168 for (i = 0; i < nelt; i++)
21169 {
21170 j = d->perm[i] / nelt2;
21171 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
21172 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
21173 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
21174 dsecond.perm[i] = d->perm[i] & (nelt - 1);
21175 else
21176 break;
21177 }
21178
21179 if (i == nelt)
21180 {
21181 start_sequence ();
21182 ok = expand_vec_perm_1 (&dsecond);
21183 end_sequence ();
21184 }
21185 else
21186 ok = false;
21187
21188 if (ok)
21189 {
21190 if (d->testing_p)
21191 return true;
21192
21193 /* Found a usable second shuffle. dfirst will be
21194 vperm2f128 on d->op0 and d->op1. */
21195 dsecond.testing_p = false;
21196 dfirst = *d;
21197 dfirst.target = gen_reg_rtx (d->vmode);
21198 for (i = 0; i < nelt; i++)
21199 dfirst.perm[i] = (i & (nelt2 - 1))
21200 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
21201
21202 canonicalize_perm (&dfirst);
21203 ok = expand_vec_perm_1 (&dfirst);
21204 gcc_assert (ok);
21205
21206 /* And dsecond is some single insn shuffle, taking
21207 d->op0 and result of vperm2f128 (if perm < 16) or
21208 d->op1 and result of vperm2f128 (otherwise). */
21209 if (perm >= 16)
21210 dsecond.op0 = dsecond.op1;
21211 dsecond.op1 = dfirst.target;
21212
21213 ok = expand_vec_perm_1 (&dsecond);
21214 gcc_assert (ok);
21215
21216 return true;
21217 }
21218
21219 /* For one operand, the only useful vperm2f128 permutation is 0x01
21220 aka lanes swap. */
21221 if (d->one_operand_p)
21222 return false;
21223 }
21224
21225 return false;
21226}
21227
21228/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21229 a two vector permutation using 2 intra-lane interleave insns
21230 and cross-lane shuffle for 32-byte vectors. */
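/* For instance, V8SFmode with d->perm = { 0, 8, 1, 9, 2, 10, 3, 11 } is the
   full low interleave of the two operands and is handled by the
   vec_interleave_lowv8sf expander (in-lane unpacks plus a cross-lane
   shuffle); the matching high interleave has d->perm[0] == 4.  */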
21231
21232static bool
21233expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
21234{
21235 unsigned i, nelt;
21236 rtx (*gen) (rtx, rtx, rtx);
21237
21238 if (d->one_operand_p)
21239 return false;
21240 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
21241 ;
21242 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
21243 ;
21244 else
21245 return false;
21246
21247 nelt = d->nelt;
21248 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
21249 return false;
21250 for (i = 0; i < nelt; i += 2)
21251 if (d->perm[i] != d->perm[0] + i / 2
21252 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
21253 return false;
21254
21255 if (d->testing_p)
21256 return true;
21257
21258 switch (d->vmode)
21259 {
21260 case E_V32QImode:
21261 if (d->perm[0])
21262 gen = gen_vec_interleave_highv32qi;
21263 else
21264 gen = gen_vec_interleave_lowv32qi;
21265 break;
21266 case E_V16HImode:
21267 if (d->perm[0])
21268 gen = gen_vec_interleave_highv16hi;
21269 else
21270 gen = gen_vec_interleave_lowv16hi;
21271 break;
21272 case E_V8SImode:
21273 if (d->perm[0])
21274 gen = gen_vec_interleave_highv8si;
21275 else
21276 gen = gen_vec_interleave_lowv8si;
21277 break;
21278 case E_V4DImode:
21279 if (d->perm[0])
21280 gen = gen_vec_interleave_highv4di;
21281 else
21282 gen = gen_vec_interleave_lowv4di;
21283 break;
21284 case E_V8SFmode:
21285 if (d->perm[0])
21286 gen = gen_vec_interleave_highv8sf;
21287 else
21288 gen = gen_vec_interleave_lowv8sf;
21289 break;
21290 case E_V4DFmode:
21291 if (d->perm[0])
21292 gen = gen_vec_interleave_highv4df;
21293 else
21294 gen = gen_vec_interleave_lowv4df;
21295 break;
21296 default:
21297 gcc_unreachable ();
21298 }
21299
21300 emit_insn (gen (d->target, d->op0, d->op1));
21301 return true;
21302}
21303
21304/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21305 a single vector permutation using a single intra-lane vector
21306 permutation, vperm2f128 swapping the lanes and vblend* insn blending
21307 the non-swapped and swapped vectors together. */
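/* For instance, the one-operand V4DFmode rotation { 1, 2, 3, 0 } can be
   split into an in-lane swap giving { 1, 0, 3, 2 }, a vperm2f128 swapping
   the two lanes of that result, and a vblendpd with mask 0b1010 that takes
   the lane-swapped vector for positions 1 and 3.  */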
21308
21309static bool
21310expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
21311{
21312 struct expand_vec_perm_d dfirst, dsecond;
21313 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
21314 rtx_insn *seq;
21315 bool ok;
21316 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21317
21318 if (!TARGET_AVX
21319 || TARGET_AVX2
21320 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21321 || !d->one_operand_p)
21322 return false;
21323
21324 dfirst = *d;
21325 for (i = 0; i < nelt; i++)
21326 dfirst.perm[i] = 0xff;
21327 for (i = 0, msk = 0; i < nelt; i++)
21328 {
21329 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21330 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
21331 return false;
21332 dfirst.perm[j] = d->perm[i];
21333 if (j != i)
21334 msk |= (1 << i);
21335 }
21336 for (i = 0; i < nelt; i++)
21337 if (dfirst.perm[i] == 0xff)
21338 dfirst.perm[i] = i;
21339
21340 if (!d->testing_p)
21341 dfirst.target = gen_reg_rtx (dfirst.vmode);
21342
21343 start_sequence ();
21344 ok = expand_vec_perm_1 (&dfirst);
21345 seq = get_insns ();
21346 end_sequence ();
21347
21348 if (!ok)
21349 return false;
21350
21351 if (d->testing_p)
21352 return true;
21353
21354 emit_insn (seq);
21355
21356 dsecond = *d;
21357 dsecond.op0 = dfirst.target;
21358 dsecond.op1 = dfirst.target;
21359 dsecond.one_operand_p = true;
21360 dsecond.target = gen_reg_rtx (dsecond.vmode);
21361 for (i = 0; i < nelt; i++)
21362 dsecond.perm[i] = i ^ nelt2;
21363
21364 ok = expand_vec_perm_1 (&dsecond);
21365 gcc_assert (ok);
21366
21367 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21368 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
21369 return true;
21370}
21371
21372/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21373 a two vector permutation using two single vector permutations and
21374 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
21375 of dfirst or dsecond is identity permutation. */
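/* For instance (with TWO_INSN false), V4SImode d->perm = { 1, 5, 0, 6 }
   alternates between the operands, so op0 can first be shuffled as
   { 1, 0, x, x } and op1 as { 1, 2, x, x }, after which a punpckldq of the
   two shuffled registers produces { op0[1], op1[1], op0[0], op1[2] }.  */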
21376
21377static bool
21378expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
21379{
21380 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
21381 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21382 bool ident1 = true, ident2 = true;
21383
21384 if (d->one_operand_p)
21385 return false;
21386
21387 if (GET_MODE_SIZE (d->vmode) == 16)
21388 {
21389 if (!TARGET_SSE)
21390 return false;
21391 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
21392 return false;
21393 }
21394 else if (GET_MODE_SIZE (d->vmode) == 32)
21395 {
21396 if (!TARGET_AVX)
21397 return false;
21398 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
21399 return false;
21400 lane = nelt2;
21401 }
21402 else
21403 return false;
21404
21405 for (i = 1; i < nelt; i++)
21406 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
21407 return false;
21408
21409 dfirst = *d;
21410 dsecond = *d;
21411 dfinal = *d;
21412 dfirst.op1 = dfirst.op0;
21413 dfirst.one_operand_p = true;
21414 dsecond.op0 = dsecond.op1;
21415 dsecond.one_operand_p = true;
21416
21417 for (i = 0; i < nelt; i++)
21418 if (d->perm[i] >= nelt)
21419 {
21420 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
21421 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
21422 ident2 = false;
21423 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
21424 = d->perm[i] - nelt;
21425 }
21426 else
21427 {
21428 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
21429 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
21430 ident1 = false;
21431 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
21432 }
21433
21434 if (two_insn && !ident1 && !ident2)
21435 return false;
21436
21437 if (!d->testing_p)
21438 {
21439 if (!ident1)
21440 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21441 if (!ident2)
21442 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21443 if (d->perm[0] >= nelt)
21444 std::swap (dfinal.op0, dfinal.op1);
21445 }
21446
21447 bool ok;
21448 rtx_insn *seq1 = NULL, *seq2 = NULL;
21449
21450 if (!ident1)
21451 {
21452 start_sequence ();
21453 ok = expand_vec_perm_1 (&dfirst);
21454 seq1 = get_insns ();
21455 end_sequence ();
21456
21457 if (!ok)
21458 return false;
21459 }
21460
21461 if (!ident2)
21462 {
21463 start_sequence ();
21464 ok = expand_vec_perm_1 (&dsecond);
21465 seq2 = get_insns ();
21466 end_sequence ();
21467
21468 if (!ok)
21469 return false;
21470 }
21471
21472 if (d->testing_p)
21473 return true;
21474
21475 for (i = 0; i < nelt; i++)
21476 {
21477 dfinal.perm[i] = i / 2;
21478 if (i >= lane)
21479 dfinal.perm[i] += lane / 2;
21480 if ((i & 1) != 0)
21481 dfinal.perm[i] += nelt;
21482 }
21483 emit_insn (seq1);
21484 emit_insn (seq2);
21485 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
21486 dfinal.perm, dfinal.nelt, false);
21487 gcc_assert (ok);
21488 return true;
21489}
21490
21491/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
21492 the permutation using two single vector permutations and the SSE4_1 pblendv
21493 instruction. If two_insn, succeed only if one of dfirst or dsecond is
21494 identity permutation. */
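/* For instance, V4SImode d->perm = { 5, 1, 6, 3 } takes positions 1 and 3
   unchanged from op0, so only op1 needs a one-operand shuffle placing
   op1[1] and op1[2] at positions 0 and 2; a blend selecting positions 0
   and 2 from that shuffle then finishes the permutation.  */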
21495
21496static bool
21497expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
21498{
21499 unsigned i, nelt = d->nelt;
21500 struct expand_vec_perm_d dfirst, dsecond, dfinal;
21501 machine_mode vmode = d->vmode;
21502 bool ident1 = true, ident2 = true;
21503
21504 /* Use the same checks as in expand_vec_perm_blend. */
21505 if (d->one_operand_p)
21506 return false;
21507 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
21508 ;
21509 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
21510 ;
21511 else if (TARGET_SSE4_1
21512 && (GET_MODE_SIZE (vmode) == 16
21513 || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
21514 || GET_MODE_SIZE (vmode) == 4))
21515 ;
21516 else
21517 return false;
21518
21519 dfirst = *d;
21520 dsecond = *d;
21521 dfinal = *d;
21522 dfirst.op1 = dfirst.op0;
21523 dfirst.one_operand_p = true;
21524 dsecond.op0 = dsecond.op1;
21525 dsecond.one_operand_p = true;
21526
21527 for (i = 0; i < nelt; ++i)
21528 if (d->perm[i] >= nelt)
21529 {
21530 dfirst.perm[i] = 0xff;
21531 dsecond.perm[i] = d->perm[i] - nelt;
21532 if (d->perm[i] != i + nelt)
21533 ident2 = false;
21534 }
21535 else
21536 {
21537 dsecond.perm[i] = 0xff;
21538 dfirst.perm[i] = d->perm[i];
21539 if (d->perm[i] != i)
21540 ident1 = false;
21541 }
21542
21543 if (two_insn && !ident1 && !ident2)
21544 return false;
21545
21546 /* For now. Ideally treat 0xff as a wildcard. */
21547 for (i = 0; i < nelt; ++i)
21548 if (dfirst.perm[i] == 0xff)
21549 {
21550 if (GET_MODE_SIZE (vmode) == 32
21551 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
21552 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21553 else
21554 dfirst.perm[i] = i;
21555 }
21556 else
21557 {
21558 if (GET_MODE_SIZE (vmode) == 32
21559 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
21560 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
21561 else
21562 dsecond.perm[i] = i;
21563 }
21564
21565 if (!d->testing_p)
21566 {
21567 if (!ident1)
21568 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
21569 if (!ident2)
21570 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
21571 }
21572
21573 bool ok;
21574 rtx_insn *seq1 = NULL, *seq2 = NULL;
21575
21576 if (!ident1)
21577 {
21578 start_sequence ();
21579 ok = expand_vec_perm_1 (&dfirst);
21580 seq1 = get_insns ();
21581 end_sequence ();
21582
21583 if (!ok)
21584 return false;
21585 }
21586
21587 if (!ident2)
21588 {
21589 start_sequence ();
21590 ok = expand_vec_perm_1 (&dsecond);
21591 seq2 = get_insns ();
21592 end_sequence ();
21593
21594 if (!ok)
21595 return false;
21596 }
21597
21598 if (d->testing_p)
21599 return true;
21600
21601 for (i = 0; i < nelt; ++i)
21602 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
21603
21604 emit_insn (seq1);
21605 emit_insn (seq2);
21606 ok = expand_vec_perm_blend (&dfinal);
21607 gcc_assert (ok);
21608 return true;
21609}
21610
21611/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
21612 permutation using two vperm2f128, followed by a vshufpd insn blending
21613 the two vectors together. */
21614
21615static bool
21616expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
21617{
21618 struct expand_vec_perm_d dfirst, dsecond, dthird;
21619 bool ok;
21620
21621 if (!TARGET_AVX || (d->vmode != V4DFmode))
21622 return false;
21623
21624 if (d->testing_p)
21625 return true;
21626
21627 dfirst = *d;
21628 dsecond = *d;
21629 dthird = *d;
21630
21631 dfirst.perm[0] = (d->perm[0] & ~1);
21632 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
21633 dfirst.perm[2] = (d->perm[2] & ~1);
21634 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
21635 dsecond.perm[0] = (d->perm[1] & ~1);
21636 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
21637 dsecond.perm[2] = (d->perm[3] & ~1);
21638 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
21639 dthird.perm[0] = (d->perm[0] % 2);
21640 dthird.perm[1] = (d->perm[1] % 2) + 4;
21641 dthird.perm[2] = (d->perm[2] % 2) + 2;
21642 dthird.perm[3] = (d->perm[3] % 2) + 6;
21643
21644 dfirst.target = gen_reg_rtx (dfirst.vmode);
21645 dsecond.target = gen_reg_rtx (dsecond.vmode);
21646 dthird.op0 = dfirst.target;
21647 dthird.op1 = dsecond.target;
21648 dthird.one_operand_p = false;
21649
21650 canonicalize_perm (&dfirst);
21651 canonicalize_perm (&dsecond);
21652
21653 ok = expand_vec_perm_1 (&dfirst)
21654 && expand_vec_perm_1 (&dsecond)
21655 && expand_vec_perm_1 (&dthird);
21656
21657 gcc_assert (ok);
21658
21659 return true;
21660}
21661
21662static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
21663
21664/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21665 a two vector permutation using two intra-lane vector
21666 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21667 the non-swapped and swapped vectors together. */
21668
21669static bool
21670expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
21671{
21672 struct expand_vec_perm_d dfirst, dsecond, dthird;
21673 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
21674 rtx_insn *seq1, *seq2;
21675 bool ok;
21676 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21677
21678 if (!TARGET_AVX
21679 || TARGET_AVX2
21680 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21681 || d->one_operand_p)
21682 return false;
21683
21684 dfirst = *d;
21685 dsecond = *d;
21686 for (i = 0; i < nelt; i++)
21687 {
21688 dfirst.perm[i] = 0xff;
21689 dsecond.perm[i] = 0xff;
21690 }
21691 for (i = 0, msk = 0; i < nelt; i++)
21692 {
21693 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21694 if (j == i)
21695 {
21696 dfirst.perm[j] = d->perm[i];
21697 which1 |= (d->perm[i] < nelt ? 1 : 2);
21698 }
21699 else
21700 {
21701 dsecond.perm[j] = d->perm[i];
21702 which2 |= (d->perm[i] < nelt ? 1 : 2);
21703 msk |= (1U << i);
21704 }
21705 }
21706 if (msk == 0 || msk == (1U << nelt) - 1)
21707 return false;
21708
21709 if (!d->testing_p)
21710 {
21711 dfirst.target = gen_reg_rtx (dfirst.vmode);
21712 dsecond.target = gen_reg_rtx (dsecond.vmode);
21713 }
21714
21715 for (i = 0; i < nelt; i++)
21716 {
21717 if (dfirst.perm[i] == 0xff)
21718 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
21719 if (dsecond.perm[i] == 0xff)
21720 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
21721 }
21722 canonicalize_perm (&dfirst);
21723 start_sequence ();
21724 ok = ix86_expand_vec_perm_const_1 (&dfirst);
21725 seq1 = get_insns ();
21726 end_sequence ();
21727
21728 if (!ok)
21729 return false;
21730
21731 canonicalize_perm (&dsecond);
21732 start_sequence ();
21733 ok = ix86_expand_vec_perm_const_1 (&dsecond);
21734 seq2 = get_insns ();
21735 end_sequence ();
21736
21737 if (!ok)
21738 return false;
21739
21740 if (d->testing_p)
21741 return true;
21742
21743 emit_insn (seq1);
21744 emit_insn (seq2);
21745
21746 dthird = *d;
21747 dthird.op0 = dsecond.target;
21748 dthird.op1 = dsecond.target;
21749 dthird.one_operand_p = true;
21750 dthird.target = gen_reg_rtx (dthird.vmode);
21751 for (i = 0; i < nelt; i++)
21752 dthird.perm[i] = i ^ nelt2;
21753
21754 ok = expand_vec_perm_1 (&dthird);
21755 gcc_assert (ok);
21756
21757 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21758 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
21759 return true;
21760}
21761
21762/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
21763 permutation with two pshufb insns and an ior. We should have already
21764 failed all two instruction sequences. */
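/* For instance, V4QImode d->perm = { 3, 6, 1, 4 } uses the masks
   { 3, -128, 1, -128 } for op0 and { -128, 2, -128, 0 } for op1; the two
   pshufb results { op0[3], 0, op0[1], 0 } and { 0, op1[2], 0, op1[0] } are
   then combined with por.  */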
21765
21766static bool
21767expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
21768{
21769 rtx rperm[2][16], vperm, l, h, op, m128;
21770 unsigned int i, nelt, eltsz;
21771 machine_mode mode;
21772 rtx (*gen) (rtx, rtx, rtx);
21773
21774 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
21775 && GET_MODE_SIZE (d->vmode) != 8
21776 && GET_MODE_SIZE (d->vmode) != 4))
21777 return false;
21778 gcc_assert (!d->one_operand_p);
21779
21780 if (d->testing_p)
21781 return true;
21782
21783 switch (GET_MODE_SIZE (d->vmode))
21784 {
21785 case 4:
21786 mode = V4QImode;
21787 gen = gen_mmx_pshufbv4qi3;
21788 break;
21789 case 8:
21790 mode = V8QImode;
21791 gen = gen_mmx_pshufbv8qi3;
21792 break;
21793 case 16:
21794 mode = V16QImode;
21795 gen = gen_ssse3_pshufbv16qi3;
21796 break;
21797 default:
21798 gcc_unreachable ();
21799 }
21800
21801 nelt = d->nelt;
21802 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21803
21804 /* Generate two permutation masks. If the required element is within
21805 the given vector it is shuffled into the proper lane. If the required
21806 element is in the other vector, force a zero into the lane by setting
21807 bit 7 in the permutation mask. */
21808 m128 = GEN_INT (-128);
21809 for (i = 0; i < nelt; ++i)
21810 {
21811 unsigned j, k, e = d->perm[i];
21812 unsigned which = (e >= nelt);
21813 if (e >= nelt)
21814 e -= nelt;
21815
21816 for (j = 0; j < eltsz; ++j)
21817 {
21818 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
21819 rperm[1-which][i*eltsz + j] = m128;
21820 }
21821
21822 for (k = i*eltsz + j; k < 16; ++k)
21823 rperm[0][k] = rperm[1][k] = m128;
21824 }
21825
21826 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
21827 vperm = force_reg (V16QImode, vperm);
21828
21829 l = gen_reg_rtx (mode);
21830 op = gen_lowpart (mode, d->op0);
21831 emit_insn (gen (l, op, vperm));
21832
21833 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
21834 vperm = force_reg (V16QImode, vperm);
21835
21836 h = gen_reg_rtx (mode);
21837 op = gen_lowpart (mode, d->op1);
21838 emit_insn (gen (h, op, vperm));
21839
21840 op = d->target;
21841 if (d->vmode != mode)
21842 op = gen_reg_rtx (mode);
21843 ix86_emit_vec_binop (IOR, mode, op, l, h);
21844 if (op != d->target)
21845 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21846
21847 return true;
21848}
21849
21850 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
21851 with two vpshufb insns, vpermq and vpor. We should have already failed
21852 all two or three instruction sequences. */
21853
21854static bool
21855expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
21856{
21857 rtx rperm[2][32], vperm, l, h, hp, op, m128;
21858 unsigned int i, nelt, eltsz;
21859
21860 if (!TARGET_AVX2
21861 || !d->one_operand_p
21862 || (d->vmode != V32QImode && d->vmode != V16HImode))
21863 return false;
21864
21865 if (d->testing_p)
21866 return true;
21867
21868 nelt = d->nelt;
21869 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21870
21871 /* Generate two permutation masks. If the required element is within
21872 the same lane, it is shuffled in. If the required element is from
21873 the other lane, force a zero by setting bit 7 in the permutation mask.
21874 The other mask has non-negative entries for elements that are requested
21875 from the other lane, but it also moves them to the other lane, so that
21876 the result of vpshufb can have the two V2TImode halves
21877 swapped. */
21878 m128 = GEN_INT (-128);
21879 for (i = 0; i < nelt; ++i)
21880 {
21881 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21882 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21883
21884 for (j = 0; j < eltsz; ++j)
21885 {
21886 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
21887 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
21888 }
21889 }
21890
21891 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21892 vperm = force_reg (V32QImode, vperm);
21893
21894 h = gen_reg_rtx (V32QImode);
21895 op = gen_lowpart (V32QImode, d->op0);
21896 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21897
21898 /* Swap the 128-bit lanes of h into hp. */
21899 hp = gen_reg_rtx (V4DImode);
21900 op = gen_lowpart (V4DImode, h);
21901 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
21902 const1_rtx));
21903
21904 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21905 vperm = force_reg (V32QImode, vperm);
21906
21907 l = gen_reg_rtx (V32QImode);
21908 op = gen_lowpart (V32QImode, d->op0);
21909 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21910
21911 op = d->target;
21912 if (d->vmode != V32QImode)
21913 op = gen_reg_rtx (V32QImode);
21914 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
21915 if (op != d->target)
21916 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21917
21918 return true;
21919}
21920
21921/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21922 and extract-odd permutations of two V32QImode or V16HImode operands
21923 with two vpshufb insns, vpor and vpermq. We should have already
21924 failed all two or three instruction sequences. */
21925
21926static bool
21927expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
21928{
21929 rtx rperm[2][32], vperm, l, h, ior, op, m128;
21930 unsigned int i, nelt, eltsz;
21931
21932 if (!TARGET_AVX2
21933 || d->one_operand_p
21934 || (d->vmode != V32QImode && d->vmode != V16HImode))
21935 return false;
21936
21937 for (i = 0; i < d->nelt; ++i)
21938 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
21939 return false;
21940
21941 if (d->testing_p)
21942 return true;
21943
21944 nelt = d->nelt;
21945 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21946
21947 /* Generate two permutation masks. In the first permutation mask
21948 the first quarter will contain indexes for the first half
21949 of the op0, the second quarter will contain bit 7 set, third quarter
21950 will contain indexes for the second half of the op0 and the
21951 last quarter bit 7 set. In the second permutation mask
21952 the first quarter will contain bit 7 set, the second quarter
21953 indexes for the first half of the op1, the third quarter bit 7 set
21954 and last quarter indexes for the second half of the op1.
21955 I.e. the first mask e.g. for V32QImode extract even will be:
21956 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21957 (all values masked with 0xf except for -128) and second mask
21958 for extract even will be
21959 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21960 m128 = GEN_INT (-128);
21961 for (i = 0; i < nelt; ++i)
21962 {
21963 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21964 unsigned which = d->perm[i] >= nelt;
21965 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
21966
21967 for (j = 0; j < eltsz; ++j)
21968 {
21969 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
21970 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
21971 }
21972 }
21973
21974 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21975 vperm = force_reg (V32QImode, vperm);
21976
21977 l = gen_reg_rtx (V32QImode);
21978 op = gen_lowpart (V32QImode, d->op0);
21979 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21980
21981 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21982 vperm = force_reg (V32QImode, vperm);
21983
21984 h = gen_reg_rtx (V32QImode);
21985 op = gen_lowpart (V32QImode, d->op1);
21986 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21987
21988 ior = gen_reg_rtx (V32QImode);
21989 emit_insn (gen_iorv32qi3 (ior, l, h));
21990
21991 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
21992 op = gen_reg_rtx (V4DImode);
21993 ior = gen_lowpart (V4DImode, ior);
21994 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
21995 const1_rtx, GEN_INT (3)));
21996 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21997
21998 return true;
21999}
22000
22001/* Implement permutation with pslldq + psrldq + por when pshufb is not
22002 available. */
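/* For instance, V16QImode d->perm = { 4, 5, ..., 15, 16, 17, 18, 19 } is a
   psrldq of op0 by 4 bytes combined via por with a pslldq of op1 by 12
   bytes; the pand/pandn masking is only needed when one shifted operand
   would leave stray elements in the positions the other one supplies.  */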
22003static bool
22004expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
22005{
22006 unsigned i, nelt = d->nelt;
22007 unsigned start1, end1 = -1;
22008 machine_mode vmode = d->vmode, imode;
22009 int start2 = -1;
22010 bool clear_op0, clear_op1;
22011 unsigned inner_size;
22012 rtx op0, op1, dop1;
22013 rtx (*gen_vec_shr) (rtx, rtx, rtx);
22014 rtx (*gen_vec_shl) (rtx, rtx, rtx);
22015
22016 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
22017 if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
22018 return false;
22019
22020 start1 = d->perm[0];
22021 for (i = 1; i < nelt; i++)
22022 {
22023 if (d->perm[i] != d->perm[i-1] + 1
22024 || d->perm[i] == nelt)
22025 {
22026 if (start2 == -1)
22027 {
22028 start2 = d->perm[i];
22029 end1 = d->perm[i-1];
22030 }
22031 else
22032 return false;
22033 }
22034 }
22035
22036 clear_op0 = end1 != nelt - 1;
22037 clear_op1 = start2 % nelt != 0;
22038 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
22039 if (!pandn && (clear_op0 || clear_op1))
22040 return false;
22041
22042 if (d->testing_p)
22043 return true;
22044
22045 gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
22046 gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
22047 imode = GET_MODE_INNER (vmode);
22048 inner_size = GET_MODE_BITSIZE (imode);
22049 op0 = gen_reg_rtx (vmode);
22050 op1 = gen_reg_rtx (vmode);
22051
22052 if (start1)
22053 emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
22054 else
22055 emit_move_insn (op0, d->op0);
22056
22057 dop1 = d->op1;
22058 if (d->one_operand_p)
22059 dop1 = d->op0;
22060
22061 int shl_offset = end1 - start1 + 1 - start2 % nelt;
22062 if (shl_offset)
22063 emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
22064 else
22065 emit_move_insn (op1, dop1);
22066
22067 /* Clear lower/upper bits for op0/op1. */
22068 if (clear_op0 || clear_op1)
22069 {
22070 rtx vec[16];
22071 rtx const_vec;
22072 rtx clear;
22073 for (i = 0; i != nelt; i++)
22074 {
22075 if (i < (end1 - start1 + 1))
22076 vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
22077 else
22078 vec[i] = CONST0_RTX (imode);
22079 }
22080 const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
22081 const_vec = validize_mem (force_const_mem (vmode, const_vec));
22082 clear = force_reg (vmode, const_vec);
22083
22084 if (clear_op0)
22085 emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
22086 if (clear_op1)
22087 emit_move_insn (op1, gen_rtx_AND (vmode,
22088 gen_rtx_NOT (vmode, clear),
22089 op1));
22090 }
22091
22092 emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
22093 return true;
22094}
22095
22096/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
22097 and extract-odd permutations of two V4HI, V8QI, V8HI, V16QI, V16HI or V32QI
22098 operands with two "and" and "pack" or two "shift" and "pack" insns.
22099 We should have already failed all two instruction sequences. */
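/* For instance, the V16QImode extract-even permutation { 0, 2, ..., 30 }
   masks each 16-bit word of both operands with 0x00ff and then packs the
   two intermediates with packuswb, while extract-odd shifts each word
   right by 8 before the pack.  */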
22100
22101static bool
22102expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
22103{
22104 rtx op, dop0, dop1, t;
22105 unsigned i, odd, c, s, nelt = d->nelt;
22106 bool end_perm = false;
22107 machine_mode half_mode;
22108 rtx (*gen_and) (rtx, rtx, rtx);
22109 rtx (*gen_pack) (rtx, rtx, rtx);
22110 rtx (*gen_shift) (rtx, rtx, rtx);
22111
22112 if (d->one_operand_p)
22113 return false;
22114
22115 switch (d->vmode)
22116 {
22117 case E_V4HImode:
22118 /* Required for "pack". */
22119 if (!TARGET_SSE4_1)
22120 return false;
22121 c = 0xffff;
22122 s = 16;
22123 half_mode = V2SImode;
22124 gen_and = gen_andv2si3;
22125 gen_pack = gen_mmx_packusdw;
22126 gen_shift = gen_lshrv2si3;
22127 break;
22128 case E_V8HImode:
22129 /* Required for "pack". */
22130 if (!TARGET_SSE4_1)
22131 return false;
22132 c = 0xffff;
22133 s = 16;
22134 half_mode = V4SImode;
22135 gen_and = gen_andv4si3;
22136 gen_pack = gen_sse4_1_packusdw;
22137 gen_shift = gen_lshrv4si3;
22138 break;
22139 case E_V8QImode:
22140 /* No check as all instructions are SSE2. */
22141 c = 0xff;
22142 s = 8;
22143 half_mode = V4HImode;
22144 gen_and = gen_andv4hi3;
22145 gen_pack = gen_mmx_packuswb;
22146 gen_shift = gen_lshrv4hi3;
22147 break;
22148 case E_V16QImode:
22149 /* No check as all instructions are SSE2. */
22150 c = 0xff;
22151 s = 8;
22152 half_mode = V8HImode;
22153 gen_and = gen_andv8hi3;
22154 gen_pack = gen_sse2_packuswb;
22155 gen_shift = gen_lshrv8hi3;
22156 break;
22157 case E_V16HImode:
22158 if (!TARGET_AVX2)
22159 return false;
22160 c = 0xffff;
22161 s = 16;
22162 half_mode = V8SImode;
22163 gen_and = gen_andv8si3;
22164 gen_pack = gen_avx2_packusdw;
22165 gen_shift = gen_lshrv8si3;
22166 end_perm = true;
22167 break;
22168 case E_V32QImode:
22169 if (!TARGET_AVX2)
22170 return false;
22171 c = 0xff;
22172 s = 8;
22173 half_mode = V16HImode;
22174 gen_and = gen_andv16hi3;
22175 gen_pack = gen_avx2_packuswb;
22176 gen_shift = gen_lshrv16hi3;
22177 end_perm = true;
22178 break;
22179 default:
22180 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
22181 are more profitable than general shuffles. */
22182 return false;
22183 }
22184
22185 /* Check that permutation is even or odd. */
22186 odd = d->perm[0];
22187 if (odd > 1)
22188 return false;
22189
22190 for (i = 1; i < nelt; ++i)
22191 if (d->perm[i] != 2 * i + odd)
22192 return false;
22193
22194 if (d->testing_p)
22195 return true;
22196
22197 dop0 = gen_reg_rtx (half_mode);
22198 dop1 = gen_reg_rtx (half_mode);
22199 if (odd == 0)
22200 {
22201 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
22202 t = force_reg (half_mode, t);
22203 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
22204 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
22205 }
22206 else
22207 {
22208 emit_insn (gen_shift (dop0,
22209 gen_lowpart (half_mode, d->op0),
22210 GEN_INT (s)));
22211 emit_insn (gen_shift (dop1,
22212 gen_lowpart (half_mode, d->op1),
22213 GEN_INT (s)));
22214 }
22215 /* In the AVX2 256-bit case we need to permute the pack result. */
22216 if (TARGET_AVX2 && end_perm)
22217 {
22218 op = gen_reg_rtx (d->vmode);
22219 t = gen_reg_rtx (V4DImode);
22220 emit_insn (gen_pack (op, dop0, dop1));
22221 emit_insn (gen_avx2_permv4di_1 (t,
22222 gen_lowpart (V4DImode, op),
22223 const0_rtx,
22224 const2_rtx,
22225 const1_rtx,
22226 GEN_INT (3)));
22227 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
22228 }
22229 else
22230 emit_insn (gen_pack (d->target, dop0, dop1));
22231
22232 return true;
22233}
22234
22235/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
22236 and extract-odd permutations of two V64QI operands
22237 with two "shifts", two "truncs" and one "concat" insns for "odd"
22238 and two "truncs" and one "concat" insn for "even".
22239 We should have already failed all two instruction sequences. */
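/* For instance, the V64QImode extract-even permutation truncates each
   operand, viewed as V32HImode, to V32QImode (keeping the low byte of every
   word) and concatenates the two halves; extract-odd first shifts every
   word right by 8 so the truncation keeps the odd bytes instead.  */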
22240
22241static bool
22242expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
22243{
22244 rtx t1, t2, t3, t4;
22245 unsigned i, odd, nelt = d->nelt;
22246
22247 if (!TARGET_AVX512BW
22248 || d->one_operand_p
22249 || d->vmode != V64QImode)
22250 return false;
22251
22252 /* Check that permutation is even or odd. */
22253 odd = d->perm[0];
22254 if (odd > 1)
22255 return false;
22256
22257 for (i = 1; i < nelt; ++i)
22258 if (d->perm[i] != 2 * i + odd)
22259 return false;
22260
22261 if (d->testing_p)
22262 return true;
22263
22264
22265 if (odd)
22266 {
22267 t1 = gen_reg_rtx (V32HImode);
22268 t2 = gen_reg_rtx (V32HImode);
22269 emit_insn (gen_lshrv32hi3 (t1,
22270 gen_lowpart (V32HImode, d->op0),
22271 GEN_INT (8)));
22272 emit_insn (gen_lshrv32hi3 (t2,
22273 gen_lowpart (V32HImode, d->op1),
22274 GEN_INT (8)));
22275 }
22276 else
22277 {
22278 t1 = gen_lowpart (V32HImode, d->op0);
22279 t2 = gen_lowpart (V32HImode, d->op1);
22280 }
22281
22282 t3 = gen_reg_rtx (V32QImode);
22283 t4 = gen_reg_rtx (V32QImode);
22284 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
22285 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
22286 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
22287
22288 return true;
22289}
22290
22291/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
22292 and extract-odd permutations. */
22293
22294static bool
22295expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
22296{
22297 rtx t1, t2, t3, t4, t5;
22298
22299 switch (d->vmode)
22300 {
22301 case E_V4DFmode:
22302 if (d->testing_p)
22303 break;
22304 t1 = gen_reg_rtx (V4DFmode);
22305 t2 = gen_reg_rtx (V4DFmode);
22306
22307 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22308 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
22309 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
22310
22311 /* Now an unpck[lh]pd will produce the result required. */
22312 if (odd)
22313 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
22314 else
22315 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
22316 emit_insn (t3);
22317 break;
22318
22319 case E_V8SFmode:
22320 {
22321 int mask = odd ? 0xdd : 0x88;
22322
22323 if (d->testing_p)
22324 break;
22325 t1 = gen_reg_rtx (V8SFmode);
22326 t2 = gen_reg_rtx (V8SFmode);
22327 t3 = gen_reg_rtx (V8SFmode);
22328
22329 /* Shuffle within the 128-bit lanes to produce:
22330 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
22331 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
22332 GEN_INT (mask)));
22333
22334 /* Shuffle the lanes around to produce:
22335 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
22336 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
22337 GEN_INT (0x3)));
22338
22339 /* Shuffle within the 128-bit lanes to produce:
22340 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
22341 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
22342
22343 /* Shuffle within the 128-bit lanes to produce:
22344 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
22345 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
22346
22347 /* Shuffle the lanes around to produce:
22348 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
22349 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
22350 GEN_INT (0x20)));
22351 }
22352 break;
22353
22354 case E_V2DFmode:
22355 case E_V4SFmode:
22356 case E_V2DImode:
22357 case E_V2SImode:
22358 case E_V4SImode:
22359 case E_V2HImode:
22360 /* These are always directly implementable by expand_vec_perm_1. */
22361 gcc_unreachable ();
22362
22363 case E_V2SFmode:
22364 gcc_assert (TARGET_MMX_WITH_SSE);
22365 /* We have no suitable instructions. */
22366 if (d->testing_p)
22367 return false;
22368 break;
22369
22370 case E_V4QImode:
22371 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22372 return expand_vec_perm_pshufb2 (d);
22373 else
22374 {
22375 if (d->testing_p)
22376 break;
22377 /* We need 2*log2(N)-1 operations to achieve odd/even
22378 with interleave. */
22379 t1 = gen_reg_rtx (V4QImode);
22380 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
22381 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
22382 if (odd)
22383 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
22384 else
22385 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
22386 emit_insn (t2);
22387 }
22388 break;
22389
22390 case E_V4HImode:
22391 if (TARGET_SSE4_1)
22392 return expand_vec_perm_even_odd_pack (d);
22393 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22394 return expand_vec_perm_pshufb2 (d);
22395 else
22396 {
22397 if (d->testing_p)
22398 break;
22399 /* We need 2*log2(N)-1 operations to achieve odd/even
22400 with interleave. */
22401 t1 = gen_reg_rtx (V4HImode);
22402 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
22403 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
22404 if (odd)
22405 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
22406 else
22407 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
22408 emit_insn (t2);
22409 }
22410 break;
22411
22412 case E_V8HImode:
22413 if (TARGET_SSE4_1)
22414 return expand_vec_perm_even_odd_pack (d);
22415 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
22416 return expand_vec_perm_pshufb2 (d);
22417 else
22418 {
22419 if (d->testing_p)
22420 break;
22421 /* We need 2*log2(N)-1 operations to achieve odd/even
22422 with interleave. */
22423 t1 = gen_reg_rtx (V8HImode);
22424 t2 = gen_reg_rtx (V8HImode);
22425 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
22426 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
22427 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
22428 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
22429 if (odd)
22430 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
22431 else
22432 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
22433 emit_insn (t3);
22434 }
22435 break;
22436
22437 case E_V8QImode:
22438 case E_V16QImode:
22439 return expand_vec_perm_even_odd_pack (d);
22440
22441 case E_V16HImode:
22442 case E_V32QImode:
22443 return expand_vec_perm_even_odd_pack (d);
22444
22445 case E_V64QImode:
22446 return expand_vec_perm_even_odd_trunc (d);
22447
22448 case E_V4DImode:
22449 if (!TARGET_AVX2)
22450 {
22451 struct expand_vec_perm_d d_copy = *d;
22452 d_copy.vmode = V4DFmode;
22453 if (d->testing_p)
22454 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
22455 else
22456 d_copy.target = gen_reg_rtx (V4DFmode);
22457 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
22458 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
22459 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22460 {
22461 if (!d->testing_p)
22462 emit_move_insn (d->target,
22463 gen_lowpart (V4DImode, d_copy.target));
22464 return true;
22465 }
22466 return false;
22467 }
22468
22469 if (d->testing_p)
22470 break;
22471
22472 t1 = gen_reg_rtx (V4DImode);
22473 t2 = gen_reg_rtx (V4DImode);
22474
22475 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
22476 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
22477 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
22478
22479 /* Now a vpunpck[lh]qdq will produce the result required. */
22480 if (odd)
22481 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
22482 else
22483 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
22484 emit_insn (t3);
22485 break;
22486
22487 case E_V8SImode:
22488 if (!TARGET_AVX2)
22489 {
22490 struct expand_vec_perm_d d_copy = *d;
22491 d_copy.vmode = V8SFmode;
22492 if (d->testing_p)
22493 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
22494 else
22495 d_copy.target = gen_reg_rtx (V8SFmode);
22496 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
22497 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
22498 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
22499 {
22500 if (!d->testing_p)
22501 emit_move_insn (d->target,
22502 gen_lowpart (V8SImode, d_copy.target));
22503 return true;
22504 }
22505 return false;
22506 }
22507
22508 if (d->testing_p)
22509 break;
22510
22511 t1 = gen_reg_rtx (V8SImode);
22512 t2 = gen_reg_rtx (V8SImode);
22513 t3 = gen_reg_rtx (V4DImode);
22514 t4 = gen_reg_rtx (V4DImode);
22515 t5 = gen_reg_rtx (V4DImode);
22516
22517 /* Shuffle the lanes around into
22518 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
22519 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
22520 gen_lowpart (V4DImode, d->op1),
22521 GEN_INT (0x20)));
22522 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
22523 gen_lowpart (V4DImode, d->op1),
22524 GEN_INT (0x31)));
22525
22526 /* Swap the 2nd and 3rd position in each lane into
22527 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
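      /* The pshufd immediate 2*4 + 1*16 + 3*64 (0xd8) encodes the per-lane
	 element order 0, 2, 1, 3.  */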
22528 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
22529 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22530 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
22531 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
22532
22533 /* Now a vpunpck[lh]qdq will produce
22534 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
22535 if (odd)
22536 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
22537 gen_lowpart (V4DImode, t2));
22538 else
22539 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
22540 gen_lowpart (V4DImode, t2));
22541 emit_insn (t3);
22542 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
22543 break;
22544
22545 default:
22546 gcc_unreachable ();
22547 }
22548
22549 return true;
22550}
22551
22552/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22553 extract-even and extract-odd permutations. */
22554
22555static bool
22556expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
22557{
22558 unsigned i, odd, nelt = d->nelt;
22559
22560 odd = d->perm[0];
22561 if (odd != 0 && odd != 1)
22562 return false;
22563
22564 for (i = 1; i < nelt; ++i)
22565 if (d->perm[i] != 2 * i + odd)
22566 return false;
22567
22568 if (d->vmode == E_V32HImode
22569 && d->testing_p
22570 && !TARGET_AVX512BW)
22571 return false;
22572
22573 return expand_vec_perm_even_odd_1 (d, odd);
22574}
22575
22576/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
22577 permutations. We assume that expand_vec_perm_1 has already failed. */
22578
22579static bool
22580expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
22581{
22582 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
22583 machine_mode vmode = d->vmode;
22584 rtx (*gen) (rtx, rtx, rtx);
22585 unsigned char perm2[4];
22586 rtx op0 = d->op0, dest;
22587 bool ok;
22588
22589 switch (vmode)
22590 {
22591 case E_V4DFmode:
22592 case E_V8SFmode:
22593 /* These are special-cased in sse.md so that we can optionally
22594 use the vbroadcast instruction. They expand to two insns
22595 if the input happens to be in a register. */
22596 gcc_unreachable ();
22597
22598 case E_V2DFmode:
22599 case E_V2SFmode:
22600 case E_V4SFmode:
22601 case E_V2DImode:
22602 case E_V2SImode:
22603 case E_V4SImode:
22604 case E_V2HImode:
22605 case E_V4HImode:
22606 /* These are always implementable using standard shuffle patterns. */
22607 gcc_unreachable ();
22608
22609 case E_V4QImode:
22610 /* This can be implemented via interleave and pshuflw. */
22611 if (d->testing_p)
22612 return true;
22613
22614 if (elt >= nelt2)
22615 {
22616 gen = gen_mmx_punpckhbw_low;
22617 elt -= nelt2;
22618 }
22619 else
22620 gen = gen_mmx_punpcklbw_low;
22621
22622 dest = gen_reg_rtx (vmode);
22623 emit_insn (gen (dest, op0, op0));
22624 vmode = get_mode_wider_vector (vmode);
22625 op0 = gen_lowpart (vmode, dest);
22626
22627 memset (perm2, elt, 2);
22628 dest = gen_reg_rtx (vmode);
22629 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22630 gcc_assert (ok);
22631
22632 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22633 return true;
22634
22635 case E_V8QImode:
22636 /* This can be implemented via interleave. We save one insn by
22637 stopping once we have promoted to V2SImode and then use pshufd. */
22638 if (d->testing_p)
22639 return true;
22640 do
22641 {
22642 if (elt >= nelt2)
22643 {
22644 gen = vmode == V8QImode ? gen_mmx_punpckhbw
22645 : gen_mmx_punpckhwd;
22646 elt -= nelt2;
22647 }
22648 else
22649 gen = vmode == V8QImode ? gen_mmx_punpcklbw
22650 : gen_mmx_punpcklwd;
22651 nelt2 /= 2;
22652
22653 dest = gen_reg_rtx (vmode);
22654 emit_insn (gen (dest, op0, op0));
22655 vmode = get_mode_wider_vector (vmode);
22656 op0 = gen_lowpart (vmode, dest);
22657 }
22658 while (vmode != V2SImode);
22659
22660 memset (perm2, elt, 2);
22661 dest = gen_reg_rtx (vmode);
22662 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22663 gcc_assert (ok);
22664
22665 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22666 return true;
22667
22668 case E_V8HImode:
22669 case E_V16QImode:
22670 /* These can be implemented via interleave. We save one insn by
22671 stopping once we have promoted to V4SImode and then use pshufd. */
22672 if (d->testing_p)
22673 return true;
22674 do
22675 {
22676 if (elt >= nelt2)
22677 {
22678 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
22679 : gen_vec_interleave_highv8hi;
22680 elt -= nelt2;
22681 }
22682 else
22683 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
22684 : gen_vec_interleave_lowv8hi;
22685 nelt2 /= 2;
22686
22687 dest = gen_reg_rtx (vmode);
22688 emit_insn (gen (dest, op0, op0));
22689 vmode = get_mode_wider_vector (vmode);
22690 op0 = gen_lowpart (vmode, dest);
22691 }
22692 while (vmode != V4SImode);
22693
22694 memset (perm2, elt, 4);
22695 dest = gen_reg_rtx (vmode);
22696 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22697 gcc_assert (ok);
22698
22699 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22700 return true;
22701
22702 case E_V8HFmode:
22703 case E_V8BFmode:
22704 /* This can be implemented via interleave and pshufd. */
22705 if (d->testing_p)
22706 return true;
22707
22708 rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx);
22709 if (elt >= nelt2)
22710 {
22711 gen_interleave = gen_vec_interleave_high;
22712 elt -= nelt2;
22713 }
22714 else
22715 gen_interleave = gen_vec_interleave_low;
22716 nelt2 /= 2;
22717
22718 dest = gen_reg_rtx (vmode);
22719 emit_insn (gen_interleave (vmode, 1, dest, op0, op0));
22720
22721 vmode = V4SImode;
22722 op0 = gen_lowpart (vmode, dest);
22723
22724 memset (perm2, elt, 4);
22725 dest = gen_reg_rtx (vmode);
22726 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22727 gcc_assert (ok);
22728
22729 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22730 return true;
22731
22732 case E_V32QImode:
22733 case E_V16HImode:
22734 case E_V8SImode:
22735 case E_V4DImode:
22736 /* For AVX2 broadcasts of the first element vpbroadcast* or
22737 vpermq should be used by expand_vec_perm_1. */
22738 gcc_assert (!TARGET_AVX2 || d->perm[0]);
22739 return false;
22740
22741 case E_V64QImode:
22742 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
22743 return false;
22744
22745 case E_V32HImode:
22746 gcc_assert (!TARGET_AVX512BW);
22747 return false;
22748
22749 default:
22750 gcc_unreachable ();
22751 }
22752}
22753
22754/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22755 broadcast permutations. */
22756
22757static bool
22758expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
22759{
22760 unsigned i, elt, nelt = d->nelt;
22761
22762 if (!d->one_operand_p)
22763 return false;
22764
22765 elt = d->perm[0];
22766 for (i = 1; i < nelt; ++i)
22767 if (d->perm[i] != elt)
22768 return false;
22769
22770 return expand_vec_perm_broadcast_1 (d);
22771}
22772
22773/* Implement arbitrary permutations of two V64QImode operands
22774 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
22775static bool
22776expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
22777{
22778 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
22779 return false;
22780
22781 if (d->testing_p)
22782 return true;
22783
22784 struct expand_vec_perm_d ds[2];
22785 rtx rperm[128], vperm, target0, target1;
22786 unsigned int i, nelt;
22787 machine_mode vmode;
22788
22789 nelt = d->nelt;
22790 vmode = V64QImode;
22791
22792 for (i = 0; i < 2; i++)
22793 {
22794 ds[i] = *d;
22795 ds[i].vmode = V32HImode;
22796 ds[i].nelt = 32;
22797 ds[i].target = gen_reg_rtx (V32HImode);
22798 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
22799 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
22800 }
22801
22802 /* Prepare permutations such that the first one takes care of
22803 putting the even bytes into the right positions or one higher
22804 positions (ds[0]) and the second one takes care of
22805 putting the odd bytes into the right positions or one below
22806 (ds[1]). */
22807
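  /* rperm[0..63] is the vpshufb mask applied to ds[0]'s result and keeps
     only the even destination bytes; rperm[64..127] is the mask for ds[1]'s
     result and keeps only the odd destination bytes.  A -1 byte has its
     top bit set, so vpshufb writes zero there and the final vpor merges
     the two halves.  */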
22808 for (i = 0; i < nelt; i++)
22809 {
22810 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
22811 if (i & 1)
22812 {
22813 rperm[i] = constm1_rtx;
22814 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22815 }
22816 else
22817 {
22818 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22819 rperm[i + 64] = constm1_rtx;
22820 }
22821 }
22822
22823 bool ok = expand_vec_perm_1 (&ds[0]);
22824 gcc_assert (ok);
22825 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
22826
22827 ok = expand_vec_perm_1 (&ds[1]);
22828 gcc_assert (ok);
22829 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
22830
22831 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
22832 vperm = force_reg (vmode, vperm);
22833 target0 = gen_reg_rtx (V64QImode);
22834 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
22835
22836 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
22837 vperm = force_reg (vmode, vperm);
22838 target1 = gen_reg_rtx (V64QImode);
22839 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
22840
22841 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
22842 return true;
22843}
22844
22845/* Implement arbitrary permutation of two V32QImode and V16QImode operands
22846 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
22847 all the shorter instruction sequences. */
22848
22849static bool
22850expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
22851{
22852 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
22853 unsigned int i, nelt, eltsz;
22854 bool used[4];
22855
22856 if (!TARGET_AVX2
22857 || d->one_operand_p
22858 || (d->vmode != V32QImode && d->vmode != V16HImode))
22859 return false;
22860
22861 if (d->testing_p)
22862 return true;
22863
22864 nelt = d->nelt;
22865 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22866
22867 /* Generate 4 permutation masks. If the required element is within
22868 the same lane, it is shuffled in. If the required element is from
22869 the other lane, force a zero by setting bit 7 in the permutation
22870 mask so that vpshufb writes a zero byte there. In the other mask
22871 the element gets a non-negative index, but it is also moved to the
22872 other lane, so that the result of vpshufb can have the two V2TImode
22873 halves swapped. */
22874 m128 = GEN_INT (-128);
22875 for (i = 0; i < 32; ++i)
22876 {
22877 rperm[0][i] = m128;
22878 rperm[1][i] = m128;
22879 rperm[2][i] = m128;
22880 rperm[3][i] = m128;
22881 }
22882 used[0] = false;
22883 used[1] = false;
22884 used[2] = false;
22885 used[3] = false;
22886 for (i = 0; i < nelt; ++i)
22887 {
22888 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22889 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
22890 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
22891
22892 for (j = 0; j < eltsz; ++j)
22893 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
22894 used[which] = true;
22895 }
22896
22897 for (i = 0; i < 2; ++i)
22898 {
22899 if (!used[2 * i + 1])
22900 {
22901 h[i] = NULL_RTX;
22902 continue;
22903 }
22904 vperm = gen_rtx_CONST_VECTOR (V32QImode,
22905 gen_rtvec_v (32, rperm[2 * i + 1]));
22906 vperm = force_reg (V32QImode, vperm);
22907 h[i] = gen_reg_rtx (V32QImode);
22908 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22909 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
22910 }
22911
22912 /* Swap the 128-bit lanes of h[X]. */
22913 for (i = 0; i < 2; ++i)
22914 {
22915 if (h[i] == NULL_RTX)
22916 continue;
22917 op = gen_reg_rtx (V4DImode);
22918 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
22919 const2_rtx, GEN_INT (3), const0_rtx,
22920 const1_rtx));
22921 h[i] = gen_lowpart (V32QImode, op);
22922 }
22923
22924 for (i = 0; i < 2; ++i)
22925 {
22926 if (!used[2 * i])
22927 {
22928 l[i] = NULL_RTX;
22929 continue;
22930 }
22931 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
22932 vperm = force_reg (V32QImode, vperm);
22933 l[i] = gen_reg_rtx (V32QImode);
22934 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22935 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
22936 }
22937
22938 for (i = 0; i < 2; ++i)
22939 {
22940 if (h[i] && l[i])
22941 {
22942 op = gen_reg_rtx (V32QImode);
22943 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
22944 l[i] = op;
22945 }
22946 else if (h[i])
22947 l[i] = h[i];
22948 }
22949
22950 gcc_assert (l[0] && l[1]);
22951 op = d->target;
22952 if (d->vmode != V32QImode)
22953 op = gen_reg_rtx (V32QImode);
22954 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
22955 if (op != d->target)
22956 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22957 return true;
22958}
22959
22960/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
22961 taken care of, perform the expansion in D and return true on success. */
22962
22963static bool
22964ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22965{
22966 /* Try a single instruction expansion. */
22967 if (expand_vec_perm_1 (d))
22968 return true;
22969
22970 /* Try sequences of two instructions. */
22971
22972 if (expand_vec_perm_pshuflw_pshufhw (d))
22973 return true;
22974
22975 if (expand_vec_perm_palignr (d, false))
22976 return true;
22977
22978 if (expand_vec_perm_interleave2 (d))
22979 return true;
22980
22981 if (expand_vec_perm_broadcast (d))
22982 return true;
22983
22984 if (expand_vec_perm_vpermq_perm_1 (d))
22985 return true;
22986
22987 if (expand_vec_perm_vperm2f128 (d))
22988 return true;
22989
22990 if (expand_vec_perm_pblendv (d))
22991 return true;
22992
22993 if (expand_vec_perm_2perm_interleave (d, true))
22994 return true;
22995
22996 if (expand_vec_perm_2perm_pblendv (d, true))
22997 return true;
22998
22999 if (expand_vec_perm_shufps_shufps (d))
23000 return true;
23001
23002 /* Try sequences of three instructions. */
23003
23004 if (expand_vec_perm_even_odd_pack (d))
23005 return true;
23006
23007 if (expand_vec_perm_2vperm2f128_vshuf (d))
23008 return true;
23009
23010 if (expand_vec_perm_pshufb2 (d))
23011 return true;
23012
23013 if (expand_vec_perm_pslldq_psrldq_por (d, false))
23014 return true;
23015
23016 if (expand_vec_perm_interleave3 (d))
23017 return true;
23018
23019 if (expand_vec_perm_vperm2f128_vblend (d))
23020 return true;
23021
23022 if (expand_vec_perm_2perm_interleave (d, false))
23023 return true;
23024
23025 if (expand_vec_perm_2perm_pblendv (d, false))
23026 return true;
23027
23028 /* Try sequences of four instructions. */
23029
23030 if (expand_vec_perm_even_odd_trunc (d))
23031 return true;
23032 if (expand_vec_perm_vpshufb2_vpermq (d))
23033 return true;
23034
23035 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
23036 return true;
23037
23038 if (expand_vec_perm_vpermt2_vpshub2 (d))
23039 return true;
23040
23041 /* ??? Look for narrow permutations whose element orderings would
23042 allow the promotion to a wider mode. */
23043
23044 /* ??? Look for sequences of interleave or a wider permute that place
23045 the data into the correct lanes for a half-vector shuffle like
23046 pshuf[lh]w or vpermilps. */
23047
23048 /* ??? Look for sequences of interleave that produce the desired results.
23049 The combinatorics of punpck[lh] get pretty ugly... */
23050
23051 if (expand_vec_perm_even_odd (d))
23052 return true;
23053
23054 /* Generate four or five instructions. */
23055 if (expand_vec_perm_pslldq_psrldq_por (d, true))
23056 return true;
23057
23058 /* Even longer sequences. */
23059 if (expand_vec_perm_vpshufb4_vpermq2 (d))
23060 return true;
23061
23062 /* See if we can get the same permutation in different vector integer
23063 mode. */
23064 struct expand_vec_perm_d nd;
23065 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
23066 {
23067 if (!d->testing_p)
23068 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
23069 return true;
23070 }
23071
23072 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
23073 if (expand_vec_perm2_vperm2f128_vblend (d))
23074 return true;
23075
23076 return false;
23077}
23078
23079/* If a permutation only uses one operand, make it clear. Returns true
23080 if the permutation references both operands. */
23081
23082static bool
23083canonicalize_perm (struct expand_vec_perm_d *d)
23084{
23085 int i, which, nelt = d->nelt;
23086
23087 for (i = which = 0; i < nelt; ++i)
23088 which |= (d->perm[i] < nelt ? 1 : 2);
23089
23090 d->one_operand_p = true;
23091 switch (which)
23092 {
23093 default:
23094 gcc_unreachable();
23095
23096 case 3:
23097 if (!rtx_equal_p (d->op0, d->op1))
23098 {
23099 d->one_operand_p = false;
23100 break;
23101 }
23102 /* The elements of PERM do not suggest that only the first operand
23103 is used, but both operands are identical. Allow easier matching
23104 of the permutation by folding the permutation into the single
23105 input vector. */
23106 /* FALLTHRU */
23107
23108 case 2:
23109 for (i = 0; i < nelt; ++i)
23110 d->perm[i] &= nelt - 1;
23111 d->op0 = d->op1;
23112 break;
23113
23114 case 1:
23115 d->op1 = d->op0;
23116 break;
23117 }
23118
23119 return (which == 3);
23120}
23121
23122/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
23123
23124bool
23125ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
23126 rtx target, rtx op0, rtx op1,
23127 const vec_perm_indices &sel)
23128{
23129 if (vmode != op_mode)
23130 return false;
23131
23132 struct expand_vec_perm_d d;
23133 unsigned char perm[MAX_VECT_LEN];
23134 unsigned int i, nelt, which;
23135 bool two_args;
23136
23137 if (GET_MODE_SIZE (vmode) == 64 && !TARGET_EVEX512)
23138 return false;
23139
23140 /* For HF mode vector, convert it to HI using subreg. */
23141 if (GET_MODE_INNER (vmode) == HFmode)
23142 {
23143 machine_mode orig_mode = vmode;
23144 vmode = mode_for_vector (HImode,
23145 GET_MODE_NUNITS (vmode)).require ();
23146 if (target)
23147 target = lowpart_subreg (vmode, target, orig_mode);
23148 if (op0)
23149 op0 = lowpart_subreg (vmode, op0, orig_mode);
23150 if (op1)
23151 op1 = lowpart_subreg (vmode, op1, orig_mode);
23152 }
23153
23154 d.target = target;
23155 d.op0 = op0;
23156 d.op1 = op1;
23157
23158 d.vmode = vmode;
23159 gcc_assert (VECTOR_MODE_P (d.vmode));
23160 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23161 d.testing_p = !target;
23162
23163 gcc_assert (sel.length () == nelt);
23164 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
23165
23166 /* Given sufficient ISA support we can just return true here
23167 for selected vector modes. */
23168 switch (d.vmode)
23169 {
23170 case E_V16SFmode:
23171 case E_V16SImode:
23172 case E_V8DImode:
23173 case E_V8DFmode:
23174 if (!TARGET_AVX512F)
23175 return false;
23176 /* All implementable with a single vperm[it]2 insn. */
23177 if (d.testing_p)
23178 return true;
23179 break;
23180 case E_V32HImode:
23181 if (!TARGET_AVX512F)
23182 return false;
23183 if (d.testing_p && TARGET_AVX512BW)
23184 /* All implementable with a single vperm[it]2 insn. */
23185 return true;
23186 break;
23187 case E_V64QImode:
23188 if (!TARGET_AVX512F)
23189 return false;
23190 if (d.testing_p && TARGET_AVX512BW)
23191 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
23192 return true;
23193 break;
23194 case E_V8SImode:
23195 case E_V8SFmode:
23196 case E_V4DFmode:
23197 case E_V4DImode:
23198 if (!TARGET_AVX)
23199 return false;
23200 if (d.testing_p && TARGET_AVX512VL)
23201 /* All implementable with a single vperm[it]2 insn. */
23202 return true;
23203 break;
23204 case E_V16HImode:
23205 if (!TARGET_SSE2)
23206 return false;
23207 if (d.testing_p && TARGET_AVX2)
23208 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
23209 return true;
23210 break;
23211 case E_V32QImode:
23212 if (!TARGET_SSE2)
23213 return false;
23214 if (d.testing_p && TARGET_AVX2)
23215 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
23216 return true;
23217 break;
23218 case E_V8HImode:
23219 case E_V16QImode:
23220 if (!TARGET_SSE2)
23221 return false;
23222 /* Fall through. */
23223 case E_V4SImode:
23224 case E_V4SFmode:
23225 if (!TARGET_SSE)
23226 return false;
23227 /* All implementable with a single vpperm insn. */
23228 if (d.testing_p && TARGET_XOP)
23229 return true;
23230 /* All implementable with 2 pshufb + 1 ior. */
23231 if (d.testing_p && TARGET_SSSE3)
23232 return true;
23233 break;
23234 case E_V2SFmode:
23235 case E_V2SImode:
23236 case E_V4HImode:
23237 case E_V8QImode:
23238 if (!TARGET_MMX_WITH_SSE)
23239 return false;
23240 break;
23241 case E_V2HImode:
23242 if (!TARGET_SSE2)
23243 return false;
23244 /* All implementable with *punpckwd. */
23245 if (d.testing_p)
23246 return true;
23247 break;
23248 case E_V4QImode:
23249 if (!TARGET_SSE2)
23250 return false;
23251 break;
23252 case E_V2DImode:
23253 case E_V2DFmode:
23254 if (!TARGET_SSE)
23255 return false;
23256 /* All implementable with shufpd or unpck[lh]pd. */
23257 if (d.testing_p)
23258 return true;
23259 break;
23260 default:
23261 return false;
23262 }
23263
23264 for (i = which = 0; i < nelt; ++i)
23265 {
23266 unsigned char e = sel[i];
23267 gcc_assert (e < 2 * nelt);
23268 d.perm[i] = e;
23269 perm[i] = e;
23270 which |= (e < nelt ? 1 : 2);
23271 }
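  /* WHICH is now a mask of the referenced inputs: bit 0 is set if any
     element comes from OP0, bit 1 if any element comes from OP1.  */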
23272
23273 if (d.testing_p)
23274 {
23275 /* If all elements come from the second vector, fold them onto the first. */
23276 if (which == 2)
23277 for (i = 0; i < nelt; ++i)
23278 d.perm[i] -= nelt;
23279
23280 /* Check whether the mask can be applied to the vector type. */
23281 d.one_operand_p = (which != 3);
23282
23283 /* Implementable with shufps, pshufd or pshuflw. */
23284 if (d.one_operand_p
23285 && (d.vmode == V4SFmode || d.vmode == V2SFmode
23286 || d.vmode == V4SImode || d.vmode == V2SImode
23287 || d.vmode == V4HImode || d.vmode == V2HImode))
23288 return true;
23289
23290 /* Otherwise we have to go through the motions and see if we can
23291 figure out how to generate the requested permutation. */
23292 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
23293 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
23294 if (!d.one_operand_p)
23295 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
23296
23297 start_sequence ();
23298 bool ret = ix86_expand_vec_perm_const_1 (&d);
23299 end_sequence ();
23300
23301 return ret;
23302 }
23303
23304 two_args = canonicalize_perm (&d);
23305
23306 /* If one of the operands is a zero vector, try to match pmovzx. */
23307 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
23308 {
23309 struct expand_vec_perm_d dzero = d;
23310 if (d.op0 == CONST0_RTX (vmode))
23311 {
23312 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
23313 std::swap (dzero.op0, dzero.op1);
23314 for (i = 0; i < nelt; ++i)
23315 dzero.perm[i] ^= nelt;
23316 }
23317 else
23318 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
23319
23320 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
23321 dzero.perm, nelt, dzero.testing_p))
23322 return true;
23323 }
23324
23325 /* Force operands into registers. */
23326 rtx nop0 = force_reg (vmode, d.op0);
23327 if (d.op0 == d.op1)
23328 d.op1 = nop0;
23329 d.op0 = nop0;
23330 d.op1 = force_reg (vmode, d.op1);
23331
23332 if (ix86_expand_vec_perm_const_1 (&d))
23333 return true;
23334
23335 /* If the selector says both arguments are needed, but the operands are the
23336 same, the above tried to expand with one_operand_p and flattened selector.
23337 If that didn't work, retry without one_operand_p; we succeeded with that
23338 during testing. */
23339 if (two_args && d.one_operand_p)
23340 {
23341 d.one_operand_p = false;
23342 memcpy (d.perm, perm, sizeof (perm));
23343 return ix86_expand_vec_perm_const_1 (&d);
23344 }
23345
23346 return false;
23347}
23348
23349void
23350ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
23351{
23352 struct expand_vec_perm_d d;
23353 unsigned i, nelt;
23354
23355 d.target = targ;
23356 d.op0 = op0;
23357 d.op1 = op1;
23358 d.vmode = GET_MODE (targ);
23359 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23360 d.one_operand_p = false;
23361 d.testing_p = false;
23362
23363 for (i = 0; i < nelt; ++i)
23364 d.perm[i] = i * 2 + odd;
23365
23366 /* We'll either be able to implement the permutation directly... */
23367 if (expand_vec_perm_1 (&d))
23368 return;
23369
23370 /* ... or we use the special-case patterns. */
23371 expand_vec_perm_even_odd_1 (&d, odd);
23372}
23373
23374static void
23375ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
23376{
23377 struct expand_vec_perm_d d;
23378 unsigned i, nelt, base;
23379 bool ok;
23380
23381 d.target = targ;
23382 d.op0 = op0;
23383 d.op1 = op1;
23384 d.vmode = GET_MODE (targ);
23385 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
23386 d.one_operand_p = false;
23387 d.testing_p = false;
23388
23389 base = high_p ? nelt / 2 : 0;
23390 for (i = 0; i < nelt / 2; ++i)
23391 {
23392 d.perm[i * 2] = i + base;
23393 d.perm[i * 2 + 1] = i + base + nelt;
23394 }
23395
23396 /* Note that for AVX this isn't one instruction. */
23397 ok = ix86_expand_vec_perm_const_1 (d: &d);
23398 gcc_assert (ok);
23399}
23400
23401 /* Expand a vector shift by a constant for V*QImode in terms of the
23402 same operation on V*HImode. Return true on success. */
23403static bool
23404ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
23405 rtx dest, rtx op1, rtx op2)
23406{
23407 machine_mode qimode, himode;
23408 HOST_WIDE_INT and_constant, xor_constant;
23409 HOST_WIDE_INT shift_amount;
23410 rtx vec_const_and, vec_const_xor;
23411 rtx tmp, op1_subreg;
23412 rtx (*gen_shift) (rtx, rtx, rtx);
23413 rtx (*gen_and) (rtx, rtx, rtx);
23414 rtx (*gen_xor) (rtx, rtx, rtx);
23415 rtx (*gen_sub) (rtx, rtx, rtx);
23416
23417 /* Only optimize shift by constant. */
23418 if (!CONST_INT_P (op2))
23419 return false;
23420
23421 qimode = GET_MODE (dest);
23422 shift_amount = INTVAL (op2);
23423 /* Do nothing when the shift amount is greater than or equal to 8. */
23424 if (shift_amount > 7)
23425 return false;
23426
23427 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
23428 /* Record sign bit. */
23429 xor_constant = 1 << (8 - shift_amount - 1);
23430
23431 /* Mask off the bits of each byte that were shifted in from the adjacent byte of the word. */
23432 and_constant
23433 = (code == ASHIFT ? 256 - (1 << shift_amount)
23434 : (1 << (8 - shift_amount)) - 1);
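  /* E.g. for a shift by 3: LSHIFTRT uses and_constant 0x1f to clear the
     three bits that leaked in from the neighbouring byte, ASHIFT uses
     0xf8 to clear the low three bits, and for ASHIFTRT xor_constant is
     0x10 so that (x ^ 0x10) - 0x10 sign-extends the remaining 5-bit
     value in every byte.  */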
23435
23436 switch (qimode)
23437 {
23438 case V16QImode:
23439 himode = V8HImode;
23440 gen_shift =
23441 ((code == ASHIFT)
23442 ? gen_ashlv8hi3
23443 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
23444 gen_and = gen_andv16qi3;
23445 gen_xor = gen_xorv16qi3;
23446 gen_sub = gen_subv16qi3;
23447 break;
23448 case V32QImode:
23449 himode = V16HImode;
23450 gen_shift =
23451 ((code == ASHIFT)
23452 ? gen_ashlv16hi3
23453 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
23454 gen_and = gen_andv32qi3;
23455 gen_xor = gen_xorv32qi3;
23456 gen_sub = gen_subv32qi3;
23457 break;
23458 case V64QImode:
23459 himode = V32HImode;
23460 gen_shift =
23461 ((code == ASHIFT)
23462 ? gen_ashlv32hi3
23463 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
23464 gen_and = gen_andv64qi3;
23465 gen_xor = gen_xorv64qi3;
23466 gen_sub = gen_subv64qi3;
23467 break;
23468 default:
23469 gcc_unreachable ();
23470 }
23471
23472 tmp = gen_reg_rtx (himode);
23473 vec_const_and = gen_reg_rtx (qimode);
23474 op1_subreg = lowpart_subreg (himode, op1, qimode);
23475
23476 /* For ASHIFT and LSHIFTRT, perform operation like
23477 vpsllw/vpsrlw $shift_amount, %op1, %dest.
23478 vpand %vec_const_and, %dest. */
23479 emit_insn (gen_shift (tmp, op1_subreg, op2));
23480 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
23481 emit_move_insn (vec_const_and,
23482 ix86_build_const_vector (qimode, true,
23483 gen_int_mode (and_constant, QImode)));
23484 emit_insn (gen_and (dest, dest, vec_const_and));
23485
23486 /* For ASHIFTRT, perform extra operation like
23487 vpxor %vec_const_xor, %dest, %dest
23488 vpsubb %vec_const_xor, %dest, %dest */
23489 if (code == ASHIFTRT)
23490 {
23491 vec_const_xor = gen_reg_rtx (qimode);
23492 emit_move_insn (vec_const_xor,
23493 ix86_build_const_vector (qimode, true,
23494 gen_int_mode (xor_constant, QImode)));
23495 emit_insn (gen_xor (dest, dest, vec_const_xor));
23496 emit_insn (gen_sub (dest, dest, vec_const_xor));
23497 }
23498 return true;
23499}
23500
23501void
23502ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23503{
23504 machine_mode qimode = GET_MODE (dest);
23505 rtx qop1, qop2, hop1, hop2, qdest, hdest;
23506 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23507 bool uns_p = code != ASHIFTRT;
23508
23509 switch (qimode)
23510 {
23511 case E_V4QImode:
23512 case E_V8QImode:
23513 break;
23514 default:
23515 gcc_unreachable ();
23516 }
23517
23518 qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
23519
23520 if (op2vec)
23521 qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
23522 else
23523 qop2 = op2;
23524
23525 qdest = gen_reg_rtx (V16QImode);
23526
23527 if (CONST_INT_P (op2)
23528 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23529 && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
23530 {
23531 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23532 return;
23533 }
23534
23535 switch (code)
23536 {
23537 case MULT:
23538 gcc_assert (op2vec);
23539 if (!TARGET_SSE4_1)
23540 {
23541 /* Unpack data such that we've got a source byte in each low byte
23542 of each word. We don't care what goes into the high byte of
23543 each word. Rather than trying to get zero in there, most
23544 convenient is to let it be a copy of the low byte. */
23545 hop1 = copy_to_reg (qop1);
23546 hop2 = copy_to_reg (qop2);
23547 emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
23548 emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
23549 break;
23550 }
23551 /* FALLTHRU */
23552 case ASHIFT:
23553 case ASHIFTRT:
23554 case LSHIFTRT:
23555 hop1 = gen_reg_rtx (V8HImode);
23556 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
23557 /* mult/vashr/vlshr/vashl */
23558 if (op2vec)
23559 {
23560 hop2 = gen_reg_rtx (V8HImode);
23561 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
23562 }
23563 else
23564 hop2 = qop2;
23565
23566 break;
23567 default:
23568 gcc_unreachable ();
23569 }
23570
23571 if (code != MULT && op2vec)
23572 {
23573 /* Expand vashr/vlshr/vashl. */
23574 hdest = gen_reg_rtx (V8HImode);
23575 emit_insn (gen_rtx_SET (hdest,
23576 simplify_gen_binary (code, V8HImode,
23577 hop1, hop2)));
23578 }
23579 else
23580 /* Expand mult/ashr/lshr/ashl. */
23581 hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
23582 NULL_RTX, 1, OPTAB_DIRECT);
23583
23584 if (TARGET_AVX512BW && TARGET_AVX512VL)
23585 {
23586 if (qimode == V8QImode)
23587 qdest = dest;
23588 else
23589 qdest = gen_reg_rtx (V8QImode);
23590
23591 emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
23592 }
23593 else
23594 {
23595 struct expand_vec_perm_d d;
23596 rtx qres = gen_lowpart (V16QImode, hdest);
23597 bool ok;
23598 int i;
23599
23600 /* Merge the data back into the right place. */
23601 d.target = qdest;
23602 d.op0 = d.op1 = qres;
23603 d.vmode = V16QImode;
23604 d.nelt = 16;
23605 d.one_operand_p = false;
23606 d.testing_p = false;
23607
23608 for (i = 0; i < d.nelt; ++i)
23609 d.perm[i] = i * 2;
23610
23611 ok = ix86_expand_vec_perm_const_1 (&d);
23612 gcc_assert (ok);
23613 }
23614
23615 if (qdest != dest)
23616 emit_move_insn (dest, gen_lowpart (qimode, qdest));
23617}
23618
23619/* Emit instruction in 2x wider mode. For example, optimize
23620 vector MUL generation like
23621
23622 vpmovzxbw ymm2, xmm0
23623 vpmovzxbw ymm3, xmm1
23624 vpmullw ymm4, ymm2, ymm3
23625 vpmovwb xmm0, ymm4
23626
23627 it would take fewer instructions than ix86_expand_vecop_qihi.
23628 Return true if success. */
23629
23630static bool
23631ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23632{
23633 machine_mode himode, qimode = GET_MODE (dest);
23634 machine_mode wqimode;
23635 rtx qop1, qop2, hop1, hop2, hdest;
23636 rtx (*gen_truncate)(rtx, rtx) = NULL;
23637 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23638 bool uns_p = code != ASHIFTRT;
23639
23640 if ((qimode == V16QImode && !TARGET_AVX2)
23641 || (qimode == V32QImode && (!TARGET_AVX512BW || !TARGET_EVEX512))
23642 /* There are no V64HImode instructions. */
23643 || qimode == V64QImode)
23644 return false;
23645
23646 /* Do not generate ymm/zmm instructions when
23647 target prefers 128/256 bit vector width. */
23648 if ((qimode == V16QImode && TARGET_PREFER_AVX128)
23649 || (qimode == V32QImode && TARGET_PREFER_AVX256))
23650 return false;
23651
23652 switch (qimode)
23653 {
23654 case E_V16QImode:
23655 himode = V16HImode;
23656 if (TARGET_AVX512VL && TARGET_AVX512BW)
23657 gen_truncate = gen_truncv16hiv16qi2;
23658 break;
23659 case E_V32QImode:
23660 himode = V32HImode;
23661 gen_truncate = gen_truncv32hiv32qi2;
23662 break;
23663 default:
23664 gcc_unreachable ();
23665 }
23666
23667 wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
23668 qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
23669
23670 if (op2vec)
23671 qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
23672 else
23673 qop2 = op2;
23674
23675 hop1 = gen_reg_rtx (himode);
23676 ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
23677
23678 if (op2vec)
23679 {
23680 hop2 = gen_reg_rtx (himode);
23681 ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
23682 }
23683 else
23684 hop2 = qop2;
23685
23686 if (code != MULT && op2vec)
23687 {
23688 /* Expand vashr/vlshr/vashl. */
23689 hdest = gen_reg_rtx (himode);
23690 emit_insn (gen_rtx_SET (hdest,
23691 simplify_gen_binary (code, himode,
23692 hop1, hop2)));
23693 }
23694 else
23695 /* Expand mult/ashr/lshr/ashl. */
23696 hdest = expand_simple_binop (himode, code, hop1, hop2,
23697 NULL_RTX, 1, OPTAB_DIRECT);
23698
23699 if (gen_truncate)
23700 emit_insn (gen_truncate (dest, hdest));
23701 else
23702 {
23703 struct expand_vec_perm_d d;
23704 rtx wqdest = gen_reg_rtx (wqimode);
23705 rtx wqres = gen_lowpart (wqimode, hdest);
23706 bool ok;
23707 int i;
23708
23709 /* Merge the data back into the right place. */
23710 d.target = wqdest;
23711 d.op0 = d.op1 = wqres;
23712 d.vmode = wqimode;
23713 d.nelt = GET_MODE_NUNITS (wqimode);
23714 d.one_operand_p = false;
23715 d.testing_p = false;
23716
23717 for (i = 0; i < d.nelt; ++i)
23718 d.perm[i] = i * 2;
23719
23720 ok = ix86_expand_vec_perm_const_1 (&d);
23721 gcc_assert (ok);
23722
23723 emit_move_insn (dest, gen_lowpart (qimode, wqdest));
23724 }
23725
23726 return true;
23727}
23728
23729/* Expand a vector operation CODE for a V*QImode in terms of the
23730 same operation on V*HImode. */
23731
23732void
23733ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
23734{
23735 machine_mode qimode = GET_MODE (dest);
23736 machine_mode himode;
23737 rtx (*gen_il) (rtx, rtx, rtx);
23738 rtx (*gen_ih) (rtx, rtx, rtx);
23739 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
23740 bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
23741 struct expand_vec_perm_d d;
23742 bool full_interleave = true;
23743 bool uns_p = code != ASHIFTRT;
23744 bool ok;
23745 int i;
23746
23747 if (CONST_INT_P (op2)
23748 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
23749 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
23750 return;
23751
23752 if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
23753 return;
23754
23755 switch (qimode)
23756 {
23757 case E_V16QImode:
23758 himode = V8HImode;
23759 break;
23760 case E_V32QImode:
23761 himode = V16HImode;
23762 break;
23763 case E_V64QImode:
23764 himode = V32HImode;
23765 break;
23766 default:
23767 gcc_unreachable ();
23768 }
23769
23770 switch (code)
23771 {
23772 case MULT:
23773 gcc_assert (op2vec);
23774 /* Unpack data such that we've got a source byte in each low byte of
23775 each word. We don't care what goes into the high byte of each word.
23776 Rather than trying to get zero in there, most convenient is to let
23777 it be a copy of the low byte. */
23778 switch (qimode)
23779 {
23780 case E_V16QImode:
23781 gen_il = gen_vec_interleave_lowv16qi;
23782 gen_ih = gen_vec_interleave_highv16qi;
23783 break;
23784 case E_V32QImode:
23785 gen_il = gen_avx2_interleave_lowv32qi;
23786 gen_ih = gen_avx2_interleave_highv32qi;
23787 full_interleave = false;
23788 break;
23789 case E_V64QImode:
23790 gen_il = gen_avx512bw_interleave_lowv64qi;
23791 gen_ih = gen_avx512bw_interleave_highv64qi;
23792 full_interleave = false;
23793 break;
23794 default:
23795 gcc_unreachable ();
23796 }
23797
23798 op2_l = gen_reg_rtx (qimode);
23799 op2_h = gen_reg_rtx (qimode);
23800 emit_insn (gen_il (op2_l, op2, op2));
23801 emit_insn (gen_ih (op2_h, op2, op2));
23802
23803 op1_l = gen_reg_rtx (qimode);
23804 op1_h = gen_reg_rtx (qimode);
23805 emit_insn (gen_il (op1_l, op1, op1));
23806 emit_insn (gen_ih (op1_h, op1, op1));
23807 break;
23808
23809 case ASHIFT:
23810 case ASHIFTRT:
23811 case LSHIFTRT:
23812 op1_l = gen_reg_rtx (himode);
23813 op1_h = gen_reg_rtx (himode);
23814 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
23815 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
23816 /* vashr/vlshr/vashl */
23817 if (op2vec)
23818 {
23819 rtx tmp = force_reg (qimode, op2);
23820 op2_l = gen_reg_rtx (himode);
23821 op2_h = gen_reg_rtx (himode);
23822 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
23823 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
23824 }
23825 else
23826 op2_l = op2_h = op2;
23827
23828 break;
23829 default:
23830 gcc_unreachable ();
23831 }
23832
23833 if (code != MULT && op2vec)
23834 {
23835 /* Expand vashr/vlshr/vashl. */
23836 res_l = gen_reg_rtx (himode);
23837 res_h = gen_reg_rtx (himode);
23838 emit_insn (gen_rtx_SET (res_l,
23839 simplify_gen_binary (code, himode,
23840 op1_l, op2_l)));
23841 emit_insn (gen_rtx_SET (res_h,
23842 simplify_gen_binary (code, himode,
23843 op1_h, op2_h)));
23844 }
23845 else
23846 {
23847 /* Expand mult/ashr/lshr/ashl. */
23848 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
23849 1, OPTAB_DIRECT);
23850 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
23851 1, OPTAB_DIRECT);
23852 }
23853
23854 gcc_assert (res_l && res_h);
23855
23856 /* Merge the data back into the right place. */
23857 d.target = dest;
23858 d.op0 = gen_lowpart (qimode, res_l);
23859 d.op1 = gen_lowpart (qimode, res_h);
23860 d.vmode = qimode;
23861 d.nelt = GET_MODE_NUNITS (qimode);
23862 d.one_operand_p = false;
23863 d.testing_p = false;
23864
23865 if (full_interleave)
23866 {
23867 /* We used the full interleave, so the desired
23868 results are in the even elements. */
23869 for (i = 0; i < d.nelt; ++i)
23870 d.perm[i] = i * 2;
23871 }
23872 else
23873 {
23874 /* For AVX, the interleave used above was not cross-lane. So the
23875 extraction is evens but with the second and third quarter swapped.
23876 Happily, that is even one insn shorter than even extraction.
23877 For AVX512BW we have 4 lanes. We extract evens from within a lane,
23878 always first from the first and then from the second source operand,
23879 the index bits above the low 4 bits remains the same.
23880 Thus, for d.nelt == 32 we want permutation
23881 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
23882 and for d.nelt == 64 we want permutation
23883 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
23884 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
23885 for (i = 0; i < d.nelt; ++i)
23886 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
23887 }
23888
23889 ok = ix86_expand_vec_perm_const_1 (&d);
23890 gcc_assert (ok);
23891}
23892
23893/* Helper function of ix86_expand_mul_widen_evenodd. Return true
23894 if op is CONST_VECTOR with all odd elements equal to their
23895 preceding element. */
23896
23897static bool
23898const_vector_equal_evenodd_p (rtx op)
23899{
23900 machine_mode mode = GET_MODE (op);
23901 int i, nunits = GET_MODE_NUNITS (mode);
23902 if (GET_CODE (op) != CONST_VECTOR
23903 || nunits != CONST_VECTOR_NUNITS (op))
23904 return false;
23905 for (i = 0; i < nunits; i += 2)
23906 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
23907 return false;
23908 return true;
23909}
23910
23911void
23912ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
23913 bool uns_p, bool odd_p)
23914{
23915 machine_mode mode = GET_MODE (op1);
23916 machine_mode wmode = GET_MODE (dest);
23917 rtx x;
23918 rtx orig_op1 = op1, orig_op2 = op2;
23919
23920 if (!nonimmediate_operand (op1, mode))
23921 op1 = force_reg (mode, op1);
23922 if (!nonimmediate_operand (op2, mode))
23923 op2 = force_reg (mode, op2);
23924
23925 /* We only play even/odd games with vectors of SImode. */
23926 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
23927
23928 /* If we're looking for the odd results, shift those members down to
23929 the even slots. For some cpus this is faster than a PSHUFD. */
23930 if (odd_p)
23931 {
23932 /* For XOP use vpmacsdqh, but only for smult, as it is only
23933 signed. */
23934 if (TARGET_XOP && mode == V4SImode && !uns_p)
23935 {
23936 x = force_reg (wmode, CONST0_RTX (wmode));
23937 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
23938 return;
23939 }
23940
23941 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
23942 if (!const_vector_equal_evenodd_p (orig_op1))
23943 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
23944 x, NULL, 1, OPTAB_DIRECT);
23945 if (!const_vector_equal_evenodd_p (orig_op2))
23946 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
23947 x, NULL, 1, OPTAB_DIRECT);
23948 op1 = gen_lowpart (mode, op1);
23949 op2 = gen_lowpart (mode, op2);
23950 }
23951
23952 if (mode == V16SImode)
23953 {
23954 if (uns_p)
23955 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
23956 else
23957 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
23958 }
23959 else if (mode == V8SImode)
23960 {
23961 if (uns_p)
23962 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
23963 else
23964 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
23965 }
23966 else if (uns_p)
23967 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
23968 else if (TARGET_SSE4_1)
23969 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
23970 else
23971 {
23972 rtx s1, s2, t0, t1, t2;
23973
23974 /* The easiest way to implement this without PMULDQ is to go through
23975 the motions as if we are performing a full 64-bit multiply. With
23976 the exception that we need to do less shuffling of the elements. */
23977
23978 /* Compute the sign-extension, aka highparts, of the two operands. */
23979 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23980 op1, pc_rtx, pc_rtx);
23981 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23982 op2, pc_rtx, pc_rtx);
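      /* s1 and s2 are 0 or all-ones per element, i.e. the high halves of
	 the sign-extended operands; (s1*lo(B) + s2*lo(A)) << 32 below is
	 the signed correction added to the unsigned low product.  */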
23983
23984 /* Multiply LO(A) * HI(B), and vice-versa. */
23985 t1 = gen_reg_rtx (wmode);
23986 t2 = gen_reg_rtx (wmode);
23987 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
23988 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
23989
23990 /* Multiply LO(A) * LO(B). */
23991 t0 = gen_reg_rtx (wmode);
23992 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
23993
23994 /* Combine and shift the highparts into place. */
23995 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
23996 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
23997 1, OPTAB_DIRECT);
23998
23999 /* Combine high and low parts. */
24000 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
24001 return;
24002 }
24003 emit_insn (x);
24004}
24005
24006void
24007ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
24008 bool uns_p, bool high_p)
24009{
24010 machine_mode wmode = GET_MODE (dest);
24011 machine_mode mode = GET_MODE (op1);
24012 rtx t1, t2, t3, t4, mask;
24013
24014 switch (mode)
24015 {
24016 case E_V4SImode:
24017 t1 = gen_reg_rtx (mode);
24018 t2 = gen_reg_rtx (mode);
24019 if (TARGET_XOP && !uns_p)
24020 {
24021 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
24022 shuffle the elements once so that all elements are in the right
24023 place for immediate use: { A C B D }. */
24024 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
24025 const1_rtx, GEN_INT (3)));
24026 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
24027 const1_rtx, GEN_INT (3)));
24028 }
24029 else
24030 {
24031 /* Put the elements into place for the multiply. */
24032 ix86_expand_vec_interleave (t1, op1, op1, high_p);
24033 ix86_expand_vec_interleave (t2, op2, op2, high_p);
24034 high_p = false;
24035 }
24036 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
24037 break;
24038
24039 case E_V8SImode:
24040 /* Shuffle the elements between the lanes. After this we
24041 have { A B E F | C D G H } for each operand. */
24042 t1 = gen_reg_rtx (V4DImode);
24043 t2 = gen_reg_rtx (V4DImode);
24044 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
24045 const0_rtx, const2_rtx,
24046 const1_rtx, GEN_INT (3)));
24047 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
24048 const0_rtx, const2_rtx,
24049 const1_rtx, GEN_INT (3)));
24050
24051 /* Shuffle the elements within the lanes. After this we
24052 have { A A B B | C C D D } or { E E F F | G G H H }. */
24053 t3 = gen_reg_rtx (V8SImode);
24054 t4 = gen_reg_rtx (V8SImode);
24055 mask = GEN_INT (high_p
24056 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
24057 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
24058 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
24059 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
24060
24061 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
24062 break;
24063
24064 case E_V8HImode:
24065 case E_V16HImode:
24066 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
24067 uns_p, OPTAB_DIRECT);
24068 t2 = expand_binop (mode,
24069 uns_p ? umul_highpart_optab : smul_highpart_optab,
24070 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
24071 gcc_assert (t1 && t2);
24072
24073 t3 = gen_reg_rtx (mode);
24074 ix86_expand_vec_interleave (t3, t1, t2, high_p);
24075 emit_move_insn (dest, gen_lowpart (wmode, t3));
24076 break;
24077
24078 case E_V16QImode:
24079 case E_V32QImode:
24080 case E_V32HImode:
24081 case E_V16SImode:
24082 case E_V64QImode:
24083 t1 = gen_reg_rtx (wmode);
24084 t2 = gen_reg_rtx (wmode);
24085 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
24086 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
24087
24088 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
24089 break;
24090
24091 default:
24092 gcc_unreachable ();
24093 }
24094}
24095
24096void
24097ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
24098{
24099 rtx res_1, res_2, res_3, res_4;
24100
24101 res_1 = gen_reg_rtx (V4SImode);
24102 res_2 = gen_reg_rtx (V4SImode);
24103 res_3 = gen_reg_rtx (V2DImode);
24104 res_4 = gen_reg_rtx (V2DImode);
24105 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
24106 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
24107
24108 /* Move the results in element 2 down to element 1; we don't care
24109 what goes in elements 2 and 3. Then we can merge the parts
24110 back together with an interleave.
24111
24112 Note that two other sequences were tried:
24113 (1) Use interleaves at the start instead of psrldq, which allows
24114 us to use a single shufps to merge things back at the end.
24115 (2) Use shufps here to combine the two vectors, then pshufd to
24116 put the elements in the correct order.
24117 In both cases the cost of the reformatting stall was too high
24118 and the overall sequence slower. */
24119
24120 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
24121 const0_rtx, const2_rtx,
24122 const0_rtx, const0_rtx));
24123 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
24124 const0_rtx, const2_rtx,
24125 const0_rtx, const0_rtx));
24126 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
24127
24128 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
24129}
24130
24131void
24132ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
24133{
24134 machine_mode mode = GET_MODE (op0);
24135 rtx t1, t2, t3, t4, t5, t6;
24136
24137 if (TARGET_AVX512DQ && TARGET_EVEX512 && mode == V8DImode)
24138 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
24139 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
24140 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
24141 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
24142 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
24143 else if (TARGET_XOP && mode == V2DImode)
24144 {
24145 /* op1: A,B,C,D, op2: E,F,G,H */
24146 op1 = gen_lowpart (V4SImode, op1);
24147 op2 = gen_lowpart (V4SImode, op2);
24148
24149 t1 = gen_reg_rtx (V4SImode);
24150 t2 = gen_reg_rtx (V4SImode);
24151 t3 = gen_reg_rtx (V2DImode);
24152 t4 = gen_reg_rtx (V2DImode);
24153
24154 /* t1: B,A,D,C */
24155 emit_insn (gen_sse2_pshufd_1 (t1, op1,
24156 GEN_INT (1),
24157 GEN_INT (0),
24158 GEN_INT (3),
24159 GEN_INT (2)));
24160
24161 /* t2: (B*E),(A*F),(D*G),(C*H) */
24162 emit_insn (gen_mulv4si3 (t2, t1, op2));
24163
24164 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
24165 emit_insn (gen_xop_phadddq (t3, t2));
24166
24167 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
24168 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
24169
24170 /* Multiply lower parts and add all */
24171 t5 = gen_reg_rtx (V2DImode);
24172 emit_insn (gen_vec_widen_umult_even_v4si (t5,
24173 gen_lowpart (V4SImode, op1),
24174 gen_lowpart (V4SImode, op2)));
24175 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
24176 }
24177 else
24178 {
24179 machine_mode nmode;
24180 rtx (*umul) (rtx, rtx, rtx);
24181
24182 if (mode == V2DImode)
24183 {
24184 umul = gen_vec_widen_umult_even_v4si;
24185 nmode = V4SImode;
24186 }
24187 else if (mode == V4DImode)
24188 {
24189 umul = gen_vec_widen_umult_even_v8si;
24190 nmode = V8SImode;
24191 }
24192 else if (mode == V8DImode)
24193 {
24194 umul = gen_vec_widen_umult_even_v16si;
24195 nmode = V16SImode;
24196 }
24197 else
24198 gcc_unreachable ();
24199
24200
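      /* With A = 2^32*hi(A) + lo(A) and B = 2^32*hi(B) + lo(B), the product
	 modulo 2^64 is lo(A)*lo(B) + ((hi(A)*lo(B) + lo(A)*hi(B)) << 32);
	 the hi(A)*hi(B) term is shifted out entirely.  */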
      /* Multiply low parts. */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts. */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts. */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back. */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts. */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
                       gen_rtx_MULT (mode, op1, op2));
}

/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call. */
      if (MEM_P (addr)
          && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
        return false;
      else
        return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
        return false;

      /* Check the jump is a switch table. */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
        return false;
      else
        return true;
    }
  return false;
}

/* Calculate integer abs() using only SSE2 instructions. */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
         pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
         Otherwise handle it similarly to V4SImode, except use 64 as W instead of
         32 and use logical instead of arithmetic right shift (which is
         unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
        {
          tmp0 = gen_reg_rtx (mode);
          tmp1 = gen_reg_rtx (mode);
          emit_move_insn (tmp1, CONST0_RTX (mode));
          if (mode == E_V2DImode)
            emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
          else
            emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
        }
      else
        {
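          /* Without a 64-bit signed compare, build the sign mask (0 or -1
             per element) by logically shifting the sign bit down to bit 0
             and negating; SSE2 has no 64-bit arithmetic right shift.  */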
          tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
                                      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
                                               - 1), NULL, 0, OPTAB_DIRECT);
          tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
        }

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
                                  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
         value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
                                  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
                                  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
                                  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
         value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
                               target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
         value of X is min ((unsigned char) X, (unsigned char) (-X)),
         as SSE2 provides the PMINUB insn. */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
                               target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}

/* Expand an extract from a vector register through pextr insn.
   Return true if successful. */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs. */
      if (SUBREG_BYTE (dst) > 0)
        return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
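      /* Extract from the inner register, folding the subreg offset into
         the bit position.  */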
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
        machine_mode srcmode, dstmode;
        rtx d, pat;

        if (!int_mode_for_size (size, 0).exists (&dstmode))
          return false;

        switch (dstmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V16QImode;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            srcmode = V8HImode;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V4SImode;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            srcmode = V2DImode;
            break;

          default:
            return false;
          }

        /* Reject extractions from misaligned positions. */
        if (pos & (size-1))
          return false;

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

        /* Construct insn pattern. */
        pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
        pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

        /* Let the rtl optimizers know about the zero extension performed. */
        if (dstmode == QImode || dstmode == HImode)
          {
            pat = gen_rtx_ZERO_EXTEND (SImode, pat);
            d = gen_lowpart (SImode, d);
          }

        emit_insn (gen_rtx_SET (d, pat));

        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}

/* Expand an insert into a vector register through pinsr insn.
   Return true if successful. */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
        machine_mode srcmode, dstmode;
        rtx (*pinsr)(rtx, rtx, rtx, rtx);
        rtx d;

        if (!int_mode_for_size (size, 0).exists (&srcmode))
          return false;

        switch (srcmode)
          {
          case E_QImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V16QImode;
            pinsr = gen_sse4_1_pinsrb;
            break;

          case E_HImode:
            if (!TARGET_SSE2)
              return false;
            dstmode = V8HImode;
            pinsr = gen_sse2_pinsrw;
            break;

          case E_SImode:
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V4SImode;
            pinsr = gen_sse4_1_pinsrd;
            break;

          case E_DImode:
            gcc_assert (TARGET_64BIT);
            if (!TARGET_SSE4_1)
              return false;
            dstmode = V2DImode;
            pinsr = gen_sse4_1_pinsrq;
            break;

          default:
            return false;
          }

        /* Reject insertions to misaligned positions. */
        if (pos & (size-1))
          return false;

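        /* If SRC is not a lowpart of its inner register, first extract
           the needed bits into a fresh pseudo via ix86_expand_pextr.  */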
        if (SUBREG_P (src))
          {
            unsigned int srcpos = SUBREG_BYTE (src);

            if (srcpos > 0)
              {
                rtx extr_ops[4];

                extr_ops[0] = gen_reg_rtx (srcmode);
                extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
                extr_ops[2] = GEN_INT (size);
                extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

                if (!ix86_expand_pextr (extr_ops))
                  return false;

                src = extr_ops[0];
              }
            else
              src = gen_lowpart (srcmode, SUBREG_REG (src));
          }

        if (GET_MODE (dst) == dstmode)
          d = dst;
        else
          d = gen_reg_rtx (dstmode);

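        /* The pinsr insn patterns expect the element selector as a one-hot
           mask, hence the 1 << (pos / size) immediate.  */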
        emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
                          gen_lowpart (srcmode, src),
                          GEN_INT (1 << (pos / size))));
        if (d != dst)
          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
        return true;
      }

    default:
      return false;
    }
}

/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size. */

machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations. */
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V32HImode:
    case E_V16HImode:
      return V8HImode;
    case E_V64QImode:
    case E_V32QImode:
      return V16QImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      return mode;
    }
}

/* Generate call to __divmoddi4. */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
                            rtx op0, rtx op1,
                            rtx *quot_p, rtx *rem_p)
{
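  /* The quotient is the value returned by the libcall; the remainder is
     returned through a stack temporary whose address is passed as the
     final argument.  */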
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
                                      mode, op0, mode, op1, mode,
                                      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}

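/* Expand an atomic fetch_op or op_fetch operation with code CODE on MEM
   and value VAL as a compare-and-swap loop: compute the new value from
   the last observed contents of MEM and retry until the cmpxchg
   succeeds.  AFTER selects whether TARGET receives the value from before
   (fetch_op) or after (op_fetch) the operation; DOUBLEWORD selects the
   double-word cmpxchg path.  */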
void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
                                  enum rtx_code code, bool after,
                                  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op. */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
                                     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
                                   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch. */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
                            gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
                                          SImode),
                            doubleword, loop_label);
}

/* Relax the cmpxchg instruction.  LOOP_LABEL indicates whether the
   instruction should be relaxed with a pause loop.  If not, it is
   relaxed to an atomic load + compare, and the cmpxchg instruction is
   skipped when mem != exp_input.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
                          rtx mem, rtx exp_input, rtx new_input,
                          rtx mem_model, bool doubleword,
                          rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first. */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
        {
          gendw = gen_atomic_compare_and_swapdi_doubleword;
          hmode = SImode;
        }
      else
        gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value. */
  if (doubleword)
    {
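      /* For a double-word value, compare the low and high halves
         separately; a mismatch in either half skips the cmpxchg.  */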
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
                               hmode, 1, cmp_label,
                               profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
                               hmode, 1, cmp_label,
                               profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
                             GET_MODE (exp_input), 1, cmp_label,
                             profile_probability::guessed_never ());

  /* Directly emit cmpxchg here. */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
                      gen_lowpart (hmode, new_input),
                      gen_highpart (hmode, new_input),
                      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
    {
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_label (done_label);
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
                         const0_rtx);
    }
  else
    {
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
                         const0_rtx);
      emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
                               GET_MODE (target_bool), 1, loop_label,
                               profile_probability::guessed_never ());
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();

      /* If mem is not expected, pause and loop back. */
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_insn (gen_pause ());
      emit_jump_insn (gen_jump (loop_label));
      emit_barrier ();
      emit_label (done_label);
    }

  *ptarget_bool = target_bool;
}

/* Convert a BFmode VAL to SFmode without signaling sNaNs.
   This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */

rtx
ix86_expand_fast_convert_bf_to_sf (rtx val)
{
  rtx op = gen_lowpart (HImode, val), ret;
  if (CONST_INT_P (op))
    {
      ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
                                            val, BFmode);
      if (ret)
        return ret;
      /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
      ret = gen_reg_rtx (SImode);
      emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
      emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
      return gen_lowpart (SFmode, ret);
    }

  ret = gen_reg_rtx (SFmode);
  emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
  return ret;
}

#include "gt-i386-expand.h"